Get rid of some legacy code

This commit is contained in:
Justine Tunney 2024-08-24 17:53:30 -07:00
parent 37ca1badaf
commit 38cc4b3c68
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
27 changed files with 123 additions and 600 deletions

View file

@ -17,8 +17,6 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/intrin/bsf.h"
#include "libc/intrin/pcmpgtb.h"
#include "libc/intrin/pmovmskb.h"
#include "libc/macros.h"
#include "libc/str/str.h"
#include "libc/str/thompike.h"

View file

@ -18,35 +18,55 @@
*/
#include "libc/dce.h"
#include "libc/fmt/conv.h"
#include "libc/intrin/packsswb.h"
#include "libc/intrin/pandn.h"
#include "libc/intrin/pcmpgtw.h"
#include "libc/intrin/pmovmskb.h"
#include "libc/str/str.h"
#include "libc/str/utf16.h"
#include "third_party/aarch64/arm_neon.internal.h"
#include "third_party/intel/emmintrin.internal.h"
// Eight 16-bit lanes each holding 127, the largest 7-bit value. Passed as
// the comparand of the second pcmpgtw() call in tprecode16to8_sse2().
static const int16_t kDel16[8] = {127, 127, 127, 127, 127, 127, 127, 127};
#if !IsModeDbg()
#if defined(__x86_64__)
/**
 * 10x speedup for ascii.
 *
 * Copies code units from `src + r.dx` to `dst + r.ax` in groups of
 * eight for as long as `r.ax + 8 < dstsize`, stopping at the first
 * group that holds a code unit outside 1..127. Both indices advance by
 * eight per accepted group; the updated pair is returned.
 *
 * NOTE(review): this listing carries two renditions of every vector
 * step: one on int16_t[8] arrays through pcmpgtw / pandn / pmovmskb /
 * packsswb, and one on __m128i through the _mm_* intrinsics. They look
 * like the before and after of one change and cannot be live together
 * (v1, v2, v3, vz are declared twice) -- confirm against the committed
 * file.
 */
static axdx_t tprecode16to8_sse2(char *dst, size_t dstsize, const char16_t *src,
axdx_t r) {
int16_t v1[8], v2[8], v3[8], vz[8];
memset(vz, 0, 16);
__m128i v1, v2, v3, vz;
vz = _mm_setzero_si128();
// continue only while a whole group of eight output bytes still fits
while (r.ax + 8 < dstsize) {
memcpy(v1, src + r.dx, 16);
pcmpgtw(v2, v1, vz);
pcmpgtw(v3, v1, kDel16);
pandn((void *)v2, (void *)v3, (void *)v2);
if (pmovmskb((void *)v2) != 0xFFFF)
v1 = _mm_loadu_si128((__m128i *)(src + r.dx));
// signed 16-bit compares: v2 marks lanes > 0, v3 marks lanes > 0x7F
v2 = _mm_cmpgt_epi16(v1, vz);
v3 = _mm_cmpgt_epi16(v1, _mm_set1_epi16(0x7F));
// v2 = ~v3 & v2, i.e. lanes that are > 0 and not > 0x7F
v2 = _mm_andnot_si128(v3, v2);
// 0xFFFF means the sign bit of all sixteen mask bytes is set
if (_mm_movemask_epi8(v2) != 0xFFFF)
break;
packsswb((void *)v1, v1, v1);
memcpy(dst + r.ax, v1, 8);
// narrow the eight 16-bit lanes to bytes and store the low 64 bits
v1 = _mm_packs_epi16(v1, v1);
_mm_storel_epi64((__m128i *)(dst + r.ax), v1);
r.ax += 8;
r.dx += 8;
}
return r;
}
#elif defined(__aarch64__)
/**
 * Copies runs of ASCII from UTF-16 to bytes, eight code units at a time.
 *
 * Each iteration loads eight code units from `src + r.dx`. If every one
 * of them lies in 1..127 they are narrowed to bytes and stored at
 * `dst + r.ax`, and both indices advance by eight. The loop ends at the
 * first group holding NUL or a non-ASCII code unit, or once fewer than
 * nine bytes of `dst` remain, leaving the rest to the caller.
 *
 * @param dst is the output buffer
 * @param dstsize bounds the output index (loop runs while r.ax + 8 < dstsize)
 * @param src is the UTF-16 input
 * @param r carries the output index in `ax` and the input index in `dx`
 * @return `r` advanced past everything that was copied
 */
static axdx_t tprecode16to8_neon(char *dst, size_t dstsize, const char16_t *src,
                                 axdx_t r) {
  const uint16x8_t zero = vdupq_n_u16(0);
  const uint16x8_t del = vdupq_n_u16(0x7F);
  while (r.ax + 8 < dstsize) {
    uint16x8_t units = vld1q_u16((const uint16_t *)(src + r.dx));
    uint16x8_t nonzero = vcgtq_u16(units, zero);
    uint16x8_t beyond = vcgtq_u16(units, del);
    // lanes that are > 0 and not > 0x7F become 0xFFFF, all others 0
    uint16x8_t ascii = vbicq_u16(nonzero, beyond);
    // All lanes are 0xFFFF exactly when the smallest lane is 0xFFFF.
    // Summing the lanes with vaddvq_u16() is not usable here: its result
    // is a uint16_t that wraps and so can never equal 8 * 0xFFFF, which
    // made the loop bail out on every first iteration.
    if (vminvq_u16(ascii) != 0xFFFF)
      break;
    vst1_u8((uint8_t *)(dst + r.ax), vqmovn_u16(units));
    r.ax += 8;
    r.dx += 8;
  }
  return r;
}
#endif
#endif
/**
* Transcodes UTF-16 to UTF-8.
*
@ -66,10 +86,14 @@ axdx_t tprecode16to8(char *dst, size_t dstsize, const char16_t *src) {
r.ax = 0;
r.dx = 0;
for (;;) {
#if defined(__x86_64__) && !IsModeDbg() && !IsTiny()
if (!((uintptr_t)(src + r.dx) & 15)) {
#if !IsModeDbg()
#if defined(__x86_64__)
if (!((uintptr_t)(src + r.dx) & 15))
r = tprecode16to8_sse2(dst, dstsize, src, r);
}
#elif defined(__aarch64__)
if (!((uintptr_t)(src + r.dx) & 15))
r = tprecode16to8_neon(dst, dstsize, src, r);
#endif
#endif
if (!(x = src[r.dx++]))
break;

View file

@ -16,34 +16,61 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/intrin/pcmpgtb.h"
#include "libc/intrin/pmovmskb.h"
#include "libc/intrin/punpckhbw.h"
#include "libc/intrin/punpcklbw.h"
#include <arm_neon.h>
#include <stdint.h>
#include <string.h>
#include "libc/dce.h"
#include "libc/str/str.h"
#include "libc/str/thompike.h"
#include "libc/str/utf16.h"
#include "third_party/aarch64/arm_neon.internal.h"
#include "third_party/intel/emmintrin.internal.h"
#if !IsModeDbg()
#if defined(__x86_64__)
// 34x speedup for ascii
//
// Widens bytes from `src + r.dx` into 16-bit units at `dst + r.ax`,
// sixteen at a time for as long as `r.ax + 16 < dstsize`. Stops at the
// first group in which some byte fails the signed `> 0` test, i.e. is
// NUL or has its high bit set. Both indices advance by sixteen per
// accepted group; the updated pair is returned.
//
// NOTE(review): this listing carries two renditions of every vector
// step: one on uint8_t[16] arrays through pcmpgtb / pmovmskb /
// punpcklbw / punpckhbw, and one on __m128i through the _mm_*
// intrinsics. They look like the before and after of one change and
// cannot be live together (v1, v2, vz are declared twice) -- confirm
// against the committed file.
static inline axdx_t tprecode8to16_sse2(char16_t *dst, size_t dstsize,
const char *src, axdx_t r) {
uint8_t v1[16], v2[16], vz[16];
memset(vz, 0, 16);
__m128i v1, v2, vz;
vz = _mm_setzero_si128();
// continue only while a whole group of sixteen output units still fits
while (r.ax + 16 < dstsize) {
memcpy(v1, src + r.dx, 16);
pcmpgtb((int8_t *)v2, (int8_t *)v1, (int8_t *)vz);
if (pmovmskb(v2) != 0xFFFF)
v1 = _mm_loadu_si128((__m128i *)(src + r.dx));
// signed byte compare, so bytes 0x80..0xFF count as negative and fail
v2 = _mm_cmpgt_epi8(v1, vz);
// 0xFFFF means all sixteen bytes passed the compare
if (_mm_movemask_epi8(v2) != 0xFFFF)
break;
punpcklbw(v2, v1, vz);
punpckhbw(v1, v1, vz);
memcpy(dst + r.ax + 0, v2, 16);
memcpy(dst + r.ax + 8, v1, 16);
// interleave with zero bytes to zero-extend each byte to 16 bits
__m128i lo = _mm_unpacklo_epi8(v1, vz);
__m128i hi = _mm_unpackhi_epi8(v1, vz);
_mm_storeu_si128((__m128i *)(dst + r.ax), lo);
_mm_storeu_si128((__m128i *)(dst + r.ax + 8), hi);
r.ax += 16;
r.dx += 16;
}
return r;
}
#elif defined(__aarch64__)
/**
 * Widens runs of ASCII bytes to UTF-16, sixteen bytes at a time.
 *
 * Each iteration loads sixteen bytes from `src + r.dx`. If every one of
 * them lies in 1..127 they are zero-extended to 16 bits and stored at
 * `dst + r.ax`, and both indices advance by sixteen. The loop ends at
 * the first group holding NUL or a byte with its high bit set, or once
 * fewer than seventeen units of `dst` remain, leaving the rest to the
 * caller.
 *
 * @param dst is the output buffer
 * @param dstsize bounds the output index (loop runs while r.ax + 16 < dstsize)
 * @param src is the byte input
 * @param r carries the output index in `ax` and the input index in `dx`
 * @return `r` advanced past everything that was widened
 */
static inline axdx_t tprecode8to16_neon(char16_t *dst, size_t dstsize,
                                        const char *src, axdx_t r) {
  const int8x16_t zero = vdupq_n_s8(0);
  while (r.ax + 16 < dstsize) {
    uint8x16_t bytes = vld1q_u8((const uint8_t *)(src + r.dx));
    // Signed compare, matching _mm_cmpgt_epi8 in the SSE2 variant: bytes
    // 0x80..0xFF are negative as int8 and fail together with NUL. An
    // unsigned compare against zero would wave non-ASCII bytes through.
    uint8x16_t ascii = vcgtq_s8(vreinterpretq_s8_u8(bytes), zero);
    // All lanes are 0xFF exactly when the smallest lane is 0xFF.
    // vaddvq_u8() yields a wrapping uint8_t that can never equal
    // 16 * 0xFF, which made the loop bail out on every first iteration.
    if (vminvq_u8(ascii) != 0xFF)
      break;
    uint16x8_t lo = vmovl_u8(vget_low_u8(bytes));
    uint16x8_t hi = vmovl_u8(vget_high_u8(bytes));
    vst1q_u16((uint16_t *)(dst + r.ax), lo);
    vst1q_u16((uint16_t *)(dst + r.ax + 8), hi);
    r.ax += 16;
    r.dx += 16;
  }
  return r;
}
#endif
#endif
/**
* Transcodes UTF-8 to UTF-16.
*
@ -64,10 +91,14 @@ axdx_t tprecode8to16(char16_t *dst, size_t dstsize, const char *src) {
r.ax = 0;
r.dx = 0;
for (;;) {
#if defined(__x86_64__) && !IsModeDbg()
if (!((uintptr_t)(src + r.dx) & 15)) {
#if !IsModeDbg()
#if defined(__x86_64__)
if (!((uintptr_t)(src + r.dx) & 15))
r = tprecode8to16_sse2(dst, dstsize, src, r);
}
#elif defined(__aarch64__)
if (!((uintptr_t)(src + r.dx) & 15))
r = tprecode8to16_neon(dst, dstsize, src, r);
#endif
#endif
x = src[r.dx++] & 0377;
if (x >= 0300) {