mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-05-31 17:52:27 +00:00
Get rid of some legacy code
This commit is contained in:
parent
37ca1badaf
commit
38cc4b3c68
27 changed files with 123 additions and 600 deletions
|
@ -16,34 +16,61 @@
|
|||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/pcmpgtb.h"
|
||||
#include "libc/intrin/pmovmskb.h"
|
||||
#include "libc/intrin/punpckhbw.h"
|
||||
#include "libc/intrin/punpcklbw.h"
|
||||
#include <arm_neon.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include "libc/dce.h"
|
||||
#include "libc/str/str.h"
|
||||
#include "libc/str/thompike.h"
|
||||
#include "libc/str/utf16.h"
|
||||
#include "third_party/aarch64/arm_neon.internal.h"
|
||||
#include "third_party/intel/emmintrin.internal.h"
|
||||
|
||||
#if !IsModeDbg()
|
||||
#if defined(__x86_64__)
|
||||
|
||||
// 34x speedup for ascii
|
||||
static inline axdx_t tprecode8to16_sse2(char16_t *dst, size_t dstsize,
|
||||
const char *src, axdx_t r) {
|
||||
uint8_t v1[16], v2[16], vz[16];
|
||||
memset(vz, 0, 16);
|
||||
__m128i v1, v2, vz;
|
||||
vz = _mm_setzero_si128();
|
||||
while (r.ax + 16 < dstsize) {
|
||||
memcpy(v1, src + r.dx, 16);
|
||||
pcmpgtb((int8_t *)v2, (int8_t *)v1, (int8_t *)vz);
|
||||
if (pmovmskb(v2) != 0xFFFF)
|
||||
v1 = _mm_loadu_si128((__m128i *)(src + r.dx));
|
||||
v2 = _mm_cmpgt_epi8(v1, vz);
|
||||
if (_mm_movemask_epi8(v2) != 0xFFFF)
|
||||
break;
|
||||
punpcklbw(v2, v1, vz);
|
||||
punpckhbw(v1, v1, vz);
|
||||
memcpy(dst + r.ax + 0, v2, 16);
|
||||
memcpy(dst + r.ax + 8, v1, 16);
|
||||
__m128i lo = _mm_unpacklo_epi8(v1, vz);
|
||||
__m128i hi = _mm_unpackhi_epi8(v1, vz);
|
||||
_mm_storeu_si128((__m128i *)(dst + r.ax), lo);
|
||||
_mm_storeu_si128((__m128i *)(dst + r.ax + 8), hi);
|
||||
r.ax += 16;
|
||||
r.dx += 16;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
#elif defined(__aarch64__)
|
||||
|
||||
static inline axdx_t tprecode8to16_neon(char16_t *dst, size_t dstsize,
|
||||
const char *src, axdx_t r) {
|
||||
uint8x16_t v1;
|
||||
while (r.ax + 16 < dstsize) {
|
||||
v1 = vld1q_u8((const uint8_t *)(src + r.dx));
|
||||
uint8x16_t cmp = vcgtq_u8(v1, vdupq_n_u8(0));
|
||||
if (vaddvq_u8(cmp) != 16 * 0xFF)
|
||||
break;
|
||||
uint16x8_t lo = vmovl_u8(vget_low_u8(v1));
|
||||
uint16x8_t hi = vmovl_u8(vget_high_u8(v1));
|
||||
vst1q_u16((uint16_t *)(dst + r.ax), lo);
|
||||
vst1q_u16((uint16_t *)(dst + r.ax + 8), hi);
|
||||
r.ax += 16;
|
||||
r.dx += 16;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Transcodes UTF-8 to UTF-16.
|
||||
*
|
||||
|
@ -64,10 +91,14 @@ axdx_t tprecode8to16(char16_t *dst, size_t dstsize, const char *src) {
|
|||
r.ax = 0;
|
||||
r.dx = 0;
|
||||
for (;;) {
|
||||
#if defined(__x86_64__) && !IsModeDbg()
|
||||
if (!((uintptr_t)(src + r.dx) & 15)) {
|
||||
#if !IsModeDbg()
|
||||
#if defined(__x86_64__)
|
||||
if (!((uintptr_t)(src + r.dx) & 15))
|
||||
r = tprecode8to16_sse2(dst, dstsize, src, r);
|
||||
}
|
||||
#elif defined(__aarch64__)
|
||||
if (!((uintptr_t)(src + r.dx) & 15))
|
||||
r = tprecode8to16_neon(dst, dstsize, src, r);
|
||||
#endif
|
||||
#endif
|
||||
x = src[r.dx++] & 0377;
|
||||
if (x >= 0300) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue