ggml : optimize convert f32<->f16 for loongarch_asx
This commit is contained in:
parent 194b2e69f8
commit e6d955ebcd

1 changed file with 9 additions and 15 deletions
@@ -1078,29 +1078,23 @@ do { \
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR 8

-// F16 arithmetic is not supported by AVX, so we use F32 instead
+// F16 arithmetic is not supported by LASX, so we use F32 instead

 #define GGML_F32Cx8 __m256
 #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
 #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))

 static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
-    float tmp[8];
-
-    for (int i = 0; i < 8; i++) {
-        tmp[i] = GGML_FP16_TO_FP32(x[i]);
-    }
-
-    return (__m256)__lasx_xvld(tmp, 0);
+    __m256i a;
+    memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
+    a = __lasx_xvpermi_d(a, 0 | (1 << 4));
+    return __lasx_xvfcvtl_s_h(a);
 }

 static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
-    float arr[8];
-
-    __lasx_xvst(y, arr, 0);
-
-    for (int i = 0; i < 8; i++) {
-        x[i] = GGML_FP32_TO_FP16(arr[i]);
-    }
+    __m256i a = __lasx_xvfcvt_h_s(y, y);
+    a = __lasx_xvpermi_d(a, 0 | (2 << 2));
+    memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
 }

 #define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
 #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
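
The two __lasx_xvpermi_d immediates are the subtle part of this change. Below is a minimal, portable scalar model (illustrative only, not ggml code) of how the immediate is decoded, assuming the documented vpermq-style semantics where destination doubleword i is taken from source doubleword (imm >> 2*i) & 3:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Scalar model of xvpermi.d: destination doubleword i is taken from
 * source doubleword (imm >> 2*i) & 3 (vpermq-style selection). */
static void permi_d(uint64_t dst[4], const uint64_t src[4], uint8_t imm) {
    for (int i = 0; i < 4; i++) {
        dst[i] = src[(imm >> (2 * i)) & 3];
    }
}

int main(void) {
    /* Label each 64-bit doubleword by its original index. */
    const uint64_t v[4] = {0, 1, 2, 3};
    uint64_t r[4];

    /* Load path, imm = 0 | (1 << 4) = 0x10 -> d0,d0,d1,d0.
     * After the memcpy the 8 contiguous f16 values occupy d0 and d1;
     * this places 4 of them in the low 64 bits of each 128-bit half,
     * which is where xvfcvtl.s.h reads its inputs. */
    permi_d(r, v, 0 | (1 << 4));
    printf("load : %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
           r[0], r[1], r[2], r[3]);

    /* Store path, imm = 0 | (2 << 2) = 0x08 -> d0,d2,d0,d0.
     * xvfcvt.h.s(y, y) leaves the two converted f16 groups in d0 and d2;
     * this packs them contiguously into the low 128 bits so the final
     * memcpy can write all 8 half floats at once. */
    permi_d(r, v, 0 | (2 << 2));
    printf("store: %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
           r[0], r[1], r[2], r[3]);
    return 0;
}

Only the low 64 bits of each 128-bit half (d0, d2) are consumed on the load path, and only the low 128 bits (d0, d1) on the store path, so the repeated d0 selections in the remaining slots are don't-cares.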
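
On actual LoongArch hardware, a quick way to validate the new helpers is a round-trip check: f16 values converted up to f32 and back must come out bit-identical, since f16 -> f32 is exact. A sketch, assuming it is compiled inside ggml (so ggml_fp16_t, GGML_FP32_TO_FP16 and the two helpers above are in scope) with LASX enabled (e.g. gcc -mlasx); check_f32cx8_roundtrip is a hypothetical name, not part of the commit:

#include <string.h>  /* memcmp */

/* Hypothetical round-trip check: load 8 f16 values through the new
 * LASX fast path, store them back, and compare the bit patterns. */
static int check_f32cx8_roundtrip(void) {
    ggml_fp16_t h[8], out[8];
    for (int i = 0; i < 8; i++) {
        h[i] = GGML_FP32_TO_FP16(1.0f + (float) i);  /* exactly representable in f16 */
    }
    __m256 v = __lasx_f32cx8_load(h);  /* f16 -> f32 via xvfcvtl.s.h */
    __lasx_f32cx8_store(out, v);       /* f32 -> f16 via xvfcvt.h.s  */
    return memcmp(h, out, sizeof(h)) == 0;  /* expect bit-identical */
}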