diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 74a5f9120..64fa45974 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -457,6 +457,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
         {
             printf("\rGenerating (%d / %d tokens)", (1 + params.n_predict - remaining_tokens), params.n_predict);
         }
+        fflush(stdout);
 
         bool evalres = false;
 
diff --git a/otherarch/ggml_v1.c b/otherarch/ggml_v1.c
index 4a96ab96c..5720788d6 100644
--- a/otherarch/ggml_v1.c
+++ b/otherarch/ggml_v1.c
@@ -787,8 +787,30 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) {
 #define GGML_V1_F32Cx8 __m256
 #define GGML_V1_F32Cx8_ZERO _mm256_setzero_ps()
 #define GGML_V1_F32Cx8_SET1(x) _mm256_set1_ps(x)
+#if defined(__F16C__)
 #define GGML_V1_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
 #define GGML_V1_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
+#else
+static inline __m256 __avx_f32cx8_load(ggml_v1_fp16_t *x) {
+    float tmp[8];
+
+    for (int i = 0; i < 8; i++)
+        tmp[i] = GGML_V1_FP16_TO_FP32(x[i]);
+
+    return _mm256_loadu_ps(tmp);
+}
+static inline void __avx_f32cx8_store(ggml_v1_fp16_t *x, __m256 y) {
+    float arr[8];
+
+    _mm256_storeu_ps(arr, y);
+
+    for (int i = 0; i < 8; i++)
+        x[i] = GGML_V1_FP32_TO_FP16(arr[i]);
+}
+#define GGML_V1_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
+#define GGML_V1_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
+#endif
+
 #define GGML_V1_F32Cx8_FMA GGML_V1_F32x8_FMA
 #define GGML_V1_F32Cx8_ADD _mm256_add_ps
 #define GGML_V1_F32Cx8_MUL _mm256_mul_ps