From 9a4b0df5e8c25d628922c26e685ba7950ab1ffb8 Mon Sep 17 00:00:00 2001
From: Aleksei Nikiforov
Date: Wed, 23 Oct 2024 12:21:27 +0200
Subject: [PATCH] Load little-endian models on s390x

Introduce byteswap functions for ggml data and implement them for some
of the types. Currently tested on llama3.2.
---
 ggml/include/ggml.h        |   2 +
 ggml/src/ggml.c            | 265 +++++++++++++++++++++++++++++++++++++
 ggml/src/gguf.cpp          |  50 ++++++-
 src/llama-model-loader.cpp |  16 +++
 4 files changed, 332 insertions(+), 1 deletion(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 1198dc1fd..f524a34fc 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2144,6 +2144,7 @@ extern "C" {
 #endif
     typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
     typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
+    typedef void (*ggml_byteswap_t)  (      void  * GGML_RESTRICT buffer, size_t elements);
 
     struct ggml_type_traits {
         const char             * type_name;
@@ -2153,6 +2154,7 @@ extern "C" {
         bool                     is_quantized;
         ggml_to_float_t          to_float;
         ggml_from_float_t        from_float_ref;
+        ggml_byteswap_t          byteswap;
     };
 
     GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index b1d0d4913..b4740a1ba 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -36,6 +36,44 @@
 #include
 #endif
 
+#if defined(__gnu_linux__)
+#include <endian.h>
+#else
+#define le64toh(x) (x)
+#define le32toh(x) (x)
+#define le16toh(x) (x)
+#endif
+
+// endianness conversion
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define convert_from_le16(x)
+#define convert_from_le32(x)
+#define convert_from_le64(x)
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+static inline void convert_from_le16(void * value) {
+    uint16_t temp;
+    memcpy(&temp, value, sizeof(uint16_t));
+    temp = le16toh(temp);
+    memcpy(value, &temp, sizeof(uint16_t));
+}
+
+static inline void convert_from_le32(void * value) {
+    uint32_t temp;
+    memcpy(&temp, value, sizeof(uint32_t));
+    temp = le32toh(temp);
+    memcpy(value, &temp, sizeof(uint32_t));
+}
+
+static inline void convert_from_le64(void * value) {
+    uint64_t temp;
+    memcpy(&temp, value, sizeof(uint64_t));
+    temp = le64toh(temp);
+    memcpy(value, &temp, sizeof(uint64_t));
+}
+#else
+#error Unexpected or undefined __BYTE_ORDER__
+#endif
+
 #if defined(__APPLE__)
 #include
 #include
@@ -561,6 +599,34 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float *
 static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
 static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
 
+static void ggml_byteswap_i16     (void * restrict buffer, size_t elements);
+static void ggml_byteswap_i32     (void * restrict buffer, size_t elements);
+static void ggml_byteswap_i64     (void * restrict buffer, size_t elements);
+static void ggml_byteswap_q4_0    (void * restrict buffer, size_t elements);
+static void ggml_byteswap_q4_1    (void * restrict buffer, size_t elements);
+static void ggml_byteswap_q5_0    (void * restrict buffer, size_t elements);
+static void ggml_byteswap_q5_1    (void * restrict buffer, size_t elements);
+static void ggml_byteswap_q8_0    (void * restrict buffer, size_t elements);
+static void ggml_byteswap_q8_1    (void * restrict buffer, size_t elements);
+static void ggml_byteswap_q2_k    (void * restrict buffer, size_t elements);
+static void 
ggml_byteswap_q3_k (void * restrict buffer, size_t elements); +static void ggml_byteswap_q4_k (void * restrict buffer, size_t elements); +static void ggml_byteswap_q5_k (void * restrict buffer, size_t elements); +static void ggml_byteswap_q6_k (void * restrict buffer, size_t elements); +static void ggml_byteswap_iq2_xxs (void * restrict buffer, size_t elements); +static void ggml_byteswap_iq2_xs (void * restrict buffer, size_t elements); +static void ggml_byteswap_iq3_xxs (void * restrict buffer, size_t elements); +static void ggml_byteswap_iq3_s (void * restrict buffer, size_t elements); +static void ggml_byteswap_iq2_s (void * restrict buffer, size_t elements); +static void ggml_byteswap_iq1_s (void * restrict buffer, size_t elements); +static void ggml_byteswap_iq1_m (void * restrict buffer, size_t elements); +static void ggml_byteswap_iq4_nl (void * restrict buffer, size_t elements); +static void ggml_byteswap_iq4_xs (void * restrict buffer, size_t elements); +static void ggml_byteswap_q8_k (void * restrict buffer, size_t elements); +static void ggml_byteswap_q4_0_4x4(void * restrict buffer, size_t elements); +static void ggml_byteswap_q4_0_4x8(void * restrict buffer, size_t elements); +static void ggml_byteswap_q4_0_8x8(void * restrict buffer, size_t elements); + static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { [GGML_TYPE_I8] = { .type_name = "i8", @@ -573,30 +639,35 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .blck_size = 1, .type_size = sizeof(int16_t), .is_quantized = false, + .byteswap = ggml_byteswap_i16, }, [GGML_TYPE_I32] = { .type_name = "i32", .blck_size = 1, .type_size = sizeof(int32_t), .is_quantized = false, + .byteswap = ggml_byteswap_i32, }, [GGML_TYPE_I64] = { .type_name = "i64", .blck_size = 1, .type_size = sizeof(int64_t), .is_quantized = false, + .byteswap = ggml_byteswap_i64, }, [GGML_TYPE_F64] = { .type_name = "f64", .blck_size = 1, .type_size = sizeof(double), .is_quantized = false, + .byteswap = ggml_byteswap_i64, }, [GGML_TYPE_F32] = { .type_name = "f32", .blck_size = 1, .type_size = sizeof(float), .is_quantized = false, + .byteswap = ggml_byteswap_i32, }, [GGML_TYPE_F16] = { .type_name = "f16", @@ -605,6 +676,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = false, .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row, .from_float_ref = (ggml_from_float_t) ggml_fp32_to_fp16_row, + .byteswap = ggml_byteswap_i16, }, [GGML_TYPE_Q4_0] = { .type_name = "q4_0", @@ -613,6 +685,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q4_0, .from_float_ref = (ggml_from_float_t) quantize_row_q4_0_ref, + .byteswap = ggml_byteswap_q4_0, }, [GGML_TYPE_Q4_1] = { .type_name = "q4_1", @@ -621,6 +694,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q4_1, .from_float_ref = (ggml_from_float_t) quantize_row_q4_1_ref, + .byteswap = ggml_byteswap_q4_1, }, [4] = { // GGML_TYPE_Q4_2 .type_name = "DEPRECATED", @@ -641,6 +715,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q5_0, .from_float_ref = (ggml_from_float_t) quantize_row_q5_0_ref, + .byteswap = ggml_byteswap_q5_0, }, [GGML_TYPE_Q5_1] = { .type_name = "q5_1", @@ -649,6 +724,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, 
.to_float = (ggml_to_float_t) dequantize_row_q5_1, .from_float_ref = (ggml_from_float_t) quantize_row_q5_1_ref, + .byteswap = ggml_byteswap_q5_1, }, [GGML_TYPE_Q8_0] = { .type_name = "q8_0", @@ -657,6 +733,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q8_0, .from_float_ref = (ggml_from_float_t) quantize_row_q8_0_ref, + .byteswap = ggml_byteswap_q8_0, }, [GGML_TYPE_Q8_1] = { .type_name = "q8_1", @@ -664,6 +741,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .type_size = sizeof(block_q8_1), .is_quantized = true, .from_float_ref = (ggml_from_float_t) quantize_row_q8_1_ref, + .byteswap = ggml_byteswap_q8_1, }, [GGML_TYPE_Q2_K] = { .type_name = "q2_K", @@ -672,6 +750,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q2_K, .from_float_ref = (ggml_from_float_t) quantize_row_q2_K_ref, + .byteswap = ggml_byteswap_q2_k, }, [GGML_TYPE_Q3_K] = { .type_name = "q3_K", @@ -680,6 +759,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q3_K, .from_float_ref = (ggml_from_float_t) quantize_row_q3_K_ref, + .byteswap = ggml_byteswap_q3_k, }, [GGML_TYPE_Q4_K] = { .type_name = "q4_K", @@ -688,6 +768,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q4_K, .from_float_ref = (ggml_from_float_t) quantize_row_q4_K_ref, + .byteswap = ggml_byteswap_q4_k, }, [GGML_TYPE_Q5_K] = { .type_name = "q5_K", @@ -696,6 +777,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q5_K, .from_float_ref = (ggml_from_float_t) quantize_row_q5_K_ref, + .byteswap = ggml_byteswap_q5_k, }, [GGML_TYPE_Q6_K] = { .type_name = "q6_K", @@ -704,6 +786,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q6_K, .from_float_ref = (ggml_from_float_t) quantize_row_q6_K_ref, + .byteswap = ggml_byteswap_q6_k, }, [GGML_TYPE_IQ2_XXS] = { .type_name = "iq2_xxs", @@ -712,6 +795,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq2_xxs, .from_float_ref = NULL, + .byteswap = ggml_byteswap_iq2_xxs, }, [GGML_TYPE_IQ2_XS] = { .type_name = "iq2_xs", @@ -720,6 +804,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq2_xs, .from_float_ref = NULL, + .byteswap = ggml_byteswap_iq2_xs, }, [GGML_TYPE_IQ3_XXS] = { .type_name = "iq3_xxs", @@ -728,6 +813,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq3_xxs, .from_float_ref = (ggml_from_float_t)quantize_row_iq3_xxs_ref, + .byteswap = ggml_byteswap_iq3_xxs, }, [GGML_TYPE_IQ3_S] = { .type_name = "iq3_s", @@ -736,6 +822,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq3_s, .from_float_ref = (ggml_from_float_t)quantize_row_iq3_s_ref, + .byteswap = ggml_byteswap_iq3_s, }, [GGML_TYPE_IQ2_S] = { .type_name = "iq2_s", @@ -744,6 +831,7 @@ static const struct ggml_type_traits 
type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq2_s, .from_float_ref = (ggml_from_float_t)quantize_row_iq2_s_ref, + .byteswap = ggml_byteswap_iq2_s, }, [GGML_TYPE_IQ1_S] = { .type_name = "iq1_s", @@ -752,6 +840,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq1_s, .from_float_ref = NULL, + .byteswap = ggml_byteswap_iq1_s, }, [GGML_TYPE_IQ1_M] = { .type_name = "iq1_m", @@ -760,6 +849,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq1_m, .from_float_ref = NULL, + .byteswap = ggml_byteswap_iq1_m, }, [GGML_TYPE_IQ4_NL] = { .type_name = "iq4_nl", @@ -768,6 +858,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq4_nl, .from_float_ref = (ggml_from_float_t)quantize_row_iq4_nl_ref, + .byteswap = ggml_byteswap_iq4_nl, }, [GGML_TYPE_IQ4_XS] = { .type_name = "iq4_xs", @@ -776,12 +867,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq4_xs, .from_float_ref = (ggml_from_float_t)quantize_row_iq4_xs_ref, + .byteswap = ggml_byteswap_iq4_xs, }, [GGML_TYPE_Q8_K] = { .type_name = "q8_K", .blck_size = QK_K, .type_size = sizeof(block_q8_K), .is_quantized = true, + .byteswap = ggml_byteswap_q8_k, }, [GGML_TYPE_BF16] = { .type_name = "bf16", @@ -790,24 +883,28 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = false, .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row, .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref, + .byteswap = ggml_byteswap_i16, }, [31] = { // GGML_TYPE_Q4_0_4_4 .type_name = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking", .blck_size = 0, .type_size = 0, .is_quantized = false, + .byteswap = ggml_byteswap_q4_0_4x4, }, [32] = { // GGML_TYPE_Q4_0_4_8 .type_name = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking", .blck_size = 0, .type_size = 0, .is_quantized = false, + .byteswap = ggml_byteswap_q4_0_4x8, }, [33] = { // GGML_TYPE_Q4_0_8_8 .type_name = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking", .blck_size = 0, .type_size = 0, .is_quantized = false, + .byteswap = ggml_byteswap_q4_0_8x8, }, [GGML_TYPE_TQ1_0] = { .type_name = "tq1_0", @@ -6499,3 +6596,171 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons if (p0->strict_cpu != p1->strict_cpu ) return false; return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; } + +static void ggml_byteswap_i16(void * restrict buffer, size_t elements) { + uint16_t *data_ptr = (uint16_t*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(data_ptr + i); + } +} + +static void ggml_byteswap_i32(void * restrict buffer, size_t elements) { + uint32_t *data_ptr = (uint32_t*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le32(data_ptr + i); + } +} + +static void ggml_byteswap_i64(void * restrict buffer, size_t elements) { + uint64_t *data_ptr = (uint64_t*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le64(data_ptr + i); + } +} + +static void ggml_byteswap_q4_0(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q4_1(void * restrict buffer, 
size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q5_0(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q5_1(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q8_0(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q8_1(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q2_k(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q3_k(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q4_k(void * restrict buffer, size_t elements) { + block_q4_K *data_ptr = (block_q4_K*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + convert_from_le16(&(data_ptr[i].dmin)); + } +} + +static void ggml_byteswap_q5_k(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q6_k(void * restrict buffer, size_t elements) { + block_q6_K *data_ptr = (block_q6_K*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + } +} + +static void ggml_byteswap_iq2_xxs(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_iq2_xs(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_iq3_xxs(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_iq3_s(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_iq2_s(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_iq1_s(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_iq1_m(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_iq4_nl(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_iq4_xs(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void 
ggml_byteswap_q8_k(void * restrict buffer, size_t elements) {
+    GGML_ASSERT(false && "byteswap function not implemented yet");
+    UNUSED(buffer);
+    UNUSED(elements);
+}
+
+static void ggml_byteswap_q4_0_4x4(void * restrict buffer, size_t elements) {
+    GGML_ASSERT(false && "byteswap function not implemented yet");
+    UNUSED(buffer);
+    UNUSED(elements);
+}
+
+static void ggml_byteswap_q4_0_4x8(void * restrict buffer, size_t elements) {
+    GGML_ASSERT(false && "byteswap function not implemented yet");
+    UNUSED(buffer);
+    UNUSED(elements);
+}
+
+static void ggml_byteswap_q4_0_8x8(void * restrict buffer, size_t elements) {
+    GGML_ASSERT(false && "byteswap function not implemented yet");
+    UNUSED(buffer);
+    UNUSED(elements);
+}
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index ab13669c5..bc47dd6cc 100644
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -15,6 +15,52 @@
 #include
 #include
 
+#if defined(__gnu_linux__)
+#include <endian.h>
+#else
+#define le64toh(x) (x)
+#define le32toh(x) (x)
+#define le16toh(x) (x)
+#endif
+
+// endianness conversion
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define convert_from_le(x)
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#include <type_traits>
+
+template <typename T, std::enable_if_t<sizeof(T) == 1, int> = 0>
+static inline void convert_from_le(T * /*value*/)
+{
+}
+
+template <typename T, std::enable_if_t<sizeof(T) == 2, int> = 0>
+static inline void convert_from_le(T * value) {
+    uint16_t temp;
+    memcpy(&temp, value, sizeof(uint16_t));
+    temp = le16toh(temp);
+    memcpy(value, &temp, sizeof(uint16_t));
+}
+
+template <typename T, std::enable_if_t<sizeof(T) == 4, int> = 0>
+static inline void convert_from_le(T * value) {
+    uint32_t temp;
+    memcpy(&temp, value, sizeof(uint32_t));
+    temp = le32toh(temp);
+    memcpy(value, &temp, sizeof(uint32_t));
+}
+
+template <typename T, std::enable_if_t<sizeof(T) == 8, int> = 0>
+static inline void convert_from_le(T * value) {
+    uint64_t temp;
+    memcpy(&temp, value, sizeof(uint64_t));
+    temp = le64toh(temp);
+    memcpy(value, &temp, sizeof(uint64_t));
+}
+#else
+#error Unexpected or undefined __BYTE_ORDER__
+#endif
+
 template <typename T>
 struct type_to_gguf_type;
 
@@ -223,7 +269,9 @@ struct gguf_reader {
 
     template <typename T>
     bool read(T & dst) const {
-        return fread(&dst, 1, sizeof(dst), file) == sizeof(dst);
+        auto res = fread(&dst, 1, sizeof(dst), file);
+        convert_from_le(&dst);
+        return res == sizeof(dst);
     }
 
     template <typename T>
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 75073bf61..be6e607b7 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -1024,6 +1024,14 @@ bool llama_model_loader::load_all_data(
             if (ggml_backend_buffer_is_host(cur->buffer)) {
                 file->seek(weight->offs, SEEK_SET);
                 file->read_raw(cur->data, n_size);
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+                auto byteswap = ggml_get_type_traits(cur->type)->byteswap;
+                if (byteswap != nullptr) {
+                    byteswap(cur->data, ggml_nelements(cur) / ggml_blck_size(cur->type));
+                }
+#endif
+
                 if (check_tensors) {
                     validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
                         return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
@@ -1052,6 +1060,14 @@ bool llama_model_loader::load_all_data(
                     read_buf.resize(n_size);
                     file->seek(weight->offs, SEEK_SET);
                     file->read_raw(read_buf.data(), n_size);
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+                    auto byteswap = ggml_get_type_traits(cur->type)->byteswap;
+                    if (byteswap != nullptr) {
+                        byteswap(read_buf.data(), ggml_nelements(cur) / ggml_blck_size(cur->type));
+                    }
+#endif
+
                     ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
                     if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
                         throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
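
Note on the remaining stubs: for many of the quantized types the implementation will follow the same pattern as ggml_byteswap_q4_k and ggml_byteswap_q6_k above, since the packed 8-bit quant bytes are endian-neutral and only the 16-bit scale fields need swapping (types that store 16-bit words, such as iq2_xxs/iq2_xs, would additionally need those words swapped). As a minimal sketch, not part of this patch and assuming block_q8_0 keeps its current layout of a single ggml_half scale d followed by QK8_0 int8 quants, the q8_0 stub could be completed like this:

// Sketch only: a possible ggml_byteswap_q8_0, mirroring the implemented
// ggml_byteswap_q6_k. Only the 16-bit scale is multi-byte; the int8 quants
// need no byte swapping.
static void ggml_byteswap_q8_0(void * restrict buffer, size_t elements) {
    block_q8_0 * data_ptr = (block_q8_0 *) buffer;
    for (size_t i = 0; i < elements; ++i) {
        convert_from_le16(&(data_ptr[i].d)); // swap only the fp16 scale of each block
    }
}

As in the loader hunks, elements is the number of blocks, i.e. ggml_nelements(cur) / ggml_blck_size(cur->type), not the number of bytes.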