From 9a4b0df5e8c25d628922c26e685ba7950ab1ffb8 Mon Sep 17 00:00:00 2001 From: Aleksei Nikiforov Date: Wed, 23 Oct 2024 12:21:27 +0200 Subject: [PATCH 01/14] Load little-endian models on s390x Introduce byteswap function for ggml data. Implement some of them. Currently tested on llama3.2. --- ggml/include/ggml.h | 2 + ggml/src/ggml.c | 265 +++++++++++++++++++++++++++++++++++++ ggml/src/gguf.cpp | 50 ++++++- src/llama-model-loader.cpp | 16 +++ 4 files changed, 332 insertions(+), 1 deletion(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 1198dc1fd..f524a34fc 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2144,6 +2144,7 @@ extern "C" { #endif typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); + typedef void (*ggml_byteswap_t) ( void * GGML_RESTRICT buffer, size_t elements); struct ggml_type_traits { const char * type_name; @@ -2153,6 +2154,7 @@ extern "C" { bool is_quantized; ggml_to_float_t to_float; ggml_from_float_t from_float_ref; + ggml_byteswap_t byteswap; }; GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index b1d0d4913..b4740a1ba 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -36,6 +36,44 @@ #include #endif +#if defined(__gnu_linux__) +#include +#else +#define le64toh(x) (x) +#define le32toh(x) (x) +#define le16toh(x) (x) +#endif + +// endianness conversion +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define convert_from_le16(x) +#define convert_from_le32(x) +#define convert_from_le64(x) +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +static inline void convert_from_le16(void * value) { + uint16_t temp; + memcpy(&temp, value, sizeof(uint16_t)); + temp = le16toh(temp); + memcpy(value, &temp, sizeof(uint16_t)); +} + +static inline void convert_from_le32(void * value) { + uint32_t temp; + 
memcpy(&temp, value, sizeof(uint32_t)); + temp = le32toh(temp); + memcpy(value, &temp, sizeof(uint32_t)); +} + +static inline void convert_from_le64(void * value) { + uint64_t temp; + memcpy(&temp, value, sizeof(uint64_t)); + temp = le64toh(temp); + memcpy(value, &temp, sizeof(uint64_t)); +} +#else +#error Unexpected or undefined __BYTE_ORDER__ +#endif + #if defined(__APPLE__) #include #include @@ -561,6 +599,34 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc); static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc); +static void ggml_byteswap_i16 (void * restrict buffer, size_t elements); +static void ggml_byteswap_i32 (void * restrict buffer, size_t elements); +static void ggml_byteswap_i64 (void * restrict buffer, size_t elements); +static void ggml_byteswap_q4_0 (void * restrict buffer, size_t elements); +static void ggml_byteswap_q4_1 (void * restrict buffer, size_t elements); +static void ggml_byteswap_q5_0 (void * restrict buffer, size_t elements); +static void ggml_byteswap_q5_1 (void * restrict buffer, size_t elements); +static void ggml_byteswap_q8_0 (void * restrict buffer, size_t elements); +static void ggml_byteswap_q8_1 (void * restrict buffer, size_t elements); +static void ggml_byteswap_q2_k (void * restrict buffer, size_t elements); +static void ggml_byteswap_q3_k (void * restrict buffer, size_t elements); +static void ggml_byteswap_q4_k (void * restrict buffer, size_t elements); +static void ggml_byteswap_q5_k (void * restrict buffer, size_t elements); +static void ggml_byteswap_q6_k (void * restrict buffer, size_t elements); +static void ggml_byteswap_iq2_xxs (void * restrict buffer, size_t elements); +static void ggml_byteswap_iq2_xs (void * restrict buffer, size_t 
elements); +static void ggml_byteswap_iq3_xxs (void * restrict buffer, size_t elements); +static void ggml_byteswap_iq3_s (void * restrict buffer, size_t elements); +static void ggml_byteswap_iq2_s (void * restrict buffer, size_t elements); +static void ggml_byteswap_iq1_s (void * restrict buffer, size_t elements); +static void ggml_byteswap_iq1_m (void * restrict buffer, size_t elements); +static void ggml_byteswap_iq4_nl (void * restrict buffer, size_t elements); +static void ggml_byteswap_iq4_xs (void * restrict buffer, size_t elements); +static void ggml_byteswap_q8_k (void * restrict buffer, size_t elements); +static void ggml_byteswap_q4_0_4x4(void * restrict buffer, size_t elements); +static void ggml_byteswap_q4_0_4x8(void * restrict buffer, size_t elements); +static void ggml_byteswap_q4_0_8x8(void * restrict buffer, size_t elements); + static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { [GGML_TYPE_I8] = { .type_name = "i8", @@ -573,30 +639,35 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .blck_size = 1, .type_size = sizeof(int16_t), .is_quantized = false, + .byteswap = ggml_byteswap_i16, }, [GGML_TYPE_I32] = { .type_name = "i32", .blck_size = 1, .type_size = sizeof(int32_t), .is_quantized = false, + .byteswap = ggml_byteswap_i32, }, [GGML_TYPE_I64] = { .type_name = "i64", .blck_size = 1, .type_size = sizeof(int64_t), .is_quantized = false, + .byteswap = ggml_byteswap_i64, }, [GGML_TYPE_F64] = { .type_name = "f64", .blck_size = 1, .type_size = sizeof(double), .is_quantized = false, + .byteswap = ggml_byteswap_i64, }, [GGML_TYPE_F32] = { .type_name = "f32", .blck_size = 1, .type_size = sizeof(float), .is_quantized = false, + .byteswap = ggml_byteswap_i32, }, [GGML_TYPE_F16] = { .type_name = "f16", @@ -605,6 +676,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = false, .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row, .from_float_ref = (ggml_from_float_t) 
ggml_fp32_to_fp16_row, + .byteswap = ggml_byteswap_i16, }, [GGML_TYPE_Q4_0] = { .type_name = "q4_0", @@ -613,6 +685,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q4_0, .from_float_ref = (ggml_from_float_t) quantize_row_q4_0_ref, + .byteswap = ggml_byteswap_q4_0, }, [GGML_TYPE_Q4_1] = { .type_name = "q4_1", @@ -621,6 +694,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q4_1, .from_float_ref = (ggml_from_float_t) quantize_row_q4_1_ref, + .byteswap = ggml_byteswap_q4_1, }, [4] = { // GGML_TYPE_Q4_2 .type_name = "DEPRECATED", @@ -641,6 +715,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q5_0, .from_float_ref = (ggml_from_float_t) quantize_row_q5_0_ref, + .byteswap = ggml_byteswap_q5_0, }, [GGML_TYPE_Q5_1] = { .type_name = "q5_1", @@ -649,6 +724,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q5_1, .from_float_ref = (ggml_from_float_t) quantize_row_q5_1_ref, + .byteswap = ggml_byteswap_q5_1, }, [GGML_TYPE_Q8_0] = { .type_name = "q8_0", @@ -657,6 +733,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q8_0, .from_float_ref = (ggml_from_float_t) quantize_row_q8_0_ref, + .byteswap = ggml_byteswap_q8_0, }, [GGML_TYPE_Q8_1] = { .type_name = "q8_1", @@ -664,6 +741,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .type_size = sizeof(block_q8_1), .is_quantized = true, .from_float_ref = (ggml_from_float_t) quantize_row_q8_1_ref, + .byteswap = ggml_byteswap_q8_1, }, [GGML_TYPE_Q2_K] = { .type_name = "q2_K", @@ -672,6 +750,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = 
{ .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q2_K, .from_float_ref = (ggml_from_float_t) quantize_row_q2_K_ref, + .byteswap = ggml_byteswap_q2_k, }, [GGML_TYPE_Q3_K] = { .type_name = "q3_K", @@ -680,6 +759,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q3_K, .from_float_ref = (ggml_from_float_t) quantize_row_q3_K_ref, + .byteswap = ggml_byteswap_q3_k, }, [GGML_TYPE_Q4_K] = { .type_name = "q4_K", @@ -688,6 +768,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q4_K, .from_float_ref = (ggml_from_float_t) quantize_row_q4_K_ref, + .byteswap = ggml_byteswap_q4_k, }, [GGML_TYPE_Q5_K] = { .type_name = "q5_K", @@ -696,6 +777,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q5_K, .from_float_ref = (ggml_from_float_t) quantize_row_q5_K_ref, + .byteswap = ggml_byteswap_q5_k, }, [GGML_TYPE_Q6_K] = { .type_name = "q6_K", @@ -704,6 +786,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q6_K, .from_float_ref = (ggml_from_float_t) quantize_row_q6_K_ref, + .byteswap = ggml_byteswap_q6_k, }, [GGML_TYPE_IQ2_XXS] = { .type_name = "iq2_xxs", @@ -712,6 +795,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq2_xxs, .from_float_ref = NULL, + .byteswap = ggml_byteswap_iq2_xxs, }, [GGML_TYPE_IQ2_XS] = { .type_name = "iq2_xs", @@ -720,6 +804,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq2_xs, .from_float_ref = NULL, + .byteswap = ggml_byteswap_iq2_xs, }, [GGML_TYPE_IQ3_XXS] = { .type_name = "iq3_xxs", @@ -728,6 
+813,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq3_xxs, .from_float_ref = (ggml_from_float_t)quantize_row_iq3_xxs_ref, + .byteswap = ggml_byteswap_iq3_xxs, }, [GGML_TYPE_IQ3_S] = { .type_name = "iq3_s", @@ -736,6 +822,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq3_s, .from_float_ref = (ggml_from_float_t)quantize_row_iq3_s_ref, + .byteswap = ggml_byteswap_iq3_s, }, [GGML_TYPE_IQ2_S] = { .type_name = "iq2_s", @@ -744,6 +831,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq2_s, .from_float_ref = (ggml_from_float_t)quantize_row_iq2_s_ref, + .byteswap = ggml_byteswap_iq2_s, }, [GGML_TYPE_IQ1_S] = { .type_name = "iq1_s", @@ -752,6 +840,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq1_s, .from_float_ref = NULL, + .byteswap = ggml_byteswap_iq1_s, }, [GGML_TYPE_IQ1_M] = { .type_name = "iq1_m", @@ -760,6 +849,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq1_m, .from_float_ref = NULL, + .byteswap = ggml_byteswap_iq1_m, }, [GGML_TYPE_IQ4_NL] = { .type_name = "iq4_nl", @@ -768,6 +858,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq4_nl, .from_float_ref = (ggml_from_float_t)quantize_row_iq4_nl_ref, + .byteswap = ggml_byteswap_iq4_nl, }, [GGML_TYPE_IQ4_XS] = { .type_name = "iq4_xs", @@ -776,12 +867,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq4_xs, .from_float_ref = (ggml_from_float_t)quantize_row_iq4_xs_ref, 
+ .byteswap = ggml_byteswap_iq4_xs, }, [GGML_TYPE_Q8_K] = { .type_name = "q8_K", .blck_size = QK_K, .type_size = sizeof(block_q8_K), .is_quantized = true, + .byteswap = ggml_byteswap_q8_k, }, [GGML_TYPE_BF16] = { .type_name = "bf16", @@ -790,24 +883,28 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = false, .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row, .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref, + .byteswap = ggml_byteswap_i16, }, [31] = { // GGML_TYPE_Q4_0_4_4 .type_name = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking", .blck_size = 0, .type_size = 0, .is_quantized = false, + .byteswap = ggml_byteswap_q4_0_4x4, }, [32] = { // GGML_TYPE_Q4_0_4_8 .type_name = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking", .blck_size = 0, .type_size = 0, .is_quantized = false, + .byteswap = ggml_byteswap_q4_0_4x8, }, [33] = { // GGML_TYPE_Q4_0_8_8 .type_name = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking", .blck_size = 0, .type_size = 0, .is_quantized = false, + .byteswap = ggml_byteswap_q4_0_8x8, }, [GGML_TYPE_TQ1_0] = { .type_name = "tq1_0", @@ -6499,3 +6596,171 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons if (p0->strict_cpu != p1->strict_cpu ) return false; return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; } + +static void ggml_byteswap_i16(void * restrict buffer, size_t elements) { + uint16_t *data_ptr = (uint16_t*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(data_ptr + i); + } +} + +static void ggml_byteswap_i32(void * restrict buffer, size_t elements) { + uint32_t *data_ptr = (uint32_t*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le32(data_ptr + i); + } +} + +static void ggml_byteswap_i64(void * restrict buffer, size_t elements) { + uint64_t *data_ptr = (uint64_t*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le64(data_ptr + i); + } +} + +static void 
ggml_byteswap_q4_0(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q4_1(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q5_0(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q5_1(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q8_0(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q8_1(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q2_k(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q3_k(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q4_k(void * restrict buffer, size_t elements) { + block_q4_K *data_ptr = (block_q4_K*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + convert_from_le16(&(data_ptr[i].dmin)); + } +} + +static void ggml_byteswap_q5_k(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q6_k(void * restrict buffer, size_t elements) { + block_q6_K *data_ptr = 
(block_q6_K*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + } +} + +static void ggml_byteswap_iq2_xxs(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_iq2_xs(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_iq3_xxs(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_iq3_s(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_iq2_s(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_iq1_s(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_iq1_m(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_iq4_nl(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_iq4_xs(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q8_k(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q4_0_4x4(void 
* restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q4_0_4x8(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} + +static void ggml_byteswap_q4_0_8x8(void * restrict buffer, size_t elements) { + GGML_ASSERT(false && "byteswap function not implemented yet"); + UNUSED(buffer); + UNUSED(elements); +} diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index ab13669c5..bc47dd6cc 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -15,6 +15,52 @@ #include #include +#if defined(__gnu_linux__) +#include +#else +#define le64toh(x) (x) +#define le32toh(x) (x) +#define le16toh(x) (x) +#endif + +// endianness conversion +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define convert_from_le(x) +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#include + +template = 0> +static inline void convert_from_le(T * /*value*/) +{ +} + +template = 0> +static inline void convert_from_le(T * value) { + uint16_t temp; + memcpy(&temp, value, sizeof(uint16_t)); + temp = le16toh(temp); + memcpy(value, &temp, sizeof(uint16_t)); +} + +template = 0> +static inline void convert_from_le(T * value) { + uint32_t temp; + memcpy(&temp, value, sizeof(uint32_t)); + temp = le32toh(temp); + memcpy(value, &temp, sizeof(uint32_t)); +} + +template = 0> +static inline void convert_from_le(T * value) { + uint64_t temp; + memcpy(&temp, value, sizeof(uint64_t)); + temp = le64toh(temp); + memcpy(value, &temp, sizeof(uint64_t)); +} +#else +#error Unexpected or undefined __BYTE_ORDER__ +#endif + template struct type_to_gguf_type; @@ -223,7 +269,9 @@ struct gguf_reader { template bool read(T & dst) const { - return fread(&dst, 1, sizeof(dst), file) == sizeof(dst); + auto res = fread(&dst, 1, sizeof(dst), file); + convert_from_le(&dst); + return res == sizeof(dst); } template diff --git 
a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 75073bf61..be6e607b7 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -1024,6 +1024,14 @@ bool llama_model_loader::load_all_data( if (ggml_backend_buffer_is_host(cur->buffer)) { file->seek(weight->offs, SEEK_SET); file->read_raw(cur->data, n_size); + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + auto byteswap = ggml_get_type_traits(cur->type)->byteswap; + if (byteswap != nullptr) { + byteswap(cur->data, ggml_nelements(cur) / ggml_blck_size(cur->type)); + } +#endif + if (check_tensors) { validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); @@ -1052,6 +1060,14 @@ bool llama_model_loader::load_all_data( read_buf.resize(n_size); file->seek(weight->offs, SEEK_SET); file->read_raw(read_buf.data(), n_size); + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + auto byteswap = ggml_get_type_traits(cur->type)->byteswap; + if (byteswap != nullptr) { + byteswap(read_buf.data(), ggml_nelements(cur) / ggml_blck_size(cur->type)); /* block count, not byte count: read_buf.size() is in bytes */ + } +#endif + ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); From a8757fec66a3f58ceb7c7a27a8f8781838fc3803 Mon Sep 17 00:00:00 2001 From: Aleksei Nikiforov Date: Thu, 24 Oct 2024 11:07:44 +0200 Subject: [PATCH 02/14] Implement most of remaining byteswap functions --- ggml/src/ggml.c | 152 ++++++++++++++++++++++++++++-------------------- 1 file changed, 90 insertions(+), 62 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index b4740a1ba..d89fd38b1 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -619,7 +619,6 @@ static void ggml_byteswap_iq3_xxs (void * restrict buffer, size_t elements); static void ggml_byteswap_iq3_s (void * restrict buffer, size_t elements); static void
ggml_byteswap_iq2_s (void * restrict buffer, size_t elements); static void ggml_byteswap_iq1_s (void * restrict buffer, size_t elements); -static void ggml_byteswap_iq1_m (void * restrict buffer, size_t elements); static void ggml_byteswap_iq4_nl (void * restrict buffer, size_t elements); static void ggml_byteswap_iq4_xs (void * restrict buffer, size_t elements); static void ggml_byteswap_q8_k (void * restrict buffer, size_t elements); @@ -849,7 +848,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq1_m, .from_float_ref = NULL, - .byteswap = ggml_byteswap_iq1_m, }, [GGML_TYPE_IQ4_NL] = { .type_name = "iq4_nl", @@ -6619,51 +6617,63 @@ static void ggml_byteswap_i64(void * restrict buffer, size_t elements) { } static void ggml_byteswap_q4_0(void * restrict buffer, size_t elements) { - GGML_ASSERT(false && "byteswap function not implemented yet"); - UNUSED(buffer); - UNUSED(elements); + block_q4_0 *data_ptr = (block_q4_0*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + } } static void ggml_byteswap_q4_1(void * restrict buffer, size_t elements) { - GGML_ASSERT(false && "byteswap function not implemented yet"); - UNUSED(buffer); - UNUSED(elements); + block_q4_1 *data_ptr = (block_q4_1*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + convert_from_le16(&(data_ptr[i].m)); + } } static void ggml_byteswap_q5_0(void * restrict buffer, size_t elements) { - GGML_ASSERT(false && "byteswap function not implemented yet"); - UNUSED(buffer); - UNUSED(elements); + block_q5_0 *data_ptr = (block_q5_0*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + } } static void ggml_byteswap_q5_1(void * restrict buffer, size_t elements) { - GGML_ASSERT(false && "byteswap function not implemented yet"); - UNUSED(buffer); - UNUSED(elements); + block_q5_1 *data_ptr = (block_q5_1*) 
buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + convert_from_le16(&(data_ptr[i].m)); + } } static void ggml_byteswap_q8_0(void * restrict buffer, size_t elements) { - GGML_ASSERT(false && "byteswap function not implemented yet"); - UNUSED(buffer); - UNUSED(elements); + block_q8_0 *data_ptr = (block_q8_0*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + } } static void ggml_byteswap_q8_1(void * restrict buffer, size_t elements) { - GGML_ASSERT(false && "byteswap function not implemented yet"); - UNUSED(buffer); - UNUSED(elements); + block_q8_1 *data_ptr = (block_q8_1*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + convert_from_le16(&(data_ptr[i].s)); + } } static void ggml_byteswap_q2_k(void * restrict buffer, size_t elements) { - GGML_ASSERT(false && "byteswap function not implemented yet"); - UNUSED(buffer); - UNUSED(elements); + block_q2_K *data_ptr = (block_q2_K*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + convert_from_le16(&(data_ptr[i].dmin)); + } } static void ggml_byteswap_q3_k(void * restrict buffer, size_t elements) { - GGML_ASSERT(false && "byteswap function not implemented yet"); - UNUSED(buffer); - UNUSED(elements); + block_q3_K *data_ptr = (block_q3_K*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + } } static void ggml_byteswap_q4_k(void * restrict buffer, size_t elements) { @@ -6675,9 +6685,11 @@ static void ggml_byteswap_q4_k(void * restrict buffer, size_t elements) { } static void ggml_byteswap_q5_k(void * restrict buffer, size_t elements) { - GGML_ASSERT(false && "byteswap function not implemented yet"); - UNUSED(buffer); - UNUSED(elements); + block_q5_K *data_ptr = (block_q5_K*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + convert_from_le16(&(data_ptr[i].dmin)); + } } static void 
ggml_byteswap_q6_k(void * restrict buffer, size_t elements) { @@ -6688,63 +6700,79 @@ static void ggml_byteswap_q6_k(void * restrict buffer, size_t elements) { } static void ggml_byteswap_iq2_xxs(void * restrict buffer, size_t elements) { - GGML_ASSERT(false && "byteswap function not implemented yet"); - UNUSED(buffer); - UNUSED(elements); + block_iq2_xxs *data_ptr = (block_iq2_xxs*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + for (size_t j = 0; j < QK_K/8; ++j) { + convert_from_le16(&(data_ptr[i].qs[j])); + } + } } static void ggml_byteswap_iq2_xs(void * restrict buffer, size_t elements) { - GGML_ASSERT(false && "byteswap function not implemented yet"); - UNUSED(buffer); - UNUSED(elements); + block_iq2_xs *data_ptr = (block_iq2_xs*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + for (size_t j = 0; j < QK_K/8; ++j) { + convert_from_le16(&(data_ptr[i].qs[j])); + } + } } static void ggml_byteswap_iq3_xxs(void * restrict buffer, size_t elements) { - GGML_ASSERT(false && "byteswap function not implemented yet"); - UNUSED(buffer); - UNUSED(elements); + block_iq3_xxs *data_ptr = (block_iq3_xxs*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + } } static void ggml_byteswap_iq3_s(void * restrict buffer, size_t elements) { - GGML_ASSERT(false && "byteswap function not implemented yet"); - UNUSED(buffer); - UNUSED(elements); + block_iq3_s *data_ptr = (block_iq3_s*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + } } static void ggml_byteswap_iq2_s(void * restrict buffer, size_t elements) { - GGML_ASSERT(false && "byteswap function not implemented yet"); - UNUSED(buffer); - UNUSED(elements); + block_iq2_s *data_ptr = (block_iq2_s*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + } } static void ggml_byteswap_iq1_s(void * restrict buffer, size_t elements) { - 
GGML_ASSERT(false && "byteswap function not implemented yet"); - UNUSED(buffer); - UNUSED(elements); -} - -static void ggml_byteswap_iq1_m(void * restrict buffer, size_t elements) { - GGML_ASSERT(false && "byteswap function not implemented yet"); - UNUSED(buffer); - UNUSED(elements); + block_iq1_s *data_ptr = (block_iq1_s*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + for (size_t j = 0; j < QK_K/32; ++j) { + convert_from_le16(&(data_ptr[i].qh[j])); + } + } } static void ggml_byteswap_iq4_nl(void * restrict buffer, size_t elements) { - GGML_ASSERT(false && "byteswap function not implemented yet"); - UNUSED(buffer); - UNUSED(elements); + block_iq4_nl *data_ptr = (block_iq4_nl*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + } } static void ggml_byteswap_iq4_xs(void * restrict buffer, size_t elements) { - GGML_ASSERT(false && "byteswap function not implemented yet"); - UNUSED(buffer); - UNUSED(elements); + block_iq4_xs *data_ptr = (block_iq4_xs*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + convert_from_le16(&(data_ptr[i].scales_h)); + } } static void ggml_byteswap_q8_k(void * restrict buffer, size_t elements) { - GGML_ASSERT(false && "byteswap function not implemented yet"); - UNUSED(buffer); - UNUSED(elements); + block_q8_K *data_ptr = (block_q8_K*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le32(&(data_ptr[i].d)); + for (size_t j = 0; j < QK_K/16; ++j) { + convert_from_le16(&(data_ptr[i].bsums[j])); + } + } } static void ggml_byteswap_q4_0_4x4(void * restrict buffer, size_t elements) { From 0682209c6663baa29b5deaccfbaed644288852d7 Mon Sep 17 00:00:00 2001 From: Aleksei Nikiforov Date: Thu, 9 Jan 2025 14:50:32 +0100 Subject: [PATCH 03/14] Implement byteswap for tq1_0 and tq2_0 --- ggml/src/ggml.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 
d89fd38b1..383246a72 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -625,6 +625,8 @@ static void ggml_byteswap_q8_k (void * restrict buffer, size_t elements); static void ggml_byteswap_q4_0_4x4(void * restrict buffer, size_t elements); static void ggml_byteswap_q4_0_4x8(void * restrict buffer, size_t elements); static void ggml_byteswap_q4_0_8x8(void * restrict buffer, size_t elements); +static void ggml_byteswap_tq1_0 (void * restrict buffer, size_t elements); +static void ggml_byteswap_tq2_0 (void * restrict buffer, size_t elements); static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { [GGML_TYPE_I8] = { @@ -911,6 +913,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_tq1_0, .from_float_ref = (ggml_from_float_t) quantize_row_tq1_0_ref, + .byteswap = ggml_byteswap_tq1_0, }, [GGML_TYPE_TQ2_0] = { .type_name = "tq2_0", @@ -919,6 +922,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_tq2_0, .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref, + .byteswap = ggml_byteswap_tq2_0, }, [36] = { // GGML_TYPE_IQ4_NL_4_4 .type_name = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking", @@ -6792,3 +6796,17 @@ static void ggml_byteswap_q4_0_8x8(void * restrict buffer, size_t elements) { UNUSED(buffer); UNUSED(elements); } + +static void ggml_byteswap_tq1_0(void * restrict buffer, size_t elements) { + block_tq1_0 *data_ptr = (block_tq1_0*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + } +} + +static void ggml_byteswap_tq2_0(void * restrict buffer, size_t elements) { + block_tq2_0 *data_ptr = (block_tq2_0*) buffer; + for (size_t i = 0; i < elements; ++i) { + convert_from_le16(&(data_ptr[i].d)); + } +} From 21f7ca2fb3bd8e60355e00cf5bca4e26b5a125da Mon Sep 17 00:00:00 2001 From: Aleksei Nikiforov Date: Thu, 9 Jan 
2025 14:59:22 +0100 Subject: [PATCH 04/14] Disable mmap on s390x Usually downloaded models are little-endian and byteswapping is needed. Byteswapping is not implemented for mmap model loading. --- src/llama-mmap.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index b716630a8..ce33b43b1 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -441,7 +441,8 @@ void * llama_mmap::addr() const { return pimpl->addr; } void llama_mmap::unmap_fragment(size_t first, size_t last) { pimpl->unmap_fragment(first, last); } -#if defined(_POSIX_MEMLOCK_RANGE) || defined(_WIN32) +// disable mmap on s390x while it usually loads little-endian models +#if (defined(_POSIX_MEMLOCK_RANGE) && !defined(__s390x__)) || defined(_WIN32) const bool llama_mmap::SUPPORTED = true; #else const bool llama_mmap::SUPPORTED = false; From 088f9a6c32cf48f87ad31d0a7de2fa9b41d21993 Mon Sep 17 00:00:00 2001 From: Aleksei Nikiforov Date: Fri, 10 Jan 2025 11:16:41 +0100 Subject: [PATCH 05/14] Get rid of additional memcpy calls --- ggml/src/ggml.c | 15 +++------------ ggml/src/gguf.cpp | 15 +++------------ 2 files changed, 6 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 383246a72..553ac5926 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -51,24 +51,15 @@ #define convert_from_le64(x) #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ static inline void convert_from_le16(void * value) { - uint16_t temp; - memcpy(&temp, value, sizeof(uint16_t)); - temp = le16toh(temp); - memcpy(value, &temp, sizeof(uint16_t)); + *((uint16_t*)value) = le16toh(*((uint16_t*)value)); } static inline void convert_from_le32(void * value) { - uint32_t temp; - memcpy(&temp, value, sizeof(uint32_t)); - temp = le32toh(temp); - memcpy(value, &temp, sizeof(uint32_t)); + *((uint32_t*)value) = le32toh(*((uint32_t*)value)); } static inline void convert_from_le64(void * value) { - uint64_t temp; - memcpy(&temp, value, sizeof(uint64_t)); - 
temp = le64toh(temp); - memcpy(value, &temp, sizeof(uint64_t)); + *((uint64_t*)value) = le64toh(*((uint64_t*)value)); } #else #error Unexpected or undefined __BYTE_ORDER__ diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index bc47dd6cc..55de3b765 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -36,26 +36,17 @@ static inline void convert_from_le(T * /*value*/) template = 0> static inline void convert_from_le(T * value) { - uint16_t temp; - memcpy(&temp, value, sizeof(uint16_t)); - temp = le16toh(temp); - memcpy(value, &temp, sizeof(uint16_t)); + *((uint16_t*)value) = le16toh(*((uint16_t*)value)); } template = 0> static inline void convert_from_le(T * value) { - uint32_t temp; - memcpy(&temp, value, sizeof(uint32_t)); - temp = le32toh(temp); - memcpy(value, &temp, sizeof(uint32_t)); + *((uint32_t*)value) = le32toh(*((uint32_t*)value)); } template = 0> static inline void convert_from_le(T * value) { - uint64_t temp; - memcpy(&temp, value, sizeof(uint64_t)); - temp = le64toh(temp); - memcpy(value, &temp, sizeof(uint64_t)); + *((uint64_t*)value) = le64toh(*((uint64_t*)value)); } #else #error Unexpected or undefined __BYTE_ORDER__ From 27c19c4eb767666c0cad818133a21cb0271c7c1f Mon Sep 17 00:00:00 2001 From: Aleksei Nikiforov Date: Fri, 10 Jan 2025 12:19:26 +0100 Subject: [PATCH 06/14] Implement write byteswap for tests --- ggml/src/gguf.cpp | 68 +++++++++++++++++++++++++++++++++ tests/test-gguf.cpp | 92 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 158 insertions(+), 2 deletions(-) diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index 55de3b765..f7136b3f0 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -1184,7 +1184,13 @@ struct gguf_writer { template void write(const T & val) const { for (size_t i = 0; i < sizeof(val); ++i) { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ buf.push_back(reinterpret_cast(&val)[i]); +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + buf.push_back(reinterpret_cast(&val)[sizeof(val) - i - 1]); +#else 
+#error Unexpected or undefined __BYTE_ORDER__ +#endif } } @@ -1233,6 +1239,7 @@ struct gguf_writer { } switch (kv.get_type()) { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ case GGUF_TYPE_UINT8: case GGUF_TYPE_INT8: case GGUF_TYPE_UINT16: @@ -1245,6 +1252,60 @@ struct gguf_writer { case GGUF_TYPE_FLOAT64: { write(kv.data); } break; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + case GGUF_TYPE_UINT8: { + for (size_t i = 0; i < ne; ++i) { + write(kv.get_val(i)); + } + } break; + case GGUF_TYPE_INT8: { + for (size_t i = 0; i < ne; ++i) { + write(kv.get_val(i)); + } + } break; + case GGUF_TYPE_UINT16: { + for (size_t i = 0; i < ne; ++i) { + write(kv.get_val(i)); + } + } break; + case GGUF_TYPE_INT16: { + for (size_t i = 0; i < ne; ++i) { + write(kv.get_val(i)); + } + } break; + case GGUF_TYPE_UINT32: { + for (size_t i = 0; i < ne; ++i) { + write(kv.get_val(i)); + } + } break; + case GGUF_TYPE_INT32: { + for (size_t i = 0; i < ne; ++i) { + write(kv.get_val(i)); + } + } break; + case GGUF_TYPE_FLOAT32: { + for (size_t i = 0; i < ne; ++i) { + write(kv.get_val(i)); + } + } break; + case GGUF_TYPE_UINT64: { + for (size_t i = 0; i < ne; ++i) { + write(kv.get_val(i)); + } + } break; + case GGUF_TYPE_INT64: { + for (size_t i = 0; i < ne; ++i) { + write(kv.get_val(i)); + } + } break; + case GGUF_TYPE_FLOAT64: { + for (size_t i = 0; i < ne; ++i) { + write(kv.get_val(i)); + } + } break; +#else +#error Unexpected or undefined __BYTE_ORDER__ +#endif case GGUF_TYPE_BOOL: { for (size_t i = 0; i < ne; ++i) { write(kv.get_val(i)); @@ -1295,6 +1356,13 @@ struct gguf_writer { memcpy(buf.data() + offset, info.t.data, nbytes); } +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + auto byteswap = ggml_get_type_traits(info.t.type)->byteswap; + if (byteswap != nullptr) { + byteswap(buf.data() + offset, ggml_nelements(&(info.t)) / ggml_blck_size(info.t.type)); + } +#endif + pad(alignment); } }; diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp index 6ed696328..1dd391f77 100644 --- 
a/tests/test-gguf.cpp +++ b/tests/test-gguf.cpp @@ -10,6 +10,43 @@ #include #include +#if defined(__gnu_linux__) +#include +#else +#define le64toh(x) (x) +#define le32toh(x) (x) +#define le16toh(x) (x) +#endif + +// endianness conversion +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define convert_to_le(x) +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#include + +template = 0> +static inline void convert_to_le(T * /*value*/) +{ +} + +template = 0> +static inline void convert_to_le(T * value) { + *((uint16_t*)value) = htole16(*((uint16_t*)value)); +} + +template = 0> +static inline void convert_to_le(T * value) { + *((uint32_t*)value) = htole32(*((uint32_t*)value)); +} + +template = 0> +static inline void convert_to_le(T * value) { + *((uint64_t*)value) = htole64(*((uint64_t*)value)); +} +#else +#error Unexpected or undefined __BYTE_ORDER__ +#endif + constexpr int offset_has_kv = 1000; constexpr int offset_has_tensors = 2000; constexpr int offset_has_data = 3000; @@ -146,7 +183,8 @@ static std::vector> get_kv_types(std:: } template -static void helper_write(FILE * file, const T & val) { +static void helper_write(FILE * file, T val) { + convert_to_le(&val); GGML_ASSERT(fwrite(&val, 1, sizeof(val), file) == sizeof(val)); } @@ -363,7 +401,9 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft helper_write(file, big_dim); } } else { - helper_write(file, shape.data(), n_dims*sizeof(int64_t)); + for (uint32_t j = 0; j < n_dims; ++j) { + helper_write(file, shape[j]); + } } { @@ -533,6 +573,33 @@ static bool handcrafted_check_kv(const gguf_context * gguf_ctx, const unsigned i continue; } +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + switch (type_arr) { + case GGUF_TYPE_UINT16: + case GGUF_TYPE_INT16: + for (size_t j = 0; j < arr_n; ++j) { + convert_to_le((uint16_t*)(data8 + j * 2)); + } + break; + + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: + case GGUF_TYPE_FLOAT32: + for (size_t j = 0; j < arr_n; ++j) { + 
convert_to_le((uint32_t*)(data8 + j * 4)); + } + break; + + case GGUF_TYPE_UINT64: + case GGUF_TYPE_INT64: + case GGUF_TYPE_FLOAT64: + for (size_t j = 0; j < arr_n; ++j) { + convert_to_le((uint64_t*)(data8 + j * 8)); + } + break; + } +#endif + if (!std::equal(data8, data8 + arr_n*type_size, data_gguf)) { ok = false; } @@ -548,6 +615,27 @@ static bool handcrafted_check_kv(const gguf_context * gguf_ctx, const unsigned i continue; } +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + switch (type) { + case GGUF_TYPE_UINT16: + case GGUF_TYPE_INT16: + convert_to_le((uint16_t*)(data8)); + break; + + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: + case GGUF_TYPE_FLOAT32: + convert_to_le((uint32_t*)(data8)); + break; + + case GGUF_TYPE_UINT64: + case GGUF_TYPE_INT64: + case GGUF_TYPE_FLOAT64: + convert_to_le((uint64_t*)(data8)); + break; + } +#endif + if (!std::equal(data8, data8 + gguf_type_size(type), data_gguf)) { ok = false; } From fa8fc317f3adcf7c441cd54fcc7e2105c308497d Mon Sep 17 00:00:00 2001 From: Aleksei Nikiforov Date: Fri, 10 Jan 2025 18:19:47 +0100 Subject: [PATCH 07/14] Fix unicode flags conversion from and to uint16_t Bitfields are allocated in different order on s390x --- src/unicode.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/src/unicode.h b/src/unicode.h index c27098df7..ad55959c0 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -15,6 +15,10 @@ struct unicode_cpt_flags { SYMBOL = 0x0040, // regex: \p{S} CONTROL = 0x0080, // regex: \p{C} MASK_CATEGORIES = 0x00FF, + WHITESPACE = 0x0100, + LOWERCASE = 0x0200, + UPPERCASE = 0x0400, + NFD = 0x0800, }; // codepoint type @@ -34,11 +38,49 @@ struct unicode_cpt_flags { // decode from uint16 inline unicode_cpt_flags(const uint16_t flags = 0) { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ *reinterpret_cast(this) = flags; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + is_undefined = (flags & UNDEFINED) ? 1 : 0; + is_number = (flags & NUMBER) ? 
1 : 0; + is_letter = (flags & LETTER) ? 1 : 0; + is_separator = (flags & SEPARATOR) ? 1 : 0; + is_accent_mark = (flags & ACCENT_MARK) ? 1 : 0; + is_punctuation = (flags & PUNCTUATION) ? 1 : 0; + is_symbol = (flags & SYMBOL) ? 1 : 0; + is_control = (flags & CONTROL) ? 1 : 0; + is_whitespace = (flags & WHITESPACE) ? 1 : 0; + is_lowercase = (flags & LOWERCASE) ? 1 : 0; + is_uppercase = (flags & UPPERCASE) ? 1 : 0; + is_nfd = (flags & NFD) ? 1 : 0; +#else +#error Unexpected or undefined __BYTE_ORDER__ +#endif } inline uint16_t as_uint() const { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ return *reinterpret_cast(this); +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + uint16_t result = + is_undefined * UNDEFINED + + is_number * NUMBER + + is_letter * LETTER + + is_separator * SEPARATOR + + is_accent_mark * ACCENT_MARK + + is_punctuation * PUNCTUATION + + is_symbol * SYMBOL + + is_control * CONTROL + + is_whitespace * WHITESPACE + + is_lowercase * LOWERCASE + + is_uppercase * UPPERCASE + + is_nfd * NFD + ; + + return result; +#else +#error Unexpected or undefined __BYTE_ORDER__ +#endif } inline uint16_t category_flag() const { From 1d015486274f2e083753eb4bfc406e872485c8d1 Mon Sep 17 00:00:00 2001 From: Aleksei Nikiforov Date: Tue, 14 Jan 2025 11:11:46 +0100 Subject: [PATCH 08/14] Fix unused variable warnings --- ggml/src/ggml.c | 6 +++--- ggml/src/gguf.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 553ac5926..d017bb4f9 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -46,9 +46,9 @@ // endianness conversion #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define convert_from_le16(x) -#define convert_from_le32(x) -#define convert_from_le64(x) +#define convert_from_le16(x) UNUSED(x) +#define convert_from_le32(x) UNUSED(x) +#define convert_from_le64(x) UNUSED(x) #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ static inline void convert_from_le16(void * value) { *((uint16_t*)value) = 
le16toh(*((uint16_t*)value)); diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index f7136b3f0..5dd47c067 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -25,7 +25,7 @@ // endianness conversion #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define convert_from_le(x) +#define convert_from_le(x) (void)(x) #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #include From a9402ba2b6553b25c7abaac416598673692987d9 Mon Sep 17 00:00:00 2001 From: Aleksei Nikiforov Date: Wed, 15 Jan 2025 11:48:26 +0100 Subject: [PATCH 09/14] Move conversion functions to common header Add ggml_ prefix to their names. --- ggml/src/ggml-impl.h | 65 ++++++++++++++++++++++++++++ ggml/src/ggml.c | 101 +++++++++++++++---------------------------- ggml/src/gguf.cpp | 39 +---------------- tests/test-gguf.cpp | 51 +++------------------- 4 files changed, 109 insertions(+), 147 deletions(-) diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index eab017889..1a1056f7c 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -28,6 +28,14 @@ #include #endif +#if defined(__gnu_linux__) +#include +#else // defined(__gnu_linux__) +#define le64toh(x) (x) +#define le32toh(x) (x) +#define le16toh(x) (x) +#endif // defined(__gnu_linux__) + #ifdef __cplusplus extern "C" { #endif @@ -553,6 +561,31 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x) #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x) +// endianness conversion +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define ggml_convert_from_le16(x) GGML_UNUSED(x) +#define ggml_convert_from_le32(x) GGML_UNUSED(x) +#define ggml_convert_from_le64(x) GGML_UNUSED(x) +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +static inline void ggml_convert_from_le16(void * value) { + *((uint16_t*)value) = le16toh(*((uint16_t*)value)); +} + +static inline void ggml_convert_from_le32(void * value) { + *((uint32_t*)value) = le32toh(*((uint32_t*)value)); +} + +static inline void 
ggml_convert_from_le64(void * value) { + *((uint64_t*)value) = le64toh(*((uint64_t*)value)); +} +#else // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#error Unexpected or undefined __BYTE_ORDER__ +#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + +#define ggml_convert_to_le16(x) ggml_convert_from_le16(x) +#define ggml_convert_to_le32(x) ggml_convert_from_le32(x) +#define ggml_convert_to_le64(x) ggml_convert_from_le64(x) + #ifdef __cplusplus } #endif @@ -560,6 +593,38 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { #ifdef __cplusplus #include +// endianness conversion +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define ggml_convert_from_le(x) GGML_UNUSED(x) +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#include + +template = 0> +static inline void ggml_convert_from_le(T * value) +{ + GGML_UNUSED(value); +} + +template = 0> +static inline void ggml_convert_from_le(T * value) { + ggml_convert_from_le16(value); +} + +template = 0> +static inline void ggml_convert_from_le(T * value) { + ggml_convert_from_le32(value); +} + +template = 0> +static inline void ggml_convert_from_le(T * value) { + ggml_convert_from_le64(value); +} +#else // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#error Unexpected or undefined __BYTE_ORDER__ +#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + +#define ggml_convert_to_le(x) ggml_convert_from_le(x) + // expose GGUF internals for test code GGML_API size_t gguf_type_size(enum gguf_type type); GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index d017bb4f9..02ee731f3 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -36,35 +36,6 @@ #include #endif -#if defined(__gnu_linux__) -#include -#else -#define le64toh(x) (x) -#define le32toh(x) (x) -#define le16toh(x) (x) -#endif - -// endianness conversion -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define convert_from_le16(x) UNUSED(x) -#define convert_from_le32(x) UNUSED(x) 
-#define convert_from_le64(x) UNUSED(x) -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -static inline void convert_from_le16(void * value) { - *((uint16_t*)value) = le16toh(*((uint16_t*)value)); -} - -static inline void convert_from_le32(void * value) { - *((uint32_t*)value) = le32toh(*((uint32_t*)value)); -} - -static inline void convert_from_le64(void * value) { - *((uint64_t*)value) = le64toh(*((uint64_t*)value)); -} -#else -#error Unexpected or undefined __BYTE_ORDER__ -#endif - #if defined(__APPLE__) #include #include @@ -6593,113 +6564,113 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons static void ggml_byteswap_i16(void * restrict buffer, size_t elements) { uint16_t *data_ptr = (uint16_t*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le16(data_ptr + i); + ggml_convert_from_le16(data_ptr + i); } } static void ggml_byteswap_i32(void * restrict buffer, size_t elements) { uint32_t *data_ptr = (uint32_t*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le32(data_ptr + i); + ggml_convert_from_le32(data_ptr + i); } } static void ggml_byteswap_i64(void * restrict buffer, size_t elements) { uint64_t *data_ptr = (uint64_t*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le64(data_ptr + i); + ggml_convert_from_le64(data_ptr + i); } } static void ggml_byteswap_q4_0(void * restrict buffer, size_t elements) { block_q4_0 *data_ptr = (block_q4_0*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le16(&(data_ptr[i].d)); + ggml_convert_from_le16(&(data_ptr[i].d)); } } static void ggml_byteswap_q4_1(void * restrict buffer, size_t elements) { block_q4_1 *data_ptr = (block_q4_1*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le16(&(data_ptr[i].d)); - convert_from_le16(&(data_ptr[i].m)); + ggml_convert_from_le16(&(data_ptr[i].d)); + ggml_convert_from_le16(&(data_ptr[i].m)); } } static void ggml_byteswap_q5_0(void * restrict buffer, size_t elements) { block_q5_0 
*data_ptr = (block_q5_0*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le16(&(data_ptr[i].d)); + ggml_convert_from_le16(&(data_ptr[i].d)); } } static void ggml_byteswap_q5_1(void * restrict buffer, size_t elements) { block_q5_1 *data_ptr = (block_q5_1*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le16(&(data_ptr[i].d)); - convert_from_le16(&(data_ptr[i].m)); + ggml_convert_from_le16(&(data_ptr[i].d)); + ggml_convert_from_le16(&(data_ptr[i].m)); } } static void ggml_byteswap_q8_0(void * restrict buffer, size_t elements) { block_q8_0 *data_ptr = (block_q8_0*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le16(&(data_ptr[i].d)); + ggml_convert_from_le16(&(data_ptr[i].d)); } } static void ggml_byteswap_q8_1(void * restrict buffer, size_t elements) { block_q8_1 *data_ptr = (block_q8_1*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le16(&(data_ptr[i].d)); - convert_from_le16(&(data_ptr[i].s)); + ggml_convert_from_le16(&(data_ptr[i].d)); + ggml_convert_from_le16(&(data_ptr[i].s)); } } static void ggml_byteswap_q2_k(void * restrict buffer, size_t elements) { block_q2_K *data_ptr = (block_q2_K*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le16(&(data_ptr[i].d)); - convert_from_le16(&(data_ptr[i].dmin)); + ggml_convert_from_le16(&(data_ptr[i].d)); + ggml_convert_from_le16(&(data_ptr[i].dmin)); } } static void ggml_byteswap_q3_k(void * restrict buffer, size_t elements) { block_q3_K *data_ptr = (block_q3_K*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le16(&(data_ptr[i].d)); + ggml_convert_from_le16(&(data_ptr[i].d)); } } static void ggml_byteswap_q4_k(void * restrict buffer, size_t elements) { block_q4_K *data_ptr = (block_q4_K*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le16(&(data_ptr[i].d)); - convert_from_le16(&(data_ptr[i].dmin)); + ggml_convert_from_le16(&(data_ptr[i].d)); + ggml_convert_from_le16(&(data_ptr[i].dmin)); } } static void 
ggml_byteswap_q5_k(void * restrict buffer, size_t elements) { block_q5_K *data_ptr = (block_q5_K*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le16(&(data_ptr[i].d)); - convert_from_le16(&(data_ptr[i].dmin)); + ggml_convert_from_le16(&(data_ptr[i].d)); + ggml_convert_from_le16(&(data_ptr[i].dmin)); } } static void ggml_byteswap_q6_k(void * restrict buffer, size_t elements) { block_q6_K *data_ptr = (block_q6_K*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le16(&(data_ptr[i].d)); + ggml_convert_from_le16(&(data_ptr[i].d)); } } static void ggml_byteswap_iq2_xxs(void * restrict buffer, size_t elements) { block_iq2_xxs *data_ptr = (block_iq2_xxs*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le16(&(data_ptr[i].d)); + ggml_convert_from_le16(&(data_ptr[i].d)); for (size_t j = 0; j < QK_K/8; ++j) { - convert_from_le16(&(data_ptr[i].qs[j])); + ggml_convert_from_le16(&(data_ptr[i].qs[j])); } } } @@ -6707,9 +6678,9 @@ static void ggml_byteswap_iq2_xxs(void * restrict buffer, size_t elements) { static void ggml_byteswap_iq2_xs(void * restrict buffer, size_t elements) { block_iq2_xs *data_ptr = (block_iq2_xs*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le16(&(data_ptr[i].d)); + ggml_convert_from_le16(&(data_ptr[i].d)); for (size_t j = 0; j < QK_K/8; ++j) { - convert_from_le16(&(data_ptr[i].qs[j])); + ggml_convert_from_le16(&(data_ptr[i].qs[j])); } } } @@ -6717,30 +6688,30 @@ static void ggml_byteswap_iq2_xs(void * restrict buffer, size_t elements) { static void ggml_byteswap_iq3_xxs(void * restrict buffer, size_t elements) { block_iq3_xxs *data_ptr = (block_iq3_xxs*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le16(&(data_ptr[i].d)); + ggml_convert_from_le16(&(data_ptr[i].d)); } } static void ggml_byteswap_iq3_s(void * restrict buffer, size_t elements) { block_iq3_s *data_ptr = (block_iq3_s*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le16(&(data_ptr[i].d)); + 
ggml_convert_from_le16(&(data_ptr[i].d)); } } static void ggml_byteswap_iq2_s(void * restrict buffer, size_t elements) { block_iq2_s *data_ptr = (block_iq2_s*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le16(&(data_ptr[i].d)); + ggml_convert_from_le16(&(data_ptr[i].d)); } } static void ggml_byteswap_iq1_s(void * restrict buffer, size_t elements) { block_iq1_s *data_ptr = (block_iq1_s*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le16(&(data_ptr[i].d)); + ggml_convert_from_le16(&(data_ptr[i].d)); for (size_t j = 0; j < QK_K/32; ++j) { - convert_from_le16(&(data_ptr[i].qh[j])); + ggml_convert_from_le16(&(data_ptr[i].qh[j])); } } } @@ -6748,24 +6719,24 @@ static void ggml_byteswap_iq1_s(void * restrict buffer, size_t elements) { static void ggml_byteswap_iq4_nl(void * restrict buffer, size_t elements) { block_iq4_nl *data_ptr = (block_iq4_nl*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le16(&(data_ptr[i].d)); + ggml_convert_from_le16(&(data_ptr[i].d)); } } static void ggml_byteswap_iq4_xs(void * restrict buffer, size_t elements) { block_iq4_xs *data_ptr = (block_iq4_xs*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le16(&(data_ptr[i].d)); - convert_from_le16(&(data_ptr[i].scales_h)); + ggml_convert_from_le16(&(data_ptr[i].d)); + ggml_convert_from_le16(&(data_ptr[i].scales_h)); } } static void ggml_byteswap_q8_k(void * restrict buffer, size_t elements) { block_q8_K *data_ptr = (block_q8_K*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le32(&(data_ptr[i].d)); + ggml_convert_from_le32(&(data_ptr[i].d)); for (size_t j = 0; j < QK_K/16; ++j) { - convert_from_le16(&(data_ptr[i].bsums[j])); + ggml_convert_from_le16(&(data_ptr[i].bsums[j])); } } } @@ -6791,13 +6762,13 @@ static void ggml_byteswap_q4_0_8x8(void * restrict buffer, size_t elements) { static void ggml_byteswap_tq1_0(void * restrict buffer, size_t elements) { block_tq1_0 *data_ptr = (block_tq1_0*) buffer; for (size_t i = 
0; i < elements; ++i) { - convert_from_le16(&(data_ptr[i].d)); + ggml_convert_from_le16(&(data_ptr[i].d)); } } static void ggml_byteswap_tq2_0(void * restrict buffer, size_t elements) { block_tq2_0 *data_ptr = (block_tq2_0*) buffer; for (size_t i = 0; i < elements; ++i) { - convert_from_le16(&(data_ptr[i].d)); + ggml_convert_from_le16(&(data_ptr[i].d)); } } diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index 5dd47c067..a3d99ab16 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -15,43 +15,6 @@ #include #include -#if defined(__gnu_linux__) -#include -#else -#define le64toh(x) (x) -#define le32toh(x) (x) -#define le16toh(x) (x) -#endif - -// endianness conversion -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define convert_from_le(x) (void)(x) -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -#include - -template = 0> -static inline void convert_from_le(T * /*value*/) -{ -} - -template = 0> -static inline void convert_from_le(T * value) { - *((uint16_t*)value) = le16toh(*((uint16_t*)value)); -} - -template = 0> -static inline void convert_from_le(T * value) { - *((uint32_t*)value) = le32toh(*((uint32_t*)value)); -} - -template = 0> -static inline void convert_from_le(T * value) { - *((uint64_t*)value) = le64toh(*((uint64_t*)value)); -} -#else -#error Unexpected or undefined __BYTE_ORDER__ -#endif - template struct type_to_gguf_type; @@ -261,7 +224,7 @@ struct gguf_reader { template bool read(T & dst) const { auto res = fread(&dst, 1, sizeof(dst), file); - convert_from_le(&dst); + ggml_convert_from_le(&dst); return res == sizeof(dst); } diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp index 1dd391f77..86a0cdb44 100644 --- a/tests/test-gguf.cpp +++ b/tests/test-gguf.cpp @@ -10,43 +10,6 @@ #include #include -#if defined(__gnu_linux__) -#include -#else -#define le64toh(x) (x) -#define le32toh(x) (x) -#define le16toh(x) (x) -#endif - -// endianness conversion -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define convert_to_le(x) -#elif __BYTE_ORDER__ 
== __ORDER_BIG_ENDIAN__ -#include - -template = 0> -static inline void convert_to_le(T * /*value*/) -{ -} - -template = 0> -static inline void convert_to_le(T * value) { - *((uint16_t*)value) = htole16(*((uint16_t*)value)); -} - -template = 0> -static inline void convert_to_le(T * value) { - *((uint32_t*)value) = htole32(*((uint32_t*)value)); -} - -template = 0> -static inline void convert_to_le(T * value) { - *((uint64_t*)value) = htole64(*((uint64_t*)value)); -} -#else -#error Unexpected or undefined __BYTE_ORDER__ -#endif - constexpr int offset_has_kv = 1000; constexpr int offset_has_tensors = 2000; constexpr int offset_has_data = 3000; @@ -184,7 +147,7 @@ static std::vector> get_kv_types(std:: template static void helper_write(FILE * file, T val) { - convert_to_le(&val); + ggml_convert_to_le(&val); GGML_ASSERT(fwrite(&val, 1, sizeof(val), file) == sizeof(val)); } @@ -578,7 +541,7 @@ static bool handcrafted_check_kv(const gguf_context * gguf_ctx, const unsigned i case GGUF_TYPE_UINT16: case GGUF_TYPE_INT16: for (size_t j = 0; j < arr_n; ++j) { - convert_to_le((uint16_t*)(data8 + j * 2)); + ggml_convert_to_le((uint16_t*)(data8 + j * 2)); } break; @@ -586,7 +549,7 @@ static bool handcrafted_check_kv(const gguf_context * gguf_ctx, const unsigned i case GGUF_TYPE_INT32: case GGUF_TYPE_FLOAT32: for (size_t j = 0; j < arr_n; ++j) { - convert_to_le((uint32_t*)(data8 + j * 4)); + ggml_convert_to_le((uint32_t*)(data8 + j * 4)); } break; @@ -594,7 +557,7 @@ static bool handcrafted_check_kv(const gguf_context * gguf_ctx, const unsigned i case GGUF_TYPE_INT64: case GGUF_TYPE_FLOAT64: for (size_t j = 0; j < arr_n; ++j) { - convert_to_le((uint64_t*)(data8 + j * 8)); + ggml_convert_to_le((uint64_t*)(data8 + j * 8)); } break; } @@ -619,19 +582,19 @@ static bool handcrafted_check_kv(const gguf_context * gguf_ctx, const unsigned i switch (type) { case GGUF_TYPE_UINT16: case GGUF_TYPE_INT16: - convert_to_le((uint16_t*)(data8)); + ggml_convert_to_le((uint16_t*)(data8)); break; case 
GGUF_TYPE_UINT32: case GGUF_TYPE_INT32: case GGUF_TYPE_FLOAT32: - convert_to_le((uint32_t*)(data8)); + ggml_convert_to_le((uint32_t*)(data8)); break; case GGUF_TYPE_UINT64: case GGUF_TYPE_INT64: case GGUF_TYPE_FLOAT64: - convert_to_le((uint64_t*)(data8)); + ggml_convert_to_le((uint64_t*)(data8)); break; } #endif From 1d06f0f115aeb5619674b6a442705558f7bc8944 Mon Sep 17 00:00:00 2001 From: Aleksei Nikiforov Date: Wed, 15 Jan 2025 11:53:02 +0100 Subject: [PATCH 10/14] Make assert messages unique --- ggml/src/ggml.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 02ee731f3..725f35695 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -6742,19 +6742,19 @@ static void ggml_byteswap_q8_k(void * restrict buffer, size_t elements) { } static void ggml_byteswap_q4_0_4x4(void * restrict buffer, size_t elements) { - GGML_ASSERT(false && "byteswap function not implemented yet"); + GGML_ASSERT(false && "function ggml_byteswap_q4_0_4x4 is not implemented yet"); UNUSED(buffer); UNUSED(elements); } static void ggml_byteswap_q4_0_4x8(void * restrict buffer, size_t elements) { - GGML_ASSERT(false && "byteswap function not implemented yet"); + GGML_ASSERT(false && "function ggml_byteswap_q4_0_4x8 is not implemented yet"); UNUSED(buffer); UNUSED(elements); } static void ggml_byteswap_q4_0_8x8(void * restrict buffer, size_t elements) { - GGML_ASSERT(false && "byteswap function not implemented yet"); + GGML_ASSERT(false && "function ggml_byteswap_q4_0_8x8 is not implemented yet"); UNUSED(buffer); UNUSED(elements); } From cfb2cd1ee90ec39db5f9c35ae93a278faaf10d6f Mon Sep 17 00:00:00 2001 From: Aleksei Nikiforov Date: Wed, 15 Jan 2025 11:54:37 +0100 Subject: [PATCH 11/14] Update alignment of byteswap function type definition --- ggml/include/ggml.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index f524a34fc..a1a8c9622 100644 --- a/ggml/include/ggml.h +++ 
b/ggml/include/ggml.h @@ -2144,7 +2144,7 @@ extern "C" { #endif typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); - typedef void (*ggml_byteswap_t) ( void * GGML_RESTRICT buffer, size_t elements); + typedef void (*ggml_byteswap_t) ( void * GGML_RESTRICT buffer, size_t elements); struct ggml_type_traits { const char * type_name; From 3c22daa66ed73bc80d7e6db3346ac10ccb6db36d Mon Sep 17 00:00:00 2001 From: Aleksei Nikiforov Date: Wed, 15 Jan 2025 11:59:30 +0100 Subject: [PATCH 12/14] Update preprocessor directives according to guidelines --- ggml/src/gguf.cpp | 10 +++++----- src/llama-mmap.cpp | 4 ++-- src/llama-model-loader.cpp | 4 ++-- src/unicode.h | 8 ++++---- tests/test-gguf.cpp | 4 ++-- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index a3d99ab16..602995c57 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -1151,9 +1151,9 @@ struct gguf_writer { buf.push_back(reinterpret_cast(&val)[i]); #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ buf.push_back(reinterpret_cast(&val)[sizeof(val) - i - 1]); -#else +#else // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #error Unexpected or undefined __BYTE_ORDER__ -#endif +#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ } } @@ -1266,9 +1266,9 @@ struct gguf_writer { write(kv.get_val(i)); } } break; -#else +#else // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #error Unexpected or undefined __BYTE_ORDER__ -#endif +#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ case GGUF_TYPE_BOOL: { for (size_t i = 0; i < ne; ++i) { write(kv.get_val(i)); @@ -1324,7 +1324,7 @@ struct gguf_writer { if (byteswap != nullptr) { byteswap(buf.data() + offset, ggml_nelements(&(info.t)) / ggml_blck_size(info.t.type)); } -#endif +#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ pad(alignment); } diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 
ce33b43b1..20fe2277f 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -444,9 +444,9 @@ void llama_mmap::unmap_fragment(size_t first, size_t last) { pimpl->unmap_fragme // disable mmap on s390x while it usually loads little-endian models #if (defined(_POSIX_MEMLOCK_RANGE) && !defined(__s390x__)) || defined(_WIN32) const bool llama_mmap::SUPPORTED = true; -#else +#else // (defined(_POSIX_MEMLOCK_RANGE) && !defined(__s390x__)) || defined(_WIN32) const bool llama_mmap::SUPPORTED = false; -#endif +#endif // (defined(_POSIX_MEMLOCK_RANGE) && !defined(__s390x__)) || defined(_WIN32) // llama_mlock diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index be6e607b7..cc98896fc 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -1030,7 +1030,7 @@ bool llama_model_loader::load_all_data( if (byteswap != nullptr) { byteswap(cur->data, ggml_nelements(cur) / ggml_blck_size(cur->type)); } -#endif +#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ if (check_tensors) { validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { @@ -1066,7 +1066,7 @@ bool llama_model_loader::load_all_data( if (byteswap != nullptr) { byteswap(read_buf.data(), read_buf.size() / ggml_blck_size(cur->type)); } -#endif +#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { diff --git a/src/unicode.h b/src/unicode.h index ad55959c0..87b2ef7ca 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -53,9 +53,9 @@ struct unicode_cpt_flags { is_lowercase = (flags & LOWERCASE) ? 1 : 0; is_uppercase = (flags & UPPERCASE) ? 1 : 0; is_nfd = (flags & NFD) ? 
1 : 0; -#else +#else // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #error Unexpected or undefined __BYTE_ORDER__ -#endif +#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ } inline uint16_t as_uint() const { @@ -78,9 +78,9 @@ struct unicode_cpt_flags { ; return result; -#else +#else // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #error Unexpected or undefined __BYTE_ORDER__ -#endif +#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ } inline uint16_t category_flag() const { diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp index 86a0cdb44..dc87f5f0a 100644 --- a/tests/test-gguf.cpp +++ b/tests/test-gguf.cpp @@ -561,7 +561,7 @@ static bool handcrafted_check_kv(const gguf_context * gguf_ctx, const unsigned i } break; } -#endif +#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ if (!std::equal(data8, data8 + arr_n*type_size, data_gguf)) { ok = false; @@ -597,7 +597,7 @@ static bool handcrafted_check_kv(const gguf_context * gguf_ctx, const unsigned i ggml_convert_to_le((uint64_t*)(data8)); break; } -#endif +#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ if (!std::equal(data8, data8 + gguf_type_size(type), data_gguf)) { ok = false; From f4217a81fc181e43e42d3c019df97906be31663b Mon Sep 17 00:00:00 2001 From: Aleksei Nikiforov Date: Tue, 21 Jan 2025 12:15:34 +0100 Subject: [PATCH 13/14] Disable mmap on s390x in llama-quant too --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index fb7982655..c91af4cbd 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -514,7 +514,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // mmap consistently increases speed Linux, and also increases speed on Windows with // hot cache. It may cause a slowdown on macOS, possibly related to free memory. 
-#if defined(__linux__) || defined(_WIN32) +#if (defined(__linux__) && !defined(__s390x__)) || defined(_WIN32) constexpr bool use_mmap = true; #else constexpr bool use_mmap = false; From a9db9b0048f3c3c9c191743596345b3175215187 Mon Sep 17 00:00:00 2001 From: Aleksei Nikiforov Date: Tue, 21 Jan 2025 12:16:32 +0100 Subject: [PATCH 14/14] Implement --no-byteswap argument to disable byteswapping on big endian platform --- common/arg.cpp | 7 +++++++ common/common.cpp | 13 +++++++----- common/common.h | 1 + .../convert-llama2c-to-ggml.cpp | 5 +++-- examples/export-lora/export-lora.cpp | 5 +++-- examples/gguf-hash/gguf-hash.cpp | 5 +++-- examples/gguf-split/gguf-split.cpp | 10 ++++++---- examples/gguf/gguf.cpp | 10 ++++++---- examples/llava/clip.cpp | 5 +++-- ggml/include/gguf.h | 2 ++ ggml/src/gguf.cpp | 17 ++++++++++++---- include/llama.h | 4 +++- src/llama-adapter.cpp | 11 +++++----- src/llama-model-loader.cpp | 20 +++++++++++-------- src/llama-model-loader.h | 4 +++- src/llama-model.cpp | 1 + src/llama-quant.cpp | 2 +- src/llama.cpp | 2 +- tests/test-gguf.cpp | 10 ++++++---- 19 files changed, 88 insertions(+), 46 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index a6226a34b..ad36a4572 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1438,6 +1438,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.use_mmap = false; } ).set_env("LLAMA_ARG_NO_MMAP")); + add_opt(common_arg( + {"--no-byteswap"}, + "don't byteswap model data on big endian systems (use if model is byteswapped to big endian in advance)", + [](common_params & params) { + params.no_byteswap = true; + } + ).set_env("LLAMA_NO_BYTESWAP")); add_opt(common_arg( {"--numa"}, "TYPE", "attempt optimizations that help on some NUMA systems\n" diff --git a/common/common.cpp b/common/common.cpp index 6dea8e3d2..83d21470a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -987,7 +987,7 @@ struct common_init_result common_init_from_params(common_params & params) 
{ // load and optionally apply lora adapters for (auto & la : params.lora_adapters) { llama_adapter_lora_ptr lora; - lora.reset(llama_adapter_lora_init(model, la.path.c_str())); + lora.reset(llama_adapter_lora_init(model, la.path.c_str(), mparams.no_byteswap)); if (lora == nullptr) { LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); llama_free(lctx); @@ -1092,6 +1092,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { mparams.use_mmap = params.use_mmap; mparams.use_mlock = params.use_mlock; mparams.check_tensors = params.check_tensors; + mparams.no_byteswap = params.no_byteswap; if (params.kv_overrides.empty()) { mparams.kv_overrides = NULL; } else { @@ -1418,8 +1419,9 @@ struct llama_model * common_load_model_from_url( int n_split = 0; { struct gguf_init_params gguf_params = { - /*.no_alloc = */ true, - /*.ctx = */ NULL, + /*.no_alloc = */ true, + /*.ctx = */ NULL, + /*.no_byteswap = */ false, }; auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params); if (!ctx_gguf) { @@ -2063,8 +2065,9 @@ static common_control_vector_data common_control_vector_load_one(const common_co ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { - /* .no_alloc = */ false, - /* .ctx = */ &ctx, + /* .no_alloc = */ false, + /* .ctx = */ &ctx, + /* .no_byteswap = */ false, }; struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params); if (!ctx_gguf) { diff --git a/common/common.h b/common/common.h index 571260372..1b9d79c79 100644 --- a/common/common.h +++ b/common/common.h @@ -307,6 +307,7 @@ struct common_params { bool no_kv_offload = false; // disable KV offloading bool warmup = true; // warmup run bool check_tensors = false; // validate tensor data + bool no_byteswap = false; // skip byteswapping on big endian systems ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for 
the V diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index bdf0eed2a..2990d6533 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -533,8 +533,9 @@ static void load_vocab(const char * filename, const Config * config, struct my_l struct ggml_context * ctx_data = NULL; struct gguf_init_params params = { - /*.no_alloc = */ false, - /*.ctx = */ &ctx_data, + /*.no_alloc = */ false, + /*.ctx = */ &ctx_data, + /*.no_byteswap = */ false, }; struct gguf_context * ctx = gguf_init_from_file(filename, params); diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 91238e4be..e50dabf08 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -48,8 +48,9 @@ static std::string ggml_ne_string(const ggml_tensor * t) { static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) { struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ ctx_ggml, + /*.no_alloc = */ true, + /*.ctx = */ ctx_ggml, + /*.no_byteswap = */ false, }; struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params); if (!ctx_gguf) { diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp index 9523ec122..3ef8ca49b 100644 --- a/examples/gguf-hash/gguf-hash.cpp +++ b/examples/gguf-hash/gguf-hash.cpp @@ -288,8 +288,9 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) { struct ggml_context * ctx_data = NULL; struct gguf_init_params params = { - /*.no_alloc = */ false, - /*.ctx = */ &ctx_data, + /*.no_alloc = */ false, + /*.ctx = */ &ctx_data, + /*.no_byteswap = */ false, }; // xxh64 init diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index ef3ceb686..bd2c18bd0 100644 --- a/examples/gguf-split/gguf-split.cpp +++ 
b/examples/gguf-split/gguf-split.cpp @@ -361,8 +361,9 @@ static void gguf_split(const split_params & split_params) { struct ggml_context * ctx_meta = NULL; struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &ctx_meta, + /*.no_alloc = */ true, + /*.ctx = */ &ctx_meta, + /*.no_byteswap = */ false, }; std::ifstream f_input(split_params.input.c_str(), std::ios::binary); @@ -426,8 +427,9 @@ static void gguf_merge(const split_params & split_params) { struct ggml_context * ctx_meta = NULL; struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &ctx_meta, + /*.no_alloc = */ true, + /*.ctx = */ &ctx_meta, + /*.no_byteswap = */ false, }; if (i_split > 0) { diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index f31989c8c..18c818694 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -85,8 +85,9 @@ static bool gguf_ex_write(const std::string & fname) { // just read tensor info static bool gguf_ex_read_0(const std::string & fname) { struct gguf_init_params params = { - /*.no_alloc = */ false, - /*.ctx = */ NULL, + /*.no_alloc = */ false, + /*.ctx = */ NULL, + /*.no_byteswap = */ false, }; struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); @@ -151,8 +152,9 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) { struct ggml_context * ctx_data = NULL; struct gguf_init_params params = { - /*.no_alloc = */ false, - /*.ctx = */ &ctx_data, + /*.no_alloc = */ false, + /*.ctx = */ &ctx_data, + /*.no_byteswap = */ false, }; struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 24073c5a9..b3bb83d23 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1122,8 +1122,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { struct ggml_context * meta = NULL; struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &meta, + /*.no_alloc = 
*/ true, + /*.ctx = */ &meta, + /*.no_byteswap = */ false, }; struct gguf_context * ctx = gguf_init_from_file(fname, params); diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h index 79ee20206..47a511d34 100644 --- a/ggml/include/gguf.h +++ b/ggml/include/gguf.h @@ -74,6 +74,8 @@ extern "C" { // if not NULL, create a ggml_context and allocate the tensor data in it struct ggml_context ** ctx; + + bool no_byteswap; }; GGML_API struct gguf_context * gguf_init_empty(void); diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index 602995c57..66906189d 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -218,13 +218,17 @@ struct gguf_context { struct gguf_reader { FILE * file; + bool no_byteswap = false; gguf_reader(FILE * file) : file(file) {} + gguf_reader(FILE * file, bool v_no_byteswap) : file(file), no_byteswap(v_no_byteswap) {} template bool read(T & dst) const { auto res = fread(&dst, 1, sizeof(dst), file); - ggml_convert_from_le(&dst); + if (!no_byteswap) { + ggml_convert_from_le(&dst); + } return res == sizeof(dst); } @@ -319,7 +323,7 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector & buf; + bool no_byteswap = false; gguf_writer(std::vector & buf) : buf(buf) {} @@ -1150,7 +1155,11 @@ struct gguf_writer { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ buf.push_back(reinterpret_cast(&val)[i]); #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - buf.push_back(reinterpret_cast(&val)[sizeof(val) - i - 1]); + if (!no_byteswap) { + buf.push_back(reinterpret_cast(&val)[sizeof(val) - i - 1]); + } else { + buf.push_back(reinterpret_cast(&val)[i]); + } #else // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #error Unexpected or undefined __BYTE_ORDER__ #endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ @@ -1321,7 +1330,7 @@ struct gguf_writer { #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ auto byteswap = ggml_get_type_traits(info.t.type)->byteswap; - if (byteswap != nullptr) { + if (byteswap != nullptr && !no_byteswap) { byteswap(buf.data() + offset, 
ggml_nelements(&(info.t)) / ggml_blck_size(info.t.type)); } #endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ diff --git a/include/llama.h b/include/llama.h index 3b75e7607..23ab4c2c2 100644 --- a/include/llama.h +++ b/include/llama.h @@ -304,6 +304,7 @@ extern "C" { bool use_mmap; // use mmap if possible bool use_mlock; // force system to keep model in RAM bool check_tensors; // validate model tensor data + bool no_byteswap; // don't do byteswap, load pre-byteswapped big endian model on big endian system }; // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations @@ -542,7 +543,8 @@ extern "C" { // Load a LoRA adapter from file LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init( struct llama_model * model, - const char * path_lora); + const char * path_lora, + bool no_byteswap); // Manually free a LoRA adapter // Note: loaded adapters will be free when the associated model is deleted diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index 8a0800463..e3c6bac73 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -146,13 +146,14 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * return nullptr; } -static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) { +static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter, bool no_byteswap) { LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora); ggml_context * ctx_init; struct gguf_init_params meta_gguf_params = { - /* .no_alloc = */ true, - /* .ctx = */ &ctx_init, + /* .no_alloc = */ true, + /* .ctx = */ &ctx_init, + /* .no_byteswap = */ no_byteswap, }; gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) }; @@ -327,11 +328,11 @@ static void llama_adapter_lora_init_impl(struct 
llama_model & model, const char LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2); } -struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) { +struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora, bool no_byteswap) { struct llama_adapter_lora * adapter = new llama_adapter_lora(); try { - llama_adapter_lora_init_impl(*model, path_lora, *adapter); + llama_adapter_lora_init_impl(*model, path_lora, *adapter, no_byteswap); return adapter; } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index cc98896fc..d2b887df7 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -445,7 +445,8 @@ llama_model_loader::llama_model_loader( std::vector & splits, bool use_mmap, bool check_tensors, - const struct llama_model_kv_override * param_overrides_p) { + const struct llama_model_kv_override * param_overrides_p, + bool no_byteswap) { int trace = 0; if (getenv("LLAMA_TRACE")) { trace = atoi(getenv("LLAMA_TRACE")); @@ -460,8 +461,9 @@ llama_model_loader::llama_model_loader( // Load the main GGUF struct ggml_context * ctx = NULL; struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &ctx, + /*.no_alloc = */ true, + /*.ctx = */ &ctx, + /*.no_byteswap = */ no_byteswap, }; meta.reset(gguf_init_from_file(fname.c_str(), params)); @@ -520,8 +522,9 @@ llama_model_loader::llama_model_loader( const char * fname_split = splits[idx].c_str(); struct gguf_init_params split_params = { - /*.no_alloc = */ true, - /*.ctx = */ &ctx, + /*.no_alloc = */ true, + /*.ctx = */ &ctx, + /*.no_byteswap = */ no_byteswap, }; gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) }; if (!ctx_gguf) { @@ -681,8 +684,9 @@ llama_model_loader::llama_model_loader( use_mmap = false; } 
- this->use_mmap = use_mmap; + this->use_mmap = use_mmap; this->check_tensors = check_tensors; + this->no_byteswap = no_byteswap; } std::string llama_model_loader::get_arch_name() const { @@ -1027,7 +1031,7 @@ bool llama_model_loader::load_all_data( #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ auto byteswap = ggml_get_type_traits(cur->type)->byteswap; - if (byteswap != nullptr) { + if (byteswap != nullptr && !no_byteswap) { byteswap(cur->data, ggml_nelements(cur) / ggml_blck_size(cur->type)); } #endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ @@ -1063,7 +1067,7 @@ bool llama_model_loader::load_all_data( #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ auto byteswap = ggml_get_type_traits(cur->type)->byteswap; - if (byteswap != nullptr) { + if (byteswap != nullptr && !no_byteswap) { byteswap(read_buf.data(), read_buf.size() / ggml_blck_size(cur->type)); } #endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index fe35404b2..24fd7f381 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -70,6 +70,7 @@ struct llama_model_loader { bool use_mmap = false; bool check_tensors; + bool no_byteswap = false; llama_files files; llama_ftype ftype; @@ -95,7 +96,8 @@ struct llama_model_loader { std::vector & splits, // optional, only need if the split does not follow naming scheme bool use_mmap, bool check_tensors, - const struct llama_model_kv_override * param_overrides_p); + const struct llama_model_kv_override * param_overrides_p, + bool no_byteswap); template typename std::enable_if::value, bool>::type diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 031b4c30b..38dd1f918 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3768,6 +3768,7 @@ struct llama_model_params llama_model_default_params() { /*.use_mmap =*/ true, /*.use_mlock =*/ false, /*.check_tensors =*/ false, + /*.no_byteswap =*/ false, }; #ifdef GGML_USE_METAL diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 
c91af4cbd..4cf8f3245 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -527,7 +527,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } std::vector splits = {}; - llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides); + llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, /*no_byteswap*/ false); ml.init_mappings(false); // no prefetching llama_model model(llama_model_default_params()); diff --git a/src/llama.cpp b/src/llama.cpp index e8cfe5012..6d5af9c3a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -40,7 +40,7 @@ static int llama_model_load(const std::string & fname, std::vector model.t_start_us = tm.t_start_us; try { - llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides); + llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.no_byteswap); ml.print_info(); diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp index dc87f5f0a..0b25c5846 100644 --- a/tests/test-gguf.cpp +++ b/tests/test-gguf.cpp @@ -758,8 +758,9 @@ static std::pair test_handcrafted_file(const unsigned int seed) { struct ggml_context * ctx = nullptr; struct gguf_init_params gguf_params = { - /*no_alloc =*/ false, - /*ctx =*/ hft >= offset_has_data ? &ctx : nullptr, + /*no_alloc =*/ false, + /*ctx =*/ hft >= offset_has_data ? &ctx : nullptr, + /*no_byteswap =*/ false, }; struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params); @@ -1154,8 +1155,9 @@ static std::pair test_roundtrip(ggml_backend_dev_t dev, const unsigned struct ggml_context * ctx_1 = nullptr; struct gguf_init_params gguf_params = { - /*no_alloc =*/ false, - /*ctx =*/ only_meta ? nullptr : &ctx_1, + /*no_alloc =*/ false, + /*ctx =*/ only_meta ? nullptr : &ctx_1, + /*no_byteswap =*/ false, }; struct gguf_context * gguf_ctx_1 = gguf_init_from_file_impl(file, gguf_params);