Merge a9db9b0048 into d80be897ac
Commit 586d8c3b86
24 changed files with 610 additions and 48 deletions
@@ -1438,6 +1438,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.use_mmap = false;
}
).set_env("LLAMA_ARG_NO_MMAP"));
add_opt(common_arg(
{"--no-byteswap"},
"don't byteswap model data on big endian systems (use if model is byteswapped to big endian in advance)",
[](common_params & params) {
params.no_byteswap = true;
}
).set_env("LLAMA_NO_BYTESWAP"));
add_opt(common_arg(
{"--numa"}, "TYPE",
"attempt optimizations that help on some NUMA systems\n"
@@ -988,7 +988,7 @@ struct common_init_result common_init_from_params(common_params & params) {
// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
llama_adapter_lora_ptr lora;
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
lora.reset(llama_adapter_lora_init(model, la.path.c_str(), mparams.no_byteswap));
if (lora == nullptr) {
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
llama_free(lctx);

@@ -1093,6 +1093,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
mparams.no_byteswap = params.no_byteswap;
if (params.kv_overrides.empty()) {
mparams.kv_overrides = NULL;
} else {

@@ -1419,8 +1420,9 @@ struct llama_model * common_load_model_from_url(
int n_split = 0;
{
struct gguf_init_params gguf_params = {
/*.no_alloc = */ true,
/*.ctx = */ NULL,
/*.no_alloc = */ true,
/*.ctx = */ NULL,
/*.no_byteswap = */ false,
};
auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
if (!ctx_gguf) {

@@ -2082,8 +2084,9 @@ static common_control_vector_data common_control_vector_load_one(const common_co

ggml_context * ctx = nullptr;
struct gguf_init_params meta_gguf_params = {
/* .no_alloc = */ false,
/* .ctx = */ &ctx,
/* .no_alloc = */ false,
/* .ctx = */ &ctx,
/* .no_byteswap = */ false,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
if (!ctx_gguf) {
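Note: the common/ hunks above wire the new flag end to end: the --no-byteswap option (and the LLAMA_NO_BYTESWAP environment variable) sets common_params::no_byteswap, common_model_params_to_llama() copies it into llama_model_params, and common_init_from_params() forwards it when loading LoRA adapters. A minimal sketch of that flow, assuming the llama.cpp common headers as patched here (illustrative only, not part of the commit):

#include "common.h"   // common_params, common_model_params_to_llama (assumed header)

int main() {
    common_params params;
    params.no_byteswap = true;   // what --no-byteswap / LLAMA_NO_BYTESWAP sets

    // copied field-for-field, as in the common_model_params_to_llama hunk above
    llama_model_params mparams = common_model_params_to_llama(params);
    return mparams.no_byteswap ? 0 : 1;
}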
@@ -317,6 +317,7 @@ struct common_params {
bool no_kv_offload = false; // disable KV offloading
bool warmup = true; // warmup run
bool check_tensors = false; // validate tensor data
bool no_byteswap = false; // skip byteswapping on big endian systems

ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
@@ -533,8 +533,9 @@ static void load_vocab(const char * filename, const Config * config, struct my_l
struct ggml_context * ctx_data = NULL;

struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
/*.no_byteswap = */ false,
};

struct gguf_context * ctx = gguf_init_from_file(filename, params);
@@ -48,8 +48,9 @@ static std::string ggml_ne_string(const ggml_tensor * t) {

static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) {
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ ctx_ggml,
/*.no_alloc = */ true,
/*.ctx = */ ctx_ggml,
/*.no_byteswap = */ false,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params);
if (!ctx_gguf) {
@@ -288,8 +288,9 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
struct ggml_context * ctx_data = NULL;

struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
/*.no_byteswap = */ false,
};

// xxh64 init
@@ -361,8 +361,9 @@ static void gguf_split(const split_params & split_params) {
struct ggml_context * ctx_meta = NULL;

struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx_meta,
/*.no_alloc = */ true,
/*.ctx = */ &ctx_meta,
/*.no_byteswap = */ false,
};

std::ifstream f_input(split_params.input.c_str(), std::ios::binary);

@@ -426,8 +427,9 @@ static void gguf_merge(const split_params & split_params) {
struct ggml_context * ctx_meta = NULL;

struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx_meta,
/*.no_alloc = */ true,
/*.ctx = */ &ctx_meta,
/*.no_byteswap = */ false,
};

if (i_split > 0) {
@@ -85,8 +85,9 @@ static bool gguf_ex_write(const std::string & fname) {
// just read tensor info
static bool gguf_ex_read_0(const std::string & fname) {
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ NULL,
/*.no_alloc = */ false,
/*.ctx = */ NULL,
/*.no_byteswap = */ false,
};

struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);

@@ -151,8 +152,9 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
struct ggml_context * ctx_data = NULL;

struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
/*.no_byteswap = */ false,
};

struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
@@ -1166,8 +1166,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
struct ggml_context * meta = NULL;

struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &meta,
/*.no_alloc = */ true,
/*.ctx = */ &meta,
/*.no_byteswap = */ false,
};

struct gguf_context * ctx = gguf_init_from_file(fname, params);
@@ -2144,6 +2144,7 @@ extern "C" {
#endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_byteswap_t) ( void * GGML_RESTRICT buffer, size_t elements);

struct ggml_type_traits {
const char * type_name;

@@ -2153,6 +2154,7 @@ extern "C" {
bool is_quantized;
ggml_to_float_t to_float;
ggml_from_float_t from_float_ref;
ggml_byteswap_t byteswap;
};

GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
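Note: the new ggml_byteswap_t member slots in next to to_float/from_float_ref, so callers can resolve the per-type swap routine through the existing traits lookup. A hedged sketch of driving it over a raw buffer (it assumes type_size is exposed in ggml_type_traits, as the initializers in ggml.c below suggest; the byteswap member may be NULL for types without an implementation):

#include "ggml.h"

// Byteswap a raw buffer of tensor data of the given type, if a routine is registered.
// The trait takes a count of blocks, so nbytes must be a whole number of blocks.
static void swap_tensor_buffer(enum ggml_type type, void * data, size_t nbytes) {
    const struct ggml_type_traits * traits = ggml_get_type_traits(type);
    if (traits->byteswap != nullptr) {
        traits->byteswap(data, nbytes / traits->type_size);
    }
}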
@@ -74,6 +74,8 @@ extern "C" {

// if not NULL, create a ggml_context and allocate the tensor data in it
struct ggml_context ** ctx;

bool no_byteswap;
};

GGML_API struct gguf_context * gguf_init_empty(void);
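Note: every gguf_init_from_file() call site in this commit now spells out the extra field. A minimal sketch of opening a file with byteswapping left enabled, i.e. the default behaviour for little-endian GGUF on a big-endian host (the file name and error handling are placeholders, not from the commit):

#include "ggml.h"
#include "gguf.h"

bool inspect_gguf(const char * fname) {
    ggml_context * ctx_meta = nullptr;
    gguf_init_params params = {
        /*.no_alloc    =*/ true,   // metadata only
        /*.ctx         =*/ &ctx_meta,
        /*.no_byteswap =*/ false,  // new field introduced by this patch
    };
    gguf_context * ctx = gguf_init_from_file(fname, params);
    if (!ctx) {
        return false;
    }
    gguf_free(ctx);
    ggml_free(ctx_meta);
    return true;
}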
@@ -28,6 +28,14 @@
#include <immintrin.h>
#endif

#if defined(__gnu_linux__)
#include <endian.h>
#else // defined(__gnu_linux__)
#define le64toh(x) (x)
#define le32toh(x) (x)
#define le16toh(x) (x)
#endif // defined(__gnu_linux__)

#ifdef __cplusplus
extern "C" {
#endif

@@ -553,6 +561,31 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)

// endianness conversion
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define ggml_convert_from_le16(x) GGML_UNUSED(x)
#define ggml_convert_from_le32(x) GGML_UNUSED(x)
#define ggml_convert_from_le64(x) GGML_UNUSED(x)
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
static inline void ggml_convert_from_le16(void * value) {
*((uint16_t*)value) = le16toh(*((uint16_t*)value));
}

static inline void ggml_convert_from_le32(void * value) {
*((uint32_t*)value) = le32toh(*((uint32_t*)value));
}

static inline void ggml_convert_from_le64(void * value) {
*((uint64_t*)value) = le64toh(*((uint64_t*)value));
}
#else // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#error Unexpected or undefined __BYTE_ORDER__
#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__

#define ggml_convert_to_le16(x) ggml_convert_from_le16(x)
#define ggml_convert_to_le32(x) ggml_convert_from_le32(x)
#define ggml_convert_to_le64(x) ggml_convert_from_le64(x)

#ifdef __cplusplus
}
#endif
@@ -560,6 +593,38 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
#ifdef __cplusplus
#include <vector>

// endianness conversion
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define ggml_convert_from_le(x) GGML_UNUSED(x)
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#include <type_traits>

template <typename T, std::enable_if_t<sizeof(T) == 1, int> = 0>
static inline void ggml_convert_from_le(T * value)
{
GGML_UNUSED(value);
}

template <typename T, std::enable_if_t<sizeof(T) == 2, int> = 0>
static inline void ggml_convert_from_le(T * value) {
ggml_convert_from_le16(value);
}

template <typename T, std::enable_if_t<sizeof(T) == 4, int> = 0>
static inline void ggml_convert_from_le(T * value) {
ggml_convert_from_le32(value);
}

template <typename T, std::enable_if_t<sizeof(T) == 8, int> = 0>
static inline void ggml_convert_from_le(T * value) {
ggml_convert_from_le64(value);
}
#else // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#error Unexpected or undefined __BYTE_ORDER__
#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__

#define ggml_convert_to_le(x) ggml_convert_from_le(x)

// expose GGUF internals for test code
GGML_API size_t gguf_type_size(enum gguf_type type);
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
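Note: the sizeof-based overload set above lets GGUF code call a single ggml_convert_from_le() regardless of the field width. A standalone illustration of the same dispatch pattern, compilable on its own; __builtin_bswap16/32 stand in for le16toh/le32toh on a big-endian host and are GCC/Clang builtins, not part of this patch:

#include <cstdint>
#include <cstdio>
#include <type_traits>

template <typename T, std::enable_if_t<sizeof(T) == 2, int> = 0>
static void convert_from_le(T * v) {
    *reinterpret_cast<uint16_t *>(v) = __builtin_bswap16(*reinterpret_cast<uint16_t *>(v));
}

template <typename T, std::enable_if_t<sizeof(T) == 4, int> = 0>
static void convert_from_le(T * v) {
    *reinterpret_cast<uint32_t *>(v) = __builtin_bswap32(*reinterpret_cast<uint32_t *>(v));
}

int main() {
    uint32_t x = 0x01020304u;
    convert_from_le(&x);       // the 4-byte overload is selected at compile time
    std::printf("%08x\n", x);  // prints 04030201
    return 0;
}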
ggml/src/ggml.c
@@ -565,6 +565,35 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float *
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);

static void ggml_byteswap_i16 (void * restrict buffer, size_t elements);
static void ggml_byteswap_i32 (void * restrict buffer, size_t elements);
static void ggml_byteswap_i64 (void * restrict buffer, size_t elements);
static void ggml_byteswap_q4_0 (void * restrict buffer, size_t elements);
static void ggml_byteswap_q4_1 (void * restrict buffer, size_t elements);
static void ggml_byteswap_q5_0 (void * restrict buffer, size_t elements);
static void ggml_byteswap_q5_1 (void * restrict buffer, size_t elements);
static void ggml_byteswap_q8_0 (void * restrict buffer, size_t elements);
static void ggml_byteswap_q8_1 (void * restrict buffer, size_t elements);
static void ggml_byteswap_q2_k (void * restrict buffer, size_t elements);
static void ggml_byteswap_q3_k (void * restrict buffer, size_t elements);
static void ggml_byteswap_q4_k (void * restrict buffer, size_t elements);
static void ggml_byteswap_q5_k (void * restrict buffer, size_t elements);
static void ggml_byteswap_q6_k (void * restrict buffer, size_t elements);
static void ggml_byteswap_iq2_xxs (void * restrict buffer, size_t elements);
static void ggml_byteswap_iq2_xs (void * restrict buffer, size_t elements);
static void ggml_byteswap_iq3_xxs (void * restrict buffer, size_t elements);
static void ggml_byteswap_iq3_s (void * restrict buffer, size_t elements);
static void ggml_byteswap_iq2_s (void * restrict buffer, size_t elements);
static void ggml_byteswap_iq1_s (void * restrict buffer, size_t elements);
static void ggml_byteswap_iq4_nl (void * restrict buffer, size_t elements);
static void ggml_byteswap_iq4_xs (void * restrict buffer, size_t elements);
static void ggml_byteswap_q8_k (void * restrict buffer, size_t elements);
static void ggml_byteswap_q4_0_4x4(void * restrict buffer, size_t elements);
static void ggml_byteswap_q4_0_4x8(void * restrict buffer, size_t elements);
static void ggml_byteswap_q4_0_8x8(void * restrict buffer, size_t elements);
static void ggml_byteswap_tq1_0 (void * restrict buffer, size_t elements);
static void ggml_byteswap_tq2_0 (void * restrict buffer, size_t elements);

static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
[GGML_TYPE_I8] = {
.type_name = "i8",
@@ -577,30 +606,35 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.blck_size = 1,
.type_size = sizeof(int16_t),
.is_quantized = false,
.byteswap = ggml_byteswap_i16,
},
[GGML_TYPE_I32] = {
.type_name = "i32",
.blck_size = 1,
.type_size = sizeof(int32_t),
.is_quantized = false,
.byteswap = ggml_byteswap_i32,
},
[GGML_TYPE_I64] = {
.type_name = "i64",
.blck_size = 1,
.type_size = sizeof(int64_t),
.is_quantized = false,
.byteswap = ggml_byteswap_i64,
},
[GGML_TYPE_F64] = {
.type_name = "f64",
.blck_size = 1,
.type_size = sizeof(double),
.is_quantized = false,
.byteswap = ggml_byteswap_i64,
},
[GGML_TYPE_F32] = {
.type_name = "f32",
.blck_size = 1,
.type_size = sizeof(float),
.is_quantized = false,
.byteswap = ggml_byteswap_i32,
},
[GGML_TYPE_F16] = {
.type_name = "f16",
@@ -609,6 +643,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = false,
.to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
.from_float_ref = (ggml_from_float_t) ggml_fp32_to_fp16_row,
.byteswap = ggml_byteswap_i16,
},
[GGML_TYPE_Q4_0] = {
.type_name = "q4_0",

@@ -617,6 +652,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q4_0,
.from_float_ref = (ggml_from_float_t) quantize_row_q4_0_ref,
.byteswap = ggml_byteswap_q4_0,
},
[GGML_TYPE_Q4_1] = {
.type_name = "q4_1",

@@ -625,6 +661,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q4_1,
.from_float_ref = (ggml_from_float_t) quantize_row_q4_1_ref,
.byteswap = ggml_byteswap_q4_1,
},
[4] = { // GGML_TYPE_Q4_2
.type_name = "DEPRECATED",

@@ -645,6 +682,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q5_0,
.from_float_ref = (ggml_from_float_t) quantize_row_q5_0_ref,
.byteswap = ggml_byteswap_q5_0,
},
[GGML_TYPE_Q5_1] = {
.type_name = "q5_1",

@@ -653,6 +691,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q5_1,
.from_float_ref = (ggml_from_float_t) quantize_row_q5_1_ref,
.byteswap = ggml_byteswap_q5_1,
},
[GGML_TYPE_Q8_0] = {
.type_name = "q8_0",

@@ -661,6 +700,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q8_0,
.from_float_ref = (ggml_from_float_t) quantize_row_q8_0_ref,
.byteswap = ggml_byteswap_q8_0,
},
[GGML_TYPE_Q8_1] = {
.type_name = "q8_1",

@@ -668,6 +708,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.type_size = sizeof(block_q8_1),
.is_quantized = true,
.from_float_ref = (ggml_from_float_t) quantize_row_q8_1_ref,
.byteswap = ggml_byteswap_q8_1,
},
[GGML_TYPE_Q2_K] = {
.type_name = "q2_K",

@@ -676,6 +717,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q2_K,
.from_float_ref = (ggml_from_float_t) quantize_row_q2_K_ref,
.byteswap = ggml_byteswap_q2_k,
},
[GGML_TYPE_Q3_K] = {
.type_name = "q3_K",

@@ -684,6 +726,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q3_K,
.from_float_ref = (ggml_from_float_t) quantize_row_q3_K_ref,
.byteswap = ggml_byteswap_q3_k,
},
[GGML_TYPE_Q4_K] = {
.type_name = "q4_K",

@@ -692,6 +735,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q4_K,
.from_float_ref = (ggml_from_float_t) quantize_row_q4_K_ref,
.byteswap = ggml_byteswap_q4_k,
},
[GGML_TYPE_Q5_K] = {
.type_name = "q5_K",

@@ -700,6 +744,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q5_K,
.from_float_ref = (ggml_from_float_t) quantize_row_q5_K_ref,
.byteswap = ggml_byteswap_q5_k,
},
[GGML_TYPE_Q6_K] = {
.type_name = "q6_K",

@@ -708,6 +753,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q6_K,
.from_float_ref = (ggml_from_float_t) quantize_row_q6_K_ref,
.byteswap = ggml_byteswap_q6_k,
},
[GGML_TYPE_IQ2_XXS] = {
.type_name = "iq2_xxs",
@@ -716,6 +762,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
.from_float_ref = NULL,
.byteswap = ggml_byteswap_iq2_xxs,
},
[GGML_TYPE_IQ2_XS] = {
.type_name = "iq2_xs",

@@ -724,6 +771,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
.from_float_ref = NULL,
.byteswap = ggml_byteswap_iq2_xs,
},
[GGML_TYPE_IQ3_XXS] = {
.type_name = "iq3_xxs",

@@ -732,6 +780,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq3_xxs,
.from_float_ref = (ggml_from_float_t)quantize_row_iq3_xxs_ref,
.byteswap = ggml_byteswap_iq3_xxs,
},
[GGML_TYPE_IQ3_S] = {
.type_name = "iq3_s",

@@ -740,6 +789,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq3_s,
.from_float_ref = (ggml_from_float_t)quantize_row_iq3_s_ref,
.byteswap = ggml_byteswap_iq3_s,
},
[GGML_TYPE_IQ2_S] = {
.type_name = "iq2_s",

@@ -748,6 +798,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq2_s,
.from_float_ref = (ggml_from_float_t)quantize_row_iq2_s_ref,
.byteswap = ggml_byteswap_iq2_s,
},
[GGML_TYPE_IQ1_S] = {
.type_name = "iq1_s",

@@ -756,6 +807,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq1_s,
.from_float_ref = NULL,
.byteswap = ggml_byteswap_iq1_s,
},
[GGML_TYPE_IQ1_M] = {
.type_name = "iq1_m",

@@ -772,6 +824,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq4_nl,
.from_float_ref = (ggml_from_float_t)quantize_row_iq4_nl_ref,
.byteswap = ggml_byteswap_iq4_nl,
},
[GGML_TYPE_IQ4_XS] = {
.type_name = "iq4_xs",

@@ -780,12 +833,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
.from_float_ref = (ggml_from_float_t)quantize_row_iq4_xs_ref,
.byteswap = ggml_byteswap_iq4_xs,
},
[GGML_TYPE_Q8_K] = {
.type_name = "q8_K",
.blck_size = QK_K,
.type_size = sizeof(block_q8_K),
.is_quantized = true,
.byteswap = ggml_byteswap_q8_k,
},
[GGML_TYPE_BF16] = {
.type_name = "bf16",
@@ -794,24 +849,28 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = false,
.to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
.from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
.byteswap = ggml_byteswap_i16,
},
[31] = { // GGML_TYPE_Q4_0_4_4
.type_name = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking",
.blck_size = 0,
.type_size = 0,
.is_quantized = false,
.byteswap = ggml_byteswap_q4_0_4x4,
},
[32] = { // GGML_TYPE_Q4_0_4_8
.type_name = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking",
.blck_size = 0,
.type_size = 0,
.is_quantized = false,
.byteswap = ggml_byteswap_q4_0_4x8,
},
[33] = { // GGML_TYPE_Q4_0_8_8
.type_name = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking",
.blck_size = 0,
.type_size = 0,
.is_quantized = false,
.byteswap = ggml_byteswap_q4_0_8x8,
},
[GGML_TYPE_TQ1_0] = {
.type_name = "tq1_0",

@@ -820,6 +879,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_tq1_0,
.from_float_ref = (ggml_from_float_t) quantize_row_tq1_0_ref,
.byteswap = ggml_byteswap_tq1_0,
},
[GGML_TYPE_TQ2_0] = {
.type_name = "tq2_0",

@@ -828,6 +888,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_tq2_0,
.from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref,
.byteswap = ggml_byteswap_tq2_0,
},
[36] = { // GGML_TYPE_IQ4_NL_4_4
.type_name = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking",
@@ -6509,3 +6570,215 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons
if (p0->strict_cpu != p1->strict_cpu ) return false;
return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
}

static void ggml_byteswap_i16(void * restrict buffer, size_t elements) {
uint16_t *data_ptr = (uint16_t*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(data_ptr + i);
}
}

static void ggml_byteswap_i32(void * restrict buffer, size_t elements) {
uint32_t *data_ptr = (uint32_t*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le32(data_ptr + i);
}
}

static void ggml_byteswap_i64(void * restrict buffer, size_t elements) {
uint64_t *data_ptr = (uint64_t*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le64(data_ptr + i);
}
}

static void ggml_byteswap_q4_0(void * restrict buffer, size_t elements) {
block_q4_0 *data_ptr = (block_q4_0*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(&(data_ptr[i].d));
}
}

static void ggml_byteswap_q4_1(void * restrict buffer, size_t elements) {
block_q4_1 *data_ptr = (block_q4_1*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(&(data_ptr[i].d));
ggml_convert_from_le16(&(data_ptr[i].m));
}
}

static void ggml_byteswap_q5_0(void * restrict buffer, size_t elements) {
block_q5_0 *data_ptr = (block_q5_0*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(&(data_ptr[i].d));
}
}

static void ggml_byteswap_q5_1(void * restrict buffer, size_t elements) {
block_q5_1 *data_ptr = (block_q5_1*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(&(data_ptr[i].d));
ggml_convert_from_le16(&(data_ptr[i].m));
}
}

static void ggml_byteswap_q8_0(void * restrict buffer, size_t elements) {
block_q8_0 *data_ptr = (block_q8_0*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(&(data_ptr[i].d));
}
}

static void ggml_byteswap_q8_1(void * restrict buffer, size_t elements) {
block_q8_1 *data_ptr = (block_q8_1*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(&(data_ptr[i].d));
ggml_convert_from_le16(&(data_ptr[i].s));
}
}

static void ggml_byteswap_q2_k(void * restrict buffer, size_t elements) {
block_q2_K *data_ptr = (block_q2_K*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(&(data_ptr[i].d));
ggml_convert_from_le16(&(data_ptr[i].dmin));
}
}

static void ggml_byteswap_q3_k(void * restrict buffer, size_t elements) {
block_q3_K *data_ptr = (block_q3_K*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(&(data_ptr[i].d));
}
}

static void ggml_byteswap_q4_k(void * restrict buffer, size_t elements) {
block_q4_K *data_ptr = (block_q4_K*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(&(data_ptr[i].d));
ggml_convert_from_le16(&(data_ptr[i].dmin));
}
}

static void ggml_byteswap_q5_k(void * restrict buffer, size_t elements) {
block_q5_K *data_ptr = (block_q5_K*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(&(data_ptr[i].d));
ggml_convert_from_le16(&(data_ptr[i].dmin));
}
}

static void ggml_byteswap_q6_k(void * restrict buffer, size_t elements) {
block_q6_K *data_ptr = (block_q6_K*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(&(data_ptr[i].d));
}
}

static void ggml_byteswap_iq2_xxs(void * restrict buffer, size_t elements) {
block_iq2_xxs *data_ptr = (block_iq2_xxs*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(&(data_ptr[i].d));
for (size_t j = 0; j < QK_K/8; ++j) {
ggml_convert_from_le16(&(data_ptr[i].qs[j]));
}
}
}

static void ggml_byteswap_iq2_xs(void * restrict buffer, size_t elements) {
block_iq2_xs *data_ptr = (block_iq2_xs*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(&(data_ptr[i].d));
for (size_t j = 0; j < QK_K/8; ++j) {
ggml_convert_from_le16(&(data_ptr[i].qs[j]));
}
}
}

static void ggml_byteswap_iq3_xxs(void * restrict buffer, size_t elements) {
block_iq3_xxs *data_ptr = (block_iq3_xxs*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(&(data_ptr[i].d));
}
}

static void ggml_byteswap_iq3_s(void * restrict buffer, size_t elements) {
block_iq3_s *data_ptr = (block_iq3_s*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(&(data_ptr[i].d));
}
}

static void ggml_byteswap_iq2_s(void * restrict buffer, size_t elements) {
block_iq2_s *data_ptr = (block_iq2_s*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(&(data_ptr[i].d));
}
}

static void ggml_byteswap_iq1_s(void * restrict buffer, size_t elements) {
block_iq1_s *data_ptr = (block_iq1_s*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(&(data_ptr[i].d));
for (size_t j = 0; j < QK_K/32; ++j) {
ggml_convert_from_le16(&(data_ptr[i].qh[j]));
}
}
}

static void ggml_byteswap_iq4_nl(void * restrict buffer, size_t elements) {
block_iq4_nl *data_ptr = (block_iq4_nl*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(&(data_ptr[i].d));
}
}

static void ggml_byteswap_iq4_xs(void * restrict buffer, size_t elements) {
block_iq4_xs *data_ptr = (block_iq4_xs*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(&(data_ptr[i].d));
ggml_convert_from_le16(&(data_ptr[i].scales_h));
}
}

static void ggml_byteswap_q8_k(void * restrict buffer, size_t elements) {
block_q8_K *data_ptr = (block_q8_K*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le32(&(data_ptr[i].d));
for (size_t j = 0; j < QK_K/16; ++j) {
ggml_convert_from_le16(&(data_ptr[i].bsums[j]));
}
}
}

static void ggml_byteswap_q4_0_4x4(void * restrict buffer, size_t elements) {
GGML_ASSERT(false && "function ggml_byteswap_q4_0_4x4 is not implemented yet");
UNUSED(buffer);
UNUSED(elements);
}

static void ggml_byteswap_q4_0_4x8(void * restrict buffer, size_t elements) {
GGML_ASSERT(false && "function ggml_byteswap_q4_0_4x8 is not implemented yet");
UNUSED(buffer);
UNUSED(elements);
}

static void ggml_byteswap_q4_0_8x8(void * restrict buffer, size_t elements) {
GGML_ASSERT(false && "function ggml_byteswap_q4_0_8x8 is not implemented yet");
UNUSED(buffer);
UNUSED(elements);
}

static void ggml_byteswap_tq1_0(void * restrict buffer, size_t elements) {
block_tq1_0 *data_ptr = (block_tq1_0*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(&(data_ptr[i].d));
}
}

static void ggml_byteswap_tq2_0(void * restrict buffer, size_t elements) {
block_tq2_0 *data_ptr = (block_tq2_0*) buffer;
for (size_t i = 0; i < elements; ++i) {
ggml_convert_from_le16(&(data_ptr[i].d));
}
}
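Note: the routines above only swap the multi-byte fields inside each block; packed nibbles and int8 quants are plain bytes and are left alone. A standalone sketch in the spirit of ggml_byteswap_q8_0, using a simplified stand-in for block_q8_0 (the real layout lives in ggml-common.h, so names and sizes here are assumptions for illustration):

#include <cstddef>
#include <cstdint>

// simplified stand-in for block_q8_0: one fp16 scale + 32 int8 quants
struct block_q8_0_like {
    uint16_t d;       // stored little-endian in the file
    int8_t   qs[32];  // single bytes, nothing to swap
};

static void byteswap_q8_0_like(void * buffer, size_t nblocks) {
    block_q8_0_like * blocks = static_cast<block_q8_0_like *>(buffer);
    for (size_t i = 0; i < nblocks; ++i) {
        const uint16_t d = blocks[i].d;
        blocks[i].d = static_cast<uint16_t>((d >> 8) | (d << 8));  // 16-bit swap of the scale only
    }
}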
@@ -218,12 +218,18 @@ struct gguf_context {

struct gguf_reader {
FILE * file;
bool no_byteswap = false;

gguf_reader(FILE * file) : file(file) {}
gguf_reader(FILE * file, bool v_no_byteswap) : file(file), no_byteswap(v_no_byteswap) {}

template <typename T>
bool read(T & dst) const {
return fread(&dst, 1, sizeof(dst), file) == sizeof(dst);
auto res = fread(&dst, 1, sizeof(dst), file);
if (!no_byteswap) {
ggml_convert_from_le(&dst);
}
return res == sizeof(dst);
}

template <typename T>

@@ -317,7 +323,7 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct
}

struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
const struct gguf_reader gr(file);
const struct gguf_reader gr(file, params.no_byteswap);
struct gguf_context * ctx = new gguf_context;

bool ok = true;
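Note: gguf_reader::read() now does a raw fread followed by ggml_convert_from_le() unless no_byteswap is set, so GGUF files stay little-endian on disk and only the in-memory value is converted. A standalone sketch of the same idea for one field width, assembling the value from bytes so it works on any host without an endianness #ifdef (illustrative, not the gguf.cpp code):

#include <cstdint>
#include <cstdio>

// Read a 32-bit little-endian value from `f` into `dst`, independent of host byte order.
static bool read_le_u32(FILE * f, uint32_t & dst) {
    uint8_t b[4];
    if (fread(b, 1, 4, f) != 4) {
        return false;
    }
    dst = (uint32_t) b[0] | ((uint32_t) b[1] << 8) | ((uint32_t) b[2] << 16) | ((uint32_t) b[3] << 24);
    return true;
}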
@@ -1139,13 +1145,24 @@ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const vo

struct gguf_writer {
std::vector<int8_t> & buf;
bool no_byteswap = false;

gguf_writer(std::vector<int8_t> & buf) : buf(buf) {}

template <typename T>
void write(const T & val) const {
for (size_t i = 0; i < sizeof(val); ++i) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
buf.push_back(reinterpret_cast<const int8_t *>(&val)[i]);
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
if (!no_byteswap) {
buf.push_back(reinterpret_cast<const int8_t *>(&val)[sizeof(val) - i - 1]);
} else {
buf.push_back(reinterpret_cast<const int8_t *>(&val)[i]);
}
#else // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#error Unexpected or undefined __BYTE_ORDER__
#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
}
}

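Note: on big-endian hosts gguf_writer::write() emits each scalar back to front so the buffer remains little-endian. The equivalent effect, written without any __BYTE_ORDER__ branching, is to push bytes in little-endian order explicitly; a hedged standalone sketch for unsigned integer values only (floating-point values would first need a bit-cast to an unsigned type):

#include <cstddef>
#include <cstdint>
#include <vector>

// Append `val` to `buf` in little-endian byte order regardless of host endianness.
template <typename T>
static void push_le(std::vector<int8_t> & buf, T val) {
    for (size_t i = 0; i < sizeof(T); ++i) {
        buf.push_back(static_cast<int8_t>((static_cast<uint64_t>(val) >> (8 * i)) & 0xff));
    }
}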
@@ -1194,6 +1211,7 @@ struct gguf_writer {
}

switch (kv.get_type()) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
case GGUF_TYPE_UINT8:
case GGUF_TYPE_INT8:
case GGUF_TYPE_UINT16:
@@ -1206,6 +1224,60 @@ struct gguf_writer {
case GGUF_TYPE_FLOAT64: {
write(kv.data);
} break;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
case GGUF_TYPE_UINT8: {
for (size_t i = 0; i < ne; ++i) {
write(kv.get_val<uint8_t>(i));
}
} break;
case GGUF_TYPE_INT8: {
for (size_t i = 0; i < ne; ++i) {
write(kv.get_val<int8_t>(i));
}
} break;
case GGUF_TYPE_UINT16: {
for (size_t i = 0; i < ne; ++i) {
write(kv.get_val<uint16_t>(i));
}
} break;
case GGUF_TYPE_INT16: {
for (size_t i = 0; i < ne; ++i) {
write(kv.get_val<int16_t>(i));
}
} break;
case GGUF_TYPE_UINT32: {
for (size_t i = 0; i < ne; ++i) {
write(kv.get_val<uint32_t>(i));
}
} break;
case GGUF_TYPE_INT32: {
for (size_t i = 0; i < ne; ++i) {
write(kv.get_val<int32_t>(i));
}
} break;
case GGUF_TYPE_FLOAT32: {
for (size_t i = 0; i < ne; ++i) {
write(kv.get_val<float>(i));
}
} break;
case GGUF_TYPE_UINT64: {
for (size_t i = 0; i < ne; ++i) {
write(kv.get_val<uint64_t>(i));
}
} break;
case GGUF_TYPE_INT64: {
for (size_t i = 0; i < ne; ++i) {
write(kv.get_val<int64_t>(i));
}
} break;
case GGUF_TYPE_FLOAT64: {
for (size_t i = 0; i < ne; ++i) {
write(kv.get_val<double>(i));
}
} break;
#else // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#error Unexpected or undefined __BYTE_ORDER__
#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
case GGUF_TYPE_BOOL: {
for (size_t i = 0; i < ne; ++i) {
write(kv.get_val<bool>(i));
@@ -1256,6 +1328,13 @@ struct gguf_writer {
memcpy(buf.data() + offset, info.t.data, nbytes);
}

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
auto byteswap = ggml_get_type_traits(info.t.type)->byteswap;
if (byteswap != nullptr && !no_byteswap) {
byteswap(buf.data() + offset, ggml_nelements(&(info.t)) / ggml_blck_size(info.t.type));
}
#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__

pad(alignment);
}
};
@@ -304,6 +304,7 @@ extern "C" {
bool use_mmap; // use mmap if possible
bool use_mlock; // force system to keep model in RAM
bool check_tensors; // validate model tensor data
bool no_byteswap; // don't do byteswap, load pre-byteswapped big endian model on big endian system
};

// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations

@@ -542,7 +543,8 @@ extern "C" {
// Load a LoRA adapter from file
LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
struct llama_model * model,
const char * path_lora);
const char * path_lora,
bool no_byteswap);

// Manually free a LoRA adapter
// Note: loaded adapters will be free when the associated model is deleted
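Note: the llama_adapter_lora_init() declaration gains a trailing no_byteswap argument, matching the common_init_from_params() change earlier in the commit. A hedged call-site sketch (model and path are placeholders, not from the commit):

#include "llama.h"

llama_adapter_lora * load_adapter(llama_model * model, const char * path, bool no_byteswap) {
    // third argument introduced by this patch; returns nullptr on failure
    return llama_adapter_lora_init(model, path, no_byteswap);
}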
@@ -146,13 +146,14 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor *
return nullptr;
}

static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter, bool no_byteswap) {
LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

ggml_context * ctx_init;
struct gguf_init_params meta_gguf_params = {
/* .no_alloc = */ true,
/* .ctx = */ &ctx_init,
/* .no_alloc = */ true,
/* .ctx = */ &ctx_init,
/* .no_byteswap = */ no_byteswap,
};

gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };

@@ -327,11 +328,11 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}

struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora, bool no_byteswap) {
struct llama_adapter_lora * adapter = new llama_adapter_lora();

try {
llama_adapter_lora_init_impl(*model, path_lora, *adapter);
llama_adapter_lora_init_impl(*model, path_lora, *adapter, no_byteswap);
return adapter;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -441,11 +441,12 @@ void * llama_mmap::addr() const { return pimpl->addr; }

void llama_mmap::unmap_fragment(size_t first, size_t last) { pimpl->unmap_fragment(first, last); }

#if defined(_POSIX_MEMLOCK_RANGE) || defined(_WIN32)
// disable mmap on s390x while it usually loads little-endian models
#if (defined(_POSIX_MEMLOCK_RANGE) && !defined(__s390x__)) || defined(_WIN32)
const bool llama_mmap::SUPPORTED = true;
#else
#else // (defined(_POSIX_MEMLOCK_RANGE) && !defined(__s390x__)) || defined(_WIN32)
const bool llama_mmap::SUPPORTED = false;
#endif
#endif // (defined(_POSIX_MEMLOCK_RANGE) && !defined(__s390x__)) || defined(_WIN32)

// llama_mlock

@@ -445,7 +445,8 @@ llama_model_loader::llama_model_loader(
std::vector<std::string> & splits,
bool use_mmap,
bool check_tensors,
const struct llama_model_kv_override * param_overrides_p) {
const struct llama_model_kv_override * param_overrides_p,
bool no_byteswap) {
int trace = 0;
if (getenv("LLAMA_TRACE")) {
trace = atoi(getenv("LLAMA_TRACE"));

@@ -460,8 +461,9 @@ llama_model_loader::llama_model_loader(
// Load the main GGUF
struct ggml_context * ctx = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
/*.no_byteswap = */ no_byteswap,
};

meta.reset(gguf_init_from_file(fname.c_str(), params));

@@ -520,8 +522,9 @@ llama_model_loader::llama_model_loader(
const char * fname_split = splits[idx].c_str();

struct gguf_init_params split_params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
/*.no_byteswap = */ no_byteswap,
};
gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
if (!ctx_gguf) {

@@ -681,8 +684,9 @@ llama_model_loader::llama_model_loader(
use_mmap = false;
}

this->use_mmap = use_mmap;
this->use_mmap = use_mmap;
this->check_tensors = check_tensors;
this->no_byteswap = no_byteswap;
}

std::string llama_model_loader::get_arch_name() const {
@@ -1024,6 +1028,14 @@ bool llama_model_loader::load_all_data(
if (ggml_backend_buffer_is_host(cur->buffer)) {
file->seek(weight->offs, SEEK_SET);
file->read_raw(cur->data, n_size);

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
auto byteswap = ggml_get_type_traits(cur->type)->byteswap;
if (byteswap != nullptr && !no_byteswap) {
byteswap(cur->data, ggml_nelements(cur) / ggml_blck_size(cur->type));
}
#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__

if (check_tensors) {
validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));

@@ -1052,6 +1064,14 @@ bool llama_model_loader::load_all_data(
read_buf.resize(n_size);
file->seek(weight->offs, SEEK_SET);
file->read_raw(read_buf.data(), n_size);

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
auto byteswap = ggml_get_type_traits(cur->type)->byteswap;
if (byteswap != nullptr && !no_byteswap) {
byteswap(read_buf.data(), read_buf.size() / ggml_blck_size(cur->type));
}
#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__

ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
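Note: in the host-buffer branch the swap routine receives ggml_nelements(cur) / ggml_blck_size(cur->type), i.e. a count of quantization blocks rather than bytes; for non-quantized types the block size is 1 so the count equals the element count. A small sanity sketch of that relationship for contiguous tensors (assuming ggml.h; names here are illustrative):

#include "ggml.h"
#include <cassert>
#include <cstddef>

static size_t n_blocks(const ggml_tensor * t) {
    const size_t nblocks = (size_t) (ggml_nelements(t) / ggml_blck_size(t->type));
    // each block occupies type_size bytes, so for a contiguous tensor this matches its byte size
    assert(nblocks * ggml_type_size(t->type) == ggml_nbytes(t));
    return nblocks;
}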
@@ -70,6 +70,7 @@ struct llama_model_loader {

bool use_mmap = false;
bool check_tensors;
bool no_byteswap = false;

llama_files files;
llama_ftype ftype;

@@ -95,7 +96,8 @@ struct llama_model_loader {
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
bool use_mmap,
bool check_tensors,
const struct llama_model_kv_override * param_overrides_p);
const struct llama_model_kv_override * param_overrides_p,
bool no_byteswap);

template<typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type
@@ -3792,6 +3792,7 @@ struct llama_model_params llama_model_default_params() {
/*.use_mmap =*/ true,
/*.use_mlock =*/ false,
/*.check_tensors =*/ false,
/*.no_byteswap =*/ false,
};

#ifdef GGML_USE_METAL
@@ -514,7 +514,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::

// mmap consistently increases speed Linux, and also increases speed on Windows with
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
#if defined(__linux__) || defined(_WIN32)
#if (defined(__linux__) && !defined(__s390x__)) || defined(_WIN32)
constexpr bool use_mmap = true;
#else
constexpr bool use_mmap = false;

@@ -527,7 +527,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
}

std::vector<std::string> splits = {};
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides);
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, /*no_byteswap*/ false);
ml.init_mappings(false); // no prefetching

llama_model model(llama_model_default_params());
@@ -40,7 +40,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
model.t_start_us = tm.t_start_us;

try {
llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides);
llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.no_byteswap);

ml.print_info();

@@ -15,6 +15,10 @@ struct unicode_cpt_flags {
SYMBOL = 0x0040, // regex: \p{S}
CONTROL = 0x0080, // regex: \p{C}
MASK_CATEGORIES = 0x00FF,
WHITESPACE = 0x0100,
LOWERCASE = 0x0200,
UPPERCASE = 0x0400,
NFD = 0x0800,
};

// codepoint type

@@ -34,11 +38,49 @@ struct unicode_cpt_flags {

// decode from uint16
inline unicode_cpt_flags(const uint16_t flags = 0) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
*reinterpret_cast<uint16_t*>(this) = flags;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
is_undefined = (flags & UNDEFINED) ? 1 : 0;
is_number = (flags & NUMBER) ? 1 : 0;
is_letter = (flags & LETTER) ? 1 : 0;
is_separator = (flags & SEPARATOR) ? 1 : 0;
is_accent_mark = (flags & ACCENT_MARK) ? 1 : 0;
is_punctuation = (flags & PUNCTUATION) ? 1 : 0;
is_symbol = (flags & SYMBOL) ? 1 : 0;
is_control = (flags & CONTROL) ? 1 : 0;
is_whitespace = (flags & WHITESPACE) ? 1 : 0;
is_lowercase = (flags & LOWERCASE) ? 1 : 0;
is_uppercase = (flags & UPPERCASE) ? 1 : 0;
is_nfd = (flags & NFD) ? 1 : 0;
#else // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#error Unexpected or undefined __BYTE_ORDER__
#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
}

inline uint16_t as_uint() const {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
return *reinterpret_cast<const uint16_t*>(this);
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
uint16_t result =
is_undefined * UNDEFINED
+ is_number * NUMBER
+ is_letter * LETTER
+ is_separator * SEPARATOR
+ is_accent_mark * ACCENT_MARK
+ is_punctuation * PUNCTUATION
+ is_symbol * SYMBOL
+ is_control * CONTROL
+ is_whitespace * WHITESPACE
+ is_lowercase * LOWERCASE
+ is_uppercase * UPPERCASE
+ is_nfd * NFD
;

return result;
#else // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#error Unexpected or undefined __BYTE_ORDER__
#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
}

inline uint16_t category_flag() const {
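Note: the reinterpret_cast of the bitfield only reproduces the serialized uint16 on little-endian hosts, which is why the big-endian path above encodes and decodes flag by flag. A standalone round-trip check in the same spirit, reduced to three flags for brevity (the real struct carries twelve):

#include <cassert>
#include <cstdint>

struct flags_demo {
    uint16_t is_whitespace : 1;
    uint16_t is_lowercase  : 1;
    uint16_t is_uppercase  : 1;

    enum : uint16_t { WHITESPACE = 0x0100, LOWERCASE = 0x0200, UPPERCASE = 0x0400 };

    // decode without relying on the in-memory bit order
    explicit flags_demo(uint16_t v)
        : is_whitespace((v & WHITESPACE) ? 1 : 0),
          is_lowercase ((v & LOWERCASE)  ? 1 : 0),
          is_uppercase ((v & UPPERCASE)  ? 1 : 0) {}

    uint16_t as_uint() const {
        return is_whitespace * WHITESPACE + is_lowercase * LOWERCASE + is_uppercase * UPPERCASE;
    }
};

int main() {
    const uint16_t v = flags_demo::WHITESPACE | flags_demo::UPPERCASE;
    assert(flags_demo(v).as_uint() == v);  // round-trips regardless of host endianness
    return 0;
}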
@@ -146,7 +146,8 @@ static std::vector<std::pair<enum gguf_type, enum gguf_type>> get_kv_types(std::
}

template <typename T>
static void helper_write(FILE * file, const T & val) {
static void helper_write(FILE * file, T val) {
ggml_convert_to_le(&val);
GGML_ASSERT(fwrite(&val, 1, sizeof(val), file) == sizeof(val));
}

@@ -363,7 +364,9 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft
helper_write(file, big_dim);
}
} else {
helper_write(file, shape.data(), n_dims*sizeof(int64_t));
for (uint32_t j = 0; j < n_dims; ++j) {
helper_write(file, shape[j]);
}
}

{
@@ -533,6 +536,33 @@ static bool handcrafted_check_kv(const gguf_context * gguf_ctx, const unsigned i
continue;
}

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
switch (type_arr) {
case GGUF_TYPE_UINT16:
case GGUF_TYPE_INT16:
for (size_t j = 0; j < arr_n; ++j) {
ggml_convert_to_le((uint16_t*)(data8 + j * 2));
}
break;

case GGUF_TYPE_UINT32:
case GGUF_TYPE_INT32:
case GGUF_TYPE_FLOAT32:
for (size_t j = 0; j < arr_n; ++j) {
ggml_convert_to_le((uint32_t*)(data8 + j * 4));
}
break;

case GGUF_TYPE_UINT64:
case GGUF_TYPE_INT64:
case GGUF_TYPE_FLOAT64:
for (size_t j = 0; j < arr_n; ++j) {
ggml_convert_to_le((uint64_t*)(data8 + j * 8));
}
break;
}
#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__

if (!std::equal(data8, data8 + arr_n*type_size, data_gguf)) {
ok = false;
}

@@ -548,6 +578,27 @@ static bool handcrafted_check_kv(const gguf_context * gguf_ctx, const unsigned i
continue;
}

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
switch (type) {
case GGUF_TYPE_UINT16:
case GGUF_TYPE_INT16:
ggml_convert_to_le((uint16_t*)(data8));
break;

case GGUF_TYPE_UINT32:
case GGUF_TYPE_INT32:
case GGUF_TYPE_FLOAT32:
ggml_convert_to_le((uint32_t*)(data8));
break;

case GGUF_TYPE_UINT64:
case GGUF_TYPE_INT64:
case GGUF_TYPE_FLOAT64:
ggml_convert_to_le((uint64_t*)(data8));
break;
}
#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__

if (!std::equal(data8, data8 + gguf_type_size(type), data_gguf)) {
ok = false;
}
@@ -707,8 +758,9 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {

struct ggml_context * ctx = nullptr;
struct gguf_init_params gguf_params = {
/*no_alloc =*/ false,
/*ctx =*/ hft >= offset_has_data ? &ctx : nullptr,
/*no_alloc =*/ false,
/*ctx =*/ hft >= offset_has_data ? &ctx : nullptr,
/*no_byteswap =*/ false,
};

struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params);

@@ -1103,8 +1155,9 @@ static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned

struct ggml_context * ctx_1 = nullptr;
struct gguf_init_params gguf_params = {
/*no_alloc =*/ false,
/*ctx =*/ only_meta ? nullptr : &ctx_1,
/*no_alloc =*/ false,
/*ctx =*/ only_meta ? nullptr : &ctx_1,
/*no_byteswap =*/ false,
};
struct gguf_context * gguf_ctx_1 = gguf_init_from_file_impl(file, gguf_params);