llama : style formatting + remove helper methods
parent 2dd5d2c92c
commit a82e3a4d92
3 changed files with 79 additions and 115 deletions
ggml.h: 10 changed lines

@@ -1744,12 +1744,12 @@ extern "C" {
     GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
     GGML_API void * gguf_get_data (struct gguf_context * ctx);

     GGML_API int gguf_get_n_kv(struct gguf_context * ctx);
     GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key);
     GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);

     GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
-    GGML_API enum gguf_type gguf_get_arr_type (struct gguf_context * ctx, int i);
-    GGML_API void gguf_get_val (struct gguf_context * ctx, int i, void * val);
+    GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);

     GGML_API const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i);
     GGML_API float gguf_get_arr_f32(struct gguf_context * ctx, int key_id, int i);

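For orientation, the GGUF accessors declared above are typically used together when inspecting a model file. A minimal, illustrative sketch (not part of this commit; "model.gguf" is a placeholder path and most error handling is omitted):

    #include "ggml.h"

    #include <cstdio>

    int main() {
        struct ggml_context * ctx_data = NULL;

        struct gguf_init_params params = {
            /*.no_alloc = */ true,
            /*.ctx      = */ &ctx_data,
        };

        // parse only the metadata; with no_alloc = true the tensor data is not loaded
        struct gguf_context * gguf_ctx = gguf_init_from_file("model.gguf", params);
        if (gguf_ctx == NULL) {
            fprintf(stderr, "failed to load model.gguf\n");
            return 1;
        }

        printf("version : %d\n", gguf_get_version(gguf_ctx));
        printf("n_kv    : %d\n", gguf_get_n_kv(gguf_ctx));

        const int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
        if (token_idx != -1 && gguf_get_arr_type(gguf_ctx, token_idx) == GGUF_TYPE_STRING) {
            printf("token 0 : %s\n", gguf_get_arr_str(gguf_ctx, token_idx, 0));
        }

        gguf_free(gguf_ctx);
        if (ctx_data) {
            ggml_free(ctx_data);
        }
        return 0;
    }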
gguf-llama.cpp: 172 changed lines

@@ -510,22 +510,9 @@ struct llama_state {
 // global state
 static llama_state g_state;

-template <typename T>
-static T checked_mul(T a, T b) {
-    T ret = a * b;
-    if (a != 0 && ret / a != b) {
-        throw std::runtime_error(format("overflow multiplying %llu * %llu",
-                                        (unsigned long long) a, (unsigned long long) b));
-    }
-    return ret;
-}
-
-static size_t checked_div(size_t a, size_t b) {
-    if (b == 0 || a % b != 0) {
-        throw std::runtime_error(format("error dividing %zu / %zu", a, b));
-    }
-    return a / b;
-}
+//
+// model loading and saving
+//

 static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
     char buf[256];

@@ -536,14 +523,6 @@ static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
     return buf;
 }

-static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
-    size_t size = ggml_type_size(type);
-    for (uint32_t dim : ne) {
-        size = checked_mul<size_t>(size, dim);
-    }
-    return size / ggml_blck_size(type);
-}
-
 struct gguf_load_tensor {
     std::string name;
     enum ggml_type type = GGML_TYPE_F32;

@@ -573,20 +552,19 @@ struct gguf_file_loader {

     struct ggml_context * ctx_data = NULL;

-    gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map)
-        : file(fname, "rb") {
+    gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map) : file(fname, "rb") {
         fprintf(stderr, "llama.cpp: loading model from %s\n", fname);

         struct gguf_init_params params = {
             /*.no_alloc = */ true,
             /*.ctx      = */ &ctx_data,
         };

         gguf_ctx = gguf_init_from_file(fname, params);
         file_version = (enum gguf_file_version) gguf_get_version(gguf_ctx);

         read_hparams();
         read_vocab();
         read_tensor_metadata(tensors_map);
     }

@@ -637,18 +615,18 @@ struct gguf_file_loader {

     void read_vocab() {
         vocab.id_to_token.resize(hparams.n_vocab);
-        int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
+        const int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
         if (token_idx == -1) {
             throw std::runtime_error("cannot find token list in GGUF file\n");
         }

-        int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
+        const int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
         if (score_idx == -1) {
             throw std::runtime_error("cannot find token scores list in GGUF file\n");
         }

         for (uint32_t i = 0; i < hparams.n_vocab; i++) {

             std::string word = gguf_get_arr_str(gguf_ctx, token_idx, i);

             vocab.token_to_id[word] = i;

@@ -702,7 +680,7 @@ struct gguf_file_loader {
             tensor.file_off = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, i);

             tensor.name = name;
-            tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+            tensor.size = ggml_nbytes(cur);

             tensors_map.tensors.push_back(tensor);
             tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
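A note on the tensor.size change above: ggml_nbytes(cur) takes the size directly from the ggml_tensor that was just created from the GGUF metadata, which is why the hand-rolled llama_calc_tensor_size()/checked_mul() helpers removed earlier in this commit are no longer needed. A rough sketch of the relationship for a densely packed tensor (illustrative only, not the actual ggml implementation; the helper name is made up):

    #include "ggml.h"

    // For a contiguous tensor this matches what llama_calc_tensor_size() used to
    // compute from its std::vector<uint32_t> of dimensions: multiply the dims,
    // then scale by the block size and element size of the tensor's ggml type.
    static size_t tensor_nbytes_sketch(const struct ggml_tensor * t) {
        size_t n = 1;
        for (int i = 0; i < GGML_MAX_DIMS; ++i) {
            n *= (size_t) t->ne[i]; // unused trailing dims are 1
        }
        return n / ggml_blck_size(t->type) * ggml_type_size(t->type);
    }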
@@ -787,7 +765,7 @@ struct gguf_file_saver {
             gguf_type arr_type;
             int n_arr;

-            switch(vtype) {
+            switch (vtype) {
                 case GGUF_TYPE_BOOL:
                     bool_val = gguf_get_val_bool(fl->gguf_ctx, i);
                     file.write_val<bool>(key, GGUF_TYPE_BOOL, bool_val);

@@ -810,7 +788,7 @@ struct gguf_file_saver {
                     break;
                 case GGUF_TYPE_STRING:
                     str_val = gguf_get_val_str(fl->gguf_ctx, i);
-                    file.write_val<std::string>(key, GGUF_TYPE_STRING, str_val);
+                    file.write_str(key, GGUF_TYPE_STRING, str_val);
                     break;
                 case GGUF_TYPE_UINT16:
                     u16_val = gguf_get_val_u16(fl->gguf_ctx, i);

@@ -826,7 +804,7 @@ struct gguf_file_saver {
                     break;
                 case GGUF_TYPE_ARRAY:
                     arr_type = gguf_get_arr_type(fl->gguf_ctx, i);
-                    n_arr = gguf_get_arr_n(fl->gguf_ctx, i);
+                    n_arr = gguf_get_arr_n (fl->gguf_ctx, i);
                     if (arr_type == GGUF_TYPE_FLOAT32) {
                         write_hparam_arr_f32(key, arr_type, i, n_arr);
                     } else if (arr_type == GGUF_TYPE_STRING) {

@@ -923,20 +901,6 @@ struct llama_model_loader {
         }
     }

-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
-        auto it = tensors_map.name_to_idx.find(name);
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
-        }
-        gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
-        if (lt.ne != ne) {
-            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
-                    name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
-        }
-
-        return get_tensor_for(lt, backend);
-    }
-
     struct ggml_tensor * get_tensor_for(gguf_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
         if (backend != GGML_BACKEND_CPU) {

@@ -960,16 +924,41 @@ struct llama_model_loader {
         return tensor;
     }

+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
+        auto it = tensors_map.name_to_idx.find(name);
+        if (it == tensors_map.name_to_idx.end()) {
+            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
+        }
+        gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
+        if (lt.ne != ne) {
+            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+                    name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
+        }
+
+        return get_tensor_for(lt, backend);
+    }
+
     void done_getting_tensors() const {
         if (num_ggml_tensors_created != tensors_map.tensors.size()) {
             throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
         }
     }

-    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
-        size_t data_size = 0;
+    void load_data_for(gguf_load_tensor & lt) const {
+        if (use_mmap) {
+            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+        } else {
+            gguf_file & file = file_loader->file;
+            file.seek(lt.file_off, SEEK_SET);
+            file.read_raw(lt.data, lt.size);
+        }
+    }
+
+    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
+        size_t data_size = 0;
         size_t prefetch_size = 0;
         size_t lock_size = 0;

         for (const gguf_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
             if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {

@@ -1031,31 +1020,6 @@ struct llama_model_loader {
                 done_size += lt.size;
             }
         }
     }

-    void load_data_for(gguf_load_tensor & lt) {
-        if (use_mmap) {
-            lt.data = (uint8_t *) mapping->addr + lt.file_off;
-        } else {
-            gguf_file & file = file_loader->file;
-            file.seek(lt.file_off, SEEK_SET);
-            file.read_raw(lt.data, lt.size);
-        }
-
-        if (0) {
-            print_checksum(lt);
-        }
-    }
-
-    static void print_checksum(gguf_load_tensor & lt) {
-        uint32_t sum = 0;
-        for (size_t i = 0; i < lt.size; i++) {
-            uint8_t byte = lt.data[i];
-            sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
-        }
-        fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
-                llama_format_tensor_shape(lt.ne).c_str(), lt.size);
-    }
-
 };

 //

@@ -1185,18 +1149,18 @@ int64_t llama_time_us() {
 }

 //
-// model loading
+// load LLaMA models
 //

-static const char *gguf_file_version_name(gguf_file_version version) {
+static const char * gguf_file_version_name(gguf_file_version version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
     }

     return "unknown";
 }

-static const char *llama_ftype_name(enum llama_ftype ftype) {
+static const char * llama_ftype_name(enum llama_ftype ftype) {
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32: return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";

@@ -1207,8 +1171,9 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
+
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";

@@ -1216,15 +1181,16 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
-        default: return "unknown, may not work";
+
+        default: return "unknown, may not work";
     }
 }

-static const char *llama_model_type_name(e_model type) {
+static const char * llama_model_type_name(e_model type) {
     switch (type) {
         case MODEL_3B: return "3B";
         case MODEL_7B: return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
         case MODEL_65B: return "65B";

@@ -1605,7 +1571,6 @@ static struct ggml_cgraph * llama_build_graph(
     const int64_t n_embd_head = hparams.n_embd_head();
     const int64_t n_embd_gqa = hparams.n_embd_gqa();

-
     GGML_ASSERT(n_embd_head == hparams.n_rot);

     const float freq_base = hparams.rope_freq_base;

@@ -1714,7 +1679,7 @@ static struct ggml_cgraph * llama_build_graph(

         struct ggml_tensor * inpSA = inpL;

-        lctx.use_buf(ctx0, 0);
+        llama_context::use_buf(ctx0, 0);

         // norm
         {

@@ -1853,7 +1818,7 @@ static struct ggml_cgraph * llama_build_graph(
             ggml_set_name(cur, "result_wo");
         }

-        lctx.use_buf(ctx0, 1);
+        llama_context::use_buf(ctx0, 1);

         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
         offload_func(inpFF);

@@ -1909,7 +1874,7 @@ static struct ggml_cgraph * llama_build_graph(
         inpL = cur;
     }

-    lctx.use_buf(ctx0, 0);
+    llama_context::use_buf(ctx0, 0);

     // norm
     {

@@ -1927,7 +1892,7 @@ static struct ggml_cgraph * llama_build_graph(
     cur = ggml_mul_mat(ctx0, model.output, cur);
     ggml_set_name(cur, "result_output");

-    lctx.use_buf(ctx0, -1);
+    llama_context::use_buf(ctx0, -1);

     // logits -> probs
     //cur = ggml_soft_max_inplace(ctx0, cur);

@@ -2997,9 +2962,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         }
     }

-    const auto rejects =
-        llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
-    for (auto & reject : rejects) {
+    const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
+    for (const auto & reject : rejects) {
         candidates->data[reject.index].logit = -INFINITY;
     }

@@ -3726,7 +3690,7 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        const llama_model_quantize_params *params) {
+        const llama_model_quantize_params * params) {
     try {
         llama_model_quantize_internal(fname_inp, fname_out, params);
         return 0;

@@ -4344,8 +4308,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
     GGML_UNUSED(n_token_capacity);
     GGML_UNUSED(n_token_count_out);

-
     // TODO: implement with GGUF format
     return true;
 }

@@ -4390,7 +4353,6 @@ int llama_eval(
     return 0;
 }

-
 int llama_eval_embd(
         struct llama_context * ctx,
         const float * embd,
gguf-util.h: 12 changed lines

@@ -122,9 +122,10 @@ struct gguf_file {

     template<typename T>
     void write_val(const std::string & key, enum gguf_type type, const T & val) {
+        static_assert(std::is_fundamental<T>::value, "T must be a primitive type");
         write_str(key);
         fwrite((const char *) &type, sizeof(type), 1, fp);
         fwrite((const char *) &val, sizeof(val), 1, fp);
     }

     template<typename T>
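The static_assert added above pairs with the gguf_file_saver change earlier in this commit, where file.write_val<std::string>(...) became file.write_str(...): fwrite(&val, sizeof(val), 1, fp) serializes the raw object bytes, which is correct for fundamental types but would dump a std::string's internal pointers instead of its characters. A small illustrative sketch of the distinction (hypothetical helper names, not the actual gguf_file API):

    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <type_traits>

    // Fine for int32_t, float, bool, ...: the object representation is the value.
    template <typename T>
    void write_pod(std::FILE * fp, const T & val) {
        static_assert(std::is_fundamental<T>::value, "T must be a primitive type");
        fwrite(&val, sizeof(val), 1, fp);
    }

    // Strings must be written as an explicit length followed by the character
    // data; writing the std::string object itself would serialize its pointers.
    void write_string(std::FILE * fp, const std::string & val) {
        const int32_t n = (int32_t) val.size();
        fwrite(&n, sizeof(n), 1, fp);
        fwrite(val.data(), 1, n, fp);
    }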
@@ -137,7 +138,7 @@ struct gguf_file {

         const int32_t n = val.size();
         fwrite((const char *) &type, sizeof(type), 1, fp);
         fwrite((const char *) &n, sizeof(n), 1, fp);
         fwrite(val.data(), sizeof(T), n, fp);
     }

@@ -159,7 +160,7 @@ struct gguf_file {

         const int32_t n = val.size();
         fwrite((const char *) &type, sizeof(type), 1, fp);
         fwrite((const char *) &n, sizeof(n), 1, fp);
         for (int i = 0; i < n; ++i) {
             const int32_t nstr = val[i].size();
             fwrite((const char *) &nstr, sizeof(nstr), 1, fp);

@@ -265,7 +266,7 @@ struct gguf_mmap {
 #elif defined(_WIN32)
     static constexpr bool SUPPORTED = true;

-    gguf_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
+    gguf_mmap(struct gguf_file * file, bool prefetch = true, bool numa = false) {
         (void) numa;

         size = file->size;

@@ -312,7 +313,8 @@ struct gguf_mmap {
 #else
     static constexpr bool SUPPORTED = false;

-    gguf_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
+    gguf_mmap(struct gguf_file * file, bool prefetch = true, bool numa = false) {
+        (void) file;
         (void) prefetch;
         (void) numa;