gguf : add KV constant maps

commit 3c025a6d07
parent 3057d6a687

2 changed files with 272 additions and 146 deletions

gguf.py (26 changed lines)
@@ -28,12 +28,12 @@ KEY_GENERAL_SOURCE_URL = "general.source.url"
 KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
 
 # LLM
-KEY_LLM_CONTEXT_LENGTH = "{arch}.context_length"
-KEY_LLM_EMBEDDING_LENGTH = "{arch}.embedding_length"
-KEY_LLM_BLOCK_COUNT = "{arch}.block_count"
-KEY_LLM_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
-KEY_LLM_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
-KEY_LLM_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
+KEY_CONTEXT_LENGTH = "{arch}.context_length"
+KEY_EMBEDDING_LENGTH = "{arch}.embedding_length"
+KEY_BLOCK_COUNT = "{arch}.block_count"
+KEY_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
+KEY_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
+KEY_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
 
 # attention
 KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count"
@@ -581,7 +581,7 @@ class GGUFWriter:
         self.add_string(KEY_GENERAL_AUTHOR, author)
 
     def add_tensor_data_layout(self, layout: str):
-        self.add_string(KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
+        self.add_string(KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
 
     def add_url(self, url: str):
         self.add_string(KEY_GENERAL_URL, url)
@@ -608,27 +608,27 @@ class GGUFWriter:
 
     def add_context_length(self, length: int):
         self.add_uint32(
-            KEY_LLM_CONTEXT_LENGTH.format(arch=self.arch), length)
+            KEY_CONTEXT_LENGTH.format(arch=self.arch), length)
 
     def add_embedding_length(self, length: int):
         self.add_uint32(
-            KEY_LLM_EMBEDDING_LENGTH.format(arch=self.arch), length)
+            KEY_EMBEDDING_LENGTH.format(arch=self.arch), length)
 
     def add_block_count(self, length: int):
         self.add_uint32(
-            KEY_LLM_BLOCK_COUNT.format(arch=self.arch), length)
+            KEY_BLOCK_COUNT.format(arch=self.arch), length)
 
     def add_feed_forward_length(self, length: int):
         self.add_uint32(
-            KEY_LLM_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
+            KEY_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
 
     def add_parallel_residual(self, use: bool):
         self.add_bool(
-            KEY_LLM_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
+            KEY_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
 
     def add_tensor_data_layout(self, layout: str):
         self.add_string(
-            KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
+            KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
 
     def add_head_count(self, count: int):
         self.add_uint32(
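A note on the gguf.py rename above: the constants keep their "{arch}"-template values, and the add_* methods fill in the architecture at write time via str.format. A minimal standalone sketch of how the keys resolve (illustration only, not code from the diff; the "llama" value is just an example):

# illustration only: how the renamed "{arch}" key templates resolve
KEY_CONTEXT_LENGTH   = "{arch}.context_length"
KEY_EMBEDDING_LENGTH = "{arch}.embedding_length"

arch = "llama"  # example architecture name
print(KEY_CONTEXT_LENGTH.format(arch=arch))    # -> llama.context_length
print(KEY_EMBEDDING_LENGTH.format(arch=arch))  # -> llama.embedding_length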

llama.cpp (392 changed lines)
@@ -143,6 +143,111 @@ enum llm_arch {
     LLM_ARCH_UNKNOWN,
 };
 
+static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
+    { LLM_ARCH_LLAMA,   "llama"   },
+    { LLM_ARCH_FALCON,  "falcon"  },
+    { LLM_ARCH_GPT2,    "gpt2"    },
+    { LLM_ARCH_GPTJ,    "gptj"    },
+    { LLM_ARCH_GPTNEOX, "gptneox" },
+    { LLM_ARCH_MPT,     "mpt"     },
+};
+
+enum llm_kv {
+    LLM_KV_GENERAL_ARCHITECTURE,
+    LLM_KV_GENERAL_QUANTIZATION_VERSION,
+    LLM_KV_GENERAL_ALIGNMENT,
+    LLM_KV_GENERAL_NAME,
+    LLM_KV_GENERAL_AUTHOR,
+    LLM_KV_GENERAL_URL,
+    LLM_KV_GENERAL_DESCRIPTION,
+    LLM_KV_GENERAL_LICENSE,
+    LLM_KV_GENERAL_SOURCE_URL,
+    LLM_KV_GENERAL_SOURCE_HF_REPO,
+
+    LLM_KV_CONTEXT_LENGTH,
+    LLM_KV_EMBEDDING_LENGTH,
+    LLM_KV_BLOCK_COUNT,
+    LLM_KV_FEED_FORWARD_LENGTH,
+    LLM_KV_USE_PARALLEL_RESIDUAL,
+    LLM_KV_TENSOR_DATA_LAYOUT,
+
+    LLM_KV_ATTENTION_HEAD_COUNT,
+    LLM_KV_ATTENTION_HEAD_COUNT_KV,
+    LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
+    LLM_KV_ATTENTION_CLAMP_KQV,
+    LLM_KV_ATTENTION_LAYERNORM_EPS,
+    LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+
+    LLM_KV_ROPE_DIMENSION_COUNT,
+    LLM_KV_ROPE_SCALE_LINEAR,
+
+    LLM_KV_TOKENIZER_MODEL,
+    LLM_KV_TOKENIZER_LIST,
+    LLM_KV_TOKENIZER_TOKEN_TYPE,
+    LLM_KV_TOKENIZER_SCORES,
+    LLM_KV_TOKENIZER_MERGES,
+    LLM_KV_TOKENIZER_BOS_ID,
+    LLM_KV_TOKENIZER_EOS_ID,
+    LLM_KV_TOKENIZER_UNK_ID,
+    LLM_KV_TOKENIZER_SEP_ID,
+    LLM_KV_TOKENIZER_PAD_ID,
+    LLM_KV_TOKENIZER_HF_JSON,
+    LLM_KV_TOKENIZER_RWKV,
+};
+
+static std::map<llm_kv, std::string> LLM_KV_NAMES = {
+    { LLM_KV_GENERAL_ARCHITECTURE,         "general.architecture"         },
+    { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
+    { LLM_KV_GENERAL_ALIGNMENT,            "general.alignment"            },
+    { LLM_KV_GENERAL_NAME,                 "general.name"                 },
+    { LLM_KV_GENERAL_AUTHOR,               "general.author"               },
+    { LLM_KV_GENERAL_URL,                  "general.url"                  },
+    { LLM_KV_GENERAL_DESCRIPTION,          "general.description"          },
+    { LLM_KV_GENERAL_LICENSE,              "general.license"              },
+    { LLM_KV_GENERAL_SOURCE_URL,           "general.source_url"           },
+    { LLM_KV_GENERAL_SOURCE_HF_REPO,       "general.source_hf_repo"       },
+
+    { LLM_KV_CONTEXT_LENGTH,               "%s.context_length"        },
+    { LLM_KV_EMBEDDING_LENGTH,             "%s.embedding_length"      },
+    { LLM_KV_BLOCK_COUNT,                  "%s.block_count"           },
+    { LLM_KV_FEED_FORWARD_LENGTH,          "%s.feed_forward_length"   },
+    { LLM_KV_USE_PARALLEL_RESIDUAL,        "%s.use_parallel_residual" },
+    { LLM_KV_TENSOR_DATA_LAYOUT,           "%s.tensor_data_layout"    },
+
+    { LLM_KV_ATTENTION_HEAD_COUNT,         "%s.attention.head_count"             },
+    { LLM_KV_ATTENTION_HEAD_COUNT_KV,      "%s.attention.head_count_kv"          },
+    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,     "%s.attention.max_alibi_bias"         },
+    { LLM_KV_ATTENTION_CLAMP_KQV,          "%s.attention.clamp_kqv"              },
+    { LLM_KV_ATTENTION_LAYERNORM_EPS,      "%s.attention.layer_norm_epsilon"     },
+    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,  "%s.attention.layer_norm_rms_epsilon" },
+
+    { LLM_KV_ROPE_DIMENSION_COUNT,         "%s.rope.dimension_count" },
+    { LLM_KV_ROPE_SCALE_LINEAR,            "%s.rope.scale_linear"    },
+
+    { LLM_KV_TOKENIZER_MODEL,              "tokenizer.ggml.model"              },
+    { LLM_KV_TOKENIZER_LIST,               "tokenizer.ggml.tokens"             },
+    { LLM_KV_TOKENIZER_TOKEN_TYPE,         "tokenizer.ggml.token_type"         },
+    { LLM_KV_TOKENIZER_SCORES,             "tokenizer.ggml.scores"             },
+    { LLM_KV_TOKENIZER_MERGES,             "tokenizer.ggml.merges"             },
+    { LLM_KV_TOKENIZER_BOS_ID,             "tokenizer.ggml.bos_token_id"       },
+    { LLM_KV_TOKENIZER_EOS_ID,             "tokenizer.ggml.eos_token_id"       },
+    { LLM_KV_TOKENIZER_UNK_ID,             "tokenizer.ggml.unknown_token_id"   },
+    { LLM_KV_TOKENIZER_SEP_ID,             "tokenizer.ggml.seperator_token_id" },
+    { LLM_KV_TOKENIZER_PAD_ID,             "tokenizer.ggml.padding_token_id"   },
+    { LLM_KV_TOKENIZER_HF_JSON,            "tokenizer.huggingface.json"        },
+    { LLM_KV_TOKENIZER_RWKV,               "tokenizer.rwkv.world"              },
+};
+
+struct LLM_KV {
+    LLM_KV(llm_arch arch) : arch(arch) {}
+
+    llm_arch arch;
+
+    std::string operator()(llm_kv kv) const {
+        return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str());
+    }
+};
+
 enum llm_tensor {
     LLM_TENSOR_TOKEN_EMBD,
     LLM_TENSOR_POS_EMBD,
@@ -163,16 +268,7 @@ enum llm_tensor {
     LLM_TENSOR_FFN_NORM,
 };
 
-static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA,   "llama"   },
-    { LLM_ARCH_FALCON,  "falcon"  },
-    { LLM_ARCH_GPT2,    "gpt2"    },
-    { LLM_ARCH_GPTJ,    "gptj"    },
-    { LLM_ARCH_GPTNEOX, "gptneox" },
-    { LLM_ARCH_MPT,     "mpt"     },
-};
-
-static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES_BASE = {
+static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
     {
         LLM_ARCH_LLAMA,
         {
@@ -221,46 +317,49 @@ static llm_arch llm_arch_from_string(const std::string & name) {
 // helper to handle gguf constants
 // usage:
 //
-//   const auto llm = LLM<LLM_ARCH_LLAMA>();
+//   const auto tn = LLM_TN(LLM_ARCH_LLAMA);
 //
-//   std::string name = LLM(LLM_TENSOR_OUTPUT);                     -> "output"
-//   std::string name = LLM(LLM_TENSOR_TOKEN_EMBD, "bias");         -> "token_embd.bias"
-//   std::string name = LLM(LLM_TENSOR_ATTN_NORM, "weight", 3);     -> "blk.3.attn_norm.weight"
+//   std::string name = tn(LLM_TENSOR_OUTPUT);                     -> "output"
+//   std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias");         -> "token_embd.bias"
+//   std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3);     -> "blk.3.attn_norm.weight"
 //
-template <enum llm_arch T>
-struct LLM {
+struct LLM_TN {
+    LLM_TN(llm_arch arch) : arch(arch) {}
 
+    llm_arch arch;
+
     std::string operator()(llm_tensor tensor) const {
-        return LLM_TENSOR_NAMES_BASE[T].at(tensor);
+        return LLM_TENSOR_NAMES[arch].at(tensor);
     }
 
     std::string operator()(llm_tensor tensor, const std::string & suffix) const {
-        return LLM_TENSOR_NAMES_BASE[T].at(tensor) + "." + suffix;
+        return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
     }
 
     std::string operator()(llm_tensor tensor, int bid) const {
-        return format(LLM_TENSOR_NAMES_BASE[T].at(tensor).c_str(), bid);
+        return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
     }
 
     std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
-        return format(LLM_TENSOR_NAMES_BASE[T].at(tensor).c_str(), bid) + "." + suffix;
+        return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
     }
 };
 
 //
 // gguf helpers
 //
 
 #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
 { \
-    const int kid = gguf_find_key(ctx, key); \
+    const std::string skey(key); \
+    const int kid = gguf_find_key(ctx, skey.c_str()); \
     if (kid >= 0) { \
         enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
         if (ktype != (type)) { \
-            throw std::runtime_error(format("key %s has wrong type: %s", key, gguf_type_name(ktype))); \
+            throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \
         } \
         (dst) = func(ctx, kid); \
     } else if (req) { \
-        throw std::runtime_error(format("key not found in model: %s", key)); \
+        throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
     } \
 }
@@ -1361,26 +1460,117 @@ static const char * llama_model_type_name(e_model type) {
     }
 }
 
+static llm_arch llm_load_arch(llama_model_loader & ml) {
+    struct gguf_context * ctx = ml.ctx_gguf;
+
+    const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
+
+    std::string arch_name;
+    GGUF_GET_KEY(ctx, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_GENERAL_ARCHITECTURE));
+
+    const llm_arch arch = llm_arch_from_string(arch_name);
+    if (arch == LLM_ARCH_UNKNOWN) {
+        throw std::runtime_error("unknown model architecture: '" + arch_name + "'");
+    }
+
+    return arch;
+}
+
+static void llm_load_hparams(
+        llm_arch arch,
+        llama_model_loader & ml,
+        llama_model & model,
+        int n_ctx,
+        float rope_freq_base,
+        float rope_freq_scale) {
+    auto & hparams = model.hparams;
+
+    struct gguf_context * ctx = ml.ctx_gguf;
+
+    const auto kv = LLM_KV(arch);
+
+    // get general kv
+    GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
+    GGUF_GET_KEY(ctx, model.arch, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE));
+
+    // get hparams kv
+    GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST));
+    GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
+    GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
+    GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
+    GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
+    GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
+    GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ROPE_DIMENSION_COUNT));
+    GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+
+    // n_head_kv is optional, default to n_head
+    hparams.n_head_kv = hparams.n_head;
+    GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
+
+    // TODO: manually setting rope scale should override this
+    // rope_freq_scale (inverse of the kv) is optional
+    float ropescale = 1.0f;
+    GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
+    if (ropescale != 1.0f) {
+        rope_freq_scale = 1.0f/ropescale;
+    }
+
+    // TODO: generalize to non-LLaMA models
+    switch (hparams.n_layer) {
+        case 26: model.type = e_model::MODEL_3B; break;
+        case 32: model.type = e_model::MODEL_7B; break;
+        case 40: model.type = e_model::MODEL_13B; break;
+        case 60: model.type = e_model::MODEL_30B; break;
+        case 80: model.type = e_model::MODEL_65B; break;
+        default:
+            {
+                if (hparams.n_layer < 32) {
+                    model.type = e_model::MODEL_7B;
+                }
+            } break;
+    }
+
+    // LLaMAv2
+    // TODO: probably not needed
+    {
+        const auto n_gqa = hparams.n_gqa();
+
+        if (model.type == e_model::MODEL_65B && n_gqa == 8) {
+            LLAMA_LOG_WARN("%s: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
+            model.type = e_model::MODEL_70B;
+        }
+    }
+
+    model.ftype = ml.ftype;
+
+    hparams.n_ctx = n_ctx;
+    hparams.rope_freq_base = rope_freq_base;
+    hparams.rope_freq_scale = rope_freq_scale;
+}
+
 static void llm_load_vocab(
+        llm_arch arch,
         llama_model_loader & ml,
         llama_model & model) {
     auto & vocab = model.vocab;
 
     struct gguf_context * ctx = ml.ctx_gguf;
 
-    const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens");
+    const auto kv = LLM_KV(arch);
+
+    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
     if (token_idx == -1) {
         throw std::runtime_error("cannot find tokenizer vocab in model file\n");
     }
 
-    const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores");
+    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
     if (score_idx == -1) {
         throw std::runtime_error("cannot find tokenizer scores in model file\n");
     }
 
     const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
 
-    const int toktype_idx = gguf_find_key(ctx, "tokenizer.ggml.token_type");
+    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
    if (toktype_idx == -1) {
         throw std::runtime_error("cannot find token type list in GGUF file\n");
     }
@@ -1390,7 +1580,7 @@ static void llm_load_vocab(
     // determine vocab type
     {
         std::string tokenizer_name;
-        GGUF_GET_KEY(ctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, "tokenizer.ggml.model");
+        GGUF_GET_KEY(ctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL));
 
         if (tokenizer_name == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
@@ -1424,80 +1614,11 @@ static void llm_load_vocab(
     }
 
     // special tokens
-    GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.bos_token_id");
-    GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.eos_token_id");
-    GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.unknown_token_id");
-    GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.separator_token_id");
-    GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.padding_token_id");
+    GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
+    GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID));
+    GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID));
+    GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID));
+    GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID));
-}
-
-static void llm_load_hparams(
-        llama_model_loader & ml,
-        llama_model & model,
-        int n_ctx,
-        float rope_freq_base,
-        float rope_freq_scale) {
-    auto & hparams = model.hparams;
-
-    struct gguf_context * ctx = ml.ctx_gguf;
-
-    // get hparams kv
-    GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, "tokenizer.ggml.tokens");
-    GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.context_length");
-    GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.embedding_length");
-    GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.feed_forward_length");
-    GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.attention.head_count");
-    GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.block_count");
-    GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.rope.dimension_count");
-    GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, "llama.attention.layer_norm_rms_epsilon");
-
-    // n_head_kv is optional, default to n_head
-    hparams.n_head_kv = hparams.n_head;
-    GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "llama.attention.head_count_kv");
-
-    // TODO: manually setting rope scale should override this
-    // rope_freq_scale (inverse of the kv) is optional
-    float ropescale = 1.0f;
-    GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, "llama.rope.scale_linear");
-    if (ropescale != 1.0f) {
-        rope_freq_scale = 1.0f/ropescale;
-    }
-
-    // get general kv
-    GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.name");
-    GGUF_GET_KEY(ctx, model.arch, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.architecture");
-
-    switch (hparams.n_layer) {
-        case 26: model.type = e_model::MODEL_3B; break;
-        case 32: model.type = e_model::MODEL_7B; break;
-        case 40: model.type = e_model::MODEL_13B; break;
-        case 60: model.type = e_model::MODEL_30B; break;
-        case 80: model.type = e_model::MODEL_65B; break;
-        default:
-            {
-                if (hparams.n_layer < 32) {
-                    model.type = e_model::MODEL_7B;
-                }
-            } break;
-    }
-
-    model.ftype = ml.ftype;
-
-    hparams.n_ctx = n_ctx;
-
-    // LLaMAv2
-    // TODO: probably not needed
-    {
-        const auto n_gqa = hparams.n_gqa();
-
-        if (model.type == e_model::MODEL_65B && n_gqa == 8) {
-            LLAMA_LOG_WARN("%s: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
-            model.type = e_model::MODEL_70B;
-        }
-    }
-
-    hparams.rope_freq_base = rope_freq_base;
-    hparams.rope_freq_scale = rope_freq_scale;
 }
 
 static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
@@ -1537,7 +1658,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }
 
-static void llama_model_load_internal(
+static void llm_load_llama(
         llama_model_loader & ml,
         llama_model & model,
         int n_batch,
@@ -1609,9 +1730,9 @@ static void llama_model_load_internal(
     const uint32_t n_layer = hparams.n_layer;
     const uint32_t n_vocab = hparams.n_vocab;
 
-    const auto llm = LLM<LLM_ARCH_LLAMA>();
+    const auto tn = LLM_TN(LLM_ARCH_LLAMA);
 
-    model.tok_embeddings = ml.create_tensor(ctx, llm(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
 
     // "output" tensor
     {
@@ -1632,8 +1753,8 @@ static void llama_model_load_internal(
             backend_output = GGML_BACKEND_CPU;
         }
 
-        model.norm   = ml.create_tensor(ctx, llm(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
-        model.output = ml.create_tensor(ctx, llm(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+        model.norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+        model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
         if (backend_norm == GGML_BACKEND_GPU) {
             vram_weights += ggml_nbytes(model.norm);
         }
@@ -1652,18 +1773,18 @@ static void llama_model_load_internal(
             const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
             auto & layer = model.layers[i];
-            layer.attention_norm = ml.create_tensor(ctx, llm(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+            layer.attention_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
 
-            layer.wq = ml.create_tensor(ctx, llm(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
-            layer.wk = ml.create_tensor(ctx, llm(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
-            layer.wv = ml.create_tensor(ctx, llm(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
-            layer.wo = ml.create_tensor(ctx, llm(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+            layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+            layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+            layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+            layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
 
-            layer.ffn_norm = ml.create_tensor(ctx, llm(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+            layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
 
-            layer.w1 = ml.create_tensor(ctx, llm(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
-            layer.w2 = ml.create_tensor(ctx, llm(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
-            layer.w3 = ml.create_tensor(ctx, llm(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+            layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+            layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+            layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
 
             if (backend == GGML_BACKEND_GPU) {
                 vram_weights +=
@@ -1783,16 +1904,10 @@ static bool llama_model_load(
     try {
         std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
 
-        std::string arch_name;
-        GGUF_GET_KEY(ml->ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, true, "general.architecture");
-
-        const llm_arch arch = llm_arch_from_string(arch_name);
-        if (arch == LLM_ARCH_UNKNOWN) {
-            throw std::runtime_error("unknown model architecture: '" + arch_name + "'");
-        }
-
-        llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale);
-        llm_load_vocab(*ml, model);
+        const llm_arch arch = llm_load_arch(*ml);
+
+        llm_load_hparams(arch, *ml, model, n_ctx, rope_freq_base, rope_freq_scale);
+        llm_load_vocab  (arch, *ml, model);
 
         llm_load_print_meta(*ml, model);
 
@@ -1805,14 +1920,25 @@ static bool llama_model_load(
             return true;
         }
 
-        llama_model_load_internal(*ml, model, n_batch, n_gpu_layers,
-                main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
-                use_mlock, progress_callback, progress_callback_user_data);
-        return true;
+        switch (arch) {
+            case LLM_ARCH_LLAMA:
+                {
+                    llm_load_llama(*ml, model, n_batch, n_gpu_layers,
+                            main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
+                            use_mlock, progress_callback, progress_callback_user_data);
+                } break;
+            case LLM_ARCH_FALCON:
+                {
+                }
+            default:
+                throw std::runtime_error("unsupported architecture");
+        };
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
         return false;
     }
+
+    return true;
 }
 
 static struct ggml_cgraph * llama_build_graph(
@@ -3674,9 +3800,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
         // TODO: avoid hardcoded tensor names - use the TN_* constants
-        const auto llm = LLM<LLM_ARCH_LLAMA>();
+        const auto tn = LLM_TN(LLM_ARCH_LLAMA);
 
-        if (name == llm(LLM_TENSOR_OUTPUT, "weight")) {
+        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
             int nx = tensor->ne[0];
             int ny = tensor->ne[1];
             if (nx % QK_K == 0 && ny % QK_K == 0) {
@@ -3712,10 +3838,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
         }
         if (convert_incompatible_tensor) {
-            if (name == llm(LLM_TENSOR_OUTPUT, "weight")) {
+            if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
                 new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
                 LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-            } else if (name == llm(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+            } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
                 new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
                 LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
             } else {
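The C++ side mirrors the gguf.py idea: LLM_KV_NAMES stores printf-style "%s." templates, LLM_ARCH_NAMES supplies the architecture string, and the LLM_KV/LLM_TN helpers glue them together so the loader no longer hardcodes "llama."-prefixed keys. Below is a standalone miniature of that pattern (illustration only, not llama.cpp code: it covers a single architecture and two keys, and substitutes snprintf for the ::format helper used in the diff):

// Miniature of the KV constant-map pattern from the diff (illustration only).
#include <cstdio>
#include <map>
#include <string>

enum llm_arch { LLM_ARCH_LLAMA };
enum llm_kv   { LLM_KV_CONTEXT_LENGTH, LLM_KV_ATTENTION_HEAD_COUNT };

static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
    { LLM_ARCH_LLAMA, "llama" },
};

static std::map<llm_kv, std::string> LLM_KV_NAMES = {
    { LLM_KV_CONTEXT_LENGTH,       "%s.context_length"       },
    { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
};

// Fill the "%s" template with the architecture name, as LLM_KV::operator() does.
static std::string kv_name(llm_arch arch, llm_kv kv) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str());
    return buf;
}

int main() {
    std::printf("%s\n", kv_name(LLM_ARCH_LLAMA, LLM_KV_CONTEXT_LENGTH).c_str());       // llama.context_length
    std::printf("%s\n", kv_name(LLM_ARCH_LLAMA, LLM_KV_ATTENTION_HEAD_COUNT).c_str()); // llama.attention.head_count
    return 0;
}

In the actual change, llm_load_arch() reads general.architecture first, and the later GGUF_GET_KEY calls in llm_load_hparams/llm_load_vocab go through kv(...), so the same key-reading code can serve any architecture listed in LLM_ARCH_NAMES; LLM_TN plays the same role for tensor names (e.g. tn(LLM_TENSOR_ATTN_NORM, "weight", 3) -> "blk.3.attn_norm.weight", per the usage comment in the diff).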