llama : fix llm_type enum names
ggml-ci
parent fffa6b15c4
commit a48412f92b
5 changed files with 381 additions and 382 deletions
@@ -35,7 +35,7 @@
 // TODO: consider moving to llama-impl.h if needed in more places
 #if defined(_WIN32)
-std::string llama_format_win_err(DWORD err) {
+static std::string llama_format_win_err(DWORD err) {
     LPSTR buf;
     size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
                                  NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
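The hunk above only adds `static`, giving the helper internal linkage so the symbol stays private to its translation unit. For reference, a standalone, Windows-only sketch of the same FormatMessageA pattern (illustrative names, not the commit's code): the API allocates the message buffer, which must then be released with LocalFree.

    #include <windows.h>

    #include <cstdio>
    #include <string>

    // Illustrative helper (not the commit's code): format a Win32 error code as text.
    static std::string format_win_err(DWORD err) {
        LPSTR buf = nullptr;
        size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
                                     NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
        if (!size) {
            return "FormatMessageA failed";
        }
        std::string msg(buf, size);
        LocalFree(buf); // the buffer was allocated by FormatMessageA
        return msg;
    }

    int main() {
        std::printf("%s\n", format_win_err(ERROR_FILE_NOT_FOUND).c_str());
    }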
File diff suppressed because it is too large
@@ -13,72 +13,71 @@
 struct llama_model_loader;
 
 // available models
-// TODO: this enum does not follow the enum naming convention
 enum llm_type {
-    MODEL_UNKNOWN,
+    LLM_TYPE_UNKNOWN,
-    MODEL_14M,
+    LLM_TYPE_14M,
-    MODEL_17M,
+    LLM_TYPE_17M,
-    MODEL_22M,
+    LLM_TYPE_22M,
-    MODEL_33M,
+    LLM_TYPE_33M,
-    MODEL_60M,
+    LLM_TYPE_60M,
-    MODEL_70M,
+    LLM_TYPE_70M,
-    MODEL_80M,
+    LLM_TYPE_80M,
-    MODEL_109M,
+    LLM_TYPE_109M,
-    MODEL_137M,
+    LLM_TYPE_137M,
-    MODEL_160M,
+    LLM_TYPE_160M,
-    MODEL_220M,
+    LLM_TYPE_220M,
-    MODEL_250M,
+    LLM_TYPE_250M,
-    MODEL_270M,
+    LLM_TYPE_270M,
-    MODEL_335M,
+    LLM_TYPE_335M,
-    MODEL_410M,
+    LLM_TYPE_410M,
-    MODEL_450M,
+    LLM_TYPE_450M,
-    MODEL_770M,
+    LLM_TYPE_770M,
-    MODEL_780M,
+    LLM_TYPE_780M,
-    MODEL_0_5B,
+    LLM_TYPE_0_5B,
-    MODEL_1B,
+    LLM_TYPE_1B,
-    MODEL_1_3B,
+    LLM_TYPE_1_3B,
-    MODEL_1_4B,
+    LLM_TYPE_1_4B,
-    MODEL_1_5B,
+    LLM_TYPE_1_5B,
-    MODEL_1_6B,
+    LLM_TYPE_1_6B,
-    MODEL_2B,
+    LLM_TYPE_2B,
-    MODEL_2_8B,
+    LLM_TYPE_2_8B,
-    MODEL_3B,
+    LLM_TYPE_3B,
-    MODEL_4B,
+    LLM_TYPE_4B,
-    MODEL_6B,
+    LLM_TYPE_6B,
-    MODEL_6_9B,
+    LLM_TYPE_6_9B,
-    MODEL_7B,
+    LLM_TYPE_7B,
-    MODEL_8B,
+    LLM_TYPE_8B,
-    MODEL_9B,
+    LLM_TYPE_9B,
-    MODEL_11B,
+    LLM_TYPE_11B,
-    MODEL_12B,
+    LLM_TYPE_12B,
-    MODEL_13B,
+    LLM_TYPE_13B,
-    MODEL_14B,
+    LLM_TYPE_14B,
-    MODEL_15B,
+    LLM_TYPE_15B,
-    MODEL_16B,
+    LLM_TYPE_16B,
-    MODEL_20B,
+    LLM_TYPE_20B,
-    MODEL_30B,
+    LLM_TYPE_30B,
-    MODEL_32B,
+    LLM_TYPE_32B,
-    MODEL_34B,
+    LLM_TYPE_34B,
-    MODEL_35B,
+    LLM_TYPE_35B,
-    MODEL_40B,
+    LLM_TYPE_40B,
-    MODEL_65B,
+    LLM_TYPE_65B,
-    MODEL_70B,
+    LLM_TYPE_70B,
-    MODEL_236B,
+    LLM_TYPE_236B,
-    MODEL_314B,
+    LLM_TYPE_314B,
-    MODEL_671B,
+    LLM_TYPE_671B,
-    MODEL_SMALL,
+    LLM_TYPE_SMALL,
-    MODEL_MEDIUM,
+    LLM_TYPE_MEDIUM,
-    MODEL_LARGE,
+    LLM_TYPE_LARGE,
-    MODEL_XL,
+    LLM_TYPE_XL,
-    MODEL_A1_7B,
+    LLM_TYPE_A1_7B,
-    MODEL_A2_7B,
+    LLM_TYPE_A2_7B,
-    MODEL_8x7B,
+    LLM_TYPE_8x7B,
-    MODEL_8x22B,
+    LLM_TYPE_8x22B,
-    MODEL_16x12B,
+    LLM_TYPE_16x12B,
-    MODEL_16x3_8B,
+    LLM_TYPE_16x3_8B,
-    MODEL_10B_128x3_66B,
+    LLM_TYPE_10B_128x3_66B,
-    MODEL_57B_A14B,
+    LLM_TYPE_57B_A14B,
-    MODEL_27B,
+    LLM_TYPE_27B,
 };
 
 struct llama_layer_posnet {
@@ -284,7 +283,7 @@ struct llama_layer {
 };
 
 struct llama_model {
-    llm_type type = MODEL_UNKNOWN;
+    llm_type type = LLM_TYPE_UNKNOWN;
     llm_arch arch = LLM_ARCH_UNKNOWN;
 
     std::string name = "n/a";
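With the rename, call sites spell the type with the same prefix style the other llama.cpp enums already use (compare LLM_ARCH_UNKNOWN above). A minimal, self-contained sketch of such a call site, using a trimmed copy of the enum and a hypothetical helper name that is not part of the commit:

    #include <cstdio>

    // Trimmed copy of the renamed enum, just enough for the sketch.
    enum llm_type {
        LLM_TYPE_UNKNOWN,
        LLM_TYPE_7B,
        LLM_TYPE_13B,
        LLM_TYPE_70B,
    };

    // Hypothetical helper (illustrative only): map an llm_type to a label.
    static const char * llm_type_label(llm_type type) {
        switch (type) {
            case LLM_TYPE_7B:  return "7B";
            case LLM_TYPE_13B: return "13B";
            case LLM_TYPE_70B: return "70B";
            default:           return "unknown";
        }
    }

    int main() {
        std::printf("model type: %s\n", llm_type_label(LLM_TYPE_70B));
    }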
@@ -235,7 +235,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
     else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
             use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
     else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-    if (qs.model.type == MODEL_70B) {
+    if (qs.model.type == LLM_TYPE_70B) {
         // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
         // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
         // nearly negligible increase in model size by quantizing this tensor with more bits:
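The comments in this hunk explain the special case: with grouped-query attention, several query heads share one set of V projections, so attn_v.weight is roughly 8x smaller than attn_q.weight and can afford a higher-bit quant type almost for free. A standalone back-of-the-envelope check, assuming the commonly cited LLaMA-2 70B shape (8192 hidden, 64 query heads, 8 KV heads), which this diff does not state:

    #include <cstdio>

    int main() {
        // Assumed LLaMA-2 70B hyperparameters (illustrative only).
        const long long n_embd    = 8192; // hidden size
        const long long n_head    = 64;   // query heads
        const long long n_head_kv = 8;    // KV heads shared via grouped-query attention
        const long long head_dim  = n_embd / n_head; // 128

        const long long q_params = n_embd * n_head    * head_dim; // attn_q.weight: 8192 x 8192
        const long long v_params = n_embd * n_head_kv * head_dim; // attn_v.weight: 8192 x 1024

        // attn_v carries ~8x fewer parameters, so bumping it from e.g. Q5_K to
        // Q6_K changes the total model size very little.
        std::printf("attn_q: %lld params, attn_v: %lld params, ratio: %lldx\n",
                    q_params, v_params, q_params / v_params);
    }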
@@ -1809,7 +1809,7 @@ struct llm_build_context {
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = model.type == MODEL_7B ? build_inp_pos() : nullptr;
+        struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
@@ -1834,7 +1834,7 @@ struct llm_build_context {
                 cb(Vcur, "Vcur", il);
 
                 switch (model.type) {
-                    case MODEL_7B:
+                    case LLM_TYPE_7B:
                         Qcur = ggml_rope_ext(
                             ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                             n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -1846,7 +1846,7 @@ struct llm_build_context {
                             ext_factor, attn_factor, beta_fast, beta_slow
                         );
                         break;
-                    case MODEL_13B:
+                    case LLM_TYPE_13B:
                         Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens);
                         Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens);
                         break;
@@ -4889,9 +4889,9 @@ struct llm_build_context {
 
                 // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
                 switch (model.type) {
-                    case llm_type::MODEL_2B:
-                    case llm_type::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
-                    case llm_type::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
+                    case LLM_TYPE_2B:
+                    case LLM_TYPE_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
+                    case LLM_TYPE_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
                     default: GGML_ABORT("fatal error");
                 };
                 cb(Qcur, "Qcur_scaled", il);
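In this hunk the query scaling depends on the model size: the 2B and 9B cases scale Q by 1/sqrt(n_embd_head_k), while the 27B case scales by 1/sqrt(n_embd / n_head), following the referenced gemma_pytorch change. A small sketch of why the two expressions differ, assuming the commonly reported Gemma-2 27B shape (n_embd = 4608, n_head = 32, head_dim = 128), which this diff does not state:

    #include <cmath>
    #include <cstdio>

    int main() {
        // Assumed Gemma-2 27B shape (illustrative only).
        const int n_embd        = 4608;
        const int n_head        = 32;
        const int n_embd_head_k = 128;  // per-head K/Q dimension

        // 2B/9B path: classic attention scaling by 1/sqrt(head_dim).
        const float scale_head_dim = 1.0f / std::sqrt((float) n_embd_head_k);

        // 27B path: 1/sqrt(n_embd / n_head) = 1/sqrt(144), which differs because
        // n_head * head_dim (32 * 128 = 4096) is not equal to n_embd (4608).
        const float scale_27b = 1.0f / std::sqrt((float)(n_embd / n_head));

        std::printf("1/sqrt(head_dim) = %.4f, 1/sqrt(n_embd/n_head) = %.4f\n",
                    scale_head_dim, scale_27b);
    }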