Adapt token embeddings and output.weight to vocab size
Due to the huge increase in the size of the token embeddings and output.weight for models with a huge vocabulary, these tensors seem to quantize with less loss, so they are mapped to a type one step lower when n_vocab >= 127999.
parent 17b71512a6
commit 4ba561808d
1 changed file with 28 additions and 5 deletions
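For scale, a minimal sketch of the sizes involved, assuming illustrative Llama-3-style dimensions (n_vocab = 128256, just above the 127999 threshold, and n_embd = 8192; both are assumptions, not taken from this commit) and the nominal ggml bits-per-weight of Q6_K (6.5625) and Q5_K (5.5). It only shows why one quantization step matters on these tensors:

    #include <cstdint>
    #include <cstdio>

    // How much one quantization step saves on the output tensor of a
    // large-vocab model. Dimensions are hypothetical (Llama-3-style);
    // bits-per-weight are the nominal ggml values for these types.
    int main() {
        const int64_t n_vocab = 128256;
        const int64_t n_embd  = 8192;
        const int64_t n_elem  = n_vocab * n_embd;  // ~1.05e9 weights

        const double bpw_q6_k = 6.5625;            // previous choice
        const double bpw_q5_k = 5.5;               // new choice when n_vocab >= 127999

        printf("output.weight @ Q6_K: %.0f MiB\n", n_elem * bpw_q6_k / 8 / (1024.0 * 1024.0));
        printf("output.weight @ Q5_K: %.0f MiB\n", n_elem * bpw_q5_k / 8 / (1024.0 * 1024.0));
        // => ~822 MiB vs ~689 MiB: one step saves ~133 MiB on this tensor alone.
        return 0;
    }

The same arithmetic applies to the token embeddings, which have the same n_vocab x n_embd shape.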
@@ -15877,15 +15877,23 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
         new_type = GGML_TYPE_Q8_0;
     }
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
-             ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
-        new_type = GGML_TYPE_Q4_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+        if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ4_XS;
+        else new_type = GGML_TYPE_Q4_K;
+    }
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+        if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q4_K;
+        else new_type = GGML_TYPE_Q5_K;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
              ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
              ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
         new_type = GGML_TYPE_Q5_K;
     }
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+        if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q5_K;
+        else new_type = GGML_TYPE_Q6_K;
+    }
     else if (new_type != GGML_TYPE_Q8_0) {
         new_type = GGML_TYPE_Q6_K;
     }
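Read as a plain mapping, the selection in this hunk (the output-tensor branch of llama_tensor_get_type) becomes the following. This is a sketch with string stand-ins for the LLAMA_FTYPE_/GGML_TYPE_ enums, not code from the repository, and it assumes the Falcon/QK_K special case (Q8_0) was already handled:

    #include <cstdint>
    #include <string>

    // Effective output-tensor mapping after this commit (sketch).
    std::string output_weight_type(const std::string & ftype, int64_t n_vocab) {
        const bool huge_vocab = n_vocab >= 127999;
        if (ftype == "IQ1_XS" || ftype == "IQ1_S" || ftype == "IQ1_M")
            return huge_vocab ? "IQ4_XS" : "Q4_K";
        if (ftype == "IQ1_XL" || ftype == "IQ2_XXS" || ftype == "IQ2_XS")
            return huge_vocab ? "Q4_K" : "Q5_K";
        if (ftype == "IQ3_XXS" || ftype == "IQ2_XL" || ftype == "IQ2_S" || ftype == "IQ2_M" ||
            ftype == "Q2_K_S"  || ftype == "Q2_K"   || ftype == "Q2_K_L")
            return "Q5_K";                       // unchanged by this commit
        if (ftype == "IQ3_XS" || ftype == "IQ3_S" || ftype == "IQ3_M")
            return huge_vocab ? "Q5_K" : "Q6_K"; // branch added by this commit
        return "Q6_K";                           // fallback (unless already Q8_0)
    }

In every huge-vocab branch the chosen type drops by one step (Q6_K -> Q5_K, Q5_K -> Q4_K, Q4_K -> IQ4_XS), trading a little output-layer precision for a large absolute size saving.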
@@ -15894,10 +15902,25 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
             new_type = qs.params->token_embedding_type;
         } else {
-            if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
-                ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+                if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ2_XS;
+                else new_type = GGML_TYPE_IQ2_S;
+            }
+            if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
                 new_type = GGML_TYPE_IQ2_S;
             }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
+                if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ2_S;
+                else new_type = GGML_TYPE_IQ3_XXS;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+                if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_XXS;
+                else new_type = GGML_TYPE_IQ3_S;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
+                if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_S;
+                else new_type = GGML_TYPE_IQ4_XS;
+            }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
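The same rule applied to the token embeddings, again as a sketch with string stand-ins for the enums rather than repository code:

    #include <cstdint>
    #include <string>

    // Effective token-embedding mapping after this commit (sketch).
    // Note the first block is a plain `if`, so the `else if` chain
    // that follows starts at the IQ1_XL branch, as in the hunk above.
    std::string token_embd_type(const std::string & ftype, int64_t n_vocab) {
        const bool huge_vocab = n_vocab >= 127999;  // e.g. Llama 3 (n_vocab = 128256)
        std::string t;
        if (ftype == "IQ1_XS" || ftype == "IQ1_S" || ftype == "IQ1_M")
            t = huge_vocab ? "IQ2_XS" : "IQ2_S";
        if (ftype == "IQ1_XL" || ftype == "IQ2_XXS" || ftype == "IQ2_XS")
            t = "IQ2_S";
        else if (ftype == "IQ2_S" || ftype == "IQ2_M" || ftype == "IQ2_XL")
            t = huge_vocab ? "IQ2_S" : "IQ3_XXS";
        else if (ftype == "IQ3_XXS" || ftype == "IQ3_XS")
            t = huge_vocab ? "IQ3_XXS" : "IQ3_S";
        else if (ftype == "IQ3_XL")
            t = huge_vocab ? "IQ3_S" : "IQ4_XS";
        return t;
    }

One observation: after this change the retained single-line branches at the end of the chain (IQ2_S/IQ2_M, IQ2_XL and IQ3_XXS) appear unreachable, since the new else-if branches above them already match those ftypes; the sketch therefore omits them.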