llama: add support for small granite models
It works only for the small models, 3b and 8b. The convert-hf-to-gguf.py script uses the vocabulary size of the granite models to detect granite and set the correct configuration.

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
parent 06748ff338
commit b974e9fcfb
2 changed files with 11 additions and 3 deletions
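The converter detects granite purely from the vocabulary size. Below is a minimal, self-contained Python sketch of that check; the stub writer class is illustrative only (the real script uses gguf.GGUFWriter), while the 49152 and 32000 values come straight from the diff that follows.

class StubGGUFWriter:
    # Stand-in for gguf.GGUFWriter; models only the one call made in the diff.
    def add_add_bos_token(self, value: bool) -> None:
        print(f"tokenizer.ggml.add_bos_token = {value}")

def detect_granite(hparams: dict) -> bool:
    # Granite small models ship a 49152-entry vocabulary; stock llama
    # checkpoints default to 32000, so the size alone is the signal.
    return hparams.get("vocab_size", 32000) == 49152

writer = StubGGUFWriter()
if detect_granite({"vocab_size": 49152}):
    writer.add_add_bos_token(False)  # granite does not expect a leading BOS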
convert-hf-to-gguf.py
@@ -1322,6 +1322,10 @@ class LlamaModel(Model):
         if "add_prefix_space" in tokenizer_config_json:
             self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
 
+        # Apply to granite small models only
+        if self.hparams.get("vocab_size", 32000) == 49152:
+            self.gguf_writer.add_add_bos_token(False)
+
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
@@ -1336,9 +1340,9 @@ class LlamaModel(Model):
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
 
-        if name.endswith("q_proj.weight"):
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith("k_proj.weight"):
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 
         # process the experts separately
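The permute helper these branches call appears only as a signature in the context lines above. The body below is a reconstructed sketch of the head-interleaving reshape used by llama.cpp's convert scripts, not part of this diff; treat it as illustrative.

from torch import Tensor

def permute(weights: Tensor, n_head: int, n_head_kv: int | None) -> Tensor:
    # For grouped-query attention, K projections are sized by the KV head count.
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    # Split dim 0 into (head, half, rot) and swap so the two rotary halves of
    # each head are interleaved the way the loader expects.
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))

Note that the reshape keys only off dim 0, so the same helper handles the 1-D *.bias tensors this hunk starts routing through it, not just the 2-D weights; that is what makes the q_proj.bias / k_proj.bias extension a two-token change.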
llama.cpp
@@ -3982,7 +3982,9 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 22: model.type = e_model::MODEL_1B; break;
                     case 26: model.type = e_model::MODEL_3B; break;
-                    case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
+                    // granite uses a vocab with len 49152
+                    case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
+                    case 36: model.type = e_model::MODEL_8B; break; // granite
                     case 40: model.type = e_model::MODEL_13B; break;
                     case 48: model.type = e_model::MODEL_34B; break;
                     case 60: model.type = e_model::MODEL_30B; break;
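The 32-layer case is now overloaded: llama 7B, llama 8B, and granite 3B all have 32 layers, so the vocabulary size breaks the tie. A small Python mirror of the updated mapping (the function name and dict form are illustrative; the constants come from the hunk above):

def model_type(n_layer: int, n_vocab: int) -> str:
    # Mirrors the C++ switch: the layer count picks the size class, and for
    # the ambiguous 32-layer case the vocab size separates granite 3B (49152)
    # from llama models (n_vocab < 40000 matches the stock 32000 vocab).
    if n_layer == 32:
        if n_vocab == 49152:
            return "3B"   # granite
        return "7B" if n_vocab < 40000 else "8B"
    if n_layer == 36:
        return "8B"       # granite
    return {22: "1B", 26: "3B", 40: "13B", 48: "34B", 60: "30B"}.get(n_layer, "UNKNOWN")

assert model_type(32, 49152) == "3B"   # granite small model
assert model_type(32, 32000) == "7B"   # stock llama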
@@ -4252,6 +4254,8 @@ static void llm_load_hparams(
                     case 30: model.type = e_model::MODEL_3B; break;
                     case 32: model.type = e_model::MODEL_7B; break;
                     case 40: model.type = e_model::MODEL_15B; break;
+                    case 52: model.type = e_model::MODEL_20B; break; // granite
+                    case 88: model.type = e_model::MODEL_34B; break; // granite
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;