Merge branch 'master' into xsn/fix_lora
This commit is contained in:
commit
e68344cb06
79 changed files with 3369 additions and 411 deletions
|
@ -265,7 +265,7 @@ class Model:
|
|||
break
|
||||
|
||||
for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
|
||||
data: np.ndarray = data # type hint
|
||||
data: np.ndarray # type hint
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
data_qtype: gguf.GGMLQuantizationType | None = None
|
||||
|
@ -602,10 +602,6 @@ class Model:
|
|||
|
||||
tokenizer_path = self.dir_model / 'tokenizer.model'
|
||||
|
||||
tokens: list[bytes] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
if not tokenizer_path.is_file():
|
||||
raise FileNotFoundError(f"File not found: {tokenizer_path}")
|
||||
|
||||
|
@ -1363,7 +1359,7 @@ class LlamaModel(Model):
|
|||
|
||||
def set_vocab(self):
|
||||
try:
|
||||
self. _set_vocab_sentencepiece()
|
||||
self._set_vocab_sentencepiece()
|
||||
except FileNotFoundError:
|
||||
try:
|
||||
self._set_vocab_llama_hf()
|
||||
|
@ -2127,7 +2123,7 @@ class InternLM2Model(Model):
|
|||
logger.error(f'Error: Missing {tokenizer_path}')
|
||||
sys.exit(1)
|
||||
|
||||
sentencepiece_model = model.ModelProto()
|
||||
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
|
||||
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
|
||||
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
|
||||
|
||||
|
@ -2155,6 +2151,9 @@ class InternLM2Model(Model):
|
|||
toktype = SentencePieceTokenTypes.UNUSED
|
||||
elif tokenizer.IsByte(token_id):
|
||||
toktype = SentencePieceTokenTypes.BYTE
|
||||
# take care of ununsed raw token
|
||||
if piece.startswith('[UNUSED'):
|
||||
toktype = SentencePieceTokenTypes.UNKNOWN
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
|
@ -2170,6 +2169,47 @@ class InternLM2Model(Model):
|
|||
scores.append(-1000.0)
|
||||
toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
|
||||
|
||||
chat_eos_token = '<|im_end|>'
|
||||
chat_eos_token_id = None
|
||||
|
||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||
if tokenizer_config_file.is_file():
|
||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_config_json = json.load(f)
|
||||
added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
|
||||
for token_id, foken_data in added_tokens_decoder.items():
|
||||
token_id = int(token_id)
|
||||
token = foken_data["content"]
|
||||
if token == chat_eos_token:
|
||||
chat_eos_token_id = token_id
|
||||
token = token.encode("utf-8")
|
||||
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
|
||||
assert(tokens[token_id] == token)
|
||||
tokens[token_id] = token
|
||||
scores[token_id] = -1000.0
|
||||
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
||||
if foken_data.get("special"):
|
||||
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
||||
|
||||
tokenizer_file = self.dir_model / 'tokenizer.json'
|
||||
if tokenizer_file.is_file():
|
||||
with open(tokenizer_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
added_tokens = tokenizer_json.get("added_tokens", [])
|
||||
for foken_data in added_tokens:
|
||||
token_id = int(foken_data["id"])
|
||||
token = foken_data["content"]
|
||||
if token == chat_eos_token:
|
||||
chat_eos_token_id = token_id
|
||||
token = token.encode("utf-8")
|
||||
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
|
||||
assert(tokens[token_id] == token)
|
||||
tokens[token_id] = token
|
||||
scores[token_id] = -1000.0
|
||||
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
||||
if foken_data.get("special"):
|
||||
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
|
@ -2179,28 +2219,16 @@ class InternLM2Model(Model):
|
|||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||
old_eos = special_vocab.special_token_ids["eos"]
|
||||
if "chat" in os.path.basename(self.dir_model.absolute()):
|
||||
if chat_eos_token_id is not None:
|
||||
# For the chat model, we replace the eos with '<|im_end|>'.
|
||||
# TODO: this is a hack, should be fixed
|
||||
# https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
|
||||
special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
|
||||
logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
|
||||
in chat mode so that the conversation can end normally.")
|
||||
special_vocab.special_token_ids["eos"] = chat_eos_token_id
|
||||
logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
|
||||
" in chat mode so that the conversation can end normally.")
|
||||
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _try_get_sft_eos(self, tokenizer):
|
||||
unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
|
||||
im_end_list = tokenizer.Encode('<|im_end|>')
|
||||
eos_token = None
|
||||
assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
|
||||
if len(unused_145_list) == 1:
|
||||
eos_token = unused_145_list[0]
|
||||
if len(im_end_list) == 1:
|
||||
eos_token = im_end_list[0]
|
||||
assert eos_token
|
||||
return eos_token
|
||||
|
||||
def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
|
||||
if n_head_kv is not None and n_head != n_head_kv:
|
||||
n_head = n_head_kv
|
||||
|
@ -2219,6 +2247,10 @@ in chat mode so that the conversation can end normally.")
|
|||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
num_heads = self.hparams["num_attention_heads"]
|
||||
|
@ -2979,16 +3011,16 @@ class T5Model(Model):
|
|||
if not tokenizer_path.is_file():
|
||||
raise FileNotFoundError(f"File not found: {tokenizer_path}")
|
||||
|
||||
sentencepiece_model = model.ModelProto()
|
||||
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
|
||||
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
|
||||
|
||||
# some models like Pile-T5 family use BPE tokenizer instead of Unigram
|
||||
if sentencepiece_model.trainer_spec.model_type == 2: # BPE
|
||||
if sentencepiece_model.trainer_spec.model_type == 2: # BPE
|
||||
# assure the tokenizer model file name is correct
|
||||
assert tokenizer_path.name == 'tokenizer.model'
|
||||
return self._set_vocab_sentencepiece()
|
||||
else:
|
||||
assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
|
||||
assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
|
||||
|
||||
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
|
||||
remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
|
||||
|
@ -3159,7 +3191,7 @@ class JaisModel(Model):
|
|||
# but Jais's PyTorch model simply precalculates the slope values and places them
|
||||
# in relative_pes.slopes
|
||||
n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
|
||||
first_val = float(data_torch._data[0])
|
||||
first_val = float(data_torch[0].item())
|
||||
self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
|
||||
|
||||
return tensors
|
||||
|
@ -3193,7 +3225,7 @@ class ChatGLMModel(Model):
|
|||
def set_vocab_chatglm3(self):
|
||||
dir_model = self.dir_model
|
||||
hparams = self.hparams
|
||||
tokens: list[bytearray] = []
|
||||
tokens: list[bytes] = []
|
||||
toktypes: list[int] = []
|
||||
scores: list[float] = []
|
||||
|
||||
|
@ -3342,7 +3374,7 @@ class ChatGLMModel(Model):
|
|||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name(self.hparams.get("_name_or_path").split("/")[1]) # THUDM/glm4-9b-chat or THUDM/chatglm3-6b
|
||||
self.gguf_writer.add_name(self.hparams["_name_or_path"].split("/")[1]) # THUDM/glm4-9b-chat or THUDM/chatglm3-6b
|
||||
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
|
||||
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
|
||||
n_head_kv = self.hparams.get("multi_query_group_num", n_head)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue