convert : upgrade to sentencepiece v0.2.0
parent 0d720acb91
commit c33775bcc7
2 changed files with 13 additions and 9 deletions
convert.py (20 changes)
@@ -281,6 +281,7 @@ class Params:
         n_experts = None
         n_experts_used = None
         f_rope_freq_base = None
+        n_ff = None

         # hack to determine LLaMA v1 vs v2 vs CodeLlama
         if config.get("moe"):
@@ -305,6 +306,8 @@ class Params:
             n_experts_used = config["moe"]["num_experts_per_tok"]
             f_rope_freq_base = 1e6

+        assert n_ff is not None
+
         return Params(
             n_vocab = model["tok_embeddings.weight"].shape[0],
             n_embd = config["dim"],
@@ -459,7 +462,8 @@ class SentencePieceVocab(Vocab):
             # not found in alternate location either
             raise FileNotFoundError('Cannot find tokenizer.model')

-        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        self.sentencepiece_tokenizer = SentencePieceProcessor()
+        self.sentencepiece_tokenizer.LoadFromFile(fname_tokenizer)
         vocab_size = self.sentencepiece_tokenizer.vocab_size()

         new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
@@ -479,23 +483,23 @@ class SentencePieceVocab(Vocab):
     def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.sentencepiece_tokenizer
         for i in range(tokenizer.vocab_size()):
-            piece = tokenizer.id_to_piece(i)
+            piece = tokenizer.IdToPiece(i)
             text = piece.encode("utf-8")
-            score: float = tokenizer.get_score(i)
+            score: float = tokenizer.GetScore(i)

             toktype = gguf.TokenType.NORMAL
-            if tokenizer.is_unknown(i):
+            if tokenizer.IsUnknown(i):
                 toktype = gguf.TokenType.UNKNOWN
-            if tokenizer.is_control(i):
+            if tokenizer.IsControl(i):
                 toktype = gguf.TokenType.CONTROL

             # NOTE: I think added_tokens are user defined.
             # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
             # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED

-            if tokenizer.is_unused(i):
+            if tokenizer.IsUnused(i):
                 toktype = gguf.TokenType.UNUSED
-            if tokenizer.is_byte(i):
+            if tokenizer.IsByte(i):
                 toktype = gguf.TokenType.BYTE

             yield text, score, toktype
@@ -904,7 +908,7 @@ class LazyUnpickler(pickle.Unpickler):
     def rebuild_from_type_v2(func, new_type, args, state):
         return func(*args)

-    CLASSES = {
+    CLASSES: dict[tuple[str, str], type[LazyTensor] | LazyStorageKind] = {
         # getattr used here as a workaround for mypy not being smart enough to determine
         # the staticmethods have a __func__ attribute.
         ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
(second changed file: Python requirements)

@@ -1,5 +1,5 @@
 numpy~=1.24.4
-sentencepiece~=0.1.98
+sentencepiece~=0.2.0
 transformers>=4.35.2,<5.0.0
 gguf>=0.1.0
 protobuf>=4.21.0,<5.0.0
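For reference, a minimal standalone sketch of the sentencepiece calls the updated convert.py relies on; the "tokenizer.model" path is a placeholder, and the script simply prints each vocabulary entry rather than building GGUF metadata:

# Minimal sketch, assuming sentencepiece~=0.2.0 is installed and a SentencePiece
# model file exists at the placeholder path "tokenizer.model".
from sentencepiece import SentencePieceProcessor

sp = SentencePieceProcessor()
sp.LoadFromFile("tokenizer.model")   # construct empty, then load, as in the updated __init__

for i in range(sp.vocab_size()):
    piece = sp.IdToPiece(i)          # CamelCase accessors, as used in sentencepiece_tokens()
    score = sp.GetScore(i)
    kind = "NORMAL"
    if sp.IsUnknown(i):
        kind = "UNKNOWN"
    if sp.IsControl(i):
        kind = "CONTROL"
    if sp.IsUnused(i):
        kind = "UNUSED"
    if sp.IsByte(i):
        kind = "BYTE"
    print(i, piece, score, kind)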