chore: Fix import path, token comparisons, and update token type references

commit aa28cfe6ec
parent 9dbc9571a3

1 changed file with 14 additions and 12 deletions
@@ -31,7 +31,7 @@ if TYPE_CHECKING:
     from torch import Tensor
 
 if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+    sys.path.insert(1, str(Path('gguf-py')))
 import gguf
 
 from convert import LlamaHfVocab
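
Note on the hunk above: the two forms resolve the gguf-py path differently. A minimal sketch of the difference, assuming the converter script sits in the repository root next to gguf-py (variable names below are illustrative, not part of the commit):

    # Illustrative only: how each expression resolves at runtime.
    from pathlib import Path

    script_relative = Path(__file__).parent / 'gguf-py'  # old: anchored to the script's own directory
    cwd_relative = Path('gguf-py')                        # new: resolved against the current working directory

    print(script_relative.resolve())
    print(cwd_relative.resolve())

With the new form, the script has to be launched from the directory that contains gguf-py, since a relative sys.path entry is resolved against the current working directory.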
@@ -375,10 +375,13 @@ class Model:
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
         vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
-        assert tokenizer.vocab_size == vocab_size
+        assert max(tokenizer.vocab.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()
 
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
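
Note on the assertion change: the new check tolerates a padded vocab, where the model's configured vocab_size is larger than the number of tokens the tokenizer actually defines. A toy sketch (numbers made up; the old comparison is approximated with len(vocab) since no real HF tokenizer is involved):

    # Toy illustration of the relaxed check; vocab and vocab_size are hypothetical.
    vocab = {"a": 0, "b": 1, "<added>": 2}   # token -> id, as in tokenizer.vocab
    vocab_size = 8                           # padded size from the model's hparams

    old_ok = len(vocab) == vocab_size            # exact-match style check: False here
    new_ok = max(vocab.values()) < vocab_size    # every id fits in the padded table: True

    print(old_ok, new_ok)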
@@ -393,7 +396,6 @@ class Model:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.NORMAL)
 
-        tokpre = self.get_vocab_base_pre(tokenizer)
         return tokens, toktypes, tokpre
 
     # NOTE: this function is generated by convert-hf-to-gguf-update.py
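
For readers unfamiliar with the loop these two hunks touch: ids missing from reverse_vocab are filled with [PADi] placeholders so that exactly vocab_size tokens are returned. A standalone sketch with a made-up vocabulary:

    # Toy sketch of the PAD-filling pattern; the vocabulary here is invented.
    vocab = {"hello": 0, "world": 2}                         # note the gap at id 1
    vocab_size = 4
    reverse_vocab = {id_: tok for tok, id_ in vocab.items()}

    tokens = [reverse_vocab.get(i, f"[PAD{i}]") for i in range(vocab_size)]
    print(tokens)  # ['hello', '[PAD1]', 'world', '[PAD3]']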
@@ -2439,7 +2441,7 @@ class ArcticModel(Model):
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [gguf.TokenType.UNKNOWN] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
 
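
This hunk and the ones that follow replace SentencePieceTokenTypes with gguf.TokenType. A small sanity-check sketch, assuming the vendored gguf-py package is importable and its TokenType enum carries the member names used in this file (worth verifying against the gguf-py version in the tree):

    # Check that gguf.TokenType provides every member the converter relies on.
    # Assumes gguf-py is on sys.path, as arranged in the first hunk.
    import gguf

    for name in ("NORMAL", "UNKNOWN", "CONTROL", "USER_DEFINED", "UNUSED", "BYTE"):
        assert hasattr(gguf.TokenType, name), f"gguf.TokenType is missing {name}"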
@@ -2447,15 +2449,15 @@ class ArcticModel(Model):
             text = piece.encode("utf-8")
             score = tokenizer.GetScore(token_id)
 
-            toktype = SentencePieceTokenTypes.NORMAL
+            toktype = gguf.TokenType.NORMAL
             if tokenizer.IsUnknown(token_id):
-                toktype = SentencePieceTokenTypes.UNKNOWN
+                toktype = gguf.TokenType.UNKNOWN
             elif tokenizer.IsControl(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
+                toktype = gguf.TokenType.CONTROL
             elif tokenizer.IsUnused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
+                toktype = gguf.TokenType.UNUSED
             elif tokenizer.IsByte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
+                toktype = gguf.TokenType.BYTE
 
             tokens[token_id] = text
             scores[token_id] = score
@@ -2477,16 +2479,16 @@ class ArcticModel(Model):
                             continue
 
                         token_content = token_json["content"]
-                        token_type = SentencePieceTokenTypes.USER_DEFINED
+                        token_type = gguf.TokenType.USER_DEFINED
                         token_score = -10000.0
 
                         # Map unk_token to UNKNOWN, other special tokens to CONTROL
                         # Set the score to 0.0 as in the original tokenizer.model
                         if ("special" in token_json) and token_json["special"]:
                             if token_content == tokenizer_config_json["unk_token"]:
-                                token_type = SentencePieceTokenTypes.UNKNOWN
+                                token_type = gguf.TokenType.UNKNOWN
                             else:
-                                token_type = SentencePieceTokenTypes.CONTROL
+                                token_type = gguf.TokenType.CONTROL
                             token_score = 0.0
 
                         logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
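
The final hunk keeps the classification logic for added tokens and only swaps the enum. An isolated sketch of that decision with hypothetical JSON (the token and config below are made up, not taken from any model):

    # Hypothetical inputs: one special added token that happens to be the unk_token.
    import gguf

    token_json = {"content": "<unk>", "special": True}
    tokenizer_config_json = {"unk_token": "<unk>"}

    token_content = token_json["content"]
    token_type = gguf.TokenType.USER_DEFINED       # default for added tokens
    token_score = -10000.0
    if ("special" in token_json) and token_json["special"]:
        if token_content == tokenizer_config_json["unk_token"]:
            token_type = gguf.TokenType.UNKNOWN    # the unk_token becomes UNKNOWN
        else:
            token_type = gguf.TokenType.CONTROL    # other special tokens become CONTROL
        token_score = 0.0                          # score 0.0, as in the original tokenizer.model

    print(token_type.name, token_score)            # UNKNOWN 0.0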