chore: Fix import path, token comparisons, and update token type references

parent 9dbc9571a3
commit aa28cfe6ec

1 changed file with 14 additions and 12 deletions
@@ -31,7 +31,7 @@ if TYPE_CHECKING:
     from torch import Tensor

 if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+    sys.path.insert(1, str(Path('gguf-py')))
 import gguf

 from convert import LlamaHfVocab
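The two path expressions in this hunk resolve differently at runtime: `Path(__file__).parent / 'gguf-py'` is anchored to the directory containing the script, while the bare `Path('gguf-py')` is resolved against the process's current working directory. A quick sketch of the difference (directory names are illustrative):

```python
# Quick sketch of how the two expressions resolve; directory names are illustrative.
import sys
from pathlib import Path

script_relative = Path(__file__).parent / 'gguf-py'   # located next to this script
cwd_relative = Path('gguf-py')                        # resolved against the current working directory

print(script_relative.resolve())   # e.g. /path/to/repo/gguf-py
print(cwd_relative.resolve())      # e.g. /wherever/python/was/launched/gguf-py

# The CWD-relative form only finds gguf-py when the converter is launched from the repo root.
sys.path.insert(1, str(cwd_relative))
```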
@@ -375,10 +375,13 @@ class Model:
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
         vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
-        assert tokenizer.vocab_size == vocab_size
+        assert max(tokenizer.vocab.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()

         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
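The relaxed comparison above only requires every token id to fit below `vocab_size`, so a tokenizer vocabulary smaller than the hparams value is allowed and the missing ids are filled with `[PAD{i}]` placeholders. A toy illustration of that filling (the vocabulary here is made up):

```python
# Toy illustration of the [PAD{i}] filling; this tiny vocabulary is made up.
vocab = {"a": 0, "b": 1, "c": 2}            # stand-in for tokenizer.vocab
vocab_size = 5                              # stand-in for hparams["vocab_size"]
assert max(vocab.values()) < vocab_size     # 2 < 5, so the check passes

reverse_vocab = {id_: tok for tok, id_ in vocab.items()}
tokens = [reverse_vocab.get(i, f"[PAD{i}]") for i in range(vocab_size)]
print(tokens)                               # ['a', 'b', 'c', '[PAD3]', '[PAD4]']
```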
@@ -393,7 +396,6 @@ class Model:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.NORMAL)

-        tokpre = self.get_vocab_base_pre(tokenizer)
         return tokens, toktypes, tokpre

     # NOTE: this function is generated by convert-hf-to-gguf-update.py
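For orientation, the `(tokens, toktypes, tokpre)` triple returned above typically ends up in the GGUF vocabulary metadata. The glue below is a hypothetical sketch using gguf-py's GGUFWriter helpers, not code from this diff, and the function name is invented:

```python
# Hypothetical glue (not from this diff): write the vocab triple into GGUF metadata.
import gguf

def write_bpe_vocab(writer: gguf.GGUFWriter, tokens: list[str], toktypes: list[int], tokpre: str) -> None:
    writer.add_tokenizer_model("gpt2")   # BPE-style tokenizer
    writer.add_tokenizer_pre(tokpre)     # pre-tokenizer identifier returned with the vocab
    writer.add_token_list(tokens)
    writer.add_token_types(toktypes)
```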
@@ -2439,7 +2441,7 @@ class ArcticModel(Model):

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [gguf.TokenType.UNKNOWN] * vocab_size

         for token_id in range(tokenizer.vocab_size()):

@@ -2447,15 +2449,15 @@ class ArcticModel(Model):
             text = piece.encode("utf-8")
             score = tokenizer.GetScore(token_id)

-            toktype = SentencePieceTokenTypes.NORMAL
+            toktype = gguf.TokenType.NORMAL
             if tokenizer.IsUnknown(token_id):
-                toktype = SentencePieceTokenTypes.UNKNOWN
+                toktype = gguf.TokenType.UNKNOWN
             elif tokenizer.IsControl(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
+                toktype = gguf.TokenType.CONTROL
             elif tokenizer.IsUnused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
+                toktype = gguf.TokenType.UNUSED
             elif tokenizer.IsByte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
+                toktype = gguf.TokenType.BYTE

             tokens[token_id] = text
             scores[token_id] = score
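The per-token mapping above can be read as a single function from SentencePiece piece flags to gguf-py token types. A minimal standalone sketch of that classification, assuming a SentencePieceProcessor loaded from a local model file:

```python
# Minimal standalone sketch of the flag -> token-type mapping shown above.
from sentencepiece import SentencePieceProcessor
import gguf

def sp_token_type(tokenizer: SentencePieceProcessor, token_id: int) -> gguf.TokenType:
    if tokenizer.IsUnknown(token_id):
        return gguf.TokenType.UNKNOWN
    if tokenizer.IsControl(token_id):
        return gguf.TokenType.CONTROL
    if tokenizer.IsUnused(token_id):
        return gguf.TokenType.UNUSED
    if tokenizer.IsByte(token_id):
        return gguf.TokenType.BYTE
    return gguf.TokenType.NORMAL

# Usage (model path is illustrative):
# tokenizer = SentencePieceProcessor(model_file="tokenizer.model")
# toktypes = [sp_token_type(tokenizer, i) for i in range(tokenizer.vocab_size())]
```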
@@ -2477,16 +2479,16 @@ class ArcticModel(Model):
                             continue

                         token_content = token_json["content"]
-                        token_type = SentencePieceTokenTypes.USER_DEFINED
+                        token_type = gguf.TokenType.USER_DEFINED
                         token_score = -10000.0

                         # Map unk_token to UNKNOWN, other special tokens to CONTROL
                         # Set the score to 0.0 as in the original tokenizer.model
                         if ("special" in token_json) and token_json["special"]:
                             if token_content == tokenizer_config_json["unk_token"]:
-                                token_type = SentencePieceTokenTypes.UNKNOWN
+                                token_type = gguf.TokenType.UNKNOWN
                             else:
-                                token_type = SentencePieceTokenTypes.CONTROL
+                                token_type = gguf.TokenType.CONTROL
                             token_score = 0.0

                         logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
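The override above consumes Hugging Face's `added_tokens_decoder` entries from `tokenizer_config.json`. A toy excerpt of the shape it expects, with the resulting token types noted (contents invented):

```python
# Toy excerpt of the tokenizer_config.json fields read above; contents are invented.
tokenizer_config_json = {
    "unk_token": "<unk>",
    "added_tokens_decoder": {
        "0":     {"content": "<unk>", "special": True},          # unk_token     -> gguf.TokenType.UNKNOWN, score 0.0
        "32000": {"content": "<|im_start|>", "special": True},   # other special -> gguf.TokenType.CONTROL, score 0.0
        "32001": {"content": "<extra_tok>", "special": False},   # non-special   -> gguf.TokenType.USER_DEFINED, score -10000.0
    },
}
```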
|
Loading…
Add table
Add a link
Reference in a new issue