chore: Fix import path, token comparisons, and update token type references

Author: teleprint-me
Date:   2024-05-28 18:44:57 -04:00
Commit: aa28cfe6ec
Parent: 9dbc9571a3
GPG key ID: B0D11345E65C4D48


@@ -31,7 +31,7 @@ if TYPE_CHECKING:
     from torch import Tensor
 
 if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+    sys.path.insert(1, str(Path('gguf-py')))
 import gguf
 
 from convert import LlamaHfVocab
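Not part of the diff: a minimal sketch contrasting the two path forms, assuming the usual layout where gguf-py sits next to the converter script. The original expression anchors the package to the script's own directory; the replacement resolves against whatever directory the converter is launched from.

import os
import sys
from pathlib import Path

# Sketch only: how the two forms in the hunk above resolve.
script_relative = Path(__file__).parent / 'gguf-py'  # always next to this script
cwd_relative = Path('gguf-py')                       # relative to the caller's working directory

if 'NO_LOCAL_GGUF' not in os.environ:
    # With the cwd-relative form, running the converter from outside the
    # repository root silently falls back to an installed gguf package.
    sys.path.insert(1, str(cwd_relative))

import gguf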
@@ -375,10 +375,13 @@ class Model:
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
         vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
-        assert tokenizer.vocab_size == vocab_size
+        assert max(tokenizer.vocab.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()
 
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
@@ -393,7 +396,6 @@ class Model:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.NORMAL)
 
-        tokpre = self.get_vocab_base_pre(tokenizer)
         return tokens, toktypes, tokpre
 
     # NOTE: this function is generated by convert-hf-to-gguf-update.py
@@ -2439,7 +2441,7 @@ class ArcticModel(Model):
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [gguf.TokenType.UNKNOWN] * vocab_size
 
         for token_id in range(tokenizer.vocab_size()):
 
@@ -2447,15 +2449,15 @@ class ArcticModel(Model):
             text = piece.encode("utf-8")
             score = tokenizer.GetScore(token_id)
 
-            toktype = SentencePieceTokenTypes.NORMAL
+            toktype = gguf.TokenType.NORMAL
             if tokenizer.IsUnknown(token_id):
-                toktype = SentencePieceTokenTypes.UNKNOWN
+                toktype = gguf.TokenType.UNKNOWN
             elif tokenizer.IsControl(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
+                toktype = gguf.TokenType.CONTROL
             elif tokenizer.IsUnused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
+                toktype = gguf.TokenType.UNUSED
             elif tokenizer.IsByte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
+                toktype = gguf.TokenType.BYTE
 
             tokens[token_id] = text
             scores[token_id] = score
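Not part of the diff: the SentencePieceTokenTypes to gguf.TokenType swap assumes the two enums carry the same integer values (NORMAL, UNKNOWN, CONTROL, USER_DEFINED, UNUSED, BYTE). Below is a standalone sketch of the same classification against the sentencepiece API; the tokenizer.model path is hypothetical.

from sentencepiece import SentencePieceProcessor

import gguf


def classify(tokenizer: SentencePieceProcessor, token_id: int) -> gguf.TokenType:
    # Mirrors the if/elif chain in the hunk above.
    if tokenizer.IsUnknown(token_id):
        return gguf.TokenType.UNKNOWN
    if tokenizer.IsControl(token_id):
        return gguf.TokenType.CONTROL
    if tokenizer.IsUnused(token_id):
        return gguf.TokenType.UNUSED
    if tokenizer.IsByte(token_id):
        return gguf.TokenType.BYTE
    return gguf.TokenType.NORMAL


tokenizer = SentencePieceProcessor()
tokenizer.LoadFromFile("tokenizer.model")  # hypothetical path
toktypes = [classify(tokenizer, i) for i in range(tokenizer.vocab_size())]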
@@ -2477,16 +2479,16 @@ class ArcticModel(Model):
                             continue
 
                         token_content = token_json["content"]
-                        token_type = SentencePieceTokenTypes.USER_DEFINED
+                        token_type = gguf.TokenType.USER_DEFINED
                         token_score = -10000.0
 
                         # Map unk_token to UNKNOWN, other special tokens to CONTROL
                         # Set the score to 0.0 as in the original tokenizer.model
                         if ("special" in token_json) and token_json["special"]:
                             if token_content == tokenizer_config_json["unk_token"]:
-                                token_type = SentencePieceTokenTypes.UNKNOWN
+                                token_type = gguf.TokenType.UNKNOWN
                             else:
-                                token_type = SentencePieceTokenTypes.CONTROL
+                                token_type = gguf.TokenType.CONTROL
                             token_score = 0.0
 
                         logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")