move back vocab_size where it was
This commit is contained in:
parent 9eb2b4f39c
commit ff12b8fbd6
1 changed file with 2 additions and 2 deletions
@@ -359,14 +359,14 @@ class BpeVocab:
 class SentencePieceVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
-        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
-
         added_tokens: dict[str, int]
         if fname_added_tokens is not None:
             added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
         else:
             added_tokens = {}

+        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+
         new_tokens: dict[int, str] = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
         expected_new_ids: list[int] = list(range(vocab_size, vocab_size + len(new_tokens)))
         actual_new_ids: list[int] = sorted(new_tokens.keys())
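For context: the hunk only moves where vocab_size is computed, back to after the added_tokens loading. Since vocab_size is first used by the new_tokens comprehension, both placements behave the same; the check the moved line feeds into is unchanged. Below is a minimal runnable sketch of that check, with a plain dict standing in for SentencePieceProcessor; validate_added_tokens and the usage values are hypothetical, not from the commit.

# Sketch of the added-token validation around the moved line; the helper
# name and example values are hypothetical, the logic mirrors the hunk.
def validate_added_tokens(base_vocab_size: int, added_tokens: dict[str, int]) -> dict[int, str]:
    # Added tokens whose IDs fall inside the base vocab are overrides, not new entries.
    new_tokens = {id: piece for piece, id in added_tokens.items() if id >= base_vocab_size}
    # Genuinely new IDs must extend the base vocab contiguously: N, N+1, N+2, ...
    expected_new_ids = list(range(base_vocab_size, base_vocab_size + len(new_tokens)))
    actual_new_ids = sorted(new_tokens.keys())
    if expected_new_ids != actual_new_ids:
        raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
    return new_tokens

# Usage: a base vocab of 3 tokens plus two added tokens at IDs 3 and 4.
print(validate_added_tokens(3, {"<bot>": 3, "<eot>": 4}))  # {3: '<bot>', 4: '<eot>'}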