py : improve BPE tokenizer support (#5189)
commit e76627bcce
parent fbe7dfa53c
1 changed file with 4 additions and 1 deletion
@@ -334,7 +334,10 @@ class Params:
 class BpeVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
-        self.vocab = self.bpe_tokenizer["model"]["vocab"]
+        try:
+            self.vocab = self.bpe_tokenizer["model"]["vocab"]
+        except:
+            self.vocab = self.bpe_tokenizer
         added_tokens: dict[str, int]
         if fname_added_tokens is not None:
             # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
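
For context, the try/except covers the two vocabulary layouts a model repository may ship: a full Hugging Face tokenizer.json, where the token-to-id map sits under model.vocab, and a bare vocab.json that is the mapping itself. Below is a minimal, stand-alone sketch of that lookup, not the patch itself; the function name and file handling here are illustrative, and the bare except: from the patch is narrowed to the specific failure modes for clarity.

import json
from pathlib import Path

def load_bpe_vocab(fname: Path) -> dict[str, int]:
    # Load either layout and return the flat token -> id mapping.
    data = json.loads(fname.read_text(encoding="utf-8"))
    try:
        # Full tokenizer.json layout: {"model": {"vocab": {...}}}
        return data["model"]["vocab"]
    except (KeyError, TypeError):
        # Bare vocab.json layout: the file is already the mapping.
        return data

Passing either a tokenizer.json or a vocab.json yields the same mapping, which is what allows BpeVocab to accept both file styles after this change.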