parent 2e6fd63b29
commit 79852ab884
1 changed file with 89 additions and 78 deletions

convert.py (167 lines changed)
@@ -44,6 +44,9 @@ ARCH = gguf.MODEL_ARCH.LLAMA
 
 DEFAULT_CONCURRENCY = 8
 
+ADDED_TOKENS_FILE = 'added_tokens.json'
+FAST_TOKENIZER_FILE = 'tokenizer.json'
+
 #
 # data types
 #
@@ -367,32 +370,42 @@ class BpeVocab(Vocab):
     tokenizer_model = "gpt2"
     name = "bpe"
 
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None):
-        with open(fname_tokenizer, encoding="utf-8") as f:
-            bpe_tokenizer = json.load(f)
-
-        if isinstance(bpe_tokenizer.get('model'), dict):
-            self.vocab = bpe_tokenizer["model"]["vocab"]
-        else:
-            self.vocab = bpe_tokenizer
-        added_tokens: dict[str, int]
-        if fname_added_tokens is not None:
-            # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
-            with open(fname_added_tokens, encoding="utf-8") as f:
-                added_tokens = json.load(f)
-        else:
-            # Fall back to trying to find the added tokens in tokenizer.json
-            tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
-            if not tokenizer_json_file.is_file():
-                added_tokens = {}
-            else:
-                with open(tokenizer_json_file, encoding="utf-8") as f:
-                    tokenizer_json = json.load(f)
-                added_tokens = dict(
-                    (item['content'], item['id'])
-                    for item in tokenizer_json.get('added_tokens', [])
-                    # Added tokens here can be duplicates of the main vocabulary.
-                    if item['content'] not in self.vocab)
+    def __init__(self, base_path: Path):
+        added_tokens: dict[str, int] = {}
+
+        if (fname_tokenizer := base_path / 'vocab.json').exists():
+            # "slow" tokenizer
+            with open(fname_tokenizer, encoding="utf-8") as f:
+                self.vocab = json.load(f)
+
+            try:
+                # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
+                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
+                    added_tokens = json.load(f)
+            except FileNotFoundError:
+                pass
+        else:
+            # "fast" tokenizer
+            fname_tokenizer = base_path / FAST_TOKENIZER_FILE
+
+            # if this fails, FileNotFoundError propagates to caller
+            with open(fname_tokenizer, encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+
+            tokenizer_model: dict[str, Any] = tokenizer_json['model']
+            if (
+                tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
+                or tokenizer_json['decoder']['type'] != 'ByteLevel'
+            ):
+                raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
+
+            self.vocab = tokenizer_model["vocab"]
+
+            if (added := tokenizer_json.get('added_tokens')) is not None:
+                # Added tokens here can be duplicates of the main vocabulary.
+                added_tokens = {item['content']: item['id']
+                                for item in added
+                                if item['content'] not in self.vocab}
 
         vocab_size = len(self.vocab)
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
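Note (reviewer sketch, not part of the diff): with this change BpeVocab is constructed from the model directory instead of an explicit tokenizer path, and an incompatible or missing tokenizer surfaces as FileNotFoundError. A minimal call site, with a hypothetical local path:

    from pathlib import Path

    model_dir = Path("models/my-gpt2-style-model")  # hypothetical checkout
    try:
        # slow path: vocab.json (+ optional added_tokens.json);
        # fast path: a GPT-2-style tokenizer.json (BPE, no byte_fallback, ByteLevel decoder)
        vocab = BpeVocab(model_dir)
    except FileNotFoundError:
        vocab = None  # directory does not hold a GPT-2 BPE tokenizer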
@@ -432,15 +445,20 @@ class SentencePieceVocab(Vocab):
     tokenizer_model = "llama"
     name = "spm"
 
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None):
-        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
-        added_tokens: dict[str, int]
-        if fname_added_tokens is not None:
-            with open(fname_added_tokens, encoding="utf-8") as f:
-                added_tokens = json.load(f)
-        else:
-            added_tokens = {}
+    def __init__(self, base_path: Path):
+        added_tokens: dict[str, int] = {}
+        if (fname_tokenizer := base_path / 'tokenizer.model').exists():
+            # normal location
+            try:
+                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
+                    added_tokens = json.load(f)
+            except FileNotFoundError:
+                pass
+        elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
+            # not found in alternate location either
+            raise FileNotFoundError('Cannot find tokenizer.model')
 
+        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
         vocab_size = self.sentencepiece_tokenizer.vocab_size()
 
         new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
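Note (reviewer sketch, not part of the diff): SentencePieceVocab now searches two locations for tokenizer.model, while added_tokens.json is only read next to the model itself. A usage sketch with a hypothetical layout:

    from pathlib import Path

    model_dir = Path("models/llama-7b")  # hypothetical
    # looks for model_dir/tokenizer.model first, then model_dir.parent/tokenizer.model;
    # raises FileNotFoundError if neither exists
    spm_vocab = SentencePieceVocab(model_dir)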
@@ -494,27 +512,40 @@ class SentencePieceVocab(Vocab):
         return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
-class HfVocab(Vocab):
+class LlamaHfVocab(Vocab):
     tokenizer_model = "llama"
     name = "hfft"
 
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None):
+    def __init__(self, base_path: Path):
+        fname_tokenizer = base_path / FAST_TOKENIZER_FILE
+        # if this fails, FileNotFoundError propagates to caller
+        with open(fname_tokenizer, encoding='utf-8') as f:
+            tokenizer_json = json.load(f)
+
+        # pre-check so we know if we need transformers
+        tokenizer_model: dict[str, Any] = tokenizer_json['model']
+        if (
+            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
+            or tokenizer_json['decoder']['type'] != 'Sequence'
+        ):
+            raise FileNotFoundError('Cannot find Llama BPE tokenizer')
+
         try:
             from transformers import AutoTokenizer
         except ImportError as e:
             raise ImportError(
-                "To use HfVocab, please install the `transformers` package. "
+                "To use LlamaHfVocab, please install the `transformers` package. "
                 "You can install it with `pip install transformers`."
             ) from e
 
-        print("fname_tokenizer:", fname_tokenizer)
         # Allow the tokenizer to default to slow or fast versions.
         # Explicitly set tokenizer to use local paths.
         self.tokenizer = AutoTokenizer.from_pretrained(
-            fname_tokenizer,
-            cache_dir=fname_tokenizer,
+            base_path,
+            cache_dir=base_path,
             local_files_only=True,
         )
+        assert self.tokenizer.is_fast # assume tokenizer.json is used
 
         # Initialize lists and dictionaries for added tokens
         self.added_tokens_list = []
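Note (reviewer sketch, not part of the diff): the new pre-check inspects tokenizer.json before importing transformers, so a missing or non-Llama tokenizer fails fast. The same check, pulled out into a hypothetical standalone helper:

    import json
    from pathlib import Path

    def looks_like_llama_bpe(model_dir: Path) -> bool:
        # mirrors the constructor's pre-check: BPE model with byte_fallback
        # enabled and a Sequence decoder
        with open(model_dir / "tokenizer.json", encoding="utf-8") as f:
            tok = json.load(f)
        model = tok["model"]
        return (model["type"] == "BPE"
                and model.get("byte_fallback", False)
                and tok["decoder"]["type"] == "Sequence")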
@@ -594,7 +625,7 @@ class HfVocab(Vocab):
         yield from self.added_tokens()
 
     def __repr__(self) -> str:
-        return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+        return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
 #
@@ -1315,32 +1346,10 @@ def load_some_model(path: Path) -> ModelPlus:
 
 
 class VocabFactory:
-    _FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"}
+    _VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, LlamaHfVocab]
 
     def __init__(self, path: Path):
         self.path = path
-        self.file_paths = self._detect_files()
-        print(f"Found vocab files: {self.file_paths}")
-
-    def _detect_files(self) -> dict[str, Path | None]:
-        def locate(file: str) -> Path | None:
-            if (path := self.path / file).exists():
-                return path
-            if (path := self.path.parent / file).exists():
-                return path
-            return None
-
-        return {vt: locate(f) for vt, f in self._FILES.items()}
-
-    def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]:
-        for vtype in vocab_types:
-            try:
-                path = self.file_paths[vtype]
-            except KeyError:
-                raise ValueError(f"Unsupported vocabulary type {vtype}") from None
-            if path is not None:
-                return vtype, path
-        raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
 
     def _create_special_vocab(self, vocab: BaseVocab, model_parent_path: Path) -> gguf.SpecialVocab:
         load_merges = vocab.name == "bpe"
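Note (reviewer sketch, not part of the diff): file detection moves out of VocabFactory and into the vocab classes themselves; the factory now only keeps an ordered class list. A usage sketch with hypothetical paths and a preference order like the one the --vocab-type option would pass:

    from pathlib import Path

    model_path = Path("models/llama-7b")  # hypothetical
    factory = VocabFactory(model_path)
    vocab, special_vocab = factory.load_vocab(["spm", "hfft", "bpe"], model_path.parent)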
@@ -1353,23 +1362,25 @@ class VocabFactory:
         )
 
     def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
-        vocab_type, path = self._select_file(vocab_types)
-        print(f"Loading vocab file {path!r}, type {vocab_type!r}")
+        vocab_classes: dict[str, type[Vocab]] = {cls.name: cls for cls in self._VOCAB_CLASSES}
+        selected_vocabs: dict[str, type[Vocab]] = {}
+        for vtype in vocab_types:
+            try:
+                selected_vocabs[vtype] = vocab_classes[vtype]
+            except KeyError:
+                raise ValueError(f"Unsupported vocabulary type {vtype}") from None
 
-        added_tokens_path = path.parent / "added_tokens.json"
-        if vocab_type == "bpe":
-            return BpeVocab(
-                path, added_tokens_path if added_tokens_path.exists() else None
-            )
-        if vocab_type == "spm":
-            return SentencePieceVocab(
-                path, added_tokens_path if added_tokens_path.exists() else None
-            )
-        if vocab_type == "hfft":
-            return HfVocab(
-                path.parent, added_tokens_path if added_tokens_path.exists() else None
-            )
-        raise ValueError(vocab_type)
+        for vtype, cls in selected_vocabs.items():
+            try:
+                vocab = cls(self.path)
+                break
+            except FileNotFoundError:
+                pass  # ignore unavailable tokenizers
+        else:
+            raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
+
+        print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
+        return vocab
 
     def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
         vocab: BaseVocab
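Note (reviewer sketch, not part of the diff): the fallback in _create_vocab_by_path relies on Python's for/else, where the else block runs only if the loop finished without a break, i.e. every constructor raised FileNotFoundError:

    # generic illustration of for/else, unrelated to any specific tokenizer
    for candidate in ("a", "b"):
        if candidate == "c":
            break
    else:
        print("no candidate matched")  # prints, because no break happened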