convert-hf : HfVocab -> LlamaHfVocab
parent 79852ab884
commit ebad773e9d

2 changed files with 9 additions and 14 deletions

convert-hf-to-gguf.py

@@ -23,7 +23,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 
-from convert import HfVocab
+from convert import LlamaHfVocab
 
 
 ###### MODEL DEFINITIONS ######

@@ -370,12 +370,8 @@ class Model(ABC):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-    def _set_vocab_hf(self):
-        path = self.dir_model
-        added_tokens_path = self.dir_model
-        vocab = HfVocab(
-            path, added_tokens_path if added_tokens_path.exists() else None
-        )
+    def _set_vocab_llama_hf(self):
+        vocab = LlamaHfVocab(self.dir_model)
         tokens = []
         scores = []
         toktypes = []
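
Note: besides the rename, this hunk collapses the old path/added-tokens juggling into a single constructor call. A minimal sketch of the new call shape, using a hypothetical stand-in class (LlamaHfVocabStub is not part of the patch; it only mirrors the signature shown in the convert.py hunks below):

    from pathlib import Path

    # Stand-in mirroring convert.LlamaHfVocab's new signature: the constructor
    # takes the model directory directly and resolves tokenizer files itself,
    # so callers no longer pass a separate added-tokens path.
    class LlamaHfVocabStub:
        def __init__(self, base_path: Path, ignore_nonllama: bool = False):
            self.base_path = base_path
            self.ignore_nonllama = ignore_nonllama

    # old call shape (removed above):
    #   vocab = HfVocab(path, added_tokens_path if added_tokens_path.exists() else None)
    # new call shape:
    vocab = LlamaHfVocabStub(Path("models/my-model"))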

@@ -1097,7 +1093,7 @@ class MiniCPMModel(Model):
         self.gguf_writer.add_file_type(self.ftype)
 
     def set_vocab(self):
-        self._set_vocab_hf()
+        self._set_vocab_llama_hf()
 
     def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
         if n_kv_head is not None and n_head != n_kv_head:

@@ -1698,11 +1694,8 @@ class BertModel(Model):
         self.gguf_writer.add_pooling_type(pooling_type)
 
     def set_vocab(self):
-        path = self.dir_model
-        added_tokens_path = self.dir_model if self.dir_model.exists() else None
-
         # use huggingface vocab to get all tokens
-        vocab = HfVocab(path, added_tokens_path)
+        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
         tokens, scores, toktypes = zip(*vocab.all_tokens())
         assert len(tokens) == vocab.vocab_size
         self.vocab_size = vocab.vocab_size
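
Note: BertModel is the one caller that passes ignore_nonllama=True, since BERT checkpoints ship a WordPiece fast tokenizer that would fail the Llama BPE pre-check added in convert.py below. A hedged illustration (the tokenizer.json contents here are assumptions for demonstration, not part of the patch):

    # A BERT-style tokenizer.json typically reports a WordPiece model, so the
    # BPE / byte_fallback / Sequence pre-check below would reject it.
    tokenizer_json = {"model": {"type": "WordPiece"}, "decoder": {"type": "WordPiece"}}
    tokenizer_model = tokenizer_json["model"]
    is_llama_bpe = (
        tokenizer_model["type"] == "BPE"
        and tokenizer_model.get("byte_fallback", False)
        and tokenizer_json["decoder"]["type"] == "Sequence"
    )
    print(is_llama_bpe)  # False -> hence ignore_nonllama=True in BertModel.set_vocab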

convert.py

@@ -516,7 +516,7 @@ class LlamaHfVocab(Vocab):
     tokenizer_model = "llama"
     name = "hfft"
 
-    def __init__(self, base_path: Path):
+    def __init__(self, base_path: Path, ignore_nonllama: bool = False):
         fname_tokenizer = base_path / FAST_TOKENIZER_FILE
         # if this fails, FileNotFoundError propagates to caller
         with open(fname_tokenizer, encoding='utf-8') as f:

@@ -524,7 +524,9 @@ class LlamaHfVocab(Vocab):
 
         # pre-check so we know if we need transformers
         tokenizer_model: dict[str, Any] = tokenizer_json['model']
-        if (
+        if ignore_nonllama:
+            pass  # workaround incorrect use of this class for WordPiece
+        elif (
             tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
             or tokenizer_json['decoder']['type'] != 'Sequence'
         ):
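
Note: the net effect of the convert.py change is that the strict Llama BPE pre-check can be bypassed per caller. A minimal sketch of the resulting control flow (check_tokenizer is a hypothetical free-function rendering of the __init__ logic; what is raised on mismatch is not shown in this hunk and is an assumption):

    from typing import Any

    def check_tokenizer(tokenizer_json: dict[str, Any], ignore_nonllama: bool = False) -> None:
        # mirrors the pre-check inside LlamaHfVocab.__init__ after this patch
        tokenizer_model: dict[str, Any] = tokenizer_json['model']
        if ignore_nonllama:
            pass  # workaround incorrect use of this class for WordPiece (see BertModel)
        elif (
            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
            or tokenizer_json['decoder']['type'] != 'Sequence'
        ):
            raise ValueError('not a Llama BPE tokenizer')  # placeholder error, an assumption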