Promote add_X_token to GGUF metadata for BOS and EOS

This commit is contained in:
KerfuffleV2 2023-11-10 14:12:55 -07:00
parent f22b2f2045
commit 4814b4bbcd
3 changed files with 21 additions and 6 deletions

View file

@ -66,6 +66,8 @@ class Keys:
# Special-token id keys: each maps a GGUF metadata key name that stores
# the corresponding token's integer id (written via GGUFWriter.add_uint32).
UNK_ID = "tokenizer.ggml.unknown_token_id"
# NOTE(review): "seperator" is misspelled, but it is the literal on-disk key
# string — renaming it here would change the key readers look up; leave as-is.
SEP_ID = "tokenizer.ggml.seperator_token_id"
PAD_ID = "tokenizer.ggml.padding_token_id"
# Boolean flag keys (written via GGUFWriter.add_bool) — presumably signal
# whether BOS/EOS tokens are added automatically when encoding; confirm
# against the consumers of these keys.
ADD_BOS = "tokenizer.ggml.add_bos_token"
ADD_EOS = "tokenizer.ggml.add_eos_token"
# Keys for alternative tokenizer representations — TODO confirm exact payloads.
HF_JSON = "tokenizer.huggingface.json"
RWKV = "tokenizer.rwkv.world"

View file

@ -393,6 +393,12 @@ class GGUFWriter:
def add_pad_token_id(self, id: int) -> None:
    """Write the padding token's id under ``tokenizer.ggml.padding_token_id``.

    Thin wrapper delegating to ``add_uint32`` with the canonical key from
    ``Keys.Tokenizer.PAD_ID``.
    """
    self.add_uint32(Keys.Tokenizer.PAD_ID, id)
def add_add_bos_token(self, value: bool) -> None:
    """Write the boolean ``tokenizer.ggml.add_bos_token`` metadata flag.

    Thin wrapper delegating to ``add_bool`` with the canonical key from
    ``Keys.Tokenizer.ADD_BOS``.
    """
    self.add_bool(Keys.Tokenizer.ADD_BOS, value)
def add_add_eos_token(self, value: bool) -> None:
    """Write the boolean ``tokenizer.ggml.add_eos_token`` metadata flag.

    Thin wrapper delegating to ``add_bool`` with the canonical key from
    ``Keys.Tokenizer.ADD_EOS``.
    """
    self.add_bool(Keys.Tokenizer.ADD_EOS, value)
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
pack_prefix = ''
if not skip_pack_prefix:

View file

@ -46,8 +46,8 @@ class SpecialVocab:
file = sys.stderr,
)
for typ, tokid in self.special_token_ids.items():
handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
if handler is None:
id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
if id_handler is None:
print(
f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping',
file = sys.stderr,
@ -55,11 +55,18 @@ class SpecialVocab:
continue
if not quiet:
print(f'gguf: Setting special token type {typ} to {tokid}')
handler(tokid)
for typ, add in self.add_special_token.items():
id_handler(tokid)
for typ, value in self.add_special_token.items():
add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None)
if add_handler is None:
print(
f'gguf: WARNING: No handler for add_{typ}_token with value {value} - skipping',
file = sys.stderr,
)
continue
if not quiet:
print(f'gguf: Setting add special token type {typ} to {add}')
gw.add_bool(f'tokenizer.ggml.add_{typ}_token', add)
print(f'gguf: Setting add_{typ}_token to {value}')
add_handler(value)
def _load(self, path: Path) -> None:
self._try_load_from_tokenizer_json(path)