Promote add_X_token to GGUF metadata for BOS and EOS
This commit is contained in:
parent
f22b2f2045
commit
4814b4bbcd
3 changed files with 21 additions and 6 deletions
|
@ -66,6 +66,8 @@ class Keys:
|
|||
UNK_ID = "tokenizer.ggml.unknown_token_id"
|
||||
SEP_ID = "tokenizer.ggml.seperator_token_id"
|
||||
PAD_ID = "tokenizer.ggml.padding_token_id"
|
||||
ADD_BOS = "tokenizer.ggml.add_bos_token"
|
||||
ADD_EOS = "tokenizer.ggml.add_eos_token"
|
||||
HF_JSON = "tokenizer.huggingface.json"
|
||||
RWKV = "tokenizer.rwkv.world"
|
||||
|
||||
|
|
|
@ -393,6 +393,12 @@ class GGUFWriter:
|
|||
# Record the padding token id: forwards `id` to add_uint32 under the
# Keys.Tokenizer.PAD_ID key. The method name fits the `add_{typ}_token_id`
# pattern that the SpecialVocab hunk resolves with getattr, so presumably
# typ == 'pad' dispatches here -- confirm against the caller.
def add_pad_token_id(self, id: int) -> None:
|
||||
self.add_uint32(Keys.Tokenizer.PAD_ID, id)
|
||||
|
||||
# Record the "add BOS token" flag: forwards the boolean `value` to add_bool
# under the Keys.Tokenizer.ADD_BOS key. The doubled `add_add_` prefix is
# intentional: it matches the `add_add_{typ}_token` name that the SpecialVocab
# hunk looks up with getattr (presumably with typ == 'bos' -- confirm).
def add_add_bos_token(self, value: bool) -> None:
|
||||
self.add_bool(Keys.Tokenizer.ADD_BOS, value)
|
||||
|
||||
# Record the "add EOS token" flag: forwards the boolean `value` to add_bool
# under the Keys.Tokenizer.ADD_EOS key. The doubled `add_add_` prefix is
# intentional: it matches the `add_add_{typ}_token` name that the SpecialVocab
# hunk looks up with getattr (presumably with typ == 'eos' -- confirm).
def add_add_eos_token(self, value: bool) -> None:
|
||||
self.add_bool(Keys.Tokenizer.ADD_EOS, value)
|
||||
|
||||
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
|
||||
pack_prefix = ''
|
||||
if not skip_pack_prefix:
|
||||
|
|
|
@ -46,8 +46,8 @@ class SpecialVocab:
|
|||
file = sys.stderr,
|
||||
)
|
||||
for typ, tokid in self.special_token_ids.items():
|
||||
handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
|
||||
if handler is None:
|
||||
id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
|
||||
if id_handler is None:
|
||||
print(
|
||||
f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping',
|
||||
file = sys.stderr,
|
||||
|
@ -55,11 +55,18 @@ class SpecialVocab:
|
|||
continue
|
||||
if not quiet:
|
||||
print(f'gguf: Setting special token type {typ} to {tokid}')
|
||||
handler(tokid)
|
||||
for typ, add in self.add_special_token.items():
|
||||
id_handler(tokid)
|
||||
for typ, value in self.add_special_token.items():
|
||||
add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None)
|
||||
if add_handler is None:
|
||||
print(
|
||||
f'gguf: WARNING: No handler for add_{typ}_token with value {value} - skipping',
|
||||
file = sys.stderr,
|
||||
)
|
||||
continue
|
||||
if not quiet:
|
||||
print(f'gguf: Setting add special token type {typ} to {add}')
|
||||
gw.add_bool(f'tokenizer.ggml.add_{typ}_token', add)
|
||||
print(f'gguf: Setting add_{typ}_token to {value}')
|
||||
add_handler(value)
|
||||
|
||||
def _load(self, path: Path) -> None:
|
||||
self._try_load_from_tokenizer_json(path)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue