diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 0c0b4ba8a..bf1ccf669 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -66,6 +66,8 @@ class Keys:
         UNK_ID     = "tokenizer.ggml.unknown_token_id"
         SEP_ID     = "tokenizer.ggml.seperator_token_id"
         PAD_ID     = "tokenizer.ggml.padding_token_id"
+        ADD_BOS    = "tokenizer.ggml.add_bos_token"
+        ADD_EOS    = "tokenizer.ggml.add_eos_token"
         HF_JSON    = "tokenizer.huggingface.json"
         RWKV       = "tokenizer.rwkv.world"
 
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 2f437625a..75fb6976f 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -393,6 +393,12 @@ class GGUFWriter:
     def add_pad_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.PAD_ID, id)
 
+    def add_add_bos_token(self, value: bool) -> None:
+        self.add_bool(Keys.Tokenizer.ADD_BOS, value)
+
+    def add_add_eos_token(self, value: bool) -> None:
+        self.add_bool(Keys.Tokenizer.ADD_EOS, value)
+
     def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
         pack_prefix = ''
         if not skip_pack_prefix:
diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index 88830d47e..71192a928 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -46,8 +46,8 @@ class SpecialVocab:
                 file = sys.stderr,
             )
         for typ, tokid in self.special_token_ids.items():
-            handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
-            if handler is None:
+            id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
+            if id_handler is None:
                 print(
                     f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping',
                     file = sys.stderr,
@@ -55,11 +55,18 @@ class SpecialVocab:
                 continue
             if not quiet:
                 print(f'gguf: Setting special token type {typ} to {tokid}')
-            handler(tokid)
-        for typ, add in self.add_special_token.items():
+            id_handler(tokid)
+        for typ, value in self.add_special_token.items():
+            add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None)
+            if add_handler is None:
+                print(
+                    f'gguf: WARNING: No handler for add_{typ}_token with value {value} - skipping',
+                    file = sys.stderr,
+                )
+                continue
             if not quiet:
-                print(f'gguf: Setting add special token type {typ} to {add}')
-                gw.add_bool(f'tokenizer.ggml.add_{typ}_token', add)
+                print(f'gguf: Setting add_{typ}_token to {value}')
+            add_handler(value)
 
     def _load(self, path: Path) -> None:
         self._try_load_from_tokenizer_json(path)