Promote add_X_token (add_bos_token, add_eos_token) to GGUF metadata for BOS and EOS
This commit is contained in:
parent
f22b2f2045
commit
4814b4bbcd
3 changed files with 21 additions and 6 deletions
|
@ -66,6 +66,8 @@ class Keys:
|
||||||
UNK_ID = "tokenizer.ggml.unknown_token_id"
|
UNK_ID = "tokenizer.ggml.unknown_token_id"
|
||||||
SEP_ID = "tokenizer.ggml.seperator_token_id"
|
SEP_ID = "tokenizer.ggml.seperator_token_id"
|
||||||
PAD_ID = "tokenizer.ggml.padding_token_id"
|
PAD_ID = "tokenizer.ggml.padding_token_id"
|
||||||
|
ADD_BOS = "tokenizer.ggml.add_bos_token"
|
||||||
|
ADD_EOS = "tokenizer.ggml.add_eos_token"
|
||||||
HF_JSON = "tokenizer.huggingface.json"
|
HF_JSON = "tokenizer.huggingface.json"
|
||||||
RWKV = "tokenizer.rwkv.world"
|
RWKV = "tokenizer.rwkv.world"
|
||||||
|
|
||||||
|
|
|
@ -393,6 +393,12 @@ class GGUFWriter:
|
||||||
def add_pad_token_id(self, id: int) -> None:
|
def add_pad_token_id(self, id: int) -> None:
|
||||||
self.add_uint32(Keys.Tokenizer.PAD_ID, id)
|
self.add_uint32(Keys.Tokenizer.PAD_ID, id)
|
||||||
|
|
||||||
|
def add_add_bos_token(self, value: bool) -> None:
|
||||||
|
self.add_bool(Keys.Tokenizer.ADD_BOS, value)
|
||||||
|
|
||||||
|
def add_add_eos_token(self, value: bool) -> None:
|
||||||
|
self.add_bool(Keys.Tokenizer.ADD_EOS, value)
|
||||||
|
|
||||||
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
|
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
|
||||||
pack_prefix = ''
|
pack_prefix = ''
|
||||||
if not skip_pack_prefix:
|
if not skip_pack_prefix:
|
||||||
|
|
|
@ -46,8 +46,8 @@ class SpecialVocab:
|
||||||
file = sys.stderr,
|
file = sys.stderr,
|
||||||
)
|
)
|
||||||
for typ, tokid in self.special_token_ids.items():
|
for typ, tokid in self.special_token_ids.items():
|
||||||
handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
|
id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
|
||||||
if handler is None:
|
if id_handler is None:
|
||||||
print(
|
print(
|
||||||
f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping',
|
f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping',
|
||||||
file = sys.stderr,
|
file = sys.stderr,
|
||||||
|
@ -55,11 +55,18 @@ class SpecialVocab:
|
||||||
continue
|
continue
|
||||||
if not quiet:
|
if not quiet:
|
||||||
print(f'gguf: Setting special token type {typ} to {tokid}')
|
print(f'gguf: Setting special token type {typ} to {tokid}')
|
||||||
handler(tokid)
|
id_handler(tokid)
|
||||||
for typ, add in self.add_special_token.items():
|
for typ, value in self.add_special_token.items():
|
||||||
|
add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None)
|
||||||
|
if add_handler is None:
|
||||||
|
print(
|
||||||
|
f'gguf: WARNING: No handler for add_{typ}_token with value {value} - skipping',
|
||||||
|
file = sys.stderr,
|
||||||
|
)
|
||||||
|
continue
|
||||||
if not quiet:
|
if not quiet:
|
||||||
print(f'gguf: Setting add special token type {typ} to {add}')
|
print(f'gguf: Setting add_{typ}_token to {value}')
|
||||||
gw.add_bool(f'tokenizer.ggml.add_{typ}_token', add)
|
add_handler(value)
|
||||||
|
|
||||||
def _load(self, path: Path) -> None:
|
def _load(self, path: Path) -> None:
|
||||||
self._try_load_from_tokenizer_json(path)
|
self._try_load_from_tokenizer_json(path)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue