From eaefebb2c90729c9652c0a056104ca03169847be Mon Sep 17 00:00:00 2001
From: slaren
Date: Sat, 18 Nov 2023 14:53:34 +0100
Subject: [PATCH] gguf-py : export chat templates

Read `chat_template` from the tokenizer config and write it to the GGUF
metadata as a string under the new `tokenizer.chat_template` key, via the
new `GGUFWriter.add_chat_template()` helper.

`SpecialVocab.chat_template` is given a class-level default of None. The
attribute is only assigned at the very end of the tokenizer-config loader,
while the export code reads `self.chat_template` unconditionally; without
a default, any load path that returns before that assignment would make
the export fail with AttributeError instead of skipping the template.
---
Note: the post-image blob id on the vocab.py `index` line was not
recomputed after adding the `= None` default; it only matters for
3-way merges, which key off the (unchanged) pre-image id.

 gguf-py/gguf/constants.py   | 29 +++++++++++++++--------------
 gguf-py/gguf/gguf_writer.py |  3 +++
 gguf-py/gguf/vocab.py       |  6 ++++++
 3 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 7f63361bd..8bd82daca 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -56,20 +56,21 @@ class Keys:
         SCALING_FINETUNED    = "{arch}.rope.scaling.finetuned"
 
     class Tokenizer:
-        MODEL      = "tokenizer.ggml.model"
-        LIST       = "tokenizer.ggml.tokens"
-        TOKEN_TYPE = "tokenizer.ggml.token_type"
-        SCORES     = "tokenizer.ggml.scores"
-        MERGES     = "tokenizer.ggml.merges"
-        BOS_ID     = "tokenizer.ggml.bos_token_id"
-        EOS_ID     = "tokenizer.ggml.eos_token_id"
-        UNK_ID     = "tokenizer.ggml.unknown_token_id"
-        SEP_ID     = "tokenizer.ggml.seperator_token_id"
-        PAD_ID     = "tokenizer.ggml.padding_token_id"
-        ADD_BOS    = "tokenizer.ggml.add_bos_token"
-        ADD_EOS    = "tokenizer.ggml.add_eos_token"
-        HF_JSON    = "tokenizer.huggingface.json"
-        RWKV       = "tokenizer.rwkv.world"
+        MODEL         = "tokenizer.ggml.model"
+        LIST          = "tokenizer.ggml.tokens"
+        TOKEN_TYPE    = "tokenizer.ggml.token_type"
+        SCORES        = "tokenizer.ggml.scores"
+        MERGES        = "tokenizer.ggml.merges"
+        BOS_ID        = "tokenizer.ggml.bos_token_id"
+        EOS_ID        = "tokenizer.ggml.eos_token_id"
+        UNK_ID        = "tokenizer.ggml.unknown_token_id"
+        SEP_ID        = "tokenizer.ggml.seperator_token_id"
+        PAD_ID        = "tokenizer.ggml.padding_token_id"
+        ADD_BOS       = "tokenizer.ggml.add_bos_token"
+        ADD_EOS       = "tokenizer.ggml.add_eos_token"
+        HF_JSON       = "tokenizer.huggingface.json"
+        RWKV          = "tokenizer.rwkv.world"
+        CHAT_TEMPLATE = "tokenizer.chat_template"
 
 
 #
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index c3b8c588f..ab7382c44 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -399,6 +399,9 @@ class GGUFWriter:
     def add_add_eos_token(self, value: bool) -> None:
         self.add_bool(Keys.Tokenizer.ADD_EOS, value)
 
+    def add_chat_template(self, value: str) -> None:
+        self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
+
     def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
         pack_prefix = ''
         if not skip_pack_prefix:
diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index b9f50a0af..cc3f6bf4c 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -13,6 +13,7 @@ class SpecialVocab:
     merges: list[str]
     add_special_token: dict[str, bool]
     special_token_ids: dict[str, int]
+    chat_template: str | None = None  # None until a tokenizer config supplies one
 
     def __init__(
         self, path: str | os.PathLike[str], load_merges: bool = False,
@@ -67,6 +68,10 @@ class SpecialVocab:
             if not quiet:
                 print(f'gguf: Setting add_{typ}_token to {value}')
             add_handler(value)
+        if self.chat_template is not None:
+            if not quiet:
+                print(f'gguf: Setting chat_template to {self.chat_template}')
+            gw.add_chat_template(self.chat_template)
 
     def _load(self, path: Path) -> None:
         self._try_load_from_tokenizer_json(path)
@@ -156,6 +161,7 @@ class SpecialVocab:
                 None,
             )
             self._set_special_token(typ, maybe_token_id)
+        self.chat_template = tokenizer_config.get('chat_template')
         return True
 
     def _try_load_from_config_json(self, path: Path) -> bool: