From 06808a3d0de5eb131445619c12bf4efef21fb148 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Wed, 10 Apr 2024 15:31:05 +0200
Subject: [PATCH] Support converting models with multiple chat templates

Adds the following metadata:
* tokenizer.chat_templates
* tokenizer.chat_template.<name1>
* tokenizer.chat_template.<name2>
* tokenizer.chat_template.<...>

Where `tokenizer.chat_templates` is an array of the template names
(except `default`); `default` is added to the regular
`tokenizer.chat_template`.
---
 gguf-py/gguf/constants.py   |  2 ++
 gguf-py/gguf/gguf_writer.py | 31 +++++++++++++++++++++++++++++--
 gguf-py/gguf/vocab.py       |  2 +-
 3 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index a6454a10e..3f9a602b2 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -90,6 +90,8 @@ class Keys:
         HF_JSON          = "tokenizer.huggingface.json"
         RWKV             = "tokenizer.rwkv.world"
         CHAT_TEMPLATE    = "tokenizer.chat_template"
+        CHAT_TEMPLATE_N  = "tokenizer.chat_template.{name}"
+        CHAT_TEMPLATES   = "tokenizer.chat_templates"
 
 #
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index f4c440766..f86dde697 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -6,7 +6,8 @@ import struct
 import tempfile
 from enum import Enum, auto
 from io import BufferedWriter
-from typing import IO, Any, Sequence
+from typing import IO, Any, Sequence, Mapping
+from string import ascii_letters, digits
 
 import numpy as np
 
@@ -466,7 +467,33 @@ class GGUFWriter:
     def add_add_space_prefix(self, value: bool) -> None:
         self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)
 
-    def add_chat_template(self, value: str) -> None:
+    def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
+        if isinstance(value, list):
+            template_default = None
+            template_names = set()
+
+            for choice in value:
+                name = choice.get('name', '')
+                template = choice.get('template')
+
+                # Allowing non-alphanumerical characters in template name is probably not a good idea, so filter it
+                name = ''.join((c for c in name if c in ['_'] + list(ascii_letters) + list(digits)))
+
+                if name and template is not None:
+                    if name == 'default':
+                        template_default = template
+                    else:
+                        template_names.add(name)
+                        self.add_string(Keys.Tokenizer.CHAT_TEMPLATE_N.format(name=name), template)
+
+            if template_names:
+                self.add_array(Keys.Tokenizer.CHAT_TEMPLATES, list(template_names))
+
+            if template_default is None:
+                return
+
+            value = template_default
+
         self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
 
     def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index a23136b18..378eaecad 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -141,7 +141,7 @@ class SpecialVocab:
             with open(tokenizer_config_file, encoding = 'utf-8') as f:
                 tokenizer_config = json.load(f)
             chat_template = tokenizer_config.get('chat_template')
-            if chat_template is None or isinstance(chat_template, str):
+            if chat_template is None or isinstance(chat_template, (str, list)):
                 self.chat_template = chat_template
             else:
                 print(
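
Usage note (not part of the patch): the standalone sketch below mirrors the list-handling
branch of `add_chat_template()` to show which GGUF keys would be written for a Hugging
Face-style `chat_template` list taken from `tokenizer_config.json`. The `map_chat_templates`
helper and the sample template strings are hypothetical, for illustration only.

from string import ascii_letters, digits


def map_chat_templates(value: list[dict[str, str]]) -> dict[str, object]:
    # Mirrors the list branch of add_chat_template(): collect named templates,
    # filter names down to [A-Za-z0-9_], and treat 'default' specially.
    out: dict[str, object] = {}
    template_names = set()
    template_default = None

    for choice in value:
        name = ''.join(c for c in choice.get('name', '') if c in '_' + ascii_letters + digits)
        template = choice.get('template')
        if name and template is not None:
            if name == 'default':
                template_default = template
            else:
                template_names.add(name)
                out[f'tokenizer.chat_template.{name}'] = template

    if template_names:
        # sorted here only to make the example output deterministic
        out['tokenizer.chat_templates'] = sorted(template_names)
    if template_default is not None:
        out['tokenizer.chat_template'] = template_default
    return out


# Example: a config with a default and a tool-use template (bodies shortened)
# yields three keys:
#   tokenizer.chat_template.tool_use  -> the named template
#   tokenizer.chat_templates          -> ['tool_use']
#   tokenizer.chat_template           -> the default template
print(map_chat_templates([
    {'name': 'default',  'template': '{% for message in messages %}...{% endfor %}'},
    {'name': 'tool_use', 'template': '{% for message in messages %}...tools...{% endfor %}'},
]))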