This commit is contained in:
Sigbjørn Skjæret 2024-10-11 11:49:00 -04:00 committed by GitHub
commit cdc3e78bb6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 33 additions and 6 deletions

View file

@@ -167,6 +167,7 @@ class Keys:
         CHAT_TEMPLATE    = "tokenizer.chat_template"
         CHAT_TEMPLATE_N  = "tokenizer.chat_template.{name}"
         CHAT_TEMPLATES   = "tokenizer.chat_templates"
+        INVERSE_TEMPLATE = "tokenizer.inverse_template"
         # FIM/Infill special tokens constants
         PREFIX_ID = "tokenizer.ggml.prefix_token_id"
         SUFFIX_ID = "tokenizer.ggml.suffix_token_id"

View file

@@ -843,6 +843,9 @@ class GGUFWriter:
         self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)

+    def add_inverse_template(self, value: str) -> None:
+        self.add_string(Keys.Tokenizer.INVERSE_TEMPLATE, value)
+
     def add_prefix_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.PREFIX_ID, id)

View file

@@ -21,6 +21,7 @@ class SpecialVocab:
     add_special_token: dict[str, bool]
     special_token_ids: dict[str, int]
     chat_template: str | Sequence[Mapping[str, str]] | None
+    inverse_template: str | None

     def __init__(
         self, path: str | os.PathLike[str], load_merges: bool = False,
@@ -33,6 +34,7 @@ class SpecialVocab:
         self.load_merges = load_merges
         self.merges = []
         self.chat_template = None
+        self.inverse_template = None
         if special_token_types is not None:
             self.special_token_types = special_token_types
         else:
@@ -71,6 +73,10 @@ class SpecialVocab:
             if not quiet:
                 logger.info(f'Setting chat_template to {self.chat_template}')
             gw.add_chat_template(self.chat_template)
+        if self.inverse_template is not None:
+            if not quiet:
+                logger.info(f'Setting inverse_template to {self.inverse_template}')
+            gw.add_inverse_template(self.inverse_template)

     def _load(self, path: Path) -> None:
         self._try_load_from_tokenizer_json(path)
@@ -159,6 +165,11 @@ class SpecialVocab:
                 self.chat_template = chat_template
             else:
                 logger.warning(f'Bad type for chat_template field in {tokenizer_config_file!r} - ignoring')
+            inverse_template = tokenizer_config.get('inverse_template')
+            if inverse_template is None or isinstance(inverse_template, str):
+                self.inverse_template = inverse_template
+            else:
+                logger.warning(f'Bad type for inverse_template field in {tokenizer_config_file!r} - ignoring')
         for typ in self.special_token_types:
             add_entry = tokenizer_config.get(f'add_{typ}_token')
             if isinstance(add_entry, bool):

View file

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.10.0"
+version = "0.11.0"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [

View file

@@ -85,7 +85,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: dict[str, MetadataDetails], remove_metadata: Sequence[str]) -> None:
             continue

         # Skip old chat templates if we have new ones
-        if field.name.startswith(gguf.Keys.Tokenizer.CHAT_TEMPLATE) and gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata:
+        if (field.name.startswith(gguf.Keys.Tokenizer.CHAT_TEMPLATE) and gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata) or (field.name.startswith(gguf.Keys.Tokenizer.INVERSE_TEMPLATE) and gguf.Keys.Tokenizer.INVERSE_TEMPLATE in new_metadata):
             logger.debug(f'Skipping {field.name}')
             continue
@@ -110,6 +110,11 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: dict[str, MetadataDetails], remove_metadata: Sequence[str]) -> None:
         writer.add_chat_template(new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE].value)
         del new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE]

+    if gguf.Keys.Tokenizer.INVERSE_TEMPLATE in new_metadata:
+        logger.debug('Adding inverse template')
+        writer.add_inverse_template(new_metadata[gguf.Keys.Tokenizer.INVERSE_TEMPLATE].value)
+        del new_metadata[gguf.Keys.Tokenizer.INVERSE_TEMPLATE]
+
     for key, val in new_metadata.items():
         logger.debug(f'Adding {key}: "{val.value}" {val.description}')
         writer.add_key_value(key, val.value, val.type)
@@ -143,7 +148,8 @@ def main() -> None:
     parser.add_argument("--general-name",        type=str, help="The models general.name", metavar='"name"')
     parser.add_argument("--general-description", type=str, help="The models general.description", metavar='"Description ..."')
     parser.add_argument("--chat-template",       type=str, help="Chat template string (or JSON string containing templates)", metavar='"{% ... %} ..."')
-    parser.add_argument("--chat-template-config", type=Path, help="Config file containing chat template(s)", metavar='tokenizer_config.json')
+    parser.add_argument("--inverse-template",    type=str, help="Inverse template string", metavar='"{% ... %} ..."')
+    parser.add_argument("--chat-template-config", type=Path, help="Config file containing chat and/or inverse template(s)", metavar='tokenizer_config.json')
     parser.add_argument("--pre-tokenizer",       type=str, help="The models tokenizer.ggml.pre", metavar='"pre tokenizer"')
     parser.add_argument("--remove-metadata",     action="append", type=str, help="Remove metadata (by key name) from output model", metavar='general.url')
     parser.add_argument("--special-token",       action="append", type=str, help="Special token by value", nargs=2, metavar=(' | '.join(token_names.keys()), '"<token>"'))
@@ -166,12 +172,18 @@ def main() -> None:
     if args.chat_template:
         new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template)

+    if args.inverse_template:
+        new_metadata[gguf.Keys.Tokenizer.INVERSE_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, args.inverse_template)
+
     if args.chat_template_config:
         with open(args.chat_template_config, 'r') as fp:
             config = json.load(fp)
-            template = config.get('chat_template')
-            if template:
-                new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template)
+            chat_template = config.get('chat_template')
+            inverse_template = config.get('inverse_template')
+            if chat_template:
+                new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, chat_template)
+            if inverse_template:
+                new_metadata[gguf.Keys.Tokenizer.INVERSE_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, inverse_template)

     if args.pre_tokenizer:
         new_metadata[gguf.Keys.Tokenizer.PRE] = MetadataDetails(gguf.GGUFValueType.STRING, args.pre_tokenizer)
new_metadata[gguf.Keys.Tokenizer.PRE] = MetadataDetails(gguf.GGUFValueType.STRING, args.pre_tokenizer) new_metadata[gguf.Keys.Tokenizer.PRE] = MetadataDetails(gguf.GGUFValueType.STRING, args.pre_tokenizer)