gguf-py : export chat templates
This commit is contained in:
parent
bbecf3f415
commit
eaefebb2c9
3 changed files with 24 additions and 14 deletions
|
@ -56,20 +56,21 @@ class Keys:
|
||||||
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
|
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
|
||||||
|
|
||||||
class Tokenizer:
|
class Tokenizer:
|
||||||
MODEL = "tokenizer.ggml.model"
|
MODEL = "tokenizer.ggml.model"
|
||||||
LIST = "tokenizer.ggml.tokens"
|
LIST = "tokenizer.ggml.tokens"
|
||||||
TOKEN_TYPE = "tokenizer.ggml.token_type"
|
TOKEN_TYPE = "tokenizer.ggml.token_type"
|
||||||
SCORES = "tokenizer.ggml.scores"
|
SCORES = "tokenizer.ggml.scores"
|
||||||
MERGES = "tokenizer.ggml.merges"
|
MERGES = "tokenizer.ggml.merges"
|
||||||
BOS_ID = "tokenizer.ggml.bos_token_id"
|
BOS_ID = "tokenizer.ggml.bos_token_id"
|
||||||
EOS_ID = "tokenizer.ggml.eos_token_id"
|
EOS_ID = "tokenizer.ggml.eos_token_id"
|
||||||
UNK_ID = "tokenizer.ggml.unknown_token_id"
|
UNK_ID = "tokenizer.ggml.unknown_token_id"
|
||||||
SEP_ID = "tokenizer.ggml.seperator_token_id"
|
SEP_ID = "tokenizer.ggml.seperator_token_id"
|
||||||
PAD_ID = "tokenizer.ggml.padding_token_id"
|
PAD_ID = "tokenizer.ggml.padding_token_id"
|
||||||
ADD_BOS = "tokenizer.ggml.add_bos_token"
|
ADD_BOS = "tokenizer.ggml.add_bos_token"
|
||||||
ADD_EOS = "tokenizer.ggml.add_eos_token"
|
ADD_EOS = "tokenizer.ggml.add_eos_token"
|
||||||
HF_JSON = "tokenizer.huggingface.json"
|
HF_JSON = "tokenizer.huggingface.json"
|
||||||
RWKV = "tokenizer.rwkv.world"
|
RWKV = "tokenizer.rwkv.world"
|
||||||
|
CHAT_TEMPLATE = "tokenizer.chat_template"
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
|
|
|
@ -399,6 +399,9 @@ class GGUFWriter:
|
||||||
def add_add_eos_token(self, value: bool) -> None:
|
def add_add_eos_token(self, value: bool) -> None:
|
||||||
self.add_bool(Keys.Tokenizer.ADD_EOS, value)
|
self.add_bool(Keys.Tokenizer.ADD_EOS, value)
|
||||||
|
|
||||||
|
def add_chat_template(self, value: str) -> None:
|
||||||
|
self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
|
||||||
|
|
||||||
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
|
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
|
||||||
pack_prefix = ''
|
pack_prefix = ''
|
||||||
if not skip_pack_prefix:
|
if not skip_pack_prefix:
|
||||||
|
|
|
@ -13,6 +13,7 @@ class SpecialVocab:
|
||||||
merges: list[str]
|
merges: list[str]
|
||||||
add_special_token: dict[str, bool]
|
add_special_token: dict[str, bool]
|
||||||
special_token_ids: dict[str, int]
|
special_token_ids: dict[str, int]
|
||||||
|
chat_template: str | None
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, path: str | os.PathLike[str], load_merges: bool = False,
|
self, path: str | os.PathLike[str], load_merges: bool = False,
|
||||||
|
@ -67,6 +68,10 @@ class SpecialVocab:
|
||||||
if not quiet:
|
if not quiet:
|
||||||
print(f'gguf: Setting add_{typ}_token to {value}')
|
print(f'gguf: Setting add_{typ}_token to {value}')
|
||||||
add_handler(value)
|
add_handler(value)
|
||||||
|
if self.chat_template is not None:
|
||||||
|
if not quiet:
|
||||||
|
print(f'gguf: Setting chat_template to {self.chat_template}')
|
||||||
|
gw.add_chat_template(self.chat_template)
|
||||||
|
|
||||||
def _load(self, path: Path) -> None:
|
def _load(self, path: Path) -> None:
|
||||||
self._try_load_from_tokenizer_json(path)
|
self._try_load_from_tokenizer_json(path)
|
||||||
|
@ -156,6 +161,7 @@ class SpecialVocab:
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
self._set_special_token(typ, maybe_token_id)
|
self._set_special_token(typ, maybe_token_id)
|
||||||
|
self.chat_template = tokenizer_config.get('chat_template')
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _try_load_from_config_json(self, path: Path) -> bool:
|
def _try_load_from_config_json(self, path: Path) -> bool:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue