refactor: Restructure tokenizer model metadata

This commit is contained in:
teleprint-me 2024-05-20 14:42:59 -04:00
parent 18bb36e496
commit d9ba963cd4
No known key found for this signature in database
GPG key ID: B0D11345E65C4D48

View file

@ -69,12 +69,13 @@ class Keys:
TIME_STEP_RANK = "{arch}.ssm.time_step_rank" TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
class Tokenizer: class Tokenizer:
MODEL = "tokenizer.ggml.model" MODEL = "tokenizer.ggml.model" # Model arch, e.g. llama
HASH = "tokenizer.ggml.hash" TYPE = "tokenizer.ggml.type" # BPE, SPM, WPM, etc.
TYPE = "tokenizer.ggml.type" PRE = "tokenizer.ggml.pre" # Pre-tokenizer reg-ex
HASH = "tokenizer.ggml.hash" # Merged vocab hash sum
LIST = "tokenizer.ggml.tokens" LIST = "tokenizer.ggml.tokens"
TOKEN_TYPE = "tokenizer.ggml.token_type" TOKEN_TYPE = "tokenizer.ggml.token_type"
TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" # for BERT-style token types TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" # BERT token types
SCORES = "tokenizer.ggml.scores" SCORES = "tokenizer.ggml.scores"
MERGES = "tokenizer.ggml.merges" MERGES = "tokenizer.ggml.merges"
BOS_ID = "tokenizer.ggml.bos_token_id" BOS_ID = "tokenizer.ggml.bos_token_id"
@ -1001,8 +1002,9 @@ KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
# tokenization # tokenization
KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
KEY_TOKENIZER_HASH = Keys.Tokenizer.HASH
KEY_TOKENIZER_TYPE = Keys.Tokenizer.TYPE KEY_TOKENIZER_TYPE = Keys.Tokenizer.TYPE
KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE
KEY_TOKENIZER_HASH = Keys.Tokenizer.HASH
KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST
KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE
KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES