add comment regarding special token regex in chameleon pre-tokenizer
This commit is contained in:
parent
6e0ded3637
commit
05f138551f
1 changed file with 4 additions and 0 deletions
|
@@ -15843,6 +15843,10 @@ struct llm_tokenizer_bpe {
|
||||||
};
|
};
|
||||||
break;
|
break;
|
||||||
case LLAMA_VOCAB_PRE_TYPE_CHAMELEON:
|
case LLAMA_VOCAB_PRE_TYPE_CHAMELEON:
|
||||||
|
// Note: in theory, the special token (sentinel and image token) regex_exprs below
|
||||||
|
// are unnecessary, as they are split in `tokenizer_st_partition` anyway.
|
||||||
|
// However, since the upstream pre-tokenizer uses them, they are also
|
||||||
|
// included here (see https://huggingface.co/facebook/chameleon-7b).
|
||||||
regex_exprs = {
|
regex_exprs = {
|
||||||
"<sentinel:[0-9]+>", // Sentinel tokens
|
"<sentinel:[0-9]+>", // Sentinel tokens
|
||||||
"(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z", // Image tokens
|
"(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z", // Image tokens
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue