add comment regarding special token regex in chameleon pre-tokenizer

2024-07-22 13:44:24 +02:00 · 2024-07-22 13:44:24 +02:00 · 05f138551f
commit 05f138551f
parent 6e0ded3637
1 changed files with 4 additions and 0 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -15843,6 +15843,10 @@ struct llm_tokenizer_bpe {
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_CHAMELEON:
                // Note: in theory, the special-token (sentinel and image token) regex_exprs below
                // are unnecessary, because these tokens are already split out in
                // `tokenizer_st_partition`. They are included anyway to mirror the upstream
                // pre-tokenizer (see https://huggingface.co/facebook/chameleon-7b).
                regex_exprs = {
                    "<sentinel:[0-9]+>",  // Sentinel tokens
                    "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z",  // Image tokens