diff --git a/src/llama.cpp b/src/llama.cpp index 6d33b1edd..d19c5cf8f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15843,6 +15843,10 @@ struct llm_tokenizer_bpe { }; break; case LLAMA_VOCAB_PRE_TYPE_CHAMELEON: + // Note: in theory, the special token (sentinel and image token) regex_exprs below + // are unnecessary, as they are split in `tokenizer_st_partition` anyway. + // However, since the upstream pre-tokenizer uses them, they are also + // included here (see https://huggingface.co/facebook/chameleon-7b). regex_exprs = { "", // Sentinel tokens "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z", // Image tokens