From 05f138551fd2d8e5223f0d934dda110bbd373ff1 Mon Sep 17 00:00:00 2001 From: nopperl <54780682+nopperl@users.noreply.github.com> Date: Mon, 22 Jul 2024 13:44:24 +0200 Subject: [PATCH] add comment regarding special token regex in chameleon pre-tokenizer --- src/llama.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/llama.cpp b/src/llama.cpp index 6d33b1edd..d19c5cf8f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15843,6 +15843,10 @@ struct llm_tokenizer_bpe { }; break; case LLAMA_VOCAB_PRE_TYPE_CHAMELEON: + // Note: in theory, the special token (sentinel and image token) regex_exprs below + // are unnecessary, as special tokens are already split off in `tokenizer_st_partition`. + // However, since the upstream pre-tokenizer uses them, they are also + // included here (see https://huggingface.co/facebook/chameleon-7b). regex_exprs = { "", // Sentinel tokens "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z", // Image tokens