From 05f138551fd2d8e5223f0d934dda110bbd373ff1 Mon Sep 17 00:00:00 2001
From: nopperl <54780682+nopperl@users.noreply.github.com>
Date: Mon, 22 Jul 2024 13:44:24 +0200
Subject: [PATCH] add comment regarding special token regex in chameleon
 pre-tokenizer

---
 src/llama.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index 6d33b1edd..d19c5cf8f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -15843,6 +15843,10 @@ struct llm_tokenizer_bpe {
                 };
                 break;
             case LLAMA_VOCAB_PRE_TYPE_CHAMELEON:
+                // Note: in theory, the special-token regexes below (sentinel and image tokens)
+                // are unnecessary, because special tokens are already split out in
+                // `tokenizer_st_partition`. They are kept anyway to mirror the upstream
+                // pre-tokenizer (see https://huggingface.co/facebook/chameleon-7b).
                 regex_exprs = {
                     "<sentinel:[0-9]+>",  // Sentinel tokens
                     "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z",  // Image tokens