llama : keep track of all EOG tokens in the vocab (#9609)

ggml-ci
This commit is contained in:
Georgi Gerganov 2024-09-24 10:16:06 +03:00 committed by GitHub
parent cea1486ecf
commit 31ac5834fe
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 61 additions and 18 deletions

View file

@@ -6,6 +6,7 @@
#include <vector>
#include <unordered_map>
#include <map>
#include <set>
struct llama_vocab {
using id = llama_token;
@@ -49,12 +50,15 @@ struct llama_vocab {
id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
id special_eom_id = -1;
// set of all tokens that cause "end of generation"
std::set<id> special_eog_ids;
// tokenizer flags
bool tokenizer_add_space_prefix = false;
bool tokenizer_add_bos = false;
bool tokenizer_add_eos = false;
bool tokenizer_ignore_merges = false;
bool tokenizer_clean_spaces = false; // clean_up_tokenization_spaces
bool tokenizer_add_space_prefix = false;
bool tokenizer_add_bos = false;
bool tokenizer_add_eos = false;
bool tokenizer_ignore_merges = false;
bool tokenizer_clean_spaces = false; // clean_up_tokenization_spaces
bool tokenizer_remove_extra_whitespaces = false;
bool tokenizer_escape_whitespaces = true;
bool tokenizer_treat_whitespace_as_suffix = false;