llama : implement Unigram tokenizer needed by T5 and FLAN-T5 model families (#5763)

* llama : add T5 model architecture, tensors and model header parameters * llama : add implementation of Unigram tokenizer with SentencePiece-like text normalization using precompiled charsmap --------- Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2024-06-25 21:14:35 +02:00 · 2024-06-25 21:14:35 +02:00 · 6fcbf68235
commit 6fcbf68235
parent e6bf007744
4 changed files with 586 additions and 38 deletions
--- a/llama.h
+++ b/llama.h
@ -67,6 +67,7 @@ extern "C" {
        LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
        LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
        LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
+        LLAMA_VOCAB_TYPE_UGM  = 4, // T5 tokenizer based on Unigram
    };

    // pre-tokenization types
@ -857,6 +858,7 @@ extern "C" {
    LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
    LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
+    LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding

    // Returns -1 if unknown, 1 for true or 0 for false.
    LLAMA_API int32_t         llama_add_bos_token(const struct llama_model * model);