llama : implement Unigram tokenizer needed by T5 and FLAN-T5 model families (#5763)

* llama : add T5 model architecture, tensors and model header parameters

* llama : add implementation of Unigram tokenizer with SentencePiece-like text normalization using precompiled charsmap

---------

Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
This commit is contained in:
fairydreaming 2024-06-25 21:14:35 +02:00 committed by GitHub
parent e6bf007744
commit 6fcbf68235
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 586 additions and 38 deletions

View file

@@ -48,6 +48,7 @@ struct codepoint_flags {
std::string unicode_cpt_to_utf8(uint32_t cp);
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);