llama : implement Unigram tokenizer needed by T5 and FLAN-T5 model families (#5763)
* llama : add T5 model architecture, tensors and model header parameters * llama : add implementation of Unigram tokenizer with SentencePiece-like text normalization using precompiled charsmap --------- Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
This commit is contained in:
parent
e6bf007744
commit
6fcbf68235
4 changed files with 586 additions and 38 deletions
|
@ -48,6 +48,7 @@ struct codepoint_flags {
|
|||
|
||||
|
||||
std::string unicode_cpt_to_utf8(uint32_t cp);
|
||||
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
|
||||
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
|
||||
|
||||
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue