llama : implement Unigram tokenizer needed by T5 and FLAN-T5 model families (#5763)
* llama : add T5 model architecture, tensors and model header parameters
* llama : add implementation of Unigram tokenizer with SentencePiece-like text normalization using precompiled charsmap

---------

Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
This commit is contained in:
parent
e6bf007744
commit
6fcbf68235
4 changed files with 586 additions and 38 deletions
|
@ -23,7 +23,7 @@ static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
|
|||
return result;
|
||||
}
|
||||
|
||||
static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
|
||||
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
|
||||
assert(offset < utf8.size());
|
||||
if (!(utf8[offset + 0] & 0x80)) {
|
||||
auto result = utf8[offset + 0];
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue