llama : add inference support and model types for T5 and FLAN-T5 model families

llama : add new API functions to support encoder-decoder models: llama_encode(), llama_model_has_encoder(), llama_model_decoder_start_token()
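For illustration, a minimal sketch of how the three new functions are meant to be combined (assuming an already loaded `model` and `ctx` and a tokenized prompt in `tokens` — names chosen here for the example, error handling abbreviated; the actual llama-cli wiring is in the diff below):

    // encoder-decoder models: run the encoder once over the full prompt,
    // then seed the decoder with the model's start token
    if (llama_model_has_encoder(model)) {
        if (llama_encode(ctx, llama_batch_get_one(tokens.data(), (int32_t) tokens.size(), 0, 0))) {
            return 1; // encoding failed
        }
        llama_token start_id = llama_model_decoder_start_token(model);
        if (start_id == -1) {
            start_id = llama_token_bos(model); // no dedicated start token: fall back to BOS
        }
        // from here, feed start_id to llama_decode() and sample exactly as for decoder-only models
    }

Decoder-only models skip this branch entirely, so existing call sites keep working unchanged.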

common, llama-cli : use new API functions to support encoder-decoder models

convert-hf : handle shared token embedding tensors in T5Model

convert-hf : handle SentencePiece BPE tokenizer in T5Model (for Pile-T5 models)

convert-hf : add MT5ForConditionalGeneration and UMT5ForConditionalGeneration to architectures supported by T5Model
Stanisław Szymczyk 2024-06-26 15:03:01 +02:00
parent 6fcbf68235
commit 45681a57dd
5 changed files with 892 additions and 15 deletions

examples/main/main.cpp
@@ -255,7 +255,9 @@ int main(int argc, char ** argv) {
     }
 
     const bool add_bos = llama_should_add_bos_token(model);
-    GGML_ASSERT(llama_add_eos_token(model) != 1);
+    if (!llama_model_has_encoder(model)) {
+        GGML_ASSERT(llama_add_eos_token(model) != 1);
+    }
     LOG("add_bos: %d\n", add_bos);
 
     std::vector<llama_token> embd_inp;
@@ -517,6 +519,23 @@ int main(int argc, char ** argv) {
         exit(1);
     }
 
+    if (llama_model_has_encoder(model)) {
+        int enc_input_size = embd_inp.size();
+        llama_token * enc_input_buf = embd_inp.data();
+        if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
+            LOG_TEE("%s : failed to eval\n", __func__);
+            return 1;
+        }
+
+        llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+        if (decoder_start_token_id == -1) {
+            decoder_start_token_id = llama_token_bos(model);
+        }
+
+        embd_inp.clear();
+        embd_inp.push_back(decoder_start_token_id);
+    }
+
     while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
         // predict
         if (!embd.empty()) {