Inference support for T5 and FLAN-T5 model families (#5763)

* llama : add inference support and model types for T5 and FLAN-T5 model families
* llama : add new API functions to support encoder-decoder models: llama_encode(), llama_model_has_encoder(), llama_model_decoder_start_token()
* common, llama-cli, llama-batched : add support for encoder-decoder models
* convert-hf : handle shared token embeddings tensors in T5Model
* convert-hf : add support for SentencePiece BPE tokenizer in T5Model (for Pile-T5 models)
* convert-hf : add MT5ForConditionalGeneration and UMT5ForConditionalGeneration to architectures supported by T5Model
* convert : add t5 tokenizer tests, use "slow" HF tokenizer for t5

---------

Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-07-04 15:46:11 +02:00 · 2024-07-04 15:46:11 +02:00 · 807b0c49ff
commit 807b0c49ff
parent f8c4c0738d
33 changed files with 946 additions and 31 deletions
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -255,7 +255,9 @@ int main(int argc, char ** argv) {
    }

    const bool add_bos = llama_should_add_bos_token(model);
-    GGML_ASSERT(llama_add_eos_token(model) != 1);
+    if (!llama_model_has_encoder(model)) {
+        GGML_ASSERT(llama_add_eos_token(model) != 1);
+    }
    LOG("add_bos: %d\n", add_bos);

    std::vector<llama_token> embd_inp;
@@ -517,6 +519,24 @@ int main(int argc, char ** argv) {
        exit(1);
    }

+    if (llama_model_has_encoder(model)) {
+        int enc_input_size = embd_inp.size();
+        llama_token * enc_input_buf = embd_inp.data();
+
+        if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
+            LOG_TEE("%s : failed to eval\n", __func__);
+            return 1;
+        }
+
+        llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+        if (decoder_start_token_id == -1) {
+            decoder_start_token_id = llama_token_bos(model);
+        }
+
+        embd_inp.clear();
+        embd_inp.push_back(decoder_start_token_id);
+    }
+
    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
        // predict
        if (!embd.empty()) {