llama : add inference support and model types for T5 and FLAN-T5 model families

llama : add new API functions to support encoder-decoder models: llama_encode(), llama_model_has_encoder(), llama_model_decoder_start_token()
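For illustration, a minimal sketch of how the three new functions are meant to be combined (assuming an already loaded `model` and `ctx` and a tokenized prompt in `tokens` — names chosen here for the example, error handling abbreviated; the actual llama-cli wiring is in the diff below):

    // encoder-decoder models: run the encoder once over the full prompt,
    // then seed the decoder with the model's start token
    if (llama_model_has_encoder(model)) {
        if (llama_encode(ctx, llama_batch_get_one(tokens.data(), (int32_t) tokens.size(), 0, 0))) {
            return 1; // encoding failed
        }
        llama_token start_id = llama_model_decoder_start_token(model);
        if (start_id == -1) {
            start_id = llama_token_bos(model); // no dedicated start token: fall back to BOS
        }
        // from here, feed start_id to llama_decode() and sample exactly as for decoder-only models
    }

Decoder-only models skip this branch entirely, so existing call sites keep working unchanged.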

common, llama-cli : use new API functions to support encoder-decoder models

convert-hf : handle shared token embedding tensors in T5Model

convert-hf : handle SentencePiece BPE tokenizer in T5Model (for Pile-T5 models)

convert-hf : add MT5ForConditionalGeneration and UMT5ForConditionalGeneration to architectures supported by T5Model
Stanisław Szymczyk 2024-06-26 15:03:01 +02:00
parent 6fcbf68235
commit 45681a57dd
5 changed files with 892 additions and 15 deletions

examples/main/main.cpp
@@ -255,7 +255,9 @@ int main(int argc, char ** argv) {
     }
 
     const bool add_bos = llama_should_add_bos_token(model);
-    GGML_ASSERT(llama_add_eos_token(model) != 1);
+    if (!llama_model_has_encoder(model)) {
+        GGML_ASSERT(llama_add_eos_token(model) != 1);
+    }
     LOG("add_bos: %d\n", add_bos);
 
     std::vector<llama_token> embd_inp;
@@ -517,6 +519,23 @@ int main(int argc, char ** argv) {
         exit(1);
     }
 
+    if (llama_model_has_encoder(model)) {
+        int enc_input_size = embd_inp.size();
+        llama_token * enc_input_buf = embd_inp.data();
+        if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
+            LOG_TEE("%s : failed to eval\n", __func__);
+            return 1;
+        }
+
+        llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+        if (decoder_start_token_id == -1) {
+            decoder_start_token_id = llama_token_bos(model);
+        }
+
+        embd_inp.clear();
+        embd_inp.push_back(decoder_start_token_id);
+    }
+
     while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
         // predict
         if (!embd.empty()) {