Add support for encoder-only T5 models (#8900)

* gguf-py : add T5ENCODER model architecture * common : call llama_decode() during warmup only if the model has decoder * convert-hf : add T5EncoderModel * llama : add llama_model_has_decoder() API function * llama : split build_t5() into build_t5_encoder() and build_t5_decoder() * llama : add support for LLM_ARCH_T5ENCODER * llama-embedding : add support for LLAMA_POOLING_TYPE_NONE * llama-embedding : add support for encoder-only models --------- Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2024-08-10 11:43:26 +02:00 · 2024-08-10 11:43:26 +02:00 · 7c3f55c100
commit 7c3f55c100
parent 911b437f22
6 changed files with 702 additions and 335 deletions
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@ -217,6 +217,7 @@ class MODEL_ARCH(IntEnum):
    CHATGLM      = auto()
    BITNET       = auto()
    T5           = auto()
+    T5ENCODER    = auto()
    JAIS         = auto()


@ -344,6 +345,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.CHATGLM:        "chatglm",
    MODEL_ARCH.BITNET:         "bitnet",
    MODEL_ARCH.T5:             "t5",
+    MODEL_ARCH.T5ENCODER:      "t5encoder",
    MODEL_ARCH.JAIS:           "jais",
 }

@ -1036,6 +1038,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.ENC_FFN_UP,
        MODEL_TENSOR.ENC_OUTPUT_NORM,
    ],
+    MODEL_ARCH.T5ENCODER: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ENC_ATTN_NORM,
+        MODEL_TENSOR.ENC_ATTN_Q,
+        MODEL_TENSOR.ENC_ATTN_K,
+        MODEL_TENSOR.ENC_ATTN_V,
+        MODEL_TENSOR.ENC_ATTN_OUT,
+        MODEL_TENSOR.ENC_ATTN_REL_B,
+        MODEL_TENSOR.ENC_FFN_NORM,
+        MODEL_TENSOR.ENC_FFN_GATE,
+        MODEL_TENSOR.ENC_FFN_DOWN,
+        MODEL_TENSOR.ENC_FFN_UP,
+        MODEL_TENSOR.ENC_OUTPUT_NORM,
+    ],
    MODEL_ARCH.JAIS: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,