llama : add reranking support (#9510)

* py : add XLMRobertaForSequenceClassification [no ci]

* py : fix scalar-tensor conversion [no ci]

* py : fix position embeddings chop [no ci]

* llama : read new cls tensors [no ci]

* llama : add classigication head (wip) [no ci]

* llama : add "rank" pooling type

ggml-ci

* server : add rerank endpoint

ggml-ci

* llama : aboud ggml_repeat during classification

* rerank : cleanup + comments

* server : accept /rerank endpoint in addition to /v1/rerank [no ci]

* embedding : parse special tokens

* jina : support v1 reranker

* vocab : minor style

ggml-ci

* server : initiate tests for later

ggml-ci

* server : add docs

* llama : add comment [no ci]

* llama : fix uninitialized tensors

* ci : add rerank tests

ggml-ci

* add reranking test

* change test data

* Update examples/server/server.cpp

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>

* add `--reranking` argument

* update server docs

* llama : fix comment [no ci]

ggml-ci

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
This commit is contained in:
Georgi Gerganov 2024-09-28 17:42:03 +03:00 committed by GitHub
parent 1b2f992cd2
commit f4d2b8846a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 602 additions and 56 deletions

View file

@ -345,6 +345,8 @@ class MODEL_TENSOR(IntEnum):
ENC_FFN_DOWN = auto()
ENC_FFN_UP = auto()
ENC_OUTPUT_NORM = auto()
CLS = auto() # classifier
CLS_OUT = auto() # classifier output projection
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@ -504,6 +506,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
MODEL_TENSOR.CLS: "cls",
MODEL_TENSOR.CLS_OUT: "cls.output",
}
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@ -613,6 +617,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.LAYER_OUT_NORM,
MODEL_TENSOR.CLS,
MODEL_TENSOR.CLS_OUT,
],
MODEL_ARCH.NOMIC_BERT: [
MODEL_TENSOR.TOKEN_EMBD,
@ -644,6 +650,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.LAYER_OUT_NORM,
MODEL_TENSOR.CLS,
],
MODEL_ARCH.MPT: [
MODEL_TENSOR.TOKEN_EMBD,