llama : support batched embeddings (#5466)

* batched embedding: pool outputs by sequence id; updated the embedding example (see the pooling sketch below the commit metadata)

* bring back non-causal attention

* embd : minor improvements

* llama : minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Douglas Hanley 2024-02-13 06:06:58 -06:00 committed by GitHub
parent ad014bba97
commit 03bf161eb6
6 changed files with 163 additions and 54 deletions
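"Pool outputs by sequence id" in the first bullet refers to collapsing per-token embeddings into one vector per sequence when a batch holds several sequences. Below is a rough NumPy sketch of that idea, not code from this commit (the actual implementation is in the C++ side and the embedding example); the function name and the choice of mean pooling are illustrative only.

# Illustrative only: mean-pool token-level embeddings per sequence id.
# token_embd is [n_tokens, n_embd]; seq_ids[i] is the sequence the i-th token belongs to.
import numpy as np

def pool_by_seq_id(token_embd: np.ndarray, seq_ids: list[int]) -> dict[int, np.ndarray]:
    pooled: dict[int, np.ndarray] = {}
    for sid in sorted(set(seq_ids)):
        rows = [i for i, s in enumerate(seq_ids) if s == sid]
        pooled[sid] = token_embd[rows].mean(axis=0)  # one embedding per sequence
    return pooled

# Example: a batch holding two sequences of 2 and 1 tokens.
embd = np.arange(12, dtype=np.float32).reshape(3, 4)
print(pool_by_seq_id(embd, [0, 0, 1]))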

gguf-py/gguf/constants.py

@@ -40,6 +40,7 @@ class Keys:
         TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
         EXPERT_COUNT       = "{arch}.expert_count"
         EXPERT_USED_COUNT  = "{arch}.expert_used_count"
+        POOLING_LAYER      = "{arch}.pooling_layer"

     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"
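For reference, the key is an architecture-templated string; for an encoder-style embedding model whose arch string is "bert" (an example arch, not fixed by this diff), the metadata entry written to the GGUF file would be named as follows.

key = "{arch}.pooling_layer".format(arch="bert")
print(key)  # -> bert.pooling_layer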

gguf-py/gguf/gguf_writer.py

@@ -360,6 +360,9 @@ class GGUFWriter:
     def add_causal_attention(self, value: bool) -> None:
         self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)

+    def add_pooling_layer(self, value: bool) -> None:
+        self.add_bool(Keys.LLM.POOLING_LAYER.format(arch=self.arch), value)
+
     def add_rope_dimension_count(self, count: int) -> None:
         self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
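A short usage sketch, not taken from this commit, of how a conversion script might record the new flag alongside the existing causal-attention flag when writing an encoder-style embedding model; the output file name and the "bert" arch string are assumptions for illustration.

# Hypothetical conversion-script snippet; file name and arch string are illustrative.
from gguf import GGUFWriter

writer = GGUFWriter("bert-embedding.gguf", arch="bert")
writer.add_causal_attention(False)  # encoder models use non-causal attention
writer.add_pooling_layer(True)      # model pools token outputs into a single embedding
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()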