llama : streamline embeddings from "non-embedding" models (#8087)
This commit is contained in:
parent
bcefa03bc0
commit
d12f781074
4 changed files with 36 additions and 10 deletions
|
@ -180,6 +180,12 @@ extern "C" {
|
|||
LLAMA_POOLING_TYPE_LAST = 3,
|
||||
};
|
||||
|
||||
enum llama_attention_type {
|
||||
LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1,
|
||||
LLAMA_ATTENTION_TYPE_CAUSAL = 0,
|
||||
LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
|
||||
};
|
||||
|
||||
enum llama_split_mode {
|
||||
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
|
||||
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
|
||||
|
@ -297,6 +303,7 @@ extern "C" {
|
|||
|
||||
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
||||
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
|
||||
enum llama_attention_type attention_type; // attention type to use for embeddings
|
||||
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
||||
float rope_freq_base; // RoPE base frequency, 0 = from model
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue