llama : streamline embeddings from "non-embedding" models (#8087)

This commit is contained in:
Douglas Hanley 2024-07-05 02:05:56 -05:00 committed by GitHub
parent bcefa03bc0
commit d12f781074
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 36 additions and 10 deletions

View file

@ -180,6 +180,12 @@ extern "C" {
LLAMA_POOLING_TYPE_LAST = 3,
};
enum llama_attention_type {
LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1,
LLAMA_ATTENTION_TYPE_CAUSAL = 0,
LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
};
enum llama_split_mode {
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
@ -297,6 +303,7 @@ extern "C" {
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
enum llama_attention_type attention_type; // attention type to use for embeddings
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency, 0 = from model