llama : allow pooled embeddings on any model (#7477)
* create append_pooling operation; allow to specify attention_type; add last token pooling; update examples
* find result_norm/result_embd tensors properly; update output allocation logic
* only use embd output for pooling_type NONE
* get rid of old causal_attn accessor
* take out attention_type; add in llama_set_embeddings
* bypass logits when doing non-NONE pooling
parent 0e64591e82
commit 80ea089d77
6 changed files with 130 additions and 70 deletions
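
For orientation (not part of the patch): a minimal sketch of how a caller might combine the pieces this commit introduces, i.e. llama_set_embeddings to enable embedding output at runtime, LLAMA_POOLING_TYPE_LAST for last-token pooling, and llama_get_embeddings_seq to read the pooled vector. The model path, prompt, batch setup, and error handling below are placeholders, not code taken from this change.

    // Sketch: pooled embeddings from an ordinary generative model after this change.
    // "model.gguf" and the prompt are placeholders; error checks are omitted.
    #include "llama.h"
    #include <cstring>
    #include <cstdio>
    #include <vector>

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file("model.gguf", mparams);

        llama_context_params cparams = llama_context_default_params();
        cparams.embeddings   = true;                     // request embedding output
        cparams.pooling_type = LLAMA_POOLING_TYPE_LAST;  // new: last-token pooling on any model
        llama_context * ctx = llama_new_context_with_model(model, cparams);

        // the new accessor can also toggle embedding output after context creation
        llama_set_embeddings(ctx, true);

        // tokenize one sequence and build a single-sequence batch
        const char * prompt = "hello world";
        std::vector<llama_token> tokens(64);
        const int n = llama_tokenize(model, prompt, (int) strlen(prompt),
                                     tokens.data(), (int) tokens.size(), true, false);

        llama_batch batch = llama_batch_init(n, 0, 1);
        for (int i = 0; i < n; i++) {
            batch.token   [i]    = tokens[i];
            batch.pos     [i]    = i;
            batch.n_seq_id[i]    = 1;
            batch.seq_id  [i][0] = 0;
            batch.logits  [i]    = true;
        }
        batch.n_tokens = n;

        llama_decode(ctx, batch);

        // with a non-NONE pooling type, logits are bypassed and one vector per sequence is produced
        const float * emb = llama_get_embeddings_seq(ctx, 0);
        if (emb) {
            printf("n_embd = %d, emb[0] = %f\n", llama_n_embd(model), emb[0]);
        }

        llama_batch_free(batch);
        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }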
@@ -541,6 +541,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
        else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
        else if (value == "cls")  { params.pooling_type = LLAMA_POOLING_TYPE_CLS;  }
        else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
        else { invalid_param = true; }
        return true;
    }
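
The "last" branch above is the value this commit starts accepting (last-token pooling); the other values were already handled. Downstream, the parsed value reaches the llama context through the existing common.cpp helpers. A rough sketch of that plumbing, assuming the gpt_params_parse / llama_context_params_from_gpt_params helpers as they existed around this commit; the elided middle is not part of the patch:

    // Sketch: how an example binary routes "--pooling last" into the context.
    #include "common.h"
    #include "llama.h"

    int main(int argc, char ** argv) {
        gpt_params params;
        if (!gpt_params_parse(argc, argv, params)) {   // parses --pooling none|mean|cls|last
            return 1;
        }

        llama_backend_init();

        llama_model_params mparams = llama_model_params_from_gpt_params(params);
        llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);

        llama_context_params cparams = llama_context_params_from_gpt_params(params);
        // cparams.pooling_type carries LLAMA_POOLING_TYPE_LAST when "--pooling last" was given
        llama_context * ctx = llama_new_context_with_model(model, cparams);

        // ... decode and read pooled embeddings as in the sketch above ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }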
@@ -1869,6 +1870,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param

    options.push_back({ "backend" });
    options.push_back({ "*", "       --rpc SERVERS", "comma separated list of RPC servers" });

    if (llama_supports_mlock()) {
        options.push_back({ "*", "       --mlock", "force system to keep model in RAM rather than swapping or compressing" });
    }