diff --git a/ci/run.sh b/ci/run.sh
index 7d241ecc0..7edae4f6e 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -751,7 +751,8 @@ function gg_run_rerank_tiny {
model_f16="${path_models}/ggml-model-f16.gguf"
- (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s><s>hi\nwhat is panda?</s><s>it's a bear\nwhat is panda?</s><s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+ # for this model, the SEP token is "</s>"
+ (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
# sample output
# rerank score 0: 0.029
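
As an aside, a quick way to confirm which piece a given GGUF model maps its SEP token to is to query the vocab through the public llama.h API. This is a minimal sketch, not part of the patch; it assumes the llama.h of roughly this commit's vintage (llama_token_sep taking a model pointer, and the six-argument llama_token_to_piece, whose signature has changed across llama.cpp versions):

```cpp
// sep-check.cpp: print a model's SEP token id and its text piece.
// For the rerank model used in the CI test above this should print "</s>".
#include <cstdio>
#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    llama_model * model = llama_load_model_from_file(argv[1], llama_model_default_params());
    if (model == NULL) {
        return 1;
    }

    const llama_token sep = llama_token_sep(model);

    // render the token as text; pass special=true so control tokens are printed
    char buf[64];
    const int32_t n = llama_token_to_piece(model, sep, buf, sizeof(buf), 0, /*special=*/true);

    printf("SEP token id = %d, piece = '%.*s'\n", sep, n > 0 ? n : 0, buf);

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```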
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index f343cc252..13e54e501 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2027,7 +2027,7 @@ struct server_context {
continue;
}
- // prompt: <s>query</s><s>doc</s>
+ // prompt: [BOS]query[EOS][SEP]doc[EOS]
prompt_tokens.clear();
prompt_tokens.push_back(llama_token_bos(model));
{
@@ -2035,7 +2035,7 @@ struct server_context {
prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
}
prompt_tokens.push_back(llama_token_eos(model));
- prompt_tokens.push_back(llama_token_bos(model));
+ prompt_tokens.push_back(llama_token_sep(model));
{
const auto part = tokenize(slot.prompt[1], false);
prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
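
Pulled out of the server's slot-handling loop, the patched token layout amounts to the following self-contained helper. This is a sketch rather than the server's actual code (the server builds the sequence inline and tokenizes via its own wrapper), but it shows the resulting layout: for models whose BOS/EOS/SEP pieces are `<s>`/`</s>`/`</s>`, the sequence is `<s>query</s></s>doc</s>`, matching the CI prompt above.

```cpp
// build a rerank prompt [BOS]query[EOS][SEP]doc[EOS] from pre-tokenized parts
#include <vector>
#include "llama.h"

static std::vector<llama_token> format_rerank(const struct llama_model * model,
                                              const std::vector<llama_token> & query,
                                              const std::vector<llama_token> & doc) {
    std::vector<llama_token> tokens;
    tokens.reserve(query.size() + doc.size() + 4);

    tokens.push_back(llama_token_bos(model));                 // [BOS]
    tokens.insert(tokens.end(), query.begin(), query.end());  // query
    tokens.push_back(llama_token_eos(model));                 // [EOS]
    tokens.push_back(llama_token_sep(model));                 // [SEP] (was BOS before this patch)
    tokens.insert(tokens.end(), doc.begin(), doc.end());      // doc
    tokens.push_back(llama_token_eos(model));                 // [EOS]

    return tokens;
}
```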