diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 94e06b3cc..06a0a16c8 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2004,8 +2004,9 @@ struct server_context { int32_t n_ubatch = llama_n_ubatch(ctx); // track if this is an embedding or non-embedding batch + // if we've added sampled tokens above, we are in non-embedding mode // -1: none, 0: non-embedding, 1: embedding - int32_t batch_type = -1; + int32_t batch_type = batch.n_tokens > 0 ? 0 : -1; // next, batch any pending prompts without exceeding n_batch if (params.cont_batching || batch.n_tokens == 0) { @@ -3271,7 +3272,7 @@ int main(int argc, char ** argv) { return res.set_content(data.dump(), "application/json; charset=utf-8"); }; - const auto handle_embeddings = [&params, &ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) { + const auto handle_embeddings = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); const json body = json::parse(req.body);