Merge branch 'master' into support-mamba-ssm

2024-03-07 10:56:26 -05:00 · 2024-03-07 10:56:26 -05:00 · 916b586386
commit 916b586386
parent 5544f5211b 6cdabe6526
21 changed files with 2420 additions and 2811 deletions
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@ -58,7 +58,8 @@ jobs:
            cmake \
            python3-pip \
            wget \
-            psmisc
+            psmisc \
            language-pack-en
      - name: Build
        id: cmake_build
--- a/5
+++ b/5
@ -724,10 +724,9 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
+	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
--- a/ci/run.sh
+++ b/ci/run.sh
@ -45,7 +45,8 @@ fi
 if [ ! -z ${GG_BUILD_SYCL} ]; then
    if [ -z ${ONEAPI_ROOT} ]; then
-        echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
+        echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
        echo "source /opt/intel/oneapi/setvars.sh"
        exit 1
    fi
--- a/convert.py
+++ b/convert.py
@ -1377,7 +1377,6 @@ def main(args_in: list[str] | None = None) -> None:
        # We currently only support Q8_0 output on little endian systems.
        output_choices.append("q8_0")
    parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
    parser.add_argument("--awq-path",     type=Path,              help="Path to scale awq cache file", default=None)
    parser.add_argument("--dump",         action="store_true",    help="don't convert, just show what's in the model")
    parser.add_argument("--dump-single",  action="store_true",    help="don't convert, just show what's in a single model file")
    parser.add_argument("--vocab-only",   action="store_true",    help="extract only the vocab")
@ -1393,18 +1392,6 @@ def main(args_in: list[str] | None = None) -> None:
    parser.add_argument("--skip-unknown", action="store_true",    help="skip unknown tensor names instead of failing")
    args = parser.parse_args(args_in)
    if args.awq_path:
        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
        from awq.apply_awq import add_scale_weights  # type: ignore[import-not-found]
        tmp_model_path = args.model / "weighted_model"
        if tmp_model_path.is_dir():
            print(f"{tmp_model_path} exists as a weighted model.")
        else:
            tmp_model_path.mkdir(parents=True, exist_ok=True)
            print("Saving new weighted model ...")
            add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
            print(f"Saved weighted model at {tmp_model_path}.")
        args.model = tmp_model_path
    if args.dump_single:
        model_plus = lazy_load_file(args.model)
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@ -173,6 +173,7 @@ struct cmd_params {
    std::vector<bool> no_kv_offload;
    std::vector<std::vector<float>> tensor_split;
    std::vector<bool> use_mmap;
    std::vector<bool> embeddings;
    int reps;
    bool verbose;
    output_formats output_format;
@ -192,6 +193,7 @@ static const cmd_params cmd_params_defaults = {
    /* no_kv_offload */ {false},
    /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
    /* use_mmap      */ {true},
    /* embeddings    */ {false},
    /* reps          */ 5,
    /* verbose       */ false,
    /* output_format */ MARKDOWN
@ -214,6 +216,7 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
    printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
    printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
    printf("  -embd, --embeddings <0|1>           (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
    printf("  -ts, --tensor_split <ts0/ts1/..>    (default: 0)\n");
    printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
    printf("  -o, --output <csv|json|md|sql>      (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
@ -382,6 +385,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<bool>(argv[i], split_delim);
            params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
        } else if (arg == "-embd" || arg == "--embeddings") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<bool>(argv[i], split_delim);
            params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
        } else if (arg == "-ts" || arg == "--tensor-split") {
            if (++i >= argc) {
                invalid_param = true;
@ -453,6 +463,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
    if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
    if (params.use_mmap.empty())     { params.use_mmap = cmd_params_defaults.use_mmap; }
    if (params.embeddings.empty())   { params.embeddings = cmd_params_defaults.embeddings; }
    if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
    return params;
@ -472,6 +483,7 @@ struct cmd_params_instance {
    bool no_kv_offload;
    std::vector<float> tensor_split;
    bool use_mmap;
    bool embeddings;
    llama_model_params to_llama_mparams() const {
        llama_model_params mparams = llama_model_default_params();
@ -502,6 +514,7 @@ struct cmd_params_instance {
        cparams.type_k = type_k;
        cparams.type_v = type_v;
        cparams.offload_kqv = !no_kv_offload;
        cparams.embeddings = embeddings;
        return cparams;
    }
@ -517,6 +530,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
    for (const auto & mg : params.main_gpu)
    for (const auto & ts : params.tensor_split)
    for (const auto & mmp : params.use_mmap)
    for (const auto & embd : params.embeddings)
    for (const auto & nb : params.n_batch)
    for (const auto & tk : params.type_k)
    for (const auto & tv : params.type_v)
@ -540,6 +554,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .no_kv_offload= */ nkvo,
                /* .tensor_split = */ ts,
                /* .use_mmap     = */ mmp,
                /* .embeddings   = */ embd,
            };
            instances.push_back(instance);
        }
@ -562,6 +577,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .no_kv_offload= */ nkvo,
                /* .tensor_split = */ ts,
                /* .use_mmap     = */ mmp,
                /* .embeddings   = */ embd,
            };
            instances.push_back(instance);
        }
@ -597,6 +613,7 @@ struct test {
    bool no_kv_offload;
    std::vector<float> tensor_split;
    bool use_mmap;
    bool embeddings;
    int n_prompt;
    int n_gen;
    std::string test_time;
@ -619,6 +636,7 @@ struct test {
        no_kv_offload = inst.no_kv_offload;
        tensor_split = inst.tensor_split;
        use_mmap = inst.use_mmap;
        embeddings = inst.embeddings;
        n_prompt = inst.n_prompt;
        n_gen = inst.n_gen;
        // RFC 3339 date-time format
@ -690,7 +708,7 @@ struct test {
            "n_batch", "n_threads", "type_k", "type_v",
            "n_gpu_layers", "split_mode",
            "main_gpu", "no_kv_offload",
-            "tensor_split", "use_mmap",
+            "tensor_split", "use_mmap", "embeddings",
            "n_prompt", "n_gen", "test_time",
            "avg_ns", "stddev_ns",
            "avg_ts", "stddev_ts"
@ -710,7 +728,7 @@ struct test {
        }
        if (field == "cuda" || field == "opencl"  || field == "vulkan" || field == "kompute" || field == "metal" ||
            field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
-            field == "use_mmap") {
+            field == "use_mmap" || field == "embeddings") {
            return BOOL;
        }
        if (field == "avg_ts" || field == "stddev_ts") {
@ -744,7 +762,7 @@ struct test {
            std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
            std::to_string(n_gpu_layers), split_mode_str(split_mode),
            std::to_string(main_gpu), std::to_string(no_kv_offload),
-            tensor_split_str, std::to_string(use_mmap),
+            tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
            std::to_string(n_prompt), std::to_string(n_gen), test_time,
            std::to_string(avg_ns()), std::to_string(stdev_ns()),
            std::to_string(avg_ts()), std::to_string(stdev_ts())
@ -914,6 +932,9 @@ struct markdown_printer : public printer {
        if (field == "use_mmap") {
            return "mmap";
        }
        if (field == "embeddings") {
            return "embd";
        }
        if (field == "tensor_split") {
            return "ts";
        }
@ -957,6 +978,9 @@ struct markdown_printer : public printer {
        if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
            fields.emplace_back("use_mmap");
        }
        if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
            fields.emplace_back("embeddings");
        }
        fields.emplace_back("test");
        fields.emplace_back("t/s");
--- a/examples/server-embd.py
+++ b/examples/server-embd.py
@ -13,7 +13,7 @@ async def main():
    model_url = "http://127.0.0.1:6900"
    responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
        url= f"{model_url}/embedding",
-        json= {"content": str(i)*1024}
+        json= {"content": str(0)*1024}
    ) for i in range(n)])
    for response in responses:
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@ -1,12 +1,12 @@
 set(TARGET server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET} server.cpp oai.hpp utils.hpp json.hpp httplib.h)
+add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
 if (WIN32)
    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -436,7 +436,7 @@ Notice that each `probs` is an array of length `n_probs`.
        "next_token": {
            "has_next_token": true,
            "n_remain": -1,
-            "num_tokens_predicted": 0,
+            "n_decoded": 0,
            "stopped_eos": false,
            "stopped_limit": false,
            "stopped_word": false,
--- a/examples/server/oai.hpp
+++ b/examples/server/oai.hpp
@ -1,225 +0,0 @@
 #pragma once
 #include <string>
 #include <vector>
 #include <set>
 #include <mutex>
 #include <condition_variable>
 #include <unordered_map>
 #include "json.hpp"
 #include "utils.hpp"
 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
 using json = nlohmann::json;
 // Map an OpenAI chat-completion request body onto the parameter object used by
 // the llama.cpp completion path.
 //
 //   model         - model handle, forwarded to format_chat()
 //   body          - request JSON following the OpenAI chat API semantics
 //   chat_template - chat template, forwarded to format_chat()
 //
 // Returns a JSON object tagged with "__oaicompat" = true.
 // Throws a nlohmann::json exception when `body` has no "messages" field
 // (previously this was an unchecked const operator[] access).
 inline static json oaicompat_completion_params_parse(
    const struct llama_model * model,
    const json &body, /* openai api json semantics */
    const std::string &chat_template)
 {
    json llama_params;

    llama_params["__oaicompat"] = true;

    // Map OpenAI parameters to llama.cpp parameters
    //
    // For parameters that are defined by the OpenAI documentation (e.g.
    // temperature), we explicitly specify OpenAI's intended default; we
    // need to do that because sometimes OpenAI disagrees with llama.cpp
    //
    // https://platform.openai.com/docs/api-reference/chat/create
    llama_sampling_params default_sparams;
    llama_params["model"]             = json_value(body, "model", std::string("unknown"));
    // at() is the checked accessor: a request without "messages" raises an
    // exception instead of triggering undefined behaviour on a const json.
    llama_params["prompt"]            = format_chat(model, chat_template, body.at("messages"));
    llama_params["cache_prompt"]      = json_value(body, "cache_prompt", false);
    llama_params["temperature"]       = json_value(body, "temperature", 0.0);
    llama_params["top_k"]             = json_value(body, "top_k", default_sparams.top_k);
    llama_params["top_p"]             = json_value(body, "top_p", 1.0);
    llama_params["n_predict"]         = json_value(body, "max_tokens", -1);
    llama_params["logit_bias"]        = json_value(body, "logit_bias",json::object());
    llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
    llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
    llama_params["seed"]              = json_value(body, "seed", LLAMA_DEFAULT_SEED);
    llama_params["stream"]            = json_value(body, "stream", false);
    llama_params["mirostat"]          = json_value(body, "mirostat", default_sparams.mirostat);
    llama_params["mirostat_tau"]      = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
    llama_params["mirostat_eta"]      = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
    llama_params["penalize_nl"]       = json_value(body, "penalize_nl", default_sparams.penalize_nl);
    llama_params["typical_p"]         = json_value(body, "typical_p", default_sparams.typical_p);
    llama_params["repeat_last_n"]     = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
    llama_params["ignore_eos"]        = json_value(body, "ignore_eos", false);
    llama_params["tfs_z"]             = json_value(body, "tfs_z", default_sparams.tfs_z);

    // The grammar is only forwarded when the client supplied one.
    if (body.count("grammar") != 0) {
        llama_params["grammar"] = json_value(body, "grammar", json::object());
    }

    // Handle 'stop' field: a single string is normalised to a one-element array.
    if (body.contains("stop") && body["stop"].is_string()) {
        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
    } else {
        llama_params["stop"] = json_value(body, "stop", json::array());
    }

    // Ensure there is ChatML-specific end sequence among stop words
    llama_params["stop"].push_back("<|im_end|>");

    return llama_params;
 }
 // Build the final OpenAI-compatible chat completion payload for a finished task.
 //
 //   request   - original request JSON; its "model" field is echoed back
 //               (DEFAULT_OAICOMPAT_MODEL when absent)
 //   response  - task result whose result_json carries the content, the stop
 //               flags and the token counters
 //   streaming - when true, emit a closing "chat.completion.chunk" with an empty
 //               delta instead of a full "chat.completion" message
 //
 // Returns the response object; "finish_reason" is "stop" when generation ended
 // on a stop word or EOS, and "length" otherwise.
 inline static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
 {
    json result = response.result_json;

    // Read the flag's value instead of testing for the key: a result that
    // carries "stopped_word": false must not be reported as a stop.
    bool stopped_word        = json_value(result, "stopped_word", false);
    bool stopped_eos         = json_value(result, "stopped_eos", false);
    int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
    int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);
    std::string content      = json_value(result, "content", std::string(""));

    std::string finish_reason = "length";
    if (stopped_word || stopped_eos) {
        finish_reason = "stop";
    }

    json choices =
        streaming ? json::array({json{{"finish_reason", finish_reason},
                                        {"index", 0},
                                        {"delta", json::object()}}})
                  : json::array({json{{"finish_reason", finish_reason},
                                        {"index", 0},
                                        {"message", json{{"content", content},
                                                         {"role", "assistant"}}}}});

    std::time_t t = std::time(0);

    json res =
        json{{"choices", choices},
            {"created", t},
            {"model",
                json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
            {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
            {"usage",
                json{{"completion_tokens", num_tokens_predicted},
                     {"prompt_tokens",     num_prompt_tokens},
                     {"total_tokens",      num_tokens_predicted + num_prompt_tokens}}},
            {"id", gen_chatcmplid()}};

    // In verbose mode the raw result is attached for debugging.
    if (server_verbose) {
        res["__verbose"] = result;
    }

    if (result.contains("completion_probabilities")) {
        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
    }

    return res;
 }
 // return value is vector as there is one case where we might need to generate two responses
 inline static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
    json result = response.result_json;
    if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
        return std::vector<json>({response.result_json});
    }
    bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
    std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
    bool stopped_word   = json_value(result, "stopped_word", false);
    bool stopped_eos    = json_value(result, "stopped_eos", false);
    bool stopped_limit  = json_value(result, "stopped_limit", false);
    std::string content = json_value(result, "content", std::string(""));
    std::string finish_reason;
    if (stopped_word || stopped_eos) {
        finish_reason = "stop";
    }
    if (stopped_limit) {
        finish_reason = "length";
    }
    std::time_t t = std::time(0);
    json choices;
    if (!finish_reason.empty()) {
        choices = json::array({json{{"finish_reason", finish_reason},
                                    {"index", 0},
                                    {"delta", json::object()}}});
    } else {
        if (first) {
            if (content.empty()) {
                choices = json::array({json{{"finish_reason", nullptr},
                                            {"index", 0},
                                            {"delta", json{{"role", "assistant"}}}}});
            } else {
                // We have to send this as two updates to conform to openai behavior
                json initial_ret = json{{"choices", json::array({json{
                                        {"finish_reason", nullptr},
                                        {"index", 0},
                                        {"delta", json{
                                            {"role", "assistant"}
                                        }}}})},
                            {"created", t},
                            {"id", gen_chatcmplid()},
                            {"model", modelname},
                            {"object", "chat.completion.chunk"}};
                json second_ret = json{
                            {"choices", json::array({json{{"finish_reason", nullptr},
                                                            {"index", 0},
                                                            {"delta", json{
                                                            {"content", content}}}
                                                            }})},
                            {"created", t},
                            {"id", gen_chatcmplid()},
                            {"model", modelname},
                            {"object", "chat.completion.chunk"}};
                return std::vector<json>({initial_ret, second_ret});
            }
        } else {
            // Some idiosyncrasy in task processing logic makes several trailing calls
            // with empty content, we ignore these at the calee site.
            if (content.empty()) {
                return std::vector<json>({json::object()});
            }
            choices = json::array({json{
                {"finish_reason", nullptr},
                {"index", 0},
                {"delta",
                json{
                    {"content", content},
                }},
            }});
        }
    }
    json ret = json{{"choices", choices},
                    {"created", t},
                    {"id", gen_chatcmplid()},
                    {"model", modelname},
                    {"object", "chat.completion.chunk"}};
    return std::vector<json>({ret});
 }
 // Wrap already-formatted embedding entries in an OpenAI-style "list" response.
 // The model name is echoed from the request (DEFAULT_OAICOMPAT_MODEL when
 // absent); the usage counters are always reported as zero.
 inline static json format_embeddings_response_oaicompat(const json &request, const json &embeddings)
 {
    const std::string model_name = json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
    const json usage = json{{"prompt_tokens", 0},
                            {"total_tokens", 0}};

    return json{
        {"model", model_name},
        {"object", "list"},
        {"usage", usage},
        {"data", embeddings}
    };
 }
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
--- a/examples/server/tests/features/embeddings.feature
+++ b/examples/server/tests/features/embeddings.feature
@ -0,0 +1,94 @@
@llama.cpp
@embeddings
 Feature: llama.cpp server
  Background: Server startup
    Given a server listening on localhost:8080
    And   a model file bert-bge-small/ggml-model-f16.gguf from HF repo ggml-org/models
    And   a model alias bert-bge-small
    And   42 as server seed
    And   2 slots
    And   1024 as batch size
    And   2048 KV cache size
    And   embeddings extraction
    Then  the server is starting
    Then  the server is healthy
  Scenario: Embedding
    When embeddings are computed for:
    """
    What is the capital of Bulgaria ?
    """
    Then embeddings are generated
  Scenario: OAI Embeddings compatibility
    Given a model bert-bge-small
    When an OAI compatible embeddings computation request for:
    """
    What is the capital of Spain ?
    """
    Then embeddings are generated
  Scenario: OAI Embeddings compatibility with multiple inputs
    Given a model bert-bge-small
    Given a prompt:
      """
      In which country Paris is located ?
      """
    And a prompt:
      """
      Is Madrid the capital of Spain ?
      """
    When an OAI compatible embeddings computation request for multiple inputs
    Then embeddings are generated
  Scenario: Multi users embeddings
    Given a prompt:
      """
      Write a very long story about AI.
      """
    And a prompt:
      """
      Write another very long music lyrics.
      """
    And a prompt:
      """
      Write a very long poem.
      """
    And a prompt:
      """
      Write a very long joke.
      """
    Given concurrent embedding requests
    Then the server is busy
    Then the server is idle
    Then all embeddings are generated
  Scenario: Multi users OAI compatibility embeddings
    Given a prompt:
      """
      In which country Paris is located ?
      """
    And a prompt:
      """
      Is Madrid the capital of Spain ?
      """
    And a prompt:
      """
      What is the biggest US city ?
      """
    And a prompt:
      """
      What is the capital of Bulgaria ?
      """
    And   a model bert-bge-small
    Given concurrent OAI embedding requests
    Then the server is busy
    Then the server is idle
    Then all embeddings are generated
  Scenario: All embeddings should be the same
    Given 10 fixed prompts
    And   a model bert-bge-small
    Given concurrent OAI embedding requests
    Then all embeddings are the same
--- a/examples/server/tests/features/parallel.feature
+++ b/examples/server/tests/features/parallel.feature
@ -9,7 +9,6 @@ Feature: Parallel
    And   512 as batch size
    And   64 KV cache size
    And   2 slots
    And   embeddings extraction
    And   continuous batching
    Then  the server is starting
    Then  the server is healthy
@ -99,48 +98,3 @@ Feature: Parallel
    Then the server is busy
    Then the server is idle
    Then all prompts are predicted
  Scenario: Multi users embeddings
    Given a prompt:
      """
      Write a very long story about AI.
      """
    And a prompt:
      """
      Write another very long music lyrics.
      """
    And a prompt:
      """
      Write a very long poem.
      """
    And a prompt:
      """
      Write a very long joke.
      """
    Given concurrent embedding requests
    Then the server is busy
    Then the server is idle
    Then all embeddings are generated
  Scenario: Multi users OAI compatibility embeddings
    Given a prompt:
      """
      In which country Paris is located ?
      """
    And a prompt:
      """
      Is Madrid the capital of Spain ?
      """
    And a prompt:
      """
      What is the biggest US city ?
      """
    And a prompt:
      """
      What is the capital of Bulgaria ?
      """
    And   a model tinyllama-2
    Given concurrent OAI embedding requests
    Then the server is busy
    Then the server is idle
    Then all embeddings are generated
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@ -49,34 +49,6 @@ Feature: llama.cpp server
      | llama-2      | Book                        | What is the best book                | 8          | (Mom\|what)+           | 8           | disabled         |
      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64         | (thanks\|happy\|bird)+ | 32          | enabled          |
  Scenario: Embedding
    When embeddings are computed for:
    """
    What is the capital of Bulgaria ?
    """
    Then embeddings are generated
  Scenario: OAI Embeddings compatibility
    Given a model tinyllama-2
    When an OAI compatible embeddings computation request for:
    """
    What is the capital of Spain ?
    """
    Then embeddings are generated
  Scenario: OAI Embeddings compatibility with multiple inputs
    Given a model tinyllama-2
    Given a prompt:
      """
      In which country Paris is located ?
      """
    And a prompt:
      """
      Is Madrid the capital of Spain ?
      """
    When an OAI compatible embeddings computation request for multiple inputs
    Then embeddings are generated
  Scenario: Tokenize / Detokenize
    When tokenizing:
    """
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@ -10,6 +10,7 @@ from contextlib import closing
 from re import RegexFlag
 import aiohttp
 import numpy as np
 import openai
 from behave import step
 from behave.api.async_step import async_run_until_complete
@ -24,6 +25,9 @@ def step_server_config(context, server_fqdn, server_port):
    if 'PORT' in os.environ:
        context.server_port = int(os.environ['PORT'])
        print(f"$PORT set, overriding server port with to {context.server_port}")
    if 'FQDN' in os.environ:
        context.server_fqdn = os.environ['FQDN']
        print(f"$FQDN set, overriding server fqdn with to {context.server_fqdn}")
    context.base_url = f'http://{context.server_fqdn}:{context.server_port}'
@ -34,6 +38,7 @@ def step_server_config(context, server_fqdn, server_port):
    context.n_ga_w = None
    context.n_gpu_layer = None
    context.n_predict = None
    context.n_prompts = 0
    context.n_server_predict = None
    context.n_slots = None
    context.prompt_prefix = None
@ -202,6 +207,7 @@ def step_n_tokens_predicted(context, predicted_n):
@step(u'a user prompt {user_prompt}')
 def step_user_prompt(context, user_prompt):
    """Append ``user_prompt`` to ``context.prompts`` and refresh ``context.n_prompts``."""
    context.prompts.append(user_prompt)
    # keep the prompt counter in sync with the number of queued prompts
    context.n_prompts = len(context.prompts)
@step(u'a system prompt {system_prompt}')
@ -290,6 +296,12 @@ def step_prompt_passkey(context):
    context.prompt_passkey = context.text
@step(u'{n_prompts:d} fixed prompts')
 def step_fixed_prompts(context, n_prompts):
    """Queue ``n_prompts`` identical prompts and record that count.

    Each prompt is the character ``"0"`` repeated ``context.n_batch`` times,
    or 512 times when ``context.n_batch`` is ``None``.
    """
    context.prompts.extend([str(0)*(context.n_batch if context.n_batch is not None else 512) for i in range(n_prompts)])
    # the counter is set to the requested amount, not to len(context.prompts)
    context.n_prompts = n_prompts
@step(u'a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk')
 def step_prompt_passkey(context, passkey, i_pos):
    prompt = ""
@ -301,6 +313,7 @@ def step_prompt_passkey(context, passkey, i_pos):
        passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m"
        print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```\n")
    context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix)
    context.n_prompts = len(context.prompts)
@step(u'an OAI compatible chat completions request with {api_error} api error')
@ -341,11 +354,13 @@ async def step_oai_chat_completions(context, api_error):
@step(u'a prompt')
 def step_a_prompt(context):
    """Append ``context.text`` to ``context.prompts`` and refresh ``context.n_prompts``."""
    context.prompts.append(context.text)
    # keep the prompt counter in sync with the number of queued prompts
    context.n_prompts = len(context.prompts)
@step(u'a prompt {prompt}')
 def step_a_prompt_prompt(context, prompt):
    """Append the inline ``prompt`` to ``context.prompts`` and refresh ``context.n_prompts``."""
    context.prompts.append(prompt)
    # keep the prompt counter in sync with the number of queued prompts
    context.n_prompts = len(context.prompts)
@step(u'concurrent completion requests')
@ -430,25 +445,47 @@ async def all_prompts_are_predicted(context, expected_predicted_n=None):
@step(u'embeddings are computed for')
@async_run_until_complete
 async def step_compute_embedding(context):
    """Request an embedding for ``context.text`` and store it on the context.

    Sets ``context.n_prompts`` to 1 and assigns the awaited result of
    ``request_embedding`` to ``context.embeddings``.
    """
    context.n_prompts = 1
    context.embeddings = await request_embedding(context.text, base_url=context.base_url)
@step(u'all embeddings are the same')
@async_run_until_complete
 async def step_all_embeddings_are_the_same(context):
    """Assert that all gathered embeddings are pairwise equivalent.

    Awaits ``gather_tasks_results``, requires at least one result, validates
    every embedding with ``assert_embeddings`` and then asserts that the cosine
    similarity of each pair is close to 1.0.
    """
    n_embedding_requests = await gather_tasks_results(context)
    assert n_embedding_requests > 0
    embeddings = []
    for i in range(n_embedding_requests):
        # pop one task result, then pop the last entry of that result
        embedding = context.tasks_result.pop().pop()
        embeddings.append(embedding)
        assert_embeddings(embedding)
    n = len(embeddings)
    # compare every unordered pair (i, j) with i < j
    for i in range(n-1):
        for j in range(i+1, n):
            embedding1 = np.array(embeddings[i])
            embedding2 = np.array(embeddings[j])
            if context.debug:
                print(f"embedding1: {embedding1[-8:]}\n")
                print(f"embedding2: {embedding2[-8:]}\n")
            # cosine similarity: dot product divided by the product of the norms
            similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
            msg = f"Similarity between {i} and {j}: {similarity:.10f}"
            if context.debug:
                print(f"{msg}\n")
            assert np.isclose(similarity, 1.0, rtol=1e-05, atol=1e-08, equal_nan=False), msg
@step(u'embeddings are generated')
 def step_assert_embeddings(context):
-    if len(context.prompts) == 0:
+    assert context.n_prompts == len(context.embeddings), (f"unexpected response:\n"
-        assert_embeddings(context.embeddings)
+                                                             f"context.n_prompts={context.n_prompts}\n"
    else:
        assert len(context.embeddings) == len(context.prompts), (f"unexpected response:\n"
                                                                 f"context.prompts={context.prompts}\n"
                                                             f"context.embeddings={context.embeddings}")
    for embedding in context.embeddings:
            context.prompts.pop()
        assert_embeddings(embedding)
@step(u'an OAI compatible embeddings computation request for')
@async_run_until_complete
 async def step_oai_compute_embeddings(context):
    context.n_prompts = 1
    context.embeddings = await request_oai_embeddings(context.text,
                                                      base_url=context.base_url,
                                                      user_api_key=context.user_api_key,
@ -462,6 +499,7 @@ async def step_oai_compute_embeddings_multiple_inputs(context):
                                                      base_url=context.base_url,
                                                      user_api_key=context.user_api_key,
                                                      model=context.model)
    context.prompts.clear()
@step(u'concurrent embedding requests')
@ -488,9 +526,9 @@ async def step_concurrent_oai_embedding_requests(context):
@async_run_until_complete()
 async def all_embeddings_are_generated(context):
    n_embedding_requests = await gather_tasks_results(context)
-    assert n_embedding_requests > 0
+    assert n_embedding_requests == context.n_prompts
    for i in range(n_embedding_requests):
-        assert_embeddings(context.tasks_result.pop())
+        assert_embeddings(context.tasks_result.pop().pop())
@step(u'tokenizing')
@ -588,11 +626,11 @@ def step_supported_models(context, i_model, param, preposition, param_value):
 async def concurrent_requests(context, f_completion, *args, **kwargs):
-    n_prompts = len(context.prompts)
+    context.n_prompts = len(context.prompts)
    if context.debug:
-        print(f"starting {n_prompts} concurrent completion requests...")
+        print(f"starting {context.n_prompts} concurrent completion requests...")
-    assert n_prompts > 0
+    assert context.n_prompts > 0
-    for prompt_no in range(n_prompts):
+    for prompt_no in range(context.n_prompts):
        shifted_args = [context.prompts.pop(), *args]
        context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
    await asyncio.sleep(0.1)
@ -765,7 +803,7 @@ async def request_embedding(content, base_url=None):
                                }) as response:
            assert response.status == 200
            response_json = await response.json()
-            return response_json['embedding']
+            return [response_json['embedding']]
 async def request_oai_embeddings(input,
@ -775,6 +813,7 @@ async def request_oai_embeddings(input,
    user_api_key = user_api_key if user_api_key is not None else 'nope'
    if async_client:
        origin = 'llama.cpp'
        headers=[]
        if user_api_key is not None:
            headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
        async with aiohttp.ClientSession() as session:
@ -783,14 +822,21 @@ async def request_oai_embeddings(input,
                                        "input": input,
                                        "model": model,
                                    },
-                                    headers=headers) as response:
+                                    headers=headers,
                                    timeout=3600) as response:
                assert response.status == 200, f"received status code not expected: {response.status}"
                assert response.headers['Access-Control-Allow-Origin'] == origin
                assert response.headers['Content-Type'] == "application/json; charset=utf-8"
                response_json = await response.json()
                assert response_json['model'] == model, f"invalid model received: {response_json['model']}"
                assert response_json['object'] == 'list'
-                return response_json['data']
+                if isinstance(input, collections.abc.Sequence):
                    embeddings = []
                    for an_oai_embeddings in response_json['data']:
                        embeddings.append(an_oai_embeddings['embedding'])
                else:
                    embeddings = [response_json['data']['embedding']]
                return embeddings
    else:
        openai.api_key = user_api_key
        openai.api_base = f'{base_url}/v1'
@ -804,7 +850,7 @@ async def request_oai_embeddings(input,
            for an_oai_embeddings in oai_embeddings.data:
                embeddings.append(an_oai_embeddings.embedding)
        else:
-            embeddings = oai_embeddings.data.embedding
+            embeddings = [oai_embeddings.data.embedding]
        return embeddings
@ -899,6 +945,8 @@ def assert_embeddings(embeddings):
    assert len(embeddings) > 0
    embeddings_computed = False
    for emb in embeddings:
        if not isinstance(emb, float):
            assert False, f"Bad embeddings: {embeddings}"
        if emb != 0:
            embeddings_computed = True
    assert embeddings_computed, f"Embeddings: {embeddings}"
--- a/examples/server/tests/requirements.txt
+++ b/examples/server/tests/requirements.txt
@ -1,5 +1,6 @@
 aiohttp~=3.9.3
 behave~=1.2.6
 huggingface_hub~=0.20.3
 numpy~=1.24.4
 openai~=0.25.0
 prometheus-client~=0.20.0
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@ -1,15 +1,16 @@
 #pragma once
-#include <string>
+#include "llama.h"
-#include <vector>
+#include "common.h"
 #include <set>
 #include <mutex>
 #include <condition_variable>
 #include <unordered_map>
 #include "json.hpp"
-#include "../llava/clip.h"
+#include <string>
 #include <vector>
 #include <sstream>
 #include <random>
 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
 using json = nlohmann::json;
@ -37,61 +38,13 @@ extern bool server_log_json;
 #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO(   MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
-enum server_state {
+template <typename T>
-    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
+static T json_value(const json &body, const std::string &key, const T &default_value) {
-    SERVER_STATE_READY,          // Server is ready and model is loaded
+    // Fallback null to default value
-    SERVER_STATE_ERROR           // An error occurred, load_model failed
+    return body.contains(key) && !body.at(key).is_null()
-};
+        ? body.value(key, default_value)
-
+        : default_value;
-enum task_type {
+}
    TASK_TYPE_COMPLETION,
    TASK_TYPE_CANCEL,
    TASK_TYPE_NEXT_RESPONSE,
    TASK_TYPE_METRICS
 };
 // Description of one unit of work handed to llama_server_queue.
 struct task_server {
    int id = -1;        // to be filled by llama_server_queue
    int target_id = -1; // defaulted so it is never read uninitialized; presumably the id of the task this one refers to - confirm
    task_type type;     // no default on purpose: the creator of the task must set it
    json data;
    bool infill_mode = false;
    bool embedding_mode = false;
    int multitask_id = -1;
 };
 // Result produced for a task; delivered through llama_server_response.
 struct task_result {
    int id = -1;           // id of the task this result belongs to (matched against task_id in recv/send)
    int multitask_id = -1; // recv() asserts this is -1 for results that are received directly
    bool stop = false;     // defaulted so the flags are never read uninitialized
    bool error = false;
    json result_json;
 };
 // A task composed of several subtasks; it is finished once subtasks_remaining is empty.
 struct task_multi {
    int id = -1; // defaulted so it is never read uninitialized; add_multitask() overwrites it
    std::set<int> subtasks_remaining{};  // ids of the subtasks that have not reported a result yet
    std::vector<task_result> results{};  // results appended by update_multitask()
 };
 // One generated token together with the probabilities of candidate tokens.
 struct completion_token_output {
    // a candidate token and the probability assigned to it
    struct token_prob {
        llama_token tok;
        float       prob;
    };

    std::vector<token_prob> probs;

    llama_token tok;
    std::string text_to_send;
 };
 // Function object that turns a token (or a completion_token_output) into its text piece.
 struct token_translator {
    llama_context * ctx;

    // text piece of a single token
    std::string operator()(llama_token tok) const {
        return llama_token_to_piece(ctx, tok);
    }

    // text piece of the token stored in a completion_token_output
    std::string operator()(const completion_token_output & cto) const {
        return operator()(cto.tok);
    }
 };
 static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
    std::stringstream ss_tid;
@ -102,18 +55,18 @@ static inline void server_log(const char *level, const char *function, int line,
    };
    if (server_log_json) {
-        log.merge_patch(
+        log.merge_patch( {
                {
            {"level",    level},
            {"function", function},
            {"line",     line},
            {"msg",      message},
        });
        if (!extra.empty()) {
            log.merge_patch(extra);
        }
-        std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
+        printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str());
    } else {
        char buf[1024];
        snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
@ -136,22 +89,13 @@ static inline void server_log(const char *level, const char *function, int line,
 }
 //
-// server utils
+// chat template utils
 //
 // Read `key` from `body`, falling back to `default_value` when the key is
 // absent or holds an explicit null.
 template <typename T>
 static T json_value(const json &body, const std::string &key, const T &default_value) {
    const bool usable = body.contains(key) && !body.at(key).is_null();
    if (!usable) {
        return default_value;
    }
    return body.value(key, default_value);
 }
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 inline bool verify_custom_template(const std::string & tmpl) {
    llama_chat_message chat[] = {{"user", "test"}};
-    std::vector<char> buf(1);
+    int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
    int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size());
    return res >= 0;
 }
@ -163,7 +107,7 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
    std::vector<llama_chat_message> chat(messages.size());
    for (size_t i = 0; i < messages.size(); ++i) {
-        auto &curr_msg = messages[i];
+        const auto & curr_msg = messages[i];
        str[i*2 + 0]    = json_value(curr_msg, "role",    std::string(""));
        str[i*2 + 1]    = json_value(curr_msg, "content", std::string(""));
        alloc_size     += str[i*2 + 1].length();
@ -183,261 +127,13 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
    }
-    std::string formatted_chat(buf.data(), res);
+    const std::string formatted_chat(buf.data(), res);
    LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
    return formatted_chat;
 }
 //
 // work queue utils
 //
 // Task queue of the server. Work is added with post()/defer(); start_loop()
 // pops the tasks and invokes the registered callbacks from the thread that
 // called it.
 struct llama_server_queue {
    int id = 0;             // next task id handed out by post() / get_new_id()
    std::mutex mutex_tasks;
    bool running = false;   // set by start_loop(), cleared by terminate()

    // queues
    std::vector<task_server> queue_tasks;
    std::vector<task_server> queue_tasks_deferred;
    std::vector<task_multi>  queue_multitasks;

    std::condition_variable condition_tasks;

    // callback functions
    std::function<void(task_server&)> callback_new_task;
    std::function<void(task_multi&)>  callback_finish_multitask;
    std::function<void(void)>         callback_run_slots;

    // Add a new task to the end of the queue and return its id.
    // A task whose id is still -1 gets a fresh id assigned here.
    int post(task_server task) {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        if (task.id == -1) {
            task.id = id++;
            LOG_VERBOSE("new task id", {{"new_id", task.id}});
        }
        // remember the id before the task is moved into the queue
        const int task_id = task.id;
        queue_tasks.push_back(std::move(task));
        condition_tasks.notify_one();
        return task_id;
    }

    // Add a new task, but defer until one slot is available
    void defer(task_server task) {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        queue_tasks_deferred.push_back(std::move(task));
    }

    // Get the next id for creating a new task
    int get_new_id() {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        const int new_id = id++;
        LOG_VERBOSE("new task id", {{"new_id", new_id}});
        return new_id;
    }

    // Register function to process a new task
    void on_new_task(std::function<void(task_server&)> callback) {
        callback_new_task = std::move(callback);
    }

    // Register function to process a multitask when it is finished
    void on_finish_multitask(std::function<void(task_multi&)> callback) {
        callback_finish_multitask = std::move(callback);
    }

    // Register the function to be called when all slots data is ready to be processed
    void on_run_slots(std::function<void(void)> callback) {
        callback_run_slots = std::move(callback);
    }

    // Call when the state of one slot is changed
    void notify_slot_changed() {
        // move deferred tasks back to main loop
        std::unique_lock<std::mutex> lock(mutex_tasks);
        for (auto & task : queue_tasks_deferred) {
            queue_tasks.push_back(std::move(task));
        }
        queue_tasks_deferred.clear();
    }

    // end the start_loop routine
    void terminate() {
        {
            std::unique_lock<std::mutex> lock(mutex_tasks);
            running = false;
        }
        // wake start_loop() if it is blocked waiting for a task
        condition_tasks.notify_all();
    }

    /**
     * Main loop consists of these steps:
     * - Wait until a new task arrives
     * - Process the task (i.e. maybe copy data into slot)
     * - Check if multitask is finished
     * - Run all slots
     */
    void start_loop() {
        running = true;

        while (true) {
            LOG_VERBOSE("new task may arrive", {});

            // drain the queue; the lock is released before each callback runs
            while (true) {
                std::unique_lock<std::mutex> lock(mutex_tasks);
                if (queue_tasks.empty()) {
                    break;
                }
                task_server task = std::move(queue_tasks.front());
                queue_tasks.erase(queue_tasks.begin());
                lock.unlock();

                LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
                callback_new_task(task);
            }

            LOG_VERBOSE("update_multitasks", {});

            // check if we have any finished multitasks
            auto queue_iterator = queue_multitasks.begin();
            while (queue_iterator != queue_multitasks.end()) {
                if (queue_iterator->subtasks_remaining.empty()) {
                    // all subtasks done == multitask is done
                    task_multi current_multitask = *queue_iterator;
                    callback_finish_multitask(current_multitask);
                    // remove this multitask
                    queue_iterator = queue_multitasks.erase(queue_iterator);
                } else {
                    ++queue_iterator;
                }
            }

            // all tasks in the current loop are processed, slots data is now ready
            LOG_VERBOSE("callback_run_slots", {});
            callback_run_slots();

            LOG_VERBOSE("wait for new task", {});
            // wait for new task
            {
                std::unique_lock<std::mutex> lock(mutex_tasks);
                if (queue_tasks.empty()) {
                    if (!running) {
                        LOG_VERBOSE("ending start_loop", {});
                        return;
                    }
                    condition_tasks.wait(lock, [&]{
                        return (!queue_tasks.empty() || !running);
                    });
                }
            }
        }
    }

    //
    // functions to manage multitasks
    //

    // add a multitask by specifying the id of all subtask (subtask is a task_server)
    void add_multitask(int multitask_id, std::vector<int>& sub_ids) {
        std::lock_guard<std::mutex> lock(mutex_tasks);
        task_multi multi;
        multi.id = multitask_id;
        multi.subtasks_remaining.insert(sub_ids.begin(), sub_ids.end());
        queue_multitasks.push_back(std::move(multi));
    }

    // update the remaining subtasks, while appending results to multitask
    void update_multitask(int multitask_id, int subtask_id, task_result& result) {
        std::lock_guard<std::mutex> lock(mutex_tasks);
        for (auto & multitask : queue_multitasks) {
            if (multitask.id == multitask_id) {
                multitask.subtasks_remaining.erase(subtask_id);
                multitask.results.push_back(result);
            }
        }
    }
 };
 // Result queue of the server: send() publishes results for tasks that are
 // being waited on, recv() blocks until the result of a given task is available.
 struct llama_server_response {
    typedef std::function<void(int, int, task_result&)> callback_multitask_t;
    callback_multitask_t callback_update_multitask;

    // for keeping track of all tasks waiting for the result
    std::set<int> waiting_task_ids;

    // the main result queue
    std::vector<task_result> queue_results;

    std::mutex mutex_results;
    std::condition_variable condition_results;

    // add the task_id to the list of tasks waiting for response
    void add_waiting_task_id(int task_id) {
        LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
        std::unique_lock<std::mutex> lock(mutex_results);
        waiting_task_ids.insert(task_id);
    }

    // when the request is finished, we can remove task associated with it
    void remove_waiting_task_id(int task_id) {
        LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
        std::unique_lock<std::mutex> lock(mutex_results);
        waiting_task_ids.erase(task_id);
    }

    // This function blocks the thread until there is a response for this task_id
    task_result recv(int task_id) {
        std::unique_lock<std::mutex> lock(mutex_results);

        // Sleep until a result for *this* task is queued. Waiting merely for a
        // non-empty queue would spin while only other tasks' results are pending.
        auto it = queue_results.end();
        condition_results.wait(lock, [&]{
            for (it = queue_results.begin(); it != queue_results.end(); ++it) {
                if (it->id == task_id) {
                    return true;
                }
            }
            return false;
        });

        // the lock is still held here, so `it` is valid
        assert(it->multitask_id == -1);
        task_result res = std::move(*it);
        queue_results.erase(it);
        return res;
    }

    // Register the function to update multitask
    void on_multitask_update(callback_multitask_t callback) {
        callback_update_multitask = std::move(callback);
    }

    // Send a new result to a waiting task_id.
    // A result whose id is not in waiting_task_ids is not queued.
    void send(task_result result) {
        std::unique_lock<std::mutex> lock(mutex_results);
        LOG_VERBOSE("send new result", {{"task_id", result.id}});
        for (const auto & task_id : waiting_task_ids) {
            // LOG_TEE("waiting task id %i \n", task_id);
            // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
            if (result.multitask_id == task_id) {
                LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}});
                callback_update_multitask(task_id, result.id, result);
                continue;
            }

            if (result.id == task_id) {
                LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}});
                queue_results.push_back(result);
                condition_results.notify_all();
                return;
            }
        }
    }
 };
 //
 // base64 utils (TODO: move to common in the future)
 //
@ -447,13 +143,11 @@ static const std::string base64_chars =
             "abcdefghijklmnopqrstuvwxyz"
             "0123456789+/";
-static inline bool is_base64(uint8_t c)
+static inline bool is_base64(uint8_t c) {
 {
    return (isalnum(c) || (c == '+') || (c == '/'));
 }
-static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
+static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string) {
 {
    int i = 0;
    int j = 0;
    int in_ = 0;
@ -465,13 +159,10 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
    std::vector<uint8_t> ret;
-    while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
+    while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
    {
        char_array_4[i++] = encoded_string[in_]; in_++;
-        if (i == 4)
+        if (i == 4) {
-        {
+            for (i = 0; i < 4; i++) {
            for (i = 0; i <4; i++)
            {
                char_array_4[i] = base64_chars.find(char_array_4[i]);
            }
@ -479,23 +170,20 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
            char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
            char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
-            for (i = 0; (i < 3); i++)
+            for (i = 0; (i < 3); i++) {
            {
                ret.push_back(char_array_3[i]);
            }
            i = 0;
        }
    }
-    if (i)
+    if (i) {
-    {
+        for (j = i; j < 4; j++) {
        for (j = i; j <4; j++)
        {
            char_array_4[j] = 0;
        }
-        for (j = 0; j <4; j++)
+        for (j = 0; j < 4; j++) {
        {
            char_array_4[j] = base64_chars.find(char_array_4[j]);
        }
@ -503,8 +191,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
        char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
        char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
-        for (j = 0; (j < i - 1); j++)
+        for (j = 0; j < i - 1; j++) {
        {
            ret.push_back(char_array_3[j]);
        }
    }
@ -516,8 +203,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
 // random string / id
 //
-static std::string random_string()
+static std::string random_string() {
 {
    static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
    std::random_device rd;
@ -532,10 +218,10 @@ static std::string random_string()
    return result;
 }
-static std::string gen_chatcmplid()
+static std::string gen_chatcmplid() {
 {
    std::stringstream chatcmplid;
    chatcmplid << "chatcmpl-" << random_string();
    return chatcmplid.str();
 }
@ -543,91 +229,316 @@ static std::string gen_chatcmplid()
 // other common utils
 //
-static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
+static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
 {
    size_t i;
-    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
+    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
-    {
+
    }
    return i;
 }
-static bool ends_with(const std::string &str, const std::string &suffix)
+static bool ends_with(const std::string & str, const std::string & suffix) {
-{
+    return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
    return str.size() >= suffix.size() &&
           0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
 }
-static size_t find_partial_stop_string(const std::string &stop,
+static size_t find_partial_stop_string(const std::string &stop, const std::string &text) {
-                                       const std::string &text)
+    if (!text.empty() && !stop.empty()) {
 {
    if (!text.empty() && !stop.empty())
    {
        const char text_last_char = text.back();
-        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
+        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
-        {
+            if (stop[char_index] == text_last_char) {
            if (stop[char_index] == text_last_char)
            {
                const std::string current_partial = stop.substr(0, char_index + 1);
-                if (ends_with(text, current_partial))
+                if (ends_with(text, current_partial)) {
                {
                    return text.size() - char_index - 1;
                }
            }
        }
    }
    return std::string::npos;
 }
 // TODO: reuse llama_detokenize
 template <class Iter>
-static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
+static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
 {
    std::string ret;
-    for (; begin != end; ++begin)
+    for (; begin != end; ++begin) {
    {
        ret += llama_token_to_piece(ctx, *begin);
    }
    return ret;
 }
 // format incomplete utf-8 multibyte character for output
-static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
+static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
 {
    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
    // if the size is 1 and first bit is 1, meaning it's a partial character
    //   (size > 1 meaning it's already a known token)
-    if (out.size() == 1 && (out[0] & 0x80) == 0x80)
+    if (out.size() == 1 && (out[0] & 0x80) == 0x80) {
    {
        std::stringstream ss;
        ss << std::hex << (out[0] & 0xff);
        std::string res(ss.str());
        out = "byte: \\x" + res;
    }
    return out;
 }
 // One generated token together with the probabilities of candidate tokens.
 struct completion_token_output {
    // a candidate token and the probability assigned to it
    struct token_prob {
        llama_token tok;
        float       prob;
    };

    llama_token tok;
    std::string text_to_send;

    std::vector<token_prob> probs;
 };
 // convert a vector of completion_token_output to json
-static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
+static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> & probs) {
 {
    json out = json::array();
-    for (const auto &prob : probs)
+
-    {
+    for (const auto & prob : probs) {
        json probs_for_token = json::array();
-        for (const auto &p : prob.probs)
+
-        {
+        for (const auto & p : prob.probs) {
-            std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
+            const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
-            probs_for_token.push_back(json
+            probs_for_token.push_back(json {
            {
                {"tok_str", tok_str},
                {"prob",    p.prob},
            });
        }
-        std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
+
        const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
        out.push_back(json {
            {"content", tok_str},
            {"probs",   probs_for_token},
        });
    }
    return out;
 }
 //
 // OAI utils
 //
 // Translate an OpenAI-style chat completion request body into the parameter
 // object used by the llama.cpp completion handler.
 //
 // model         - model used to format the chat messages
 // body          - request body with OpenAI API semantics; must contain "messages"
 // chat_template - chat template forwarded to format_chat
 //
 // Throws json::out_of_range when "messages" is missing.
 static json oaicompat_completion_params_parse(
    const struct llama_model * model,
    const json & body, /* openai api json semantics */
    const std::string & chat_template) {
    json llama_params;

    llama_params["__oaicompat"] = true;

    // Map OpenAI parameters to llama.cpp parameters
    //
    // For parameters that are defined by the OpenAI documentation (e.g.
    // temperature), we explicitly specify OpenAI's intended default; we
    // need to do that because sometimes OpenAI disagrees with llama.cpp
    //
    // https://platform.openai.com/docs/api-reference/chat/create
    llama_sampling_params default_sparams;
    llama_params["model"]             = json_value(body,   "model",             std::string("unknown"));
    // at() instead of operator[]: indexing a const json with a missing key is
    // undefined behaviour, while at() reports it with an exception
    llama_params["prompt"]            = format_chat(model, chat_template,       body.at("messages"));
    llama_params["cache_prompt"]      = json_value(body,   "cache_prompt",      false);
    llama_params["temperature"]       = json_value(body,   "temperature",       0.0);
    llama_params["top_k"]             = json_value(body,   "top_k",             default_sparams.top_k);
    llama_params["top_p"]             = json_value(body,   "top_p",             1.0);
    llama_params["n_predict"]         = json_value(body,   "max_tokens",        -1);
    llama_params["logit_bias"]        = json_value(body,   "logit_bias",        json::object());
    llama_params["frequency_penalty"] = json_value(body,   "frequency_penalty", 0.0);
    llama_params["presence_penalty"]  = json_value(body,   "presence_penalty",  0.0);
    llama_params["seed"]              = json_value(body,   "seed",              LLAMA_DEFAULT_SEED);
    llama_params["stream"]            = json_value(body,   "stream",            false);
    llama_params["mirostat"]          = json_value(body,   "mirostat",          default_sparams.mirostat);
    llama_params["mirostat_tau"]      = json_value(body,   "mirostat_tau",      default_sparams.mirostat_tau);
    llama_params["mirostat_eta"]      = json_value(body,   "mirostat_eta",      default_sparams.mirostat_eta);
    llama_params["penalize_nl"]       = json_value(body,   "penalize_nl",       default_sparams.penalize_nl);
    llama_params["typical_p"]         = json_value(body,   "typical_p",         default_sparams.typical_p);
    llama_params["repeat_last_n"]     = json_value(body,   "repeat_last_n",     default_sparams.penalty_last_n);
    llama_params["ignore_eos"]        = json_value(body,   "ignore_eos",        false);
    llama_params["tfs_z"]             = json_value(body,   "tfs_z",             default_sparams.tfs_z);

    // "grammar" is only forwarded when the request carries it
    if (body.count("grammar") != 0) {
        llama_params["grammar"] = json_value(body, "grammar", json::object());
    }

    // Handle 'stop' field: a single string is wrapped into a one-element array
    if (body.contains("stop") && body.at("stop").is_string()) {
        llama_params["stop"] = json::array({body.at("stop").get<std::string>()});
    } else {
        llama_params["stop"] = json_value(body, "stop", json::array());
    }

    // Ensure there is ChatML-specific end sequence among stop words
    llama_params["stop"].push_back("<|im_end|>");

    return llama_params;
 }
 // Build the final OpenAI-compatible chat completion object from a completion result.
 //
 // request   - original request; only its "model" field is read
 // result    - completion result produced by the server
 // streaming - when true the single choice carries an empty "delta" and the
 //             object type is "chat.completion.chunk"; otherwise it carries
 //             the full assistant "message"
 static json format_final_response_oaicompat(const json & request, json result, bool streaming = false) {
    const bool has_stop_word = result.count("stopped_word") != 0;
    const bool hit_eos       = json_value(result, "stopped_eos", false);
    const int  n_predicted   = json_value(result, "tokens_predicted", 0);
    const int  n_prompt      = json_value(result, "tokens_evaluated", 0);
    const std::string text   = json_value(result, "content", std::string(""));

    // anything that did not stop on a word or on EOS is reported as "length"
    const std::string finish_reason = (has_stop_word || hit_eos) ? "stop" : "length";

    json choice = json{{"finish_reason", finish_reason},
                       {"index", 0}};
    if (streaming) {
        choice["delta"] = json::object();
    } else {
        choice["message"] = json{{"content", text},
                                 {"role", "assistant"}};
    }
    json choices = json::array({choice});

    const std::time_t now = std::time(0);

    json usage = json {
        {"completion_tokens", n_predicted},
        {"prompt_tokens",     n_prompt},
        {"total_tokens",      n_predicted + n_prompt}
    };

    json res;
    res["choices"] = choices;
    res["created"] = now;
    res["model"]   = json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
    res["object"]  = streaming ? "chat.completion.chunk" : "chat.completion";
    res["usage"]   = usage;
    res["id"]      = gen_chatcmplid();

    if (server_verbose) {
        res["__verbose"] = result;
    }

    if (result.contains("completion_probabilities")) {
        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
    }

    return res;
 }
 // Convert one partial (streamed) result into OpenAI-compatible chunk objects.
 // A vector is returned because the first chunk with content has to be sent
 // as two updates.
 static std::vector<json> format_partial_response_oaicompat(json result) {
    // results without the OAI bookkeeping fields are passed through untouched
    if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
        return std::vector<json>({result});
    }

    const bool is_first         = json_value(result, "oaicompat_token_ctr", 0) == 0;
    const std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));

    const bool stopped_word   = json_value(result, "stopped_word",  false);
    const bool stopped_eos    = json_value(result, "stopped_eos",   false);
    const bool stopped_limit  = json_value(result, "stopped_limit", false);
    const std::string content = json_value(result, "content",       std::string(""));

    // "length" takes precedence over "stop" when both apply
    std::string finish_reason;
    if (stopped_limit) {
        finish_reason = "length";
    } else if (stopped_word || stopped_eos) {
        finish_reason = "stop";
    }

    const std::time_t t = std::time(0);

    // one entry of the "choices" array
    const auto make_choice = [](const json & finish, const json & delta) -> json {
        return json{{"finish_reason", finish},
                    {"index", 0},
                    {"delta", delta}};
    };

    // a complete chunk object wrapping the given "choices" array
    const auto make_chunk = [&](const json & chunk_choices) -> json {
        return json {
            {"choices", chunk_choices},
            {"created", t},
            {"id",      gen_chatcmplid()},
            {"model",   modelname},
            {"object",  "chat.completion.chunk"}
        };
    };

    json choices;

    if (!finish_reason.empty()) {
        choices = json::array({make_choice(json(finish_reason), json::object())});
    } else if (is_first) {
        if (content.empty()) {
            choices = json::array({make_choice(json(nullptr), json{{"role", "assistant"}})});
        } else {
            // We have to send this as two updates to conform to openai behavior
            const json role_chunk    = make_chunk(json::array({make_choice(json(nullptr), json{{"role", "assistant"}})}));
            const json content_chunk = make_chunk(json::array({make_choice(json(nullptr), json{{"content", content}})}));

            return std::vector<json>({role_chunk, content_chunk});
        }
    } else {
        // Some idiosyncrasy in task processing logic makes several trailing calls
        // with empty content, we ignore these at the callee site.
        if (content.empty()) {
            return std::vector<json>({json::object()});
        }

        choices = json::array({make_choice(json(nullptr), json{{"content", content}})});
    }

    json ret = make_chunk(choices);

    return std::vector<json>({ret});
 }
 // Wrap computed embeddings into an OpenAI-compatible "list" response.
 // The usage counters are always reported as 0.
 static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
    json usage = json {
        {"prompt_tokens", 0},
        {"total_tokens", 0}
    };

    json res;
    res["model"]  = json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
    res["object"] = "list";
    res["usage"]  = usage;
    res["data"]   = embeddings;

    return res;
 }
 // Response body of the tokenizer endpoint: {"tokens": [...]}
 static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
    json response = json::object();
    response["tokens"] = tokens;
    return response;
 }
 // Response body of the detokenizer endpoint: {"content": "..."}
 static json format_detokenized_response(const std::string & content) {
    json response = json::object();
    response["content"] = content;
    return response;
 }
--- a/ggml-quants.c
+++ b/ggml-quants.c
@ -464,8 +464,8 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
 }
 // NOTE: not tested
-inline static int8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
+inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
-    int8x16_t res;
+    uint8x16_t res;
    res[ 0] = a[b[ 0]];
    res[ 1] = a[b[ 1]];
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@ -3769,8 +3769,42 @@ void log_ggml_var_device(const char*name, float *src, size_t total_elements, boo
    std::ofstream logfile;
    logfile.open(filename);
    for(size_t i=0; i<total_elements; i++){
        logfile << local_buf[i] <<" ";
        if((i+1)%20 ==0) logfile <<std::endl;
    }
    logfile <<std::endl;
    logfile.close();
    if(src_on_device) ggml_sycl_host_free(local_buf);
 }
 void log_ggml_var_device_fp16(const char*name, sycl::half *src, size_t total_elements, bool src_on_device){
    if(!g_ggml_sycl_debug) return;
    if(!src){
        printf("GGML Tensor:%s skip to save for NULL pointer\n", name);
        return;
    }
    char filename[1024];
    sprintf(filename, "%s.txt", name);
    printf("GGML Tensor:%s save to %s\n", name, filename);
    size_t total_size = total_elements*sizeof(sycl::half);
    sycl::half *local_buf = NULL;
    if(src_on_device) {
        local_buf = (sycl::half *) ggml_sycl_host_malloc(total_size);
        ggml_sycl_set_device(g_main_device);
        dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
        main_stream->memcpy(local_buf, src, total_size).wait();
    }
    else {
        local_buf = (sycl::half *)src;
    }
    std::ofstream logfile;
    logfile.open(filename);
    // Write every element exactly once, space-separated, and break the line
    // after each group of 20 values. The previous form also re-emitted the
    // element in an `else` branch, so all values except every 20th were
    // written twice; this now matches the loop in log_ggml_var_device.
    for(size_t i=0; i<total_elements; i++){
        logfile << local_buf[i] <<" ";
        if((i+1)%20 ==0) logfile <<std::endl;
    }
    logfile <<std::endl;
    logfile.close();
@ -14126,7 +14160,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
            src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
            dst_f16.get(), dpct::library_data_t::real_half, ldc,
            dpct::library_data_t::real_half)));
-
+        g_sycl_handles[id]->wait();
        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
        to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
    }
@ -14159,6 +14193,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
            dpct::get_value(&alpha, *g_sycl_handles[id]), src0_ddf_i, ne00,
            src1_ddf1_i, ne10, dpct::get_value(&beta, *g_sycl_handles[id]),
            dst_dd_i, ldc)));
        g_sycl_handles[id]->wait();
    }
    (void) dst;
    (void) src1_ddq_i;
@ -15295,8 +15330,8 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
    sycl_pool_alloc<sycl::half> dst_f16;
    char * dst_t;
-    dpct::library_data_t cu_compute_type = dpct::library_data_t::real_half;
+    dpct::library_data_t cu_compute_type = dpct::library_data_t::real_float;
-    dpct::library_data_t cu_data_type = dpct::library_data_t::real_half;
+    dpct::library_data_t cu_data_type = dpct::library_data_t::real_float;
    // dst strides
    size_t nbd2 = dst->nb[2];
@ -15308,15 +15343,13 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
    const float alpha_f32 = 1.0f;
    const float beta_f32  = 0.0f;
-    const void * alpha = &alpha_f16;
+    const void * alpha = &alpha_f32;
-    const void * beta  = &beta_f16;
+    const void * beta  = &beta_f32;
    // TODO: Re-enable (dst->op_params[0] != GGML_PREC_DEFAULT) pathway
-    // once oneMKL open source supports half, half, float, float: datatypes
+    // oneMKL open source supports half, half, float, float: datatypes
    dst_t = (char *) dst_f16.alloc(ne_dst);
-    nbd2 /= sizeof(float) / sizeof(sycl::half);
+    dst_t = (char *) dst_ddf;
    nbd3 /= sizeof(float) / sizeof(sycl::half);
    GGML_ASSERT(ne12 % ne02 == 0);
    GGML_ASSERT(ne13 % ne03 == 0);
@ -15356,6 +15389,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
            nb11 / nb10, nb12 / nb10, beta,
            (char *)dst_t, cu_data_type, ne01, nb2 / nb0,
            ne12 * ne13, cu_compute_type)));
        g_sycl_handles[g_main_device]->wait();
    } else {
        const int ne23 = ne12*ne13;
@ -15386,7 +15420,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
                                         nb02, nb03, nb12_scaled, nb13_scaled,
                                         nbd2, nbd3, r2, r3, item_ct1);
                                 });
-            });
+            }).wait();
        }
        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
            *g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
@ -15397,11 +15431,10 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
            dpct::library_data_t::real_half, nb11 / nb10, beta,
            (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
            cu_compute_type)));
        g_sycl_handles[g_main_device]->wait();
    }
 #endif
    const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
    to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream);
 }
 catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
--- a/ggml.c
+++ b/ggml.c
@ -2158,6 +2158,9 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
    getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
 #else
    // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
 #   if !defined(SYS_getcpu) && defined(SYS_get_cpu)
 #       define SYS_getcpu SYS_get_cpu // some older glibc versions use this name
 #   endif
    getcpu_ret = syscall(SYS_getcpu, &current_cpu, &g_state.numa.current_node);
 #endif
--- a/llama.cpp
+++ b/llama.cpp
@ -14146,18 +14146,22 @@ LLAMA_API int32_t llama_chat_apply_template(
            curr_tmpl = std::string(model_template.data(), model_template.size());
        }
    }
    // format the chat to string
    std::vector<const llama_chat_message *> chat_vec;
    chat_vec.resize(n_msg);
    for (size_t i = 0; i < n_msg; i++) {
        chat_vec[i] = &chat[i];
    }
    std::string formatted_chat;
    int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
    if (res < 0) {
        return res;
    }
    if (buf && length > 0) {
        strncpy(buf, formatted_chat.c_str(), length);
    }
    return res;
 }
--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@ -18,7 +18,7 @@ except ImportError as e:
 KEY_PROPERTIES = [
    "cpu_info", "gpu_info", "n_gpu_layers", "main_gpu", "cuda", "opencl", "metal", "gpu_blas",
    "blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_threads",
-    "type_k", "type_v", "no_kv_offload", "mul_mat_q", "tensor_split", "n_prompt", "n_gen"
+    "type_k", "type_v", "no_kv_offload", "tensor_split", "n_prompt", "n_gen"
 ]
 # Properties that are boolean and are converted to Yes/No for the table: