Merge remote-tracking branch 'origin/master' into sl/backend-sched

2024-01-11 12:16:53 +01:00 · 2024-01-11 12:16:53 +01:00 · c3681af783
commit c3681af783
parent 42aa835c58 64802ec00d
8 changed files with 173 additions and 166 deletions
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@ -126,24 +126,7 @@ static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::str
 }
 static std::string get_ftype(int ftype) {
-    switch (ftype) {
+    return ggml_type_name(static_cast<ggml_type>(ftype));
    case 0:
        return "f32";
    case 1:
        return "f16";
    case 2:
        return "q4_0";
    case 3:
        return "q4_1";
    case 6:
        return "q5_0";
    case 7:
        return "q5_1";
    case 8:
        return "q8_0";
    default:
        throw std::runtime_error(format("%s: Unrecognized file type: %d\n", __func__, ftype));
    }
 }
 //
@ -533,6 +516,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
    buffer_size += n_tensors * 128 /* CLIP PADDING */;
    clip_ctx * new_clip = new clip_ctx;
 #ifdef GGML_USE_CUBLAS
    new_clip->backend = ggml_backend_cuda_init(0);
    printf("%s: CLIP using CUDA backend\n", __func__);
@ -543,6 +527,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
    printf("%s: CLIP using Metal backend\n", __func__);
 #endif
    if (!new_clip->backend) {
        new_clip->backend = ggml_backend_cpu_init();
        printf("%s: CLIP using CPU backend\n", __func__);
@ -931,26 +916,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
    ggml_type type = GGML_TYPE_Q4_1;
-    switch (itype) {
+    assert(itype < GGML_TYPE_COUNT);
-        case 2:
+    type = static_cast<ggml_type>(itype);
            type = GGML_TYPE_Q4_0;
            break;
        case 3:
            type = GGML_TYPE_Q4_1;
            break;
        case 6:
            type = GGML_TYPE_Q5_0;
            break;
        case 7:
            type = GGML_TYPE_Q5_1;
            break;
        case 8:
            type = GGML_TYPE_Q8_0;
            break;
        default:
            fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype);
            return false;
    };
    auto * ctx_clip = clip_model_load(fname_inp, 2);
@ -1010,6 +977,10 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
        if (quantize) {
            new_type = type;
            if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
                new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
                // fprintf(stderr, "%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
            }
            const size_t n_elms = ggml_nelements(cur);
            float * f32_data;
@ -1054,6 +1025,21 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
                case GGML_TYPE_Q8_0: {
                    new_size = ggml_quantize_q8_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
                } break;
                case GGML_TYPE_Q2_K: {
                    new_size = ggml_quantize_q2_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
                } break;
                case GGML_TYPE_Q3_K: {
                    new_size = ggml_quantize_q3_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
                } break;
                case GGML_TYPE_Q4_K: {
                    new_size = ggml_quantize_q4_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
                } break;
                case GGML_TYPE_Q5_K: {
                    new_size = ggml_quantize_q5_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
                } break;
                case GGML_TYPE_Q6_K: {
                    new_size = ggml_quantize_q6_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
                } break;
                default: {
                    fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type);
                    return false;
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -110,6 +110,10 @@ node index.js
 ```
 ## API Endpoints
 - **GET** `/health`: Returns the current state of the server:
    - `{"status": "loading model"}` (HTTP 503) if the model is still being loaded.
    - `{"status": "error", "error": "Model failed to load"}` (HTTP 500) if the model failed to load.
    - `{"status": "ok"}` (HTTP 200) if the model is successfully loaded and the server is ready for further requests mentioned below.
 -   **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -26,6 +26,7 @@
 #include <mutex>
 #include <chrono>
 #include <condition_variable>
 #include <atomic>
 #ifndef SERVER_VERBOSE
 #define SERVER_VERBOSE 1
@ -146,9 +147,15 @@ static std::vector<uint8_t> base64_decode(const std::string & encoded_string)
 // parallel
 //
 // Lifecycle state of the server. main() keeps the current value in a
 // std::atomic<server_state>, and the /health handler switches on it to choose
 // the JSON body and HTTP status it returns.
 enum server_state {
    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
    SERVER_STATE_READY,          // Server is ready and model is loaded
    SERVER_STATE_ERROR           // An error occurred: load_model failed or listen_after_bind returned false
 };
 enum task_type {
-    COMPLETION_TASK,
+    TASK_TYPE_COMPLETION,
-    CANCEL_TASK
+    TASK_TYPE_CANCEL,
 };
 struct task_server {
@ -1395,7 +1402,7 @@ struct llama_server_context
        task.data = std::move(data);
        task.infill_mode = infill;
        task.embedding_mode = embedding;
-        task.type = COMPLETION_TASK;
+        task.type = TASK_TYPE_COMPLETION;
        task.multitask_id = multitask_id;
        // when a completion task's prompt array is not a singleton, we split it into multiple requests
@ -1517,7 +1524,7 @@ struct llama_server_context
        std::unique_lock<std::mutex> lock(mutex_tasks);
        task_server task;
        task.id = id_gen++;
-        task.type = CANCEL_TASK;
+        task.type = TASK_TYPE_CANCEL;
        task.target_id = task_id;
        queue_tasks.push_back(task);
        condition_tasks.notify_one();
@ -1553,7 +1560,7 @@ struct llama_server_context
            queue_tasks.erase(queue_tasks.begin());
            switch (task.type)
            {
-                case COMPLETION_TASK: {
+                case TASK_TYPE_COMPLETION: {
                    llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
                    if (slot == nullptr)
                    {
@ -1582,7 +1589,7 @@ struct llama_server_context
                        break;
                    }
                } break;
-                case CANCEL_TASK: { // release slot linked with the task id
+                case TASK_TYPE_CANCEL: { // release slot linked with the task id
                    for (auto & slot : slots)
                    {
                        if (slot.task_id == task.target_id)
@ -2453,7 +2460,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
    }
 }
 static std::string random_string()
 {
    static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
@ -2790,15 +2796,117 @@ int main(int argc, char **argv)
                                {"system_info", llama_print_system_info()},
                            });
-    // load the model
+    httplib::Server svr;
-    if (!llama.load_model(params))
+
    std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
    svr.set_default_headers({{"Server", "llama.cpp"},
                             {"Access-Control-Allow-Origin", "*"},
                             {"Access-Control-Allow-Headers", "content-type"}});
    svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) {
        server_state current_state = state.load();
        switch(current_state) {
            case SERVER_STATE_READY:
                res.set_content(R"({"status": "ok"})", "application/json");
                res.status = 200; // HTTP OK
                break;
            case SERVER_STATE_LOADING_MODEL:
                res.set_content(R"({"status": "loading model"})", "application/json");
                res.status = 503; // HTTP Service Unavailable
                break;
            case SERVER_STATE_ERROR:
                res.set_content(R"({"status": "error", "error": "Model failed to load"})", "application/json");
                res.status = 500; // HTTP Internal Server Error
                break;
        }
    });
    svr.set_logger(log_server_request);
    svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
            {
                const char fmt[] = "500 Internal Server Error\n%s";
                char buf[BUFSIZ];
                try
                {
                    std::rethrow_exception(std::move(ep));
                }
                catch (std::exception &e)
                {
                    snprintf(buf, sizeof(buf), fmt, e.what());
                }
                catch (...)
                {
                    snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
                }
                res.set_content(buf, "text/plain; charset=utf-8");
                res.status = 500;
            });
    svr.set_error_handler([](const httplib::Request &, httplib::Response &res)
            {
                if (res.status == 401)
                {
                    res.set_content("Unauthorized", "text/plain; charset=utf-8");
                }
                if (res.status == 400)
                {
                    res.set_content("Invalid request", "text/plain; charset=utf-8");
                }
                else if (res.status == 404)
                {
                    res.set_content("File Not Found", "text/plain; charset=utf-8");
                    res.status = 404;
                }
            });
    // set timeouts and change hostname and port
    svr.set_read_timeout (sparams.read_timeout);
    svr.set_write_timeout(sparams.write_timeout);
    if (!svr.bind_to_port(sparams.hostname, sparams.port))
    {
        fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port);
        return 1;
    }
-    llama.initialize();
+    // Set the base directory for serving static files
    svr.set_base_dir(sparams.public_path);
-    httplib::Server svr;
+    // to make it ctrl+clickable:
    LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
    std::unordered_map<std::string, std::string> log_data;
    log_data["hostname"] = sparams.hostname;
    log_data["port"] = std::to_string(sparams.port);
    if (!sparams.api_key.empty()) {
        // Log only the last four characters of the API key behind a fixed mask.
        // For keys shorter than four characters, length() - 4 wraps around
        // (size_t is unsigned) and substr() would throw std::out_of_range, so in
        // that case log the mask alone rather than revealing the whole key.
        const size_t key_len = sparams.api_key.length();
        const std::string key_tail = key_len >= 4 ? sparams.api_key.substr(key_len - 4) : std::string();
        log_data["api_key"] = "api_key: ****" + key_tail;
    }
    LOG_INFO("HTTP server listening", log_data);
    // run the HTTP server in a thread - see comment below
    std::thread t([&]()
            {
                if (!svr.listen_after_bind())
                {
                    state.store(SERVER_STATE_ERROR);
                    return 1;
                }
                return 0;
            });
    // load the model
    if (!llama.load_model(params))
    {
        state.store(SERVER_STATE_ERROR);
        return 1;
    } else {
        llama.initialize();
        state.store(SERVER_STATE_READY);
    }
    // Middleware for API key validation
    auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
@ -2826,10 +2934,6 @@ int main(int argc, char **argv)
        return false;
    };
    svr.set_default_headers({{"Server", "llama.cpp"},
                             {"Access-Control-Allow-Origin", "*"},
                             {"Access-Control-Allow-Headers", "content-type"}});
    // this is only called if no index.html is found in the public --path
    svr.Get("/", [](const httplib::Request &, httplib::Response &res)
            {
@ -2937,8 +3041,6 @@ int main(int argc, char **argv)
                }
            });
    svr.Get("/v1/models", [&params](const httplib::Request&, httplib::Response& res)
            {
                std::time_t t = std::time(0);
@ -3157,81 +3259,6 @@ int main(int argc, char **argv)
                return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
            });
    svr.set_logger(log_server_request);
    svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
            {
                const char fmt[] = "500 Internal Server Error\n%s";
                char buf[BUFSIZ];
                try
                {
                    std::rethrow_exception(std::move(ep));
                }
                catch (std::exception &e)
                {
                    snprintf(buf, sizeof(buf), fmt, e.what());
                }
                catch (...)
                {
                    snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
                }
                res.set_content(buf, "text/plain; charset=utf-8");
                res.status = 500;
            });
    svr.set_error_handler([](const httplib::Request &, httplib::Response &res)
            {
                if (res.status == 401)
                {
                    res.set_content("Unauthorized", "text/plain; charset=utf-8");
                }
                if (res.status == 400)
                {
                    res.set_content("Invalid request", "text/plain; charset=utf-8");
                }
                else if (res.status == 404)
                {
                    res.set_content("File Not Found", "text/plain; charset=utf-8");
                    res.status = 404;
                }
            });
    // set timeouts and change hostname and port
    svr.set_read_timeout (sparams.read_timeout);
    svr.set_write_timeout(sparams.write_timeout);
    if (!svr.bind_to_port(sparams.hostname, sparams.port))
    {
        fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port);
        return 1;
    }
    // Set the base directory for serving static files
    svr.set_base_dir(sparams.public_path);
    // to make it ctrl+clickable:
    LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
    std::unordered_map<std::string, std::string> log_data;
    log_data["hostname"] = sparams.hostname;
    log_data["port"] = std::to_string(sparams.port);
    if (!sparams.api_key.empty()) {
        log_data["api_key"] = "api_key: ****" + sparams.api_key.substr(sparams.api_key.length() - 4);
    }
    LOG_INFO("HTTP server listening", log_data);
    // run the HTTP server in a thread - see comment below
    std::thread t([&]()
            {
                if (!svr.listen_after_bind())
                {
                    return 1;
                }
                return 0;
            });
    // GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!?
    //     "Bus error: 10" - this is on macOS, it does not crash on Linux
    //std::thread t2([&]()
--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -1067,6 +1067,8 @@ bool ggml_metal_graph_compute(
                    GGML_ASSERT(!"unsupported op");
                }
                [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]];
                const int64_t  ne00 = src0 ? src0->ne[0] : 0;
                const int64_t  ne01 = src0 ? src0->ne[1] : 0;
                const int64_t  ne02 = src0 ? src0->ne[2] : 0;
@ -2423,6 +2425,8 @@ bool ggml_metal_graph_compute(
                            GGML_ASSERT(false);
                        }
                }
                [encoder popDebugGroup];
            }
            if (encoder != nil) {
--- a/ggml.c
+++ b/ggml.c
@ -132,7 +132,7 @@ void ggml_print_backtrace(void) {
            "-ex", "bt -frame-info source-and-location",
            "-ex", "detach",
            "-ex", "quit",
-            NULL);
+            (char *) NULL);
    } else {
        waitpid(pid, NULL, 0);
    }
@ -4315,13 +4315,13 @@ struct ggml_tensor * ggml_set_2d_inplace(
 static struct ggml_tensor * ggml_cpy_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
+        struct ggml_tensor  * b) {
        bool inplace) {
    GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
    bool is_node = false;
-    if (!inplace && (a->grad || b->grad)) {
+    if (a->grad || b->grad) {
        // inplace is false and either one have a grad
        is_node = true;
    }
@ -4345,14 +4345,7 @@ struct ggml_tensor * ggml_cpy(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b) {
-    return ggml_cpy_impl(ctx, a, b, false);
+    return ggml_cpy_impl(ctx, a, b);
 }
 struct ggml_tensor * ggml_cpy_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b) {
    return ggml_cpy_impl(ctx, a, b, true);
 }
 struct ggml_tensor * ggml_cast(
@ -4376,15 +4369,14 @@ struct ggml_tensor * ggml_cast(
 static struct ggml_tensor * ggml_cont_impl(
        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
+        struct ggml_tensor  * a) {
        bool inplace) {
    bool is_node = false;
-    if (!inplace && a->grad) {
+    if (a->grad) {
        is_node = true;
    }
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
    ggml_format_name(result, "%s (cont)", a->name);
    result->op   = GGML_OP_CONT;
@ -4397,13 +4389,7 @@ static struct ggml_tensor * ggml_cont_impl(
 struct ggml_tensor * ggml_cont(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
-    return ggml_cont_impl(ctx, a, false);
+    return ggml_cont_impl(ctx, a);
 }
 struct ggml_tensor * ggml_cont_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
    return ggml_cont_impl(ctx, a, true);
 }
 // make contiguous, with new shape
--- a/ggml.h
+++ b/ggml.h
@ -218,7 +218,9 @@
 #define GGML_MAX_PARAMS         2048
 #define GGML_MAX_CONTEXTS       64
 #define GGML_MAX_SRC            10
 #ifndef GGML_MAX_NAME
 #define GGML_MAX_NAME           64
 #endif
 #define GGML_MAX_OP_PARAMS      64
 #define GGML_DEFAULT_N_THREADS  4
 #define GGML_DEFAULT_GRAPH_SIZE 2048
@ -1161,12 +1163,6 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
    // a -> b, in-place, return view(b)
    GGML_API struct ggml_tensor * ggml_cpy_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
    GGML_API struct ggml_tensor * ggml_cast(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -1177,11 +1173,6 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    // make contiguous, in-place
    GGML_API struct ggml_tensor * ggml_cont_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    // make contiguous, with new shape
    GGML_API struct ggml_tensor * ggml_cont_1d(
            struct ggml_context * ctx,
--- a/llama.cpp
+++ b/llama.cpp
@ -2835,6 +2835,7 @@ static void llm_load_hparams(
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_1B; break;
                    case 32: model.type = e_model::MODEL_3B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
@ -3151,7 +3152,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
    LLAMA_LOG_INFO("%s: rope_finetuned   = %s\n",     __func__, hparams.rope_finetuned ? "yes" : "unknown");
    LLAMA_LOG_INFO("%s: model type       = %s\n",     __func__, llama_model_type_name(model.type));
    LLAMA_LOG_INFO("%s: model ftype      = %s\n",     __func__, llama_model_ftype_name(model.ftype).c_str());
-    LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, ml.n_elements*1e-9);
+    if (ml.n_elements >= 1e12) {
        LLAMA_LOG_INFO("%s: model params     = %.2f T\n", __func__, ml.n_elements*1e-12);
    } else if (ml.n_elements >= 1e9) {
        LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, ml.n_elements*1e-9);
    } else if (ml.n_elements >= 1e6) {
        LLAMA_LOG_INFO("%s: model params     = %.2f M\n", __func__, ml.n_elements*1e-6);
    } else {
        LLAMA_LOG_INFO("%s: model params     = %.2f K\n", __func__, ml.n_elements*1e-3);
    }
    if (ml.n_bytes < GiB) {
        LLAMA_LOG_INFO("%s: model size       = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0,        ml.n_bytes*8.0/ml.n_elements);
    } else {
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@ -1 +1 @@
-f96711108d55bdbbd277e6be07204dce6a94fb93
+979cc23b345006504cfc1f67c0fdf627805e3319
`@ -1 +1 @@`
	`f96711108d55bdbbd277e6be07204dce6a94fb93`	`979cc23b345006504cfc1f67c0fdf627805e3319`