Merge branch 'ggerganov:master' into master

This commit is contained in:
MaggotHATE 2024-10-08 18:36:38 +05:00 committed by GitHub
commit 8110f783d1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 204 additions and 193 deletions

View file

@@ -1859,9 +1859,23 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
params.endpoint_metrics = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
+add_opt(llama_arg(
+{"--slots"},
+format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+[](gpt_params & params) {
+params.endpoint_slots = true;
+}
+).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
+add_opt(llama_arg(
+{"--props"},
+format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
+[](gpt_params & params) {
+params.endpoint_props = true;
+}
+).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
add_opt(llama_arg(
{"--no-slots"},
-format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+"disables slots monitoring endpoint",
[](gpt_params & params) {
params.endpoint_slots = false;
}

View file

@@ -294,7 +294,10 @@ struct gpt_params {
std::string ssl_file_key = ""; // NOLINT
std::string ssl_file_cert = ""; // NOLINT
-bool endpoint_slots = true;
+// "advanced" endpoints are disabled by default for better security
+bool webui = true;
+bool endpoint_slots = false;
+bool endpoint_props = false; // only control POST requests, not GET
bool endpoint_metrics = false;
bool log_json = false;

View file

@@ -314,9 +314,9 @@ struct lora_merge_ctx {
// optionally dequantize it
printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
auto nels = ggml_nelements(inp_base);
-ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
+const auto * qtype = ggml_get_type_traits(base->type);
std::vector<uint8_t> dequant_buf(nels * sizeof(float));
-qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
+qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
} else {
ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
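This hunk shows the pattern repeated throughout the commit: `ggml_internal_get_type_traits`, which returned a `ggml_type_traits_t` by value, becomes `ggml_get_type_traits`, which returns a `const ggml_type_traits *`, so member access switches from `.` to `->`. A minimal sketch of the new call pattern (the helper and buffer names are hypothetical; any quantized type whose traits provide a `to_float` handler works):

```cpp
#include "ggml.h"

// Hypothetical helper: dequantize nels quantized values to F32 via the new
// pointer-returning traits API.
static bool dequantize_to_f32(enum ggml_type type, const void * src, float * dst, int64_t nels) {
    const struct ggml_type_traits * qtype = ggml_get_type_traits(type); // was: ggml_type_traits_t by value
    if (qtype->to_float == NULL) {
        return false; // this type has no dequantization routine
    }
    qtype->to_float(src, dst, nels); // '.' becomes '->' at every call site
    return true;
}
```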

View file

@@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
}
static void test_roundtrip_on_chunk(
-const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
+const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, bool use_reference,
float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
) {
if (layer->type == GGML_TYPE_F16) {
@@ -166,7 +166,7 @@ static void test_roundtrip_on_chunk(
// Run quantization function for a single layer and update error stats
static void test_roundtrip_on_layer(
-std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
+std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, bool use_reference,
const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
) {
@@ -371,8 +371,8 @@ int main(int argc, char ** argv) {
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
continue;
}
-ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
-if (qfns.from_float && qfns.to_float) {
+const auto * qfns = ggml_get_type_traits(type);
+if (qfns->from_float && qfns->to_float) {
if (params.verbose) {
printf("testing %s ...\n", ggml_type_name(type));
}
@@ -393,7 +393,7 @@ int main(int argc, char ** argv) {
test_roundtrip_on_layer(
layer_name,
params.per_layer_stats,
-qfns,
+*qfns,
params.reference,
kv_tensor.second,
input_scratch,

View file

@@ -18,6 +18,8 @@ The project is under active development, and we are [looking for feedback and co
## Usage
+<!-- Note for contributors: The list below is generated by llama-gen-docs -->
**Common params**
| Argument | Explanation |
@@ -149,7 +151,9 @@ The project is under active development, and we are [looking for feedback and co
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
-| `--no-slots` | disables slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
+| `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
+| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
+| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted:<br/>https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
@@ -380,8 +384,6 @@ node index.js
`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`
-`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.
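The two options above compose in a single request body. A hedged client sketch using cpp-httplib and nlohmann::json (both already vendored by the server example; host, port, and the prompt are illustrative assumptions):

```cpp
#include "httplib.h"   // cpp-httplib
#include "json.hpp"    // nlohmann::json

using json = nlohmann::json;

int main() {
    httplib::Client cli("http://localhost:8080");
    json body = {
        { "prompt",       "Building a website can be done in 10 simple steps:" },
        { "n_predict",    64 },
        { "cache_prompt", true },                                 // reuse the common KV-cache prefix
        { "samplers",     { "top_k", "min_p", "temperature" } }, // explicit sampler order
    };
    auto res = cli.Post("/completion", body.dump(), "application/json");
    if (res && res->status == 200) {
        printf("%s\n", json::parse(res->body)["content"].get<std::string>().c_str());
    }
}
```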
**Response format**
@@ -519,34 +521,41 @@ Requires a reranker model (such as [bge-reranker-v2-m3](https://huggingface.co/B
Takes a prefix and a suffix and returns the predicted completion as stream.
*Options:*
-`input_prefix`: Set the prefix of the code to infill.
-`input_suffix`: Set the suffix of the code to infill.
+- `input_prefix`: Set the prefix of the code to infill.
+- `input_suffix`: Set the suffix of the code to infill.
It also accepts all the options of `/completion` except `stream` and `prompt`.
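A hedged sketch of exercising the infill endpoint described above (cpp-httplib and nlohmann::json, as vendored by the server example; host, port, and the code snippet are assumptions):

```cpp
#include "httplib.h"
#include "json.hpp"

using json = nlohmann::json;

int main() {
    httplib::Client cli("http://localhost:8080");
    json body = {
        { "input_prefix", "def helloworld():\n    print(\"hell" },  // code before the cursor
        { "input_suffix", "\n\nhelloworld()\n" },                   // code after the cursor
        { "n_predict",    32 },
    };
    // stream and prompt are not accepted here; all other /completion options are
    auto res = cli.Post("/infill", body.dump(), "application/json");
    if (res) printf("%s\n", res->body.c_str());
}
```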
### **GET** `/props`: Get server global properties.
-- **GET** `/props`: Return current server settings.
+This endpoint is public (no API key check). By default, it is read-only. To make POST request to change global properties, you need to start server with `--props`
**Response format**
```json
{
-"assistant_name": "",
-"user_name": "",
"system_prompt": "",
"default_generation_settings": { ... },
"total_slots": 1,
"chat_template": ""
}
```
-- `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots.
-- `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
- `system_prompt` - the system prompt (initial prompt of all slots). Please note that this does not take into account the chat template. It will append the prompt at the beginning of formatted prompt.
- `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
- `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
- `chat_template` - the model's original Jinja2 prompt template
+### POST `/props`: Change server global properties.
+To use this endpoint with POST method, you need to start server with `--props`
+*Options:*
+- `system_prompt`: Change the system prompt (initial prompt of all slots). Please note that this does not take into account the chat template. It will append the prompt at the beginning of formatted prompt.
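A hedged client sketch for the two `/props` methods documented above (cpp-httplib and nlohmann::json; localhost:8080 and the prompt text are assumptions, and the POST only succeeds when the server was started with `--props`):

```cpp
#include "httplib.h"
#include "json.hpp"

using json = nlohmann::json;

int main() {
    httplib::Client cli("http://localhost:8080");

    // GET /props is public and read-only
    if (auto res = cli.Get("/props")) {
        json props = json::parse(res->body);
        printf("total_slots: %d\n", props["total_slots"].get<int>());
    }

    // POST /props requires the server to run with --props
    json body = {{ "system_prompt", "You are a concise assistant." }};
    auto res = cli.Post("/props", body.dump(), "application/json");
    if (res && res->status != 200) {
        printf("POST /props rejected; was the server started with --props?\n");
    }
}
```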
### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API
Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
@@ -813,28 +822,6 @@ To know the `id` of the adapter, use GET `/lora-adapters`
## More examples
-### Change system prompt on runtime
-To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt`. This only needs to be used once.
-`prompt`: Specify a context that you want all connecting clients to respect.
-`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint.
-`assistant_name`: The bot's name is necessary for each customer to generate the prompt. This must be sent to each client through the `/props` endpoint.
-```json
-{
-"system_prompt": {
-"prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:",
-"anti_prompt": "User:",
-"assistant_name": "Assistant:"
-}
-}
-```
-**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`.
### Interactive mode
Check the sample in [chat.mjs](chat.mjs).

View file

@@ -1106,12 +1106,7 @@ struct server_context {
SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str());
system_prompt = sys_prompt;
-// release all slots
-for (server_slot & slot : slots) {
-slot.release();
-}
// update system_tokens and KV cache as soon as all slots are idle
system_need_update = true;
return true;
}
@@ -1627,16 +1622,6 @@ struct server_context {
break;
}
-if (task.data.contains("system_prompt")) {
-std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
-system_prompt_set(sys_prompt);
-for (server_slot & slot : slots) {
-slot.n_past = 0;
-slot.n_past_se = 0;
-}
-}
slot->reset();
slot->id_task = task.id;
@@ -1862,10 +1847,6 @@ struct server_context {
}
void update_slots() {
-if (system_need_update) {
-system_prompt_update();
-}
// check if all slots are idle
{
bool all_idle = true;
@@ -1878,6 +1859,10 @@ struct server_context {
}
if (all_idle) {
+if (system_need_update) {
+system_prompt_update();
+}
SRV_INF("%s", "all slots are idle\n");
if (system_prompt.empty() && clean_kv_cache) {
kv_cache_clear();
@@ -2536,20 +2521,10 @@ int main(int argc, char ** argv) {
//
auto middleware_validate_api_key = [&params, &res_error](const httplib::Request & req, httplib::Response & res) {
-// TODO: should we apply API key to all endpoints, including "/health" and "/models"?
-static const std::unordered_set<std::string> protected_endpoints = {
-"/props",
-"/completion",
-"/completions",
-"/v1/completions",
-"/chat/completions",
-"/v1/chat/completions",
-"/infill",
-"/tokenize",
-"/detokenize",
-"/embedding",
-"/embeddings",
-"/v1/embeddings",
+static const std::unordered_set<std::string> public_endpoints = {
+"/health",
+"/models",
+"/v1/models",
};
// If API key is not set, skip validation
@@ -2557,8 +2532,8 @@ int main(int argc, char ** argv) {
return true;
}
-// If path is not in protected_endpoints list, skip validation
-if (protected_endpoints.find(req.path) == protected_endpoints.end()) {
+// If path is public, skip validation
+if (public_endpoints.find(req.path) != public_endpoints.end()) {
return true;
}
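With this change, a configured API key protects every route except the short public whitelist, instead of guarding a hand-maintained protected list. A hedged client sketch (cpp-httplib; the key, host, and port are placeholders):

```cpp
#include "httplib.h"

int main() {
    httplib::Client cli("http://localhost:8080");
    httplib::Headers auth = {{ "Authorization", "Bearer THIS_IS_THE_KEY" }};

    // public endpoints pass with no key at all
    auto health = cli.Get("/health");

    // everything else now requires the bearer token when --api-key is set
    auto slots      = cli.Get("/slots", auth); // authorized
    auto slots_anon = cli.Get("/slots");       // rejected when a key is configured
    printf("with key: %d, without: %d\n",
           slots ? slots->status : -1, slots_anon ? slots_anon->status : -1);
}
```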
@@ -2620,7 +2595,7 @@ int main(int argc, char ** argv) {
const auto handle_slots = [&](const httplib::Request & req, httplib::Response & res) {
if (!params.endpoint_slots) {
-res_error(res, format_error_response("This server does not support slots endpoint. Start it without `--no-slots`", ERROR_TYPE_NOT_SUPPORTED));
+res_error(res, format_error_response("This server does not support slots endpoint. Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
@@ -2869,24 +2844,31 @@ int main(int argc, char ** argv) {
};
const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
-std::string template_key = "tokenizer.chat_template", curr_tmpl;
-int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0);
-if (tlen > 0) {
-std::vector<char> curr_tmpl_buf(tlen + 1, 0);
-if (llama_model_meta_val_str(ctx_server.model, template_key.c_str(), curr_tmpl_buf.data(), curr_tmpl_buf.size()) == tlen) {
-curr_tmpl = std::string(curr_tmpl_buf.data(), tlen);
-}
-}
json data = {
-{ "system_prompt", ctx_server.system_prompt.c_str() },
+{ "system_prompt", ctx_server.system_prompt },
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params.n_parallel },
-{ "chat_template", curr_tmpl.c_str() },
+{ "chat_template", llama_get_chat_template(ctx_server.model) },
};
res_ok(res, data);
};
+const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
+if (!ctx_server.params.endpoint_props) {
+res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED));
+return;
+}
+json data = json::parse(req.body);
+if (data.contains("system_prompt")) {
+std::string system_prompt = data.at("system_prompt");
+ctx_server.system_prompt_set(system_prompt);
+}
+res_ok(res, {{ "success", true }});
+};
const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_cmpl_type cmpl_type, json & data, httplib::Response & res) {
if (ctx_server.params.embedding || ctx_server.params.reranking) {
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
@@ -3265,6 +3247,12 @@ int main(int argc, char ** argv) {
svr->set_base_dir(params.public_path);
}
+if (!params.api_keys.empty()) {
+// for now, if API key is set, web UI is unusable
+svr->Get("/", [&](const httplib::Request &, httplib::Response & res) {
+return res.set_content("Web UI is disabled because API key is set.", "text/html; charset=utf-8");
+});
+} else {
// using embedded static files
svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
@@ -3283,12 +3271,15 @@ int main(int argc, char ** argv) {
svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
+}
// register API routes
-svr->Get ("/health", handle_health);
+svr->Get ("/health", handle_health); // public endpoint (no API key check)
svr->Get ("/metrics", handle_metrics);
svr->Get ("/props", handle_props);
-svr->Get ("/v1/models", handle_models);
+svr->Post("/props", handle_props_change);
+svr->Get ("/models", handle_models); // public endpoint (no API key check)
+svr->Get ("/v1/models", handle_models); // public endpoint (no API key check)
svr->Post("/completion", handle_completions); // legacy
svr->Post("/completions", handle_completions);
svr->Post("/v1/completions", handle_completions);

View file

@@ -5,7 +5,7 @@ Feature: Security
Background: Server startup with an api key defined
Given a server listening on localhost:8080
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-And a server api key llama.cpp
+And a server api key THIS_IS_THE_KEY
Then the server is starting
Then the server is healthy
@@ -17,8 +17,8 @@ Feature: Security
Examples: Prompts
| api_key | api_error |
-| llama.cpp | no |
-| llama.cpp | no |
+| THIS_IS_THE_KEY | no |
+| THIS_IS_THE_KEY | no |
| hackeme | raised |
| | raised |
@@ -33,8 +33,8 @@ Feature: Security
Examples: Prompts
| api_key | api_error |
-| llama.cpp | no |
-| llama.cpp | no |
+| THIS_IS_THE_KEY | no |
+| THIS_IS_THE_KEY | no |
| hackme | raised |
Scenario Outline: OAI Compatibility (invalid response formats)
@@ -55,7 +55,7 @@ Feature: Security
Scenario Outline: CORS Options
-Given a user api key llama.cpp
+Given a user api key THIS_IS_THE_KEY
When an OPTIONS request is sent from <origin>
Then CORS header <cors_header> is set to <cors_header_value>

View file

@@ -1299,7 +1299,8 @@ async def wait_for_slots_status(context,
async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
while True:
-async with await session.get(f'{base_url}/slots', params=params) as slots_response:
+headers = {'Authorization': f'Bearer {context.server_api_key}'}
+async with await session.get(f'{base_url}/slots', params=params, headers=headers) as slots_response:
status_code = slots_response.status
slots = await slots_response.json()
if context.debug:
@@ -1387,6 +1388,7 @@ def start_server_background(context):
context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
server_listen_addr = context.server_fqdn
server_args = [
+'--slots', # requires to get slot status via /slots endpoint
'--host', server_listen_addr,
'--port', context.server_port,
]

View file

@@ -90,6 +90,19 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
return formatted_chat;
}
+static std::string llama_get_chat_template(const struct llama_model * model) {
+std::string template_key = "tokenizer.chat_template";
+// call with NULL buffer to get the total size of the string
+int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0);
+if (res < 0) {
+return "";
+} else {
+std::vector<char> model_template(res, 0);
+llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+return std::string(model_template.data(), model_template.size());
+}
+}
//
// base64 utils (TODO: move to common in the future)
//

View file

@@ -2535,7 +2535,7 @@ extern "C" {
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
-typedef struct {
+struct ggml_type_traits {
const char * type_name;
int64_t blck_size;
int64_t blck_size_interleave; // interleave elements in blocks
@@ -2551,9 +2551,9 @@ extern "C" {
int64_t ncols; // number of columns to process simultaneously
ggml_gemv_t gemv;
ggml_gemm_t gemm;
-} ggml_type_traits_t;
+};
-GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
+GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
#ifdef __cplusplus
}

View file

@@ -1177,7 +1177,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
op->type != GGML_TYPE_IQ1_S &&
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
case GGML_OP_MUL_MAT:
-return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
case GGML_OP_ROPE_BACK:
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
case GGML_OP_IM2COL_BACK:

View file

@@ -65,8 +65,8 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
// convert src0 to float
if (type != GGML_TYPE_F32) {
-ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
-ggml_to_float_t const to_float = type_traits.to_float;
+const auto * type_traits = ggml_get_type_traits(type);
+ggml_to_float_t const to_float = type_traits->to_float;
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -420,19 +420,21 @@ static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const s
// TODO: find the optimal value
const int64_t min_batch = 32;
-return (ggml_is_contiguous(src0) &&
+return ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
src1->type == GGML_TYPE_F32 &&
-(ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch));
+(ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch) &&
+(src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
}
case GGML_OP_OUT_PROD:
-return (op->src[0]->type == GGML_TYPE_F32 &&
+return op->src[0]->type == GGML_TYPE_F32 &&
op->src[1]->type == GGML_TYPE_F32 &&
ggml_is_matrix(src0) &&
ggml_is_matrix(src1) &&
ggml_is_contiguous(src0) &&
-(ggml_is_contiguous(src1) || ggml_is_transposed(src1)));
+(ggml_is_contiguous(src1) || ggml_is_transposed(src1)) &&
+(src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
default:
return false;

View file

@@ -5287,9 +5287,9 @@ static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, gg
return;
}
-ggml_type_traits_t tt = ggml_internal_get_type_traits(quant);
+const auto * tt = ggml_get_type_traits(quant);
-ggml_to_float_t dequant_fn = tt.to_float;
+ggml_to_float_t dequant_fn = tt->to_float;
dequant_fn(from, to, ne);
}

View file

@@ -729,7 +729,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float *
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
-static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
+static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
[GGML_TYPE_I8] = {
.type_name = "i8",
.blck_size = 1,
@@ -1151,9 +1151,9 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
};
// For internal test use
-ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
+const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
GGML_ASSERT(type < GGML_TYPE_COUNT);
-return type_traits[type];
+return &type_traits[type];
}
//

View file

@@ -136,7 +136,7 @@ int main(int argc, char** argv) {
auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1;
-auto funcs = ggml_internal_get_type_traits(ggml_type);
+const auto * funcs = ggml_get_type_traits(ggml_type);
Stat simple, ggml;
@@ -156,8 +156,8 @@ int main(int argc, char** argv) {
t1 = std::chrono::high_resolution_clock::now();
float fs;
-if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
-else funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
+if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
+else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
t2 = std::chrono::high_resolution_clock::now();
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
if (iloop > 3) ggml.addResult(fs, t);

View file

@@ -236,7 +236,7 @@ int main(int argc, char** argv) {
int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
-auto funcs = useQ4_1 ? ggml_internal_get_type_traits(GGML_TYPE_Q4_1) : ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
+const auto * funcs = useQ4_1 ? ggml_get_type_traits(GGML_TYPE_Q4_1) : ggml_get_type_traits(GGML_TYPE_Q4_0);
std::vector<block_q4_0> q40;
std::vector<block_q4_1> q41;
@@ -261,9 +261,9 @@ int main(int argc, char** argv) {
// Note, we do not include this in the timing as in practical application
// we already have the quantized model weights.
if (useQ4_1) {
-funcs.from_float(x1.data(), q41.data(), kVecSize);
+funcs->from_float(x1.data(), q41.data(), kVecSize);
} else {
-funcs.from_float(x1.data(), q40.data(), kVecSize);
+funcs->from_float(x1.data(), q40.data(), kVecSize);
}
// Now measure time the dot product needs using the "scalar" version above
@@ -282,10 +282,10 @@ int main(int argc, char** argv) {
dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
}
else {
-auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type);
-vdot.from_float(y1.data(), q8.data(), kVecSize);
-if (useQ4_1) funcs.vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
-else funcs.vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
+const auto * vdot = ggml_get_type_traits(funcs->vec_dot_type);
+vdot->from_float(y1.data(), q8.data(), kVecSize);
+if (useQ4_1) funcs->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
+else funcs->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
}
sumq += result;
t2 = std::chrono::high_resolution_clock::now();

View file

@@ -110,7 +110,7 @@ rm -rf "$build_dir" && mkdir "$build_dir" || abort "Failed to make $build_dir"
###########################################################
# Note: test-eval-callback requires -DLLAMA_CURL
-cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DGGML_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build enviroment"
+cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DGGML_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build environment"
pushd "$build_dir"
make -j || abort "Failed to compile"
popd > /dev/null || exit 1
@@ -127,7 +127,7 @@ printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n"
pushd "$build_dir"
tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1'))
if [ ${#tests[@]} -eq 0 ]; then
-abort "No tests avaliable... check your compliation process..."
+abort "No tests available... check your compilation process..."
fi
popd > /dev/null || exit 1
@@ -137,7 +137,7 @@ popd > /dev/null || exit 1
# Select test number
if [ -z $test_number ]; then
-# List out avaliable tests
+# List out available tests
printf "Which test would you like to debug?\n"
id=0
for s in "${tests[@]}"

View file

@@ -17872,10 +17872,9 @@ static void llama_tensor_dequantize_internal(
}
float * f32_output = (float *) output.data();
-ggml_type_traits_t qtype;
+const ggml_type_traits * qtype = ggml_get_type_traits(tensor->type);
if (ggml_is_quantized(tensor->type)) {
-qtype = ggml_internal_get_type_traits(tensor->type);
-if (qtype.to_float == NULL) {
+if (qtype->to_float == NULL) {
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
}
} else if (tensor->type != GGML_TYPE_F16 &&
@@ -17889,7 +17888,7 @@ static void llama_tensor_dequantize_internal(
} else if (tensor->type == GGML_TYPE_BF16) {
ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
} else if (ggml_is_quantized(tensor->type)) {
-qtype.to_float(tensor->data, f32_output, nelements);
+qtype->to_float(tensor->data, f32_output, nelements);
} else {
GGML_ABORT("fatal error"); // unreachable
}
@@ -17925,7 +17924,7 @@ static void llama_tensor_dequantize_internal(
} else if (typ == GGML_TYPE_BF16) {
ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
} else {
-qtype.to_float(inbuf, outbuf, nels);
+qtype->to_float(inbuf, outbuf, nels);
}
};
workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);

View file

@@ -2311,7 +2311,7 @@ const std::unordered_set<uint32_t> unicode_set_whitespace = {
0x003000,
};
-// list is always in ascending order, to enable binary searh
+// list is always in ascending order, to enable binary search
const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase = {
{0x000041, 0x000061},
{0x000042, 0x000062},
@@ -3748,7 +3748,7 @@ const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase
{0x01E921, 0x01E943},
};
-// list is always in ascending order, to enable binary searh
+// list is always in ascending order, to enable binary search
const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase = {
{0x000061, 0x000041},
{0x000062, 0x000042},

View file

@@ -133,7 +133,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
std::vector<uint8_t> buf(ggml_nbytes(t));
ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
-ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
+const auto * tt = ggml_get_type_traits(t->type);
size_t bs = ggml_blck_size(t->type);
std::vector<float> vq(ggml_blck_size(t->type));
bool quantized = ggml_is_quantized(t->type);
@@ -159,7 +159,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
} else if (t->type == GGML_TYPE_I8) {
tv.push_back((float)*(int8_t *) &buf[i]);
} else if (quantized) {
-tt.to_float(&buf[i], vq.data(), bs);
+tt->to_float(&buf[i], vq.data(), bs);
tv.insert(tv.end(), vq.begin(), vq.end());
} else {
GGML_ABORT("fatal error");

View file

@@ -44,26 +44,26 @@ static float array_rmse(const float * a1, const float * a2, size_t n) {
}
// Total quantization error on test data
-static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
+static float total_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
std::vector<uint8_t> tmp_q(2*test_size);
std::vector<float> tmp_out(test_size);
-qfns.from_float(test_data, tmp_q.data(), test_size);
-qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
+qfns->from_float(test_data, tmp_q.data(), test_size);
+qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
return array_rmse(test_data, tmp_out.data(), test_size);
}
// Total quantization error on test data
-static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
+static float reference_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
std::vector<uint8_t> tmp_q(2*test_size);
std::vector<float> tmp_out(test_size);
std::vector<float> tmp_out_ref(test_size);
-qfns.from_float(test_data, tmp_q.data(), test_size);
-qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
+qfns->from_float(test_data, tmp_q.data(), test_size);
+qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
-qfns.from_float_ref(test_data, tmp_q.data(), test_size);
-qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
+qfns->from_float_ref(test_data, tmp_q.data(), test_size);
+qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
}
@@ -78,18 +78,18 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
// Total dot product error
static float dot_product_error(
-ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
+const ggml_type_traits * qfns, size_t test_size, const float * test_data1, const float *test_data2
) {
std::vector<uint8_t> tmp_q1(2*test_size);
std::vector<uint8_t> tmp_q2(2*test_size);
-auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
+const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
-qfns.from_float(test_data1, tmp_q1.data(), test_size);
-vdot.from_float(test_data2, tmp_q2.data(), test_size);
+qfns->from_float(test_data1, tmp_q1.data(), test_size);
+vdot->from_float(test_data2, tmp_q2.data(), test_size);
float result = INFINITY;
-qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
+qfns->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
const float dot_ref = dot_product(test_data1, test_data2, test_size);
@@ -131,10 +131,10 @@ int main(int argc, char * argv[]) {
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
ggml_type type = (ggml_type) i;
-ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+const auto * qfns = ggml_get_type_traits(type);
// deprecated - skip
-if (qfns.blck_size == 0) {
+if (qfns->blck_size == 0) {
continue;
}
@@ -143,7 +143,7 @@ int main(int argc, char * argv[]) {
printf("Testing %s\n", ggml_type_name((ggml_type) i));
ggml_quantize_init(ei);
-if (qfns.from_float && qfns.to_float) {
+if (qfns->from_float && qfns->to_float) {
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
const float max_quantization_error =
type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :

View file

@@ -122,9 +122,9 @@ static void usage(char * argv[]) {
printf(" --type TYPE set test type as");
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
ggml_type type = (ggml_type) i;
-ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+const auto * qfns = ggml_get_type_traits(type);
if (ggml_type_name(type) != NULL) {
-if (qfns.from_float && qfns.to_float) {
+if (qfns->from_float && qfns->to_float) {
printf(" %s", ggml_type_name(type));
}
}
@@ -270,12 +270,12 @@ int main(int argc, char * argv[]) {
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
ggml_type type = (ggml_type) i;
-ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+const auto * qfns = ggml_get_type_traits(type);
if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
continue;
}
-if (qfns.from_float && qfns.to_float) {
+if (qfns->from_float && qfns->to_float) {
printf("%s\n", ggml_type_name(type));
ggml_quantize_init(type);
@@ -285,7 +285,7 @@ int main(int argc, char * argv[]) {
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) -> float {
-qfns.from_float_ref(test_data1, test_q1, size);
+qfns->from_float_ref(test_data1, test_q1, size);
return test_q1[0];
};
size_t quantized_size = ggml_row_size(type, size);
@@ -299,7 +299,7 @@ int main(int argc, char * argv[]) {
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) -> float {
-qfns.from_float(test_data1, test_q1, size);
+qfns->from_float(test_data1, test_q1, size);
return test_q1[0];
};
size_t quantized_size = ggml_row_size(type, size);
@@ -310,11 +310,11 @@ int main(int argc, char * argv[]) {
if (params.op_dequantize_row_q) {
printf(" dequantize_row_q\n");
-qfns.from_float(test_data1, test_q1, largest);
+qfns->from_float(test_data1, test_q1, largest);
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) -> float {
-qfns.to_float(test_q1, test_out, size);
+qfns->to_float(test_q1, test_out, size);
return test_out[0];
};
size_t quantized_size = ggml_row_size(type, size);
@@ -328,8 +328,8 @@ int main(int argc, char * argv[]) {
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) -> float {
-auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
-vdot.from_float(test_data1, test_q1, size);
+const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
+vdot->from_float(test_data1, test_q1, size);
return test_q1[0];
};
size_t quantized_size = ggml_row_size(type, size);
@@ -340,13 +340,13 @@ int main(int argc, char * argv[]) {
if (params.op_vec_dot_q) {
printf(" vec_dot_q\n");
-qfns.from_float(test_data1, test_q1, largest);
-qfns.from_float(test_data2, test_q2, largest);
+qfns->from_float(test_data1, test_q1, largest);
+qfns->from_float(test_data2, test_q2, largest);
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) -> float {
float result;
-qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
+qfns->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
return result;
};
size_t quantized_size = ggml_row_size(type, size);