Compare commits


37 commits

Author SHA1 Message Date
Olivier Chafik
d7b31a9d84
sync: minja (a72057e519) (#11774) 2025-02-10 09:34:09 +00:00
pascal-lc
9ac3457b39
Update README.md [no ci] (#11781)
typo: `\` -> `/`
Change the Windows-style path separator `\` to the UNIX `/`.
2025-02-10 09:05:57 +01:00
Danny Milosavljevic
c2a67efe38
vulkan: Make Vulkan optional at runtime (#11493). (#11494)
Co-authored-by: Jeff Bolz <jbolz@nvidia.com>
2025-02-10 07:17:21 +01:00
Wagner Bruna
b044a0fe3c
vulkan: add environment variable GGML_VK_PREFER_HOST_MEMORY to avoid VRAM allocation (#11592) 2025-02-10 07:08:22 +01:00
Eric Curtin
19d3c8293b
There's a better way of clearing lines (#11756)
Use the ANSI escape code for clearing a line.

Signed-off-by: Eric Curtin <ecurtin@redhat.com>
2025-02-09 10:34:49 +00:00
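
Editor's note: a minimal C++ sketch of the technique this commit adopts. `\r` returns the cursor to column 0 and the ANSI sequence `ESC[K` erases to the end of the line, which avoids padding the line with spaces to the terminal width:

```cpp
#include <cstdio>

int main() {
    std::printf("downloading 42%%");
    std::fflush(stdout);
    // Replace the status line in place: '\r' moves the cursor to column 0,
    // and the ANSI sequence ESC[K erases from the cursor to end of line.
    std::printf("\r\033[K");
    std::printf("downloading 100%%\n");
    return 0;
}
```

The `LOG_CLR_TO_EOL` macro added to log.h in the diffs below (`"\033[K\r"`) packages the same escape sequence.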
Jeff Bolz
98f6b0fd1e
vulkan: account for lookup tables when checking shared memory size (#11502) 2025-02-09 08:43:51 +01:00
Xuan-Son Nguyen
55ac8c7791
server : (webui) revamp Settings dialog, add Pyodide interpreter (#11759)
* redo Settings modal UI

* add python code interpreter

* fix auto scroll

* build

* fix overflow for long output lines

* bring back sticky copy button

* adapt layout on mobile view

* fix multiple lines output and color scheme

* handle python exception

* better state management

* add webworker

* add headers

* format code

* speed up by loading pyodide on page load

* (small tweak) add a small animation to make it feel like Claude
2025-02-08 21:54:50 +01:00
Woof Dog
e6e6583199
server : (webui) increase edit textarea size (#11763) 2025-02-08 20:09:55 +01:00
Georgi Gerganov
aaa5505307
server : minor log updates (#11760)
ggml-ci
2025-02-08 18:08:43 +02:00
Georgi Gerganov
bdcf8b6a56
cont : fix mmap flag print (#11699) 2025-02-08 16:49:38 +02:00
Karol Kontny
4d3465c5ae
ggml: Fix data race in ggml threadpool (#11736)
After the barrier in the last iteration is executed, the loop termination
condition is still evaluated. By then, however, the main thread may already
have destroyed the cgraph object and its nodes, so another thread can end up
accessing memory that is already gone. Trouble could also occur when
n_nodes == 0 or abort is called, though it is unclear whether that situation
can actually arise.

The last synchronization should be done after the loop, to ensure the
cgraph/cplan won't be accessed after the main thread exits from the function.
2025-02-08 15:30:53 +01:00
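
Editor's note: a schematic C++ illustration of the hazard described above (illustrative names, not the actual ggml threadpool code):

```cpp
#include <barrier>

struct cgraph { int n_nodes; /* nodes, cplan, ... */ };

// Illustrative worker loop. The loop condition reads g->n_nodes *after*
// the barrier of the last iteration, racing with the main thread freeing g.
void worker(cgraph * g, std::barrier<> & sync) {
    for (int i = 0; i < g->n_nodes; ++i) {
        // ... compute node i ...
        sync.arrive_and_wait();  // last iteration: the main thread may
                                 // return and destroy g before the
                                 // condition check happens
    }
    // Fix (schematically): perform the final synchronization after the
    // loop, so no thread touches the cgraph/cplan once the main thread
    // has exited the function.
    sync.arrive_and_wait();
}
```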
Johannes Gäßler
d80be897ac
CUDA: fix min. version for movmatrix (#11751) 2025-02-08 10:46:07 +01:00
Nikolaos Pothitos
3ab410f55f
readme : update front-end framework (#11753)
After the migration to React with #11688
2025-02-08 10:43:04 +01:00
Xuan-Son Nguyen
0cf867160c
server : (webui) fix numeric settings being saved as string (#11739)
* server : (webui) fix numeric settings being saved as string

* add some more comments
2025-02-08 10:42:34 +01:00
Eric Curtin
d2fe216fb2
Make logging more verbose (#11714)
Debugged an issue with a user who was on a read-only filesystem.

Signed-off-by: Eric Curtin <ecurtin@redhat.com>
2025-02-07 14:42:46 +00:00
Georgi Gerganov
ed926d8833
llama : fix defrag logic (#11707)
* llama : fix defrag logic

ggml-ci

* cont : better logic

ggml-ci

* cont : clamp fragmentation to 0.0

ggml-ci
2025-02-07 16:05:34 +02:00
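
Editor's note: as a rough sketch of what "clamp fragmentation to 0.0" means here (the formula and names below are illustrative assumptions, not the exact code):

```cpp
#include <algorithm>

// Illustrative only: a fragmentation ratio for the KV cache, clamped so
// that rounding can never push it below zero.
float kv_fragmentation(int n_cells, int n_used) {
    const float frag = 1.0f - float(n_used) / float(std::max(n_cells, 1));
    return std::max(0.0f, frag);  // "clamp fragmentation to 0.0"
}
```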
Christian Fillion
2d219b389e
vocab : ignore invalid UTF-8 input in the BPE tokenizer (#11729)
Silently insert U+FFFD(s) (Unicode replacement character) instead until the
next valid codepoint can be found.

This fixes `llama_tokenize` throwing an exception across the C API boundary
or libllama's module boundary (the caller's runtime might be incompatible!)

Returning a proper error code might be desirable; however, the signature
of `llama_tokenize` doesn't allow it, as all return values already have
an existing meaning.
2025-02-07 15:55:47 +02:00
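
Editor's note: a self-contained C++ sketch of this replacement strategy (simplified validation for illustration; the tokenizer's actual checks are stricter):

```cpp
#include <string>

// Sketch: replace invalid UTF-8 bytes with U+FFFD (EF BF BD) instead of
// throwing; resynchronize at the next byte after each bad one.
std::string sanitize_utf8(const std::string & in) {
    static const char replacement[] = "\xEF\xBF\xBD";  // U+FFFD
    std::string out;
    size_t i = 0;
    while (i < in.size()) {
        const unsigned char c = in[i];
        // Sequence length from the lead byte (0 = invalid lead byte).
        const size_t len = c < 0x80 ? 1 : (c >> 5) == 0x6 ? 2
                         : (c >> 4) == 0xE ? 3 : (c >> 3) == 0x1E ? 4 : 0;
        bool ok = len != 0 && i + len <= in.size();
        for (size_t k = 1; ok && k < len; ++k) {
            ok = (static_cast<unsigned char>(in[i + k]) & 0xC0) == 0x80;
        }
        if (ok) {
            out.append(in, i, len);
            i += len;
        } else {
            out += replacement;  // silently insert U+FFFD
            i += 1;
        }
    }
    return out;
}
```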
magicse
333820d749
llama : fix progress dots (#11730)
* Update llama.cpp

Display progress dots in the terminal.
Without this, progress dots were not shown while loading a model from file.

* Update llama.cpp

removed trailing spaces
2025-02-07 15:48:47 +02:00
Jeff Bolz
c026ba3c23
vulkan: print shared memory size (#11719) 2025-02-07 11:26:03 +01:00
Christian Fillion
7ee953a64a
llama : add llama_sampler_init for safe usage of llama_sampler_free (#11727)
The C API in llama.h claims users can implement `llama_sampler_i` to
create custom `llama_sampler`. The sampler chain takes ownership and
calls `llama_sampler_free` on them. However, `llama_sampler_free` is
hard-coded to use `delete`. This is undefined behavior if the object
wasn't also allocated via `new` from libllama's C++ runtime. Callers
in C and C-compatible languages do not use C++'s `new` operator. C++
callers may not be sharing the same heap as libllama.
2025-02-07 11:33:27 +02:00
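
Editor's note: a hedged sketch of how a custom sampler can be created through the new constructor so that `llama_sampler_free` stays safe. The callback bodies and `my_ctx` are placeholders; the field order follows `llama_sampler_i` in llama.h:

```cpp
#include "llama.h"

struct my_ctx { /* custom sampler state */ };

static const llama_sampler_i my_sampler_iface = {
    /* .name   = */ [](const llama_sampler *) { return "my-sampler"; },
    /* .accept = */ nullptr,
    /* .apply  = */ [](llama_sampler *, llama_token_data_array * cur_p) {
        (void) cur_p;  // mutate logits/probabilities here
    },
    /* .reset  = */ nullptr,
    /* .clone  = */ nullptr,
    /* .free   = */ [](llama_sampler * smpl) {
        delete static_cast<my_ctx *>(smpl->ctx);
    },
};

llama_sampler * make_my_sampler() {
    // The llama_sampler object is now allocated by libllama itself, so the
    // `delete` inside llama_sampler_free runs on the same C++ runtime/heap.
    return llama_sampler_init(&my_sampler_iface, new my_ctx{});
}
```

The `llama_sampler_llg` call site in the diffs below shows the same constructor in use.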
Akarshan Biswas
ec3bc8270b
SYCL: remove XMX info from print devices (#11712) 2025-02-07 09:27:53 +00:00
Daniel Bevenius
b7552cfcbc
common : add default embeddings presets (#11677)
* common : add default embeddings presets

This commit adds default embeddings presets for the following models:
- bge-small-en-v1.5
- e5-small-v2
- gte-small

These can be used with llama-embedding and llama-server.

For example, with llama-embedding:
```console
./build/bin/llama-embedding --embd-gte-small-default -p "Hello, how are you?"
```

And with llama-server:
```console
./build/bin/llama-server --embd-gte-small-default
```
And the embeddings endpoint can then be called with a POST request:
```console
curl --request POST \
    --url http://localhost:8080/embeddings \
    --header "Content-Type: application/json" \
    --data '{"input": "Hello, how are you?"}'
```

I'm not sure if these are the most common embedding models but hopefully
this can be a good starting point for discussion and further
improvements.

Refs: https://github.com/ggerganov/llama.cpp/issues/10932
2025-02-07 09:15:22 +01:00
Jinyang He
225bbbfa39
ggml : optimize and build warning fix for LoongArch (#11709)
* ggml : optimize convert f32<->f16 for loongarch_asx

* ggml : optimize loongarch_asx extend i16,i8,u8 to i32,i16

* ggml : Fix warnings when run cpu CI locally on LoongArch
2025-02-07 09:38:31 +02:00
tv1wnd
855cd0734a
llama : fix old glm4 models (#11670) 2025-02-06 22:48:51 +01:00
Georgi Gerganov
8a59053f63
sync : ggml 2025-02-06 21:23:03 +02:00
Patrick Peng
1d20e53c40
rpc: fix known RCE in rpc-server (ggml/1103)
Add bounds checking in `rpc_server::copy_tensor` to prevent out-of-bounds writes:
check that `(uint8_t *)dst->data + ggml_nbytes(src)` remains within the destination buffer's allocated region.
2025-02-06 21:22:54 +02:00
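
Editor's note: for illustration, a bounds check in the spirit of this fix might look as follows (hypothetical names; not the exact patch):

```cpp
#include <cstdint>
#include <cstring>

// Reject the copy if [dst_data, dst_data + n_src_bytes) escapes the
// destination buffer supplied by a (potentially malicious) remote client.
bool copy_tensor_checked(const uint8_t * src_data, size_t n_src_bytes,
                         uint8_t * dst_data,
                         uint8_t * buf_base, size_t buf_size) {
    uint8_t * buf_end = buf_base + buf_size;
    if (dst_data < buf_base || dst_data + n_src_bytes > buf_end) {
        return false;  // out-of-bounds write attempt
    }
    std::memcpy(dst_data, src_data, n_src_bytes);
    return true;
}
```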
Xuan-Son Nguyen
2fb3c32a16
server : (webui) migrate project to ReactJS with typescript (#11688)
* init version

* fix auto scroll

* bring back copy btn

* bring back thought process

* add lint and format check on CI

* remove lang from html tag

* allow multiple generations at the same time

* lint and format combined

* fix unused var

* improve MarkdownDisplay

* fix more latex

* fix code blocks not being selectable while generating
2025-02-06 17:32:29 +01:00
Tei Home
9ab42dc722
docs: update fedora cuda guide for 12.8 release (#11393)
* docs: update fedora cuda guide for 12.8 release

* docs: build cuda update
2025-02-06 12:16:15 +00:00
Akarshan Biswas
194b2e69f8
SYCL: Adjust support condition for norm operators (#11674)
SYCL does not support non-contiguous tensors for norm operations.
2025-02-06 11:42:35 +00:00
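
Editor's note: gating op support on contiguity typically looks like the sketch below. `ggml_is_contiguous` is a real ggml helper, but whether the commit uses exactly this condition is an assumption:

```cpp
#include "ggml.h"

// Sketch: report a norm-family op as supported only when its input is
// contiguous, since the SYCL kernels cannot handle non-contiguous tensors.
static bool sycl_supports_norm(const ggml_tensor * op) {
    return ggml_is_contiguous(op->src[0]);
}
```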
Georgi Gerganov
9dd7a0390f
llama : add log about loading model tensors (#11699) 2025-02-06 13:41:37 +02:00
Adrien Gallouët
c0d4843225
build : fix llama.pc (#11658)
Signed-off-by: Adrien Gallouët <adrien@gallouet.fr>
2025-02-06 13:08:13 +02:00
junchao-zhao
8d4d2be143
ggml : fix LoongArch compile error with 128-bit SIMD (#11701) 2025-02-06 11:20:00 +02:00
Jeff Bolz
2c6c8df56d
vulkan: optimize coopmat2 iq2/iq3 callbacks (#11521)
* vulkan: optimize coopmat2 iq2/iq3 callbacks

* build: trigger CI on GLSL compute shader changes
2025-02-06 07:15:30 +01:00
Rémy O
8a7e3bf17a
vulkan: initial support for IQ4_XS quantization (#11501) 2025-02-06 07:09:59 +01:00
Jeff Bolz
1b598b3058
vulkan: use smaller combined allocations to avoid fragmentation (#11551) 2025-02-06 07:02:18 +01:00
Charles Duffy
902368a06b
metal : avoid breaking build when metal API predates TARGET_OS_VISION (#11690)
Avoids breakage in nix flake build introduced by b0569130c5
2025-02-06 09:52:31 +08:00
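
Editor's note: the usual pattern for this class of fix is to give the missing macro a safe default before using it; a hedged sketch (illustrative, not necessarily the exact patch applied here):

```cpp
#include <TargetConditionals.h>

// Older Apple SDKs predate TARGET_OS_VISION; defaulting it to 0 lets the
// same conditional compile on every SDK version.
#ifndef TARGET_OS_VISION
#define TARGET_OS_VISION 0
#endif

#if !TARGET_OS_VISION
// ... code paths that do not apply to visionOS ...
#endif
```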
Matvey Soloviev
c3db0480bb
readme : add link to Autopen under UIs (#11684)
Autopen (https://github.com/blackhole89/autopen) is a graphical text editor that uses llama.cpp to tokenize the buffer on the fly, score the buffer, visualise token logits and allow you to switch back and forth between different possible completions at any point. It hopefully meets the criteria for inclusion, as the dependency on llama.cpp is stated prominently.
2025-02-06 01:55:25 +01:00
76 changed files with 8732 additions and 3644 deletions


@ -10,10 +10,10 @@ on:
push:
branches:
- master
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}


@ -81,13 +81,36 @@ jobs:
with:
node-version: '22.11.0'
- name: WebUI - Install dependencies
id: webui_lint
run: |
cd examples/server/webui
npm ci
- name: WebUI - Check code format
id: webui_format
run: |
git config --global --add safe.directory $(realpath .)
cd examples/server/webui
git status
npm run format
git status
modified_files="$(git status -s)"
echo "Modified files: ${modified_files}"
if [ -n "${modified_files}" ]; then
echo "Files do not follow coding style. To fix: npm run format"
echo "${modified_files}"
exit 1
fi
- name: Verify bundled index.html
id: verify_server_index_html
run: |
git config --global --add safe.directory $(realpath .)
cd examples/server/webui
git status
npm ci
npm run build
git status
modified_files="$(git status -s)"


@ -233,4 +233,4 @@ configure_file(cmake/llama.pc.in
@ONLY)
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
DESTINATION lib/pkgconfig)
DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)


@ -189,6 +189,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [ramalama](https://github.com/containers/ramalama) (MIT)
- [semperai/amica](https://github.com/semperai/amica) (MIT)
- [withcatai/catai](https://github.com/withcatai/catai) (MIT)
- [Autopen](https://github.com/blackhole89/autopen) (GPL)
</details>


@ -1,10 +1,10 @@
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${prefix}
libdir=${exec_prefix}/lib
includedir=${prefix}/include
exec_prefix=@CMAKE_INSTALL_PREFIX@
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
Name: llama
Description: Port of Facebook's LLaMA model in C/C++
Version: @PROJECT_VERSION@
Libs: -L${libdir} -lggml -lggml-base -lllama
Version: @LLAMA_INSTALL_VERSION@
Libs: -L${libdir} -lggml -lggml-base -lllama
Cflags: -I${includedir}
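
Editor's note: to verify the fixed `llama.pc` from the `build : fix llama.pc` commit above, a minimal consumer can be built against the installed package. An illustrative sketch (the file name `pc-check.cpp` and the install step are assumptions):

```cpp
// pc-check.cpp -- minimal consumer to verify the fixed llama.pc.
// Build after `cmake --install build`:
//   c++ pc-check.cpp $(pkg-config --cflags --libs llama) -o pc-check
#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init();
    std::printf("linked against llama via pkg-config\n");
    llama_backend_free();
    return 0;
}
```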


@ -2324,5 +2324,47 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_TTS}));
add_opt(common_arg(
{"--embd-bge-small-en-default"},
string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
[](common_params & params) {
params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
params.embd_normalize = 2;
params.n_ctx = 512;
params.verbose_prompt = true;
params.embedding = true;
}
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--embd-e5-small-en-default"},
string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
[](common_params & params) {
params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
params.hf_file = "e5-small-v2-q8_0.gguf";
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
params.embd_normalize = 2;
params.n_ctx = 512;
params.verbose_prompt = true;
params.embedding = true;
}
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--embd-gte-small-default"},
string_format("use default gte-small model (note: can download weights from the internet)"),
[](common_params & params) {
params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
params.hf_file = "gte-small-q8_0.gguf";
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
params.embd_normalize = 2;
params.n_ctx = 512;
params.verbose_prompt = true;
params.embedding = true;
}
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
return ctx_arg;
}


@ -249,16 +249,30 @@ class chat_template {
inputs.add_generation_prompt = false;
full = apply(inputs);
}
if (full.find(prefix) != 0) {
if (prefix.rfind(eos_token_) == prefix.size() - eos_token_.size()) {
prefix = prefix.substr(0, prefix.size() - eos_token_.size());
auto eos_pos_last = full.rfind(eos_token_);
if (eos_pos_last == prefix.size() - eos_token_.size() ||
(full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) {
full = full.substr(0, eos_pos_last);
}
size_t common_prefix_length = 0;
for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
if (prefix[i] != full[i]) {
break;
}
if (prefix[i] == '<') {
// DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
// but it removes thinking tags for past messages.
// The prefix and full strings diverge at <think> vs. <tool▁calls▁begin>, we avoid consuming the leading <.
continue;
}
common_prefix_length = i + 1;
}
if (full.find(prefix) != 0) {
auto example = full.substr(common_prefix_length);
if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) {
fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
} else {
tool_call_example_ = example;
}
tool_call_example_ = full.substr(prefix.size());
}
} catch (const std::exception & e) {
fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());
@ -363,7 +377,7 @@ class chat_template {
if (polyfill_tools) {
adjusted_messages = add_system(inputs.messages,
"You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
(!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_));
(!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n"));
} else {
adjusted_messages = inputs.messages;
}


@ -254,10 +254,10 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
};
}
return new llama_sampler{
return llama_sampler_init(
/* .iface = */ &llama_sampler_llg_i,
/* .ctx = */ ctx,
};
/* .ctx = */ ctx
);
}
#else


@ -2,6 +2,7 @@
#include "ggml.h" // for ggml_log_level
#define LOG_CLR_TO_EOL "\033[K\r"
#define LOG_COL_DEFAULT "\033[0m"
#define LOG_COL_BOLD "\033[1m"
#define LOG_COL_RED "\033[31m"


@ -1385,6 +1385,13 @@ static std::string strip(const std::string & s) {
return s.substr(start, end - start + 1);
}
static std::string capitalize(const std::string & s) {
if (s.empty()) return s;
auto result = s;
result[0] = std::toupper(result[0]);
return result;
}
static std::string html_escape(const std::string & s) {
std::string result;
result.reserve(s.size());
@ -1462,6 +1469,9 @@ public:
if (method->get_name() == "strip") {
vargs.expectArgs("strip method", {0, 0}, {0, 0});
return Value(strip(str));
} else if (method->get_name() == "capitalize") {
vargs.expectArgs("capitalize method", {0, 0}, {0, 0});
return Value(capitalize(str));
} else if (method->get_name() == "endswith") {
vargs.expectArgs("endswith method", {1, 1}, {0, 0});
auto suffix = vargs.args[0].get<std::string>();
@ -1792,7 +1802,7 @@ private:
auto left = parseStringConcat();
if (!left) throw std::runtime_error("Expected left side of 'logical compare' expression");
static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not[\r\n\s]+in\b)");
static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not\s+in\b)");
static std::regex not_tok(R"(not\b)");
std::string op_str;
while (!(op_str = consumeToken(compare_tok)).empty()) {
@ -2171,7 +2181,7 @@ private:
using TemplateTokenIterator = TemplateTokenVector::const_iterator;
std::vector<std::string> parseVarNames() {
static std::regex varnames_regex(R"(((?:\w+)(?:[\r\n\s]*,[\r\n\s]*(?:\w+))*)[\r\n\s]*)");
static std::regex varnames_regex(R"(((?:\w+)(?:\s*,\s*(?:\w+))*)\s*)");
std::vector<std::string> group;
if ((group = consumeTokenGroups(varnames_regex)).empty()) throw std::runtime_error("Expected variable names");
@ -2194,13 +2204,13 @@ private:
}
TemplateTokenVector tokenize() {
static std::regex comment_tok(R"(\{#([-~]?)([\s\S\r\n]*?)([-~]?)#\})");
static std::regex comment_tok(R"(\{#([-~]?)([\s\S]*?)([-~]?)#\})");
static std::regex expr_open_regex(R"(\{\{([-~])?)");
static std::regex block_open_regex(R"(^\{%([-~])?[\s\n\r]*)");
static std::regex block_open_regex(R"(^\{%([-~])?\s*)");
static std::regex block_keyword_tok(R"((if|else|elif|endif|for|endfor|generation|endgeneration|set|endset|block|endblock|macro|endmacro|filter|endfilter|break|continue)\b)");
static std::regex non_text_open_regex(R"(\{\{|\{%|\{#)");
static std::regex expr_close_regex(R"([\s\n\r]*([-~])?\}\})");
static std::regex block_close_regex(R"([\s\n\r]*([-~])?%\})");
static std::regex expr_close_regex(R"(\s*([-~])?\}\})");
static std::regex block_close_regex(R"(\s*([-~])?%\})");
TemplateTokenVector tokens;
std::vector<std::string> group;
@ -2284,7 +2294,7 @@ private:
auto post_space = parseBlockClose();
tokens.push_back(std::make_unique<EndGenerationTemplateToken>(location, pre_space, post_space));
} else if (keyword == "set") {
static std::regex namespaced_var_regex(R"((\w+)[\s\n\r]*\.[\s\n\r]*(\w+))");
static std::regex namespaced_var_regex(R"((\w+)\s*\.\s*(\w+))");
std::string ns;
std::vector<std::string> var_names;
@ -2336,6 +2346,11 @@ private:
throw std::runtime_error("Unexpected block: " + keyword);
}
} else if (std::regex_search(it, end, match, non_text_open_regex)) {
if (!match.position()) {
if (match[0] != "{#")
throw std::runtime_error("Internal error: Expected a comment");
throw std::runtime_error("Missing end of comment tag");
}
auto text_end = it + match.position();
text = std::string(it, text_end);
it = text_end;
@ -2400,7 +2415,7 @@ private:
auto text = text_token->text;
if (post_space == SpaceHandling::Strip) {
static std::regex trailing_space_regex(R"((\s|\r|\n)+$)");
static std::regex trailing_space_regex(R"(\s+$)");
text = std::regex_replace(text, trailing_space_regex, "");
} else if (options.lstrip_blocks && it != end) {
auto i = text.size();
@ -2410,7 +2425,7 @@ private:
}
}
if (pre_space == SpaceHandling::Strip) {
static std::regex leading_space_regex(R"(^(\s|\r|\n)+)");
static std::regex leading_space_regex(R"(^\s+)");
text = std::regex_replace(text, leading_space_regex, "");
} else if (options.trim_blocks && (it - 1) != begin && !dynamic_cast<ExpressionTemplateToken*>((*(it - 2)).get())) {
if (text.length() > 0 && text[0] == '\n') {
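
Editor's note on the regex changes in the minja diff above: in `std::regex`'s default ECMAScript grammar, `\s` already matches `\r` and `\n`, so classes like `[\r\n\s]` and alternations like `(\s|\r|\n)` are redundant forms. A quick self-contained check:

```cpp
#include <cassert>
#include <regex>

int main() {
    // \s covers space, \t, \r and \n in std::regex's ECMAScript grammar,
    // so [\r\n\s]+ simplifies to \s+ with identical behavior.
    const std::regex ws(R"(^\s+$)");
    assert(std::regex_match(" \t\r\n", ws));
    return 0;
}
```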


@ -125,21 +125,66 @@ For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
## CUDA
This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from the [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).
This provides GPU acceleration using an NVIDIA GPU. Make sure to have the [CUDA toolkit](https://developer.nvidia.com/cuda-toolkit) installed.
If you are using Fedora (using Fedora Workstation, or an 'Atomic' variant such as Silverblue), or would like to set up CUDA in a toolbox, please consider our [Fedora CUDA guide](./cuda-fedora.md). Unfortunately, the process is not as simple as one might expect.
#### Download directly from NVIDIA
You may find the official downloads here: [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).
- Using `CMake`:
```bash
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release
```
#### Compile and run inside a Fedora Toolbox Container
We also have a [guide](./cuda-fedora.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
**Recommended for:**
- ***Particularly*** *convenient* for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
- Toolbox is installed by default: [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde).
- *Optionally* toolbox packages are available: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download)
### Compilation
```bash
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release
```
### Override Compute Capability Specifications
If `nvcc` cannot detect your gpu, you may get compile-warnings such as:
```text
nvcc warning : Cannot find valid GPU for '-arch=native', default arch is used
```
To override the `native` GPU detection:
#### 1. Take note of the `Compute Capability` of your NVIDIA devices: ["CUDA: Your GPU Compute > Capability"](https://developer.nvidia.com/cuda-gpus).
```text
GeForce RTX 4090 8.9
GeForce RTX 3080 Ti 8.6
GeForce RTX 3070 8.6
```
#### 2. Manually list each varying `Compute Capability` in the `CMAKE_CUDA_ARCHITECTURES` list.
```bash
cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="86;89"
```
### Runtime CUDA environmental variables
You may set the [cuda environmental variables](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) at runtime.
```bash
# Use `CUDA_VISIBLE_DEVICES` to hide the first compute device.
CUDA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
```
### Unified Memory
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
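
Editor's note: under the hood this presumably maps to CUDA managed memory; a hedged sketch of the mechanism (the calls are the standard CUDA runtime API, but whether ggml takes exactly this path is an assumption):

```cpp
#include <cuda_runtime.h>
#include <cstdlib>

// Sketch: when GGML_CUDA_ENABLE_UNIFIED_MEMORY=1, fall back to managed
// memory, which the driver pages between VRAM and system RAM on demand.
void * alloc_device_buffer(size_t size) {
    void * ptr = nullptr;
    const char * env = std::getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY");
    if (env != nullptr && env[0] == '1') {
        cudaMallocManaged(&ptr, size);  // can spill to RAM instead of failing
    } else {
        cudaMalloc(&ptr, size);         // plain VRAM allocation
    }
    return ptr;
}
```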
### Performance Tuning
The following compilation options are also available to tweak performance:
| Option | Legal values | Default | Description |


@ -1,17 +1,16 @@
# Setting Up CUDA on Fedora
In this guide we setup [Nvidia CUDA](https://docs.nvidia.com/cuda/) in a toolbox container. This guide is applicable for:
- [Fedora Workstation](https://fedoraproject.org/workstation/)
- [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/)
- [Fedora Spins](https://fedoraproject.org/spins)
- [Other Distributions](https://containertoolbx.org/distros/), including `Red Hat Enterprise Linux >= 8.`, `Arch Linux`, and `Ubuntu`.
- [Other Distributions](https://containertoolbx.org/distros/), including `Red Hat Enterprise Linux >= 8.5`, `Arch Linux`, and `Ubuntu`.
## Table of Contents
- [Prerequisites](#prerequisites)
- [Monitoring NVIDIA CUDA Repositories](#monitoring-nvidia-cuda-repositories)
- [Using the Fedora 39 CUDA Repository](#using-the-fedora-39-cuda-repository)
- [Using the Fedora 41 CUDA Repository](#using-the-fedora-41-cuda-repository)
- [Creating a Fedora Toolbox Environment](#creating-a-fedora-toolbox-environment)
- [Installing Essential Development Tools](#installing-essential-development-tools)
- [Adding the CUDA Repository](#adding-the-cuda-repository)
@ -29,44 +28,33 @@ In this guide we setup [Nvidia CUDA](https://docs.nvidia.com/cuda/) in a toolbox
## Prerequisites
- **Toolbox Installed on the Host System** `Fedora Silverblue` and `Fedora Workstation` both have toolbox by default, other distributions may need to install the [toolbox package](https://containertoolbx.org/install/).
- **NVIDIA Drivers and Graphics Card installed on Host System (optional)** To run CUDA program, such as `llama.cpp`, the host should be setup to access your NVIDIA hardware. Fedora Hosts can use the [RPM Fusion Repository](https://rpmfusion.org/Howto/NVIDIA).
- **NVIDIA Drivers and Graphics Card installed on Host System (recommended)** To run CUDA program, such as `llama.cpp`, the host should be setup to access your NVIDIA hardware. Fedora Hosts can use the [RPM Fusion Repository](https://rpmfusion.org/Howto/NVIDIA).
- **Internet connectivity** to download packages.
### Monitoring NVIDIA CUDA Repositories
### Using the Fedora 41 CUDA Repository
Before proceeding, it is advisable to check if NVIDIA has updated their CUDA repositories for your Fedora version. NVIDIA's repositories can be found at:
The latest release is 41.
- [Fedora 40 CUDA Repository](https://developer.download.nvidia.com/compute/cuda/repos/fedora40/x86_64/)
- [Fedora 41 CUDA Repository](https://developer.download.nvidia.com/compute/cuda/repos/fedora41/x86_64/)
As of the latest update, these repositories do not contain the `cuda` meta-package or are missing essential components.
### Using the Fedora 39 CUDA Repository
Since the newer repositories are incomplete, we'll use the Fedora 39 repository:
- [Fedora 39 CUDA Repository](https://developer.download.nvidia.com/compute/cuda/repos/fedora39/x86_64/)
**Note:** Fedora 39 is no longer maintained, so we recommend using a toolbox environment to prevent system conflicts.
**Note:** We recommend using a toolbox environment to prevent system conflicts.
## Creating a Fedora Toolbox Environment
This guide focuses on Fedora hosts, but with small adjustments, it can work for other hosts. Using a Fedora 39 toolbox allows us to install the necessary packages without affecting the host system.
This guide focuses on Fedora hosts, but with small adjustments, it can work for other hosts. Using the Fedora Toolbox allows us to install the necessary packages without affecting the host system.
**Note:** Toolbox is available for other systems, and even without Toolbox, it is possible to use Podman or Docker.
We do not recommend installing on the host system, as Fedora 39 is out-of-maintenance, and instead you should upgrade to a maintained version of Fedora for your host.
1. **Create a Fedora 39 Toolbox:**
1. **Create a Fedora 41 Toolbox:**
```bash
toolbox create --image registry.fedoraproject.org/fedora-toolbox:39 --container fedora-toolbox-39-cuda
toolbox create --image registry.fedoraproject.org/fedora-toolbox:41 --container fedora-toolbox-41-cuda
```
2. **Enter the Toolbox:**
```bash
toolbox enter --container fedora-toolbox-39-cuda
toolbox enter --container fedora-toolbox-41-cuda
```
Inside the toolbox, you have root privileges and can install packages without affecting the host system.
@ -85,7 +73,7 @@ We do not recommend installing on the host system, as Fedora 39 is out-of-mainte
sudo dnf install vim-default-editor --allowerasing
```
The `--allowerasing` flag resolves any package conflicts.
The `--allowerasing` flag will allow the removal of the conflicting `nano-default-editor` package.
3. **Install Development Tools and Libraries:**
@ -100,7 +88,7 @@ We do not recommend installing on the host system, as Fedora 39 is out-of-mainte
Add the NVIDIA CUDA repository to your DNF configuration:
```bash
sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/fedora39/x86_64/cuda-fedora39.repo
sudo dnf config-manager addrepo --from-repofile=https://developer.download.nvidia.com/compute/cuda/repos/fedora41/x86_64/cuda-fedora41.repo
```
After adding the repository, synchronize the package manager again:
@ -109,106 +97,62 @@ After adding the repository, synchronize the package manager again:
sudo dnf distro-sync
```
## Installing `nvidia-driver-libs`
## Installing `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
Attempt to install `nvidia-driver-libs`:
We need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go).
```bash
sudo dnf install nvidia-driver-libs
ls -la /usr/lib64/libcuda.so.1
```
**Explanation:**
- `nvidia-driver-libs` contains necessary NVIDIA driver libraries required by CUDA.
- This step might fail due to conflicts with existing NVIDIA drivers on the host system.
- `nvidia-driver-libs` and `nvidia-driver-cuda-libs` contain the NVIDIA driver libraries required by CUDA;
on hosts with NVIDIA drivers installed, the Fedora container will supply the host libraries.
## Manually Resolving Package Conflicts
### Install Nvidia Driver Libraries on Guest (if `libcuda.so.1` was NOT found).
```bash
sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
```
### Manually Updating the RPM database for host-supplied NVIDIA drivers (if `libcuda.so.1` was found).
If the installation fails due to conflicts, we'll manually download and install the required packages, excluding conflicting files.
### 1. Download the `nvidia-driver-libs` RPM
#### 1. Download `nvidia-driver-libs` and `nvidia-driver-cuda-libs` RPM's (with dependencies)
```bash
sudo dnf download --arch x86_64 nvidia-driver-libs
sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-libs nvidia-driver-cuda-libs
```
You should see a file similar to:
```
nvidia-driver-libs-560.35.05-1.fc39.x86_64.rpm
```
### 2. Attempt to Install the RPM
#### 2. Update the RPM database to assume the installation of these packages.
```bash
sudo dnf install nvidia-driver-libs-560.35.05-1.fc39.x86_64.rpm
```
**Expected Error:**
Installation may fail with errors pointing to conflicts with `egl-gbm` and `egl-wayland`.
**Note: It is important to carefully read the error messages to identify the exact paths that need to be excluded.**
### 3. Download Dependencies
```bash
sudo dnf download --arch x86_64 egl-gbm egl-wayland
```
### 4. Install `egl-gbm` with Excluded Paths
Exclude conflicting files during installation:
```bash
sudo rpm --install --verbose --hash \
--excludepath=/usr/lib64/libnvidia-egl-gbm.so.1.1.2 \
--excludepath=/usr/share/egl/egl_external_platform.d/15_nvidia_gbm.json \
egl-gbm-1.1.2^20240919gitb24587d-3.fc39.x86_64.rpm
```
**Explanation:**
- The `--excludepath` option skips installing files that conflict with existing files.
- Adjust the paths based on the error messages you receive.
### 5. Install `egl-wayland` with Excluded Paths
```bash
sudo rpm --install --verbose --hash \
--excludepath=/usr/share/egl/egl_external_platform.d/10_nvidia_wayland.json \
egl-wayland-1.1.17^20241118giteeb29e1-5.fc39.x86_64.rpm
```
### 6. Install `nvidia-driver-libs` with Excluded Paths
```bash
sudo rpm --install --verbose --hash \
--excludepath=/usr/share/glvnd/egl_vendor.d/10_nvidia.json \
--excludepath=/usr/share/nvidia/nvoptix.bin \
nvidia-driver-libs-560.35.05-1.fc39.x86_64.rpm
sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/*
```
**Note:**
- Replace the paths with the ones causing conflicts in your installation if they differ.
- The `--verbose` and `--hash` options provide detailed output during installation.
- The `--justdb` option only updates the RPM database, without touching the filesystem.
## Finalizing the Installation of `nvidia-driver-libs`
#### Finalizing the Installation of `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
After manually installing the dependencies, run:
```bash
sudo dnf install nvidia-driver-libs
sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
```
You should receive a message indicating the package is already installed:
```
Package nvidia-driver-libs-3:560.35.05-1.fc39.x86_64 is already installed.
Dependencies resolved.
Updating and loading repositories:
Repositories loaded.
Package "nvidia-driver-libs-3:570.86.10-1.fc41.x86_64" is already installed.
Package "nvidia-driver-cuda-libs-3:570.86.10-1.fc41.x86_64" is already installed.
Nothing to do.
Complete!
```
## Installing the CUDA Meta-Package
@ -233,7 +177,7 @@ To use CUDA, add its binary directory to your system's `PATH`.
**Explanation:**
- We add to `/etc/profile.d/` as the `/etc/` folder is unique to this particular container, and is not shared with other containers or the host system.
- We add to `/etc/profile.d/` as the `/etc/` folder is unique to this particular container, and is not shared with other containers or the host system.
- The backslash `\` before `$PATH` ensures the variable is correctly written into the script.
2. **Make the Script Executable:**
@ -262,26 +206,33 @@ You should see output similar to:
```
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Tue_Oct_29_23:50:19_PDT_2024
Cuda compilation tools, release 12.6, V12.6.85
Build cuda_12.6.r12.6/compiler.35059454_0
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Wed_Jan_15_19:20:09_PST_2025
Cuda compilation tools, release 12.8, V12.8.61
Build cuda_12.8.r12.8/compiler.35404655_0
```
This output confirms that the CUDA compiler is accessible and indicates the installed version.
## Conclusion
You have successfully set up CUDA on Fedora within a toolbox environment using the Fedora 39 CUDA repository. By manually resolving package conflicts and configuring the environment, you can develop CUDA applications without affecting your host system.
You have successfully set up CUDA on Fedora within a toolbox environment using the Fedora 41 CUDA repository. By manually updating the RPM db and configuring the environment, you can develop CUDA applications without affecting your host system.
## Troubleshooting
- **Installation Failures:**
- If you encounter errors during installation, carefully read the error messages. They often indicate conflicting files or missing dependencies.
- Use the `--excludepath` option with `rpm` to exclude conflicting files during manual installations.
- **Driver Conflicts:**
- Since the host system may already have NVIDIA drivers installed, conflicts can arise. Using the toolbox environment helps isolate these issues.
- If you encounter errors during installation, carefully read the error messages. They often indicate conflicting files or missing dependencies.
- You may use the `--excludepath` option with `rpm` to exclude conflicting files during manual RPM installations.
- **Rebooting the Container:**
- Sometimes there may be a bug in the NVIDIA driver host passthrough (such as missing a shared library). Rebooting the container may solve this issue:
```bash
# on the host system
podman container restart --all
```
- **Environment Variables Not Set:**
- If `nvcc` is not found after installation, ensure that `/usr/local/cuda/bin` is in your `PATH`.
@ -291,10 +242,12 @@ You have successfully set up CUDA on Fedora within a toolbox environment using t
## Additional Notes
- **Updating CUDA in the Future:**
- Keep an eye on the official NVIDIA repositories for updates to your Fedora version.
- When an updated repository becomes available, adjust your `dnf` configuration accordingly.
- **Building `llama.cpp`:**
- With CUDA installed, you can follow these [build instructions for `llama.cpp`](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) to compile it with CUDA support.
- Ensure that any CUDA-specific build flags or paths are correctly set in your build configuration.


@ -37,7 +37,7 @@ Once downloaded, place your model in the models folder in llama.cpp.
##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
```bash
./llama-cli -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
```
### Windows:


@ -346,7 +346,7 @@ class HttpClient {
if (!output_file.empty()) {
output_file_partial = output_file + ".partial";
if (!out.open(output_file_partial, "ab")) {
printe("Failed to open file\n");
printe("Failed to open file for writing\n");
return 1;
}
@ -535,8 +535,7 @@ class HttpClient {
static void print_progress(const std::string & progress_prefix, const std::string & progress_bar,
const std::string & progress_suffix) {
printe("\r%*s\r%s%s| %s", get_terminal_width(), " ", progress_prefix.c_str(), progress_bar.c_str(),
progress_suffix.c_str());
printe("\r" LOG_CLR_TO_EOL "%s%s| %s", progress_prefix.c_str(), progress_bar.c_str(), progress_suffix.c_str());
}
// Function to write data to a file
static size_t write_data(void * ptr, size_t size, size_t nmemb, void * stream) {
@ -797,16 +796,13 @@ class LlamaData {
llama_model_ptr initialize_model(Opt & opt) {
ggml_backend_load_all();
resolve_model(opt.model_);
printe(
"\r%*s"
"\rLoading model",
get_terminal_width(), " ");
printe("\r" LOG_CLR_TO_EOL "Loading model");
llama_model_ptr model(llama_model_load_from_file(opt.model_.c_str(), opt.model_params));
if (!model) {
printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
}
printe("\r%*s\r", static_cast<int>(sizeof("Loading model")), " ");
printe("\r" LOG_CLR_TO_EOL);
return model;
}
@ -969,10 +965,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
static int read_user_input(std::string & user_input) {
static const char * prompt_prefix = "> ";
#ifdef WIN32
printf(
"\r%*s"
"\r" LOG_COL_DEFAULT "%s",
get_terminal_width(), " ", prompt_prefix);
printf("\r" LOG_CLR_TO_EOL LOG_COL_DEFAULT "%s", prompt_prefix);
std::getline(std::cin, user_input);
if (std::cin.eof()) {


@ -220,7 +220,7 @@ services:
The project includes a web-based user interface that enables interaction with the model through the `/chat/completions` endpoint.
The web UI is developed using:
- `vue` framework for frontend development
- `react` framework for frontend development
- `tailwindcss` and `daisyui` for styling
- `vite` for build tooling

Binary file not shown.


@ -334,24 +334,24 @@ struct server_task {
if (data.contains("json_schema") && !data.contains("grammar")) {
try {
auto schema = json_value(data, "json_schema", json::object());
LOG_DBG("JSON schema: %s\n", schema.dump(2).c_str());
SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str());
params.sampling.grammar = json_schema_to_grammar(schema);
LOG_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
SRV_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
} catch (const std::exception & e) {
throw std::runtime_error(std::string("\"json_schema\": ") + e.what());
}
} else {
params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
LOG_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy);
LOG_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
}
{
auto it = data.find("chat_format");
if (it != data.end()) {
params.oaicompat_chat_format = static_cast<common_chat_format>(it->get<int>());
LOG_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
} else {
params.oaicompat_chat_format = defaults.oaicompat_chat_format;
}
@ -367,12 +367,12 @@ struct server_task {
auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true);
if (ids.size() == 1) {
LOG_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
SRV_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
params.sampling.grammar_trigger_tokens.push_back(ids[0]);
params.sampling.preserved_tokens.insert(ids[0]);
continue;
}
LOG_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
SRV_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
params.sampling.grammar_trigger_words.push_back(trigger);
}
}
@ -381,11 +381,11 @@ struct server_task {
for (const auto & t : *preserved_tokens) {
auto ids = common_tokenize(vocab, t.get<std::string>(), /* add_special= */ false, /* parse_special= */ true);
if (ids.size() == 1) {
LOG_DBG("Preserved token: %d\n", ids[0]);
SRV_DBG("Preserved token: %d\n", ids[0]);
params.sampling.preserved_tokens.insert(ids[0]);
} else {
// This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
LOG_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get<std::string>().c_str());
SRV_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get<std::string>().c_str());
}
}
}
@ -717,7 +717,7 @@ struct server_task_result_cmpl_final : server_task_result {
std::string finish_reason = "length";
common_chat_msg msg;
if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
LOG_DBG("Parsing chat message: %s\n", content.c_str());
SRV_DBG("Parsing chat message: %s\n", content.c_str());
msg = common_chat_parse(content, oaicompat_chat_format);
finish_reason = msg.tool_calls.empty() ? "stop" : "tool_calls";
} else {
@ -1885,7 +1885,7 @@ struct server_context {
}
if (params_base.chat_template.empty() && !validate_builtin_chat_template(params.use_jinja)) {
LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
chat_templates = common_chat_templates_from_model(model, "chatml");
} else {
chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
@ -3355,10 +3355,10 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp
// reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch
LOG_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
LOG_DBG("request: %s\n", req.body.c_str());
LOG_DBG("response: %s\n", res.body.c_str());
SRV_DBG("request: %s\n", req.body.c_str());
SRV_DBG("response: %s\n", res.body.c_str());
}
std::function<void(int)> shutdown_handler;
@ -3860,7 +3860,9 @@ int main(int argc, char ** argv) {
try {
const auto & prompt = data.at("prompt");
LOG_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
// TODO: this log can become very long, put it behind a flag or think about a more compact format
//SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
tasks.reserve(tokenized_prompts.size());
for (size_t i = 0; i < tokenized_prompts.size(); i++) {
@ -4376,6 +4378,9 @@ int main(int argc, char ** argv) {
res.set_content("Error: gzip is not supported by this browser", "text/plain");
} else {
res.set_header("Content-Encoding", "gzip");
// COEP and COOP headers, required by pyodide (python interpreter)
res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
res.set_header("Cross-Origin-Opener-Policy", "same-origin");
res.set_content(reinterpret_cast<const char*>(index_html_gz), index_html_gz_len, "text/html; charset=utf-8");
}
return false;

examples/server/webui/.gitignore (new file, 24 lines)

@ -0,0 +1,24 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*
node_modules
dist
dist-ssr
*.local
# Editor directories and files
.vscode/*
!.vscode/extensions.json
.idea
.DS_Store
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?


@ -0,0 +1,10 @@
**/.vscode
**/.github
**/.git
**/.svn
**/.hg
**/node_modules
**/dist
**/build
*.config.js


@ -0,0 +1,26 @@
import js from '@eslint/js'
import globals from 'globals'
import reactHooks from 'eslint-plugin-react-hooks'
import reactRefresh from 'eslint-plugin-react-refresh'
import tseslint from 'typescript-eslint'
export default tseslint.config(
{ ignores: ['dist'] },
{
extends: [js.configs.recommended, ...tseslint.configs.recommended],
files: ['**/*.{ts,tsx}'],
languageOptions: {
ecmaVersion: 2020,
globals: globals.browser,
},
plugins: {
'react-hooks': reactHooks,
'react-refresh': reactRefresh,
},
rules: {
...reactHooks.configs.recommended.rules,
'react-refresh/only-export-components': 'off',
'@typescript-eslint/no-unused-vars': 'off',
},
},
)


@ -1,341 +1,16 @@
<!DOCTYPE html>
<!doctype html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
<meta name="color-scheme" content="light dark">
<title>🦙 llama.cpp - chat</title>
</head>
<body>
<div id="app" class="opacity-0"> <!-- opacity-0 will be removed on app mounted -->
<div class="flex flex-row drawer lg:drawer-open">
<input id="toggle-drawer" type="checkbox" class="drawer-toggle" checked />
<!-- sidebar -->
<div class="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
<label for="toggle-drawer" aria-label="close sidebar" class="drawer-overlay"></label>
<div class="flex flex-col bg-base-200 min-h-full max-w-64 py-4 px-4">
<div class="flex flex-row items-center justify-between mb-4 mt-4">
<h2 class="font-bold ml-4">Conversations</h2>
<!-- close sidebar button -->
<label for="toggle-drawer" class="btn btn-ghost lg:hidden">
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-arrow-bar-left" viewBox="0 0 16 16">
<path fill-rule="evenodd" d="M12.5 15a.5.5 0 0 1-.5-.5v-13a.5.5 0 0 1 1 0v13a.5.5 0 0 1-.5.5M10 8a.5.5 0 0 1-.5.5H3.707l2.147 2.146a.5.5 0 0 1-.708.708l-3-3a.5.5 0 0 1 0-.708l3-3a.5.5 0 1 1 .708.708L3.707 7.5H9.5a.5.5 0 0 1 .5.5"/>
</svg>
</label>
</div>
<!-- list of conversations -->
<div :class="{
'btn btn-ghost justify-start': true,
'btn-active': messages.length === 0,
}" @click="newConversation">
+ New conversation
</div>
<div v-for="conv in conversations" :class="{
'btn btn-ghost justify-start font-normal': true,
'btn-active': conv.id === viewingConvId,
}" @click="setViewingConv(conv.id)" dir="auto">
<span class="truncate">{{ conv.messages[0].content }}</span>
</div>
<div class="text-center text-xs opacity-40 mt-auto mx-4">
Conversations are saved to browser's localStorage
</div>
</div>
</div>
<!-- main view -->
<div class="chat-screen drawer-content grow flex flex-col h-screen w-screen mx-auto px-4">
<!-- header -->
<div class="flex flex-row items-center mt-6 mb-6">
<!-- open sidebar button -->
<label for="toggle-drawer" class="btn btn-ghost lg:hidden">
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-list" viewBox="0 0 16 16">
<path fill-rule="evenodd" d="M2.5 12a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5"/>
</svg>
</label>
<div class="grow text-2xl font-bold ml-2">llama.cpp</div>
<!-- action buttons (top right) -->
<div class="flex items-center">
<div v-if="messages.length > 0" class="dropdown dropdown-end">
<!-- "..." button -->
<button tabindex="0" role="button" class="btn m-1" :disabled="isGenerating">
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-three-dots-vertical" viewBox="0 0 16 16">
<path d="M9.5 13a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0"/>
</svg>
</button>
<!-- "delete" dropdown menu -->
<ul tabindex="0" class="dropdown-content menu bg-base-100 rounded-box z-[1] w-52 p-2 shadow">
<li @click="downloadConv(viewingConvId)"><a>Download</a></li>
<li class="text-error" @click="deleteConv(viewingConvId)"><a>Delete</a></li>
</ul>
</div>
<div class="tooltip tooltip-bottom" data-tip="Settings">
<button class="btn" @click="showConfigDialog = true" :disabled="isGenerating">
<!-- settings button -->
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-gear" viewBox="0 0 16 16">
<path d="M8 4.754a3.246 3.246 0 1 0 0 6.492 3.246 3.246 0 0 0 0-6.492M5.754 8a2.246 2.246 0 1 1 4.492 0 2.246 2.246 0 0 1-4.492 0"/>
<path d="M9.796 1.343c-.527-1.79-3.065-1.79-3.592 0l-.094.319a.873.873 0 0 1-1.255.52l-.292-.16c-1.64-.892-3.433.902-2.54 2.541l.159.292a.873.873 0 0 1-.52 1.255l-.319.094c-1.79.527-1.79 3.065 0 3.592l.319.094a.873.873 0 0 1 .52 1.255l-.16.292c-.892 1.64.901 3.434 2.541 2.54l.292-.159a.873.873 0 0 1 1.255.52l.094.319c.527 1.79 3.065 1.79 3.592 0l.094-.319a.873.873 0 0 1 1.255-.52l.292.16c1.64.893 3.434-.902 2.54-2.541l-.159-.292a.873.873 0 0 1 .52-1.255l.319-.094c1.79-.527 1.79-3.065 0-3.592l-.319-.094a.873.873 0 0 1-.52-1.255l.16-.292c.893-1.64-.902-3.433-2.541-2.54l-.292.159a.873.873 0 0 1-1.255-.52zm-2.633.283c.246-.835 1.428-.835 1.674 0l.094.319a1.873 1.873 0 0 0 2.693 1.115l.291-.16c.764-.415 1.6.42 1.184 1.185l-.159.292a1.873 1.873 0 0 0 1.116 2.692l.318.094c.835.246.835 1.428 0 1.674l-.319.094a1.873 1.873 0 0 0-1.115 2.693l.16.291c.415.764-.42 1.6-1.185 1.184l-.291-.159a1.873 1.873 0 0 0-2.693 1.116l-.094.318c-.246.835-1.428.835-1.674 0l-.094-.319a1.873 1.873 0 0 0-2.692-1.115l-.292.16c-.764.415-1.6-.42-1.184-1.185l.159-.291A1.873 1.873 0 0 0 1.945 8.93l-.319-.094c-.835-.246-.835-1.428 0-1.674l.319-.094A1.873 1.873 0 0 0 3.06 4.377l-.16-.292c-.415-.764.42-1.6 1.185-1.184l.292.159a1.873 1.873 0 0 0 2.692-1.115z"/>
</svg>
</button>
</div>
<!-- theme controller is copied from https://daisyui.com/components/theme-controller/ -->
<div class="tooltip tooltip-bottom" data-tip="Themes">
<div class="dropdown dropdown-end dropdown-bottom">
<div tabindex="0" role="button" class="btn m-1">
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-palette2" viewBox="0 0 16 16">
<path d="M0 .5A.5.5 0 0 1 .5 0h5a.5.5 0 0 1 .5.5v5.277l4.147-4.131a.5.5 0 0 1 .707 0l3.535 3.536a.5.5 0 0 1 0 .708L10.261 10H15.5a.5.5 0 0 1 .5.5v5a.5.5 0 0 1-.5.5H3a3 3 0 0 1-2.121-.879A3 3 0 0 1 0 13.044m6-.21 7.328-7.3-2.829-2.828L6 7.188zM4.5 13a1.5 1.5 0 1 0-3 0 1.5 1.5 0 0 0 3 0M15 15v-4H9.258l-4.015 4zM0 .5v12.495zm0 12.495V13z"/>
</svg>
</div>
<ul tabindex="0" class="dropdown-content bg-base-300 rounded-box z-[1] w-52 p-2 shadow-2xl h-80 overflow-y-auto">
<li>
<button
class="btn btn-sm btn-block btn-ghost justify-start"
:class="{ 'btn-active': selectedTheme === 'auto' }"
@click="setSelectedTheme('auto')">
auto
</button>
</li>
<li v-for="theme in themes">
<input
type="radio"
name="theme-dropdown"
class="theme-controller btn btn-sm btn-block btn-ghost justify-start"
:aria-label="theme"
:value="theme"
:checked="selectedTheme === theme"
@click="setSelectedTheme(theme)" />
</li>
</ul>
</div>
</div>
</div>
</div>
<!-- chat messages -->
<div id="messages-list" class="flex flex-col grow overflow-y-auto">
<div class="mt-auto flex justify-center">
<!-- placeholder to shift the message to the bottom -->
{{ messages.length === 0 ? 'Send a message to start' : '' }}
</div>
<div v-for="msg in messages" class="group">
<message-bubble
:config="config"
:msg="msg"
:key="msg.id"
:is-generating="isGenerating"
:edit-user-msg-and-regenerate="editUserMsgAndRegenerate"
:regenerate-msg="regenerateMsg"></message-bubble>
</div>
<!-- pending (ongoing) assistant message -->
<div id="pending-msg" class="group">
<message-bubble
v-if="pendingMsg"
:config="config"
:msg="pendingMsg"
:key="pendingMsg.id"
:is-generating="isGenerating"
:show-thought-in-progress="config.showThoughtInProgress"
:edit-user-msg-and-regenerate="() => {}"
:regenerate-msg="() => {}"></message-bubble>
</div>
</div>
<!-- chat input -->
<div class="flex flex-row items-center mt-8 mb-6">
<textarea
class="textarea textarea-bordered w-full"
placeholder="Type a message (Shift+Enter to add a new line)"
v-model="inputMsg"
@keydown.enter.exact.prevent="sendMessage"
id="msg-input"
dir="auto"
></textarea>
<button v-if="!isGenerating" class="btn btn-primary ml-2" @click="sendMessage" :disabled="inputMsg.length === 0">Send</button>
<button v-else class="btn btn-neutral ml-2" @click="stopGeneration">Stop</button>
</div>
</div>
</div>
<!-- modal for editing config -->
<dialog class="modal" :class="{'modal-open': showConfigDialog}">
<div class="modal-box">
<h3 class="text-lg font-bold mb-6">Settings</h3>
<div class="h-[calc(90vh-12rem)] overflow-y-auto">
<p class="opacity-40 mb-6">Settings below are saved in browser's localStorage</p>
<settings-modal-short-input :config-key="'apiKey'" :config-default="configDefault" :config-info="configInfo" v-model="config.apiKey"></settings-modal-short-input>
<label class="form-control mb-2">
<div class="label">System Message</div>
<textarea class="textarea textarea-bordered h-24" :placeholder="'Default: ' + configDefault.systemMessage" v-model="config.systemMessage"></textarea>
</label>
<template v-for="configKey in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
<settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
</template>
<!-- TODO: add more sampling-related configs, please regroup them into different "collapse" sections -->
<!-- Section: Other sampler settings -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Other sampler settings</summary>
<div class="collapse-content">
<!-- Samplers queue -->
<settings-modal-short-input label="Samplers queue" :config-key="'samplers'" :config-default="configDefault" :config-info="configInfo" v-model="config.samplers"></settings-modal-short-input>
<!-- Samplers -->
<template v-for="configKey in ['dynatemp_range', 'dynatemp_exponent', 'typical_p', 'xtc_probability', 'xtc_threshold']">
<settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
</template>
</div>
</details>
<!-- Section: Penalties settings -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Penalties settings</summary>
<div class="collapse-content">
<template v-for="configKey in ['repeat_last_n', 'repeat_penalty', 'presence_penalty', 'frequency_penalty', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_penalty_last_n']">
<settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
</template>
</div>
</details>
<!-- Section: Reasoning models -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Reasoning models</summary>
<div class="collapse-content">
<div class="flex flex-row items-center mb-2">
<input type="checkbox" class="checkbox" v-model="config.showThoughtInProgress" />
<span class="ml-4">Expand though process by default for generating message</span>
</div>
<div class="flex flex-row items-center mb-2">
<input type="checkbox" class="checkbox" v-model="config.excludeThoughtOnReq" />
<span class="ml-4">Exclude thought process when sending request to API (Recommended for DeepSeek-R1)</span>
</div>
</div>
</details>
<!-- Section: Advanced config -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Advanced config</summary>
<div class="collapse-content">
<div class="flex flex-row items-center mb-2" v-if="isDev">
<!-- this button only shows in dev mode, used to import a demo conversation to test message rendering -->
<button class="btn" @click="debugImportDemoConv()">(debug) Import demo conversation</button>
</div>
<div class="flex flex-row items-center mb-2">
<input type="checkbox" class="checkbox" v-model="config.showTokensPerSecond" />
<span class="ml-4">Show tokens per second</span>
</div>
<label class="form-control mb-2">
<!-- Custom parameters input -->
<div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
<textarea class="textarea textarea-bordered h-24" placeholder="Example: { &quot;mirostat&quot;: 1, &quot;min_p&quot;: 0.1 }" v-model="config.custom"></textarea>
</label>
</div>
</details>
</div>
<!-- action buttons -->
<div class="modal-action">
<button class="btn" @click="resetConfigDialog">Reset to default</button>
<button class="btn" @click="closeAndDiscardConfigDialog">Close</button>
<button class="btn btn-primary" @click="closeAndSaveConfigDialog">Save</button>
</div>
</div>
</dialog>
</div>
<!-- Template to be used as message bubble -->
<template id="message-bubble">
<div :class="{
'chat': true,
'chat-start': msg.role !== 'user',
'chat-end': msg.role === 'user',
}">
<div :class="{
'chat-bubble markdown': true,
'chat-bubble-base-300': msg.role !== 'user',
}">
<!-- textarea for editing message -->
<template v-if="editingContent !== null">
<textarea
dir="auto"
class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
v-model="editingContent"></textarea>
<br/>
<button class="btn btn-ghost mt-2 mr-2" @click="editingContent = null">Cancel</button>
<button class="btn mt-2" @click="editMsg()">Submit</button>
</template>
<template v-else>
<!-- show loading dots for pending message -->
<span v-if="msg.content === null" class="loading loading-dots loading-md"></span>
<!-- render message as markdown -->
<div v-else dir="auto">
<details v-if="msg.role === 'assistant' && splitMsgContent.cot" class="collapse bg-base-200 collapse-arrow mb-4" :open="splitMsgContent.isThinking && showThoughtInProgress">
<summary class="collapse-title">
<span v-if="splitMsgContent.isThinking">
<span v-if="isGenerating" class="loading loading-spinner loading-md mr-2" style="vertical-align: middle;"></span>
<b>Thinking</b>
</span>
<b v-else>Thought Process</b>
</summary>
<vue-markdown :source="splitMsgContent.cot" dir="auto" class="collapse-content"></vue-markdown>
</details>
<vue-markdown :source="splitMsgContent.content"></vue-markdown>
</div>
<!-- render timings if enabled -->
<div class="dropdown dropdown-hover dropdown-top mt-2" v-if="timings && config.showTokensPerSecond">
<div tabindex="0" role="button" class="cursor-pointer font-semibold text-sm opacity-60">Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s</div>
<div class="dropdown-content bg-base-100 z-10 w-64 p-2 shadow mt-4">
<b>Prompt</b><br/>
- Tokens: {{ timings.prompt_n }}<br/>
- Time: {{ timings.prompt_ms }} ms<br/>
- Speed: {{ timings.prompt_per_second.toFixed(1) }} t/s<br/>
<b>Generation</b><br/>
- Tokens: {{ timings.predicted_n }}<br/>
- Time: {{ timings.predicted_ms }} ms<br/>
- Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s<br/>
</div>
</div>
</template>
</div>
</div>
<!-- actions for each message -->
<div :class="{'text-right': msg.role === 'user', 'opacity-0': isGenerating}" class="mx-4 mt-2 mb-2">
<!-- user message -->
<button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingContent = msg.content" :disabled="isGenerating">
✍️ Edit
</button>
<!-- assistant message -->
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
🔄 Regenerate
</button>
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg()" :disabled="isGenerating">
📋 Copy
</button>
</div>
</template>
<!-- Template to be used by settings modal -->
<template id="settings-modal-short-input">
<label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
<!-- Show help message on hovering on the input label -->
<div class="dropdown dropdown-hover">
<div tabindex="0" role="button" class="font-bold">{{ label || configKey }}</div>
<div class="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
{{ configInfo[configKey] || '(no help message available)' }}
</div>
</div>
<!-- Here we forward v-model from parent to child component, see: https://stackoverflow.com/questions/47311936/v-model-and-child-components -->
<input type="text" class="grow" :placeholder="'Default: ' + (configDefault[configKey] || 'none')" :value="modelValue" @input="$emit('update:modelValue', $event.target.value)" />
</label>
</template>
<script type="module" src="/src/main.js"></script>
</body>
<head>
<meta charset="UTF-8" />
<meta
name="viewport"
content="width=device-width, initial-scale=1, maximum-scale=1"
/>
<meta name="color-scheme" content="light dark" />
<title>🦙 llama.cpp - chat</title>
</head>
<body>
<div id="root"></div>
<script type="module" src="/src/main.tsx"></script>
</body>
</html>

File diff suppressed because it is too large

View file

@@ -5,26 +5,55 @@
"type": "module",
"scripts": {
"dev": "vite",
"build": "vite build",
"preview": "vite preview",
"analyze": "ANALYZE=1 npx vite-bundle-visualizer"
},
"devDependencies": {
"sass-embedded": "^1.83.0",
"vite": "^5.4.10"
"build": "tsc -b && vite build",
"format": "eslint . && prettier --write .",
"lint": "eslint .",
"preview": "vite preview"
},
"dependencies": {
"@heroicons/react": "^2.2.0",
"@sec-ant/readable-stream": "^0.6.0",
"@vscode/markdown-it-katex": "^1.1.1",
"autoprefixer": "^10.4.20",
"daisyui": "^4.12.14",
"highlight.js": "^11.10.0",
"katex": "^0.16.15",
"markdown-it": "^14.1.0",
"postcss": "^8.4.49",
"react": "^18.3.1",
"react-dom": "^18.3.1",
"react-markdown": "^9.0.3",
"react-router": "^7.1.5",
"rehype-highlight": "^7.0.2",
"rehype-katex": "^7.0.1",
"remark-breaks": "^4.0.0",
"remark-gfm": "^4.0.0",
"remark-math": "^6.0.0",
"tailwindcss": "^3.4.15",
"textlinestream": "^1.1.1",
"vite-plugin-singlefile": "^2.0.3",
"vue": "^3.5.13"
"vite-plugin-singlefile": "^2.0.3"
},
"devDependencies": {
"@eslint/js": "^9.17.0",
"@types/markdown-it": "^14.1.2",
"@types/node": "^22.13.1",
"@types/react": "^18.3.18",
"@types/react-dom": "^18.3.5",
"@vitejs/plugin-react": "^4.3.4",
"eslint": "^9.17.0",
"eslint-plugin-react-hooks": "^5.0.0",
"eslint-plugin-react-refresh": "^0.4.16",
"globals": "^15.14.0",
"prettier": "^3.4.2",
"sass-embedded": "^1.83.4",
"typescript": "~5.6.2",
"typescript-eslint": "^8.18.2",
"vite": "^6.0.5"
},
"prettier": {
"trailingComma": "es5",
"tabWidth": 2,
"semi": true,
"singleQuote": true,
"bracketSameLine": false
}
}

View file

@@ -11,7 +11,7 @@
{
"id": 1734087548327,
"role": "assistant",
"content": "This is the formula:\n\n$\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}$\n\nGiven an input vector \\(\\mathbf{x} = [x_1, x_2, \\ldots, x_n]\\)\n\n\\[\ny_i = \\frac{e^{x_i}}{\\sum_{j=1}^n e^{x_j}}\n\\]\n\nCode block latex:\n```latex\n\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}\n```\n\nTest dollar sign: $1234 $4567\n\nInvalid latex syntax: $E = mc^$ and $$E = mc^$$",
"content": "This is the formula:\n\n$\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}$\n\nGiven an input vector \\(\\mathbf{x} = [x_1, x_2, \\ldots, x_n]\\)\n\n\\[\ny_i = \\frac{e^{x_i}}{\\sum_{j=1}^n e^{x_j}}\n\\]\n\n$2x + y = z$\n\nCode block latex:\n```latex\n\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}\n```\n\nTest dollar sign: $1234 $4567\n\nInvalid latex syntax: $E = mc^$ and $$E = mc^$$",
"timings": {
"prompt_n": 1,
"prompt_ms": 28.923,

View file

@@ -0,0 +1,47 @@
import { HashRouter, Outlet, Route, Routes } from 'react-router';
import Header from './components/Header';
import Sidebar from './components/Sidebar';
import { AppContextProvider, useAppContext } from './utils/app.context';
import ChatScreen from './components/ChatScreen';
import SettingDialog from './components/SettingDialog';
function App() {
return (
<HashRouter>
<div className="flex flex-row drawer lg:drawer-open">
<AppContextProvider>
<Routes>
<Route element={<AppLayout />}>
<Route path="/chat/:convId" element={<ChatScreen />} />
<Route path="*" element={<ChatScreen />} />
</Route>
</Routes>
</AppContextProvider>
</div>
</HashRouter>
);
}
function AppLayout() {
const { showSettings, setShowSettings } = useAppContext();
return (
<>
<Sidebar />
<div
className="drawer-content grow flex flex-col h-screen w-screen mx-auto px-4 overflow-auto"
id="main-scroll"
>
<Header />
<Outlet />
</div>
{
<SettingDialog
show={showSettings}
onClose={() => setShowSettings(false)}
/>
}
</>
);
}
export default App;

View file

@@ -0,0 +1,92 @@
import daisyuiThemes from 'daisyui/src/theming/themes';
import { isNumeric } from './utils/misc';
export const isDev = import.meta.env.MODE === 'development';
// constants
export const BASE_URL = new URL('.', document.baseURI).href
.toString()
.replace(/\/$/, '');
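// Example (illustrative): with document.baseURI === 'http://localhost:8080/index.html',
// new URL('.', ...) yields 'http://localhost:8080/' and BASE_URL becomes 'http://localhost:8080'.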
export const CONFIG_DEFAULT = {
// Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc.) if you want to change the default value. Do not use null or undefined as a default value.
// Do not use nested objects, keep it single level. Prefix the key if you need to group them.
apiKey: '',
systemMessage: 'You are a helpful assistant.',
showTokensPerSecond: false,
showThoughtInProgress: false,
excludeThoughtOnReq: true,
// make sure these default values are in sync with `common.h`
samplers: 'edkypmxt',
temperature: 0.8,
dynatemp_range: 0.0,
dynatemp_exponent: 1.0,
top_k: 40,
top_p: 0.95,
min_p: 0.05,
xtc_probability: 0.0,
xtc_threshold: 0.1,
typical_p: 1.0,
repeat_last_n: 64,
repeat_penalty: 1.0,
presence_penalty: 0.0,
frequency_penalty: 0.0,
dry_multiplier: 0.0,
dry_base: 1.75,
dry_allowed_length: 2,
dry_penalty_last_n: -1,
max_tokens: -1,
custom: '', // custom json-stringified object
// experimental features
pyIntepreterEnabled: false,
};
export const CONFIG_INFO: Record<string, string> = {
apiKey: 'Set the API Key if you are using --api-key option for the server.',
systemMessage: 'The starting message that defines how the model should behave.',
samplers:
'The order in which samplers are applied, in a simplified way. Default is "edkypmxt": penalties->dry->top_k->typ_p->top_p->min_p->xtc->temperature',
temperature:
'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
dynatemp_range:
'Add-on for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by the entropy of tokens.',
dynatemp_exponent:
'Add-on for the temperature sampler. Smooths out the probability redistribution based on the most probable token.',
top_k: 'Keeps only k top tokens.',
top_p:
'Limits tokens to those that together have a cumulative probability of at least p.',
min_p:
'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.',
xtc_probability:
'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.',
xtc_threshold:
'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.',
typical_p:
'Sorts and limits tokens based on the difference between log-probability and entropy.',
repeat_last_n: 'Last n tokens to consider for penalizing repetition.',
repeat_penalty:
'Controls the repetition of token sequences in the generated text.',
presence_penalty:
'Limits tokens based on whether they appear in the output or not.',
frequency_penalty:
'Limits tokens based on how often they appear in the output.',
dry_multiplier:
'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.',
dry_base:
'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.',
dry_allowed_length:
'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.',
dry_penalty_last_n:
'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets DRY penalty for the last n tokens.',
max_tokens: 'The maximum number of tokens per output.',
custom: '', // custom json-stringified object
};
// config keys having numeric values (e.g. temperature, top_k, top_p, etc.)
export const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT)
.filter((e) => isNumeric(e[1]))
.map((e) => e[0]);
// list of themes supported by daisyui
export const THEMES = ['light', 'dark']
// make sure light & dark are always at the beginning
.concat(
Object.keys(daisyuiThemes).filter((t) => t !== 'light' && t !== 'dark')
);

View file

@@ -0,0 +1,195 @@
import { useEffect, useState } from 'react';
import { useAppContext } from '../utils/app.context';
import { OpenInNewTab, XCloseButton } from '../utils/common';
import { CanvasType } from '../utils/types';
import { PlayIcon, StopIcon } from '@heroicons/react/24/outline';
import StorageUtils from '../utils/storage';
const canInterrupt = typeof SharedArrayBuffer === 'function';
// adapted from https://pyodide.org/en/stable/usage/webworker.html
const WORKER_CODE = `
importScripts("https://cdn.jsdelivr.net/pyodide/v0.27.2/full/pyodide.js");
let stdOutAndErr = [];
let pyodideReadyPromise = loadPyodide({
stdout: (data) => stdOutAndErr.push(data),
stderr: (data) => stdOutAndErr.push(data),
});
let alreadySetBuff = false;
self.onmessage = async (event) => {
stdOutAndErr = [];
// make sure loading is done
const pyodide = await pyodideReadyPromise;
const { id, python, context, interruptBuffer } = event.data;
if (interruptBuffer && !alreadySetBuff) {
pyodide.setInterruptBuffer(interruptBuffer);
alreadySetBuff = true;
}
// Now load any packages we need, run the code, and send the result back.
await pyodide.loadPackagesFromImports(python);
// make a Python dictionary with the data from content
const dict = pyodide.globals.get("dict");
const globals = dict(Object.entries(context));
try {
self.postMessage({ id, running: true });
// Execute the python code in this context
const result = pyodide.runPython(python, { globals });
self.postMessage({ result, id, stdOutAndErr });
} catch (error) {
self.postMessage({ error: error.message, id });
}
// guard: interruptBuffer is null when SharedArrayBuffer is unavailable
if (interruptBuffer) interruptBuffer[0] = 0;
};
`;
let worker: Worker;
const interruptBuffer = canInterrupt
? new Uint8Array(new SharedArrayBuffer(1))
: null;
const startWorker = () => {
if (!worker) {
worker = new Worker(
URL.createObjectURL(new Blob([WORKER_CODE], { type: 'text/javascript' }))
);
}
};
if (StorageUtils.getConfig().pyIntepreterEnabled) {
startWorker();
}
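// Pre-warming note: starting the worker at page load (when the interpreter is enabled)
// means the first "Run" click does not pay the full Pyodide download/initialization cost.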
const runCodeInWorker = (
pyCode: string,
callbackRunning: () => void
): {
donePromise: Promise<string>;
interrupt: () => void;
} => {
startWorker();
const id = Math.random() * 1e8;
const context = {};
if (interruptBuffer) {
interruptBuffer[0] = 0;
}
const donePromise = new Promise<string>((resolve) => {
worker.onmessage = (event) => {
const { error, stdOutAndErr, running } = event.data;
if (id !== event.data.id) return;
if (running) {
callbackRunning();
return;
} else if (error) {
resolve(error.toString());
} else {
resolve(stdOutAndErr.join('\n'));
}
};
worker.postMessage({ id, python: pyCode, context, interruptBuffer });
});
const interrupt = () => {
console.log('Interrupting...');
console.trace();
if (interruptBuffer) {
interruptBuffer[0] = 2;
}
};
return { donePromise, interrupt };
};
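// Usage sketch (illustrative, not part of the original file):
//   const { donePromise, interrupt } = runCodeInWorker('print(1 + 1)', () => {});
//   donePromise.then((out) => console.log(out)); // logs "2" once Pyodide finishes
// interrupt() only has an effect when SharedArrayBuffer is available (see canInterrupt).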
export default function CanvasPyInterpreter() {
const { canvasData, setCanvasData } = useAppContext();
const [code, setCode] = useState(canvasData?.content ?? ''); // copy to avoid direct mutation
const [running, setRunning] = useState(false);
const [output, setOutput] = useState('');
const [interruptFn, setInterruptFn] = useState<() => void>();
const [showStopBtn, setShowStopBtn] = useState(false);
const runCode = async (pycode: string) => {
interruptFn?.();
setRunning(true);
setOutput('Loading Pyodide...');
const { donePromise, interrupt } = runCodeInWorker(pycode, () => {
setOutput('Running...');
setShowStopBtn(canInterrupt);
});
setInterruptFn(() => interrupt);
const out = await donePromise;
setOutput(out);
setRunning(false);
setShowStopBtn(false);
};
// run code on mount
useEffect(() => {
setCode(canvasData?.content ?? '');
runCode(canvasData?.content ?? '');
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [canvasData?.content]);
if (canvasData?.type !== CanvasType.PY_INTERPRETER) {
return null;
}
return (
<div className="card bg-base-200 w-full h-full shadow-xl">
<div className="card-body">
<div className="flex justify-between items-center mb-4">
<span className="text-lg font-bold">Python Interpreter</span>
<XCloseButton
className="bg-base-100"
onClick={() => setCanvasData(null)}
/>
</div>
<div className="grid grid-rows-3 gap-4 h-full">
<textarea
className="textarea textarea-bordered w-full h-full font-mono"
value={code}
onChange={(e) => setCode(e.target.value)}
></textarea>
<div className="font-mono flex flex-col row-span-2">
<div className="flex items-center mb-2">
<button
className="btn btn-sm bg-base-100"
onClick={() => runCode(code)}
disabled={running}
>
<PlayIcon className="h-6 w-6" /> Run
</button>
{showStopBtn && (
<button
className="btn btn-sm bg-base-100 ml-2"
onClick={() => interruptFn?.()}
>
<StopIcon className="h-6 w-6" /> Stop
</button>
)}
<span className="grow text-right text-xs">
<OpenInNewTab href="https://github.com/ggerganov/llama.cpp/issues/11762">
Report a bug
</OpenInNewTab>
</span>
</div>
<textarea
className="textarea textarea-bordered h-full dark-color"
value={output}
readOnly
></textarea>
</div>
</div>
</div>
</div>
);
}

View file

@@ -0,0 +1,235 @@
import { useMemo, useState } from 'react';
import { useAppContext } from '../utils/app.context';
import { Message, PendingMessage } from '../utils/types';
import { classNames } from '../utils/misc';
import MarkdownDisplay, { CopyButton } from './MarkdownDisplay';
interface SplitMessage {
content: PendingMessage['content'];
thought?: string;
isThinking?: boolean;
}
export default function ChatMessage({
msg,
id,
scrollToBottom,
isPending,
}: {
msg: Message | PendingMessage;
id?: string;
scrollToBottom: (requiresNearBottom: boolean) => void;
isPending?: boolean;
}) {
const { viewingConversation, replaceMessageAndGenerate, config } =
useAppContext();
const [editingContent, setEditingContent] = useState<string | null>(null);
const timings = useMemo(
() =>
msg.timings
? {
...msg.timings,
prompt_per_second:
(msg.timings.prompt_n / msg.timings.prompt_ms) * 1000,
predicted_per_second:
(msg.timings.predicted_n / msg.timings.predicted_ms) * 1000,
}
: null,
[msg.timings]
);
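// Worked example: with the demo conversation's timings (prompt_n = 1, prompt_ms = 28.923),
// prompt_per_second is (1 / 28.923) * 1000 ≈ 34.6 t/s.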
// for reasoning model, we split the message into content and thought
// TODO: implement this as a remark/rehype plugin in the future
const { content, thought, isThinking }: SplitMessage = useMemo(() => {
if (msg.content === null || msg.role !== 'assistant') {
return { content: msg.content };
}
let actualContent = '';
let thought = '';
let isThinking = false;
let thinkSplit = msg.content.split('<think>', 2);
actualContent += thinkSplit[0];
while (thinkSplit[1] !== undefined) {
// <think> tag found
thinkSplit = thinkSplit[1].split('</think>', 2);
thought += thinkSplit[0];
isThinking = true;
if (thinkSplit[1] !== undefined) {
// </think> closing tag found
isThinking = false;
thinkSplit = thinkSplit[1].split('<think>', 2);
actualContent += thinkSplit[0];
}
}
return { content: actualContent, thought, isThinking };
}, [msg]);
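// Worked example (illustrative): '<think>step 1</think>Answer' splits into
// { content: 'Answer', thought: 'step 1', isThinking: false }; a stream that has only
// reached '<think>step 1' yields { content: '', thought: 'step 1', isThinking: true }.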
if (!viewingConversation) return null;
const regenerate = async () => {
replaceMessageAndGenerate(viewingConversation.id, msg.id, undefined, () =>
scrollToBottom(true)
);
};
return (
<div className="group" id={id}>
<div
className={classNames({
chat: true,
'chat-start': msg.role !== 'user',
'chat-end': msg.role === 'user',
})}
>
<div
className={classNames({
'chat-bubble markdown': true,
'chat-bubble-base-300': msg.role !== 'user',
})}
>
{/* textarea for editing message */}
{editingContent !== null && (
<>
<textarea
dir="auto"
className="textarea textarea-bordered bg-base-100 text-base-content max-w-2xl w-[calc(90vw-8em)] h-24"
value={editingContent}
onChange={(e) => setEditingContent(e.target.value)}
></textarea>
<br />
<button
className="btn btn-ghost mt-2 mr-2"
onClick={() => setEditingContent(null)}
>
Cancel
</button>
<button
className="btn mt-2"
onClick={() =>
replaceMessageAndGenerate(
viewingConversation.id,
msg.id,
editingContent
)
}
>
Submit
</button>
</>
)}
{/* not editing content, render message */}
{editingContent === null && (
<>
{content === null ? (
<>
{/* show loading dots for pending message */}
<span className="loading loading-dots loading-md"></span>
</>
) : (
<>
{/* render message as markdown */}
<div dir="auto">
{thought && (
<details
className="collapse bg-base-200 collapse-arrow mb-4"
open={isThinking && config.showThoughtInProgress}
>
<summary className="collapse-title">
{isPending && isThinking ? (
<span>
<span
v-if="isGenerating"
className="loading loading-spinner loading-md mr-2"
style={{ verticalAlign: 'middle' }}
></span>
<b>Thinking</b>
</span>
) : (
<b>Thought Process</b>
)}
</summary>
<div className="collapse-content">
<MarkdownDisplay
content={thought}
isGenerating={isPending}
/>
</div>
</details>
)}
<MarkdownDisplay
content={content}
isGenerating={isPending}
/>
</div>
</>
)}
{/* render timings if enabled */}
{timings && config.showTokensPerSecond && (
<div className="dropdown dropdown-hover dropdown-top mt-2">
<div
tabIndex={0}
role="button"
className="cursor-pointer font-semibold text-sm opacity-60"
>
Speed: {timings.predicted_per_second.toFixed(1)} t/s
</div>
<div className="dropdown-content bg-base-100 z-10 w-64 p-2 shadow mt-4">
<b>Prompt</b>
<br />- Tokens: {timings.prompt_n}
<br />- Time: {timings.prompt_ms} ms
<br />- Speed: {timings.prompt_per_second.toFixed(1)} t/s
<br />
<b>Generation</b>
<br />- Tokens: {timings.predicted_n}
<br />- Time: {timings.predicted_ms} ms
<br />- Speed: {timings.predicted_per_second.toFixed(1)} t/s
<br />
</div>
</div>
)}
</>
)}
</div>
</div>
{/* actions for each message */}
{msg.content !== null && (
<div
className={classNames({
'mx-4 mt-2 mb-2': true,
'text-right': msg.role === 'user',
})}
>
{/* user message */}
{msg.role === 'user' && (
<button
className="badge btn-mini show-on-hover"
onClick={() => setEditingContent(msg.content)}
disabled={msg.content === null}
>
Edit
</button>
)}
{/* assistant message */}
{msg.role === 'assistant' && (
<>
{!isPending && (
<button
className="badge btn-mini show-on-hover mr-2"
onClick={regenerate}
disabled={msg.content === null}
>
🔄 Regenerate
</button>
)}
<CopyButton
className="badge btn-mini show-on-hover mr-2"
content={msg.content}
/>
</>
)}
</div>
)}
</div>
);
}

View file

@@ -0,0 +1,146 @@
import { useEffect, useState } from 'react';
import { useAppContext } from '../utils/app.context';
import StorageUtils from '../utils/storage';
import { useNavigate } from 'react-router';
import ChatMessage from './ChatMessage';
import { CanvasType, PendingMessage } from '../utils/types';
import { classNames } from '../utils/misc';
import CanvasPyInterpreter from './CanvasPyInterpreter';
export default function ChatScreen() {
const {
viewingConversation,
sendMessage,
isGenerating,
stopGenerating,
pendingMessages,
canvasData,
} = useAppContext();
const [inputMsg, setInputMsg] = useState('');
const navigate = useNavigate();
const currConvId = viewingConversation?.id ?? '';
const pendingMsg: PendingMessage | undefined = pendingMessages[currConvId];
const scrollToBottom = (requiresNearBottom: boolean) => {
const mainScrollElem = document.getElementById('main-scroll');
if (!mainScrollElem) return;
const spaceToBottom =
mainScrollElem.scrollHeight -
mainScrollElem.scrollTop -
mainScrollElem.clientHeight;
if (!requiresNearBottom || spaceToBottom < 50) {
setTimeout(
() => mainScrollElem.scrollTo({ top: mainScrollElem.scrollHeight }),
1
);
}
};
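// Note: requiresNearBottom = true only scrolls when the user is already within 50px of
// the bottom, so auto-scroll during generation does not interrupt reading older messages.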
// scroll to bottom when conversation changes
useEffect(() => {
scrollToBottom(false);
}, [viewingConversation?.id]);
const sendNewMessage = async () => {
if (inputMsg.trim().length === 0 || isGenerating(currConvId)) return;
const convId = viewingConversation?.id ?? StorageUtils.getNewConvId();
const lastInpMsg = inputMsg;
setInputMsg('');
if (!viewingConversation) {
// if user is creating a new conversation, redirect to the new conversation
navigate(`/chat/${convId}`);
}
scrollToBottom(false);
// auto scroll as message is being generated
const onChunk = () => scrollToBottom(true);
if (!(await sendMessage(convId, inputMsg, onChunk))) {
// restore the input message if failed
setInputMsg(lastInpMsg);
}
};
const hasCanvas = !!canvasData;
return (
<div
className={classNames({
'grid lg:gap-8 grow transition-[300ms]': true,
'grid-cols-[1fr_0fr] lg:grid-cols-[1fr_1fr]': hasCanvas, // adapted for mobile
'grid-cols-[1fr_0fr]': !hasCanvas,
})}
>
<div
className={classNames({
'flex flex-col w-full max-w-[900px] mx-auto': true,
'hidden lg:flex': hasCanvas, // adapted for mobile
flex: !hasCanvas,
})}
>
{/* chat messages */}
<div id="messages-list" className="grow">
<div className="mt-auto flex justify-center">
{/* placeholder to shift the message to the bottom */}
{viewingConversation ? '' : 'Send a message to start'}
</div>
{viewingConversation?.messages.map((msg) => (
<ChatMessage
key={msg.id}
msg={msg}
scrollToBottom={scrollToBottom}
/>
))}
{pendingMsg && (
<ChatMessage
msg={pendingMsg}
scrollToBottom={scrollToBottom}
isPending
id="pending-msg"
/>
)}
</div>
{/* chat input */}
<div className="flex flex-row items-center pt-8 pb-6 sticky bottom-0 bg-base-100">
<textarea
className="textarea textarea-bordered w-full"
placeholder="Type a message (Shift+Enter to add a new line)"
value={inputMsg}
onChange={(e) => setInputMsg(e.target.value)}
onKeyDown={(e) => {
if (e.key === 'Enter' && e.shiftKey) return;
if (e.key === 'Enter' && !e.shiftKey) {
e.preventDefault();
sendNewMessage();
}
}}
id="msg-input"
dir="auto"
></textarea>
{isGenerating(currConvId) ? (
<button
className="btn btn-neutral ml-2"
onClick={() => stopGenerating(currConvId)}
>
Stop
</button>
) : (
<button
className="btn btn-primary ml-2"
onClick={sendNewMessage}
disabled={inputMsg.trim().length === 0}
>
Send
</button>
)}
</div>
</div>
<div className="w-full sticky top-[7em] h-[calc(100vh-9em)]">
{canvasData?.type === CanvasType.PY_INTERPRETER && (
<CanvasPyInterpreter />
)}
</div>
</div>
);
}

View file

@@ -0,0 +1,176 @@
import { useEffect, useState } from 'react';
import StorageUtils from '../utils/storage';
import { useAppContext } from '../utils/app.context';
import { classNames } from '../utils/misc';
import daisyuiThemes from 'daisyui/src/theming/themes';
import { THEMES } from '../Config';
import { useNavigate } from 'react-router';
export default function Header() {
const navigate = useNavigate();
const [selectedTheme, setSelectedTheme] = useState(StorageUtils.getTheme());
const { setShowSettings } = useAppContext();
const setTheme = (theme: string) => {
StorageUtils.setTheme(theme);
setSelectedTheme(theme);
};
useEffect(() => {
document.body.setAttribute('data-theme', selectedTheme);
document.body.setAttribute(
'data-color-scheme',
// @ts-expect-error daisyuiThemes complains about index type, but it should work
daisyuiThemes[selectedTheme]?.['color-scheme'] ?? 'auto'
);
}, [selectedTheme]);
const { isGenerating, viewingConversation } = useAppContext();
const isCurrConvGenerating = isGenerating(viewingConversation?.id ?? '');
const removeConversation = () => {
if (isCurrConvGenerating || !viewingConversation) return;
const convId = viewingConversation.id;
if (window.confirm('Are you sure you want to delete this conversation?')) {
StorageUtils.remove(convId);
navigate('/');
}
};
const downloadConversation = () => {
if (isCurrConvGenerating || !viewingConversation) return;
const convId = viewingConversation.id;
const conversationJson = JSON.stringify(viewingConversation, null, 2);
const blob = new Blob([conversationJson], { type: 'application/json' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `conversation_${convId}.json`;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
URL.revokeObjectURL(url);
};
return (
<div className="flex flex-row items-center pt-6 pb-6 sticky top-0 z-10 bg-base-100">
{/* open sidebar button */}
<label htmlFor="toggle-drawer" className="btn btn-ghost lg:hidden">
<svg
xmlns="http://www.w3.org/2000/svg"
width="16"
height="16"
fill="currentColor"
className="bi bi-list"
viewBox="0 0 16 16"
>
<path
fillRule="evenodd"
d="M2.5 12a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5"
/>
</svg>
</label>
<div className="grow text-2xl font-bold ml-2">llama.cpp</div>
{/* action buttons (top right) */}
<div className="flex items-center">
<div v-if="messages.length > 0" className="dropdown dropdown-end">
{/* "..." button */}
<button
tabIndex={0}
role="button"
className="btn m-1"
disabled={isCurrConvGenerating}
>
<svg
xmlns="http://www.w3.org/2000/svg"
width="16"
height="16"
fill="currentColor"
className="bi bi-three-dots-vertical"
viewBox="0 0 16 16"
>
<path d="M9.5 13a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0" />
</svg>
</button>
{/* dropdown menu */}
<ul
tabIndex={0}
className="dropdown-content menu bg-base-100 rounded-box z-[1] w-52 p-2 shadow"
>
<li onClick={downloadConversation}>
<a>Download</a>
</li>
<li className="text-error" onClick={removeConversation}>
<a>Delete</a>
</li>
</ul>
</div>
<div className="tooltip tooltip-bottom" data-tip="Settings">
<button className="btn" onClick={() => setShowSettings(true)}>
{/* settings button */}
<svg
xmlns="http://www.w3.org/2000/svg"
width="16"
height="16"
fill="currentColor"
className="bi bi-gear"
viewBox="0 0 16 16"
>
<path d="M8 4.754a3.246 3.246 0 1 0 0 6.492 3.246 3.246 0 0 0 0-6.492M5.754 8a2.246 2.246 0 1 1 4.492 0 2.246 2.246 0 0 1-4.492 0" />
<path d="M9.796 1.343c-.527-1.79-3.065-1.79-3.592 0l-.094.319a.873.873 0 0 1-1.255.52l-.292-.16c-1.64-.892-3.433.902-2.54 2.541l.159.292a.873.873 0 0 1-.52 1.255l-.319.094c-1.79.527-1.79 3.065 0 3.592l.319.094a.873.873 0 0 1 .52 1.255l-.16.292c-.892 1.64.901 3.434 2.541 2.54l.292-.159a.873.873 0 0 1 1.255.52l.094.319c.527 1.79 3.065 1.79 3.592 0l.094-.319a.873.873 0 0 1 1.255-.52l.292.16c1.64.893 3.434-.902 2.54-2.541l-.159-.292a.873.873 0 0 1 .52-1.255l.319-.094c1.79-.527 1.79-3.065 0-3.592l-.319-.094a.873.873 0 0 1-.52-1.255l.16-.292c.893-1.64-.902-3.433-2.541-2.54l-.292.159a.873.873 0 0 1-1.255-.52zm-2.633.283c.246-.835 1.428-.835 1.674 0l.094.319a1.873 1.873 0 0 0 2.693 1.115l.291-.16c.764-.415 1.6.42 1.184 1.185l-.159.292a1.873 1.873 0 0 0 1.116 2.692l.318.094c.835.246.835 1.428 0 1.674l-.319.094a1.873 1.873 0 0 0-1.115 2.693l.16.291c.415.764-.42 1.6-1.185 1.184l-.291-.159a1.873 1.873 0 0 0-2.693 1.116l-.094.318c-.246.835-1.428.835-1.674 0l-.094-.319a1.873 1.873 0 0 0-2.692-1.115l-.292.16c-.764.415-1.6-.42-1.184-1.185l.159-.291A1.873 1.873 0 0 0 1.945 8.93l-.319-.094c-.835-.246-.835-1.428 0-1.674l.319-.094A1.873 1.873 0 0 0 3.06 4.377l-.16-.292c-.415-.764.42-1.6 1.185-1.184l.292.159a1.873 1.873 0 0 0 2.692-1.115z" />
</svg>
</button>
</div>
{/* theme controller is copied from https://daisyui.com/components/theme-controller/ */}
<div className="tooltip tooltip-bottom" data-tip="Themes">
<div className="dropdown dropdown-end dropdown-bottom">
<div tabIndex={0} role="button" className="btn m-1">
<svg
xmlns="http://www.w3.org/2000/svg"
width="16"
height="16"
fill="currentColor"
className="bi bi-palette2"
viewBox="0 0 16 16"
>
<path d="M0 .5A.5.5 0 0 1 .5 0h5a.5.5 0 0 1 .5.5v5.277l4.147-4.131a.5.5 0 0 1 .707 0l3.535 3.536a.5.5 0 0 1 0 .708L10.261 10H15.5a.5.5 0 0 1 .5.5v5a.5.5 0 0 1-.5.5H3a3 3 0 0 1-2.121-.879A3 3 0 0 1 0 13.044m6-.21 7.328-7.3-2.829-2.828L6 7.188zM4.5 13a1.5 1.5 0 1 0-3 0 1.5 1.5 0 0 0 3 0M15 15v-4H9.258l-4.015 4zM0 .5v12.495zm0 12.495V13z" />
</svg>
</div>
<ul
tabIndex={0}
className="dropdown-content bg-base-300 rounded-box z-[1] w-52 p-2 shadow-2xl h-80 overflow-y-auto"
>
<li>
<button
className={classNames({
'btn btn-sm btn-block btn-ghost justify-start': true,
'btn-active': selectedTheme === 'auto',
})}
onClick={() => setTheme('auto')}
>
auto
</button>
</li>
{THEMES.map((theme) => (
<li key={theme}>
<input
type="radio"
name="theme-dropdown"
className="theme-controller btn btn-sm btn-block btn-ghost justify-start"
aria-label={theme}
value={theme}
checked={selectedTheme === theme}
onChange={(e) => e.target.checked && setTheme(theme)}
/>
</li>
))}
</ul>
</div>
</div>
</div>
</div>
);
}

View file

@@ -0,0 +1,310 @@
import React, { useMemo, useState } from 'react';
import Markdown, { ExtraProps } from 'react-markdown';
import remarkGfm from 'remark-gfm';
import rehypeHightlight from 'rehype-highlight';
import rehypeKatex from 'rehype-katex';
import remarkMath from 'remark-math';
import remarkBreaks from 'remark-breaks';
import 'katex/dist/katex.min.css';
import { classNames, copyStr } from '../utils/misc';
import { ElementContent, Root } from 'hast';
import { visit } from 'unist-util-visit';
import { useAppContext } from '../utils/app.context';
import { CanvasType } from '../utils/types';
export default function MarkdownDisplay({
content,
isGenerating,
}: {
content: string;
isGenerating?: boolean;
}) {
const preprocessedContent = useMemo(
() => preprocessLaTeX(content),
[content]
);
return (
<Markdown
remarkPlugins={[remarkGfm, remarkMath, remarkBreaks]}
rehypePlugins={[rehypeHightlight, rehypeKatex, rehypeCustomCopyButton]}
components={{
button: (props) => (
<CodeBlockButtons
{...props}
isGenerating={isGenerating}
origContent={preprocessedContent}
/>
),
// note: do not use "pre", "p" or other basic html elements here, it will cause the node to re-render when the message is being generated (this should be a bug with react-markdown, not sure how to fix it)
}}
>
{preprocessedContent}
</Markdown>
);
}
const CodeBlockButtons: React.ElementType<
React.ClassAttributes<HTMLButtonElement> &
React.HTMLAttributes<HTMLButtonElement> &
ExtraProps & { origContent: string; isGenerating?: boolean }
> = ({ node, origContent, isGenerating }) => {
const { config } = useAppContext();
const startOffset = node?.position?.start.offset ?? 0;
const endOffset = node?.position?.end.offset ?? 0;
const copiedContent = useMemo(
() =>
origContent
.substring(startOffset, endOffset)
.replace(/^```[^\n]+\n/g, '')
.replace(/```$/g, ''),
[origContent, startOffset, endOffset]
);
const codeLanguage = useMemo(
() =>
origContent
.substring(startOffset, startOffset + 10)
.match(/^```([^\n]+)\n/)?.[1] ?? '',
[origContent, startOffset]
);
const canRunCode =
!isGenerating &&
config.pyIntepreterEnabled &&
codeLanguage.startsWith('py');
return (
<div
className={classNames({
'text-right sticky top-[7em] mb-2 mr-2 h-0': true,
'display-none': !node?.position,
})}
>
<CopyButton className="badge btn-mini" content={copiedContent} />
{canRunCode && (
<RunPyCodeButton
className="badge btn-mini ml-2"
content={copiedContent}
/>
)}
</div>
);
};
export const CopyButton = ({
content,
className,
}: {
content: string;
className?: string;
}) => {
const [copied, setCopied] = useState(false);
return (
<button
className={className}
onClick={() => {
copyStr(content);
setCopied(true);
}}
onMouseLeave={() => setCopied(false)}
>
{copied ? 'Copied!' : '📋 Copy'}
</button>
);
};
export const RunPyCodeButton = ({
content,
className,
}: {
content: string;
className?: string;
}) => {
const { setCanvasData } = useAppContext();
return (
<>
<button
className={className}
onClick={() =>
setCanvasData({
type: CanvasType.PY_INTERPRETER,
content,
})
}
>
Run
</button>
</>
);
};
/**
* This injects the "button" element before each "pre" element.
* The actual button will be replaced with a react component in the MarkdownDisplay.
* We don't replace "pre" node directly because it will cause the node to re-render, which causes this bug: https://github.com/ggerganov/llama.cpp/issues/9608
*/
function rehypeCustomCopyButton() {
return function (tree: Root) {
visit(tree, 'element', function (node) {
if (node.tagName === 'pre' && !node.properties.visited) {
const preNode = { ...node };
// replace current node
preNode.properties.visited = 'true';
node.tagName = 'div';
node.properties = {};
// add node for button
const btnNode: ElementContent = {
type: 'element',
tagName: 'button',
properties: {},
children: [],
position: node.position,
};
node.children = [btnNode, preNode];
}
});
};
}
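// Resulting tree shape (sketch): each <pre>...</pre> becomes
// <div><button/><pre visited="true">...</pre></div>; the placeholder <button> is then
// rendered as <CodeBlockButtons> through the `components` mapping in MarkdownDisplay.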
/**
* The part below is copied and adapted from:
* https://github.com/danny-avila/LibreChat/blob/main/client/src/utils/latex.ts
* (MIT License)
*/
// Regex to check if the processed content contains any potential LaTeX patterns
const containsLatexRegex =
/\\\(.*?\\\)|\\\[.*?\\\]|\$.*?\$|\\begin\{equation\}.*?\\end\{equation\}/;
// Regex for inline and block LaTeX expressions
const inlineLatex = new RegExp(/\\\((.+?)\\\)/, 'g');
const blockLatex = new RegExp(/\\\[(.*?[^\\])\\\]/, 'gs');
// Function to restore code blocks
const restoreCodeBlocks = (content: string, codeBlocks: string[]) => {
return content.replace(
/<<CODE_BLOCK_(\d+)>>/g,
(_, index) => codeBlocks[index]
);
};
// Regex to identify code blocks and inline code
const codeBlockRegex = /(```[\s\S]*?```|`.*?`)/g;
export const processLaTeX = (_content: string) => {
let content = _content;
// Temporarily replace code blocks and inline code with placeholders
const codeBlocks: string[] = [];
let index = 0;
content = content.replace(codeBlockRegex, (match) => {
codeBlocks[index] = match;
return `<<CODE_BLOCK_${index++}>>`;
});
// Escape dollar signs followed by a digit or space and digit
let processedContent = content.replace(/(\$)(?=\s?\d)/g, '\\$');
// If no LaTeX patterns are found, restore code blocks and return the processed content
if (!containsLatexRegex.test(processedContent)) {
return restoreCodeBlocks(processedContent, codeBlocks);
}
// Convert LaTeX expressions to a markdown compatible format
processedContent = processedContent
.replace(inlineLatex, (_: string, equation: string) => `$${equation}$`) // Convert inline LaTeX
.replace(blockLatex, (_: string, equation: string) => `$$${equation}$$`); // Convert block LaTeX
// Restore code blocks
return restoreCodeBlocks(processedContent, codeBlocks);
};
/**
* Preprocesses LaTeX content by replacing delimiters and escaping certain characters.
*
* @param content The input string containing LaTeX expressions.
* @returns The processed string with replaced delimiters and escaped characters.
*/
export function preprocessLaTeX(content: string): string {
// Step 1: Protect code blocks
const codeBlocks: string[] = [];
content = content.replace(/(```[\s\S]*?```|`[^`\n]+`)/g, (_, code) => {
codeBlocks.push(code);
return `<<CODE_BLOCK_${codeBlocks.length - 1}>>`;
});
// Step 2: Protect existing LaTeX expressions
const latexExpressions: string[] = [];
// Protect block math ($$...$$), \[...\], and \(...\) as before.
content = content.replace(
/(\$\$[\s\S]*?\$\$|\\\[[\s\S]*?\\\]|\\\(.*?\\\))/g,
(match) => {
latexExpressions.push(match);
return `<<LATEX_${latexExpressions.length - 1}>>`;
}
);
// Protect inline math ($...$) only if it does NOT match a currency pattern.
// We assume a currency pattern is one where the inner content is purely numeric (with optional decimals).
content = content.replace(/\$([^$]+)\$/g, (match, inner) => {
if (/^\s*\d+(?:\.\d+)?\s*$/.test(inner)) {
// This looks like a currency value (e.g. "$123" or "$12.34"),
// so don't protect it.
return match;
} else {
// Otherwise, treat it as a LaTeX expression.
latexExpressions.push(match);
return `<<LATEX_${latexExpressions.length - 1}>>`;
}
});
// Step 3: Escape dollar signs that are likely currency indicators.
// (Now that inline math is protected, this will only escape dollars not already protected)
content = content.replace(/\$(?=\d)/g, '\\$');
// Step 4: Restore LaTeX expressions
content = content.replace(
/<<LATEX_(\d+)>>/g,
(_, index) => latexExpressions[parseInt(index)]
);
// Step 5: Restore code blocks
content = content.replace(
/<<CODE_BLOCK_(\d+)>>/g,
(_, index) => codeBlocks[parseInt(index)]
);
// Step 6: Apply additional escaping functions
content = escapeBrackets(content);
content = escapeMhchem(content);
return content;
}
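// Example (illustrative): preprocessLaTeX('Price: $5.99, formula: \\(E = mc^2\\)')
// returns 'Price: \\$5.99, formula: $E = mc^2$': the currency dollar is escaped, while
// the \(...\) expression is rewritten to $-delimited math for remark-math/KaTeX.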
export function escapeBrackets(text: string): string {
const pattern =
/(```[\S\s]*?```|`.*?`)|\\\[([\S\s]*?[^\\])\\]|\\\((.*?)\\\)/g;
return text.replace(
pattern,
(
match: string,
codeBlock: string | undefined,
squareBracket: string | undefined,
roundBracket: string | undefined
): string => {
if (codeBlock != null) {
return codeBlock;
} else if (squareBracket != null) {
return `$$${squareBracket}$$`;
} else if (roundBracket != null) {
return `$${roundBracket}$`;
}
return match;
}
);
}
export function escapeMhchem(text: string) {
return text.replaceAll('$\\ce{', '$\\\\ce{').replaceAll('$\\pu{', '$\\\\pu{');
}

View file

@@ -0,0 +1,536 @@
import { useState } from 'react';
import { useAppContext } from '../utils/app.context';
import { CONFIG_DEFAULT, CONFIG_INFO } from '../Config';
import { isDev } from '../Config';
import StorageUtils from '../utils/storage';
import { classNames, isBoolean, isNumeric, isString } from '../utils/misc';
import {
BeakerIcon,
ChatBubbleOvalLeftEllipsisIcon,
Cog6ToothIcon,
FunnelIcon,
HandRaisedIcon,
SquaresPlusIcon,
} from '@heroicons/react/24/outline';
import { OpenInNewTab } from '../utils/common';
type SettKey = keyof typeof CONFIG_DEFAULT;
const BASIC_KEYS: SettKey[] = [
'temperature',
'top_k',
'top_p',
'min_p',
'max_tokens',
];
const SAMPLER_KEYS: SettKey[] = [
'dynatemp_range',
'dynatemp_exponent',
'typical_p',
'xtc_probability',
'xtc_threshold',
];
const PENALTY_KEYS: SettKey[] = [
'repeat_last_n',
'repeat_penalty',
'presence_penalty',
'frequency_penalty',
'dry_multiplier',
'dry_base',
'dry_allowed_length',
'dry_penalty_last_n',
];
enum SettingInputType {
SHORT_INPUT,
LONG_INPUT,
CHECKBOX,
CUSTOM,
}
interface SettingFieldInput {
type: Exclude<SettingInputType, SettingInputType.CUSTOM>;
label: string | React.ReactElement;
help?: string | React.ReactElement;
key: SettKey;
}
interface SettingFieldCustom {
type: SettingInputType.CUSTOM;
key: SettKey;
component:
| string
| React.FC<{
value: string | boolean | number;
onChange: (value: string) => void;
}>;
}
interface SettingSection {
title: React.ReactElement;
fields: (SettingFieldInput | SettingFieldCustom)[];
}
const ICON_CLASSNAME = 'w-4 h-4 mr-1 inline';
const SETTING_SECTIONS: SettingSection[] = [
{
title: (
<>
<Cog6ToothIcon className={ICON_CLASSNAME} />
General
</>
),
fields: [
{
type: SettingInputType.SHORT_INPUT,
label: 'API Key',
key: 'apiKey',
},
{
type: SettingInputType.LONG_INPUT,
label: 'System Message (will be disabled if left empty)',
key: 'systemMessage',
},
...BASIC_KEYS.map(
(key) =>
({
type: SettingInputType.SHORT_INPUT,
label: key,
key,
}) as SettingFieldInput
),
],
},
{
title: (
<>
<FunnelIcon className={ICON_CLASSNAME} />
Samplers
</>
),
fields: [
{
type: SettingInputType.SHORT_INPUT,
label: 'Samplers queue',
key: 'samplers',
},
...SAMPLER_KEYS.map(
(key) =>
({
type: SettingInputType.SHORT_INPUT,
label: key,
key,
}) as SettingFieldInput
),
],
},
{
title: (
<>
<HandRaisedIcon className={ICON_CLASSNAME} />
Penalties
</>
),
fields: PENALTY_KEYS.map((key) => ({
type: SettingInputType.SHORT_INPUT,
label: key,
key,
})),
},
{
title: (
<>
<ChatBubbleOvalLeftEllipsisIcon className={ICON_CLASSNAME} />
Reasoning
</>
),
fields: [
{
type: SettingInputType.CHECKBOX,
label: 'Expand thought process by default while generating a message',
key: 'showThoughtInProgress',
},
{
type: SettingInputType.CHECKBOX,
label:
'Exclude thought process when sending request to API (Recommended for DeepSeek-R1)',
key: 'excludeThoughtOnReq',
},
],
},
{
title: (
<>
<SquaresPlusIcon className={ICON_CLASSNAME} />
Advanced
</>
),
fields: [
{
type: SettingInputType.CUSTOM,
key: 'custom', // dummy key, won't be used
component: () => {
const debugImportDemoConv = async () => {
const res = await fetch('/demo-conversation.json');
const demoConv = await res.json();
StorageUtils.remove(demoConv.id);
for (const msg of demoConv.messages) {
StorageUtils.appendMsg(demoConv.id, msg);
}
};
return (
<button className="btn" onClick={debugImportDemoConv}>
(debug) Import demo conversation
</button>
);
},
},
{
type: SettingInputType.CHECKBOX,
label: 'Show tokens per second',
key: 'showTokensPerSecond',
},
{
type: SettingInputType.LONG_INPUT,
label: (
<>
Custom JSON config (For more info, refer to{' '}
<OpenInNewTab href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md">
server documentation
</OpenInNewTab>
)
</>
),
key: 'custom',
},
],
},
{
title: (
<>
<BeakerIcon className={ICON_CLASSNAME} />
Experimental
</>
),
fields: [
{
type: SettingInputType.CUSTOM,
key: 'custom', // dummy key, won't be used
component: () => (
<>
<p className="mb-8">
Experimental features are not guaranteed to work correctly.
<br />
<br />
If you encounter any problems, create a{' '}
<OpenInNewTab href="https://github.com/ggerganov/llama.cpp/issues/new?template=019-bug-misc.yml">
Bug (misc.)
</OpenInNewTab>{' '}
report on GitHub. Please also specify <b>webui/experimental</b> in
the report title and include screenshots.
<br />
<br />
Some features may require packages downloaded from a CDN, so they
require an internet connection.
</p>
</>
),
},
{
type: SettingInputType.CHECKBOX,
label: (
<>
<b>Enable Python interpreter</b>
<br />
<small className="text-xs">
This feature uses{' '}
<OpenInNewTab href="https://pyodide.org">pyodide</OpenInNewTab>,
downloaded from CDN. To use this feature, ask the LLM to generate
Python code inside a markdown code block. You will see a "Run"
button on the code block, near the "Copy" button.
</small>
</>
),
key: 'pyIntepreterEnabled',
},
],
},
];
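// Extending the dialog (sketch): add a field object to a section above, e.g.
//   { type: SettingInputType.SHORT_INPUT, label: 'top_k', key: 'top_k' }
// The key must exist in CONFIG_DEFAULT so that handleSave below can validate its type.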
export default function SettingDialog({
show,
onClose,
}: {
show: boolean;
onClose: () => void;
}) {
const { config, saveConfig } = useAppContext();
const [sectionIdx, setSectionIdx] = useState(0);
// clone the config object to prevent direct mutation
const [localConfig, setLocalConfig] = useState<typeof CONFIG_DEFAULT>(
JSON.parse(JSON.stringify(config))
);
const resetConfig = () => {
if (window.confirm('Are you sure you want to reset all settings?')) {
setLocalConfig(CONFIG_DEFAULT);
}
};
const handleSave = () => {
// copy the local config to prevent direct mutation
const newConfig: typeof CONFIG_DEFAULT = JSON.parse(
JSON.stringify(localConfig)
);
// validate the config
for (const key in newConfig) {
const value = newConfig[key as SettKey];
const mustBeBoolean = isBoolean(CONFIG_DEFAULT[key as SettKey]);
const mustBeString = isString(CONFIG_DEFAULT[key as SettKey]);
const mustBeNumeric = isNumeric(CONFIG_DEFAULT[key as SettKey]);
if (mustBeString) {
if (!isString(value)) {
alert(`Value for ${key} must be string`);
return;
}
} else if (mustBeNumeric) {
const trimmedValue = value.toString().trim();
const numVal = Number(trimmedValue);
if (isNaN(numVal) || !isNumeric(numVal) || trimmedValue.length === 0) {
alert(`Value for ${key} must be numeric`);
return;
}
// force conversion to number
// @ts-expect-error this is safe
newConfig[key] = numVal;
} else if (mustBeBoolean) {
if (!isBoolean(value)) {
alert(`Value for ${key} must be boolean`);
return;
}
} else {
console.error(`Unknown default type for key ${key}`);
}
}
if (isDev) console.log('Saving config', newConfig);
saveConfig(newConfig);
onClose();
};
const onChange = (key: SettKey) => (value: string | boolean) => {
// note: we do not perform validation here, because we may get incomplete value as user is still typing it
setLocalConfig({ ...localConfig, [key]: value });
};
return (
<dialog className={classNames({ modal: true, 'modal-open': show })}>
<div className="modal-box w-11/12 max-w-3xl">
<h3 className="text-lg font-bold mb-6">Settings</h3>
<div className="flex flex-col md:flex-row h-[calc(90vh-12rem)]">
{/* Left panel, showing sections - Desktop version */}
<div className="hidden md:flex flex-col items-stretch pr-4 mr-4 border-r-2 border-base-200">
{SETTING_SECTIONS.map((section, idx) => (
<div
key={idx}
className={classNames({
'btn btn-ghost justify-start font-normal w-44 mb-1': true,
'btn-active': sectionIdx === idx,
})}
onClick={() => setSectionIdx(idx)}
dir="auto"
>
{section.title}
</div>
))}
</div>
{/* Left panel, showing sections - Mobile version */}
<div className="md:hidden flex flex-row gap-2 mb-4">
<details className="dropdown">
<summary className="btn bt-sm w-full m-1">
{SETTING_SECTIONS[sectionIdx].title}
</summary>
<ul className="menu dropdown-content bg-base-100 rounded-box z-[1] w-52 p-2 shadow">
{SETTING_SECTIONS.map((section, idx) => (
<div
key={idx}
className={classNames({
'btn btn-ghost justify-start font-normal': true,
'btn-active': sectionIdx === idx,
})}
onClick={() => setSectionIdx(idx)}
dir="auto"
>
{section.title}
</div>
))}
</ul>
</details>
</div>
{/* Right panel, showing setting fields */}
<div className="grow overflow-y-auto px-4">
{SETTING_SECTIONS[sectionIdx].fields.map((field, idx) => {
const key = `${sectionIdx}-${idx}`;
if (field.type === SettingInputType.SHORT_INPUT) {
return (
<SettingsModalShortInput
key={key}
configKey={field.key}
value={localConfig[field.key]}
onChange={onChange(field.key)}
label={field.label as string}
/>
);
} else if (field.type === SettingInputType.LONG_INPUT) {
return (
<SettingsModalLongInput
key={key}
configKey={field.key}
value={localConfig[field.key].toString()}
onChange={onChange(field.key)}
label={field.label as string}
/>
);
} else if (field.type === SettingInputType.CHECKBOX) {
return (
<SettingsModalCheckbox
key={key}
configKey={field.key}
value={!!localConfig[field.key]}
onChange={onChange(field.key)}
label={field.label as string}
/>
);
} else if (field.type === SettingInputType.CUSTOM) {
return (
<div key={key} className="mb-2">
{typeof field.component === 'string'
? field.component
: field.component({
value: localConfig[field.key],
onChange: onChange(field.key),
})}
</div>
);
}
})}
<p className="opacity-40 mb-6 text-sm mt-8">
Settings are saved in the browser's localStorage
</p>
</div>
</div>
<div className="modal-action">
<button className="btn" onClick={resetConfig}>
Reset to default
</button>
<button className="btn" onClick={onClose}>
Close
</button>
<button className="btn btn-primary" onClick={handleSave}>
Save
</button>
</div>
</div>
</dialog>
);
}
function SettingsModalLongInput({
configKey,
value,
onChange,
label,
}: {
configKey: SettKey;
value: string;
onChange: (value: string) => void;
label?: string;
}) {
return (
<label className="form-control mb-2">
<div className="label inline">{label || configKey}</div>
<textarea
className="textarea textarea-bordered h-24"
placeholder={`Default: ${CONFIG_DEFAULT[configKey] || 'none'}`}
value={value}
onChange={(e) => onChange(e.target.value)}
/>
</label>
);
}
function SettingsModalShortInput({
configKey,
value,
onChange,
label,
}: {
configKey: SettKey;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
value: any;
onChange: (value: string) => void;
label?: string;
}) {
const helpMsg = CONFIG_INFO[configKey];
return (
<>
{/* on mobile, we simply show the help message here */}
{helpMsg && (
<div className="block md:hidden mb-1">
<b>{label || configKey}</b>
<br />
<p className="text-xs">{helpMsg}</p>
</div>
)}
<label className="input input-bordered join-item grow flex items-center gap-2 mb-2">
<div className="dropdown dropdown-hover">
<div tabIndex={0} role="button" className="font-bold hidden md:block">
{label || configKey}
</div>
{helpMsg && (
<div className="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
{helpMsg}
</div>
)}
</div>
<input
type="text"
className="grow"
placeholder={`Default: ${CONFIG_DEFAULT[configKey] || 'none'}`}
value={value}
onChange={(e) => onChange(e.target.value)}
/>
</label>
</>
);
}
function SettingsModalCheckbox({
configKey,
value,
onChange,
label,
}: {
configKey: SettKey;
value: boolean;
onChange: (value: boolean) => void;
label: string;
}) {
return (
<div className="flex flex-row items-center mb-2">
<input
type="checkbox"
className="toggle"
checked={value}
onChange={(e) => onChange(e.target.checked)}
/>
<span className="ml-4">{label || configKey}</span>
</div>
);
}

View file

@@ -0,0 +1,95 @@
import { useEffect, useMemo, useState } from 'react';
import { classNames } from '../utils/misc';
import { Conversation } from '../utils/types';
import StorageUtils from '../utils/storage';
import { useNavigate, useParams } from 'react-router';
export default function Sidebar() {
const params = useParams();
const navigate = useNavigate();
const currConv = useMemo(
() => StorageUtils.getOneConversation(params.convId ?? ''),
[params.convId]
);
const [conversations, setConversations] = useState<Conversation[]>([]);
useEffect(() => {
const handleConversationChange = () => {
setConversations(StorageUtils.getAllConversations());
};
StorageUtils.onConversationChanged(handleConversationChange);
handleConversationChange();
return () => {
StorageUtils.offConversationChanged(handleConversationChange);
};
}, []);
return (
<>
<input
id="toggle-drawer"
type="checkbox"
className="drawer-toggle"
defaultChecked
/>
<div className="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
<label
htmlFor="toggle-drawer"
aria-label="close sidebar"
className="drawer-overlay"
></label>
<div className="flex flex-col bg-base-200 min-h-full max-w-64 py-4 px-4">
<div className="flex flex-row items-center justify-between mb-4 mt-4">
<h2 className="font-bold ml-4">Conversations</h2>
{/* close sidebar button */}
<label htmlFor="toggle-drawer" className="btn btn-ghost lg:hidden">
<svg
xmlns="http://www.w3.org/2000/svg"
width="16"
height="16"
fill="currentColor"
className="bi bi-arrow-bar-left"
viewBox="0 0 16 16"
>
<path
fillRule="evenodd"
d="M12.5 15a.5.5 0 0 1-.5-.5v-13a.5.5 0 0 1 1 0v13a.5.5 0 0 1-.5.5M10 8a.5.5 0 0 1-.5.5H3.707l2.147 2.146a.5.5 0 0 1-.708.708l-3-3a.5.5 0 0 1 0-.708l3-3a.5.5 0 1 1 .708.708L3.707 7.5H9.5a.5.5 0 0 1 .5.5"
/>
</svg>
</label>
</div>
{/* list of conversations */}
<div
className={classNames({
'btn btn-ghost justify-start': true,
'btn-active': !currConv,
})}
onClick={() => navigate('/')}
>
+ New conversation
</div>
{conversations.map((conv) => (
<div
key={conv.id}
className={classNames({
'btn btn-ghost justify-start font-normal': true,
'btn-active': conv.id === currConv?.id,
})}
onClick={() => navigate(`/chat/${conv.id}`)}
dir="auto"
>
<span className="truncate">{conv.messages[0].content}</span>
</div>
))}
<div className="text-center text-xs opacity-40 mt-auto mx-4">
Conversations are saved to the browser's localStorage
</div>
</div>
</div>
</>
);
}

View file

@@ -1,60 +0,0 @@
import hljs from 'highlight.js/lib/core';
// only import commonly used languages to reduce bundle size
import python from 'highlight.js/lib/languages/python';
import javascript from 'highlight.js/lib/languages/javascript';
import json from 'highlight.js/lib/languages/json';
import bash from 'highlight.js/lib/languages/bash';
import yaml from 'highlight.js/lib/languages/yaml';
import markdown from 'highlight.js/lib/languages/markdown';
import scss from 'highlight.js/lib/languages/scss';
import xml from 'highlight.js/lib/languages/xml';
import ruby from 'highlight.js/lib/languages/ruby';
import go from 'highlight.js/lib/languages/go';
import java from 'highlight.js/lib/languages/java';
import rust from 'highlight.js/lib/languages/rust';
import scala from 'highlight.js/lib/languages/scala';
import cpp from 'highlight.js/lib/languages/cpp';
import csharp from 'highlight.js/lib/languages/csharp';
import swift from 'highlight.js/lib/languages/swift';
import dart from 'highlight.js/lib/languages/dart';
import elixir from 'highlight.js/lib/languages/elixir';
import kotlin from 'highlight.js/lib/languages/kotlin';
import lua from 'highlight.js/lib/languages/lua';
import php from 'highlight.js/lib/languages/php';
import latex from 'highlight.js/lib/languages/latex';
hljs.registerLanguage('python', python);
hljs.registerLanguage('javascript', javascript);
hljs.registerLanguage('json', json);
hljs.registerLanguage('yaml', yaml);
hljs.registerLanguage('markdown', markdown);
hljs.registerLanguage('xml', xml);
hljs.registerLanguage('ruby', ruby);
hljs.registerLanguage('go', go);
hljs.registerLanguage('java', java);
hljs.registerLanguage('rust', rust);
hljs.registerLanguage('scala', scala);
hljs.registerLanguage('csharp', csharp);
hljs.registerLanguage('swift', swift);
hljs.registerLanguage('dart', dart);
hljs.registerLanguage('elixir', elixir);
hljs.registerLanguage('kotlin', kotlin);
hljs.registerLanguage('lua', lua);
hljs.registerLanguage('php', php);
hljs.registerLanguage('latex', latex);
// reuse some languages to further reduce bundle size
hljs.registerLanguage('shell', bash);
hljs.registerLanguage('bash', bash);
hljs.registerLanguage('sh', bash);
hljs.registerLanguage('css', scss);
hljs.registerLanguage('scss', scss);
hljs.registerLanguage('c', cpp);
hljs.registerLanguage('cpp', cpp);
export default hljs;
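For illustration, a minimal sketch of how this pre-configured instance is consumed (the snippet and language name are assumptions; the fallback mirrors the markdown renderer further below):

import hljs from './highlight-config';

// Highlight a snippet only if its language was registered above;
// otherwise fall back to the raw source.
const source = 'print("hello")';
const lang = 'python';
const html = hljs.getLanguage(lang)
  ? hljs.highlight(source, { language: lang, ignoreIllegals: true }).value
  : source;
console.log(html);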

View file

@ -1,15 +1,28 @@
@use "sass:meta";
@use 'sass:meta';
@tailwind base;
@tailwind components;
@tailwind utilities;
.markdown {
h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; }
h1,
h2,
h3,
h4,
h5,
h6,
ul,
ol,
li {
all: revert;
}
pre {
@apply whitespace-pre-wrap rounded-lg p-2;
border: 1px solid currentColor;
}
p {
@apply mb-2;
}
/* TODO: fix markdown table */
}
@ -19,7 +32,9 @@
.btn-mini {
@apply cursor-pointer hover:shadow-md;
}
.chat-screen { max-width: 900px; }
.chat-screen {
max-width: 900px;
}
.chat-bubble-base-300 {
--tw-bg-opacity: 1;
@ -30,6 +45,9 @@
/* Highlight.js */
[data-color-scheme='light'] {
@include meta.load-css('highlight.js/styles/stackoverflow-light');
.dark-color {
@apply bg-base-content text-base-100;
}
}
[data-color-scheme='dark'] {
@include meta.load-css('highlight.js/styles/stackoverflow-dark');
@ -37,6 +55,9 @@
[data-color-scheme='auto'] {
@media (prefers-color-scheme: light) {
@include meta.load-css('highlight.js/styles/stackoverflow-light');
.dark-color {
@apply bg-base-content text-base-100;
}
}
@media (prefers-color-scheme: dark) {
@include meta.load-css('highlight.js/styles/stackoverflow-dark');
@ -46,3 +67,7 @@
background: transparent !important;
padding: 0.5em !important;
}
.katex-display {
margin: 0 0 !important;
}

View file

@ -1,66 +0,0 @@
import katex from 'katex';
// Adapted from https://github.com/SchneeHertz/markdown-it-katex-gpt
// MIT license
const defaultOptions = {
delimiters: [
{ left: '\\[', right: '\\]', display: true },
{ left: '\\(', right: '\\)', display: false },
],
};
export function renderLatexHTML(content, display = false) {
return katex.renderToString(content, {
throwOnError: false,
output: 'mathml',
displayMode: display,
});
}
function escapedBracketRule(options) {
return (state, silent) => {
const max = state.posMax;
const start = state.pos;
for (const { left, right, display } of options.delimiters) {
// Check if it starts with the left delimiter
if (!state.src.slice(start).startsWith(left)) continue;
// Skip the length of the left delimiter
let pos = start + left.length;
// Find the matching right delimiter
while (pos < max) {
if (state.src.slice(pos).startsWith(right)) {
break;
}
pos++;
}
// No matching right delimiter found, skip to the next match
if (pos >= max) continue;
// If not in silent mode, convert LaTeX formula to MathML
if (!silent) {
const content = state.src.slice(start + left.length, pos);
try {
const renderedContent = renderLatexHTML(content, display);
const token = state.push('html_inline', '', 0);
token.content = renderedContent;
} catch (e) {
console.error(e);
}
}
// Update position, skip the length of the right delimiter
state.pos = pos + right.length;
return true;
}
}
}
export default function (md, options = defaultOptions) {
md.inline.ruler.after('text', 'escaped_bracket', escapedBracketRule(options));
}
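A minimal usage sketch for this plugin (the sample formula is illustrative):

import MarkdownIt from 'markdown-it';
import markdownItKatexGpt from './katex-gpt';

// Register the inline rule; with defaultOptions this handles \[...\] (display)
// and \(...\) (inline) delimiters and emits MathML via KaTeX.
const md = new MarkdownIt();
md.use(markdownItKatexGpt);
console.log(md.render('Euler: \\(e^{i\\pi} + 1 = 0\\)'));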

View file

@ -1,704 +0,0 @@
import './styles.scss';
import { createApp, defineComponent, shallowRef, computed, h } from 'vue/dist/vue.esm-bundler.js';
import MarkdownIt from 'markdown-it';
import TextLineStream from 'textlinestream';
// math formula rendering
import 'katex/dist/katex.min.css';
import markdownItKatexGpt from './katex-gpt';
import markdownItKatexNormal from '@vscode/markdown-it-katex';
// code highlighting
import hljs from './highlight-config';
import daisyuiThemes from 'daisyui/src/theming/themes';
// ponyfill for missing ReadableStream asyncIterator on Safari
import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
const isDev = import.meta.env.MODE === 'development';
// types
/** @typedef {{ id: number, role: 'user' | 'assistant', content: string, timings: any }} Message */
/** @typedef {{ role: 'user' | 'assistant', content: string }} APIMessage */
/** @typedef {{ id: string, lastModified: number, messages: Array<Message> }} Conversation */
// utility functions
const isString = (x) => !!x.toLowerCase;
const isBoolean = (x) => x === true || x === false;
const isNumeric = (n) => !isString(n) && !isNaN(n) && !isBoolean(n);
const escapeAttr = (str) => str.replace(/>/g, '&gt;').replace(/"/g, '&quot;');
const copyStr = (textToCopy) => {
// Navigator clipboard api needs a secure context (https)
if (navigator.clipboard && window.isSecureContext) {
navigator.clipboard.writeText(textToCopy);
} else {
// Use the 'out of viewport hidden text area' trick
const textArea = document.createElement('textarea');
textArea.value = textToCopy;
// Move textarea out of the viewport so it's not visible
textArea.style.position = 'absolute';
textArea.style.left = '-999999px';
document.body.prepend(textArea);
textArea.select();
document.execCommand('copy');
}
};
// constants
const BASE_URL = isDev
? (localStorage.getItem('base') || 'https://localhost:8080') // for debugging
: (new URL('.', document.baseURI).href).toString().replace(/\/$/, ''); // for production
console.log({ BASE_URL });
const CONFIG_DEFAULT = {
// Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
apiKey: '',
systemMessage: 'You are a helpful assistant.',
showTokensPerSecond: false,
showThoughtInProgress: false,
excludeThoughtOnReq: true,
// make sure these default values are in sync with `common.h`
samplers: 'edkypmxt',
temperature: 0.8,
dynatemp_range: 0.0,
dynatemp_exponent: 1.0,
top_k: 40,
top_p: 0.95,
min_p: 0.05,
xtc_probability: 0.0,
xtc_threshold: 0.1,
typical_p: 1.0,
repeat_last_n: 64,
repeat_penalty: 1.0,
presence_penalty: 0.0,
frequency_penalty: 0.0,
dry_multiplier: 0.0,
dry_base: 1.75,
dry_allowed_length: 2,
dry_penalty_last_n: -1,
max_tokens: -1,
custom: '', // custom json-stringified object
};
const CONFIG_INFO = {
apiKey: 'Set the API Key if you are using --api-key option for the server.',
systemMessage: 'The starting message that defines how the model should behave.',
samplers: 'The order in which samplers are applied, in simplified form. Default is "edkypmxt": penalties->dry->top_k->typ_p->top_p->min_p->xtc->temperature',
temperature: 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
dynatemp_range: 'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.',
dynatemp_exponent: 'Addon for the temperature sampler. Smoothes out the probability redistribution based on the most probable token.',
top_k: 'Keeps only k top tokens.',
top_p: 'Limits tokens to those that together have a cumulative probability of at least p',
min_p: 'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.',
xtc_probability: 'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.',
xtc_threshold: 'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.',
typical_p: 'Sorts and limits tokens based on the difference between log-probability and entropy.',
repeat_last_n: 'Last n tokens to consider for penalizing repetition',
repeat_penalty: 'Controls the repetition of token sequences in the generated text',
presence_penalty: 'Penalizes tokens that have already appeared in the output, regardless of how often.',
frequency_penalty: 'Penalizes tokens in proportion to how often they have appeared in the output.',
dry_multiplier: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.',
dry_base: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.',
dry_allowed_length: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.',
dry_penalty_last_n: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets DRY penalty for the last n tokens.',
max_tokens: 'The maximum number of tokens per output.',
custom: '', // custom json-stringified object
};
// config keys that hold numeric values (e.g. temperature, top_k, top_p)
const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]);
// list of themes supported by daisyui
const THEMES = ['light', 'dark']
// make sure light & dark are always at the beginning
.concat(Object.keys(daisyuiThemes).filter(t => t !== 'light' && t !== 'dark'));
// markdown support
const VueMarkdown = defineComponent(
(props) => {
const md = shallowRef(new MarkdownIt({
breaks: true,
highlight: function (str, lang) { // Add highlight.js
if (lang && hljs.getLanguage(lang)) {
try {
return '<pre dir="auto"><code class="hljs">' +
hljs.highlight(str, { language: lang, ignoreIllegals: true }).value +
'</code></pre>';
} catch (__) {}
}
return '<pre dir="auto"><code class="hljs">' + md.value.utils.escapeHtml(str) + '</code></pre>';
}
}));
// support latex with double dollar sign and square brackets
md.value.use(markdownItKatexGpt, {
delimiters: [
{ left: '\\[', right: '\\]', display: true },
{ left: '\\(', right: '\\)', display: false },
{ left: '$$', right: '$$', display: false },
// do not add the single dollar sign here, otherwise it will be confused with the dollar sign used for currency
],
throwOnError: false,
});
// support latex with single dollar sign
md.value.use(markdownItKatexNormal, { throwOnError: false });
// add copy button to code blocks
const origFenceRenderer = md.value.renderer.rules.fence;
md.value.renderer.rules.fence = (tokens, idx, ...args) => {
const content = tokens[idx].content;
const origRendered = origFenceRenderer(tokens, idx, ...args);
return `<div class="relative my-4">
<div class="text-right sticky top-4 mb-2 mr-2 h-0">
<button class="badge btn-mini" onclick="copyStr(${escapeAttr(JSON.stringify(content))})">📋 Copy</button>
</div>
${origRendered}
</div>`;
};
window.copyStr = copyStr;
const content = computed(() => md.value.render(props.source));
return () => h('div', { innerHTML: content.value });
},
{ props: ['source'] }
);
// input field to be used by settings modal
const SettingsModalShortInput = defineComponent({
template: document.getElementById('settings-modal-short-input').innerHTML,
props: {
label: { type: String, required: false },
configKey: String,
configDefault: Object,
configInfo: Object,
modelValue: [Object, String, Number],
},
});
// message bubble component
const MessageBubble = defineComponent({
components: {
VueMarkdown
},
template: document.getElementById('message-bubble').innerHTML,
props: {
config: Object,
msg: Object,
isGenerating: Boolean,
showThoughtInProgress: Boolean,
editUserMsgAndRegenerate: Function,
regenerateMsg: Function,
},
data() {
return {
editingContent: null,
};
},
computed: {
timings() {
if (!this.msg.timings) return null;
return {
...this.msg.timings,
prompt_per_second: this.msg.timings.prompt_n / (this.msg.timings.prompt_ms / 1000),
predicted_per_second: this.msg.timings.predicted_n / (this.msg.timings.predicted_ms / 1000),
};
},
splitMsgContent() {
const content = this.msg.content;
if (this.msg.role !== 'assistant') {
return { content };
}
let actualContent = '';
let cot = '';
let isThinking = false;
let thinkSplit = content.split('<think>', 2);
actualContent += thinkSplit[0];
while (thinkSplit[1] !== undefined) {
// <think> tag found
thinkSplit = thinkSplit[1].split('</think>', 2);
cot += thinkSplit[0];
isThinking = true;
if (thinkSplit[1] !== undefined) {
// </think> closing tag found
isThinking = false;
thinkSplit = thinkSplit[1].split('<think>', 2);
actualContent += thinkSplit[0];
}
}
return { content: actualContent, cot, isThinking };
},
},
methods: {
copyMsg() {
copyStr(this.msg.content);
},
editMsg() {
this.editUserMsgAndRegenerate({
...this.msg,
content: this.editingContent,
});
this.editingContent = null;
},
},
});
// conversations are stored in localStorage
// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
// convId is a string prefixed with 'conv-'
const StorageUtils = {
/**
* manage conversations
* @returns {Array<Conversation>}
*/
getAllConversations() {
const res = [];
for (const key in localStorage) {
if (key.startsWith('conv-')) {
res.push(JSON.parse(localStorage.getItem(key)));
}
}
res.sort((a, b) => b.lastModified - a.lastModified);
return res;
},
/**
* can return null if convId does not exist
* @param {string} convId
* @returns {Conversation | null}
*/
getOneConversation(convId) {
return JSON.parse(localStorage.getItem(convId) || 'null');
},
/**
* if convId does not exist, create one
* @param {string} convId
* @param {Message} msg
*/
appendMsg(convId, msg) {
if (msg.content === null) return;
const conv = StorageUtils.getOneConversation(convId) || {
id: convId,
lastModified: Date.now(),
messages: [],
};
conv.messages.push(msg);
conv.lastModified = Date.now();
localStorage.setItem(convId, JSON.stringify(conv));
},
/**
* Get new conversation id
* @returns {string}
*/
getNewConvId() {
return `conv-${Date.now()}`;
},
/**
* remove conversation by id
* @param {string} convId
*/
remove(convId) {
localStorage.removeItem(convId);
},
/**
* keep only the messages that match the predicate
* @param {string} convId
* @param {function(Message): boolean} predicate
*/
filterAndKeepMsgs(convId, predicate) {
const conv = StorageUtils.getOneConversation(convId);
if (!conv) return;
conv.messages = conv.messages.filter(predicate);
conv.lastModified = Date.now();
localStorage.setItem(convId, JSON.stringify(conv));
},
/**
* remove last message from conversation
* @param {string} convId
* @returns {Message | undefined}
*/
popMsg(convId) {
const conv = StorageUtils.getOneConversation(convId);
if (!conv) return;
const msg = conv.messages.pop();
conv.lastModified = Date.now();
if (conv.messages.length === 0) {
StorageUtils.remove(convId);
} else {
localStorage.setItem(convId, JSON.stringify(conv));
}
return msg;
},
// manage config
getConfig() {
const savedVal = JSON.parse(localStorage.getItem('config') || '{}');
// to prevent breaking changes in the future, we always provide default value for missing keys
return {
...CONFIG_DEFAULT,
...savedVal,
};
},
setConfig(config) {
localStorage.setItem('config', JSON.stringify(config));
},
getTheme() {
return localStorage.getItem('theme') || 'auto';
},
setTheme(theme) {
if (theme === 'auto') {
localStorage.removeItem('theme');
} else {
localStorage.setItem('theme', theme);
}
},
};
// scroll to bottom of chat messages
// if requiresNearBottom is true, only auto-scroll if user is near bottom
const chatScrollToBottom = (requiresNearBottom) => {
const msgListElem = document.getElementById('messages-list');
const spaceToBottom = msgListElem.scrollHeight - msgListElem.scrollTop - msgListElem.clientHeight;
if (!requiresNearBottom || (spaceToBottom < 100)) {
setTimeout(() => msgListElem.scrollTo({ top: msgListElem.scrollHeight }), 1);
}
};
// wrapper for SSE
async function* sendSSEPostRequest(url, fetchOptions) {
const res = await fetch(url, fetchOptions);
const lines = res.body
.pipeThrough(new TextDecoderStream())
.pipeThrough(new TextLineStream());
for await (const line of asyncIterator(lines)) {
if (isDev) console.log({line});
if (line.startsWith('data:') && !line.endsWith('[DONE]')) {
const data = JSON.parse(line.slice(5));
yield data;
} else if (line.startsWith('error:')) {
const data = JSON.parse(line.slice(6));
throw new Error(data.message || 'Unknown error');
}
}
};
const mainApp = createApp({
components: {
VueMarkdown,
SettingsModalShortInput,
MessageBubble,
},
data() {
return {
conversations: StorageUtils.getAllConversations(),
/** @type {Array<Message>} */
messages: [],
viewingConvId: StorageUtils.getNewConvId(),
inputMsg: '',
isGenerating: false,
/** @type {Message | null} */
pendingMsg: null, // the on-going message from assistant
stopGeneration: () => {},
selectedTheme: StorageUtils.getTheme(),
config: StorageUtils.getConfig(),
showConfigDialog: false,
// const
themes: THEMES,
/** @type {CONFIG_DEFAULT} */
configDefault: {...CONFIG_DEFAULT},
configInfo: {...CONFIG_INFO},
isDev,
}
},
computed: {},
mounted() {
document.getElementById('app').classList.remove('opacity-0'); // show app
// scroll to the bottom when the pending message height is updated
const pendingMsgElem = document.getElementById('pending-msg');
const resizeObserver = new ResizeObserver(() => {
if (this.isGenerating) chatScrollToBottom(true);
});
resizeObserver.observe(pendingMsgElem);
this.setSelectedTheme(this.selectedTheme);
},
watch: {
viewingConvId: function(val, oldVal) {
if (val != oldVal) {
this.fetchMessages();
chatScrollToBottom();
this.hideSidebar();
}
}
},
methods: {
hideSidebar() {
document.getElementById('toggle-drawer').checked = false;
},
setSelectedTheme(theme) {
this.selectedTheme = theme;
document.body.setAttribute('data-theme', theme);
document.body.setAttribute('data-color-scheme', daisyuiThemes[theme]?.['color-scheme'] ?? 'auto');
StorageUtils.setTheme(theme);
},
newConversation() {
if (this.isGenerating) return;
this.viewingConvId = StorageUtils.getNewConvId();
},
setViewingConv(convId) {
if (this.isGenerating) return;
this.viewingConvId = convId;
},
deleteConv(convId) {
if (this.isGenerating) return;
if (window.confirm('Are you sure you want to delete this conversation?')) {
StorageUtils.remove(convId);
if (this.viewingConvId === convId) {
this.viewingConvId = StorageUtils.getNewConvId();
}
this.fetchConversation();
this.fetchMessages();
}
},
downloadConv(convId) {
const conversation = StorageUtils.getOneConversation(convId);
if (!conversation) {
alert('Conversation not found.');
return;
}
const conversationJson = JSON.stringify(conversation, null, 2);
const blob = new Blob([conversationJson], { type: 'application/json' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `conversation_${convId}.json`;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
URL.revokeObjectURL(url);
},
async sendMessage() {
// prevent sending empty message
// typing while generating is allowed, but sending is not (to match the UX of other chat apps)
if (!this.inputMsg || this.isGenerating) return;
const currConvId = this.viewingConvId;
StorageUtils.appendMsg(currConvId, {
id: Date.now(),
role: 'user',
content: this.inputMsg,
});
this.fetchConversation();
this.fetchMessages();
this.inputMsg = '';
this.generateMessage(currConvId);
chatScrollToBottom();
},
async generateMessage(currConvId) {
if (this.isGenerating) return;
this.pendingMsg = { id: Date.now()+1, role: 'assistant', content: null };
this.isGenerating = true;
try {
/** @type {CONFIG_DEFAULT} */
const config = this.config;
const abortController = new AbortController();
this.stopGeneration = () => abortController.abort();
/** @type {Array<APIMessage>} */
let messages = [
{ role: 'system', content: config.systemMessage },
...normalizeMsgsForAPI(this.messages),
];
if (config.excludeThoughtOnReq) {
messages = filterThoughtFromMsgs(messages);
}
if (isDev) console.log({messages});
const params = {
messages,
stream: true,
cache_prompt: true,
samplers: config.samplers,
temperature: config.temperature,
dynatemp_range: config.dynatemp_range,
dynatemp_exponent: config.dynatemp_exponent,
top_k: config.top_k,
top_p: config.top_p,
min_p: config.min_p,
typical_p: config.typical_p,
xtc_probability: config.xtc_probability,
xtc_threshold: config.xtc_threshold,
repeat_last_n: config.repeat_last_n,
repeat_penalty: config.repeat_penalty,
presence_penalty: config.presence_penalty,
frequency_penalty: config.frequency_penalty,
dry_multiplier: config.dry_multiplier,
dry_base: config.dry_base,
dry_allowed_length: config.dry_allowed_length,
dry_penalty_last_n: config.dry_penalty_last_n,
max_tokens: config.max_tokens,
timings_per_token: !!config.showTokensPerSecond,
...(config.custom.length ? JSON.parse(config.custom) : {}),
};
const chunks = sendSSEPostRequest(`${BASE_URL}/v1/chat/completions`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
...(config.apiKey ? {'Authorization': `Bearer ${config.apiKey}`} : {})
},
body: JSON.stringify(params),
signal: abortController.signal,
});
for await (const chunk of chunks) {
const stop = chunk.stop;
const addedContent = chunk.choices[0].delta.content;
const lastContent = this.pendingMsg.content || '';
if (addedContent) {
this.pendingMsg = {
id: this.pendingMsg.id,
role: 'assistant',
content: lastContent + addedContent,
};
}
const timings = chunk.timings;
if (timings && config.showTokensPerSecond) {
// only extract what's really needed, to save some space
this.pendingMsg.timings = {
prompt_n: timings.prompt_n,
prompt_ms: timings.prompt_ms,
predicted_n: timings.predicted_n,
predicted_ms: timings.predicted_ms,
};
}
}
StorageUtils.appendMsg(currConvId, this.pendingMsg);
this.fetchConversation();
this.fetchMessages();
setTimeout(() => document.getElementById('msg-input').focus(), 1);
} catch (error) {
if (error.name === 'AbortError') {
// user stopped the generation via stopGeneration() function
StorageUtils.appendMsg(currConvId, this.pendingMsg);
this.fetchConversation();
this.fetchMessages();
} else {
console.error(error);
alert(error);
// pop last user message
const lastUserMsg = StorageUtils.popMsg(currConvId);
this.inputMsg = lastUserMsg ? lastUserMsg.content : '';
}
}
this.pendingMsg = null;
this.isGenerating = false;
this.stopGeneration = () => {};
this.fetchMessages();
chatScrollToBottom();
},
// message actions
regenerateMsg(msg) {
if (this.isGenerating) return;
// TODO: somehow keep old history (like how ChatGPT has different "tree"). This can be done by adding "sub-conversations" with "subconv-" prefix, and new message will have a list of subconvIds
const currConvId = this.viewingConvId;
StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
this.fetchConversation();
this.fetchMessages();
this.generateMessage(currConvId);
},
editUserMsgAndRegenerate(msg) {
if (this.isGenerating) return;
const currConvId = this.viewingConvId;
const newContent = msg.content;
StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
StorageUtils.appendMsg(currConvId, {
id: Date.now(),
role: 'user',
content: newContent,
});
this.fetchConversation();
this.fetchMessages();
this.generateMessage(currConvId);
},
// settings dialog methods
closeAndSaveConfigDialog() {
try {
if (this.config.custom.length) JSON.parse(this.config.custom);
} catch (error) {
alert('Invalid JSON for custom config. Please either fix it or leave it empty.');
return;
}
for (const key of CONFIG_NUMERIC_KEYS) {
if (isNaN(this.config[key]) || this.config[key].toString().trim().length === 0) {
alert(`Invalid number for ${key} (expected an integer or a float)`);
return;
}
this.config[key] = parseFloat(this.config[key]);
}
this.showConfigDialog = false;
StorageUtils.setConfig(this.config);
},
closeAndDiscardConfigDialog() {
this.showConfigDialog = false;
this.config = StorageUtils.getConfig();
},
resetConfigDialog() {
if (window.confirm('Are you sure you want to reset all settings?')) {
this.config = {...CONFIG_DEFAULT};
}
},
// sync state functions
fetchConversation() {
this.conversations = StorageUtils.getAllConversations();
},
fetchMessages() {
this.messages = StorageUtils.getOneConversation(this.viewingConvId)?.messages ?? [];
},
// debug functions
async debugImportDemoConv() {
const res = await fetch('/demo-conversation.json');
const demoConv = await res.json();
StorageUtils.remove(demoConv.id);
for (const msg of demoConv.messages) {
StorageUtils.appendMsg(demoConv.id, msg);
}
this.fetchConversation();
}
},
});
mainApp.config.errorHandler = alert;
try {
mainApp.mount('#app');
} catch (err) {
console.error(err);
document.getElementById('app').innerHTML = `<div style="margin:2em auto">
Failed to start app. Please try clearing localStorage and try again.<br/>
<br/>
<button class="btn" onClick="localStorage.clear(); window.location.reload();">Clear localStorage</button>
</div>`;
}
/**
* filter out redundant fields upon sending to API
* @param {Array<APIMessage>} messages
* @returns {Array<APIMessage>}
*/
function normalizeMsgsForAPI(messages) {
return messages.map((msg) => {
return {
role: msg.role,
content: msg.content,
};
});
}
/**
* recommended for DeepSeek-R1: filters out content between <think> and </think> tags
* @param {Array<APIMessage>} messages
* @returns {Array<APIMessage>}
*/
function filterThoughtFromMsgs(messages) {
return messages.map((msg) => {
return {
role: msg.role,
content: msg.role === 'assistant'
? msg.content.split('</think>').at(-1).trim()
: msg.content,
};
});
}
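A worked example of the thought-filtering helper above (values are illustrative):

// DeepSeek-R1-style output: everything up to the last </think> is dropped.
const msgs = [
  { role: 'assistant', content: '<think>chain of thought</think>The answer is 42.' },
];
console.log(filterThoughtFromMsgs(msgs));
// → [{ role: 'assistant', content: 'The answer is 42.' }]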

View file

@ -0,0 +1,10 @@
import { StrictMode } from 'react';
import { createRoot } from 'react-dom/client';
import './index.scss';
import App from './App.tsx';
createRoot(document.getElementById('root')!).render(
<StrictMode>
<App />
</StrictMode>
);

View file

@ -0,0 +1,327 @@
import React, { createContext, useContext, useEffect, useState } from 'react';
import {
APIMessage,
CanvasData,
Conversation,
Message,
PendingMessage,
} from './types';
import StorageUtils from './storage';
import {
filterThoughtFromMsgs,
normalizeMsgsForAPI,
getSSEStreamAsync,
} from './misc';
import { BASE_URL, CONFIG_DEFAULT, isDev } from '../Config';
import { matchPath, useLocation } from 'react-router';
interface AppContextValue {
// conversations and messages
viewingConversation: Conversation | null;
pendingMessages: Record<Conversation['id'], PendingMessage>;
isGenerating: (convId: string) => boolean;
sendMessage: (
convId: string,
content: string,
onChunk?: CallbackGeneratedChunk
) => Promise<boolean>;
stopGenerating: (convId: string) => void;
replaceMessageAndGenerate: (
convId: string,
origMsgId: Message['id'],
content?: string,
onChunk?: CallbackGeneratedChunk
) => Promise<void>;
// canvas
canvasData: CanvasData | null;
setCanvasData: (data: CanvasData | null) => void;
// config
config: typeof CONFIG_DEFAULT;
saveConfig: (config: typeof CONFIG_DEFAULT) => void;
showSettings: boolean;
setShowSettings: (show: boolean) => void;
}
// for now, this callback is only used for scrolling to the bottom of the chat
type CallbackGeneratedChunk = () => void;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const AppContext = createContext<AppContextValue>({} as any);
export const AppContextProvider = ({
children,
}: {
children: React.ReactElement;
}) => {
const { pathname } = useLocation();
const params = matchPath('/chat/:convId', pathname);
const convId = params?.params?.convId;
const [viewingConversation, setViewingConversation] =
useState<Conversation | null>(null);
const [pendingMessages, setPendingMessages] = useState<
Record<Conversation['id'], PendingMessage>
>({});
const [aborts, setAborts] = useState<
Record<Conversation['id'], AbortController>
>({});
const [config, setConfig] = useState(StorageUtils.getConfig());
const [canvasData, setCanvasData] = useState<CanvasData | null>(null);
const [showSettings, setShowSettings] = useState(false);
// handle change when the convId from URL is changed
useEffect(() => {
// also reset the canvas data
setCanvasData(null);
const handleConversationChange = (changedConvId: string) => {
if (changedConvId !== convId) return;
setViewingConversation(StorageUtils.getOneConversation(convId));
};
StorageUtils.onConversationChanged(handleConversationChange);
setViewingConversation(StorageUtils.getOneConversation(convId ?? ''));
return () => {
StorageUtils.offConversationChanged(handleConversationChange);
};
}, [convId]);
const setPending = (convId: string, pendingMsg: PendingMessage | null) => {
// if pendingMsg is null, remove the key from the object
if (!pendingMsg) {
setPendingMessages((prev) => {
const newState = { ...prev };
delete newState[convId];
return newState;
});
} else {
setPendingMessages((prev) => ({ ...prev, [convId]: pendingMsg }));
}
};
const setAbort = (convId: string, controller: AbortController | null) => {
if (!controller) {
setAborts((prev) => {
const newState = { ...prev };
delete newState[convId];
return newState;
});
} else {
setAborts((prev) => ({ ...prev, [convId]: controller }));
}
};
////////////////////////////////////////////////////////////////////////
// public functions
const isGenerating = (convId: string) => !!pendingMessages[convId];
const generateMessage = async (
convId: string,
onChunk?: CallbackGeneratedChunk
) => {
if (isGenerating(convId)) return;
const config = StorageUtils.getConfig();
const currConversation = StorageUtils.getOneConversation(convId);
if (!currConversation) {
throw new Error('Current conversation is not found');
}
const abortController = new AbortController();
setAbort(convId, abortController);
let pendingMsg: PendingMessage = {
id: Date.now() + 1,
role: 'assistant',
content: null,
};
setPending(convId, pendingMsg);
try {
// prepare messages for API
let messages: APIMessage[] = [
...(config.systemMessage.length === 0
? []
: [{ role: 'system', content: config.systemMessage } as APIMessage]),
...normalizeMsgsForAPI(currConversation?.messages ?? []),
];
if (config.excludeThoughtOnReq) {
messages = filterThoughtFromMsgs(messages);
}
if (isDev) console.log({ messages });
// prepare params
const params = {
messages,
stream: true,
cache_prompt: true,
samplers: config.samplers,
temperature: config.temperature,
dynatemp_range: config.dynatemp_range,
dynatemp_exponent: config.dynatemp_exponent,
top_k: config.top_k,
top_p: config.top_p,
min_p: config.min_p,
typical_p: config.typical_p,
xtc_probability: config.xtc_probability,
xtc_threshold: config.xtc_threshold,
repeat_last_n: config.repeat_last_n,
repeat_penalty: config.repeat_penalty,
presence_penalty: config.presence_penalty,
frequency_penalty: config.frequency_penalty,
dry_multiplier: config.dry_multiplier,
dry_base: config.dry_base,
dry_allowed_length: config.dry_allowed_length,
dry_penalty_last_n: config.dry_penalty_last_n,
max_tokens: config.max_tokens,
timings_per_token: !!config.showTokensPerSecond,
...(config.custom.length ? JSON.parse(config.custom) : {}),
};
// send request
const fetchResponse = await fetch(`${BASE_URL}/v1/chat/completions`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
...(config.apiKey
? { Authorization: `Bearer ${config.apiKey}` }
: {}),
},
body: JSON.stringify(params),
signal: abortController.signal,
});
if (fetchResponse.status !== 200) {
const body = await fetchResponse.json();
throw new Error(body?.error?.message || 'Unknown error');
}
const chunks = getSSEStreamAsync(fetchResponse);
for await (const chunk of chunks) {
// const stop = chunk.stop;
if (chunk.error) {
throw new Error(chunk.error?.message || 'Unknown error');
}
const addedContent = chunk.choices[0].delta.content;
const lastContent = pendingMsg.content || '';
if (addedContent) {
pendingMsg = {
id: pendingMsg.id,
role: 'assistant',
content: lastContent + addedContent,
};
}
const timings = chunk.timings;
if (timings && config.showTokensPerSecond) {
// only extract what's really needed, to save some space
pendingMsg.timings = {
prompt_n: timings.prompt_n,
prompt_ms: timings.prompt_ms,
predicted_n: timings.predicted_n,
predicted_ms: timings.predicted_ms,
};
}
setPending(convId, pendingMsg);
onChunk?.();
}
} catch (err) {
setPending(convId, null);
if ((err as Error).name === 'AbortError') {
// user stopped the generation via stopGeneration() function
// we can safely ignore this error
} else {
console.error(err);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
alert((err as any)?.message ?? 'Unknown error');
throw err; // rethrow
}
}
if (pendingMsg.content) {
StorageUtils.appendMsg(currConversation.id, {
id: pendingMsg.id,
content: pendingMsg.content,
role: pendingMsg.role,
timings: pendingMsg.timings,
});
}
setPending(convId, null);
onChunk?.(); // trigger scroll to bottom
};
const sendMessage = async (
convId: string,
content: string,
onChunk?: CallbackGeneratedChunk
): Promise<boolean> => {
if (isGenerating(convId) || content.trim().length === 0) return false;
StorageUtils.appendMsg(convId, {
id: Date.now(),
role: 'user',
content,
});
try {
await generateMessage(convId, onChunk);
return true;
} catch (_) {
// rollback
StorageUtils.popMsg(convId);
}
return false;
};
const stopGenerating = (convId: string) => {
setPending(convId, null);
aborts[convId]?.abort();
};
// if content is undefined, we remove last assistant message
const replaceMessageAndGenerate = async (
convId: string,
origMsgId: Message['id'],
content?: string,
onChunk?: CallbackGeneratedChunk
) => {
if (isGenerating(convId)) return;
StorageUtils.filterAndKeepMsgs(convId, (msg) => msg.id < origMsgId);
if (content) {
StorageUtils.appendMsg(convId, {
id: Date.now(),
role: 'user',
content,
});
}
await generateMessage(convId, onChunk);
};
const saveConfig = (config: typeof CONFIG_DEFAULT) => {
StorageUtils.setConfig(config);
setConfig(config);
};
return (
<AppContext.Provider
value={{
isGenerating,
viewingConversation,
pendingMessages,
sendMessage,
stopGenerating,
replaceMessageAndGenerate,
canvasData,
setCanvasData,
config,
saveConfig,
showSettings,
setShowSettings,
}}
>
{children}
</AppContext.Provider>
);
};
export const useAppContext = () => useContext(AppContext);
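For illustration, a hypothetical consumer of this context (the component and prop names are assumptions):

import { useAppContext } from './AppContext';

// A send/stop button that reflects the per-conversation generating state.
function SendButton({ convId, draft }: { convId: string; draft: string }) {
  const { isGenerating, sendMessage, stopGenerating } = useAppContext();
  return isGenerating(convId) ? (
    <button onClick={() => stopGenerating(convId)}>Stop</button>
  ) : (
    <button onClick={() => sendMessage(convId, draft)}>Send</button>
  );
}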

View file

@ -0,0 +1,38 @@
export const XCloseButton: React.ElementType<
React.ClassAttributes<HTMLButtonElement> &
React.HTMLAttributes<HTMLButtonElement>
> = ({ className, ...props }) => (
<button className={`btn btn-square btn-sm ${className ?? ''}`} {...props}>
<svg
xmlns="http://www.w3.org/2000/svg"
className="h-6 w-6"
fill="none"
viewBox="0 0 24 24"
stroke="currentColor"
>
<path
strokeLinecap="round"
strokeLinejoin="round"
strokeWidth="2"
d="M6 18L18 6M6 6l12 12"
/>
</svg>
</button>
);
export const OpenInNewTab = ({
href,
children,
}: {
href: string;
children: string;
}) => (
<a
className="underline"
href={href}
target="_blank"
rel="noopener noreferrer"
>
{children}
</a>
);

View file

@ -0,0 +1,90 @@
// @ts-expect-error this package does not have typing
import TextLineStream from 'textlinestream';
import { APIMessage, Message } from './types';
// ponyfill for missing ReadableStream asyncIterator on Safari
import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
import { isDev } from '../Config';
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export const isString = (x: any) => !!x.toLowerCase;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export const isBoolean = (x: any) => x === true || x === false;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export const isNumeric = (n: any) => !isString(n) && !isNaN(n) && !isBoolean(n);
export const escapeAttr = (str: string) =>
str.replace(/>/g, '&gt;').replace(/"/g, '&quot;');
// wrapper for SSE
export async function* getSSEStreamAsync(fetchResponse: Response) {
if (!fetchResponse.body) throw new Error('Response body is empty');
const lines: ReadableStream<string> = fetchResponse.body
.pipeThrough(new TextDecoderStream())
.pipeThrough(new TextLineStream());
// @ts-expect-error asyncIterator complains about type, but it should work
for await (const line of asyncIterator(lines)) {
if (isDev) console.log({ line });
if (line.startsWith('data:') && !line.endsWith('[DONE]')) {
const data = JSON.parse(line.slice(5));
yield data;
} else if (line.startsWith('error:')) {
const data = JSON.parse(line.slice(6));
throw new Error(data.message || 'Unknown error');
}
}
}
// copy text to clipboard
export const copyStr = (textToCopy: string) => {
// Navigator clipboard api needs a secure context (https)
if (navigator.clipboard && window.isSecureContext) {
navigator.clipboard.writeText(textToCopy);
} else {
// Use the 'out of viewport hidden text area' trick
const textArea = document.createElement('textarea');
textArea.value = textToCopy;
// Move textarea out of the viewport so it's not visible
textArea.style.position = 'absolute';
textArea.style.left = '-999999px';
document.body.prepend(textArea);
textArea.select();
document.execCommand('copy');
}
};
/**
* filter out redundant fields upon sending to API
*/
export function normalizeMsgsForAPI(messages: Message[]) {
return messages.map((msg) => {
return {
role: msg.role,
content: msg.content,
};
}) as APIMessage[];
}
/**
* recommended for DeepSeek-R1: filters out content between <think> and </think> tags
*/
export function filterThoughtFromMsgs(messages: APIMessage[]) {
return messages.map((msg) => {
return {
role: msg.role,
content:
msg.role === 'assistant'
? msg.content.split('</think>').at(-1)!.trim()
: msg.content,
} as APIMessage;
});
}
export function classNames(classes: Record<string, boolean>): string {
return Object.entries(classes)
.filter(([_, value]) => value)
.map(([key, _]) => key)
.join(' ');
}
export const delay = (ms: number) =>
new Promise((resolve) => setTimeout(resolve, ms));
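A sketch of consuming getSSEStreamAsync from above (the endpoint and payload are assumptions based on the context code):

import { getSSEStreamAsync } from './misc';

async function streamDemo() {
  const res = await fetch('/v1/chat/completions', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ messages: [{ role: 'user', content: 'hi' }], stream: true }),
  });
  // each yielded chunk is the parsed JSON payload of one `data:` SSE line
  for await (const chunk of getSSEStreamAsync(res)) {
    console.log(chunk.choices?.[0]?.delta?.content ?? '');
  }
}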

View file

@ -0,0 +1,138 @@
// conversations are stored in localStorage
// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
import { CONFIG_DEFAULT } from '../Config';
import { Conversation, Message } from './types';
const event = new EventTarget();
type CallbackConversationChanged = (convId: string) => void;
let onConversationChangedHandlers: [
CallbackConversationChanged,
EventListener,
][] = [];
const dispatchConversationChange = (convId: string) => {
event.dispatchEvent(
new CustomEvent('conversationChange', { detail: { convId } })
);
};
// convId is a string prefixed with 'conv-'
const StorageUtils = {
/**
* manage conversations
*/
getAllConversations(): Conversation[] {
const res = [];
for (const key in localStorage) {
if (key.startsWith('conv-')) {
res.push(JSON.parse(localStorage.getItem(key) ?? '{}'));
}
}
res.sort((a, b) => b.lastModified - a.lastModified);
return res;
},
/**
* can return null if convId does not exist
*/
getOneConversation(convId: string): Conversation | null {
return JSON.parse(localStorage.getItem(convId) || 'null');
},
/**
* if convId does not exist, create one
*/
appendMsg(convId: string, msg: Message): void {
if (msg.content === null) return;
const conv = StorageUtils.getOneConversation(convId) || {
id: convId,
lastModified: Date.now(),
messages: [],
};
conv.messages.push(msg);
conv.lastModified = Date.now();
localStorage.setItem(convId, JSON.stringify(conv));
dispatchConversationChange(convId);
},
/**
* Get new conversation id
*/
getNewConvId(): string {
return `conv-${Date.now()}`;
},
/**
* remove conversation by id
*/
remove(convId: string): void {
localStorage.removeItem(convId);
dispatchConversationChange(convId);
},
/**
* keep only the messages that match the predicate
*/
filterAndKeepMsgs(
convId: string,
predicate: (msg: Message) => boolean
): void {
const conv = StorageUtils.getOneConversation(convId);
if (!conv) return;
conv.messages = conv.messages.filter(predicate);
conv.lastModified = Date.now();
localStorage.setItem(convId, JSON.stringify(conv));
dispatchConversationChange(convId);
},
/**
* remove last message from conversation
*/
popMsg(convId: string): Message | undefined {
const conv = StorageUtils.getOneConversation(convId);
if (!conv) return;
const msg = conv.messages.pop();
conv.lastModified = Date.now();
if (conv.messages.length === 0) {
StorageUtils.remove(convId);
} else {
localStorage.setItem(convId, JSON.stringify(conv));
}
dispatchConversationChange(convId);
return msg;
},
// event listeners
onConversationChanged(callback: CallbackConversationChanged) {
const fn = (e: Event) => callback((e as CustomEvent).detail.convId);
onConversationChangedHandlers.push([callback, fn]);
event.addEventListener('conversationChange', fn);
},
offConversationChanged(callback: CallbackConversationChanged) {
const idx = onConversationChangedHandlers.findIndex(([cb, _]) => cb === callback);
if (idx !== -1) {
event.removeEventListener('conversationChange', onConversationChangedHandlers[idx][1]);
// remove only the handler being unregistered, not all of them
onConversationChangedHandlers.splice(idx, 1);
}
},
// manage config
getConfig(): typeof CONFIG_DEFAULT {
const savedVal = JSON.parse(localStorage.getItem('config') || '{}');
// to prevent breaking changes in the future, we always provide default value for missing keys
return {
...CONFIG_DEFAULT,
...savedVal,
};
},
setConfig(config: typeof CONFIG_DEFAULT) {
localStorage.setItem('config', JSON.stringify(config));
},
getTheme(): string {
return localStorage.getItem('theme') || 'auto';
},
setTheme(theme: string) {
if (theme === 'auto') {
localStorage.removeItem('theme');
} else {
localStorage.setItem('theme', theme);
}
},
};
export default StorageUtils;
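A quick round-trip sketch of this storage API (the message content is illustrative):

import StorageUtils from './storage';

// Create a conversation id, subscribe to change events, then append a message;
// appendMsg creates the conversation on first write and fires the event.
const convId = StorageUtils.getNewConvId();
StorageUtils.onConversationChanged((id) => console.log('changed:', id));
StorageUtils.appendMsg(convId, { id: Date.now(), role: 'user', content: 'hi' });
console.log(StorageUtils.getOneConversation(convId)?.messages.length); // 1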

View file

@ -0,0 +1,36 @@
export interface TimingReport {
prompt_n: number;
prompt_ms: number;
predicted_n: number;
predicted_ms: number;
}
export interface Message {
id: number;
role: 'user' | 'assistant' | 'system';
content: string;
timings?: TimingReport;
}
export type APIMessage = Pick<Message, 'role' | 'content'>;
export interface Conversation {
id: string; // format: `conv-{timestamp}`
lastModified: number; // timestamp from Date.now()
messages: Message[];
}
export type PendingMessage = Omit<Message, 'content'> & {
content: string | null;
};
export enum CanvasType {
PY_INTERPRETER,
}
export interface CanvasPyInterpreter {
type: CanvasType.PY_INTERPRETER;
content: string;
}
export type CanvasData = CanvasPyInterpreter;
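As a small sketch of how TimingReport is typically consumed (mirroring the tokens-per-second math used by the message bubble):

import { TimingReport } from './types';

// ms → seconds, then token count divided by elapsed time
function tokensPerSecond(t: TimingReport) {
  return {
    prompt: t.prompt_n / (t.prompt_ms / 1000),
    predicted: t.predicted_n / (t.predicted_ms / 1000),
  };
}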

View file

@ -0,0 +1 @@
/// <reference types="vite/client" />

View file

@ -0,0 +1,26 @@
{
"compilerOptions": {
"tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo",
"target": "ES2021",
"useDefineForClassFields": true,
"lib": ["ES2021", "DOM", "DOM.Iterable"],
"module": "ESNext",
"skipLibCheck": true,
/* Bundler mode */
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"isolatedModules": true,
"moduleDetection": "force",
"noEmit": true,
"jsx": "react-jsx",
/* Linting */
"strict": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"noFallthroughCasesInSwitch": true,
"noUncheckedSideEffectImports": true
},
"include": ["src"]
}

View file

@ -0,0 +1,7 @@
{
"files": [],
"references": [
{ "path": "./tsconfig.app.json" },
{ "path": "./tsconfig.node.json" }
]
}

View file

@ -0,0 +1,24 @@
{
"compilerOptions": {
"tsBuildInfoFile": "./node_modules/.tmp/tsconfig.node.tsbuildinfo",
"target": "ES2022",
"lib": ["ES2023"],
"module": "ESNext",
"skipLibCheck": true,
/* Bundler mode */
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"isolatedModules": true,
"moduleDetection": "force",
"noEmit": true,
/* Linting */
"strict": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"noFallthroughCasesInSwitch": true,
"noUncheckedSideEffectImports": true
},
"include": ["vite.config.ts"]
}

View file

@ -1,8 +1,11 @@
import { defineConfig, PluginOption } from 'vite';
import react from '@vitejs/plugin-react';
import { viteSingleFile } from 'vite-plugin-singlefile';
import path from 'path';
import fs from 'fs';
import zlib from 'zlib';
import path from 'node:path';
import fs from 'node:fs';
import zlib from 'node:zlib';
/* eslint-disable */
const MAX_BUNDLE_SIZE = 1.5 * 1024 * 1024; // only increase when absolutely necessary
@ -15,20 +18,26 @@ const GUIDE_FOR_FRONTEND = `
-->
`.trim();
const FRONTEND_PLUGINS = [react()];
const BUILD_PLUGINS = [
...FRONTEND_PLUGINS,
viteSingleFile(),
(function llamaCppPlugin() {
let config;
let config: any;
return {
name: 'llamacpp:build',
apply: 'build',
async configResolved(_config) {
async configResolved(_config: any) {
config = _config;
},
writeBundle() {
const outputIndexHtml = path.join(config.build.outDir, 'index.html');
const content = GUIDE_FOR_FRONTEND + '\n' + fs.readFileSync(outputIndexHtml, 'utf-8');
const compressed = zlib.gzipSync(Buffer.from(content, 'utf-8'), { level: 9 });
const content =
GUIDE_FOR_FRONTEND + '\n' + fs.readFileSync(outputIndexHtml, 'utf-8');
const compressed = zlib.gzipSync(Buffer.from(content, 'utf-8'), {
level: 9,
});
// because gzip header contains machine-specific info, we must remove these data from the header
// timestamp
@ -42,18 +51,30 @@ const BUILD_PLUGINS = [
if (compressed.byteLength > MAX_BUNDLE_SIZE) {
throw new Error(
`Bundle size is too large (${Math.ceil(compressed.byteLength / 1024)} KB).\n` +
`Please reduce the size of the frontend or increase MAX_BUNDLE_SIZE in vite.config.js.\n`,
`Please reduce the size of the frontend or increase MAX_BUNDLE_SIZE in vite.config.js.\n`
);
}
const targetOutputFile = path.join(config.build.outDir, '../../public/index.html.gz');
const targetOutputFile = path.join(
config.build.outDir,
'../../public/index.html.gz'
);
fs.writeFileSync(targetOutputFile, compressed);
}
}
},
} satisfies PluginOption;
})(),
];
/** @type {import('vite').UserConfig} */
export default {
plugins: process.env.ANALYZE ? [] : BUILD_PLUGINS,
};
export default defineConfig({
// @ts-ignore
plugins: process.env.ANALYZE ? FRONTEND_PLUGINS : BUILD_PLUGINS,
server: {
proxy: {
'/v1': 'http://localhost:8080',
},
headers: {
'Cross-Origin-Embedder-Policy': 'require-corp',
'Cross-Origin-Opener-Policy': 'same-origin',
},
},
});
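The gzip-header note above is what makes the bundle reproducible; a standalone sketch of the same idea (field offsets per RFC 1952):

import zlib from 'node:zlib';

// The 10-byte gzip header stores a 4-byte modification time (bytes 4..7)
// and an OS id (byte 9); normalizing them makes the output byte-identical
// across machines and build times.
function gzipDeterministic(content: Buffer): Buffer {
  const out = zlib.gzipSync(content, { level: 9 });
  out.writeUInt32LE(0, 4); // clear MTIME
  out[9] = 0x03; // force OS field to "Unix"
  return out;
}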

View file

@ -10,8 +10,6 @@ extern "C" {
#define GGML_VK_NAME "Vulkan"
#define GGML_VK_MAX_DEVICES 16
GGML_BACKEND_API void ggml_vk_instance_init(void);
// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);

View file

@ -989,19 +989,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
}
if (this_size > max_size) {
GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
__func__, t->name,
ggml_backend_buft_name(buft),
this_size, max_size);
for (size_t i = 0; i < n_buffers; i++) {
ggml_backend_buffer_free(buffers[i]);
}
free(buffers);
return NULL;
}
if ((cur_buf_size + this_size) > max_size) {
if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
// allocate tensors in the current buffer
if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
return NULL;

View file

@ -360,21 +360,15 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
#endif
#if defined(__loongarch_asx)
typedef union {
int32_t i;
float f;
} ft_union;
/* float type data load instructions */
static __m128 __lsx_vreplfr2vr_s(float val) {
ft_union fi_tmpval = {.f = val};
return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
static __m128 __lsx_vreplfr2vr_s(const float val) {
v4f32 res = {val, val, val, val};
return (__m128)res;
}
static __m256 __lasx_xvreplfr2vr_s(float val) {
ft_union fi_tmpval = {.f = val};
return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
static __m256 __lasx_xvreplfr2vr_s(const float val) {
v8f32 res = {val, val, val, val, val, val, val, val};
return (__m256)res;
}
#endif

View file

@ -297,6 +297,90 @@ static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4
static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
#endif
#if defined(__loongarch_sx)
static __m128i lsx_packs_w(__m128i a, __m128i b) {
__m128i tmp, tmp1;
tmp = __lsx_vsat_w(a, 15);
tmp1 = __lsx_vsat_w(b, 15);
return __lsx_vpickev_h(tmp1, tmp);
}
static __m128i lsx_packs_h(__m128i a, __m128i b) {
__m128i tmp, tmp1;
tmp = __lsx_vsat_h(a, 7);
tmp1 = __lsx_vsat_h(b, 7);
return __lsx_vpickev_b(tmp1, tmp);
}
static __m128i lsx_packus_h(__m128i a, __m128i b) {
__m128i tmp, tmp1;
tmp = __lsx_vsat_hu(a, 7);
tmp1 = __lsx_vsat_hu(b, 7);
return __lsx_vpickev_b(tmp1, tmp);
}
static __m128i lsx_maddubs_h(__m128i a, __m128i b) {
__m128i tmp1, tmp2;
tmp1 = __lsx_vmulwev_h_b(a, b);
tmp2 = __lsx_vmulwod_h_b(a, b);
return __lsx_vsadd_h(tmp1, tmp2);
}
static __m128i lsx_madd_h(__m128i a, __m128i b) {
__m128i tmp1, tmp2;
tmp1 = __lsx_vmulwev_w_h(a, b);
tmp2 = __lsx_vmulwod_w_h(a, b);
return __lsx_vadd_w(tmp1, tmp2);
}
static __m128i lsx_set_w(int32_t a, int32_t b, int32_t c, int32_t d) {
v4i32 __ret = {d, c, b, a};
return (__m128i)__ret;
}
static __m128i lsx_shuffle_b(__m128i a, __m128i b) {
__m128i mask_f, zero, tmp0, tmp2, mask;
int f = 0x8f;
mask_f = __lsx_vreplgr2vr_b(f);
zero = __lsx_vldi(0);
tmp0 = __lsx_vand_v(b, mask_f); // get mask with low 4 bit and sign bits
tmp0 = __lsx_vori_b(tmp0, 0x10); // make each mask or with 0x10 prepare for positive
mask = __lsx_vsle_b(zero, tmp0); // if mask >= 0, set mask
tmp2 = __lsx_vand_v(tmp0, mask); // maskout the in2 < ones
return __lsx_vshuf_b(a, zero, tmp2);
}
static __m128i lsx_hadd_h(__m128i a, __m128i b) {
__m128i tmp1 = __lsx_vpickev_h(b, a);
__m128i tmp2 = __lsx_vpickod_h(b, a);
return __lsx_vadd_h(tmp1, tmp2);
}
static __m128i lsx_hadd_w(__m128i a, __m128i b) {
__m128i tmp1 = __lsx_vpickev_w(b, a);
__m128i tmp2 = __lsx_vpickod_w(b, a);
return __lsx_vadd_w(tmp1, tmp2);
}
static __m128 lsx_hadd_s(__m128 a, __m128 b) {
__m128 tmp1 = (__m128)__lsx_vpickev_w((__m128i)b, (__m128i)a);
__m128 tmp2 = (__m128)__lsx_vpickod_w((__m128i)b, (__m128i)a);
return __lsx_vfadd_s(tmp1, tmp2);
}
static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
__m128 res_0 = lsx_hadd_s(a, b);
__m128 res_1 = lsx_hadd_s(c, d);
__m128 res = lsx_hadd_s(res_0, res_1);
res = lsx_hadd_s(res, res);
res = lsx_hadd_s(res, res);
return ((v4f32)res)[0];
}
#endif
#if defined(__loongarch_asx)
#ifdef __clang__
@ -395,11 +479,6 @@ static __m256i lasx_set_w(int e7, int e6, int e5, int e4, int e3, int e2, int e1
return (__m256i)__ret;
}
static __m128i lsx_set_w(int32_t a, int32_t b, int32_t c, int32_t d) {
v4i32 __ret = {d, c, b, a};
return (__m128i)__ret;
}
static __m256i lasx_set_d(int64_t a, int64_t b, int64_t c, int64_t d) {
v4i64 __ret = {d, c, b, a};
return (__m256i)__ret;
@ -409,18 +488,6 @@ static __m256i lasx_insertf128( __m128i x, __m128i y) {
return lasx_set_q(x, y);
}
static __m128i lsx_shuffle_b(__m128i a, __m128i b) {
__m128i mask_f, zero, tmp0, tmp2, mask;
int f = 0x8f;
mask_f = __lsx_vreplgr2vr_b(f);
zero = __lsx_vldi(0);
tmp0 = __lsx_vand_v(b, mask_f); // get mask with low 4 bit and sign bits
tmp0 = __lsx_vori_b(tmp0, 0x10); // make each mask or with 0x10 prepare for positive
mask = __lsx_vsle_b(zero, tmp0); // if mask >= 0, set mask
tmp2 = __lsx_vand_v(tmp0, mask); // maskout the in2 < ones
return __lsx_vshuf_b(a, zero, tmp2);
}
static __m256i lasx_shuffle_b(__m256i a, __m256i b) {
__m256i mask_f, zero, tmp0, tmp2, mask;
int f = 0x8f;
@ -434,30 +501,15 @@ static __m256i lasx_shuffle_b(__m256i a, __m256i b) {
}
static __m256i lasx_extu8_16(__m128i a) {
__m128i zero = __lsx_vldi(0);
__m128i vlo = __lsx_vilvl_b(zero, a);
__m128i vhi = __lsx_vilvh_b(zero, a);
return lasx_set_q(vhi, vlo);
return __lasx_vext2xv_hu_bu(____m256i(a));
}
static __m256i lasx_ext8_16(__m128i a) {
__m128i sign = __lsx_vslti_b(a, 0);
__m128i vlo = __lsx_vilvl_b(sign, a);
__m128i vhi = __lsx_vilvh_b(sign, a);
return lasx_set_q(vhi, vlo);
return __lasx_vext2xv_h_b(____m256i(a));
}
static __m256i lasx_ext16_32(__m128i a) {
__m256i tmp1;
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 0), 0);
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 1), 1);
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 2), 2);
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 3), 3);
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 4), 4);
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 5), 5);
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 6), 6);
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 7), 7);
return tmp1;
return __lasx_vext2xv_w_h(____m256i(a));
}
static __m128i lasx_extracti128( __m256i a, int pos) {
@ -482,25 +534,6 @@ static __m128 lasx_extractf128( __m256 a, int pos) {
return ret;
}
static __m128i lsx_hadd_h(__m128i a, __m128i b) {
__m128i tmp1 = __lsx_vpickev_h(b, a);
__m128i tmp2 = __lsx_vpickod_h(b, a);
return __lsx_vadd_h(tmp1, tmp2);
}
static __m128i lsx_hadd_w(__m128i a, __m128i b) {
__m128i tmp1 = __lsx_vpickev_w(b, a);
__m128i tmp2 = __lsx_vpickod_w(b, a);
return __lsx_vadd_w(tmp1, tmp2);
}
static __m128 lsx_hadd_s(__m128 a, __m128 b) {
__m128 tmp1 = (__m128)__lsx_vpickev_w((__m128i)b, (__m128i)a);
__m128 tmp2 = (__m128)__lsx_vpickod_w((__m128i)b, (__m128i)a);
return __lsx_vfadd_s(tmp1, tmp2);
}
static __m256i lasx_maddubs_h(__m256i a, __m256i b) {
__m256i tmp1, tmp2;
tmp1 = __lasx_xvmulwev_h_b(a, b);
@ -529,42 +562,6 @@ static __m256i lasx_packs_h(__m256i a, __m256i b) {
return __lasx_xvpickev_b(tmp1, tmp);
}
static __m128i lsx_packs_w(__m128i a, __m128i b) {
__m128i tmp, tmp1;
tmp = __lsx_vsat_w(a, 15);
tmp1 = __lsx_vsat_w(b, 15);
return __lsx_vpickev_h(tmp1, tmp);
}
static __m128i lsx_packs_h(__m128i a, __m128i b) {
__m128i tmp, tmp1;
tmp = __lsx_vsat_h(a, 7);
tmp1 = __lsx_vsat_h(b, 7);
return __lsx_vpickev_b(tmp1, tmp);
}
static __m128i lsx_packus_h(__m128i a, __m128i b) {
__m128i tmp, tmp1;
tmp = __lsx_vsat_hu(a, 7);
tmp1 = __lsx_vsat_hu(b, 7);
return __lsx_vpickev_b(tmp1, tmp);
}
static __m128i lsx_maddubs_h(__m128i a, __m128i b) {
__m128i tmp1, tmp2;
tmp1 = __lsx_vmulwev_h_b(a, b);
tmp2 = __lsx_vmulwod_h_b(a, b);
return __lsx_vsadd_h(tmp1, tmp2);
}
static __m128i lsx_madd_h(__m128i a, __m128i b) {
__m128i tmp1, tmp2;
tmp1 = __lsx_vmulwev_w_h(a, b);
tmp2 = __lsx_vmulwod_w_h(a, b);
return __lsx_vadd_w(tmp1, tmp2);
}
// multiply int8_t, add results pairwise twice
static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
// Get absolute values of x vectors
@ -580,12 +577,10 @@ static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
// horizontally add 8 floats
static inline float hsum_float_8(const __m256 x) {
__m128 res = lasx_extractf128(x, 1);
ft_union tmp;
res = __lsx_vfadd_s(res, lasx_extractf128(x, 0));
res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res));
res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0));
tmp.i = __lsx_vpickve2gr_w(res, 0);
return tmp.f;
return ((v4f32)res)[0];
}
// horizontally add 8 int32_t
@ -927,7 +922,6 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
#elif defined(__loongarch_asx)
for (int i = 0; i < nb; i++) {
ft_union fi;
__m256 v0 = (__m256)__lasx_xvld( x , 0);
__m256 v1 = (__m256)__lasx_xvld( x , 32);
__m256 v2 = (__m256)__lasx_xvld( x , 64);
@ -945,8 +939,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
__m128 tmp = max4;
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 ));
fi.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
const float max_scalar = fi.f;
const float max_scalar = ((v4f32)max4)[0];
// Quantize these floats
const float d = max_scalar / 127.f;
@ -1251,7 +1244,6 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
#elif defined(__loongarch_asx)
for (int i = 0; i < nb; i++) {
ft_union ft;
__m256 v0 = (__m256)__lasx_xvld( x , 0 );
__m256 v1 = (__m256)__lasx_xvld( x , 32 );
__m256 v2 = (__m256)__lasx_xvld( x , 64 );
@ -1269,8 +1261,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
__m128 tmp = max4;
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 ));
ft.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
const float max_scalar = ft.f;
const float max_scalar = ((v4f32)max4)[0];
// Quantize these floats
const float d = max_scalar / 127.f;
@ -2232,21 +2223,22 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
}
sumf = hsum_float_8(acc);
#elif defined(__loongarch_sx)
// set constants
const __m128i low_mask = __lsx_vreplgr2vr_b(0xF);
const __m128i off = __lsx_vreplgr2vr_b(8);
// Initialize accumulator with zeros
__m128 acc_0 = __lsx_vldi(0);
__m128 acc_1 = __lsx_vldi(0);
__m128 acc_2 = __lsx_vldi(0);
__m128 acc_3 = __lsx_vldi(0);
__m128 acc_0 = (__m128)__lsx_vldi(0);
__m128 acc_1 = (__m128)__lsx_vldi(0);
__m128 acc_2 = (__m128)__lsx_vldi(0);
__m128 acc_3 = (__m128)__lsx_vldi(0);
for (; ib + 1 < nb; ib += 2) {
// Compute combined scale for the block 0 and 1
const __m128 d_0_1 = __lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);
@ -2264,7 +2256,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
//_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
// Compute combined scale for the block 2 and 3
const __m128 d_2_3 = __lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );
const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );
const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);
@ -6141,9 +6133,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);
ft_union fi;
fi.i = __lsx_vpickve2gr_w(acc_m, 0);
*s = hsum_float_8(acc) + fi.f ;
*s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
#else
const uint8_t * scales = (const uint8_t*)&utmp[0];


@ -1078,29 +1078,23 @@ do { \
#define GGML_F16_STEP 32
#define GGML_F16_EPR 8
// F16 arithmetic is not supported by AVX, so we use F32 instead
// F16 arithmetic is not supported by LASX, so we use F32 instead
#define GGML_F32Cx8 __m256
#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
float tmp[8];
for (int i = 0; i < 8; i++) {
tmp[i] = GGML_FP16_TO_FP32(x[i]);
}
return (__m256)__lasx_xvld(tmp, 0);
__m256i a;
memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
a = __lasx_xvpermi_d(a, 0 | (1 << 4));
return __lasx_xvfcvtl_s_h(a);
}
static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
float arr[8];
__lasx_xvst(y, arr, 0);
for (int i = 0; i < 8; i++) {
x[i] = GGML_FP32_TO_FP16(arr[i]);
}
__m256i a = __lasx_xvfcvt_h_s(y, y);
a = __lasx_xvpermi_d(a, 0 | (2 << 2));
memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
}
#define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
@ -13862,9 +13856,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
tp->ec = GGML_STATUS_ABORTED;
}
ggml_barrier(state->threadpool);
if (node_n + 1 < cgraph->n_nodes) {
ggml_barrier(state->threadpool);
}
}
ggml_barrier(state->threadpool);
return 0;
}
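
The fix replaces the per-iteration conditional barrier with one unconditional barrier after the loop, so no worker can re-evaluate the loop condition against freed graph state while other threads are still inside. A self-contained sketch of the pattern with C++20 std::barrier (thread and node counts assumed, not the ggml threadpool itself):

#include <barrier>
#include <thread>
#include <vector>

int main() {
    const int n_threads = 4, n_nodes = 8;
    std::barrier sync(n_threads);
    std::vector<std::thread> workers;
    for (int t = 0; t < n_threads; t++) {
        workers.emplace_back([&] {
            for (int node = 0; node < n_nodes; node++) {
                // ... compute this thread's slice of the node ...
                if (node + 1 < n_nodes) {
                    sync.arrive_and_wait();  // separates consecutive nodes only
                }
            }
            // final rendezvous: no thread may return (and let shared state be
            // torn down) until every thread has finished the last node
            sync.arrive_and_wait();
        });
    }
    for (auto &w : workers) w.join();
    return 0;
}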


@ -16,7 +16,7 @@
#include "common.cuh"
#if CUDART_VERSION >= 11800
#if CUDART_VERSION >= 11080
static __device__ __forceinline__ int ggml_cuda_movmatrix(const int x) {
int ret = 0;
@ -50,7 +50,7 @@ static __device__ __forceinline__ int ggml_cuda_movmatrix(const int x) {
return ret_low | ret_high;
}
#endif // CUDART_VERSION >= 11800
#endif // CUDART_VERSION >= 11080
template <typename T>


@ -19,6 +19,10 @@
// max number of MTLCommandBuffer used to submit a graph for processing
#define GGML_METAL_MAX_COMMAND_BUFFERS 8
#ifndef TARGET_OS_VISION
#define TARGET_OS_VISION 0
#endif
// create residency sets only on macOS >= 15.0
#if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000 || \
TARGET_OS_IOS && __IPHONE_OS_VERSION_MAX_ALLOWED >= 180000 || \


@ -1045,7 +1045,28 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
ggml_free(ctx);
return false;
}
GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n", __func__, (void*)src->buffer, (void*)dst->buffer);
uint64_t src_size = (uint64_t) ggml_nbytes(src);
uint64_t dst_data = (uint64_t) dst->data;
uint64_t dst_base = (uint64_t) ggml_backend_buffer_get_base(dst->buffer);
uint64_t dst_buf_sz = (uint64_t) ggml_backend_buffer_get_size(dst->buffer);
if (dst_data + src_size > dst_base + dst_buf_sz) {
GGML_PRINT_DEBUG("[%s] out-of-bounds write in rpc_server::copy_tensor:\n"
" write range : [0x%" PRIx64 ", 0x%" PRIx64 "]\n"
" buffer base: [0x%" PRIx64 ", 0x%" PRIx64 "]\n",
__func__,
dst_data,
dst_data + src_size,
dst_base,
dst_base + dst_buf_sz);
ggml_free(ctx);
return false;
}
GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n",
__func__, (void*) src->buffer, (void*) dst->buffer);
response.result = ggml_backend_buffer_copy_tensor(src, dst);
ggml_free(ctx);
return true;
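
The guard added here is a plain interval check on untrusted RPC input: reject the copy when the write range [dst_data, dst_data + src_size) does not fit inside the destination buffer [dst_base, dst_base + dst_buf_sz). A standalone sketch of the predicate (integer-overflow handling simplified relative to a hardened version):

#include <stdbool.h>
#include <stdint.h>

// true when [dst_data, dst_data + src_size) fits inside
// [dst_base, dst_base + dst_buf_sz); mirrors the rejection branch above.
static bool write_in_bounds(uint64_t dst_data, uint64_t src_size,
                            uint64_t dst_base, uint64_t dst_buf_sz) {
    return dst_data >= dst_base &&
           dst_data + src_size <= dst_base + dst_buf_sz;
}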


@ -103,11 +103,10 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
name = std::regex_replace(name, std::regex("\\(TM\\)"), "");
auto global_mem_size = prop.get_global_mem_size()/1000000;
std::string xmx = gpu_has_xmx(device) ? "yes" : "no";
GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|%14s|\n", id, device_type.c_str(),
GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
name.c_str(), version.c_str(), prop.get_max_compute_units(),
prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str(), xmx.c_str());
global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
}
void ggml_backend_sycl_print_sycl_devices() {
@ -118,16 +117,16 @@ void ggml_backend_sycl_print_sycl_devices() {
GGML_LOG_INFO(
"| | | | "
" |Max | |Max |Global | | XMX |\n");
" |Max | |Max |Global | |\n");
GGML_LOG_INFO(
"| | | | "
" |compute|Max work|sub |mem | | or |\n");
" |compute|Max work|sub |mem | |\n");
GGML_LOG_INFO(
"|ID| Device Type| "
"Name|Version|units |group |group|size | Driver version| Tensor Cores |\n");
"Name|Version|units |group |group|size | Driver version|\n");
GGML_LOG_INFO(
"|--|-------------------|---------------------------------------|------"
"-|-------|--------|-----|-------|---------------------|--------------|\n");
"-|-------|--------|-----|-------|---------------------|\n");
for (int id = 0; id < device_count; ++id) {
sycl::device device = dpct::dev_mgr::instance().get_device(id);
@ -4537,14 +4536,17 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_NORM:
case GGML_OP_ADD:
case GGML_OP_ADD1:
case GGML_OP_LOG:
case GGML_OP_SUB:
case GGML_OP_MUL:
case GGML_OP_DIV:
return true;
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_GROUP_NORM:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SQRT:
@ -4576,7 +4578,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_SUM_ROWS:
case GGML_OP_ARGSORT:
case GGML_OP_ACC:
case GGML_OP_GROUP_NORM:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_LEAKY_RELU:

File diff suppressed because it is too large


@ -12,7 +12,7 @@ layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
#endif
void main() {
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
init_iq_shmem(gl_WorkGroupSize);
if (gl_LocalInvocationIndex.x != 0) {
return;


@ -217,7 +217,7 @@ void quantize(uint dst_idx, uint src_idx)
#endif
void main() {
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
init_iq_shmem(gl_WorkGroupSize);
if (gl_LocalInvocationIndex.x != 0) {
return;


@ -304,6 +304,42 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
}
#endif
#if defined(DATA_A_IQ4_XS)
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const uint ib32 = iqs / 32;
const uint iq = 16 * ib32 + (iqs % 16);
const uint sl = (data_a[a_offset + ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
const uint sh = (data_a[a_offset + ib].scales_h >> (2 * ib32)) & 3;
const uint qshift = (iqs & 16) >> 2;
u8vec2 qs = u8vec2(data_a[a_offset + ib].qs[iq], data_a[a_offset + ib].qs[iq + 1]);
qs = (qs >> qshift) & uint8_t(0xF);
const float dl = float(int(sl | (sh << 4)) - 32);
return dl * vec2(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y]);
}
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
const uint ib32 = iqs / 32;
const uint iq = 16 * ib32 + (iqs % 16);
const uint sl = (data_a[a_offset + ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
const uint sh = (data_a[a_offset + ib].scales_h >> (2 * ib32)) & 3;
const uint qshift = (iqs & 16) >> 2;
u8vec4 qs = u8vec4(
data_a[a_offset + ib].qs[iq + 0],
data_a[a_offset + ib].qs[iq + 1],
data_a[a_offset + ib].qs[iq + 2],
data_a[a_offset + ib].qs[iq + 3]
);
qs = (qs >> qshift) & uint8_t(0xF);
const float dl = float(int(sl | (sh << 4)) - 32);
return dl * vec4(
kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y],
kvalues_iq4nl[qs.z], kvalues_iq4nl[qs.w]);
}
#endif
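
Both dequantize functions decode the same 6-bit sub-block scale: four low bits from scales_l, two high bits from scales_h, biased by 32. Restated as scalar C++ for clarity (a sketch, using the field layout from the shader above):

#include <stdint.h>

// 6-bit scale of sub-block ib32 (0..7) in an iq4_xs block, biased by 32.
static int iq4xs_scale(const uint8_t scales_l[4], uint16_t scales_h, int ib32) {
    const int sl = (scales_l[ib32 / 2] >> (4 * (ib32 & 1))) & 0xF;  // low 4 bits
    const int sh = (scales_h >> (2 * ib32)) & 3;                    // high 2 bits
    return (sl | (sh << 4)) - 32;  // signed value in [-32, 31]
}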
#if defined(DATA_A_IQ4_NL)
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
@ -321,7 +357,7 @@ vec2 get_dm(uint ib, uint a_offset) {
}
#endif
#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
vec2 get_dm(uint ib, uint a_offset) {
return vec2(float(data_a[a_offset + ib].d), 0);
}


@ -323,15 +323,16 @@ float16_t dequantFuncIQ2_XXS(const in decodeBufIQ2_XXS bl, const in uint blockCo
const uint8_t qs = bl.block.qs[iqs];
const uint signscale = pack32(u16vec2(bl16.block.qs[4*ib32+2], bl16.block.qs[4*ib32+3]));
const float16_t dscale = bl.block.d * 0.25hf * (0.5hf + float16_t(signscale >> 28));
const float dscale = float(bl.block.d) * 0.25 * (0.5 + float(signscale >> 28));
uint sign = bitfieldExtract(signscale, 7 * int(ib8), 7);
sign |= bitCount(sign) << 7;
const uint8_t g = unpack8(iq2xxs_grid[qs][(idx & 4) >> 2])[idx & 3];
uint g2 = iq2xxs_grid[qs][(idx & 4) >> 2];
g2 >>= (idx & 2) * 8;
const vec2 g = vec2(unpack8(g2));
float16_t ret = dscale * float16_t(g) * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf);
return ret;
vec2 ret = dscale * g * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf);
return float16_t(ret[idx & 1]);
}
#endif
@ -350,14 +351,16 @@ float16_t dequantFuncIQ2_XS(const in decodeBufIQ2_XS bl, const in uint blockCoor
const uint iqs = (idx & 0xF8) >> 3; // 0..63
const uint16_t qs = bl.block.qs[iqs];
const float16_t dscale = bl.block.d * 0.25hf * (0.5hf + float16_t((bl.block.scales[is] >> sshift) & 0xF));
const float dscale = float(bl.block.d) * 0.25 * (0.5 + float((bl.block.scales[is] >> sshift) & 0xF));
uint sign = uint(qs >> 9);
sign |= bitCount(sign) << 7;
const uint8_t g = unpack8(iq2xs_grid[qs & 0x1FF][(idx & 4) >> 2])[idx & 3];
uint g2 = iq2xs_grid[qs & 0x1FF][(idx & 4) >> 2];
g2 >>= (idx & 2) * 8;
const vec2 g = vec2(unpack8(g2));
float16_t ret = dscale * float16_t(g) * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf);
return ret;
vec2 ret = dscale * g * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf);
return float16_t(ret[idx & 1]);
}
#endif
@ -369,24 +372,23 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ2
float16_t dequantFuncIQ2_S(const in decodeBufIQ2_S bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
uint idx = coordInBlock[1];
uint lsb = idx & 1;
idx /= 2;
const uint ib8 = (idx % 128) / 4; // 0..31
const uint ib32 = ib8 / 4; // 0..7
const uint ib32 = (idx & 0xE0) >> 5; // 0..7
const uint ib8 = (idx & 0xF8) >> 3; // 0..31
const uint qhshift = 2 * (ib8 % 4);
const uint scale = (bl.block.scales[ib32] >> (2 * (ib8 & 2))) & 0xf;
const uint scale = (bl.block.scales[ib32] >> ((idx & 0x10) >> 2)) & 0xf;
const uint qs = bl.block.qs[ib8];
const uint qh = bl.block.qh[ib32];
const uint qhshift = 2 * (ib8 % 4);
const uint sign = bl.block.qs[QUANT_K / 8 + ib8] >> (2 * (idx % 4));
const uint sign = bl.block.qs[QUANT_K / 8 + ib8] >> (idx & 0x6);
const float d = float(bl.block.d);
const float db = d * 0.25 * (0.5 + scale);
const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign))));
const uint16_t grid = unpack16(iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)][(idx & 2) >> 1])[idx & 1];
const vec2 v = db * vec2(sign01) * vec2(unpack8(grid));
return float16_t(v[lsb]);
const ivec2 sign01 = 1 - (2 & ivec2(sign << 1, sign));
uint g2 = iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)][(idx & 4) >> 2];
g2 >>= (idx & 2) * 8;
const vec2 v = db * vec2(sign01) * vec2(unpack8(g2));
return float16_t(v[idx & 1]);
}
#endif
@ -401,28 +403,25 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ3
float16_t dequantFuncIQ3_XXS(const in decodeBufIQ3_XXS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
decodeBufIQ3_XXS_packed16 bl16 = decodeBufIQ3_XXS_packed16(bl);
uint idx = coordInBlock[1];
uint lsb = idx & 1;
idx /= 2;
const uint iqs = (idx % 128) / 2; // 0..63
const uint is = QUANT_K / 4 + 4 * (iqs / 8); // 8 values
const uint iqs = (idx & 0xFC) >> 2; // 0..63
const uint is = QUANT_K / 4 + ((idx & 0xE0) >> 3);// 8 values
const float d = float(bl.block.d);
const uint qs = bl.block.qs[iqs];
const uint signs = pack32(u8vec4(
bl.block.qs[is+0],
bl.block.qs[is+1],
bl.block.qs[is+2],
bl.block.qs[is+3]
const uint signs = pack32(u16vec2(
bl16.block.qs[is/2+0],
bl16.block.qs[is/2+1]
));
const float db = d * 0.5 * (0.5 + (signs >> 28));
const uint32_t sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7);
const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4));
const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign))));
const uint grid = iq3xxs_grid[qs] >> (16 * (idx & 1));
const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (idx & 0x6);
const ivec2 sign01 = ivec2(1 - (2 & ivec2(sign << 1, sign)));
const uint grid = iq3xxs_grid[qs] >> (16 * ((idx & 2) >> 1));
const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
return float16_t(v[lsb]);
return float16_t(v[idx & 1]);
}
#endif
@ -434,26 +433,45 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ3
float16_t dequantFuncIQ3_S(const in decodeBufIQ3_S bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
uint idx = coordInBlock[1];
uint lsb = idx & 1;
idx /= 2;
const uint iqs = (idx % 128) / 2; // 0..63
const uint iqh = iqs / 8;
const uint iqs = (idx & 0xFC) >> 2; // 0..63
const uint iqh = (idx & 0xE0) >> 5;
const float d = float(bl.block.d);
const uint qs = bl.block.qs[iqs];
const uint qh = bl.block.qh[iqh];
const int8_t sign = int8_t(bl.block.signs[iqs / 2] >> (2 * (idx % 4)));
const int8_t sign = int8_t(bl.block.signs[iqs / 2] >> (idx & 0x6));
const uint scale = bl.block.scales[iqs / 16];
const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(sign << 1, sign)));
const ivec2 sign01 = ivec2(1 - (2 & ivec2(sign << 1, sign)));
const float db = d * (1 + 2 * ((scale >> (4 * (iqh & 1))) & 0xf));
const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)] >> (16 * (idx % 2));
const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)] >> ((idx & 2) << 3);
const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
return float16_t(v[lsb]);
return float16_t(v[idx & 1]);
}
#endif
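
A recurring trick in these rewrites is the branch-free sign select 1 - (2 & ivec2(sign << 1, sign)), which turns bits 0 and 1 of sign into +1/-1 multipliers for the two grid values. The scalar equivalent for a single bit (a sketch):

// +1 when bit k of sign is clear, -1 when it is set -- no branch.
static int sign_select(unsigned sign, int k) {
    return 1 - 2 * (int)((sign >> k) & 1u);
}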
#if defined(DATA_A_IQ4_XS)
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_XS {
block_iq4_xs block;
};
float16_t dequantFuncIQ4_XS(const in decodeBufIQ4_XS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
const float16_t d = bl.block.d;
const uint idx = coordInBlock[1];
const uint ib32 = (idx & 0xE0) >> 5; // 0..7
const uint sl = (bl.block.scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
const uint sh = ((bl.block.scales_h) >> (2 * ib32)) & 3;
const uint qshift = (idx & 16) >> 2;
const uint q = (bl.block.qs[16 * ib32 + (idx % 16)] >> qshift) & 0xF;
float16_t ret = d * float16_t(int(sl | (sh << 4)) - 32) * float16_t(kvalues_iq4nl[q]);
return ret;
}
#endif
#if defined(DATA_A_IQ4_NL)
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_NL {
@ -504,6 +522,8 @@ float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoor
#define dequantFuncA dequantFuncIQ3_XXS
#elif defined(DATA_A_IQ3_S)
#define dequantFuncA dequantFuncIQ3_S
#elif defined(DATA_A_IQ4_XS)
#define dequantFuncA dequantFuncIQ4_XS
#elif defined(DATA_A_IQ4_NL)
#define dequantFuncA dequantFuncIQ4_NL
#endif


@ -0,0 +1,34 @@
#version 450
#include "dequant_head.comp"
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {block_iq4_xs data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
void main() {
// Each thread handles 1 subblock (1 scale and 32 quantized values)
const uint ib = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;
init_iq_shmem(gl_WorkGroupSize);
if (ib >= p.nel / 256) {
return;
}
const uint ib32 = gl_LocalInvocationID.x % 8;
const float d = float(data_a[ib].d);
// Scales are 6 bits
const uint scale = ((data_a[ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF)
| (((data_a[ib].scales_h >> (2 * ib32)) & 3) << 4);
const float dl = d * (int(scale) - 32);
const uint b_idx = 256 * ib + 32 * ib32;
const uint q_idx = 16 * ib32;
[[unroll]] for (uint l = 0; l < 16; ++l) {
data_b[b_idx + l + 0] = D_TYPE(dl * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]);
data_b[b_idx + l + 16] = D_TYPE(dl * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >> 4]);
}
}


@ -104,7 +104,7 @@ ACC_TYPE Max(const in uint32_t row, const in uint32_t col, const in ACC_TYPE ele
#endif
void main() {
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
init_iq_shmem(gl_WorkGroupSize);
#endif


@ -12,7 +12,7 @@ void main() {
const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
init_iq_shmem(gl_WorkGroupSize);
#endif


@ -133,7 +133,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
void main() {
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
init_iq_shmem(gl_WorkGroupSize);
#endif


@ -95,7 +95,7 @@ shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS];
#endif
void main() {
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
init_iq_shmem(gl_WorkGroupSize);
#endif
@ -547,6 +547,25 @@ void main() {
const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)] >> (16 * (idx % 2));
const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
#elif defined(DATA_A_IQ4_XS)
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
const uint ib = idx / 128; // 2 values per idx
const uint ib32 = (idx % 128) / 16; // 0..7
const uint iq = 16 * ib32 + 2 * (idx % 8);
const uint sl = (data_a[ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
const uint sh = ((data_a[ib].scales_h) >> (2 * ib32)) & 3;
const uint qshift = (idx & 8) >> 1;
u8vec2 qs = u8vec2(data_a[ib].qs[iq], data_a[ib].qs[iq + 1]);
qs = (qs >> qshift) & uint8_t(0xF);
const float d = float(data_a[ib].d);
const vec2 v = d * float(int(sl | (sh << 4)) - 32) * vec2(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y]);
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
#elif defined(DATA_A_IQ4_NL)


@ -106,7 +106,7 @@ D_TYPE perElemOpD(const in uint32_t r, const in uint32_t c, const in D_TYPE elem
#endif
void main() {
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
init_iq_shmem(gl_WorkGroupSize);
#endif


@ -1026,6 +1026,23 @@ void init_iq_shmem(uvec3 wgsize)
#define A_TYPE_PACKED16 block_iq3_s_packed16
#endif
#define QUANT_K_IQ4_XS 256
#define QUANT_R_IQ4_XS 1
struct block_iq4_xs
{
float16_t d;
uint16_t scales_h;
uint8_t scales_l[QUANT_K_IQ4_XS/64];
uint8_t qs[QUANT_K_IQ4_XS/2];
};
#if defined(DATA_A_IQ4_XS)
#define QUANT_K QUANT_K_IQ4_XS
#define QUANT_R QUANT_R_IQ4_XS
#define A_TYPE block_iq4_xs
#endif
#define QUANT_K_IQ4_NL 32
#define QUANT_R_IQ4_NL 2
@ -1042,7 +1059,13 @@ struct block_iq4_nl_packed16
};
#if defined(DATA_A_IQ4_NL)
#define QUANT_K QUANT_K_IQ4_NL
#define QUANT_R QUANT_R_IQ4_NL
#define A_TYPE block_iq4_nl
#define A_TYPE_PACKED16 block_iq4_nl_packed16
#endif
#if defined(DATA_A_IQ4_NL) || defined(DATA_A_IQ4_XS)
const int8_t kvalues_iq4nl_const[16] = {
int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
@ -1058,11 +1081,6 @@ void init_iq_shmem(uvec3 wgsize)
}
barrier();
}
#define QUANT_K QUANT_K_IQ4_NL
#define QUANT_R QUANT_R_IQ4_NL
#define A_TYPE block_iq4_nl
#define A_TYPE_PACKED16 block_iq4_nl_packed16
#endif
#endif // !defined(GGML_TYPES_COMP)


@ -60,6 +60,7 @@ const std::vector<std::string> type_names = {
"iq2_s",
"iq3_xxs",
"iq3_s",
"iq4_xs",
"iq4_nl"
};


@ -1114,11 +1114,12 @@ extern "C" {
};
struct llama_sampler {
struct llama_sampler_i * iface;
llama_sampler_context_t ctx;
const struct llama_sampler_i * iface;
llama_sampler_context_t ctx;
};
// mirror of llama_sampler_i:
LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token);
LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p);
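
With iface now const and a public constructor, custom samplers no longer need to instantiate struct llama_sampler directly. A hypothetical no-op sampler illustrating the new entry point (the callback-table fields are assumptions about the llama.cpp sampler interface, not taken from this diff):

#include "llama.h"

static const char * my_sampler_name(const struct llama_sampler * /*smpl*/) {
    return "no-op";
}

static void my_sampler_apply(struct llama_sampler * /*smpl*/,
                             llama_token_data_array * /*cur_p*/) {
    // leave the candidate list untouched
}

static struct llama_sampler_i my_sampler_i = {
    /* .name   = */ my_sampler_name,
    /* .accept = */ nullptr,
    /* .apply  = */ my_sampler_apply,
    /* .reset  = */ nullptr,
    /* .clone  = */ nullptr,
    /* .free   = */ nullptr,
};

struct llama_sampler * smpl = llama_sampler_init(&my_sampler_i, /*ctx=*/nullptr);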


@ -1 +1 @@
694244a6e40dc255f6bb4376fb17431c06633e6c
08b538031f7f944e84f472483ef5d26bf5190ead


@ -1275,6 +1275,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const bool use_mmap_buffer = true;
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
// build a list of buffer types for the CPU and GPU devices
pimpl->cpu_buft_list = make_cpu_buft_list(devices);
for (auto * dev : devices) {


@ -316,6 +316,13 @@ static uint32_t get_rng_seed(uint32_t seed) {
// llama_sampler API
struct llama_sampler * llama_sampler_init(const struct llama_sampler_i * iface, llama_sampler_context_t ctx) {
return new llama_sampler {
/* .iface = */ iface,
/* .ctx = */ ctx,
};
}
const char * llama_sampler_name(const struct llama_sampler * smpl) {
if (!smpl->iface) {
return "(null)";
@ -347,10 +354,10 @@ struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) {
}
if (smpl->ctx == nullptr) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ smpl->iface,
/* .ctx = */ nullptr,
};
/* .ctx = */ nullptr
);
}
GGML_ABORT("the sampler does not support cloning");
@ -472,15 +479,15 @@ static struct llama_sampler_i llama_sampler_chain_i = {
};
struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_chain_i,
/* .ctx = */ new llama_sampler_chain {
/* .params = */ params,
/* .samplers = */ {},
/* .t_sample_us = */ 0,
/* .n_sample = */ 0,
},
};
}
);
}
void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) {
@ -546,10 +553,10 @@ static struct llama_sampler_i llama_sampler_greedy_i = {
};
struct llama_sampler * llama_sampler_init_greedy() {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_greedy_i,
/* .ctx = */ nullptr,
};
/* .ctx = */ nullptr
);
}
// dist
@ -608,14 +615,14 @@ static struct llama_sampler_i llama_sampler_dist_i = {
struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_dist_i,
/* .ctx = */ new llama_sampler_dist {
/* .seed = */ seed,
/* .seed_cur = */ seed_cur,
/* .rng = */ std::mt19937(seed_cur),
},
};
}
);
}
// softmax
@ -638,10 +645,10 @@ static struct llama_sampler_i llama_sampler_softmax_i = {
};
struct llama_sampler * llama_sampler_init_softmax() {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_softmax_i,
/* .ctx = */ nullptr,
};
/* .ctx = */ nullptr
);
}
// top-k
@ -678,12 +685,12 @@ static struct llama_sampler_i llama_sampler_top_k_i = {
};
struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_top_k_i,
/* .ctx = */ new llama_sampler_top_k {
/* .k = */ k,
},
};
}
);
}
// top-p
@ -744,13 +751,13 @@ static struct llama_sampler_i llama_sampler_top_p_i = {
};
struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_top_p_i,
/* .ctx = */ new llama_sampler_top_p {
/* .p = */ p,
/* .min_keep = */ min_keep,
},
};
}
);
}
// min-p
@ -840,13 +847,13 @@ static struct llama_sampler_i llama_sampler_min_p_i = {
};
struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_min_p_i,
/* .ctx = */ new llama_sampler_min_p {
/* .p = */ p,
/* .min_keep = */ min_keep,
},
};
}
);
}
// typical
@ -939,13 +946,13 @@ static struct llama_sampler_i llama_sampler_typical_i = {
};
struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_typical_i,
/* .ctx = */ new llama_sampler_typical {
/* .p = */ p,
/* .min_keep = */ min_keep,
},
};
}
);
}
// temp
@ -983,12 +990,12 @@ static struct llama_sampler_i llama_sampler_temp_i = {
};
struct llama_sampler * llama_sampler_init_temp(float temp) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_temp_i,
/* .ctx = */ new llama_sampler_temp {
/*.temp = */ temp,
},
};
}
);
}
// temp-ext
@ -1093,14 +1100,14 @@ static struct llama_sampler_i llama_sampler_temp_ext_i = {
};
struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_temp_ext_i,
/* .ctx = */ new llama_sampler_temp_ext {
/* .temp = */ temp,
/* .delta = */ delta,
/* .exponent = */ exponent,
},
};
}
);
}
// xtc
@ -1185,7 +1192,7 @@ static struct llama_sampler_i llama_sampler_xtc_i = {
struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_xtc_i,
/* .ctx = */ new llama_sampler_xtc {
/* .probability = */ p,
@ -1194,8 +1201,8 @@ struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep,
/* .seed = */ seed,
/* .seed_cur = */ seed_cur,
/* .rng = */ std::mt19937(seed_cur),
},
};
}
);
}
// mirostat
@ -1292,7 +1299,7 @@ static struct llama_sampler_i llama_sampler_mirostat_i = {
struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_mirostat_i,
/* .ctx = */ new llama_sampler_mirostat {
/* .n_vocab = */ n_vocab,
@ -1303,8 +1310,8 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see
/* .m = */ m,
/* .mu = */ 2.0f*tau,
/* .rng = */ std::mt19937(seed_cur),
},
};
}
);
}
// mirostat v2
@ -1391,7 +1398,7 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = {
struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_mirostat_v2_i,
/* .ctx = */ new llama_sampler_mirostat_v2 {
/* .seed = */ seed,
@ -1400,8 +1407,8 @@ struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau,
/* .eta = */ eta,
/* .mu = */ 2.0f*tau,
/* .rng = */ std::mt19937(seed_cur),
},
};
}
);
}
// grammar
@ -1528,10 +1535,10 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
};
}
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_grammar_i,
/* .ctx = */ ctx,
};
/* .ctx = */ ctx
);
}
struct llama_sampler * llama_sampler_init_grammar(
@ -1678,7 +1685,7 @@ struct llama_sampler * llama_sampler_init_penalties(
float penalty_present) {
penalty_last_n = std::max(penalty_last_n, 0);
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_penalties_i,
/* .ctx = */ new llama_sampler_penalties {
/* .penalty_last_n = */ penalty_last_n,
@ -1687,8 +1694,8 @@ struct llama_sampler * llama_sampler_init_penalties(
/* .penalty_present = */ penalty_present,
/* .prev = */ ring_buffer<llama_token>(penalty_last_n),
/* .token_count = */ {},
},
};
}
);
}
// DRY
@ -2041,7 +2048,7 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
}
}
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_dry_i,
/* .ctx = */ new llama_sampler_dry {
/* .total_context_size = */ context_size,
@ -2053,8 +2060,8 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
/* .dry_repeat_count = */ dry_enabled ? std::vector<int>(effective_dry_penalty_last_n, 0) : std::vector<int>{},
/* .dry_max_token_repeat = */ {},
/* .last_tokens = */ dry_enabled ? ring_buffer<llama_token>(effective_dry_penalty_last_n) : ring_buffer<llama_token>(0),
},
};
}
);
}
// wrapper for test-sampling.cpp
@ -2155,14 +2162,14 @@ struct llama_sampler * llama_sampler_init_logit_bias(
int32_t n_vocab,
int32_t n_logit_bias,
const llama_logit_bias * logit_bias) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_logit_bias_i,
/* .ctx = */ new llama_sampler_logit_bias {
/* .n_vocab = */ n_vocab,
/* .logit_bias = */ std::vector<llama_logit_bias>(logit_bias, logit_bias + n_logit_bias),
/* .to_search = */ {},
},
};
}
);
}
// infill
@ -2377,14 +2384,14 @@ static struct llama_sampler_i llama_sampler_infill_i = {
};
struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_infill_i,
/* .ctx = */ new llama_sampler_infill {
/* .vocab = */ vocab,
/* .buf0 = */ std::vector<char>(512),
/* .buf1 = */ std::vector<char>(512),
},
};
}
);
}
// utils


@ -7217,7 +7217,7 @@ struct llm_build_context {
struct ggml_tensor * Qcur = nullptr;
struct ggml_tensor * Kcur = nullptr;
struct ggml_tensor * Vcur = nullptr;
if (model.type == LLM_TYPE_1_5B || model.type == LLM_TYPE_4B || model.type == LLM_TYPE_9B) {
if (model.layers[il].wqkv == nullptr) {
Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
@ -8801,12 +8801,14 @@ static int llama_decode_impl(
//llama_synchronize(&lctx);
// decide if we need to defrag the kv cache
if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
if (cparams.causal_attn && cparams.defrag_thold > 0.0f) {
// - do not defrag small contexts (i.e. < 2048 tokens)
// - count the padding towards the number of used tokens
const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + llama_kv_cache_get_padding(cparams))/float(kv_self.n)) : 0.0f;
// queue defragmentation for next llama_kv_cache_update
if (fragmentation > cparams.defrag_thold) {
//LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
llama_kv_cache_defrag(kv_self);
}
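
A quick worked example of the new condition, with assumed numbers:

#include <algorithm>

// n = 4096 cells (>= 2048, so eligible), used = 3000, padding = 32
// (padding value assumed; it comes from llama_kv_cache_get_padding).
const float frag = std::max(0.0f, 1.0f - float(3000 + 32) / float(4096));
// frag ~= 0.26 -> a defrag is queued for any defrag_thold below ~0.26,
// while contexts with n < 2048 now never trigger one.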
@ -9428,8 +9430,6 @@ static struct llama_model * llama_model_load_from_file_impl(
struct llama_model_params params) {
ggml_time_init();
llama_model * model = new llama_model(params);
unsigned cur_percentage = 0;
if (params.progress_callback == NULL) {
params.progress_callback_user_data = &cur_percentage;
@ -9447,6 +9447,8 @@ static struct llama_model * llama_model_load_from_file_impl(
};
}
llama_model * model = new llama_model(params);
// create list of devices to use with this model
if (params.devices) {
for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {


@ -618,7 +618,14 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
result.reserve(utf8.size());
size_t offset = 0;
while (offset < utf8.size()) {
result.push_back(unicode_cpt_from_utf8(utf8, offset));
try {
result.push_back(unicode_cpt_from_utf8(utf8, offset));
}
catch (const std::invalid_argument & /*ex*/) {
// Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
++offset;
result.emplace_back(0xFFFD); // replacement character
}
}
return result;
}
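
The observable effect, sketched with an assumed invalid input: decoding no longer throws across the API boundary; each undecodable byte becomes U+FFFD and scanning resumes at the next byte.

#include <cstdint>
#include <string>
#include <vector>

// Declared in unicode.h; behavior sketched for an assumed invalid input.
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);

void example() {
    // 0xFF can never start a valid UTF-8 sequence.
    const std::vector<uint32_t> cpts = unicode_cpts_from_utf8("a\xFF" "z");
    // cpts == { 0x61 /*a*/, 0xFFFD /*replacement*/, 0x7A /*z*/ }
}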