Merge branch 'master' into compilade/imatrix-batched-chunks

2025-02-09 12:06:24 -05:00 · 2025-02-09 12:06:24 -05:00 · 1be357d990
commit 1be357d990
parent db502ddd0e 19d3c8293b
149 changed files with 14407 additions and 5105 deletions
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@ -31,6 +31,11 @@ defer {
    llama_model_free(model)
 }

+guard let vocab = llama_model_get_vocab(model) else {
+    print("Failed to get vocab")
+    exit(1)
+}
+
 var tokens = tokenize(text: prompt, add_bos: true)

 let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
@ -41,7 +46,7 @@ context_params.n_batch = UInt32(max(n_len, n_parallel))
 context_params.n_threads = 8
 context_params.n_threads_batch = 8

-let context = llama_new_context_with_model(model, context_params)
+let context = llama_init_from_model(model, context_params)
 guard context != nil else {
    print("Failed to initialize context")
    exit(1)
@ -141,7 +146,7 @@ while n_cur <= n_len {
        let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])

        // is it an end of stream? -> mark the stream as finished
-        if llama_vocab_is_eog(model, new_token_id) || n_cur == n_len {
+        if llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len {
            i_batch[i] = -1
            // print("")
            if n_parallel > 1 {
@ -207,7 +212,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
    let utf8Count = text.utf8.count
    let n_tokens = utf8Count + (add_bos ? 1 : 0)
    let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-    let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
+    let tokenCount = llama_tokenize(vocab, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
    var swiftTokens: [llama_token] = []
    for i in 0 ..< tokenCount {
        swiftTokens.append(tokens[Int(i)])
@ -218,12 +223,12 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {

 private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
    var result = [CChar](repeating: 0, count: 8)
-    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), 0, false)
+    let nTokens = llama_token_to_piece(vocab, token, &result, Int32(result.count), 0, false)
    if nTokens < 0 {
        let actualTokensCount = -Int(nTokens)
        result = .init(repeating: 0, count: actualTokensCount)
        let check = llama_token_to_piece(
-            model,
+            vocab,
            token,
            &result,
            Int32(result.count),
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@ -24,6 +24,7 @@ func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama
 actor LlamaContext {
    private var model: OpaquePointer
    private var context: OpaquePointer
+    private var vocab: OpaquePointer
    private var sampling: UnsafeMutablePointer<llama_sampler>
    private var batch: llama_batch
    private var tokens_list: [llama_token]
@ -47,6 +48,7 @@ actor LlamaContext {
        self.sampling = llama_sampler_chain_init(sparams)
        llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4))
        llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234))
+        vocab = llama_model_get_vocab(model)
    }

    deinit {
@ -79,7 +81,7 @@ actor LlamaContext {
        ctx_params.n_threads       = Int32(n_threads)
        ctx_params.n_threads_batch = Int32(n_threads)

-        let context = llama_new_context_with_model(model, ctx_params)
+        let context = llama_init_from_model(model, ctx_params)
        guard let context else {
            print("Could not load context!")
            throw LlamaError.couldNotInitializeContext
@ -151,7 +153,7 @@ actor LlamaContext {

        new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)

-        if llama_vocab_is_eog(model, new_token_id) || n_cur == n_len {
+        if llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len {
            print("\n")
            is_done = true
            let new_token_str = String(cString: temporary_invalid_cchars + [0])
@ -297,7 +299,7 @@ actor LlamaContext {
        let utf8Count = text.utf8.count
        let n_tokens = utf8Count + (add_bos ? 1 : 0) + 1
        let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-        let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
+        let tokenCount = llama_tokenize(vocab, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)

        var swiftTokens: [llama_token] = []
        for i in 0..<tokenCount {
@ -316,7 +318,7 @@ actor LlamaContext {
        defer {
            result.deallocate()
        }
-        let nTokens = llama_token_to_piece(model, token, result, 8, 0, false)
+        let nTokens = llama_token_to_piece(vocab, token, result, 8, 0, false)

        if nTokens < 0 {
            let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
@ -324,7 +326,7 @@ actor LlamaContext {
            defer {
                newResult.deallocate()
            }
-            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, 0, false)
+            let nNewTokens = llama_token_to_piece(vocab, token, newResult, -nTokens, 0, false)
            let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
            return Array(bufferPointer)
        } else {
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@ -50,3 +50,10 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+set(TARGET llama-llava-clip-quantize-cli)
+add_executable(${TARGET} clip-quantize-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-clip-quantize-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/llava/README-glmedge.md
+++ b/examples/llava/README-glmedge.md
@ -0,0 +1,43 @@
+# GLMV-EDGE
+
+Currently this implementation supports [glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b) and [glm-edge-v-5b](https://huggingface.co/THUDM/glm-edge-v-5b).
+
+## Usage
+Build with cmake or run `make llama-llava-cli` to build it.
+
+After building, run: `./llama-llava-cli` to see the usage. For example:
+
+```sh
+./llama-llava-cli -m model_path/ggml-model-f16.gguf --mmproj model_path/mmproj-model-f16.gguf --image img_path/image.jpg -p "<|system|>\n system prompt <image><|user|>\n prompt <|assistant|>\n"
+```
+
+**Note**: A lower temperature such as 0.1 is recommended for better quality; add `--temp 0.1` to the command to use it.
+
+**Note**: For GPU offloading, pass the `-ngl` flag as usual.
+
+## GGUF conversion
+
+1. Clone a GLMV-EDGE model ([2B](https://huggingface.co/THUDM/glm-edge-v-2b) or [5B](https://huggingface.co/THUDM/glm-edge-v-5b)). For example:
+
+```sh
+git clone https://huggingface.co/THUDM/glm-edge-v-5b
+# or: git clone https://huggingface.co/THUDM/glm-edge-v-2b
+```
+
+2. Use `glmedge-surgery.py` to split the GLMV-EDGE model into its LLM and multimodal projector components:
+
+```sh
+python ./examples/llava/glmedge-surgery.py -m ../model_path
+```
+
+3. Use `glmedge-convert-image-encoder-to-gguf.py` to convert the GLMV-EDGE image encoder to GGUF:
+
+```sh
+python ./examples/llava/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
+```
+
+4. Use `convert_hf_to_gguf.py` to convert the LLM part of GLMV-EDGE to GGUF:
+
+```sh
+python convert_hf_to_gguf.py ../model_path
+```
+
+Now both the LLM part and the image encoder are in the `model_path` directory.
--- a/examples/llava/README-quantize.md
+++ b/examples/llava/README-quantize.md
@ -0,0 +1,44 @@
+# Quantizing CLIP Visual Projector
+
+This is the tool for quantizing the CLIP visual projector model. Quantization reduces the precision of the model's weights, which can significantly decrease the model size and improve inference speed, often with minimal impact on performance.
+
+## Usage
+
+To quantize a CLIP visual projector model, use the following command:
+
+```sh
+./bin/llama-llava-clip-quantize-cli /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf <type>
+```
+
+After the quantization, the visual projector can be used freely with the existing LLAVA cli (LLAVA, Qwen2VL, etc).
+
+### Arguments
+
+- `/path/to/ggml-model-f32.gguf`: The path to the input model file in FP32 or FP16 format.
+- `/path/to/ggml-model-quantized.gguf`: The path where the quantized model will be saved.
+- `<type>`: The quantization type to apply. This should be an integer corresponding to one of the quantization types defined in the `enum ggml_type`.
+
+### Quantization Types
+
+The following quantization types are supported, based on the `enum ggml_type` definition:
+
+- `2` - `q4_0`: 4-bit quantization with one scale per block.
+- `3` - `q4_1`: 4-bit quantization with one scale and one minimum (offset) per block.
+- `6` - `q5_0`: 5-bit quantization with one scale per block.
+- `7` - `q5_1`: 5-bit quantization with one scale and one minimum (offset) per block.
+- `8` - `q8_0`: 8-bit quantization with one scale per block.
+
+### Example
+
+To quantize a model using the `q4_0` quantization type, you would run:
+
+```sh
+./bin/llama-llava-clip-quantize-cli /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf 2
+```
+
+This command will generate a quantized model at `/path/to/ggml-model-quantized.gguf` using the `q4_0` quantization method.
+
+## Notes
+
+- Quantization can lead to a loss in model accuracy, depending on the chosen quantization type. It is recommended to evaluate the quantized model's performance on your specific task to ensure it meets your requirements.
+- The quantized model will typically be smaller in size and faster to run, making it more suitable for deployment in resource-constrained environments.
--- a/examples/llava/clip-quantize-cli.cpp
+++ b/examples/llava/clip-quantize-cli.cpp
@ -0,0 +1,59 @@
+#include "arg.h"
+#include "base64.hpp"
+#include "log.h"
+#include "common.h"
+#include "sampling.h"
+#include "clip.h"
+#include "llava.h"
+#include "llama.h"
+#include "ggml.h"
+
+// Writes the command-line synopsis followed by the accepted quantization
+// type ids to stderr. argc is unused; argv[0] is shown as the program name.
+static void print_usage(int argc, char ** argv) {
+    (void) argc;
+
+    // one pre-formatted line per supported type id
+    static const char * const type_lines[] = {
+        "  type = 2 - q4_0\n",
+        "  type = 3 - q4_1\n",
+        "  type = 6 - q5_0\n",
+        "  type = 7 - q5_1\n",
+        "  type = 8 - q8_0\n",
+    };
+
+    fprintf(stderr, "usage: %s /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf type\n", argv[0]);
+    for (const char * line : type_lines) {
+        fputs(line, stderr);
+    }
+}
+
+// Entry point: quantizes a CLIP visual projector GGUF file.
+//
+// Expects exactly three arguments: the input model path, the output model path
+// and the integer id of the target ggml_type. Returns 0 on success, 1 on a
+// usage error or when clip_model_quantize() reports failure.
+int main(int argc, char ** argv) {
+    if (argc != 4) {
+        print_usage(argc, argv);
+        return 1;
+    }
+
+    const std::string fname_inp = argv[1];
+    const std::string fname_out = argv[2];
+
+    // Parse the type id strictly: atoi() would silently turn non-numeric input
+    // such as "q4_0" into 0, and would let negative or out-of-range ids through.
+    char * type_end = nullptr;
+    const long type_arg = strtol(argv[3], &type_end, 10);
+    if (type_end == argv[3] || *type_end != '\0' || type_arg < 0 || type_arg >= GGML_TYPE_COUNT) {
+        fprintf(stderr, "%s: invalid quantization type '%s'\n", __func__, argv[3]);
+        print_usage(argc, argv);
+        return 1;
+    }
+    const int itype = (int) type_arg;
+
+    const int64_t t_main_start_us = ggml_time_us();
+
+    int64_t t_quantize_us = 0;
+
+    // quantize the model (clip_model_quantize reads fname_inp and writes fname_out)
+    {
+        const int64_t t_start_us = ggml_time_us();
+
+        if (!clip_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
+            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
+            return 1;
+        }
+
+        t_quantize_us = ggml_time_us() - t_start_us;
+    }
+
+    // report timing (microseconds converted to milliseconds)
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+
+        printf("\n");
+        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us / 1000.0);
+        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0);
+    }
+
+    return 0;
+}
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@ -102,6 +102,7 @@ static std::string format(const char * fmt, ...) {
 #define KEY_HAS_VIS_ENC         "clip.has_vision_encoder"
 #define KEY_HAS_LLAVA_PROJ      "clip.has_llava_projector"
 #define KEY_HAS_MINICPMV_PROJ   "clip.has_minicpmv_projector"
+#define KEY_HAS_GLM_PROJ        "clip.has_glm_projector"
 #define KEY_MINICPMV_VERSION    "clip.minicpmv_version"
 #define KEY_HAS_QWEN2VL_MERGER  "clip.has_qwen2vl_merger"
 #define KEY_USE_GELU            "clip.use_gelu"
@ -160,6 +161,15 @@ static std::string format(const char * fmt, ...) {
 #define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
 #define TN_MINICPMV_LN "resampler.ln_%s.%s"

+#define TN_GLM_ADAPER_CONV "adapter.conv.%s"
+#define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s"
+#define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s"
+#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
+#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
+#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
+#define TN_GLM_BOI_W "adapter.boi"
+#define TN_GLM_EOI_W "adapter.eoi"
+

 enum projector_type {
    PROJECTOR_TYPE_MLP,
@ -167,6 +177,7 @@ enum projector_type {
    PROJECTOR_TYPE_LDP,
    PROJECTOR_TYPE_LDPV2,
    PROJECTOR_TYPE_RESAMPLER,
+    PROJECTOR_TYPE_GLM_EDGE,
    PROJECTOR_TYPE_MERGER,
    PROJECTOR_TYPE_UNKNOWN,
 };
@ -176,6 +187,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_LDP, "ldp" },
    { PROJECTOR_TYPE_LDPV2, "ldpv2"},
    { PROJECTOR_TYPE_RESAMPLER, "resampler"},
+    { PROJECTOR_TYPE_GLM_EDGE, "adapter"},
    { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
 };

@ -500,6 +512,12 @@ struct clip_vision_model {
    struct ggml_tensor * mm_4_w = NULL;
    struct ggml_tensor * mm_4_b = NULL;

+    //GLMV-Edge projection
+    struct ggml_tensor * mm_model_adapter_conv_w;
+    struct ggml_tensor * mm_model_adapter_conv_b;
+    struct ggml_tensor * boi_w;
+    struct ggml_tensor * eoi_w;
+
    // MobileVLM projection
    struct ggml_tensor * mm_model_mlp_1_w;
    struct ggml_tensor * mm_model_mlp_1_b;
@ -560,6 +578,7 @@ struct clip_ctx {
    bool has_vision_encoder  = false;
    bool has_llava_projector = false;
    bool has_minicpmv_projector = false;
+    bool has_glm_projector = false;
    bool has_qwen2vl_merger = false;
    int minicpmv_version = 2;

@ -638,7 +657,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32

    const int batch_size = imgs->size;

-    if (ctx->has_llava_projector || ctx->has_minicpmv_projector) {
+    if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) {
        GGML_ASSERT(batch_size == 1);
    }

@ -734,8 +753,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    }

    // loop over layers
-    if (ctx->has_minicpmv_projector || ctx->has_qwen2vl_merger) {
-        // TODO: figure out why we doing thing in this way ???
+    if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
        n_layer += 1;
    }
    for (int il = 0; il < n_layer - 1; il++) {
@ -1095,7 +1113,33 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            GGML_ASSERT(false);
        }
    }
-    else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+    // glm projector
+    else if (ctx->has_glm_projector) {
+        if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+            size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
+            embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
+            embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
+            embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
+            embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
+            embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
+            embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
+            //GLU
+            {
+                embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+                embeddings = ggml_norm(ctx0, embeddings, eps);
+                embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
+                embeddings = ggml_gelu_inplace(ctx0, embeddings);
+                struct ggml_tensor * x = embeddings;
+                embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
+                x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
+                embeddings = ggml_silu_inplace(ctx0, embeddings);
+                embeddings = ggml_mul(ctx0, embeddings,x);
+                embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
+            }
+        } else {
+            GGML_ABORT("fatel error");
+        }
+    } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
        embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);

        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
@ -1284,6 +1328,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
        }

+        idx = gguf_find_key(ctx, KEY_HAS_GLM_PROJ);
+        if (idx != -1) {
+            new_clip->has_glm_projector = gguf_get_val_bool(ctx, idx);
+        }
+
        idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER);
        if (idx != -1) {
            new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx);
@ -1308,6 +1357,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
            LOG_INF("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
            LOG_INF("%s: minicpmv_projector:  %d\n", __func__, new_clip->has_minicpmv_projector);
+            LOG_INF("%s: glm_projector:  %d\n", __func__, new_clip->has_glm_projector);
            LOG_INF("%s: model size:     %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
            LOG_INF("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
        }
@ -1575,6 +1625,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
            vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
        }
+        else if (new_clip->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+            vision_model.mm_model_adapter_conv_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "weight"));
+            vision_model.mm_model_adapter_conv_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "bias"));
+            vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_LINEAR,"weight"));
+            vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"weight"));
+            vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"bias"));
+            vision_model.mm_model_mlp_1_w =  get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
+            vision_model.mm_model_mlp_2_w =  get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_GATE,"weight"));
+            vision_model.mm_model_mlp_3_w =  get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
+            vision_model.boi_w = get_tensor(new_clip->ctx_data, TN_GLM_BOI_W);
+            vision_model.eoi_w = get_tensor(new_clip->ctx_data, TN_GLM_EOI_W);
+        }
        else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) {
            vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
            vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
@ -2115,6 +2177,20 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
        return true;
    }

+    if (ctx->has_glm_projector) {
+        res_imgs->size = 1;
+        res_imgs->data = new clip_image_f32[res_imgs->size];
+        clip_image_u8 resized_image;
+        int32_t sz=ctx->vision_model.hparams.image_size;
+        bicubic_resize(*img, resized_image,sz,sz);
+        clip_image_f32 * res = clip_image_f32_init();
+        //clip_image_save_to_bmp(resized_image, "resized.bmp");
+        normalize_image_u8_to_f32(&resized_image, res, ctx->image_mean, ctx->image_std);
+        res_imgs->data[0] = *res;
+        clip_image_f32_free(res);
+        return true;
+    }
+
    bool pad_to_square = true;
    if (!ctx->has_vision_encoder) {
        LOG_ERR("This gguf file seems to have no vision encoder\n");
@ -2300,7 +2376,8 @@ void clip_free(clip_ctx * ctx) {
 }

 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
-    return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    int extra_tokens = ctx->has_glm_projector ? 2 : 0;
+    return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }

 size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
@ -2342,7 +2419,7 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i

    int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);

-    if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
+    if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
        n_patches /= 4;
    } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
        if (ctx->minicpmv_version == 2) {
@ -2475,6 +2552,12 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    if (ctx->has_minicpmv_projector) {
        GGML_ASSERT(batch_size == 1);
    }
+    if (ctx->has_glm_projector) {
+        GGML_ASSERT(batch_size == 1);
+        ggml_tensor * boi = ctx->vision_model.boi_w;
+        ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi));
+        vec = (float*)(vec+ggml_nelements(boi)); //offset for boi
+    }

    // build the inference graph
    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
@ -2627,7 +2710,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
            free(positions_data);

-            {
+            if (!ctx->has_glm_projector) {
                struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
                int* patches_data = (int*)malloc(ggml_nbytes(patches));
                for (int i = 0; i < num_patches; i++) {
@ -2651,14 +2734,19 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    // copy the embeddings to the location passed by the user
    ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));

+    if (ctx->has_glm_projector) {
+        //eoi
+        ggml_tensor * eoi = ctx->vision_model.eoi_w;
+        int offset = ggml_nelements(embeddings);
+        ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi));
+    }
+
    return true;
 }

 bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
-    ggml_type type = GGML_TYPE_Q4_1;
-
    assert(itype < GGML_TYPE_COUNT);
-    type = static_cast<ggml_type>(itype);
+    ggml_type type = static_cast<ggml_type>(itype);

    auto * ctx_clip = clip_model_load(fname_inp, 2);

@ -2711,8 +2799,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
            }
        }

-        // quantize only 2D tensors
-        quantize &= (ggml_n_dims(cur) == 2);
+        // quantize only 2D tensors and bigger than block size
+        quantize &= (ggml_n_dims(cur) == 2) && cur->ne[0] > ggml_blck_size(type);

        if (quantize) {
            new_type = type;
@ -2812,6 +2900,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
            return 3584;
        }
    }
+    if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE){
+        return ctx->vision_model.mm_model_mlp_3_w->ne[1];
+    }
    if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
        return ctx->vision_model.mm_1_b->ne[0];
    }
@ -2827,6 +2918,9 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
    return 0;
 }

+bool clip_is_glm(const struct clip_ctx * ctx) { // reports whether ctx has its GLM projector flag set
+    return ctx->has_glm_projector;
+}
 bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
    return ctx->has_qwen2vl_merger;
 }
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@ -93,6 +93,8 @@ CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);

 CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

+CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
+
 #ifdef __cplusplus
 }
 #endif
--- a/examples/llava/glmedge-convert-image-encoder-to-gguf.py
+++ b/examples/llava/glmedge-convert-image-encoder-to-gguf.py
@ -0,0 +1,280 @@
+import argparse
+import os
+import json
+import re
+
+import torch
+import numpy as np
+from gguf import *
+
+TEXT = "clip.text"
+VISION = "clip.vision"
+from transformers import SiglipVisionModel, SiglipVisionConfig
+
+def k(raw_key: str, arch: str) -> str:
+    return raw_key.format(arch=arch)
+
+
+def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
+    if name in (
+        "logit_scale",
+        "text_model.embeddings.position_ids",
+        "vision_model.embeddings.position_ids",
+    ):
+        return True
+
+    if name in (
+        "vision_model.head.probe",
+        "vision_model.head.attention.in_proj_weight",
+        "vision_model.head.attention.in_proj_bias",
+        "vision_model.head.attention.out_proj.weight",
+        "vision_model.head.attention.out_proj.bias",
+        "vision_model.head.layernorm.weight",
+        "vision_model.head.layernorm.bias",
+        "vision_model.head.mlp.fc1.weight",
+        "vision_model.head.mlp.fc1.bias",
+        "vision_model.head.mlp.fc2.weight",
+        "vision_model.head.mlp.fc2.bias"
+    ):
+        return True
+
+    if name.startswith("v") and not has_vision:
+        return True
+
+    if name.startswith("t") and not has_text:
+        return True
+
+    return False
+
+
+def get_tensor_name(name: str) -> str:
+    if "projection" in name:
+        return name
+    if "mm_projector" in name:
+        name = name.replace("model.mm_projector", "mm")
+        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
+        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
+        return name
+
+    return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
+
+
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = (
+        list(range(ord("!"), ord("~") + 1))
+        + list(range(ord("¡"), ord("¬") + 1))
+        + list(range(ord("®"), ord("ÿ") + 1))
+    )
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+ap = argparse.ArgumentParser()
+ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
+ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
+ap.add_argument("--text-only", action="store_true", required=False,
+                help="Save a text-only model. It can't be used to encode images")
+ap.add_argument("--vision-only", action="store_true", required=False,
+                help="Save a vision-only model. It can't be used to encode texts")
+ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
+                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
+ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
+                help="The clip model is from openclip (for ViT-SO400M type))")
+ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
+ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2","adapter"], default="adapter")
+ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
+# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
+# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
+default_image_mean = [0.5, 0.5, 0.5]
+default_image_std = [0.5, 0.5, 0.5]
+ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
+ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
+
+# with proper
+args = ap.parse_args()
+
+
+if args.text_only and args.vision_only:
+    print("--text-only and --image-only arguments cannot be specified at the same time.")
+    exit(1)
+
+if args.use_f32:
+    print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
+
+# output in the same directory as the model if output_dir is None
+dir_model = args.model_dir
+
+if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
+    vocab = None
+    tokens = None
+else:
+    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
+        vocab = json.load(f)
+        tokens = [key for key in vocab]
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    config = json.load(f)
+    if args.clip_model_is_vision:
+        v_hparams = config
+        t_hparams = None
+    else:
+        v_hparams = config["vision_config"]
+        t_hparams = None
+
+# possible data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+#
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if args.use_f32:
+    ftype = 0
+
+vision_config = SiglipVisionConfig(**v_hparams)
+model = SiglipVisionModel(vision_config)
+model.load_state_dict(torch.load(os.path.join(dir_model, "glm.clip")))
+
+fname_middle = None
+has_text_encoder = False
+has_vision_encoder = True
+has_glm_projector = True
+if args.text_only:
+    fname_middle = "text-"
+    has_vision_encoder = False
+elif args.llava_projector is not None:
+    fname_middle = "mmproj-"
+    has_text_encoder = False
+    has_glm_projector = True
+elif args.vision_only:
+    fname_middle = "vision-"
+    has_text_encoder = False
+else:
+    fname_middle = ""
+
+output_dir = args.output_dir if args.output_dir is not None else dir_model
+os.makedirs(output_dir, exist_ok=True)
+output_prefix = os.path.basename(output_dir).replace("ggml_", "")
+fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
+fout = GGUFWriter(path=fname_out, arch="clip")
+
+fout.add_bool("clip.has_text_encoder", has_text_encoder)
+fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
+fout.add_bool("clip.has_glm_projector", has_glm_projector)
+fout.add_file_type(ftype)
+model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model)
+fout.add_name(model_name)
+if has_glm_projector:
+    fout.add_description("image encoder for glm4v")
+    fout.add_string("clip.projector_type", "adapter")
+else:
+    fout.add_description("two-tower CLIP model")
+
+if has_text_encoder:
+    assert t_hparams is not None
+    assert tokens is not None
+    # text_model hparams
+    fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
+    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
+    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
+    fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"]))
+    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
+    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
+    fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
+    fout.add_token_list(tokens)
+
+if has_vision_encoder:
+    # vision_model hparams
+    fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
+    fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
+    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
+    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
+    fout.add_uint32("clip.vision.projection_dim", 0)
+    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
+    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
+    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), v_hparams["num_hidden_layers"])
+
+    image_mean = args.image_mean if args.image_mean is not None else default_image_mean
+    image_std = args.image_std if args.image_std is not None else default_image_std
+    fout.add_array("clip.vision.image_mean", image_mean)
+    fout.add_array("clip.vision.image_std", image_std)
+
+fout.add_bool("clip.use_gelu", True)
+
+
+if has_glm_projector:
+    # model.vision_model.encoder.layers.pop(-1)  # pyright: ignore[reportAttributeAccessIssue]
+    projector = torch.load(args.llava_projector)
+    for name, data in projector.items():
+        name = get_tensor_name(name)
+        # pw and dw conv ndim==4
+        if data.ndim == 2 or data.ndim == 4:
+            data = data.squeeze().numpy().astype(np.float16)
+        else:
+            data = data.squeeze().numpy().astype(np.float32)
+        if name.startswith("vision."):
+            name=name.replace("vision.","")
+        fout.add_tensor(name, data)
+        print(f"Projector {name} - {data.dtype} - shape = {data.shape}")
+        # print(f"Projector {name} tensors added\n")
+
+state_dict = model.state_dict()  # pyright: ignore[reportAttributeAccessIssue]
+for name, data in state_dict.items():
+    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_glm_projector):
+        # we don't need this
+        print(f"skipping parameter: {name}")
+        continue
+
+    name = get_tensor_name(name)
+    data = data.squeeze().numpy()
+
+    n_dims = len(data.shape)
+
+    # ftype == 0 -> float32, ftype == 1 -> float16
+    ftype_cur = 0
+    if n_dims == 4:
+        print(f"tensor {name} is always saved in f16")
+        data = data.astype(np.float16)
+        ftype_cur = 1
+    elif ftype == 1:
+        if name[-7:] == ".weight" and n_dims == 2:
+            # print("  Converting to float16")
+            data = data.astype(np.float16)
+            ftype_cur = 1
+        else:
+            # print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+    else:
+        if data.dtype != np.float32:
+            # print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+    print(f"siglip {name} - {data.dtype} - shape = {data.shape}")
+    # print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
+    fout.add_tensor(name, data)
+
+
+fout.write_header_to_file()
+fout.write_kv_data_to_file()
+fout.write_tensors_to_file()
+fout.close()
+
+print("Done. Output file: " + fname_out)
--- a/examples/llava/glmedge-surgery.py
+++ b/examples/llava/glmedge-surgery.py
@ -0,0 +1,33 @@
+import argparse
+import os
+import torch
+from transformers import AutoModel
+
+ap = argparse.ArgumentParser()
+ap.add_argument("-m", "--model", help="Path to GLM model")
+args = ap.parse_args()
+
+# find the model part that includes the the multimodal projector weights
+model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True)
+checkpoint = model.state_dict()
+
+# get a list of mm tensor names
+mm_tensors = [k for k, v in checkpoint.items() if k.startswith("vision.adapter.")]
+
+# store these tensors in a new dictionary and torch.save them
+projector = {name: checkpoint[name].float() for name in mm_tensors}
+torch.save(projector, f"{args.model}/glm.projector")
+
+clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vision.vit.model.vision_model.")]
+if len(clip_tensors) > 0:
+    clip = {name.replace("vision.vit.model.", ""): checkpoint[name].float() for name in clip_tensors}
+    torch.save(clip, f"{args.model}/glm.clip")
+
+    # added tokens should be removed to be able to convert Mistral models
+    if os.path.exists(f"{args.model}/added_tokens.json"):
+        with open(f"{args.model}/added_tokens.json", "w") as f:
+            f.write("{}\n")
+
+print("Done!")
+print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
+print(f"Also, use {args.model}glm.projector to prepare a glm-encoder.gguf file.")
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@ -311,6 +311,20 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
        img_res_v.size = 0;
        img_res_v.data = nullptr;
    }
+    else if (clip_is_glm(ctx_clip)){
+        struct clip_image_size * load_image_size = clip_image_size_init();
+        load_image_size->width = img_res_v.data[0].nx;
+        load_image_size->height = img_res_v.data[0].ny;
+        clip_add_load_image_size(ctx_clip, load_image_size);
+
+        bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
+        int pos = int(load_image_size->width/clip_patch_size(ctx_clip)/2);
+        *n_img_pos = (pos * pos + 2);
+        if (!encoded){
+            LOG_ERR("Unable to encode image \n");
+            return false;
+        }
+    }
    else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
        // flat / default llava-1.5 type embedding
        *n_img_pos = clip_n_patches(ctx_clip);
@ -395,6 +409,9 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
    if (clip_is_minicpmv(ctx_clip)) {
        num_max_patches = 10;
    }
+    if (clip_is_glm(ctx_clip)) {
+        num_max_patches = 1;
+    }
    float * image_embd;
    if (clip_is_qwen2vl(ctx_clip)) {
        // qwen2vl don't split image into chunks, so `num_max_patches` is not needed.
--- a/examples/run/run.cpp
+++ b/examples/run/run.cpp
@ -24,15 +24,16 @@
 #include <string>
 #include <vector>

+#include "chat-template.hpp"
 #include "common.h"
 #include "json.hpp"
 #include "linenoise.cpp/linenoise.h"
 #include "llama-cpp.h"
-#include "chat-template.hpp"
+#include "log.h"

 #if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32)
 [[noreturn]] static void sigint_handler(int) {
-    printf("\n\033[0m");
+    printf("\n" LOG_COL_DEFAULT);
    exit(0);  // not ideal, but it's the only way to guarantee exit in all cases
 }
 #endif
@ -65,6 +66,13 @@ static int printe(const char * fmt, ...) {
    return ret;
 }

+// Renders the broken-down time `tm` as text using the std::put_time format
+// string `fmt` (for example "%Y%m%d") and returns the formatted string.
+static std::string strftime_fmt(const char * fmt, const std::tm & tm) {
+    std::ostringstream formatted;
+
+    formatted << std::put_time(&tm, fmt);
+    return formatted.str();
+}
+
 class Opt {
  public:
    int init(int argc, const char ** argv) {
@ -338,7 +346,7 @@ class HttpClient {
        if (!output_file.empty()) {
            output_file_partial = output_file + ".partial";
            if (!out.open(output_file_partial, "ab")) {
-                printe("Failed to open file\n");
+                printe("Failed to open file for writing\n");

                return 1;
            }
@ -527,8 +535,7 @@ class HttpClient {

    static void print_progress(const std::string & progress_prefix, const std::string & progress_bar,
                               const std::string & progress_suffix) {
-        printe("\r%*s\r%s%s| %s", get_terminal_width(), " ", progress_prefix.c_str(), progress_bar.c_str(),
-               progress_suffix.c_str());
+        printe("\r" LOG_CLR_TO_EOL "%s%s| %s", progress_prefix.c_str(), progress_bar.c_str(), progress_suffix.c_str());
    }
    // Function to write data to a file
    static size_t write_data(void * ptr, size_t size, size_t nmemb, void * stream) {
@ -698,6 +705,39 @@ class LlamaData {
        return download(url, bn, true);
    }

+    // Downloads `model` (expected form "<bucket>/<key>", i.e. an s3:// URL with
+    // the scheme already removed) into `bn` via download().
+    // Returns 1 when the bucket or key part is missing, when the AWS
+    // credentials are not set in the environment, or when the current UTC time
+    // cannot be obtained; otherwise returns the result of download().
+    //
+    // NOTE(review): the Authorization header built below carries only the
+    // Credential scope (region hard-coded to us-east-1); no SignedHeaders or
+    // Signature component is computed, and `secret_key` is only checked for
+    // presence. Presumably S3 rejects such a request for non-public objects -
+    // confirm and complete the Signature Version 4 signing (needs HMAC-SHA256).
+    int s3_dl(const std::string & model, const std::string & bn) {
+        const size_t slash_pos = model.find('/');
+        // require a non-empty bucket before the slash and a non-empty key after it
+        if (slash_pos == std::string::npos || slash_pos == 0 || slash_pos + 1 >= model.size()) {
+            return 1;
+        }
+
+        const std::string bucket     = model.substr(0, slash_pos);
+        const std::string key        = model.substr(slash_pos + 1);
+        const char *      access_key = std::getenv("AWS_ACCESS_KEY_ID");
+        const char *      secret_key = std::getenv("AWS_SECRET_ACCESS_KEY");
+        if (!access_key || !secret_key) {
+            printe("AWS credentials not found in environment\n");
+            return 1;
+        }
+
+        // Current UTC timestamp, used for the credential scope and x-amz-date.
+        // gmtime() may return nullptr, so check before copying its result.
+        const time_t    now    = time(nullptr);
+        const std::tm * utc_tm = gmtime(&now);
+        if (!utc_tm) {
+            printe("Failed to get current UTC time\n");
+            return 1;
+        }
+
+        const std::tm                  utc      = *utc_tm;
+        const std::string              date     = strftime_fmt("%Y%m%d", utc);
+        const std::string              datetime = strftime_fmt("%Y%m%dT%H%M%SZ", utc);
+        const std::vector<std::string> headers  = {
+            "Authorization: AWS4-HMAC-SHA256 Credential=" + std::string(access_key) + "/" + date +
+                "/us-east-1/s3/aws4_request",
+            "x-amz-content-sha256: UNSIGNED-PAYLOAD", "x-amz-date: " + datetime
+        };
+
+        const std::string url = "https://" + bucket + ".s3.amazonaws.com/" + key;
+
+        return download(url, bn, true, headers);
+    }
+
    std::string basename(const std::string & path) {
        const size_t pos = path.find_last_of("/\\");
        if (pos == std::string::npos) {
@ -738,6 +778,9 @@ class LlamaData {
            rm_until_substring(model_, "github:");
            rm_until_substring(model_, "://");
            ret = github_dl(model_, bn);
+        } else if (string_starts_with(model_, "s3://")) {
+            rm_until_substring(model_, "://");
+            ret = s3_dl(model_, bn);
        } else {  // ollama:// or nothing
            rm_until_substring(model_, "ollama.com/library/");
            rm_until_substring(model_, "://");
@ -753,16 +796,13 @@ class LlamaData {
    llama_model_ptr initialize_model(Opt & opt) {
        ggml_backend_load_all();
        resolve_model(opt.model_);
-        printe(
-            "\r%*s"
-            "\rLoading model",
-            get_terminal_width(), " ");
+        printe("\r" LOG_CLR_TO_EOL "Loading model");
        llama_model_ptr model(llama_model_load_from_file(opt.model_.c_str(), opt.model_params));
        if (!model) {
            printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
        }

-        printe("\r%*s\r", static_cast<int>(sizeof("Loading model")), " ");
+        printe("\r" LOG_CLR_TO_EOL);
        return model;
    }

@ -804,7 +844,15 @@ static int apply_chat_template(const common_chat_template & tmpl, LlamaData & ll
            });
        }
        try {
-            auto result = tmpl.apply(messages, /* tools= */ json(), append);
+            minja::chat_template_inputs tmpl_inputs;
+            tmpl_inputs.messages = messages;
+            tmpl_inputs.add_generation_prompt = append;
+
+            minja::chat_template_options tmpl_opts;
+            tmpl_opts.use_bos_token = false;
+            tmpl_opts.use_eos_token = false;
+
+            auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
            llama_data.fmtted.resize(result.size() + 1);
            memcpy(llama_data.fmtted.data(), result.c_str(), result.size() + 1);
            return result.size();
@ -847,7 +895,7 @@ static int check_context_size(const llama_context_ptr & ctx, const llama_batch &
    const int n_ctx      = llama_n_ctx(ctx.get());
    const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
    if (n_ctx_used + batch.n_tokens > n_ctx) {
-        printf("\033[0m\n");
+        printf(LOG_COL_DEFAULT "\n");
        printe("context size exceeded\n");
        return 1;
    }
@ -910,17 +958,14 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
        batch = llama_batch_get_one(&new_token_id, 1);
    }

-    printf("\033[0m");
+    printf(LOG_COL_DEFAULT);
    return 0;
 }

 static int read_user_input(std::string & user_input) {
    static const char * prompt_prefix = "> ";
 #ifdef WIN32
-    printf(
-        "\r%*s"
-        "\r\033[0m%s",
-        get_terminal_width(), " ", prompt_prefix);
+    printf("\r" LOG_CLR_TO_EOL LOG_COL_DEFAULT "%s", prompt_prefix);

    std::getline(std::cin, user_input);
    if (std::cin.eof()) {
@ -956,7 +1001,7 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt,
                             const bool stdout_a_terminal) {
    // Set response color
    if (stdout_a_terminal) {
-        printf("\033[33m");
+        printf(LOG_COL_YELLOW);
    }

    if (generate(llama_data, prompt, response)) {
@ -965,7 +1010,7 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt,
    }

    // End response with color reset and newline
-    printf("\n%s", stdout_a_terminal ? "\033[0m" : "");
+    printf("\n%s", stdout_a_terminal ? LOG_COL_DEFAULT : "");
    return 0;
 }

--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -126,7 +126,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
 | `--grammar-file FNAME` | file to read grammar from |
 | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
-| `--jinja` | Enable experimental Jinja templating engine (needed for tool use) |
+| `--jinja` | Enable experimental Jinja templating engine (required for tool use) |

 **Example-specific params**

@ -220,7 +220,7 @@ services:
 The project includes a web-based user interface that enables interaction with the model through the `/chat/completions` endpoint.

 The web UI is developed using:
- `vue` framework for frontend development
+- `react` framework for frontend development
 - `tailwindcss` and `daisyui` for styling
 - `vite` for build tooling

@ -1069,7 +1069,7 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte

 *Options:*

-See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
+See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). llama.cpp `/completion`-specific features such as `mirostat` are also supported.

 The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name",  "type": "string" }, "date": { "title": "Date",  "type": "string" }, "participants": { "items": {"type": "string" }, "title": "Participants",  "type": "string" } } } }`), similar to other OpenAI-inspired API providers.

@ -1117,17 +1117,119 @@ curl http://localhost:8080/v1/chat/completions \
 }'
 ```

-... and even tool usage (needs `--jinja` flag):
+*Tool call support*
+
+[Function calling](https://platform.openai.com/docs/guides/function-calling) is supported for all models (see https://github.com/ggerganov/llama.cpp/pull/9639):
+
+- Requires `--jinja` flag
+- Native tool call formats supported:
+  - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
+  - Functionary v3.1 / v3.2
+  - Hermes 2/3, Qwen 2.5
+  - Mistral Nemo
+  - Firefunction v2
+  - Command R7B
+  - DeepSeek R1 (WIP / seems reluctant to call any tools?)
+
+  <details>
+  <summary>Show some common templates and which format handler they use</summary>
+
+  | Template | Format |
+  |----------|--------|
+  | CohereForAI-c4ai-command-r-plus-default.jinja | generic tool calls |
+  | CohereForAI-c4ai-command-r-plus-rag.jinja | generic tool calls |
+  | CohereForAI-c4ai-command-r-plus-tool_use.jinja | generic tool calls |
+  | MiniMaxAI-MiniMax-Text-01.jinja | generic tool calls |
+  | NexaAIDev-Octopus-v2.jinja | generic tool calls |
+  | NousResearch-Hermes-2-Pro-Llama-3-8B-default.jinja | generic tool calls |
+  | NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja | hermes 2 pro tool calls |
+  | NousResearch-Hermes-2-Pro-Mistral-7B-default.jinja | generic tool calls |
+  | NousResearch-Hermes-2-Pro-Mistral-7B-tool_use.jinja | hermes 2 pro tool calls |
+  | NousResearch-Hermes-3-Llama-3.1-70B-default.jinja | generic tool calls |
+  | NousResearch-Hermes-3-Llama-3.1-70B-tool_use.jinja | hermes 2 pro tool calls |
+  | OrionStarAI-Orion-14B-Chat.jinja | generic tool calls |
+  | Qwen-QwQ-32B-Preview.jinja | hermes 2 pro tool calls |
+  | Qwen-Qwen2-7B-Instruct.jinja | generic tool calls |
+  | Qwen-Qwen2-VL-7B-Instruct.jinja | generic tool calls |
+  | Qwen-Qwen2.5-7B-Instruct.jinja | hermes 2 pro tool calls |
+  | Qwen-Qwen2.5-Math-7B-Instruct.jinja | hermes 2 pro tool calls |
+  | TheBloke-FusionNet_34Bx2_MoE-AWQ.jinja | generic tool calls |
+  | abacusai-Fewshot-Metamath-OrcaVicuna-Mistral.jinja | generic tool calls |
+  | bofenghuang-vigogne-2-70b-chat.jinja | generic tool calls |
+  | databricks-dbrx-instruct.jinja | generic tool calls |
+  | deepseek-ai-DeepSeek-Coder-V2-Instruct.jinja | generic tool calls |
+  | deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja | deepseek r1 tool calls |
+  | deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja | deepseek r1 tool calls |
+  | deepseek-ai-DeepSeek-R1-Distill-Qwen-7B.jinja | deepseek r1 tool calls |
+  | deepseek-ai-DeepSeek-V2.5.jinja | deepseek r1 tool calls |
+  | deepseek-ai-deepseek-coder-33b-instruct.jinja | generic tool calls |
+  | google-gemma-2-2b-it.jinja | generic tool calls |
+  | google-gemma-7b-it.jinja | generic tool calls |
+  | indischepartij-MiniCPM-3B-OpenHermes-2.5-v2.jinja | generic tool calls |
+  | mattshumer-Reflection-Llama-3.1-70B.jinja | generic tool calls |
+  | meetkai-functionary-medium-v3.2.jinja | functionary v3.2 tool calls |
+  | meta-llama-Llama-3.1-8B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) |
+  | meta-llama-Llama-3.2-3B-Instruct.jinja | llama 3.x tool calls |
+  | meta-llama-Llama-3.3-70B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) |
+  | meta-llama-Meta-Llama-3.1-8B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) |
+  | microsoft-Phi-3-medium-4k-instruct.jinja | generic tool calls |
+  | microsoft-Phi-3-mini-4k-instruct.jinja | generic tool calls |
+  | microsoft-Phi-3-small-8k-instruct.jinja | generic tool calls |
+  | microsoft-Phi-3.5-mini-instruct.jinja | generic tool calls |
+  | microsoft-Phi-3.5-vision-instruct.jinja | generic tool calls |
+  | mistralai-Mistral-7B-Instruct-v0.2.jinja | generic tool calls |
+  | mistralai-Mistral-Large-Instruct-2407.jinja | mistral nemo tool calls |
+  | mistralai-Mistral-Large-Instruct-2411.jinja | generic tool calls |
+  | mistralai-Mistral-Nemo-Instruct-2407.jinja | mistral nemo tool calls |
+  | mistralai-Mixtral-8x7B-Instruct-v0.1.jinja | generic tool calls |
+  | mlabonne-AlphaMonarch-7B.jinja | generic tool calls |
+  | nvidia-Llama-3.1-Nemotron-70B-Instruct-HF.jinja | llama 3.x tool calls (w/ builtin tools) |
+  | openchat-openchat-3.5-0106.jinja | generic tool calls |
+  | teknium-OpenHermes-2.5-Mistral-7B.jinja | generic tool calls |
+
+  This table can be generated with:
+
+  ```bash
+  ./build/bin/test-chat ../minja/build/tests/*.jinja 2>/dev/null
+  ```
+
+  </details>
+
+- Generic tool call is supported when the template isn't recognized by native format handlers (you'll see `Chat format: Generic` in the logs).
+  - Use `--chat-template-file` to override the template when appropriate (see examples below)
+  - Generic support may consume more tokens and be less efficient than a model's native format.
+
+- Run with:

  ```shell
-  llama-server --jinja -hfr lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF -hff Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf -fa
+  # Native support:
+  llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M
+  llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q6_K_L
+  llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M
+  llama-server --jinja -fa -hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M

-  # https://huggingface.co/meetkai/functionary-medium-v3.2
-  llama-server --jinja -hfr bartowski/functionary-medium-v3.2-GGUF -hff functionary-medium-v3.2-IQ4_XS.gguf -fa
+  # Native support requires the right template for these GGUFs:

-  # https://huggingface.co/meetkai/functionary-medium-v3.1
-  llama-server --jinja -hfr meetkai/functionary-medium-v3.1-GGUF -hff functionary-medium-llama-3.1.Q4_0.gguf -fa
+  llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \
+    --chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use )

+  llama-server --jinja -fa -hf bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M \
+    --chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use )
+
+  llama-server --jinja -fa -hf bartowski/firefunction-v2-GGUF -hff firefunction-v2-IQ1_M.gguf \
+    --chat-template-file <( python scripts/get_chat_template.py fireworks-ai/llama-3-firefunction-v2 tool_use )
+
+  llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L \
+    --chat-template-file <( python scripts/get_chat_template.py CohereForAI/c4ai-command-r7b-12-2024 tool_use )
+
+  # Generic format support
+  llama-server --jinja -fa -hf bartowski/phi-4-GGUF:Q4_0
+  llama-server --jinja -fa -hf bartowski/gemma-2-2b-it-GGUF:Q8_0
+  llama-server --jinja -fa -hf bartowski/c4ai-command-r-v01-GGUF:Q2_K
+  ```
+
+- Test in CLI:
+
+  ```bash
  curl http://localhost:8080/v1/chat/completions -d '{
    "model": "gpt-3.5-turbo",
    "tools": [
--- a/examples/server/public/index.html.gz
+++ b/examples/server/public/index.html.gz
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -131,6 +131,11 @@ struct slot_params {
            lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
        }

+        std::vector<std::string> grammar_trigger_words;
+        for (const auto & trigger : sampling.grammar_trigger_words) {
+            grammar_trigger_words.push_back(trigger.word);
+        }
+
        return json {
            {"n_predict",                 n_predict},     // Server configured n_predict
            {"seed",                      sampling.seed},
@ -165,8 +170,9 @@ struct slot_params {
            {"n_probs",                   sampling.n_probs},
            {"min_keep",                  sampling.min_keep},
            {"grammar",                   sampling.grammar},
-            // {"grammar_trigger_words",     sampling.grammar_trigger_words},
+            {"grammar_trigger_words",     grammar_trigger_words},
            {"grammar_trigger_tokens",    sampling.grammar_trigger_tokens},
+            {"preserved_tokens",          sampling.preserved_tokens},
            {"samplers",                  samplers},
            {"speculative.n_max",         speculative.n_max},
            {"speculative.n_min",         speculative.n_min},
@ -328,24 +334,24 @@ struct server_task {
        if (data.contains("json_schema") && !data.contains("grammar")) {
            try {
                auto schema                  = json_value(data, "json_schema", json::object());
-                LOG_DBG("JSON schema: %s\n", schema.dump(2).c_str());
+                SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str());
                params.sampling.grammar      = json_schema_to_grammar(schema);
-                LOG_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
+                SRV_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
            } catch (const std::exception & e) {
                throw std::runtime_error(std::string("\"json_schema\": ") + e.what());
            }
        } else {
            params.sampling.grammar      = json_value(data, "grammar", defaults.sampling.grammar);
-            LOG_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
+            SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
            params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy);
-            LOG_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
+            SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
        }

        {
            auto it = data.find("chat_format");
            if (it != data.end()) {
                params.oaicompat_chat_format = static_cast<common_chat_format>(it->get<int>());
-                LOG_DBG("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
+                SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
            } else {
                params.oaicompat_chat_format = defaults.oaicompat_chat_format;
            }
@ -361,14 +367,28 @@ struct server_task {

                    auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true);
                    if (ids.size() == 1) {
-                        LOG_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
+                        SRV_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
                        params.sampling.grammar_trigger_tokens.push_back(ids[0]);
+                        params.sampling.preserved_tokens.insert(ids[0]);
                        continue;
                    }
-                    LOG_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
+                    SRV_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
                    params.sampling.grammar_trigger_words.push_back(trigger);
                }
            }
+            const auto preserved_tokens = data.find("preserved_tokens");
+            if (preserved_tokens != data.end()) {
+                for (const auto & t : *preserved_tokens) {
+                    auto ids = common_tokenize(vocab, t.get<std::string>(), /* add_special= */ false, /* parse_special= */ true);
+                    if (ids.size() == 1) {
+                        SRV_DBG("Preserved token: %d\n", ids[0]);
+                        params.sampling.preserved_tokens.insert(ids[0]);
+                    } else {
+                        // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
+                        SRV_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get<std::string>().c_str());
+                    }
+                }
+            }
            if (params.sampling.grammar_lazy) {
                GGML_ASSERT(params.sampling.grammar_trigger_tokens.size() > 0 || params.sampling.grammar_trigger_words.size() > 0);
            }
@ -695,37 +715,43 @@ struct server_task_result_cmpl_final : server_task_result {

    json to_json_oaicompat_chat() {
        std::string finish_reason = "length";
-        common_chat_msg message;
+        common_chat_msg msg;
        if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
-            message = common_chat_parse(content, oaicompat_chat_format);
-            finish_reason = message.tool_calls.empty() ? "stop" : "tool_calls";
+            SRV_DBG("Parsing chat message: %s\n", content.c_str());
+            msg = common_chat_parse(content, oaicompat_chat_format);
+            finish_reason = msg.tool_calls.empty() ? "stop" : "tool_calls";
        } else {
-            message.content = content;
+            msg.content = content;
        }

        json tool_calls;
-        if (!message.tool_calls.empty()) {
+        if (!msg.tool_calls.empty()) {
            tool_calls = json::array();
-            for (const auto & tc : message.tool_calls) {
+            for (const auto & tc : msg.tool_calls) {
                tool_calls.push_back({
                    {"type", "function"},
                    {"function", {
                        {"name", tc.name},
                        {"arguments", tc.arguments},
                    }},
-                    {"id", tc.id.empty() ? json() : json(tc.id)},
+                    {"id", tc.id},
                });
            }
        }

+        json message {
+            {"content", msg.content},
+            {"tool_calls", tool_calls},
+            {"role", "assistant"},
+        };
+        if (!msg.tool_plan.empty()) {
+            message["tool_plan"] = msg.tool_plan;
+        }
+
        json choice {
            {"finish_reason", finish_reason},
            {"index", 0},
-            {"message", json {
-                {"content", message.content},
-                {"tool_calls", tool_calls},
-                {"role", "assistant"},
-            }},
+            {"message", message},
        };

        if (!stream && probs_output.size() > 0) {
@ -1858,7 +1884,12 @@ struct server_context {
            llama_init_dft.context.reset();
        }

-        chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
+        if (params_base.chat_template.empty() && !validate_builtin_chat_template(params.use_jinja)) {
+            SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
+            chat_templates = common_chat_templates_from_model(model, "chatml");
+        } else {
+            chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
+        }
        GGML_ASSERT(chat_templates.template_default.get() != nullptr);

        return true;
@ -2827,8 +2858,7 @@ struct server_context {
        server_slot * slot_batched = nullptr;

        auto accept_special_token = [&](server_slot & slot, llama_token token) {
-            const auto & trigger_tokens = slot.params.sampling.grammar_trigger_tokens;
-            return params_base.special || std::find(trigger_tokens.begin(), trigger_tokens.end(), token) != trigger_tokens.end();
+            return params_base.special || slot.params.sampling.preserved_tokens.find(token) != slot.params.sampling.preserved_tokens.end();
        };

        // frist, add sampled tokens from any ongoing sequences
@ -3323,10 +3353,12 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp
        return;
    }

-    LOG_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
+    // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch

-    LOG_DBG("request:  %s\n", req.body.c_str());
-    LOG_DBG("response: %s\n", res.body.c_str());
+    SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
+
+    SRV_DBG("request:  %s\n", req.body.c_str());
+    SRV_DBG("response: %s\n", res.body.c_str());
 }

 std::function<void(int)> shutdown_handler;
@ -3409,9 +3441,13 @@ int main(int argc, char ** argv) {
            message = "Unknown Exception";
        }

-        json formatted_error = format_error_response(message, ERROR_TYPE_SERVER);
-        LOG_WRN("got exception: %s\n", formatted_error.dump().c_str());
-        res_error(res, formatted_error);
+        try {
+            json formatted_error = format_error_response(message, ERROR_TYPE_SERVER);
+            LOG_WRN("got exception: %s\n", formatted_error.dump().c_str());
+            res_error(res, formatted_error);
+        } catch (const std::exception & e) {
+            LOG_ERR("got another exception: %s | while handling exception: %s\n", e.what(), message.c_str());
+        }
    });

    svr->set_error_handler([&res_error](const httplib::Request &, httplib::Response & res) {
@ -3633,11 +3669,11 @@ int main(int argc, char ** argv) {
                    {"value",  (uint64_t) res_metrics->kv_cache_tokens_count}
            },{
                    {"name",  "requests_processing"},
-                    {"help",  "Number of request processing."},
+                    {"help",  "Number of requests processing."},
                    {"value",  (uint64_t) res_metrics->n_processing_slots}
            },{
                    {"name",  "requests_deferred"},
-                    {"help",  "Number of request deferred."},
+                    {"help",  "Number of requests deferred."},
                    {"value",  (uint64_t) res_metrics->n_tasks_deferred}
            }}}
        };
@ -3824,7 +3860,9 @@ int main(int argc, char ** argv) {

        try {
            const auto & prompt = data.at("prompt");
-            LOG_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
+            // TODO: this log can become very long, put it behind a flag or think about a more compact format
+            //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
+
            std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
            tasks.reserve(tokenized_prompts.size());
            for (size_t i = 0; i < tokenized_prompts.size(); i++) {
@ -4340,6 +4378,9 @@ int main(int argc, char ** argv) {
                    res.set_content("Error: gzip is not supported by this browser", "text/plain");
                } else {
                    res.set_header("Content-Encoding", "gzip");
+                    // COEP and COOP headers, required by pyodide (python interpreter)
+                    res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
+                    res.set_header("Cross-Origin-Opener-Policy", "same-origin");
                    res.set_content(reinterpret_cast<const char*>(index_html_gz), index_html_gz_len, "text/html; charset=utf-8");
                }
                return false;
@ -4435,14 +4476,6 @@ int main(int argc, char ** argv) {

    LOG_INF("%s: model loaded\n", __func__);

-    // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
-    if (params.chat_template.empty()) {
-        if (!ctx_server.validate_builtin_chat_template(params.use_jinja)) {
-            LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
-            params.chat_template = "chatml";
-        }
-    }
-
    // print sample chat example to make it clear which template is used
    LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
        ctx_server.chat_templates.template_default->source().c_str(),
--- a/examples/server/tests/unit/test_chat_completion.py
+++ b/examples/server/tests/unit/test_chat_completion.py
@ -13,12 +13,14 @@ def create_server():
@pytest.mark.parametrize(
    "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason,jinja,chat_template",
    [
+        (None, "Book", "Hey", 8, "But she couldn't", 69, 8, "length", False, None),
+        (None, "Book", "Hey", 8, "But she couldn't", 69, 8, "length", True, None),
        (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", False, None),
+        (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", True,  None),
+        (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", True, 'chatml'),
+        (None, "Book", "What is the best book", 8, "^ blue",                    23, 8, "length", True, "This is not a chat template, it is"),
        ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", False, None),
-        # TODO: fix testing of non-tool jinja mode
-        # (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", True, None),
-        # (None, "Book", "What is the best book", 8, "I want to play with", 23, 8, "length", True, "This is not a chat template, it is"),
-        # ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", True, None),
+        ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", True, None),
    ]
 )
 def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason, jinja, chat_template):
--- a/examples/server/tests/unit/test_tool_call.py
+++ b/examples/server/tests/unit/test_tool_call.py
@ -67,8 +67,8 @@ WEATHER_TOOL = {


 def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, argument_key: str | None):
-    n_predict = 512
    global server
+    n_predict = 512
    # server = ServerPreset.stories15m_moe()
    server.jinja = True
    server.n_predict = n_predict
@ -139,29 +139,49 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict,
@pytest.mark.parametrize("tool,argument_key,hf_repo,template_override", [
    (TEST_TOOL,    "success",  "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
    (PYTHON_TOOL,  "code",     "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
+    (PYTHON_TOOL,  "code",     "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
+
+    # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
    (TEST_TOOL,    "success",  "bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
    (PYTHON_TOOL,  "code",     "bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
+
    (TEST_TOOL,    "success",  "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
    (PYTHON_TOOL,  "code",     "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
+    (PYTHON_TOOL,  "code",     "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      "chatml"),
+
    (TEST_TOOL,    "success",  "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
    (PYTHON_TOOL,  "code",     "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
+    (PYTHON_TOOL,  "code",     "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        "chatml"),
+
    (TEST_TOOL,    "success",  "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
    (PYTHON_TOOL,  "code",     "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
+    (PYTHON_TOOL,  "code",     "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
+
    (TEST_TOOL,    "success",  "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",   ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
    (PYTHON_TOOL,  "code",     "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",   ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
+    (PYTHON_TOOL,  "code",     "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",   "chatml"),
+
    (TEST_TOOL,    "success",  "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
    (PYTHON_TOOL,  "code",     "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
-    (TEST_TOOL,    "success",  "bartowski/functionary-small-v3.2-GGUF:Q8_0",       ("meetkai/functionary-medium-v3.2", None)),
-    (PYTHON_TOOL,  "code",     "bartowski/functionary-small-v3.2-GGUF:Q8_0",       ("meetkai/functionary-medium-v3.2", None)),
+    (PYTHON_TOOL,  "code",     "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
+
+    (TEST_TOOL,    "success",  "bartowski/functionary-small-v3.2-GGUF:Q4_K_M",       ("meetkai/functionary-medium-v3.2", None)),
+    (PYTHON_TOOL,  "code",     "bartowski/functionary-small-v3.2-GGUF:Q4_K_M",       ("meetkai/functionary-medium-v3.2", None)),
+    (PYTHON_TOOL,  "code",     "bartowski/functionary-small-v3.2-GGUF:Q4_K_M",       "chatml"),
+
    (TEST_TOOL,    "success",  "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
    (PYTHON_TOOL,  "code",     "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
+    (PYTHON_TOOL,  "code",     "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      "chatml"),
+
    (TEST_TOOL,    "success",  "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
    (PYTHON_TOOL,  "code",     "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
+    (PYTHON_TOOL,  "code",     "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      "chatml"),
    # TODO: fix these
    # (TEST_TOOL,    "success",  "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
    # (PYTHON_TOOL,  "code",     "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
 ])
-def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: Tuple[str, str | None] | None):
+def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
+    global server
    n_predict = 512
    server.n_slots = 1
    server.jinja = True
@ -169,10 +189,12 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str
    server.n_predict = n_predict
    server.model_hf_repo = hf_repo
    server.model_hf_file = None
-    if template_override:
+    if isinstance(template_override, tuple):
        (template_hf_repo, template_variant) = template_override
        server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
        assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
+    elif isinstance(template_override, str):
+        server.chat_template = template_override
    server.start(timeout_seconds=TIMEOUT_SERVER_START)
    res = server.make_request("POST", "/chat/completions", data={
        "max_tokens": n_predict,
@ -251,33 +273,55 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t

@pytest.mark.slow
@pytest.mark.parametrize("hf_repo,template_override", [
+    ("bartowski/c4ai-command-r7b-12-2024-GGUF:Q4_K_M",   ("CohereForAI/c4ai-command-r7b-12-2024", "tool_use")),
    ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
-    ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
+    ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
+
    ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
+    ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      "chatml"),
+
    ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
-    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
-    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",   ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
+    ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        "chatml"),
+
+    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
+    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    "chatml"),
+
+    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
+    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      "chatml"),
+
    ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
+    ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
+
    ("bartowski/functionary-small-v3.2-GGUF:Q8_0",       ("meetkai/functionary-medium-v3.2", None)),
+    ("bartowski/functionary-small-v3.2-GGUF:Q8_0",       "chatml"),
+
    ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
+    ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      "chatml"),
+
+    # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
+    ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
+
    # ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
    # ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
 ])
-def test_weather_tool_call(hf_repo: str, template_override: Tuple[str, str | None] | None):
+def test_weather(hf_repo: str, template_override: str | Tuple[str, str | None] | None):
    global server
+    n_predict = 512
    server.n_slots = 1
    server.jinja = True
    server.n_ctx = 8192
-    server.n_predict = 512
+    server.n_predict = n_predict
    server.model_hf_repo = hf_repo
    server.model_hf_file = None
-    if template_override:
+    if isinstance(template_override, tuple):
        (template_hf_repo, template_variant) = template_override
        server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
        assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
+    elif isinstance(template_override, str):
+        server.chat_template = template_override
    server.start(timeout_seconds=TIMEOUT_SERVER_START)
    res = server.make_request("POST", "/chat/completions", data={
-        "max_tokens": 256,
+        "max_tokens": n_predict,
        "messages": [
            {"role": "user", "content": "What is the weather in Istanbul?"},
        ],
@ -298,19 +342,39 @@ def test_weather_tool_call(hf_repo: str, template_override: Tuple[str, str | Non

@pytest.mark.slow
@pytest.mark.parametrize("expected_arguments_override,hf_repo,template_override", [
-    (None,                 "bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
    (None,                 "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
+    (None,                 "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      "chatml"),
+
    (None,                 "bartowski/functionary-small-v3.2-GGUF:Q8_0",       ("meetkai-functionary-medium-v3.2", None)),
-    ('{"code":"print("}',  "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
-    (None,                 "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      ("meta-llama-Llama-3.2-3B-Instruct", None)),
+    (None,                 "bartowski/functionary-small-v3.2-GGUF:Q8_0",       "chatml"),
+
+    (None,                 "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
+    ('{"code":"print("}',  "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
+
+    ('{"code":"print("}',  "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      ("meta-llama-Llama-3.2-3B-Instruct", None)),
+    (None,                 "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      "chatml"),
+
    ('{"code":"print("}',  "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      ("meta-llama-Llama-3.2-3B-Instruct", None)),
+    ('{"code":"print("}',  "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      "chatml"),
+
    (None,                 "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
-    (None,                 "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",      ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
-    (None,                 "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",   ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")),
+    (None,                 "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        "chatml"),
+
+    (None,                 "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
+    (None,                 "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    "chatml"),
+
+    (None,                 "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")),
+    (None,                 "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      "chatml"),
+
    (None,                 "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
+    (None,                 "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
+
+    # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
+    (None,                 "bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
+
    # (None,                 "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
 ])
-def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo: str, template_override: Tuple[str, str | None] | None):
+def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
    global server
    server.n_slots = 1
    server.jinja = True
@ -318,10 +382,12 @@ def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo:
    server.n_predict = 128
    server.model_hf_repo = hf_repo
    server.model_hf_file = None
-    if template_override:
+    if isinstance(template_override, tuple):
        (template_hf_repo, template_variant) = template_override
        server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
        assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
+    elif isinstance(template_override, str):
+        server.chat_template = template_override
    server.start(timeout_seconds=TIMEOUT_SERVER_START)
    res = server.make_request("POST", "/chat/completions", data={
        "max_tokens": 256,
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@ -5,10 +5,6 @@
 #include "llama.h"
 #include "common/base64.hpp"

-#ifndef NDEBUG
-// crash the server in debug mode, otherwise send an http 500 error
-#define CPPHTTPLIB_NO_EXCEPTIONS 1
-#endif
 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
 #include "httplib.h"
@ -484,13 +480,14 @@ static bool ends_with(const std::string & str, const std::string & suffix) {

 static size_t find_partial_stop_string(const std::string &stop, const std::string &text) {
    if (!text.empty() && !stop.empty()) {
-        auto it = std::find(stop.rbegin(), stop.rend(), text.back());
-        while (it != stop.rend()) {
-            size_t length = std::distance(it, stop.rend());
-            if (text.length() >= length && 0 == text.compare(text.length() - length, length, stop)) {
-                return text.length() - length;
+        const char text_last_char = text.back();
+        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
+            if (stop[char_index] == text_last_char) {
+                const std::string current_partial = stop.substr(0, char_index + 1);
+                if (ends_with(text, current_partial)) {
+                    return text.size() - char_index - 1;
+                }
            }
-            it = std::find(std::next(it), stop.rend(), text.back());
        }
    }

@ -640,9 +637,13 @@ static json oaicompat_completion_params_parse(
        inputs.tools = tools;
        inputs.tool_choice = tool_choice;
        inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
+        if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) {
+            LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
+            inputs.parallel_tool_calls = false;
+        }
        inputs.stream = stream;
        // TODO: support mixing schema w/ tools beyond generic format.
-        inputs.json_schema = json_value(llama_params, "json_schema", json::object());
+        inputs.json_schema = json_value(llama_params, "json_schema", json());
        auto chat_params = common_chat_params_init(tmpl, inputs);

        llama_params["chat_format"] = static_cast<int>(chat_params.format);
@ -657,6 +658,7 @@ static json oaicompat_completion_params_parse(
            });
        }
        llama_params["grammar_triggers"] = grammar_triggers;
+        llama_params["preserved_tokens"] = chat_params.preserved_tokens;
        for (const auto & stop : chat_params.additional_stops) {
            llama_params["stop"].push_back(stop);
        }
--- a/examples/server/webui/.gitignore
+++ b/examples/server/webui/.gitignore
@ -0,0 +1,24 @@
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+pnpm-debug.log*
+lerna-debug.log*
+
+node_modules
+dist
+dist-ssr
+*.local
+
+# Editor directories and files
+.vscode/*
+!.vscode/extensions.json
+.idea
+.DS_Store
+*.suo
+*.ntvs*
+*.njsproj
+*.sln
+*.sw?
--- a/examples/server/webui/.prettierignore
+++ b/examples/server/webui/.prettierignore
@ -0,0 +1,10 @@
+**/.vscode
+**/.github
+**/.git
+**/.svn
+**/.hg
+**/node_modules
+**/dist
+**/build
+
+*.config.js
--- a/examples/server/webui/eslint.config.js
+++ b/examples/server/webui/eslint.config.js
@ -0,0 +1,26 @@
+import js from '@eslint/js'
+import globals from 'globals'
+import reactHooks from 'eslint-plugin-react-hooks'
+import reactRefresh from 'eslint-plugin-react-refresh'
+import tseslint from 'typescript-eslint'
+
+export default tseslint.config(
+  { ignores: ['dist'] },
+  {
+    extends: [js.configs.recommended, ...tseslint.configs.recommended],
+    files: ['**/*.{ts,tsx}'],
+    languageOptions: {
+      ecmaVersion: 2020,
+      globals: globals.browser,
+    },
+    plugins: {
+      'react-hooks': reactHooks,
+      'react-refresh': reactRefresh,
+    },
+    rules: {
+      ...reactHooks.configs.recommended.rules,
+      'react-refresh/only-export-components': 'off',
+      '@typescript-eslint/no-unused-vars': 'off',
+    },
+  },
+)
--- a/examples/server/webui/index.html
+++ b/examples/server/webui/index.html
@ -1,343 +1,16 @@
-<!DOCTYPE html>
+<!doctype html>
 <html>
-<head>
-  <meta charset="UTF-8">
-  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
-  <meta name="color-scheme" content="light dark">
-  <title>🦙 llama.cpp - chat</title>
-</head>
-
-<body>
-  <div id="app" class="opacity-0"> <!-- opacity-0 will be removed on app mounted -->
-    <div class="flex flex-row drawer lg:drawer-open">
-      <input id="toggle-drawer" type="checkbox" class="drawer-toggle" checked />
-
-      <!-- sidebar -->
-      <div class="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
-        <label for="toggle-drawer" aria-label="close sidebar" class="drawer-overlay"></label>
-        <div class="flex flex-col bg-base-200 min-h-full max-w-64 py-4 px-4">
-          <div class="flex flex-row items-center justify-between mb-4 mt-4">
-            <h2 class="font-bold ml-4">Conversations</h2>
-
-            <!-- close sidebar button -->
-            <label for="toggle-drawer" class="btn btn-ghost lg:hidden">
-              <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-arrow-bar-left" viewBox="0 0 16 16">
-                <path fill-rule="evenodd" d="M12.5 15a.5.5 0 0 1-.5-.5v-13a.5.5 0 0 1 1 0v13a.5.5 0 0 1-.5.5M10 8a.5.5 0 0 1-.5.5H3.707l2.147 2.146a.5.5 0 0 1-.708.708l-3-3a.5.5 0 0 1 0-.708l3-3a.5.5 0 1 1 .708.708L3.707 7.5H9.5a.5.5 0 0 1 .5.5"/>
-              </svg>
-            </label>
-          </div>
-
-          <!-- list of conversations -->
-          <div :class="{
-            'btn btn-ghost justify-start': true,
-            'btn-active': messages.length === 0,
-          }" @click="newConversation">
-            + New conversation
-          </div>
-          <div v-for="conv in conversations" :class="{
-            'btn btn-ghost justify-start font-normal': true,
-            'btn-active': conv.id === viewingConvId,
-          }" @click="setViewingConv(conv.id)" dir="auto">
-            <span class="truncate">{{ conv.messages[0].content }}</span>
-          </div>
-          <div class="text-center text-xs opacity-40 mt-auto mx-4">
-            Conversations are saved to browser's localStorage
-          </div>
-        </div>
-      </div>
-
-      <!-- main view -->
-      <div class="chat-screen drawer-content grow flex flex-col h-screen w-screen mx-auto px-4">
-        <!-- header -->
-        <div class="flex flex-row items-center mt-6 mb-6">
-          <!-- open sidebar button -->
-          <label for="toggle-drawer" class="btn btn-ghost lg:hidden">
-            <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-list" viewBox="0 0 16 16">
-              <path fill-rule="evenodd" d="M2.5 12a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5"/>
-            </svg>
-          </label>
-
-          <div class="grow text-2xl font-bold ml-2">llama.cpp</div>
-
-          <!-- action buttons (top right) -->
-          <div class="flex items-center">
-            <div v-if="messages.length > 0" class="dropdown dropdown-end">
-              <!-- "..." button -->
-              <button tabindex="0" role="button" class="btn m-1" :disabled="isGenerating">
-                <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-three-dots-vertical" viewBox="0 0 16 16">
-                  <path d="M9.5 13a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0"/>
-                </svg>
-              </button>
-              <!-- "delete" dropdown menu -->
-              <ul tabindex="0" class="dropdown-content menu bg-base-100 rounded-box z-[1] w-52 p-2 shadow">
-                <li @click="downloadConv(viewingConvId)"><a>Download</a></li>
-                <li class="text-error" @click="deleteConv(viewingConvId)"><a>Delete</a></li>
-              </ul>
-            </div>
-            <div class="tooltip tooltip-bottom" data-tip="Settings">
-              <button class="btn" @click="showConfigDialog = true" :disabled="isGenerating">
-                <!-- settings button -->
-                <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-gear" viewBox="0 0 16 16">
-                  <path d="M8 4.754a3.246 3.246 0 1 0 0 6.492 3.246 3.246 0 0 0 0-6.492M5.754 8a2.246 2.246 0 1 1 4.492 0 2.246 2.246 0 0 1-4.492 0"/>
-                  <path d="M9.796 1.343c-.527-1.79-3.065-1.79-3.592 0l-.094.319a.873.873 0 0 1-1.255.52l-.292-.16c-1.64-.892-3.433.902-2.54 2.541l.159.292a.873.873 0 0 1-.52 1.255l-.319.094c-1.79.527-1.79 3.065 0 3.592l.319.094a.873.873 0 0 1 .52 1.255l-.16.292c-.892 1.64.901 3.434 2.541 2.54l.292-.159a.873.873 0 0 1 1.255.52l.094.319c.527 1.79 3.065 1.79 3.592 0l.094-.319a.873.873 0 0 1 1.255-.52l.292.16c1.64.893 3.434-.902 2.54-2.541l-.159-.292a.873.873 0 0 1 .52-1.255l.319-.094c1.79-.527 1.79-3.065 0-3.592l-.319-.094a.873.873 0 0 1-.52-1.255l.16-.292c.893-1.64-.902-3.433-2.541-2.54l-.292.159a.873.873 0 0 1-1.255-.52zm-2.633.283c.246-.835 1.428-.835 1.674 0l.094.319a1.873 1.873 0 0 0 2.693 1.115l.291-.16c.764-.415 1.6.42 1.184 1.185l-.159.292a1.873 1.873 0 0 0 1.116 2.692l.318.094c.835.246.835 1.428 0 1.674l-.319.094a1.873 1.873 0 0 0-1.115 2.693l.16.291c.415.764-.42 1.6-1.185 1.184l-.291-.159a1.873 1.873 0 0 0-2.693 1.116l-.094.318c-.246.835-1.428.835-1.674 0l-.094-.319a1.873 1.873 0 0 0-2.692-1.115l-.292.16c-.764.415-1.6-.42-1.184-1.185l.159-.291A1.873 1.873 0 0 0 1.945 8.93l-.319-.094c-.835-.246-.835-1.428 0-1.674l.319-.094A1.873 1.873 0 0 0 3.06 4.377l-.16-.292c-.415-.764.42-1.6 1.185-1.184l.292.159a1.873 1.873 0 0 0 2.692-1.115z"/>
-                </svg>
-              </button>
-            </div>
-
-            <!-- theme controller is copied from https://daisyui.com/components/theme-controller/ -->
-            <div class="tooltip tooltip-bottom" data-tip="Themes">
-              <div class="dropdown dropdown-end dropdown-bottom">
-                <div tabindex="0" role="button" class="btn m-1">
-                  <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-palette2" viewBox="0 0 16 16">
-                    <path d="M0 .5A.5.5 0 0 1 .5 0h5a.5.5 0 0 1 .5.5v5.277l4.147-4.131a.5.5 0 0 1 .707 0l3.535 3.536a.5.5 0 0 1 0 .708L10.261 10H15.5a.5.5 0 0 1 .5.5v5a.5.5 0 0 1-.5.5H3a3 3 0 0 1-2.121-.879A3 3 0 0 1 0 13.044m6-.21 7.328-7.3-2.829-2.828L6 7.188zM4.5 13a1.5 1.5 0 1 0-3 0 1.5 1.5 0 0 0 3 0M15 15v-4H9.258l-4.015 4zM0 .5v12.495zm0 12.495V13z"/>
-                  </svg>
-                </div>
-                <ul tabindex="0" class="dropdown-content bg-base-300 rounded-box z-[1] w-52 p-2 shadow-2xl h-80 overflow-y-auto">
-                  <li>
-                    <button
-                      class="btn btn-sm btn-block btn-ghost justify-start"
-                      :class="{ 'btn-active': selectedTheme === 'auto' }"
-                      @click="setSelectedTheme('auto')">
-                      auto
-                    </button>
-                  </li>
-                  <li v-for="theme in themes">
-                    <input
-                      type="radio"
-                      name="theme-dropdown"
-                      class="theme-controller btn btn-sm btn-block btn-ghost justify-start"
-                      :aria-label="theme"
-                      :value="theme"
-                      :checked="selectedTheme === theme"
-                      @click="setSelectedTheme(theme)" />
-                  </li>
-                </ul>
-              </div>
-            </div>
-          </div>
-        </div>
-
-        <!-- chat messages -->
-        <div id="messages-list" class="flex flex-col grow overflow-y-auto">
-          <div class="mt-auto flex justify-center">
-            <!-- placeholder to shift the message to the bottom -->
-            {{ messages.length === 0 ? 'Send a message to start' : '' }}
-          </div>
-          <div v-for="msg in messages" class="group">
-            <message-bubble
-              :config="config"
-              :msg="msg"
-              :key="msg.id"
-              :is-generating="isGenerating"
-              :edit-user-msg-and-regenerate="editUserMsgAndRegenerate"
-              :regenerate-msg="regenerateMsg"></message-bubble>
-          </div>
-
-          <!-- pending (ongoing) assistant message -->
-          <div id="pending-msg" class="group">
-            <message-bubble
-              v-if="pendingMsg"
-              :config="config"
-              :msg="pendingMsg"
-              :key="pendingMsg.id"
-              :is-generating="isGenerating"
-              :show-thought-in-progress="config.showThoughtInProgress"
-              :edit-user-msg-and-regenerate="() => {}"
-              :regenerate-msg="() => {}"></message-bubble>
-          </div>
-        </div>
-
-        <!-- chat input -->
-        <div class="flex flex-row items-center mt-8 mb-6">
-          <textarea
-            class="textarea textarea-bordered w-full"
-            placeholder="Type a message (Shift+Enter to add a new line)"
-            v-model="inputMsg"
-            @keydown.enter.exact.prevent="sendMessage"
-            @keydown.enter.shift.exact.prevent="inputMsg += '\n'"
-            :disabled="isGenerating"
-            id="msg-input"
-            dir="auto"
-          ></textarea>
-          <button v-if="!isGenerating" class="btn btn-primary ml-2" @click="sendMessage" :disabled="inputMsg.length === 0">Send</button>
-          <button v-else class="btn btn-neutral ml-2" @click="stopGeneration">Stop</button>
-        </div>
-      </div>
-
-    </div>
-
-
-    <!-- modal for editing config -->
-    <dialog class="modal" :class="{'modal-open': showConfigDialog}">
-      <div class="modal-box">
-        <h3 class="text-lg font-bold mb-6">Settings</h3>
-        <div class="h-[calc(90vh-12rem)] overflow-y-auto">
-          <p class="opacity-40 mb-6">Settings below are saved in browser's localStorage</p>
-          <settings-modal-short-input :config-key="'apiKey'" :config-default="configDefault" :config-info="configInfo" v-model="config.apiKey"></settings-modal-short-input>
-          <label class="form-control mb-2">
-            <div class="label">System Message</div>
-            <textarea class="textarea textarea-bordered h-24" :placeholder="'Default: ' + configDefault.systemMessage" v-model="config.systemMessage"></textarea>
-          </label>
-          <template v-for="configKey in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
-            <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
-          </template>
-          <!-- TODO: add more sampling-related configs, please regroup them into different "collapse" sections -->
-          <!-- Section: Other sampler settings -->
-          <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
-            <summary class="collapse-title font-bold">Other sampler settings</summary>
-            <div class="collapse-content">
-              <!-- Samplers queue -->
-              <settings-modal-short-input label="Samplers queue" :config-key="'samplers'" :config-default="configDefault" :config-info="configInfo" v-model="config.samplers"></settings-modal-short-input>
-              <!-- Samplers -->
-              <template v-for="configKey in ['dynatemp_range', 'dynatemp_exponent', 'typical_p', 'xtc_probability', 'xtc_threshold']">
-                <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
-              </template>
-            </div>
-          </details>
-          <!-- Section: Penalties settings -->
-          <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
-            <summary class="collapse-title font-bold">Penalties settings</summary>
-            <div class="collapse-content">
-              <template v-for="configKey in ['repeat_last_n', 'repeat_penalty', 'presence_penalty', 'frequency_penalty', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_penalty_last_n']">
-                <settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
-              </template>
-            </div>
-          </details>
-          <!-- Section: Reasoning models -->
-          <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
-            <summary class="collapse-title font-bold">Reasoning models</summary>
-            <div class="collapse-content">
-              <div class="flex flex-row items-center mb-2">
-                <input type="checkbox" class="checkbox" v-model="config.showThoughtInProgress" />
-                <span class="ml-4">Expand though process by default for generating message</span>
-              </div>
-              <div class="flex flex-row items-center mb-2">
-                <input type="checkbox" class="checkbox" v-model="config.excludeThoughtOnReq" />
-                <span class="ml-4">Exclude thought process when sending request to API (Recommended for DeepSeek-R1)</span>
-              </div>
-            </div>
-          </details>
-          <!-- Section: Advanced config -->
-          <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
-            <summary class="collapse-title font-bold">Advanced config</summary>
-            <div class="collapse-content">
-              <div class="flex flex-row items-center mb-2" v-if="isDev">
-                <!-- this button only shows in dev mode, used to import a demo conversation to test message rendering -->
-                <button class="btn" @click="debugImportDemoConv()">(debug) Import demo conversation</button>
-              </div>
-              <div class="flex flex-row items-center mb-2">
-                <input type="checkbox" class="checkbox" v-model="config.showTokensPerSecond" />
-                <span class="ml-4">Show tokens per second</span>
-              </div>
-              <label class="form-control mb-2">
-                <!-- Custom parameters input -->
-                <div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
-                <textarea class="textarea textarea-bordered h-24" placeholder="Example: { &quot;mirostat&quot;: 1, &quot;min_p&quot;: 0.1 }" v-model="config.custom"></textarea>
-              </label>
-            </div>
-          </details>
-        </div>
-
-        <!-- action buttons -->
-        <div class="modal-action">
-          <button class="btn" @click="resetConfigDialog">Reset to default</button>
-          <button class="btn" @click="closeAndDiscardConfigDialog">Close</button>
-          <button class="btn btn-primary" @click="closeAndSaveConfigDialog">Save</button>
-        </div>
-      </div>
-    </dialog>
-
-  </div>
-
-
-  <!-- Template to be used as message bubble -->
-  <template id="message-bubble">
-    <div :class="{
-      'chat': true,
-      'chat-start': msg.role !== 'user',
-      'chat-end': msg.role === 'user',
-    }">
-      <div :class="{
-        'chat-bubble markdown': true,
-        'chat-bubble-base-300': msg.role !== 'user',
-      }">
-        <!-- textarea for editing message -->
-        <template v-if="editingContent !== null">
-          <textarea
-            dir="auto"
-            class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
-            v-model="editingContent"></textarea>
-          <br/>
-          <button class="btn btn-ghost mt-2 mr-2" @click="editingContent = null">Cancel</button>
-          <button class="btn mt-2" @click="editMsg()">Submit</button>
-        </template>
-        <template v-else>
-          <!-- show loading dots for pending message -->
-          <span v-if="msg.content === null" class="loading loading-dots loading-md"></span>
-          <!-- render message as markdown -->
-          <div v-else dir="auto">
-            <details v-if="msg.role === 'assistant' && splitMsgContent.cot" class="collapse bg-base-200 collapse-arrow mb-4" :open="splitMsgContent.isThinking && showThoughtInProgress">
-              <summary class="collapse-title">
-                <span v-if="splitMsgContent.isThinking">
-                  <span v-if="isGenerating" class="loading loading-spinner loading-md mr-2" style="vertical-align: middle;"></span>
-                  <b>Thinking</b>
-                </span>
-                <b v-else>Thought Process</b>
-              </summary>
-              <vue-markdown :source="splitMsgContent.cot" dir="auto" class="collapse-content"></vue-markdown>
-            </details>
-            <vue-markdown :source="splitMsgContent.content"></vue-markdown>
-          </div>
-          <!-- render timings if enabled -->
-          <div class="dropdown dropdown-hover dropdown-top mt-2" v-if="timings && config.showTokensPerSecond">
-            <div tabindex="0" role="button" class="cursor-pointer font-semibold text-sm opacity-60">Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s</div>
-            <div class="dropdown-content bg-base-100 z-10 w-64 p-2 shadow mt-4">
-              <b>Prompt</b><br/>
-              - Tokens: {{ timings.prompt_n }}<br/>
-              - Time: {{ timings.prompt_ms }} ms<br/>
-              - Speed: {{ timings.prompt_per_second.toFixed(1) }} t/s<br/>
-              <b>Generation</b><br/>
-              - Tokens: {{ timings.predicted_n }}<br/>
-              - Time: {{ timings.predicted_ms }} ms<br/>
-              - Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s<br/>
-            </div>
-          </div>
-        </template>
-      </div>
-    </div>
-    <!-- actions for each message -->
-    <div :class="{'text-right': msg.role === 'user', 'opacity-0': isGenerating}" class="mx-4 mt-2 mb-2">
-      <!-- user message -->
-      <button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingContent = msg.content" :disabled="isGenerating">
-        ✍️ Edit
-      </button>
-      <!-- assistant message -->
-      <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
-        🔄 Regenerate
-      </button>
-      <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg()" :disabled="isGenerating">
-        📋 Copy
-      </button>
-    </div>
-  </template>
-
-
-  <!-- Template to be used by settings modal -->
-  <template id="settings-modal-short-input">
-    <label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
-      <!-- Show help message on hovering on the input label -->
-      <div class="dropdown dropdown-hover">
-        <div tabindex="0" role="button" class="font-bold">{{ label || configKey }}</div>
-        <div class="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
-          {{ configInfo[configKey] || '(no help message available)' }}
-        </div>
-      </div>
-      <!-- Here we forward v-model from parent to child component, see: https://stackoverflow.com/questions/47311936/v-model-and-child-components -->
-      <input type="text" class="grow" :placeholder="'Default: ' + (configDefault[configKey] || 'none')" :value="modelValue" @input="$emit('update:modelValue', $event.target.value)" />
-    </label>
-  </template>
-
-  <script type="module" src="/src/main.js"></script>
-</body>
-
+  <head>
+    <meta charset="UTF-8" />
+    <meta
+      name="viewport"
+      content="width=device-width, initial-scale=1, maximum-scale=1"
+    />
+    <meta name="color-scheme" content="light dark" />
+    <title>🦙 llama.cpp - chat</title>
+  </head>
+  <body>
+    <div id="root"></div>
+    <script type="module" src="/src/main.tsx"></script>
+  </body>
 </html>
--- a/examples/server/webui/package-lock.json
+++ b/examples/server/webui/package-lock.json
--- a/examples/server/webui/package.json
+++ b/examples/server/webui/package.json
@ -5,26 +5,55 @@
  "type": "module",
  "scripts": {
    "dev": "vite",
-    "build": "vite build",
-    "preview": "vite preview",
-    "analyze": "ANALYZE=1 npx vite-bundle-visualizer"
-  },
-  "devDependencies": {
-    "sass-embedded": "^1.83.0",
-    "vite": "^5.4.10"
+    "build": "tsc -b && vite build",
+    "format": "eslint . && prettier --write .",
+    "lint": "eslint .",
+    "preview": "vite preview"
  },
  "dependencies": {
+    "@heroicons/react": "^2.2.0",
    "@sec-ant/readable-stream": "^0.6.0",
    "@vscode/markdown-it-katex": "^1.1.1",
    "autoprefixer": "^10.4.20",
    "daisyui": "^4.12.14",
    "highlight.js": "^11.10.0",
    "katex": "^0.16.15",
-    "markdown-it": "^14.1.0",
    "postcss": "^8.4.49",
+    "react": "^18.3.1",
+    "react-dom": "^18.3.1",
+    "react-markdown": "^9.0.3",
+    "react-router": "^7.1.5",
+    "rehype-highlight": "^7.0.2",
+    "rehype-katex": "^7.0.1",
+    "remark-breaks": "^4.0.0",
+    "remark-gfm": "^4.0.0",
+    "remark-math": "^6.0.0",
    "tailwindcss": "^3.4.15",
    "textlinestream": "^1.1.1",
-    "vite-plugin-singlefile": "^2.0.3",
-    "vue": "^3.5.13"
+    "vite-plugin-singlefile": "^2.0.3"
+  },
+  "devDependencies": {
+    "@eslint/js": "^9.17.0",
+    "@types/markdown-it": "^14.1.2",
+    "@types/node": "^22.13.1",
+    "@types/react": "^18.3.18",
+    "@types/react-dom": "^18.3.5",
+    "@vitejs/plugin-react": "^4.3.4",
+    "eslint": "^9.17.0",
+    "eslint-plugin-react-hooks": "^5.0.0",
+    "eslint-plugin-react-refresh": "^0.4.16",
+    "globals": "^15.14.0",
+    "prettier": "^3.4.2",
+    "sass-embedded": "^1.83.4",
+    "typescript": "~5.6.2",
+    "typescript-eslint": "^8.18.2",
+    "vite": "^6.0.5"
+  },
+  "prettier": {
+    "trailingComma": "es5",
+    "tabWidth": 2,
+    "semi": true,
+    "singleQuote": true,
+    "bracketSameLine": false
  }
 }
--- a/examples/server/webui/public/demo-conversation.json
+++ b/examples/server/webui/public/demo-conversation.json
@ -11,7 +11,7 @@
    {
      "id": 1734087548327,
      "role": "assistant",
-      "content": "This is the formula:\n\n$\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}$\n\nGiven an input vector \\(\\mathbf{x} = [x_1, x_2, \\ldots, x_n]\\)\n\n\\[\ny_i = \\frac{e^{x_i}}{\\sum_{j=1}^n e^{x_j}}\n\\]\n\nCode block latex:\n```latex\n\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}\n```\n\nTest dollar sign: $1234 $4567\n\nInvalid latex syntax: $E = mc^$ and $$E = mc^$$",
+      "content": "This is the formula:\n\n$\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}$\n\nGiven an input vector \\(\\mathbf{x} = [x_1, x_2, \\ldots, x_n]\\)\n\n\\[\ny_i = \\frac{e^{x_i}}{\\sum_{j=1}^n e^{x_j}}\n\\]\n\n$2x + y = z$\n\nCode block latex:\n```latex\n\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}\n```\n\nTest dollar sign: $1234 $4567\n\nInvalid latex syntax: $E = mc^$ and $$E = mc^$$",
      "timings": {
        "prompt_n": 1,
        "prompt_ms": 28.923,
--- a/examples/server/webui/src/App.tsx
+++ b/examples/server/webui/src/App.tsx
@ -0,0 +1,47 @@
+import { HashRouter, Outlet, Route, Routes } from 'react-router';
+import Header from './components/Header';
+import Sidebar from './components/Sidebar';
+import { AppContextProvider, useAppContext } from './utils/app.context';
+import ChatScreen from './components/ChatScreen';
+import SettingDialog from './components/SettingDialog';
+
+/**
+ * Application root. Wraps the route tree in a hash-based router, the
+ * drawer layout container and the app-wide context provider.
+ */
+function App() {
+  // Both the per-conversation route and the catch-all route render
+  // ChatScreen inside the shared AppLayout.
+  const routeTree = (
+    <Routes>
+      <Route element={<AppLayout />}>
+        <Route path="/chat/:convId" element={<ChatScreen />} />
+        <Route path="*" element={<ChatScreen />} />
+      </Route>
+    </Routes>
+  );
+
+  return (
+    <HashRouter>
+      <div className="flex flex-row drawer lg:drawer-open">
+        <AppContextProvider>{routeTree}</AppContextProvider>
+      </div>
+    </HashRouter>
+  );
+}
+
+/**
+ * Shared page layout: the sidebar, the scrollable main column (header plus
+ * the matched child route) and the settings dialog.
+ */
+function AppLayout() {
+  const appContext = useAppContext();
+  const isSettingsOpen = appContext.showSettings;
+  const setSettingsOpen = appContext.setShowSettings;
+  const closeSettings = () => setSettingsOpen(false);
+
+  return (
+    <>
+      <Sidebar />
+      <div
+        id="main-scroll"
+        className="drawer-content grow flex flex-col h-screen w-screen mx-auto px-4 overflow-auto"
+      >
+        <Header />
+        <Outlet />
+      </div>
+      <SettingDialog show={isSettingsOpen} onClose={closeSettings} />
+    </>
+  );
+}
+
+export default App;
--- a/examples/server/webui/src/Config.ts
+++ b/examples/server/webui/src/Config.ts
@ -0,0 +1,92 @@
+import daisyuiThemes from 'daisyui/src/theming/themes';
+import { isNumeric } from './utils/misc';
+
+// true when import.meta.env.MODE reports a development build
+export const isDev = import.meta.env.MODE === 'development';
+
+// constants
+// Directory of the document's base URI, with a single trailing slash removed.
+export const BASE_URL = new URL('.', document.baseURI).href.replace(/\/$/, '');
+
+// Default value for every config key. CONFIG_NUMERIC_KEYS below is derived
+// from these values, so the data type of each default matters.
+export const CONFIG_DEFAULT = {
+  // Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
+  // Do not use nested objects, keep it single level. Prefix the key if you need to group them.
+  apiKey: '',
+  systemMessage: 'You are a helpful assistant.',
+  showTokensPerSecond: false,
+  showThoughtInProgress: false,
+  excludeThoughtOnReq: true,
+  // make sure these default values are in sync with `common.h`
+  samplers: 'edkypmxt',
+  temperature: 0.8,
+  dynatemp_range: 0.0,
+  dynatemp_exponent: 1.0,
+  top_k: 40,
+  top_p: 0.95,
+  min_p: 0.05,
+  xtc_probability: 0.0,
+  xtc_threshold: 0.1,
+  typical_p: 1.0,
+  repeat_last_n: 64,
+  repeat_penalty: 1.0,
+  presence_penalty: 0.0,
+  frequency_penalty: 0.0,
+  dry_multiplier: 0.0,
+  dry_base: 1.75,
+  dry_allowed_length: 2,
+  dry_penalty_last_n: -1,
+  max_tokens: -1,
+  custom: '', // custom json-stringified object
+  // experimental features
+  // NOTE(review): the key is spelled `pyIntepreterEnabled` (sic). It is presumably
+  // referenced and persisted under this spelling elsewhere - check before renaming.
+  pyIntepreterEnabled: false,
+};
+// Human-readable help text for each config key, keyed by the same names as
+// CONFIG_DEFAULT. Keep the defaults quoted in these strings in sync with
+// CONFIG_DEFAULT (e.g. the samplers sequence).
+export const CONFIG_INFO: Record<string, string> = {
+  apiKey: 'Set the API Key if you are using --api-key option for the server.',
+  systemMessage: 'The starting message that defines how model should behave.',
+  // the quoted default must match CONFIG_DEFAULT.samplers
+  samplers:
+    'The order at which samplers are applied, in simplified way. Default is "edkypmxt": penalties->dry->top_k->typ_p->top_p->min_p->xtc->temperature',
+  temperature:
+    'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
+  dynatemp_range:
+    'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.',
+  dynatemp_exponent:
+    'Addon for the temperature sampler. Smoothes out the probability redistribution based on the most probable token.',
+  top_k: 'Keeps only k top tokens.',
+  top_p:
+    'Limits tokens to those that together have a cumulative probability of at least p',
+  min_p:
+    'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.',
+  xtc_probability:
+    'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.',
+  xtc_threshold:
+    'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.',
+  typical_p:
+    'Sorts and limits tokens based on the difference between log-probability and entropy.',
+  repeat_last_n: 'Last n tokens to consider for penalizing repetition',
+  repeat_penalty:
+    'Controls the repetition of token sequences in the generated text',
+  presence_penalty:
+    'Limits tokens based on whether they appear in the output or not.',
+  frequency_penalty:
+    'Limits tokens based on how often they appear in the output.',
+  dry_multiplier:
+    'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.',
+  dry_base:
+    'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.',
+  dry_allowed_length:
+    'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.',
+  dry_penalty_last_n:
+    'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets DRY penalty for the last n tokens.',
+  max_tokens: 'The maximum number of token per output.',
+  custom: '', // custom json-stringified object
+};
+// names of the config entries whose default value is numeric (i.e. temperature, top_k, top_p, etc)
+export const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT)
+  .filter(([, defaultValue]) => isNumeric(defaultValue))
+  .map(([configKey]) => configKey);
+// list of themes supported by daisyui
+export const THEMES = [
+  // light & dark always come first
+  'light',
+  'dark',
+  // followed by every other daisyui theme, in the order daisyui lists them
+  ...Object.keys(daisyuiThemes).filter(
+    (themeName) => !(themeName === 'light' || themeName === 'dark')
+  ),
+];
--- a/examples/server/webui/src/components/CanvasPyInterpreter.tsx
+++ b/examples/server/webui/src/components/CanvasPyInterpreter.tsx
@ -0,0 +1,195 @@
+import { useEffect, useState } from 'react';
+import { useAppContext } from '../utils/app.context';
+import { OpenInNewTab, XCloseButton } from '../utils/common';
+import { CanvasType } from '../utils/types';
+import { PlayIcon, StopIcon } from '@heroicons/react/24/outline';
+import StorageUtils from '../utils/storage';
+
+// Interrupting running Python code relies on a SharedArrayBuffer that is
+// shared with the worker; the Stop button is only offered when it exists.
+const canInterrupt = typeof SharedArrayBuffer === 'function';
+
+// adapted from https://pyodide.org/en/stable/usage/webworker.html
+//
+// Source code of the web worker hosting Pyodide. For every message
+// { id, python, context, interruptBuffer } it replies with:
+//   { id, running: true }          once packages are loaded and the run starts
+//   { id, result, stdOutAndErr }   when the code finished
+//   { id, error }                  when loading or running failed
+// Everything (including waiting for Pyodide itself) happens inside the
+// try block so that a failure is always reported back instead of leaving
+// the page waiting forever.
+const WORKER_CODE = `
+importScripts("https://cdn.jsdelivr.net/pyodide/v0.27.2/full/pyodide.js");
+
+let stdOutAndErr = [];
+
+let pyodideReadyPromise = loadPyodide({
+  stdout: (data) => stdOutAndErr.push(data),
+  stderr: (data) => stdOutAndErr.push(data),
+});
+
+let alreadySetBuff = false;
+
+self.onmessage = async (event) => {
+  stdOutAndErr = [];
+
+  const { id, python, context, interruptBuffer } = event.data;
+
+  try {
+    // make sure loading is done
+    const pyodide = await pyodideReadyPromise;
+
+    if (interruptBuffer && !alreadySetBuff) {
+      pyodide.setInterruptBuffer(interruptBuffer);
+      alreadySetBuff = true;
+    }
+
+    // Now load any packages we need, run the code, and send the result back.
+    await pyodide.loadPackagesFromImports(python);
+
+    // make a Python dictionary with the data from context
+    const dict = pyodide.globals.get("dict");
+    const globals = dict(Object.entries(context));
+
+    self.postMessage({ id, running: true });
+    // Execute the python code in this context
+    const result = pyodide.runPython(python, { globals });
+    self.postMessage({ result, id, stdOutAndErr });
+  } catch (error) {
+    // always send a non-empty error so the page does not mistake it for success
+    self.postMessage({ error: (error && error.message) || String(error), id });
+  } finally {
+    // interruptBuffer is null when SharedArrayBuffer is unavailable
+    if (interruptBuffer) {
+      interruptBuffer[0] = 0;
+    }
+  }
+};
+`;
+
+// Single worker instance shared by every run; created lazily by startWorker().
+let worker: Worker;
+// One-byte buffer shared with the worker to request an interrupt;
+// null when SharedArrayBuffer is not available.
+const interruptBuffer = canInterrupt
+  ? new Uint8Array(new SharedArrayBuffer(1))
+  : null;
+
+// Creates the worker from WORKER_CODE on first call; later calls are no-ops.
+const startWorker = () => {
+  if (worker) return;
+  const workerSource = new Blob([WORKER_CODE], { type: 'text/javascript' });
+  worker = new Worker(URL.createObjectURL(workerSource));
+};
+
+// start the worker at module load when the interpreter is enabled in the config
+if (StorageUtils.getConfig().pyIntepreterEnabled) {
+  startWorker();
+}
+
+/**
+ * Sends a Python snippet to the shared worker (starting it if needed).
+ *
+ * @param pyCode Python source to execute.
+ * @param callbackRunning Called when the worker reports that the run started.
+ * @returns `donePromise`, which resolves with the error text if the worker
+ *   reported an error, otherwise with the captured stdout/stderr lines joined
+ *   by newlines; and `interrupt`, which writes 2 into the shared interrupt
+ *   buffer when one exists.
+ */
+const runCodeInWorker = (
+  pyCode: string,
+  callbackRunning: () => void
+): {
+  donePromise: Promise<string>;
+  interrupt: () => void;
+} => {
+  startWorker();
+  // random id used to match the worker's replies to this particular run
+  const id = Math.random() * 1e8;
+  const context = {};
+  // clear any interrupt request left in the shared buffer
+  if (interruptBuffer) {
+    interruptBuffer[0] = 0;
+  }
+
+  const donePromise = new Promise<string>((resolve) => {
+    worker.onmessage = ({ data }) => {
+      // ignore replies that belong to another run
+      if (data.id !== id) return;
+      if (data.running) {
+        callbackRunning();
+        return;
+      }
+      resolve(
+        data.error ? data.error.toString() : data.stdOutAndErr.join('\n')
+      );
+    };
+    worker.postMessage({ id, python: pyCode, context, interruptBuffer });
+  });
+
+  function interrupt() {
+    console.log('Interrupting...');
+    console.trace();
+    if (interruptBuffer) {
+      interruptBuffer[0] = 2;
+    }
+  }
+
+  return { donePromise, interrupt };
+};
+
+/**
+ * Canvas panel with an editable Python snippet, Run/Stop buttons and a
+ * read-only output box. The snippet comes from `canvasData.content` and is
+ * executed whenever that content changes; renders nothing unless the canvas
+ * type is PY_INTERPRETER.
+ */
+export default function CanvasPyInterpreter() {
+  const { canvasData, setCanvasData } = useAppContext();
+
+  const [code, setCode] = useState(canvasData?.content ?? ''); // copy to avoid direct mutation
+  const [running, setRunning] = useState(false);
+  const [output, setOutput] = useState('');
+  const [interruptFn, setInterruptFn] = useState<() => void>();
+  const [showStopBtn, setShowStopBtn] = useState(false);
+
+  const runCode = async (pycode: string) => {
+    // ask whatever was started before to stop
+    interruptFn?.();
+    setRunning(true);
+    setOutput('Loading Pyodide...');
+    const onStarted = () => {
+      setOutput('Running...');
+      setShowStopBtn(canInterrupt);
+    };
+    const job = runCodeInWorker(pycode, onStarted);
+    // wrap in a function: useState setters treat a bare function as an updater
+    setInterruptFn(() => job.interrupt);
+    const finalOutput = await job.donePromise;
+    setOutput(finalOutput);
+    setRunning(false);
+    setShowStopBtn(false);
+  };
+
+  // run code on mount
+  useEffect(() => {
+    const initialCode = canvasData?.content ?? '';
+    setCode(initialCode);
+    runCode(initialCode);
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [canvasData?.content]);
+
+  if (canvasData?.type !== CanvasType.PY_INTERPRETER) {
+    return null;
+  }
+
+  const closeCanvas = () => setCanvasData(null);
+
+  return (
+    <div className="card bg-base-200 w-full h-full shadow-xl">
+      <div className="card-body">
+        <div className="flex justify-between items-center mb-4">
+          <span className="text-lg font-bold">Python Interpreter</span>
+          <XCloseButton className="bg-base-100" onClick={closeCanvas} />
+        </div>
+        <div className="grid grid-rows-3 gap-4 h-full">
+          {/* editable source */}
+          <textarea
+            className="textarea textarea-bordered w-full h-full font-mono"
+            value={code}
+            onChange={(e) => setCode(e.target.value)}
+          />
+          <div className="font-mono flex flex-col row-span-2">
+            <div className="flex items-center mb-2">
+              <button
+                className="btn btn-sm bg-base-100"
+                onClick={() => runCode(code)}
+                disabled={running}
+              >
+                <PlayIcon className="h-6 w-6" /> Run
+              </button>
+              {showStopBtn && (
+                <button
+                  className="btn btn-sm bg-base-100 ml-2"
+                  onClick={() => interruptFn?.()}
+                >
+                  <StopIcon className="h-6 w-6" /> Stop
+                </button>
+              )}
+              <span className="grow text-right text-xs">
+                <OpenInNewTab href="https://github.com/ggerganov/llama.cpp/issues/11762">
+                  Report a bug
+                </OpenInNewTab>
+              </span>
+            </div>
+            {/* program output */}
+            <textarea
+              className="textarea textarea-bordered h-full dark-color"
+              value={output}
+              readOnly
+            />
+          </div>
+        </div>
+      </div>
+    </div>
+  );
+}
--- a/examples/server/webui/src/components/ChatMessage.tsx
+++ b/examples/server/webui/src/components/ChatMessage.tsx
@ -0,0 +1,235 @@
+import { useMemo, useState } from 'react';
+import { useAppContext } from '../utils/app.context';
+import { Message, PendingMessage } from '../utils/types';
+import { classNames } from '../utils/misc';
+import MarkdownDisplay, { CopyButton } from './MarkdownDisplay';
+
+// Result of splitting a message into the visible answer and the text found
+// inside <think>...</think> tags.
+interface SplitMessage {
+  // text outside of the <think> tags (same type as the pending message content)
+  content: PendingMessage['content'];
+  // concatenated text found inside <think> tags
+  thought?: string;
+  // true when a <think> tag was opened and its closing tag has not been seen
+  isThinking?: boolean;
+}
+
+/**
+ * Renders one chat bubble together with its action buttons.
+ *
+ * @param msg The stored or pending message to display.
+ * @param id Optional DOM id for the outer element.
+ * @param scrollToBottom Callback used to keep the view scrolled while a
+ *   regenerated answer streams in.
+ * @param isPending True while this message is still being generated.
+ *
+ * Assistant messages are split into the visible answer and the text inside
+ * <think> tags, which is shown in a collapsible section. User messages can be
+ * edited and resubmitted; finished assistant messages can be regenerated.
+ */
+export default function ChatMessage({
+  msg,
+  id,
+  scrollToBottom,
+  isPending,
+}: {
+  msg: Message | PendingMessage;
+  id?: string;
+  scrollToBottom: (requiresNearBottom: boolean) => void;
+  isPending?: boolean;
+}) {
+  const { viewingConversation, replaceMessageAndGenerate, config } =
+    useAppContext();
+  // null = not editing; otherwise the current text of the edit box
+  const [editingContent, setEditingContent] = useState<string | null>(null);
+  const timings = useMemo(() => {
+    if (!msg.timings) return null;
+    // a zero duration would produce Infinity/NaN, which toFixed() prints verbatim
+    const perSecond = (count: number, ms: number) =>
+      ms > 0 ? (count / ms) * 1000 : 0;
+    return {
+      ...msg.timings,
+      prompt_per_second: perSecond(msg.timings.prompt_n, msg.timings.prompt_ms),
+      predicted_per_second: perSecond(
+        msg.timings.predicted_n,
+        msg.timings.predicted_ms
+      ),
+    };
+  }, [msg.timings]);
+
+  // for reasoning model, we split the message into content and thought
+  // TODO: implement this as remark/rehype plugin in the future
+  const { content, thought, isThinking }: SplitMessage = useMemo(() => {
+    if (msg.content === null || msg.role !== 'assistant') {
+      return { content: msg.content };
+    }
+    let actualContent = '';
+    let thought = '';
+    let isThinking = false;
+    let thinkSplit = msg.content.split('<think>', 2);
+    actualContent += thinkSplit[0];
+    while (thinkSplit[1] !== undefined) {
+      // <think> tag found
+      thinkSplit = thinkSplit[1].split('</think>', 2);
+      thought += thinkSplit[0];
+      isThinking = true;
+      if (thinkSplit[1] !== undefined) {
+        // </think> closing tag found
+        isThinking = false;
+        thinkSplit = thinkSplit[1].split('<think>', 2);
+        actualContent += thinkSplit[0];
+      }
+    }
+    return { content: actualContent, thought, isThinking };
+  }, [msg]);
+
+  // hooks must run before this early return
+  if (!viewingConversation) return null;
+
+  const regenerate = async () => {
+    replaceMessageAndGenerate(viewingConversation.id, msg.id, undefined, () =>
+      scrollToBottom(true)
+    );
+  };
+
+  return (
+    <div className="group" id={id}>
+      <div
+        className={classNames({
+          chat: true,
+          'chat-start': msg.role !== 'user',
+          'chat-end': msg.role === 'user',
+        })}
+      >
+        <div
+          className={classNames({
+            'chat-bubble markdown': true,
+            'chat-bubble-base-300': msg.role !== 'user',
+          })}
+        >
+          {/* textarea for editing message */}
+          {editingContent !== null && (
+            <>
+              <textarea
+                dir="auto"
+                className="textarea textarea-bordered bg-base-100 text-base-content max-w-2xl w-[calc(90vw-8em)] h-24"
+                value={editingContent}
+                onChange={(e) => setEditingContent(e.target.value)}
+              ></textarea>
+              <br />
+              <button
+                className="btn btn-ghost mt-2 mr-2"
+                onClick={() => setEditingContent(null)}
+              >
+                Cancel
+              </button>
+              <button
+                className="btn mt-2"
+                onClick={() =>
+                  replaceMessageAndGenerate(
+                    viewingConversation.id,
+                    msg.id,
+                    editingContent
+                  )
+                }
+              >
+                Submit
+              </button>
+            </>
+          )}
+          {/* not editing content, render message */}
+          {editingContent === null && (
+            <>
+              {content === null ? (
+                <>
+                  {/* show loading dots for pending message */}
+                  <span className="loading loading-dots loading-md"></span>
+                </>
+              ) : (
+                <>
+                  {/* render message as markdown */}
+                  <div dir="auto">
+                    {thought && (
+                      <details
+                        className="collapse bg-base-200 collapse-arrow mb-4"
+                        open={isThinking && config.showThoughtInProgress}
+                      >
+                        <summary className="collapse-title">
+                          {isPending && isThinking ? (
+                            <span>
+                              <span
+                                className="loading loading-spinner loading-md mr-2"
+                                style={{ verticalAlign: 'middle' }}
+                              ></span>
+                              <b>Thinking</b>
+                            </span>
+                          ) : (
+                            <b>Thought Process</b>
+                          )}
+                        </summary>
+                        <div className="collapse-content">
+                          <MarkdownDisplay
+                            content={thought}
+                            isGenerating={isPending}
+                          />
+                        </div>
+                      </details>
+                    )}
+                    <MarkdownDisplay
+                      content={content}
+                      isGenerating={isPending}
+                    />
+                  </div>
+                </>
+              )}
+              {/* render timings if enabled */}
+              {timings && config.showTokensPerSecond && (
+                <div className="dropdown dropdown-hover dropdown-top mt-2">
+                  <div
+                    tabIndex={0}
+                    role="button"
+                    className="cursor-pointer font-semibold text-sm opacity-60"
+                  >
+                    Speed: {timings.predicted_per_second.toFixed(1)} t/s
+                  </div>
+                  <div className="dropdown-content bg-base-100 z-10 w-64 p-2 shadow mt-4">
+                    <b>Prompt</b>
+                    <br />- Tokens: {timings.prompt_n}
+                    <br />- Time: {timings.prompt_ms} ms
+                    <br />- Speed: {timings.prompt_per_second.toFixed(1)} t/s
+                    <br />
+                    <b>Generation</b>
+                    <br />- Tokens: {timings.predicted_n}
+                    <br />- Time: {timings.predicted_ms} ms
+                    <br />- Speed: {timings.predicted_per_second.toFixed(1)} t/s
+                    <br />
+                  </div>
+                </div>
+              )}
+            </>
+          )}
+        </div>
+      </div>
+
+      {/* actions for each message */}
+      {msg.content !== null && (
+        <div
+          className={classNames({
+            'mx-4 mt-2 mb-2': true,
+            'text-right': msg.role === 'user',
+          })}
+        >
+          {/* user message */}
+          {msg.role === 'user' && (
+            <button
+              className="badge btn-mini show-on-hover"
+              onClick={() => setEditingContent(msg.content)}
+              disabled={msg.content === null}
+            >
+              ✍️ Edit
+            </button>
+          )}
+          {/* assistant message */}
+          {msg.role === 'assistant' && (
+            <>
+              {!isPending && (
+                <button
+                  className="badge btn-mini show-on-hover mr-2"
+                  onClick={regenerate}
+                  disabled={msg.content === null}
+                >
+                  🔄 Regenerate
+                </button>
+              )}
+              <CopyButton
+                className="badge btn-mini show-on-hover mr-2"
+                content={msg.content}
+              />
+            </>
+          )}
+        </div>
+      )}
+    </div>
+  );
+}
--- a/examples/server/webui/src/components/ChatScreen.tsx
+++ b/examples/server/webui/src/components/ChatScreen.tsx
@ -0,0 +1,146 @@
+import { useEffect, useState } from 'react';
+import { useAppContext } from '../utils/app.context';
+import StorageUtils from '../utils/storage';
+import { useNavigate } from 'react-router';
+import ChatMessage from './ChatMessage';
+import { CanvasType, PendingMessage } from '../utils/types';
+import { classNames } from '../utils/misc';
+import CanvasPyInterpreter from './CanvasPyInterpreter';
+
+/**
+ * Main chat view: the message list of the conversation being viewed, the
+ * message still being generated (if any), the input box with Send/Stop
+ * buttons and, when canvas data is set, the canvas panel next to the chat.
+ */
+export default function ChatScreen() {
+  const {
+    viewingConversation,
+    sendMessage,
+    isGenerating,
+    stopGenerating,
+    pendingMessages,
+    canvasData,
+  } = useAppContext();
+  const [inputMsg, setInputMsg] = useState('');
+  const navigate = useNavigate();
+
+  const currConvId = viewingConversation?.id ?? '';
+  const pendingMsg: PendingMessage | undefined = pendingMessages[currConvId];
+
+  // Scrolls the main scroll container to its end. With requiresNearBottom
+  // set, only scrolls when the view is already within 50px of the bottom.
+  const scrollToBottom = (requiresNearBottom: boolean) => {
+    const mainScrollElem = document.getElementById('main-scroll');
+    if (!mainScrollElem) return;
+    const spaceToBottom =
+      mainScrollElem.scrollHeight -
+      mainScrollElem.scrollTop -
+      mainScrollElem.clientHeight;
+    if (!requiresNearBottom || spaceToBottom < 50) {
+      setTimeout(
+        () => mainScrollElem.scrollTo({ top: mainScrollElem.scrollHeight }),
+        1
+      );
+    }
+  };
+
+  // scroll to bottom when conversation changes
+  useEffect(() => {
+    scrollToBottom(false);
+  }, [viewingConversation?.id]);
+
+  const sendNewMessage = async () => {
+    if (inputMsg.trim().length === 0 || isGenerating(currConvId)) return;
+    const convId = viewingConversation?.id ?? StorageUtils.getNewConvId();
+    // snapshot of the text: it is what gets sent and what is restored on failure
+    const lastInpMsg = inputMsg;
+    setInputMsg('');
+    if (!viewingConversation) {
+      // if user is creating a new conversation, redirect to the new conversation
+      navigate(`/chat/${convId}`);
+    }
+    scrollToBottom(false);
+    // auto scroll as message is being generated
+    const onChunk = () => scrollToBottom(true);
+    if (!(await sendMessage(convId, lastInpMsg, onChunk))) {
+      // restore the input message if failed
+      setInputMsg(lastInpMsg);
+    }
+  };
+
+  const hasCanvas = !!canvasData;
+
+  return (
+    <div
+      className={classNames({
+        'grid lg:gap-8 grow transition-[300ms]': true,
+        'grid-cols-[1fr_0fr] lg:grid-cols-[1fr_1fr]': hasCanvas, // adapted for mobile
+        'grid-cols-[1fr_0fr]': !hasCanvas,
+      })}
+    >
+      <div
+        className={classNames({
+          'flex flex-col w-full max-w-[900px] mx-auto': true,
+          'hidden lg:flex': hasCanvas, // adapted for mobile
+          flex: !hasCanvas,
+        })}
+      >
+        {/* chat messages */}
+        <div id="messages-list" className="grow">
+          <div className="mt-auto flex justify-center">
+            {/* placeholder to shift the message to the bottom */}
+            {viewingConversation ? '' : 'Send a message to start'}
+          </div>
+          {viewingConversation?.messages.map((msg) => (
+            <ChatMessage
+              key={msg.id}
+              msg={msg}
+              scrollToBottom={scrollToBottom}
+            />
+          ))}
+
+          {pendingMsg && (
+            <ChatMessage
+              msg={pendingMsg}
+              scrollToBottom={scrollToBottom}
+              isPending
+              id="pending-msg"
+            />
+          )}
+        </div>
+
+        {/* chat input */}
+        <div className="flex flex-row items-center pt-8 pb-6 sticky bottom-0 bg-base-100">
+          <textarea
+            className="textarea textarea-bordered w-full"
+            placeholder="Type a message (Shift+Enter to add a new line)"
+            value={inputMsg}
+            onChange={(e) => setInputMsg(e.target.value)}
+            onKeyDown={(e) => {
+              // Enter that confirms an IME composition (e.g. CJK input) must
+              // not send the message; keyCode 229 covers browsers that do not
+              // set isComposing on that keydown
+              if (e.nativeEvent.isComposing || e.keyCode === 229) return;
+              // plain Enter sends, Shift+Enter falls through and adds a new line
+              if (e.key === 'Enter' && !e.shiftKey) {
+                e.preventDefault();
+                sendNewMessage();
+              }
+            }}
+            id="msg-input"
+            dir="auto"
+          ></textarea>
+          {isGenerating(currConvId) ? (
+            <button
+              className="btn btn-neutral ml-2"
+              onClick={() => stopGenerating(currConvId)}
+            >
+              Stop
+            </button>
+          ) : (
+            <button
+              className="btn btn-primary ml-2"
+              onClick={sendNewMessage}
+              disabled={inputMsg.trim().length === 0}
+            >
+              Send
+            </button>
+          )}
+        </div>
+      </div>
+      <div className="w-full sticky top-[7em] h-[calc(100vh-9em)]">
+        {canvasData?.type === CanvasType.PY_INTERPRETER && (
+          <CanvasPyInterpreter />
+        )}
+      </div>
+    </div>
+  );
+}
--- a/examples/server/webui/src/components/Header.tsx
+++ b/examples/server/webui/src/components/Header.tsx
@ -0,0 +1,176 @@
+import { useEffect, useState } from 'react';
+import StorageUtils from '../utils/storage';
+import { useAppContext } from '../utils/app.context';
+import { classNames } from '../utils/misc';
+import daisyuiThemes from 'daisyui/src/theming/themes';
+import { THEMES } from '../Config';
+import { useNavigate } from 'react-router';
+
+/**
+ * Sticky page header: sidebar toggle (small screens), title, the
+ * per-conversation menu (Download / Delete), the settings button and the
+ * theme picker. The selected theme is persisted through StorageUtils and
+ * applied to <body> via the data-theme / data-color-scheme attributes.
+ */
+export default function Header() {
+  const navigate = useNavigate();
+  const [selectedTheme, setSelectedTheme] = useState(StorageUtils.getTheme());
+  const { setShowSettings, isGenerating, viewingConversation } =
+    useAppContext();
+
+  const setTheme = (theme: string) => {
+    StorageUtils.setTheme(theme);
+    setSelectedTheme(theme);
+  };
+
+  useEffect(() => {
+    document.body.setAttribute('data-theme', selectedTheme);
+    document.body.setAttribute(
+      'data-color-scheme',
+      // @ts-expect-error daisyuiThemes complains about index type, but it should work
+      daisyuiThemes[selectedTheme]?.['color-scheme'] ?? 'auto'
+    );
+  }, [selectedTheme]);
+
+  const isCurrConvGenerating = isGenerating(viewingConversation?.id ?? '');
+
+  // Deletes the viewed conversation after confirmation, then goes back to '/'.
+  const removeConversation = () => {
+    if (isCurrConvGenerating || !viewingConversation) return;
+    const convId = viewingConversation.id;
+    if (window.confirm('Are you sure to delete this conversation?')) {
+      StorageUtils.remove(convId);
+      navigate('/');
+    }
+  };
+
+  // Offers the viewed conversation as a JSON file download.
+  const downloadConversation = () => {
+    if (isCurrConvGenerating || !viewingConversation) return;
+    const convId = viewingConversation.id;
+    const conversationJson = JSON.stringify(viewingConversation, null, 2);
+    const blob = new Blob([conversationJson], { type: 'application/json' });
+    const url = URL.createObjectURL(blob);
+    // a temporary <a download> element is clicked to trigger the download
+    const a = document.createElement('a');
+    a.href = url;
+    a.download = `conversation_${convId}.json`;
+    document.body.appendChild(a);
+    a.click();
+    document.body.removeChild(a);
+    URL.revokeObjectURL(url);
+  };
+
+  return (
+    <div className="flex flex-row items-center pt-6 pb-6 sticky top-0 z-10 bg-base-100">
+      {/* open sidebar button */}
+      <label htmlFor="toggle-drawer" className="btn btn-ghost lg:hidden">
+        <svg
+          xmlns="http://www.w3.org/2000/svg"
+          width="16"
+          height="16"
+          fill="currentColor"
+          className="bi bi-list"
+          viewBox="0 0 16 16"
+        >
+          <path
+            fillRule="evenodd"
+            d="M2.5 12a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5"
+          />
+        </svg>
+      </label>
+
+      <div className="grow text-2xl font-bold ml-2">llama.cpp</div>
+
+      {/* action buttons (top right) */}
+      <div className="flex items-center">
+        {/* the conversation menu only makes sense when a conversation is open
+            (both of its actions are no-ops otherwise) */}
+        {viewingConversation && (
+          <div className="dropdown dropdown-end">
+            {/* "..." button */}
+            <button
+              tabIndex={0}
+              role="button"
+              className="btn m-1"
+              disabled={isCurrConvGenerating}
+            >
+              <svg
+                xmlns="http://www.w3.org/2000/svg"
+                width="16"
+                height="16"
+                fill="currentColor"
+                className="bi bi-three-dots-vertical"
+                viewBox="0 0 16 16"
+              >
+                <path d="M9.5 13a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0" />
+              </svg>
+            </button>
+            {/* dropdown menu */}
+            <ul
+              tabIndex={0}
+              className="dropdown-content menu bg-base-100 rounded-box z-[1] w-52 p-2 shadow"
+            >
+              <li onClick={downloadConversation}>
+                <a>Download</a>
+              </li>
+              <li className="text-error" onClick={removeConversation}>
+                <a>Delete</a>
+              </li>
+            </ul>
+          </div>
+        )}
+        <div className="tooltip tooltip-bottom" data-tip="Settings">
+          <button className="btn" onClick={() => setShowSettings(true)}>
+            {/* settings button */}
+            <svg
+              xmlns="http://www.w3.org/2000/svg"
+              width="16"
+              height="16"
+              fill="currentColor"
+              className="bi bi-gear"
+              viewBox="0 0 16 16"
+            >
+              <path d="M8 4.754a3.246 3.246 0 1 0 0 6.492 3.246 3.246 0 0 0 0-6.492M5.754 8a2.246 2.246 0 1 1 4.492 0 2.246 2.246 0 0 1-4.492 0" />
+              <path d="M9.796 1.343c-.527-1.79-3.065-1.79-3.592 0l-.094.319a.873.873 0 0 1-1.255.52l-.292-.16c-1.64-.892-3.433.902-2.54 2.541l.159.292a.873.873 0 0 1-.52 1.255l-.319.094c-1.79.527-1.79 3.065 0 3.592l.319.094a.873.873 0 0 1 .52 1.255l-.16.292c-.892 1.64.901 3.434 2.541 2.54l.292-.159a.873.873 0 0 1 1.255.52l.094.319c.527 1.79 3.065 1.79 3.592 0l.094-.319a.873.873 0 0 1 1.255-.52l.292.16c1.64.893 3.434-.902 2.54-2.541l-.159-.292a.873.873 0 0 1 .52-1.255l.319-.094c1.79-.527 1.79-3.065 0-3.592l-.319-.094a.873.873 0 0 1-.52-1.255l.16-.292c.893-1.64-.902-3.433-2.541-2.54l-.292.159a.873.873 0 0 1-1.255-.52zm-2.633.283c.246-.835 1.428-.835 1.674 0l.094.319a1.873 1.873 0 0 0 2.693 1.115l.291-.16c.764-.415 1.6.42 1.184 1.185l-.159.292a1.873 1.873 0 0 0 1.116 2.692l.318.094c.835.246.835 1.428 0 1.674l-.319.094a1.873 1.873 0 0 0-1.115 2.693l.16.291c.415.764-.42 1.6-1.185 1.184l-.291-.159a1.873 1.873 0 0 0-2.693 1.116l-.094.318c-.246.835-1.428.835-1.674 0l-.094-.319a1.873 1.873 0 0 0-2.692-1.115l-.292.16c-.764.415-1.6-.42-1.184-1.185l.159-.291A1.873 1.873 0 0 0 1.945 8.93l-.319-.094c-.835-.246-.835-1.428 0-1.674l.319-.094A1.873 1.873 0 0 0 3.06 4.377l-.16-.292c-.415-.764.42-1.6 1.185-1.184l.292.159a1.873 1.873 0 0 0 2.692-1.115z" />
+            </svg>
+          </button>
+        </div>
+
+        {/* theme controller is copied from https://daisyui.com/components/theme-controller/ */}
+        <div className="tooltip tooltip-bottom" data-tip="Themes">
+          <div className="dropdown dropdown-end dropdown-bottom">
+            <div tabIndex={0} role="button" className="btn m-1">
+              <svg
+                xmlns="http://www.w3.org/2000/svg"
+                width="16"
+                height="16"
+                fill="currentColor"
+                className="bi bi-palette2"
+                viewBox="0 0 16 16"
+              >
+                <path d="M0 .5A.5.5 0 0 1 .5 0h5a.5.5 0 0 1 .5.5v5.277l4.147-4.131a.5.5 0 0 1 .707 0l3.535 3.536a.5.5 0 0 1 0 .708L10.261 10H15.5a.5.5 0 0 1 .5.5v5a.5.5 0 0 1-.5.5H3a3 3 0 0 1-2.121-.879A3 3 0 0 1 0 13.044m6-.21 7.328-7.3-2.829-2.828L6 7.188zM4.5 13a1.5 1.5 0 1 0-3 0 1.5 1.5 0 0 0 3 0M15 15v-4H9.258l-4.015 4zM0 .5v12.495zm0 12.495V13z" />
+              </svg>
+            </div>
+            <ul
+              tabIndex={0}
+              className="dropdown-content bg-base-300 rounded-box z-[1] w-52 p-2 shadow-2xl h-80 overflow-y-auto"
+            >
+              <li>
+                <button
+                  className={classNames({
+                    'btn btn-sm btn-block btn-ghost justify-start': true,
+                    'btn-active': selectedTheme === 'auto',
+                  })}
+                  onClick={() => setTheme('auto')}
+                >
+                  auto
+                </button>
+              </li>
+              {THEMES.map((theme) => (
+                <li key={theme}>
+                  <input
+                    type="radio"
+                    name="theme-dropdown"
+                    className="theme-controller btn btn-sm btn-block btn-ghost justify-start"
+                    aria-label={theme}
+                    value={theme}
+                    checked={selectedTheme === theme}
+                    onChange={(e) => e.target.checked && setTheme(theme)}
+                  />
+                </li>
+              ))}
+            </ul>
+          </div>
+        </div>
+      </div>
+    </div>
+  );
+}
--- a/examples/server/webui/src/components/MarkdownDisplay.tsx
+++ b/examples/server/webui/src/components/MarkdownDisplay.tsx
@ -0,0 +1,310 @@
+import React, { useMemo, useState } from 'react';
+import Markdown, { ExtraProps } from 'react-markdown';
+import remarkGfm from 'remark-gfm';
+import rehypeHightlight from 'rehype-highlight';
+import rehypeKatex from 'rehype-katex';
+import remarkMath from 'remark-math';
+import remarkBreaks from 'remark-breaks';
+import 'katex/dist/katex.min.css';
+import { classNames, copyStr } from '../utils/misc';
+import { ElementContent, Root } from 'hast';
+import { visit } from 'unist-util-visit';
+import { useAppContext } from '../utils/app.context';
+import { CanvasType } from '../utils/types';
+
+/**
+ * Renders a markdown string (GFM, math, line breaks, syntax highlighting)
+ * after passing it through preprocessLaTeX. Each code block gets a row of
+ * buttons rendered by CodeBlockButtons.
+ */
+export default function MarkdownDisplay(props: {
+  content: string;
+  isGenerating?: boolean;
+}) {
+  const { content, isGenerating } = props;
+  // the preprocessed text is also what node offsets in CodeBlockButtons refer to
+  const markdownSource = useMemo(() => preprocessLaTeX(content), [content]);
+
+  return (
+    <Markdown
+      remarkPlugins={[remarkGfm, remarkMath, remarkBreaks]}
+      rehypePlugins={[rehypeHightlight, rehypeKatex, rehypeCustomCopyButton]}
+      components={{
+        button: (buttonProps) => (
+          <CodeBlockButtons
+            {...buttonProps}
+            isGenerating={isGenerating}
+            origContent={markdownSource}
+          />
+        ),
+        // note: do not use "pre", "p" or other basic html elements here, it will cause the node to re-render when the message is being generated (this should be a bug with react-markdown, not sure how to fix it)
+      }}
+    >
+      {markdownSource}
+    </Markdown>
+  );
+}
+
+/**
+ * Button row rendered in place of the placeholder "button" element that
+ * rehypeCustomCopyButton puts before each code block.
+ *
+ * `node.position` gives the offsets of the code block inside `origContent`;
+ * the text between them is the fenced block, from which the code to copy and
+ * the fence's language are extracted. The Run button is shown only for
+ * languages starting with "py", when the interpreter is enabled and the
+ * message is not being generated.
+ */
+const CodeBlockButtons: React.ElementType<
+  React.ClassAttributes<HTMLButtonElement> &
+    React.HTMLAttributes<HTMLButtonElement> &
+    ExtraProps & { origContent: string; isGenerating?: boolean }
+> = ({ node, origContent, isGenerating }) => {
+  const { config } = useAppContext();
+  const startOffset = node?.position?.start.offset ?? 0;
+  const endOffset = node?.position?.end.offset ?? 0;
+
+  const copiedContent = useMemo(
+    () =>
+      origContent
+        .substring(startOffset, endOffset)
+        // opening fence; the language is optional, so a bare ``` line must match too
+        .replace(/^```[^\n]*\n/, '')
+        // closing fence
+        .replace(/```$/, ''),
+    [origContent, startOffset, endOffset]
+  );
+
+  const codeLanguage = useMemo(() => {
+    // look at the whole first line of the block, whatever its length
+    const firstLineEnd = origContent.indexOf('\n', startOffset);
+    if (firstLineEnd === -1) return '';
+    const fenceLine = origContent.substring(startOffset, firstLineEnd);
+    return fenceLine.match(/^```([^\n]+)$/)?.[1].trim() ?? '';
+  }, [origContent, startOffset]);
+
+  const canRunCode =
+    !isGenerating &&
+    config.pyIntepreterEnabled &&
+    codeLanguage.startsWith('py');
+
+  return (
+    <div
+      className={classNames({
+        'text-right sticky top-[7em] mb-2 mr-2 h-0': true,
+        'display-none': !node?.position,
+      })}
+    >
+      <CopyButton className="badge btn-mini" content={copiedContent} />
+      {canRunCode && (
+        <RunPyCodeButton
+          className="badge btn-mini ml-2"
+          content={copiedContent}
+        />
+      )}
+    </div>
+  );
+};
+
+/**
+ * Button that copies `content` via copyStr when clicked. Its label switches
+ * to "Copied!" after a click and back once the pointer leaves the button.
+ */
+export const CopyButton = ({
+  content,
+  className,
+}: {
+  content: string;
+  className?: string;
+}) => {
+  const [copied, setCopied] = useState(false);
+
+  const handleClick = () => {
+    copyStr(content);
+    setCopied(true);
+  };
+  const resetLabel = () => setCopied(false);
+  const label = copied ? 'Copied!' : '📋 Copy';
+
+  return (
+    <button
+      className={className}
+      onClick={handleClick}
+      onMouseLeave={resetLabel}
+    >
+      {label}
+    </button>
+  );
+};
+
+/**
+ * Button that puts `content` on the canvas as PY_INTERPRETER data.
+ */
+export const RunPyCodeButton = ({
+  content,
+  className,
+}: {
+  content: string;
+  className?: string;
+}) => {
+  const { setCanvasData } = useAppContext();
+
+  const openInInterpreter = () =>
+    setCanvasData({
+      type: CanvasType.PY_INTERPRETER,
+      content,
+    });
+
+  return (
+    <button className={className} onClick={openInInterpreter}>
+      ▶️ Run
+    </button>
+  );
+};
+
+/**
+ * This injects the "button" element before each "pre" element.
+ * The actual button will be replaced with a react component in the MarkdownDisplay.
+ * We don't replace "pre" node directly because it will cause the node to re-render, which causes this bug: https://github.com/ggerganov/llama.cpp/issues/9608
+ *
+ * Concretely, each "pre" element is turned in place into a "div" whose
+ * children are the placeholder button followed by a copy of the original
+ * "pre". The button carries the position of the original node.
+ */
+function rehypeCustomCopyButton() {
+  return function (tree: Root) {
+    visit(tree, 'element', function (node) {
+      // the "visited" property marks a "pre" that has already been wrapped
+      if (node.tagName === 'pre' && !node.properties.visited) {
+        // shallow copy: preNode.properties is still the same object as node.properties here
+        const preNode = { ...node };
+        // replace current node
+        // (the marker is set before node.properties is swapped for a fresh
+        // object below, so it ends up on the copied "pre" only)
+        preNode.properties.visited = 'true';
+        node.tagName = 'div';
+        node.properties = {};
+        // add node for button
+        const btnNode: ElementContent = {
+          type: 'element',
+          tagName: 'button',
+          properties: {},
+          children: [],
+          // same position as the code block
+          position: node.position,
+        };
+        node.children = [btnNode, preNode];
+      }
+    });
+  };
+}
+
+/**
+ * The part below is copied and adapted from:
+ * https://github.com/danny-avila/LibreChat/blob/main/client/src/utils/latex.ts
+ * (MIT License)
+ */
+
+// Regex to check if the processed content contains any potential LaTeX patterns:
+// \(...\), \[...\], $...$ or \begin{equation}...\end{equation}.
+// There is no `s` flag, so each pattern has to open and close on the same line.
+const containsLatexRegex =
+  /\\\(.*?\\\)|\\\[.*?\\\]|\$.*?\$|\\begin\{equation\}.*?\\end\{equation\}/;
+
+// Regex for inline and block LaTeX expressions
+// inlineLatex: \(...\) with at least one character inside; the content is captured.
+const inlineLatex = new RegExp(/\\\((.+?)\\\)/, 'g');
+// blockLatex: \[...\], may span several lines (`s` flag); the captured content must not end with a backslash.
+const blockLatex = new RegExp(/\\\[(.*?[^\\])\\\]/, 'gs');
+
+/**
+ * Puts saved code snippets back in place of their `<<CODE_BLOCK_n>>` placeholders.
+ *
+ * @param content Text that may contain placeholders.
+ * @param codeBlocks Saved snippets; the number in a placeholder is the index into this array.
+ * @returns The text with every placeholder that has a saved snippet replaced by that snippet.
+ */
+const restoreCodeBlocks = (content: string, codeBlocks: string[]) => {
+  return content.replace(
+    /<<CODE_BLOCK_(\d+)>>/g,
+    // A placeholder-looking sequence without a saved snippet (for example one
+    // that was typed literally in the message) is kept unchanged; returning
+    // `undefined` here would insert the text "undefined" instead.
+    (placeholder: string, index: string) =>
+      codeBlocks[Number(index)] ?? placeholder
+  );
+};
+
+// Regex to identify code blocks and inline code:
+// a ```-fenced block (may span several lines) or a `...` span on a single line (may be empty).
+const codeBlockRegex = /(```[\s\S]*?```|`.*?`)/g;
+
+/**
+ * Converts `\(...\)` to `$...$` and `\[...\]` to `$$...$$`, and escapes dollar
+ * signs that precede a digit. Fenced code blocks and inline code are swapped
+ * for placeholders first and restored at the end, so they are never modified.
+ */
+export const processLaTeX = (_content: string) => {
+  // Swap code blocks and inline code for numbered placeholders
+  const codeBlocks: string[] = [];
+  const withoutCode = _content.replace(codeBlockRegex, (snippet) => {
+    codeBlocks.push(snippet);
+    return `<<CODE_BLOCK_${codeBlocks.length - 1}>>`;
+  });
+
+  // Escape a dollar sign that is followed by a digit, optionally with one whitespace character in between
+  const escaped = withoutCode.replace(/(\$)(?=\s?\d)/g, '\\$');
+
+  // Rewrite the delimiters only when something that looks like LaTeX is present
+  const converted = containsLatexRegex.test(escaped)
+    ? escaped
+        .replace(inlineLatex, (_: string, equation: string) => `$${equation}$`)
+        .replace(blockLatex, (_: string, equation: string) => `$$${equation}$$`)
+    : escaped;
+
+  // Put the protected code back
+  return restoreCodeBlocks(converted, codeBlocks);
+};
+
+/**
+ * Preprocesses LaTeX content by replacing delimiters and escaping certain characters.
+ *
+ * Code blocks and existing LaTeX expressions are swapped for placeholders
+ * while currency-like dollar signs are escaped; both are restored before
+ * `escapeBrackets` and `escapeMhchem` are applied to the result.
+ *
+ * @param content The input string containing LaTeX expressions.
+ * @returns The processed string with replaced delimiters and escaped characters.
+ */
+export function preprocessLaTeX(content: string): string {
+  // Step 1: Protect code blocks
+  const codeBlocks: string[] = [];
+  content = content.replace(/(```[\s\S]*?```|`[^`\n]+`)/g, (_, code) => {
+    codeBlocks.push(code);
+    return `<<CODE_BLOCK_${codeBlocks.length - 1}>>`;
+  });
+
+  // Step 2: Protect existing LaTeX expressions
+  const latexExpressions: string[] = [];
+
+  // Protect block math ($$...$$), \[...\], and \(...\).
+  content = content.replace(
+    /(\$\$[\s\S]*?\$\$|\\\[[\s\S]*?\\\]|\\\(.*?\\\))/g,
+    (match) => {
+      latexExpressions.push(match);
+      return `<<LATEX_${latexExpressions.length - 1}>>`;
+    }
+  );
+
+  // Protect inline math ($...$) only if it does NOT match a currency pattern.
+  // We assume a currency pattern is one where the inner content is purely numeric (with optional decimals).
+  content = content.replace(/\$([^$]+)\$/g, (match, inner) => {
+    if (/^\s*\d+(?:\.\d+)?\s*$/.test(inner)) {
+      // The text between the two dollar signs is just a number,
+      // so treat it as currency and don't protect it.
+      return match;
+    } else {
+      // Otherwise, treat it as a LaTeX expression.
+      latexExpressions.push(match);
+      return `<<LATEX_${latexExpressions.length - 1}>>`;
+    }
+  });
+
+  // Step 3: Escape dollar signs that are likely currency indicators.
+  // (Now that inline math is protected, this will only escape dollars not already protected)
+  content = content.replace(/\$(?=\d)/g, '\\$');
+
+  // Step 4: Restore LaTeX expressions
+  // A placeholder-looking sequence without a saved expression (for example one
+  // typed literally in the message) is kept unchanged; returning `undefined`
+  // from the callback would insert the text "undefined" instead.
+  content = content.replace(
+    /<<LATEX_(\d+)>>/g,
+    (match, index) => latexExpressions[Number(index)] ?? match
+  );
+
+  // Step 5: Restore code blocks (same fallback as in step 4)
+  content = content.replace(
+    /<<CODE_BLOCK_(\d+)>>/g,
+    (match, index) => codeBlocks[Number(index)] ?? match
+  );
+
+  // Step 6: Apply additional escaping functions
+  content = escapeBrackets(content);
+  content = escapeMhchem(content);
+
+  return content;
+}
+
+/**
+ * Rewrites `\[...\]` as `$$...$$` and `\(...\)` as `$...$`.
+ * Fenced code blocks and inline code are matched first and returned
+ * unchanged, so delimiters inside them are not rewritten.
+ *
+ * @param text The text to process.
+ * @returns The text with the bracket delimiters replaced by dollar delimiters.
+ */
+export function escapeBrackets(text: string): string {
+  const pattern =
+    /(```[\S\s]*?```|`.*?`)|\\\[([\S\s]*?[^\\])\\]|\\\((.*?)\\\)/g;
+
+  const rewrite = (
+    match: string,
+    codeBlock: string | undefined,
+    squareBracket: string | undefined,
+    roundBracket: string | undefined
+  ): string => {
+    // Exactly one of the three capture groups is set for any given match
+    if (codeBlock != null) return codeBlock;
+    if (squareBracket != null) return `$$${squareBracket}$$`;
+    return roundBracket != null ? `$${roundBracket}$` : match;
+  };
+
+  return text.replace(pattern, rewrite);
+}
+
+/**
+ * Doubles the backslash of `\ce{` and `\pu{` wherever it directly follows a `$`.
+ */
+export function escapeMhchem(text: string) {
+  const withCeEscaped = text.replaceAll('$\\ce{', '$\\\\ce{');
+  return withCeEscaped.replaceAll('$\\pu{', '$\\\\pu{');
+}
--- a/examples/server/webui/src/components/SettingDialog.tsx
+++ b/examples/server/webui/src/components/SettingDialog.tsx
@ -0,0 +1,536 @@
+import { useState } from 'react';
+import { useAppContext } from '../utils/app.context';
+import { CONFIG_DEFAULT, CONFIG_INFO } from '../Config';
+import { isDev } from '../Config';
+import StorageUtils from '../utils/storage';
+import { classNames, isBoolean, isNumeric, isString } from '../utils/misc';
+import {
+  BeakerIcon,
+  ChatBubbleOvalLeftEllipsisIcon,
+  Cog6ToothIcon,
+  FunnelIcon,
+  HandRaisedIcon,
+  SquaresPlusIcon,
+} from '@heroicons/react/24/outline';
+import { OpenInNewTab } from '../utils/common';
+
+// Any key of the default config object
+type SettKey = keyof typeof CONFIG_DEFAULT;
+
+// Keys listed in the "General" section (after the API key and system message)
+const BASIC_KEYS: SettKey[] = [
+  'temperature',
+  'top_k',
+  'top_p',
+  'min_p',
+  'max_tokens',
+];
+// Keys listed in the "Samplers" section (after the samplers queue)
+const SAMPLER_KEYS: SettKey[] = [
+  'dynatemp_range',
+  'dynatemp_exponent',
+  'typical_p',
+  'xtc_probability',
+  'xtc_threshold',
+];
+// Keys listed in the "Penalties" section
+const PENALTY_KEYS: SettKey[] = [
+  'repeat_last_n',
+  'repeat_penalty',
+  'presence_penalty',
+  'frequency_penalty',
+  'dry_multiplier',
+  'dry_base',
+  'dry_allowed_length',
+  'dry_penalty_last_n',
+];
+
+// Kind of control used to render a setting field in the dialog
+enum SettingInputType {
+  // single-line text input (SettingsModalShortInput)
+  SHORT_INPUT,
+  // multi-line textarea (SettingsModalLongInput)
+  LONG_INPUT,
+  // toggle (SettingsModalCheckbox)
+  CHECKBOX,
+  // arbitrary content supplied by the field's `component`
+  CUSTOM,
+}
+
+// A field rendered with one of the built-in input components
+interface SettingFieldInput {
+  type: Exclude<SettingInputType, SettingInputType.CUSTOM>;
+  label: string | React.ReactElement;
+  // NOTE(review): `help` is not read anywhere in this file; the short input takes its help text from CONFIG_INFO
+  help?: string | React.ReactElement;
+  // config entry that this field reads and writes
+  key: SettKey;
+}
+
+// A field whose content is supplied by the section definition itself
+interface SettingFieldCustom {
+  type: SettingInputType.CUSTOM;
+  key: SettKey;
+  // either plain text, or a function component that is called with the
+  // current value of `key` and a change handler for it
+  component:
+    | string
+    | React.FC<{
+        value: string | boolean | number;
+        onChange: (value: string) => void;
+      }>;
+}
+
+// One entry of the section list: a title plus the fields shown when it is selected
+interface SettingSection {
+  title: React.ReactElement;
+  fields: (SettingFieldInput | SettingFieldCustom)[];
+}
+
+// Class names shared by the icons in the section titles
+const ICON_CLASSNAME = 'w-4 h-4 mr-1 inline';
+
+// Sections of the settings dialog, in display order.
+// Each section has a title (icon + text) and the list of fields it shows.
+const SETTING_SECTIONS: SettingSection[] = [
+  {
+    title: (
+      <>
+        <Cog6ToothIcon className={ICON_CLASSNAME} />
+        General
+      </>
+    ),
+    fields: [
+      {
+        type: SettingInputType.SHORT_INPUT,
+        label: 'API Key',
+        key: 'apiKey',
+      },
+      {
+        type: SettingInputType.LONG_INPUT,
+        label: 'System Message (will be disabled if left empty)',
+        key: 'systemMessage',
+      },
+      // the remaining fields use the config key itself as label
+      ...BASIC_KEYS.map(
+        (key) =>
+          ({
+            type: SettingInputType.SHORT_INPUT,
+            label: key,
+            key,
+          }) as SettingFieldInput
+      ),
+    ],
+  },
+  {
+    title: (
+      <>
+        <FunnelIcon className={ICON_CLASSNAME} />
+        Samplers
+      </>
+    ),
+    fields: [
+      {
+        type: SettingInputType.SHORT_INPUT,
+        label: 'Samplers queue',
+        key: 'samplers',
+      },
+      ...SAMPLER_KEYS.map(
+        (key) =>
+          ({
+            type: SettingInputType.SHORT_INPUT,
+            label: key,
+            key,
+          }) as SettingFieldInput
+      ),
+    ],
+  },
+  {
+    title: (
+      <>
+        <HandRaisedIcon className={ICON_CLASSNAME} />
+        Penalties
+      </>
+    ),
+    fields: PENALTY_KEYS.map((key) => ({
+      type: SettingInputType.SHORT_INPUT,
+      label: key,
+      key,
+    })),
+  },
+  {
+    title: (
+      <>
+        <ChatBubbleOvalLeftEllipsisIcon className={ICON_CLASSNAME} />
+        Reasoning
+      </>
+    ),
+    fields: [
+      {
+        type: SettingInputType.CHECKBOX,
+        label: 'Expand thought process by default for generating message',
+        key: 'showThoughtInProgress',
+      },
+      {
+        type: SettingInputType.CHECKBOX,
+        label:
+          'Exclude thought process when sending request to API (Recommended for DeepSeek-R1)',
+        key: 'excludeThoughtOnReq',
+      },
+    ],
+  },
+  {
+    title: (
+      <>
+        <SquaresPlusIcon className={ICON_CLASSNAME} />
+        Advanced
+      </>
+    ),
+    fields: [
+      {
+        type: SettingInputType.CUSTOM,
+        key: 'custom', // dummy key, won't be used
+        component: () => {
+          // Fetches the demo conversation, removes any stored conversation
+          // with the same id, then appends the demo messages one by one.
+          const debugImportDemoConv = async () => {
+            const res = await fetch('/demo-conversation.json');
+            const demoConv = await res.json();
+            StorageUtils.remove(demoConv.id);
+            for (const msg of demoConv.messages) {
+              StorageUtils.appendMsg(demoConv.id, msg);
+            }
+          };
+          return (
+            <button className="btn" onClick={debugImportDemoConv}>
+              (debug) Import demo conversation
+            </button>
+          );
+        },
+      },
+      {
+        type: SettingInputType.CHECKBOX,
+        label: 'Show tokens per second',
+        key: 'showTokensPerSecond',
+      },
+      {
+        type: SettingInputType.LONG_INPUT,
+        label: (
+          <>
+            Custom JSON config (For more info, refer to{' '}
+            <OpenInNewTab href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md">
+              server documentation
+            </OpenInNewTab>
+            )
+          </>
+        ),
+        key: 'custom',
+      },
+    ],
+  },
+  {
+    title: (
+      <>
+        <BeakerIcon className={ICON_CLASSNAME} />
+        Experimental
+      </>
+    ),
+    fields: [
+      {
+        type: SettingInputType.CUSTOM,
+        key: 'custom', // dummy key, won't be used
+        component: () => (
+          <>
+            <p className="mb-8">
+              Experimental features are not guaranteed to work correctly.
+              <br />
+              <br />
+              If you encounter any problems, create a{' '}
+              <OpenInNewTab href="https://github.com/ggerganov/llama.cpp/issues/new?template=019-bug-misc.yml">
+                Bug (misc.)
+              </OpenInNewTab>{' '}
+              report on Github. Please also specify <b>webui/experimental</b> on
+              the report title and include screenshots.
+              <br />
+              <br />
+              Some features may require packages downloaded from CDN, so they
+              need internet connection.
+            </p>
+          </>
+        ),
+      },
+      {
+        type: SettingInputType.CHECKBOX,
+        label: (
+          <>
+            <b>Enable Python interpreter</b>
+            <br />
+            <small className="text-xs">
+              This feature uses{' '}
+              <OpenInNewTab href="https://pyodide.org">pyodide</OpenInNewTab>,
+              downloaded from CDN. To use this feature, ask the LLM to generate
+              python code inside a markdown code block. You will see a "Run"
+              button on the code block, near the "Copy" button.
+            </small>
+          </>
+        ),
+        key: 'pyIntepreterEnabled',
+      },
+    ],
+  },
+];
+
+/**
+ * Modal dialog for editing the app config.
+ *
+ * Edits are kept in a local copy of the config; they are validated and handed
+ * to `saveConfig` only when the user presses "Save".
+ *
+ * @param show Whether the dialog is open.
+ * @param onClose Called when the dialog should close (after "Close" or a successful save).
+ */
+export default function SettingDialog({
+  show,
+  onClose,
+}: {
+  show: boolean;
+  onClose: () => void;
+}) {
+  const { config, saveConfig } = useAppContext();
+  const [sectionIdx, setSectionIdx] = useState(0);
+
+  // clone the config object to prevent direct mutation
+  const [localConfig, setLocalConfig] = useState<typeof CONFIG_DEFAULT>(
+    JSON.parse(JSON.stringify(config))
+  );
+
+  const resetConfig = () => {
+    if (window.confirm('Are you sure to reset all settings?')) {
+      setLocalConfig(CONFIG_DEFAULT);
+    }
+  };
+
+  // Checks every value against the type of its default in CONFIG_DEFAULT,
+  // converts numeric entries to numbers, then saves and closes.
+  // The first invalid value triggers an alert and aborts the save.
+  const handleSave = () => {
+    // copy the local config to prevent direct mutation
+    const newConfig: typeof CONFIG_DEFAULT = JSON.parse(
+      JSON.stringify(localConfig)
+    );
+    // validate the config
+    for (const key in newConfig) {
+      const value = newConfig[key as SettKey];
+      const mustBeBoolean = isBoolean(CONFIG_DEFAULT[key as SettKey]);
+      const mustBeString = isString(CONFIG_DEFAULT[key as SettKey]);
+      const mustBeNumeric = isNumeric(CONFIG_DEFAULT[key as SettKey]);
+      if (mustBeString) {
+        if (!isString(value)) {
+          alert(`Value for ${key} must be string`);
+          return;
+        }
+      } else if (mustBeNumeric) {
+        // the inputs hand over text, so parse it; blank text is rejected
+        // explicitly because Number('') is 0
+        const trimmedValue = value.toString().trim();
+        const numVal = Number(trimmedValue);
+        if (isNaN(numVal) || !isNumeric(numVal) || trimmedValue.length === 0) {
+          alert(`Value for ${key} must be numeric`);
+          return;
+        }
+        // force conversion to number
+        // @ts-expect-error this is safe
+        newConfig[key] = numVal;
+      } else if (mustBeBoolean) {
+        if (!isBoolean(value)) {
+          alert(`Value for ${key} must be boolean`);
+          return;
+        }
+      } else {
+        console.error(`Unknown default type for key ${key}`);
+      }
+    }
+    if (isDev) console.log('Saving config', newConfig);
+    saveConfig(newConfig);
+    onClose();
+  };
+
+  const onChange = (key: SettKey) => (value: string | boolean) => {
+    // note: we do not perform validation here, because we may get incomplete value as user is still typing it
+    // functional update: the next state is derived from the latest state, not from the value captured at render time
+    setLocalConfig((prev) => ({ ...prev, [key]: value }));
+  };
+
+  return (
+    <dialog className={classNames({ modal: true, 'modal-open': show })}>
+      <div className="modal-box w-11/12 max-w-3xl">
+        <h3 className="text-lg font-bold mb-6">Settings</h3>
+        <div className="flex flex-col md:flex-row h-[calc(90vh-12rem)]">
+          {/* Left panel, showing sections - Desktop version */}
+          <div className="hidden md:flex flex-col items-stretch pr-4 mr-4 border-r-2 border-base-200">
+            {SETTING_SECTIONS.map((section, idx) => (
+              <div
+                key={idx}
+                className={classNames({
+                  'btn btn-ghost justify-start font-normal w-44 mb-1': true,
+                  'btn-active': sectionIdx === idx,
+                })}
+                onClick={() => setSectionIdx(idx)}
+                dir="auto"
+              >
+                {section.title}
+              </div>
+            ))}
+          </div>
+
+          {/* Left panel, showing sections - Mobile version */}
+          <div className="md:hidden flex flex-row gap-2 mb-4">
+            <details className="dropdown">
+              <summary className="btn btn-sm w-full m-1">
+                {SETTING_SECTIONS[sectionIdx].title}
+              </summary>
+              <ul className="menu dropdown-content bg-base-100 rounded-box z-[1] w-52 p-2 shadow">
+                {SETTING_SECTIONS.map((section, idx) => (
+                  <div
+                    key={idx}
+                    className={classNames({
+                      'btn btn-ghost justify-start font-normal': true,
+                      'btn-active': sectionIdx === idx,
+                    })}
+                    onClick={() => setSectionIdx(idx)}
+                    dir="auto"
+                  >
+                    {section.title}
+                  </div>
+                ))}
+              </ul>
+            </details>
+          </div>
+
+          {/* Right panel, showing setting fields */}
+          <div className="grow overflow-y-auto px-4">
+            {SETTING_SECTIONS[sectionIdx].fields.map((field, idx) => {
+              const key = `${sectionIdx}-${idx}`;
+              if (field.type === SettingInputType.SHORT_INPUT) {
+                return (
+                  <SettingsModalShortInput
+                    key={key}
+                    configKey={field.key}
+                    value={localConfig[field.key]}
+                    onChange={onChange(field.key)}
+                    label={field.label as string}
+                  />
+                );
+              } else if (field.type === SettingInputType.LONG_INPUT) {
+                return (
+                  <SettingsModalLongInput
+                    key={key}
+                    configKey={field.key}
+                    value={localConfig[field.key].toString()}
+                    onChange={onChange(field.key)}
+                    label={field.label as string}
+                  />
+                );
+              } else if (field.type === SettingInputType.CHECKBOX) {
+                return (
+                  <SettingsModalCheckbox
+                    key={key}
+                    configKey={field.key}
+                    value={!!localConfig[field.key]}
+                    onChange={onChange(field.key)}
+                    label={field.label as string}
+                  />
+                );
+              } else if (field.type === SettingInputType.CUSTOM) {
+                return (
+                  <div key={key} className="mb-2">
+                    {typeof field.component === 'string'
+                      ? field.component
+                      : field.component({
+                          value: localConfig[field.key],
+                          onChange: onChange(field.key),
+                        })}
+                  </div>
+                );
+              }
+            })}
+
+            <p className="opacity-40 mb-6 text-sm mt-8">
+              Settings are saved in browser's localStorage
+            </p>
+          </div>
+        </div>
+
+        <div className="modal-action">
+          <button className="btn" onClick={resetConfig}>
+            Reset to default
+          </button>
+          <button className="btn" onClick={onClose}>
+            Close
+          </button>
+          <button className="btn btn-primary" onClick={handleSave}>
+            Save
+          </button>
+        </div>
+      </div>
+    </dialog>
+  );
+}
+
+/**
+ * Labelled multi-line text area for one config entry.
+ * The label falls back to `configKey`; the placeholder shows the entry's
+ * default from `CONFIG_DEFAULT`, or "none" when that default is falsy.
+ */
+function SettingsModalLongInput({
+  configKey,
+  value,
+  onChange,
+  label,
+}: {
+  configKey: SettKey;
+  value: string;
+  onChange: (value: string) => void;
+  label?: string;
+}) {
+  const caption = label || configKey;
+  const placeholder = `Default: ${CONFIG_DEFAULT[configKey] || 'none'}`;
+  const handleChange = (e: React.ChangeEvent<HTMLTextAreaElement>) =>
+    onChange(e.target.value);
+
+  return (
+    <label className="form-control mb-2">
+      <div className="label inline">{caption}</div>
+      <textarea
+        className="textarea textarea-bordered h-24"
+        placeholder={placeholder}
+        value={value}
+        onChange={handleChange}
+      />
+    </label>
+  );
+}
+
+/**
+ * Single-line text input for one config entry.
+ *
+ * The label falls back to `configKey`. When `CONFIG_INFO` has a help message
+ * for the key, it is shown above the input on small screens and in a hover
+ * dropdown on larger ones.
+ */
+function SettingsModalShortInput({
+  configKey,
+  value,
+  onChange,
+  label,
+}: {
+  configKey: SettKey;
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  value: any;
+  onChange: (value: string) => void;
+  label?: string;
+}) {
+  const helpMsg = CONFIG_INFO[configKey];
+  const defaultValue = CONFIG_DEFAULT[configKey];
+  // Only a missing or empty default is shown as "none"; a plain `||` would
+  // also hide legitimate falsy defaults such as 0.
+  const placeholder = `Default: ${
+    defaultValue == null || defaultValue === '' ? 'none' : defaultValue
+  }`;
+
+  return (
+    <>
+      {/* on mobile, we simply show the help message here */}
+      {helpMsg && (
+        <div className="block md:hidden mb-1">
+          <b>{label || configKey}</b>
+          <br />
+          <p className="text-xs">{helpMsg}</p>
+        </div>
+      )}
+      <label className="input input-bordered join-item grow flex items-center gap-2 mb-2">
+        <div className="dropdown dropdown-hover">
+          <div tabIndex={0} role="button" className="font-bold hidden md:block">
+            {label || configKey}
+          </div>
+          {helpMsg && (
+            <div className="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
+              {helpMsg}
+            </div>
+          )}
+        </div>
+        <input
+          type="text"
+          className="grow"
+          placeholder={placeholder}
+          value={value}
+          onChange={(e) => onChange(e.target.value)}
+        />
+      </label>
+    </>
+  );
+}
+
+/**
+ * Toggle for one boolean config entry, followed by its label
+ * (which falls back to `configKey`).
+ */
+function SettingsModalCheckbox({
+  configKey,
+  value,
+  onChange,
+  label,
+}: {
+  configKey: SettKey;
+  value: boolean;
+  onChange: (value: boolean) => void;
+  label: string;
+}) {
+  const caption = label || configKey;
+  const handleToggle = (e: React.ChangeEvent<HTMLInputElement>) =>
+    onChange(e.target.checked);
+
+  return (
+    <div className="flex flex-row items-center mb-2">
+      <input
+        type="checkbox"
+        className="toggle"
+        checked={value}
+        onChange={handleToggle}
+      />
+      <span className="ml-4">{caption}</span>
+    </div>
+  );
+}
--- a/examples/server/webui/src/components/Sidebar.tsx
+++ b/examples/server/webui/src/components/Sidebar.tsx
@ -0,0 +1,95 @@
+import { useEffect, useMemo, useState } from 'react';
+import { classNames } from '../utils/misc';
+import { Conversation } from '../utils/types';
+import StorageUtils from '../utils/storage';
+import { useNavigate, useParams } from 'react-router';
+
+/**
+ * Drawer listing the stored conversations.
+ *
+ * The list is loaded from `StorageUtils` on mount and reloaded whenever
+ * `StorageUtils` reports a conversation change. Clicking an entry navigates
+ * to `/chat/<id>`; "+ New conversation" navigates to `/`.
+ */
+export default function Sidebar() {
+  const params = useParams();
+  const navigate = useNavigate();
+  // conversation matching the `convId` route param, looked up again when the param changes
+  const currConv = useMemo(
+    () => StorageUtils.getOneConversation(params.convId ?? ''),
+    [params.convId]
+  );
+
+  const [conversations, setConversations] = useState<Conversation[]>([]);
+
+  useEffect(() => {
+    const handleConversationChange = () => {
+      setConversations(StorageUtils.getAllConversations());
+    };
+    StorageUtils.onConversationChanged(handleConversationChange);
+    // initial load
+    handleConversationChange();
+    return () => {
+      StorageUtils.offConversationChanged(handleConversationChange);
+    };
+  }, []);
+
+  return (
+    <>
+      <input
+        id="toggle-drawer"
+        type="checkbox"
+        className="drawer-toggle"
+        defaultChecked
+      />
+
+      <div className="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
+        <label
+          htmlFor="toggle-drawer"
+          aria-label="close sidebar"
+          className="drawer-overlay"
+        ></label>
+        <div className="flex flex-col bg-base-200 min-h-full max-w-64 py-4 px-4">
+          <div className="flex flex-row items-center justify-between mb-4 mt-4">
+            <h2 className="font-bold ml-4">Conversations</h2>
+
+            {/* close sidebar button */}
+            <label htmlFor="toggle-drawer" className="btn btn-ghost lg:hidden">
+              <svg
+                xmlns="http://www.w3.org/2000/svg"
+                width="16"
+                height="16"
+                fill="currentColor"
+                className="bi bi-arrow-bar-left"
+                viewBox="0 0 16 16"
+              >
+                <path
+                  fillRule="evenodd"
+                  d="M12.5 15a.5.5 0 0 1-.5-.5v-13a.5.5 0 0 1 1 0v13a.5.5 0 0 1-.5.5M10 8a.5.5 0 0 1-.5.5H3.707l2.147 2.146a.5.5 0 0 1-.708.708l-3-3a.5.5 0 0 1 0-.708l3-3a.5.5 0 1 1 .708.708L3.707 7.5H9.5a.5.5 0 0 1 .5.5"
+                />
+              </svg>
+            </label>
+          </div>
+
+          {/* list of conversations */}
+          <div
+            className={classNames({
+              'btn btn-ghost justify-start': true,
+              'btn-active': !currConv,
+            })}
+            onClick={() => navigate('/')}
+          >
+            + New conversation
+          </div>
+          {conversations.map((conv) => (
+            <div
+              key={conv.id}
+              className={classNames({
+                'btn btn-ghost justify-start font-normal': true,
+                'btn-active': conv.id === currConv?.id,
+              })}
+              onClick={() => navigate(`/chat/${conv.id}`)}
+              dir="auto"
+            >
+              {/* the first message serves as the title; a stored conversation without messages renders an empty entry instead of throwing */}
+              <span className="truncate">{conv.messages[0]?.content}</span>
+            </div>
+          ))}
+          <div className="text-center text-xs opacity-40 mt-auto mx-4">
+            Conversations are saved to browser's localStorage
+          </div>
+        </div>
+      </div>
+    </>
+  );
+}
--- a/examples/server/webui/src/highlight-config.js
+++ b/examples/server/webui/src/highlight-config.js
@ -1,60 +0,0 @@
-import hljs from 'highlight.js/lib/core';
-
-// only import commonly used languages to reduce bundle size
-
-import python from 'highlight.js/lib/languages/python';
-import javascript from 'highlight.js/lib/languages/javascript';
-import json from 'highlight.js/lib/languages/json';
-import bash from 'highlight.js/lib/languages/bash';
-import yaml from 'highlight.js/lib/languages/yaml';
-import markdown from 'highlight.js/lib/languages/markdown';
-import scss from 'highlight.js/lib/languages/scss';
-import xml from 'highlight.js/lib/languages/xml';
-import ruby from 'highlight.js/lib/languages/ruby';
-import go from 'highlight.js/lib/languages/go';
-import java from 'highlight.js/lib/languages/java';
-import rust from 'highlight.js/lib/languages/rust';
-import scala from 'highlight.js/lib/languages/scala';
-import cpp from 'highlight.js/lib/languages/cpp';
-import csharp from 'highlight.js/lib/languages/csharp';
-import swift from 'highlight.js/lib/languages/swift';
-import dart from 'highlight.js/lib/languages/dart';
-import elixir from 'highlight.js/lib/languages/elixir';
-import kotlin from 'highlight.js/lib/languages/kotlin';
-import lua from 'highlight.js/lib/languages/lua';
-import php from 'highlight.js/lib/languages/php';
-import latex from 'highlight.js/lib/languages/latex';
-
-hljs.registerLanguage('python', python);
-hljs.registerLanguage('javascript', javascript);
-hljs.registerLanguage('json', json);
-hljs.registerLanguage('yaml', yaml);
-hljs.registerLanguage('markdown', markdown);
-hljs.registerLanguage('xml', xml);
-hljs.registerLanguage('ruby', ruby);
-hljs.registerLanguage('go', go);
-hljs.registerLanguage('java', java);
-hljs.registerLanguage('rust', rust);
-hljs.registerLanguage('scala', scala);
-hljs.registerLanguage('csharp', csharp);
-hljs.registerLanguage('swift', swift);
-hljs.registerLanguage('dart', dart);
-hljs.registerLanguage('elixir', elixir);
-hljs.registerLanguage('kotlin', kotlin);
-hljs.registerLanguage('lua', lua);
-hljs.registerLanguage('php', php);
-hljs.registerLanguage('latex', latex);
-
-// reuse some languages to further reduce bundle size
-
-hljs.registerLanguage('shell', bash);
-hljs.registerLanguage('bash', bash);
-hljs.registerLanguage('sh', bash);
-
-hljs.registerLanguage('css', scss);
-hljs.registerLanguage('scss', scss);
-
-hljs.registerLanguage('c', cpp);
-hljs.registerLanguage('cpp', cpp);
-
-export default hljs;
--- a/examples/server/webui/src/styles.scss
+++ b/examples/server/webui/src/styles.scss
@ -1,15 +1,28 @@
-@use "sass:meta";
+@use 'sass:meta';

@tailwind base;
@tailwind components;
@tailwind utilities;

 .markdown {
-  h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; }
+  h1,
+  h2,
+  h3,
+  h4,
+  h5,
+  h6,
+  ul,
+  ol,
+  li {
+    all: revert;
+  }
  pre {
    @apply whitespace-pre-wrap rounded-lg p-2;
    border: 1px solid currentColor;
  }
+  p {
+    @apply mb-2;
+  }
  /* TODO: fix markdown table */
 }

@ -19,7 +32,9 @@
 .btn-mini {
  @apply cursor-pointer hover:shadow-md;
 }
-.chat-screen { max-width: 900px; }
+.chat-screen {
+  max-width: 900px;
+}

 .chat-bubble-base-300 {
  --tw-bg-opacity: 1;
@ -30,6 +45,9 @@
 /* Highlight.js */
 [data-color-scheme='light'] {
  @include meta.load-css('highlight.js/styles/stackoverflow-light');
+  .dark-color {
+    @apply bg-base-content text-base-100;
+  }
 }
 [data-color-scheme='dark'] {
  @include meta.load-css('highlight.js/styles/stackoverflow-dark');
@ -37,6 +55,9 @@
 [data-color-scheme='auto'] {
  @media (prefers-color-scheme: light) {
    @include meta.load-css('highlight.js/styles/stackoverflow-light');
+    .dark-color {
+      @apply bg-base-content text-base-100;
+    }
  }
  @media (prefers-color-scheme: dark) {
    @include meta.load-css('highlight.js/styles/stackoverflow-dark');
@ -46,3 +67,7 @@
  background: transparent !important;
  padding: 0.5em !important;
 }
+
+.katex-display {
+  margin: 0 0 !important;
+}
--- a/examples/server/webui/src/katex-gpt.js
+++ b/examples/server/webui/src/katex-gpt.js
@ -1,66 +0,0 @@
-import katex from 'katex';
-
-// Adapted from https://github.com/SchneeHertz/markdown-it-katex-gpt
-// MIT license
-
-const defaultOptions = {
-  delimiters: [
-    { left: '\\[', right: '\\]', display: true },
-    { left: '\\(', right: '\\)', display: false },
-  ],
-};
-
-export function renderLatexHTML(content, display = false) {
-  return katex.renderToString(content, {
-    throwOnError: false,
-    output: 'mathml',
-    displayMode: display,
-  });
-}
-
-function escapedBracketRule(options) {
-  return (state, silent) => {
-    const max = state.posMax;
-    const start = state.pos;
-
-    for (const { left, right, display } of options.delimiters) {
-
-      // Check if it starts with the left delimiter
-      if (!state.src.slice(start).startsWith(left)) continue;
-
-      // Skip the length of the left delimiter
-      let pos = start + left.length;
-
-      // Find the matching right delimiter
-      while (pos < max) {
-        if (state.src.slice(pos).startsWith(right)) {
-          break;
-        }
-        pos++;
-      }
-
-      // No matching right delimiter found, skip to the next match
-      if (pos >= max) continue;
-
-      // If not in silent mode, convert LaTeX formula to MathML
-      if (!silent) {
-        const content = state.src.slice(start + left.length, pos);
-        try {
-          const renderedContent = renderLatexHTML(content, display);
-          const token = state.push('html_inline', '', 0);
-          token.content = renderedContent;
-        } catch (e) {
-          console.error(e);
-        }
-      }
-
-      // Update position, skip the length of the right delimiter
-      state.pos = pos + right.length;
-      return true;
-    }
-  }
-}
-
-export default function (md, options = defaultOptions) {
-  md.inline.ruler.after('text', 'escaped_bracket', escapedBracketRule(options));
-}
--- a/examples/server/webui/src/main.js
+++ b/examples/server/webui/src/main.js
@ -1,701 +0,0 @@
-import './styles.scss';
-import { createApp, defineComponent, shallowRef, computed, h } from 'vue/dist/vue.esm-bundler.js';
-import MarkdownIt from 'markdown-it';
-import TextLineStream from 'textlinestream';
-
-// math formula rendering
-import 'katex/dist/katex.min.css';
-import markdownItKatexGpt from './katex-gpt';
-import markdownItKatexNormal from '@vscode/markdown-it-katex';
-
-// code highlighting
-import hljs from './highlight-config';
-import daisyuiThemes from 'daisyui/src/theming/themes';
-
-// ponyfill for missing ReadableStream asyncIterator on Safari
-import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
-
-const isDev = import.meta.env.MODE === 'development';
-
-// types
-/** @typedef {{ id: number, role: 'user' | 'assistant', content: string, timings: any }} Message */
-/** @typedef {{ role: 'user' | 'assistant', content: string }} APIMessage */
-/** @typedef {{ id: string, lastModified: number, messages: Array<Message> }} Conversation */
-
-// utility functions
-const isString = (x) => !!x.toLowerCase;
-const isBoolean = (x) => x === true || x === false;
-const isNumeric = (n) => !isString(n) && !isNaN(n) && !isBoolean(n);
-const escapeAttr = (str) => str.replace(/>/g, '&gt;').replace(/"/g, '&quot;');
-const copyStr = (textToCopy) => {
-  // Navigator clipboard api needs a secure context (https)
-  if (navigator.clipboard && window.isSecureContext) {
-    navigator.clipboard.writeText(textToCopy);
-  } else {
-    // Use the 'out of viewport hidden text area' trick
-    const textArea = document.createElement('textarea');
-    textArea.value = textToCopy;
-    // Move textarea out of the viewport so it's not visible
-    textArea.style.position = 'absolute';
-    textArea.style.left = '-999999px';
-    document.body.prepend(textArea);
-    textArea.select();
-    document.execCommand('copy');
-  }
-};
-
-// constants
-const BASE_URL = isDev
-  ? (localStorage.getItem('base') || 'https://localhost:8080') // for debugging
-  : (new URL('.', document.baseURI).href).toString().replace(/\/$/, ''); // for production
-console.log({ BASE_URL });
-
-// Default value of every user-tunable setting. StorageUtils.getConfig() spreads this object
-// underneath the saved config, so keys missing from localStorage fall back to these values.
-const CONFIG_DEFAULT = {
-  // Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
-  apiKey: '',
-  systemMessage: 'You are a helpful assistant.',
-  showTokensPerSecond: false,
-  showThoughtInProgress: false,
-  excludeThoughtOnReq: true,
-  // make sure these default values are in sync with `common.h`
-  samplers: 'edkypmxt',
-  temperature: 0.8,
-  dynatemp_range: 0.0,
-  dynatemp_exponent: 1.0,
-  top_k: 40,
-  top_p: 0.95,
-  min_p: 0.05,
-  xtc_probability: 0.0,
-  xtc_threshold: 0.1,
-  typical_p: 1.0,
-  repeat_last_n: 64,
-  repeat_penalty: 1.0,
-  presence_penalty: 0.0,
-  frequency_penalty: 0.0,
-  dry_multiplier: 0.0,
-  dry_base: 1.75,
-  dry_allowed_length: 2,
-  dry_penalty_last_n: -1,
-  max_tokens: -1,
-  custom: '', // custom json-stringified object
-};
-const CONFIG_INFO = {
-  apiKey: 'Set the API Key if you are using --api-key option for the server.',
-  systemMessage: 'The starting message that defines how model should behave.',
-  samplers: 'The order at which samplers are applied, in simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->temperature',
-  temperature: 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
-  dynatemp_range: 'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.',
-  dynatemp_exponent: 'Addon for the temperature sampler. Smoothes out the probability redistribution based on the most probable token.',
-  top_k: 'Keeps only k top tokens.',
-  top_p: 'Limits tokens to those that together have a cumulative probability of at least p',
-  min_p: 'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.',
-  xtc_probability: 'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.',
-  xtc_threshold: 'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.',
-  typical_p: 'Sorts and limits tokens based on the difference between log-probability and entropy.',
-  repeat_last_n: 'Last n tokens to consider for penalizing repetition',
-  repeat_penalty: 'Controls the repetition of token sequences in the generated text',
-  presence_penalty: 'Limits tokens based on whether they appear in the output or not.',
-  frequency_penalty: 'Limits tokens based on how often they appear in the output.',
-  dry_multiplier: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.',
-  dry_base: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.',
-  dry_allowed_length: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.',
-  dry_penalty_last_n: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets DRY penalty for the last n tokens.',
-  max_tokens: 'The maximum number of token per output.',
-  custom: '', // custom json-stringified object
-};
-// config keys having numeric value (i.e. temperature, top_k, top_p, etc)
-const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]);
-// list of themes supported by daisyui
-const THEMES = ['light', 'dark']
-  // make sure light & dark are always at the beginning
-  .concat(Object.keys(daisyuiThemes).filter(t => t !== 'light' && t !== 'dark'));
-
-// markdown support
-const VueMarkdown = defineComponent(
-  (props) => {
-    const md = shallowRef(new MarkdownIt({
-      breaks: true,
-      highlight: function (str, lang) { // Add highlight.js
-        if (lang && hljs.getLanguage(lang)) {
-          try {
-            return '<pre dir="auto"><code class="hljs">' +
-                   hljs.highlight(str, { language: lang, ignoreIllegals: true }).value +
-                   '</code></pre>';
-          } catch (__) {}
-        }
-        return '<pre dir="auto"><code class="hljs">' + md.value.utils.escapeHtml(str) + '</code></pre>';
-      }
-    }));
-    // support latex with double dollar sign and square brackets
-    md.value.use(markdownItKatexGpt, {
-      delimiters: [
-        { left: '\\[', right: '\\]', display: true },
-        { left: '\\(', right: '\\)', display: false },
-        { left: '$$', right: '$$', display: false },
-        // do not add single dollar sign here, other wise it will confused with dollar used for money symbol
-      ],
-      throwOnError: false,
-    });
-    // support latex with single dollar sign
-    md.value.use(markdownItKatexNormal, { throwOnError: false });
-    // add copy button to code blocks
-    const origFenchRenderer = md.value.renderer.rules.fence;
-    md.value.renderer.rules.fence = (tokens, idx, ...args) => {
-      const content = tokens[idx].content;
-      const origRendered = origFenchRenderer(tokens, idx, ...args);
-      return `<div class="relative my-4">
-        <div class="text-right sticky top-4 mb-2 mr-2 h-0">
-          <button class="badge btn-mini" onclick="copyStr(${escapeAttr(JSON.stringify(content))})">📋 Copy</button>
-        </div>
-        ${origRendered}
-      </div>`;
-    };
-    window.copyStr = copyStr;
-    const content = computed(() => md.value.render(props.source));
-    return () => h('div', { innerHTML: content.value });
-  },
-  { props: ['source'] }
-);
-
-// input field to be used by settings modal
-// The markup comes from the element with id 'settings-modal-short-input' in the page.
-const SettingsModalShortInput = defineComponent({
-  template: document.getElementById('settings-modal-short-input').innerHTML,
-  props: {
-    // optional caption for the field
-    label: { type: String, required: false },
-    // name of the setting this input edits
-    configKey: String,
-    // presumably the CONFIG_DEFAULT / CONFIG_INFO maps exposed by the root app — confirm in the template
-    configDefault: Object,
-    configInfo: Object,
-    // current value of the field
-    modelValue: [Object, String, Number],
-  },
-});
-
-// message bubble component
-const MessageBubble = defineComponent({
-  components: {
-    VueMarkdown
-  },
-  template: document.getElementById('message-bubble').innerHTML,
-  props: {
-    config: Object,
-    msg: Object,
-    isGenerating: Boolean,
-    showThoughtInProgress: Boolean,
-    editUserMsgAndRegenerate: Function,
-    regenerateMsg: Function,
-  },
-  data() {
-    return {
-      editingContent: null,
-    };
-  },
-  computed: {
-    timings() {
-      if (!this.msg.timings) return null;
-      return {
-        ...this.msg.timings,
-        prompt_per_second: this.msg.timings.prompt_n / (this.msg.timings.prompt_ms / 1000),
-        predicted_per_second: this.msg.timings.predicted_n / (this.msg.timings.predicted_ms / 1000),
-      };
-    },
-    splitMsgContent() {
-      const content = this.msg.content;
-      if (this.msg.role !== 'assistant') {
-        return { content };
-      }
-      let actualContent = '';
-      let cot = '';
-      let isThinking = false;
-      let thinkSplit = content.split('<think>', 2);
-      actualContent += thinkSplit[0];
-      while (thinkSplit[1] !== undefined) {
-        // <think> tag found
-        thinkSplit = thinkSplit[1].split('</think>', 2);
-        cot += thinkSplit[0];
-        isThinking = true;
-        if (thinkSplit[1] !== undefined) {
-          // </think> closing tag found
-          isThinking = false;
-          thinkSplit = thinkSplit[1].split('<think>', 2);
-          actualContent += thinkSplit[0];
-        }
-      }
-      return { content: actualContent, cot, isThinking };
-    },
-  },
-  methods: {
-    copyMsg() {
-      copyStr(this.msg.content);
-    },
-    editMsg() {
-      this.editUserMsgAndRegenerate({
-        ...this.msg,
-        content: this.editingContent,
-      });
-      this.editingContent = null;
-    },
-  },
-});
-
-// conversations are stored in localStorage
-// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
-// convId is a string prefixed with 'conv-'
-const StorageUtils = {
-  /**
-   * manage conversations
-   * list every stored conversation, most recently modified first
-   * @returns {Array<Conversation>}
-   */
-  getAllConversations() {
-    const res = [];
-    for (const key in localStorage) {
-      if (key.startsWith('conv-')) {
-        res.push(JSON.parse(localStorage.getItem(key)));
-      }
-    }
-    res.sort((a, b) => b.lastModified - a.lastModified);
-    return res;
-  },
-  /**
-   * can return null if convId does not exist
-   * @param {string} convId
-   * @returns {Conversation | null}
-   */
-  getOneConversation(convId) {
-    return JSON.parse(localStorage.getItem(convId) || 'null');
-  },
-  /**
-   * if convId does not exist, create one
-   * messages whose content is null are ignored
-   * @param {string} convId
-   * @param {Message} msg
-   */
-  appendMsg(convId, msg) {
-    if (msg.content === null) return;
-    const conv = StorageUtils.getOneConversation(convId) || {
-      id: convId,
-      lastModified: Date.now(),
-      messages: [],
-    };
-    conv.messages.push(msg);
-    conv.lastModified = Date.now();
-    localStorage.setItem(convId, JSON.stringify(conv));
-  },
-  /**
-   * Get new conversation id
-   * @returns {string}
-   */
-  getNewConvId() {
-    return `conv-${Date.now()}`;
-  },
-  /**
-   * remove conversation by id
-   * @param {string} convId
-   */
-  remove(convId) {
-    localStorage.removeItem(convId);
-  },
-  /**
-   * keep only the messages for which predicate returns true
-   * does nothing if convId does not exist
-   * @param {string} convId
-   * @param {(msg: Message) => boolean} predicate
-   */
-  filterAndKeepMsgs(convId, predicate) {
-    const conv = StorageUtils.getOneConversation(convId);
-    if (!conv) return;
-    conv.messages = conv.messages.filter(predicate);
-    conv.lastModified = Date.now();
-    localStorage.setItem(convId, JSON.stringify(conv));
-  },
-  /**
-   * remove last message from conversation
-   * the conversation itself is removed once it has no messages left
-   * @param {string} convId
-   * @returns {Message | undefined}
-   */
-  popMsg(convId) {
-    const conv = StorageUtils.getOneConversation(convId);
-    if (!conv) return;
-    const msg = conv.messages.pop();
-    conv.lastModified = Date.now();
-    if (conv.messages.length === 0) {
-      StorageUtils.remove(convId);
-    } else {
-      localStorage.setItem(convId, JSON.stringify(conv));
-    }
-    return msg;
-  },
-
-  // manage config
-  getConfig() {
-    const savedVal = JSON.parse(localStorage.getItem('config') || '{}');
-    // to prevent breaking changes in the future, we always provide default value for missing keys
-    return {
-      ...CONFIG_DEFAULT,
-      ...savedVal,
-    };
-  },
-  setConfig(config) {
-    localStorage.setItem('config', JSON.stringify(config));
-  },
-  // the theme defaults to 'auto', which is represented by the absence of the 'theme' key
-  getTheme() {
-    return localStorage.getItem('theme') || 'auto';
-  },
-  setTheme(theme) {
-    if (theme === 'auto') {
-      localStorage.removeItem('theme');
-    } else {
-      localStorage.setItem('theme', theme);
-    }
-  },
-};
-
-// scroll to bottom of chat messages
-// if requiresNearBottom is true, only auto-scroll if user is near bottom
-const chatScrollToBottom = (requiresNearBottom) => {
-  const msgListElem = document.getElementById('messages-list');
-  const spaceToBottom = msgListElem.scrollHeight - msgListElem.scrollTop - msgListElem.clientHeight;
-  if (!requiresNearBottom || (spaceToBottom < 100)) {
-    setTimeout(() => msgListElem.scrollTo({ top: msgListElem.scrollHeight }), 1);
-  }
-};
-
-// wrapper for SSE
-async function* sendSSEPostRequest(url, fetchOptions) {
-  const res = await fetch(url, fetchOptions);
-  const lines = res.body
-    .pipeThrough(new TextDecoderStream())
-    .pipeThrough(new TextLineStream());
-  for await (const line of asyncIterator(lines)) {
-    if (isDev) console.log({line});
-    if (line.startsWith('data:') && !line.endsWith('[DONE]')) {
-      const data = JSON.parse(line.slice(5));
-      yield data;
-    } else if (line.startsWith('error:')) {
-      const data = JSON.parse(line.slice(6));
-      throw new Error(data.message || 'Unknown error');
-    }
-  }
-};
-
-const mainApp = createApp({
-  components: {
-    VueMarkdown,
-    SettingsModalShortInput,
-    MessageBubble,
-  },
-  // reactive state of the root app
-  data() {
-    return {
-      conversations: StorageUtils.getAllConversations(),
-      /** @type {Array<Message>} */
-      messages: [],
-      viewingConvId: StorageUtils.getNewConvId(),
-      inputMsg: '',
-      isGenerating: false,
-      /** @type {Message | null} */
-      pendingMsg: null, // the on-going message from assistant
-      stopGeneration: () => {},
-      selectedTheme: StorageUtils.getTheme(),
-      config: StorageUtils.getConfig(),
-      showConfigDialog: false,
-      // const
-      themes: THEMES,
-      /** @type {CONFIG_DEFAULT} */
-      configDefault: {...CONFIG_DEFAULT},
-      configInfo: {...CONFIG_INFO},
-      isDev,
-    }
-  },
-  computed: {},
-  mounted() {
-    document.getElementById('app').classList.remove('opacity-0'); // show app
-    // scroll to the bottom when the pending message height is updated
-    const pendingMsgElem = document.getElementById('pending-msg');
-    const resizeObserver = new ResizeObserver(() => {
-      if (this.isGenerating) chatScrollToBottom(true);
-    });
-    resizeObserver.observe(pendingMsgElem);
-    this.setSelectedTheme(this.selectedTheme);
-  },
-  watch: {
-    viewingConvId: function(val, oldVal) {
-      if (val != oldVal) {
-        this.fetchMessages();
-        chatScrollToBottom();
-        this.hideSidebar();
-      }
-    }
-  },
-  methods: {
-    // uncheck the 'toggle-drawer' checkbox
-    hideSidebar() {
-      document.getElementById('toggle-drawer').checked = false;
-    },
-    // apply the theme to <body> and persist the choice
-    setSelectedTheme(theme) {
-      this.selectedTheme = theme;
-      document.body.setAttribute('data-theme', theme);
-      // themes unknown to daisyui (e.g. 'auto') get the 'auto' color scheme
-      document.body.setAttribute('data-color-scheme', daisyuiThemes[theme]?.['color-scheme'] ?? 'auto');
-      StorageUtils.setTheme(theme);
-    },
-    // switch to a fresh conversation id; ignored while a reply is being generated
-    newConversation() {
-      if (this.isGenerating) return;
-      this.viewingConvId = StorageUtils.getNewConvId();
-    },
-    // switch to an existing conversation; ignored while a reply is being generated
-    setViewingConv(convId) {
-      if (this.isGenerating) return;
-      this.viewingConvId = convId;
-    },
-    deleteConv(convId) {
-      if (this.isGenerating) return;
-      if (window.confirm('Are you sure to delete this conversation?')) {
-        StorageUtils.remove(convId);
-        if (this.viewingConvId === convId) {
-          this.viewingConvId = StorageUtils.getNewConvId();
-        }
-        this.fetchConversation();
-        this.fetchMessages();
-      }
-    },
-    downloadConv(convId) {
-      const conversation = StorageUtils.getOneConversation(convId);
-      if (!conversation) {
-        alert('Conversation not found.');
-        return;
-      }
-      const conversationJson = JSON.stringify(conversation, null, 2);
-      const blob = new Blob([conversationJson], { type: 'application/json' });
-      const url = URL.createObjectURL(blob);
-      const a = document.createElement('a');
-      a.href = url;
-      a.download = `conversation_${convId}.json`;
-      document.body.appendChild(a);
-      a.click();
-      document.body.removeChild(a);
-      URL.revokeObjectURL(url);
-    },
-    async sendMessage() {
-      if (!this.inputMsg) return;
-      const currConvId = this.viewingConvId;
-
-      StorageUtils.appendMsg(currConvId, {
-        id: Date.now(),
-        role: 'user',
-        content: this.inputMsg,
-      });
-      this.fetchConversation();
-      this.fetchMessages();
-      this.inputMsg = '';
-      this.generateMessage(currConvId);
-      chatScrollToBottom();
-    },
-    async generateMessage(currConvId) {
-      if (this.isGenerating) return;
-      this.pendingMsg = { id: Date.now()+1, role: 'assistant', content: null };
-      this.isGenerating = true;
-
-      try {
-        /** @type {CONFIG_DEFAULT} */
-        const config = this.config;
-        const abortController = new AbortController();
-        this.stopGeneration = () => abortController.abort();
-        /** @type {Array<APIMessage>} */
-        let messages = [
-          { role: 'system', content: config.systemMessage },
-          ...normalizeMsgsForAPI(this.messages),
-        ];
-        if (config.excludeThoughtOnReq) {
-          messages = filterThoughtFromMsgs(messages);
-        }
-        if (isDev) console.log({messages});
-        const params = {
-          messages,
-          stream: true,
-          cache_prompt: true,
-          samplers: config.samplers,
-          temperature: config.temperature,
-          dynatemp_range: config.dynatemp_range,
-          dynatemp_exponent: config.dynatemp_exponent,
-          top_k: config.top_k,
-          top_p: config.top_p,
-          min_p: config.min_p,
-          typical_p: config.typical_p,
-          xtc_probability: config.xtc_probability,
-          xtc_threshold: config.xtc_threshold,
-          repeat_last_n: config.repeat_last_n,
-          repeat_penalty: config.repeat_penalty,
-          presence_penalty: config.presence_penalty,
-          frequency_penalty: config.frequency_penalty,
-          dry_multiplier: config.dry_multiplier,
-          dry_base: config.dry_base,
-          dry_allowed_length: config.dry_allowed_length,
-          dry_penalty_last_n: config.dry_penalty_last_n,
-          max_tokens: config.max_tokens,
-          timings_per_token: !!config.showTokensPerSecond,
-          ...(config.custom.length ? JSON.parse(config.custom) : {}),
-        };
-        const chunks = sendSSEPostRequest(`${BASE_URL}/v1/chat/completions`, {
-          method: 'POST',
-          headers: {
-            'Content-Type': 'application/json',
-            ...(config.apiKey ? {'Authorization': `Bearer ${config.apiKey}`} : {})
-          },
-          body: JSON.stringify(params),
-          signal: abortController.signal,
-        });
-        for await (const chunk of chunks) {
-          const stop = chunk.stop;
-          const addedContent = chunk.choices[0].delta.content;
-          const lastContent = this.pendingMsg.content || '';
-          if (addedContent) {
-            this.pendingMsg = {
-              id: this.pendingMsg.id,
-              role: 'assistant',
-              content: lastContent + addedContent,
-            };
-          }
-          const timings = chunk.timings;
-          if (timings && config.showTokensPerSecond) {
-            // only extract what's really needed, to save some space
-            this.pendingMsg.timings = {
-              prompt_n: timings.prompt_n,
-              prompt_ms: timings.prompt_ms,
-              predicted_n: timings.predicted_n,
-              predicted_ms: timings.predicted_ms,
-            };
-          }
-        }
-
-        StorageUtils.appendMsg(currConvId, this.pendingMsg);
-        this.fetchConversation();
-        this.fetchMessages();
-        setTimeout(() => document.getElementById('msg-input').focus(), 1);
-      } catch (error) {
-        if (error.name === 'AbortError') {
-          // user stopped the generation via stopGeneration() function
-          StorageUtils.appendMsg(currConvId, this.pendingMsg);
-          this.fetchConversation();
-          this.fetchMessages();
-        } else {
-          console.error(error);
-          alert(error);
-          // pop last user message
-          const lastUserMsg = StorageUtils.popMsg(currConvId);
-          this.inputMsg = lastUserMsg ? lastUserMsg.content : '';
-        }
-      }
-
-      this.pendingMsg = null;
-      this.isGenerating = false;
-      this.stopGeneration = () => {};
-      this.fetchMessages();
-      chatScrollToBottom();
-    },
-
-    // message actions
-    regenerateMsg(msg) {
-      if (this.isGenerating) return;
-      // TODO: somehow keep old history (like how ChatGPT has different "tree"). This can be done by adding "sub-conversations" with "subconv-" prefix, and new message will have a list of subconvIds
-      const currConvId = this.viewingConvId;
-      StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
-      this.fetchConversation();
-      this.fetchMessages();
-      this.generateMessage(currConvId);
-    },
-    editUserMsgAndRegenerate(msg) {
-      if (this.isGenerating) return;
-      const currConvId = this.viewingConvId;
-      const newContent = msg.content;
-      StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
-      StorageUtils.appendMsg(currConvId, {
-        id: Date.now(),
-        role: 'user',
-        content: newContent,
-      });
-      this.fetchConversation();
-      this.fetchMessages();
-      this.generateMessage(currConvId);
-    },
-
-    // settings dialog methods
-    closeAndSaveConfigDialog() {
-      try {
-        if (this.config.custom.length) JSON.parse(this.config.custom);
-      } catch (error) {
-        alert('Invalid JSON for custom config. Please either fix it or leave it empty.');
-        return;
-      }
-      for (const key of CONFIG_NUMERIC_KEYS) {
-        if (isNaN(this.config[key]) || this.config[key].toString().trim().length === 0) {
-          alert(`Invalid number for ${key} (expected an integer or a float)`);
-          return;
-        }
-        this.config[key] = parseFloat(this.config[key]);
-      }
-      this.showConfigDialog = false;
-      StorageUtils.setConfig(this.config);
-    },
-    // close the dialog and throw away unsaved edits by reloading the stored config
-    closeAndDiscardConfigDialog() {
-      this.showConfigDialog = false;
-      this.config = StorageUtils.getConfig();
-    },
-    // after confirmation, reset the in-memory config to the defaults
-    // (nothing is written to storage here)
-    resetConfigDialog() {
-      if (window.confirm('Are you sure to reset all settings?')) {
-        this.config = {...CONFIG_DEFAULT};
-      }
-    },
-
-    // sync state functions
-    // reload the conversation list from storage
-    fetchConversation() {
-      this.conversations = StorageUtils.getAllConversations();
-    },
-    // reload the messages of the viewed conversation; empty list when it is not stored yet
-    fetchMessages() {
-      this.messages = StorageUtils.getOneConversation(this.viewingConvId)?.messages ?? [];
-    },
-
-    // debug functions
-    async debugImportDemoConv() {
-      const res = await fetch('/demo-conversation.json');
-      const demoConv = await res.json();
-      StorageUtils.remove(demoConv.id);
-      for (const msg of demoConv.messages) {
-        StorageUtils.appendMsg(demoConv.id, msg);
-      }
-      this.fetchConversation();
-    }
-  },
-});
-// errors reported to Vue's app-level error handler are shown in an alert box
-mainApp.config.errorHandler = alert;
-try {
-  mainApp.mount('#app');
-} catch (err) {
-  console.error(err);
-  // mounting failed: replace the app with a button that clears localStorage and reloads
-  document.getElementById('app').innerHTML = `<div style="margin:2em auto">
-    Failed to start app. Please try clearing localStorage and try again.<br/>
-    <br/>
-    <button class="btn" onClick="localStorage.clear(); window.location.reload();">Clear localStorage</button>
-  </div>`;
-}
-
-/**
- * filter out redundant fields upon sending to API
- * @param {Array<APIMessage>} messages
- * @returns {Array<APIMessage>}
- */
-function normalizeMsgsForAPI(messages) {
-  return messages.map((msg) => {
-    return {
-      role: msg.role,
-      content: msg.content,
-    };
-  });
-}
-
-/**
- * recommended for DeepsSeek-R1, filter out content between <think> and </think> tags
- * @param {Array<APIMessage>} messages
- * @returns {Array<APIMessage>}
- */
-function filterThoughtFromMsgs(messages) {
-  return messages.map((msg) => {
-    return {
-      role: msg.role,
-      content: msg.role === 'assistant'
-        ? msg.content.split('</think>').at(-1).trim()
-        : msg.content,
-    };
-  });
-}
--- a/examples/server/webui/src/main.tsx
+++ b/examples/server/webui/src/main.tsx
@ -0,0 +1,10 @@
+import { StrictMode } from 'react';
+import { createRoot } from 'react-dom/client';
+import './index.scss';
+import App from './App.tsx';
+
+// the element with id 'root' is asserted to exist (non-null assertion)
+const rootElement = document.getElementById('root')!;
+const root = createRoot(rootElement);
+
+root.render(
+  <StrictMode>
+    <App />
+  </StrictMode>
+);
--- a/examples/server/webui/src/utils/app.context.tsx
+++ b/examples/server/webui/src/utils/app.context.tsx
@ -0,0 +1,327 @@
+import React, { createContext, useContext, useEffect, useState } from 'react';
+import {
+  APIMessage,
+  CanvasData,
+  Conversation,
+  Message,
+  PendingMessage,
+} from './types';
+import StorageUtils from './storage';
+import {
+  filterThoughtFromMsgs,
+  normalizeMsgsForAPI,
+  getSSEStreamAsync,
+} from './misc';
+import { BASE_URL, CONFIG_DEFAULT, isDev } from '../Config';
+import { matchPath, useLocation } from 'react-router';
+
+/** Shape of the value shared through AppContext. */
+interface AppContextValue {
+  // conversations and messages
+  /** conversation matching the convId of the current URL, null when there is none */
+  viewingConversation: Conversation | null;
+  /** in-flight assistant messages, keyed by conversation id */
+  pendingMessages: Record<Conversation['id'], PendingMessage>;
+  /** true while `pendingMessages` holds an entry for the given conversation */
+  isGenerating: (convId: string) => boolean;
+  sendMessage: (
+    convId: string,
+    content: string,
+    onChunk?: CallbackGeneratedChunk
+  ) => Promise<boolean>;
+  stopGenerating: (convId: string) => void;
+  replaceMessageAndGenerate: (
+    convId: string,
+    origMsgId: Message['id'],
+    content?: string,
+    onChunk?: CallbackGeneratedChunk
+  ) => Promise<void>;
+
+  // canvas
+  /** reset to null whenever the convId of the URL changes */
+  canvasData: CanvasData | null;
+  setCanvasData: (data: CanvasData | null) => void;
+
+  // config
+  config: typeof CONFIG_DEFAULT;
+  saveConfig: (config: typeof CONFIG_DEFAULT) => void;
+  showSettings: boolean;
+  setShowSettings: (show: boolean) => void;
+}
+
+// for now, this callback is only used for scrolling to the bottom of the chat
+type CallbackGeneratedChunk = () => void;
+
+// The default value is an empty object cast to the context type: it is only a placeholder,
+// presumably always replaced by the value supplied by AppContextProvider — confirm at usage sites.
+// eslint-disable-next-line @typescript-eslint/no-explicit-any
+const AppContext = createContext<AppContextValue>({} as any);
+
+export const AppContextProvider = ({
+  children,
+}: {
+  children: React.ReactElement;
+}) => {
+  const { pathname } = useLocation();
+  const params = matchPath('/chat/:convId', pathname);
+  const convId = params?.params?.convId;
+
+  const [viewingConversation, setViewingConversation] =
+    useState<Conversation | null>(null);
+  const [pendingMessages, setPendingMessages] = useState<
+    Record<Conversation['id'], PendingMessage>
+  >({});
+  const [aborts, setAborts] = useState<
+    Record<Conversation['id'], AbortController>
+  >({});
+  const [config, setConfig] = useState(StorageUtils.getConfig());
+  const [canvasData, setCanvasData] = useState<CanvasData | null>(null);
+  const [showSettings, setShowSettings] = useState(false);
+
+  // handle change when the convId from URL is changed
+  useEffect(() => {
+    // also reset the canvas data
+    setCanvasData(null);
+    const handleConversationChange = (changedConvId: string) => {
+      if (changedConvId !== convId) return;
+      setViewingConversation(StorageUtils.getOneConversation(convId));
+    };
+    StorageUtils.onConversationChanged(handleConversationChange);
+    setViewingConversation(StorageUtils.getOneConversation(convId ?? ''));
+    return () => {
+      StorageUtils.offConversationChanged(handleConversationChange);
+    };
+  }, [convId]);
+
+  const setPending = (convId: string, pendingMsg: PendingMessage | null) => {
+    // if pendingMsg is null, remove the key from the object
+    if (!pendingMsg) {
+      setPendingMessages((prev) => {
+        const newState = { ...prev };
+        delete newState[convId];
+        return newState;
+      });
+    } else {
+      setPendingMessages((prev) => ({ ...prev, [convId]: pendingMsg }));
+    }
+  };
+
+  const setAbort = (convId: string, controller: AbortController | null) => {
+    if (!controller) {
+      setAborts((prev) => {
+        const newState = { ...prev };
+        delete newState[convId];
+        return newState;
+      });
+    } else {
+      setAborts((prev) => ({ ...prev, [convId]: controller }));
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////
+  // public functions
+
+  const isGenerating = (convId: string) => !!pendingMessages[convId];
+
+  const generateMessage = async (
+    convId: string,
+    onChunk?: CallbackGeneratedChunk
+  ) => {
+    if (isGenerating(convId)) return;
+
+    const config = StorageUtils.getConfig();
+    const currConversation = StorageUtils.getOneConversation(convId);
+    if (!currConversation) {
+      throw new Error('Current conversation is not found');
+    }
+
+    const abortController = new AbortController();
+    setAbort(convId, abortController);
+
+    let pendingMsg: PendingMessage = {
+      id: Date.now() + 1,
+      role: 'assistant',
+      content: null,
+    };
+    setPending(convId, pendingMsg);
+
+    try {
+      // prepare messages for API
+      let messages: APIMessage[] = [
+        ...(config.systemMessage.length === 0
+          ? []
+          : [{ role: 'system', content: config.systemMessage } as APIMessage]),
+        ...normalizeMsgsForAPI(currConversation?.messages ?? []),
+      ];
+      if (config.excludeThoughtOnReq) {
+        messages = filterThoughtFromMsgs(messages);
+      }
+      if (isDev) console.log({ messages });
+
+      // prepare params
+      const params = {
+        messages,
+        stream: true,
+        cache_prompt: true,
+        samplers: config.samplers,
+        temperature: config.temperature,
+        dynatemp_range: config.dynatemp_range,
+        dynatemp_exponent: config.dynatemp_exponent,
+        top_k: config.top_k,
+        top_p: config.top_p,
+        min_p: config.min_p,
+        typical_p: config.typical_p,
+        xtc_probability: config.xtc_probability,
+        xtc_threshold: config.xtc_threshold,
+        repeat_last_n: config.repeat_last_n,
+        repeat_penalty: config.repeat_penalty,
+        presence_penalty: config.presence_penalty,
+        frequency_penalty: config.frequency_penalty,
+        dry_multiplier: config.dry_multiplier,
+        dry_base: config.dry_base,
+        dry_allowed_length: config.dry_allowed_length,
+        dry_penalty_last_n: config.dry_penalty_last_n,
+        max_tokens: config.max_tokens,
+        timings_per_token: !!config.showTokensPerSecond,
+        ...(config.custom.length ? JSON.parse(config.custom) : {}),
+      };
+
+      // send request
+      const fetchResponse = await fetch(`${BASE_URL}/v1/chat/completions`, {
+        method: 'POST',
+        headers: {
+          'Content-Type': 'application/json',
+          ...(config.apiKey
+            ? { Authorization: `Bearer ${config.apiKey}` }
+            : {}),
+        },
+        body: JSON.stringify(params),
+        signal: abortController.signal,
+      });
+      if (fetchResponse.status !== 200) {
+        const body = await fetchResponse.json();
+        throw new Error(body?.error?.message || 'Unknown error');
+      }
+      const chunks = getSSEStreamAsync(fetchResponse);
+      for await (const chunk of chunks) {
+        // const stop = chunk.stop;
+        if (chunk.error) {
+          throw new Error(chunk.error?.message || 'Unknown error');
+        }
+        const addedContent = chunk.choices[0].delta.content;
+        const lastContent = pendingMsg.content || '';
+        if (addedContent) {
+          pendingMsg = {
+            id: pendingMsg.id,
+            role: 'assistant',
+            content: lastContent + addedContent,
+          };
+        }
+        const timings = chunk.timings;
+        if (timings && config.showTokensPerSecond) {
+          // only extract what's really needed, to save some space
+          pendingMsg.timings = {
+            prompt_n: timings.prompt_n,
+            prompt_ms: timings.prompt_ms,
+            predicted_n: timings.predicted_n,
+            predicted_ms: timings.predicted_ms,
+          };
+        }
+        setPending(convId, pendingMsg);
+        onChunk?.();
+      }
+    } catch (err) {
+      setPending(convId, null);
+      if ((err as Error).name === 'AbortError') {
+        // user stopped the generation via stopGeneration() function
+        // we can safely ignore this error
+      } else {
+        console.error(err);
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        alert((err as any)?.message ?? 'Unknown error');
+        throw err; // rethrow
+      }
+    }
+
+    if (pendingMsg.content) {
+      StorageUtils.appendMsg(currConversation.id, {
+        id: pendingMsg.id,
+        content: pendingMsg.content,
+        role: pendingMsg.role,
+        timings: pendingMsg.timings,
+      });
+    }
+    setPending(convId, null);
+    onChunk?.(); // trigger scroll to bottom
+  };
+
+  /**
+   * Store `content` as a new user message of conversation `convId`, then
+   * generate the assistant reply.
+   *
+   * Resolves to `true` when generation completes without throwing. Resolves to
+   * `false` when the conversation is already generating, when `content` is
+   * blank, or when generation throws (the last stored message is then popped
+   * as a rollback).
+   */
+  const sendMessage = async (
+    convId: string,
+    content: string,
+    onChunk?: CallbackGeneratedChunk
+  ): Promise<boolean> => {
+    if (isGenerating(convId)) return false;
+    if (content.trim().length === 0) return false;
+
+    const userMsg: Message = { id: Date.now(), role: 'user', content };
+    StorageUtils.appendMsg(convId, userMsg);
+
+    try {
+      await generateMessage(convId, onChunk);
+    } catch (_) {
+      // rollback
+      StorageUtils.popMsg(convId);
+      return false;
+    }
+    return true;
+  };
+
+  /**
+   * Clear the pending message of `convId`, then abort the controller stored
+   * for that conversation in `aborts`, if there is one.
+   */
+  const stopGenerating = (convId: string) => {
+    setPending(convId, null);
+    const controller = aborts[convId];
+    controller?.abort();
+  };
+
+  /**
+   * Keep only the stored messages whose id is lower than `origMsgId`,
+   * optionally append `content` as a new user message, then generate a reply.
+   *
+   * When `content` is undefined (or empty) nothing is appended, so the last
+   * assistant message is simply removed and generated again.
+   * Does nothing while the conversation is already generating.
+   */
+  const replaceMessageAndGenerate = async (
+    convId: string,
+    origMsgId: Message['id'],
+    content?: string,
+    onChunk?: CallbackGeneratedChunk
+  ) => {
+    if (isGenerating(convId)) return;
+
+    const isBeforeOriginal = (msg: Message) => msg.id < origMsgId;
+    StorageUtils.filterAndKeepMsgs(convId, isBeforeOriginal);
+
+    if (content) {
+      const replacement: Message = { id: Date.now(), role: 'user', content };
+      StorageUtils.appendMsg(convId, replacement);
+    }
+
+    await generateMessage(convId, onChunk);
+  };
+
+  /**
+   * Persist `config` through `StorageUtils.setConfig`, then mirror it into
+   * the provider state via `setConfig`.
+   */
+  const saveConfig = (config: typeof CONFIG_DEFAULT) => {
+    StorageUtils.setConfig(config);
+    setConfig(config);
+  };
+
+  return (
+    <AppContext.Provider
+      value={{
+        isGenerating,
+        viewingConversation,
+        pendingMessages,
+        sendMessage,
+        stopGenerating,
+        replaceMessageAndGenerate,
+        canvasData,
+        setCanvasData,
+        config,
+        saveConfig,
+        showSettings,
+        setShowSettings,
+      }}
+    >
+      {children}
+    </AppContext.Provider>
+  );
+};
+
+/** Hook returning the current value of `AppContext`. */
+export const useAppContext = () => useContext(AppContext);
--- a/examples/server/webui/src/utils/common.tsx
+++ b/examples/server/webui/src/utils/common.tsx
@ -0,0 +1,38 @@
+/**
+ * Small square button rendering an "X" icon.
+ * A `className` prop is appended to the built-in button classes; every other
+ * prop is forwarded to the underlying `<button>`.
+ */
+export const XCloseButton: React.ElementType<
+  React.ClassAttributes<HTMLButtonElement> &
+    React.HTMLAttributes<HTMLButtonElement>
+> = ({ className, ...props }) => {
+  const buttonClasses = `btn btn-square btn-sm ${className ?? ''}`;
+  return (
+    <button className={buttonClasses} {...props}>
+      <svg
+        xmlns="http://www.w3.org/2000/svg"
+        className="h-6 w-6"
+        fill="none"
+        viewBox="0 0 24 24"
+        stroke="currentColor"
+      >
+        <path
+          strokeLinecap="round"
+          strokeLinejoin="round"
+          strokeWidth="2"
+          d="M6 18L18 6M6 6l12 12"
+        />
+      </svg>
+    </button>
+  );
+};
+
+/** Underlined link that opens `href` in a new tab, with `children` as its text. */
+export const OpenInNewTab = (props: { href: string; children: string }) => {
+  const { href, children } = props;
+  return (
+    <a
+      className="underline"
+      href={href}
+      target="_blank"
+      rel="noopener noreferrer"
+    >
+      {children}
+    </a>
+  );
+};
--- a/examples/server/webui/src/utils/misc.ts
+++ b/examples/server/webui/src/utils/misc.ts
@ -0,0 +1,90 @@
+// @ts-expect-error this package does not have typing
+import TextLineStream from 'textlinestream';
+import { APIMessage, Message } from './types';
+
+// ponyfill for missing ReadableStream asyncIterator on Safari
+import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
+import { isDev } from '../Config';
+
+/**
+ * Duck-typed string check: true when `x` exposes a truthy `toLowerCase`
+ * member. `null` and `undefined` yield `false` instead of throwing.
+ */
+// eslint-disable-next-line @typescript-eslint/no-explicit-any
+export const isString = (x: any) => !!x?.toLowerCase;
+/** True only for the primitive values `true` and `false`. */
+// eslint-disable-next-line @typescript-eslint/no-explicit-any
+export const isBoolean = (x: any) => typeof x === 'boolean';
+/**
+ * True when `n` is a primitive number other than NaN.
+ * Strings, booleans, `null`, `undefined`, arrays and other objects all yield
+ * `false`; nothing is coerced and nothing throws.
+ */
+// eslint-disable-next-line @typescript-eslint/no-explicit-any
+export const isNumeric = (n: any) => typeof n === 'number' && !Number.isNaN(n);
+/**
+ * Escape `str` for use inside a double-quoted HTML attribute value.
+ * `&` is replaced first: that keeps entities already present in the input
+ * literal, and avoids re-escaping the entities produced by the later steps.
+ */
+export const escapeAttr = (str: string) =>
+  str
+    .replace(/&/g, '&amp;')
+    .replace(/>/g, '&gt;')
+    .replace(/"/g, '&quot;');
+
+/**
+ * Wrapper for SSE: async generator over the lines of `fetchResponse.body`.
+ *
+ * Yields the parsed JSON of every line starting with `data:` that does not
+ * end with `[DONE]`. A line starting with `error:` is parsed and rethrown as
+ * an Error. All other lines are ignored.
+ * Throws when the response has no body.
+ */
+export async function* getSSEStreamAsync(fetchResponse: Response) {
+  if (!fetchResponse.body) throw new Error('Response body is empty');
+  const decoded = fetchResponse.body.pipeThrough(new TextDecoderStream());
+  const lines: ReadableStream<string> = decoded.pipeThrough(
+    new TextLineStream()
+  );
+  // @ts-expect-error asyncIterator complains about type, but it should work
+  for await (const line of asyncIterator(lines)) {
+    if (isDev) console.log({ line });
+    const isPayload = line.startsWith('data:') && !line.endsWith('[DONE]');
+    if (isPayload) {
+      yield JSON.parse(line.slice('data:'.length));
+      continue;
+    }
+    if (line.startsWith('error:')) {
+      const failure = JSON.parse(line.slice('error:'.length));
+      throw new Error(failure.message || 'Unknown error');
+    }
+  }
+}
+
+/**
+ * Copy `textToCopy` to the clipboard.
+ *
+ * Uses the async Clipboard API when it is available in a secure context;
+ * otherwise falls back to selecting an off-screen <textarea> and issuing the
+ * legacy `copy` command. The helper element is removed again afterwards.
+ */
+export const copyStr = (textToCopy: string) => {
+  // Navigator clipboard api needs a secure context (https)
+  if (navigator.clipboard && window.isSecureContext) {
+    // writeText() is async; report a rejection instead of leaving it unhandled
+    navigator.clipboard.writeText(textToCopy).catch(console.error);
+    return;
+  }
+  // Use the 'out of viewport hidden text area' trick
+  const textArea = document.createElement('textarea');
+  textArea.value = textToCopy;
+  // Move textarea out of the viewport so it's not visible
+  textArea.style.position = 'absolute';
+  textArea.style.left = '-999999px';
+  document.body.prepend(textArea);
+  try {
+    textArea.select();
+    document.execCommand('copy');
+  } finally {
+    // without this every copy would leave another hidden textarea in <body>
+    textArea.remove();
+  }
+};
+
+/**
+ * Reduce each message to the `role` / `content` pair sent to the API,
+ * dropping every other field.
+ */
+export function normalizeMsgsForAPI(messages: Message[]) {
+  const stripped = messages.map(({ role, content }) => ({ role, content }));
+  return stripped as APIMessage[];
+}
+
+/**
+ * Recommended for DeepSeek-R1: for assistant messages, keep only the trimmed
+ * text that follows the last `</think>` tag (the whole trimmed content when
+ * the tag is absent). Messages of other roles keep their content untouched.
+ */
+export function filterThoughtFromMsgs(messages: APIMessage[]) {
+  const THINK_END = '</think>';
+  const stripThought = (text: string) => {
+    const tagPos = text.lastIndexOf(THINK_END);
+    const answer = tagPos === -1 ? text : text.slice(tagPos + THINK_END.length);
+    return answer.trim();
+  };
+  return messages.map((msg) => {
+    const content =
+      msg.role === 'assistant' ? stripThought(msg.content) : msg.content;
+    return { role: msg.role, content } as APIMessage;
+  });
+}
+
+/**
+ * Join the keys of `classes` whose value is truthy into one space-separated
+ * string, preserving the object's entry order.
+ */
+export function classNames(classes: Record<string, boolean>): string {
+  const enabled: string[] = [];
+  for (const [name, isOn] of Object.entries(classes)) {
+    if (isOn) enabled.push(name);
+  }
+  return enabled.join(' ');
+}
+
+/** Promise that resolves once a `setTimeout` of `ms` milliseconds fires. */
+export const delay = (ms: number) =>
+  new Promise((resolve) => {
+    setTimeout(resolve, ms);
+  });
--- a/examples/server/webui/src/utils/storage.ts
+++ b/examples/server/webui/src/utils/storage.ts
@ -0,0 +1,138 @@
+// conversations are stored in localStorage
+// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
+
+import { CONFIG_DEFAULT } from '../Config';
+import { Conversation, Message } from './types';
+
+// module-level event target on which 'conversationChange' events are dispatched
+const event = new EventTarget();
+
+// signature of a subscriber: it receives the id of the changed conversation
+type CallbackConversationChanged = (convId: string) => void;
+// registry pairing each subscriber with the EventListener attached to `event`
+let onConversationChangedHandlers: [
+  CallbackConversationChanged,
+  EventListener,
+][] = [];
+// dispatch a 'conversationChange' event carrying `convId` in its detail
+const dispatchConversationChange = (convId: string) => {
+  const changeEvent = new CustomEvent('conversationChange', {
+    detail: { convId },
+  });
+  event.dispatchEvent(changeEvent);
+};
+
+// convId is a string prefixed with 'conv-'
+/**
+ * Helpers for the conversations, config and theme kept in localStorage.
+ * Every method that modifies a conversation dispatches a 'conversationChange'
+ * event so that subscribers can react.
+ */
+const StorageUtils = {
+  // manage conversations
+
+  /**
+   * return every stored conversation, most recently modified first
+   */
+  getAllConversations(): Conversation[] {
+    const res: Conversation[] = [];
+    for (const key in localStorage) {
+      if (key.startsWith('conv-')) {
+        res.push(JSON.parse(localStorage.getItem(key) ?? '{}'));
+      }
+    }
+    res.sort((a, b) => b.lastModified - a.lastModified);
+    return res;
+  },
+  /**
+   * can return null if convId does not exist
+   */
+  getOneConversation(convId: string): Conversation | null {
+    return JSON.parse(localStorage.getItem(convId) || 'null');
+  },
+  /**
+   * append msg to the conversation; if convId does not exist, create one.
+   * messages whose content is null are ignored
+   */
+  appendMsg(convId: string, msg: Message): void {
+    if (msg.content === null) return;
+    const conv = StorageUtils.getOneConversation(convId) || {
+      id: convId,
+      lastModified: Date.now(),
+      messages: [],
+    };
+    conv.messages.push(msg);
+    conv.lastModified = Date.now();
+    localStorage.setItem(convId, JSON.stringify(conv));
+    dispatchConversationChange(convId);
+  },
+  /**
+   * Get new conversation id
+   */
+  getNewConvId(): string {
+    return `conv-${Date.now()}`;
+  },
+  /**
+   * remove conversation by id
+   */
+  remove(convId: string): void {
+    localStorage.removeItem(convId);
+    dispatchConversationChange(convId);
+  },
+  /**
+   * keep only the messages of a conversation for which predicate returns true;
+   * does nothing if convId does not exist
+   */
+  filterAndKeepMsgs(
+    convId: string,
+    predicate: (msg: Message) => boolean
+  ): void {
+    const conv = StorageUtils.getOneConversation(convId);
+    if (!conv) return;
+    conv.messages = conv.messages.filter(predicate);
+    conv.lastModified = Date.now();
+    localStorage.setItem(convId, JSON.stringify(conv));
+    dispatchConversationChange(convId);
+  },
+  /**
+   * remove last message from conversation and return it;
+   * a conversation left without messages is removed entirely
+   */
+  popMsg(convId: string): Message | undefined {
+    const conv = StorageUtils.getOneConversation(convId);
+    if (!conv) return;
+    const msg = conv.messages.pop();
+    conv.lastModified = Date.now();
+    if (conv.messages.length === 0) {
+      // remove() already dispatches the change event, so do not dispatch again
+      StorageUtils.remove(convId);
+    } else {
+      localStorage.setItem(convId, JSON.stringify(conv));
+      dispatchConversationChange(convId);
+    }
+    return msg;
+  },
+
+  // event listeners
+
+  /**
+   * subscribe callback to conversation changes; it receives the convId
+   */
+  onConversationChanged(callback: CallbackConversationChanged) {
+    const fn = (e: Event) => callback((e as CustomEvent).detail.convId);
+    onConversationChangedHandlers.push([callback, fn]);
+    event.addEventListener('conversationChange', fn);
+  },
+  /**
+   * unsubscribe a callback registered with onConversationChanged;
+   * other subscribers stay registered
+   */
+  offConversationChanged(callback: CallbackConversationChanged) {
+    const entry = onConversationChangedHandlers.find(
+      ([cb, _]) => cb === callback
+    );
+    if (!entry) return;
+    event.removeEventListener('conversationChange', entry[1]);
+    // drop only this entry: clearing the whole registry would make the
+    // listeners of the remaining subscribers impossible to detach later
+    onConversationChangedHandlers = onConversationChangedHandlers.filter(
+      (handler) => handler !== entry
+    );
+  },
+
+  // manage config
+
+  /**
+   * return the saved config merged over CONFIG_DEFAULT
+   */
+  getConfig(): typeof CONFIG_DEFAULT {
+    const savedVal = JSON.parse(localStorage.getItem('config') || '{}');
+    // to prevent breaking changes in the future, we always provide default value for missing keys
+    return {
+      ...CONFIG_DEFAULT,
+      ...savedVal,
+    };
+  },
+  /**
+   * save config under the 'config' key
+   */
+  setConfig(config: typeof CONFIG_DEFAULT) {
+    localStorage.setItem('config', JSON.stringify(config));
+  },
+  /**
+   * return the saved theme, or 'auto' when none is saved
+   */
+  getTheme(): string {
+    return localStorage.getItem('theme') || 'auto';
+  },
+  /**
+   * save theme; 'auto' is represented by removing the saved value
+   */
+  setTheme(theme: string) {
+    if (theme === 'auto') {
+      localStorage.removeItem('theme');
+    } else {
+      localStorage.setItem('theme', theme);
+    }
+  },
+};
+
+export default StorageUtils;
--- a/examples/server/webui/src/utils/types.ts
+++ b/examples/server/webui/src/utils/types.ts
@ -0,0 +1,36 @@
+/**
+ * Timing figures that can be attached to a message.
+ * NOTE(review): the `_n` / `_ms` suffixes suggest token counts and
+ * milliseconds for the prompt and predicted phases — confirm against the
+ * server's `timings` payload.
+ */
+export interface TimingReport {
+  prompt_n: number;
+  prompt_ms: number;
+  predicted_n: number;
+  predicted_ms: number;
+}
+
+/**
+ * A chat message: numeric id, author role, text content and optional timings.
+ */
+export interface Message {
+  id: number;
+  role: 'user' | 'assistant' | 'system';
+  content: string;
+  timings?: TimingReport;
+}
+
+/** The `role` and `content` fields of a Message, without `id` or `timings`. */
+export type APIMessage = Pick<Message, 'role' | 'content'>;
+
+/** A conversation: its id, last-modified timestamp and list of messages. */
+export interface Conversation {
+  id: string; // format: `conv-{timestamp}`
+  lastModified: number; // timestamp from Date.now()
+  messages: Message[];
+}
+
+/** A Message whose `content` may also be `null`. */
+export type PendingMessage = Omit<Message, 'content'> & {
+  content: string | null;
+};
+
+/**
+ * Discriminator for the kinds of canvas data.
+ * NOTE(review): PY_INTERPRETER presumably denotes a Python interpreter panel — confirm.
+ */
+export enum CanvasType {
+  PY_INTERPRETER,
+}
+
+/**
+ * Canvas payload tagged with `CanvasType.PY_INTERPRETER`.
+ * NOTE(review): `content` is presumably the source code to run — confirm.
+ */
+export interface CanvasPyInterpreter {
+  type: CanvasType.PY_INTERPRETER;
+  content: string;
+}
+
+/** Canvas payload type; currently an alias of its single variant, CanvasPyInterpreter. */
+export type CanvasData = CanvasPyInterpreter;
--- a/examples/server/webui/src/vite-env.d.ts
+++ b/examples/server/webui/src/vite-env.d.ts
@ -0,0 +1 @@
+/// <reference types="vite/client" />
--- a/examples/server/webui/tsconfig.app.json
+++ b/examples/server/webui/tsconfig.app.json
@ -0,0 +1,26 @@
+{
+  "compilerOptions": {
+    "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo",
+    "target": "ES2021",
+    "useDefineForClassFields": true,
+    "lib": ["ES2021", "DOM", "DOM.Iterable"],
+    "module": "ESNext",
+    "skipLibCheck": true,
+
+    /* Bundler mode */
+    "moduleResolution": "bundler",
+    "allowImportingTsExtensions": true,
+    "isolatedModules": true,
+    "moduleDetection": "force",
+    "noEmit": true,
+    "jsx": "react-jsx",
+
+    /* Linting */
+    "strict": true,
+    "noUnusedLocals": true,
+    "noUnusedParameters": true,
+    "noFallthroughCasesInSwitch": true,
+    "noUncheckedSideEffectImports": true
+  },
+  "include": ["src"]
+}
--- a/examples/server/webui/tsconfig.json
+++ b/examples/server/webui/tsconfig.json
@ -0,0 +1,7 @@
+{
+  "files": [],
+  "references": [
+    { "path": "./tsconfig.app.json" },
+    { "path": "./tsconfig.node.json" }
+  ]
+}
--- a/examples/server/webui/tsconfig.node.json
+++ b/examples/server/webui/tsconfig.node.json
@ -0,0 +1,24 @@
+{
+  "compilerOptions": {
+    "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.node.tsbuildinfo",
+    "target": "ES2022",
+    "lib": ["ES2023"],
+    "module": "ESNext",
+    "skipLibCheck": true,
+
+    /* Bundler mode */
+    "moduleResolution": "bundler",
+    "allowImportingTsExtensions": true,
+    "isolatedModules": true,
+    "moduleDetection": "force",
+    "noEmit": true,
+
+    /* Linting */
+    "strict": true,
+    "noUnusedLocals": true,
+    "noUnusedParameters": true,
+    "noFallthroughCasesInSwitch": true,
+    "noUncheckedSideEffectImports": true
+  },
+  "include": ["vite.config.ts"]
+}
--- a/examples/server/webui/vite.config.ts
+++ b/examples/server/webui/vite.config.ts
@ -1,8 +1,11 @@
-
+import { defineConfig, PluginOption } from 'vite';
+import react from '@vitejs/plugin-react';
 import { viteSingleFile } from 'vite-plugin-singlefile';
-import path from 'path';
-import fs from 'fs';
-import zlib from 'zlib';
+import path from 'node:path';
+import fs from 'node:fs';
+import zlib from 'node:zlib';
+
+/* eslint-disable */

 const MAX_BUNDLE_SIZE = 1.5 * 1024 * 1024; // only increase when absolutely necessary

@ -15,20 +18,26 @@ const GUIDE_FOR_FRONTEND = `
 -->
 `.trim();

+const FRONTEND_PLUGINS = [react()];
+
 const BUILD_PLUGINS = [
+  ...FRONTEND_PLUGINS,
  viteSingleFile(),
  (function llamaCppPlugin() {
-    let config;
+    let config: any;
    return {
      name: 'llamacpp:build',
      apply: 'build',
-      async configResolved(_config) {
+      async configResolved(_config: any) {
        config = _config;
      },
      writeBundle() {
        const outputIndexHtml = path.join(config.build.outDir, 'index.html');
-        const content = GUIDE_FOR_FRONTEND + '\n' + fs.readFileSync(outputIndexHtml, 'utf-8');
-        const compressed = zlib.gzipSync(Buffer.from(content, 'utf-8'), { level: 9 });
+        const content =
+          GUIDE_FOR_FRONTEND + '\n' + fs.readFileSync(outputIndexHtml, 'utf-8');
+        const compressed = zlib.gzipSync(Buffer.from(content, 'utf-8'), {
+          level: 9,
+        });

        // because gzip header contains machine-specific info, we must remove these data from the header
        // timestamp
@ -42,18 +51,30 @@ const BUILD_PLUGINS = [
        if (compressed.byteLength > MAX_BUNDLE_SIZE) {
          throw new Error(
            `Bundle size is too large (${Math.ceil(compressed.byteLength / 1024)} KB).\n` +
-            `Please reduce the size of the frontend or increase MAX_BUNDLE_SIZE in vite.config.js.\n`,
+              `Please reduce the size of the frontend or increase MAX_BUNDLE_SIZE in vite.config.js.\n`
          );
        }

-        const targetOutputFile = path.join(config.build.outDir, '../../public/index.html.gz');
+        const targetOutputFile = path.join(
+          config.build.outDir,
+          '../../public/index.html.gz'
+        );
        fs.writeFileSync(targetOutputFile, compressed);
-      }
-    }
+      },
+    } satisfies PluginOption;
  })(),
 ];

-/** @type {import('vite').UserConfig} */
-export default {
-  plugins: process.env.ANALYZE ? [] : BUILD_PLUGINS,
-};
+// Vite configuration.
+// When the ANALYZE environment variable is set only FRONTEND_PLUGINS are
+// loaded; otherwise the full BUILD_PLUGINS list is used.
+export default defineConfig({
+  // @ts-ignore
+  plugins: process.env.ANALYZE ? FRONTEND_PLUGINS : BUILD_PLUGINS,
+  server: {
+    // forward requests under /v1 to http://localhost:8080
+    // NOTE(review): presumably a locally running llama.cpp server — confirm
+    proxy: {
+      '/v1': 'http://localhost:8080',
+    },
+    // extra response headers sent by the server
+    // NOTE(review): this COEP/COOP pair is what browsers require for
+    // cross-origin isolation; the feature that needs it is not visible here — confirm
+    headers: {
+      'Cross-Origin-Embedder-Policy': 'require-corp',
+      'Cross-Origin-Opener-Policy': 'same-origin',
+    },
+  },
+});