Merge branch 'master' into concedo_experimental

# Conflicts: # flake.lock # flake.nix
2023-11-01 18:24:36 +08:00 · 2023-11-01 18:24:36 +08:00 · 9342636408
commit 9342636408
parent df7e757d40 f0e209324a
11 changed files with 1933 additions and 2199 deletions
--- a/.gitignore
+++ b/.gitignore
@ -4,6 +4,7 @@
 .DS_Store
 .build/
 .cache/
+.ccls-cache/
 .direnv/
 .envrc
 .swiftpm
--- a/common/common.cpp
+++ b/common/common.cpp
@ -218,6 +218,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            sparams.top_p = std::stof(argv[i]);
+        } else if (arg == "--min-p") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.min_p = std::stof(argv[i]);
        } else if (arg == "--temp") {
            if (++i >= argc) {
                invalid_param = true;
@ -679,6 +685,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
+    printf("  --min-p N             min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
    printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
    printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
@ -1275,6 +1282,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
+    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
    fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
    fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 }
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -89,10 +89,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {

    snprintf(result, sizeof(result),
            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
-            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
            params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
-            params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp,
+            params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
            params.mirostat, params.mirostat_eta, params.mirostat_tau);

    return std::string(result);
@ -110,6 +110,7 @@ llama_token llama_sampling_sample(
    const float   temp            = params.temp;
    const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
    const float   top_p           = params.top_p;
+    const float   min_p           = params.min_p;
    const float   tfs_z           = params.tfs_z;
    const float   typical_p       = params.typical_p;
    const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
@ -190,6 +191,7 @@ llama_token llama_sampling_sample(
            llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep);
            llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep);
            llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep);
+            llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep);
            llama_sample_temp     (ctx_main, &cur_p, temp);

            id = llama_sample_token(ctx_main, &cur_p);
--- a/common/sampling.h
+++ b/common/sampling.h
@ -14,6 +14,7 @@ typedef struct llama_sampling_params {
    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
    int32_t top_k             = 40;    // <= 0 to use vocab size
    float   top_p             = 0.95f; // 1.0 = disabled
+    float   min_p             = 0.05f; // 0.0 = disabled
    float   tfs_z             = 1.00f; // 1.0 = disabled
    float   typical_p         = 1.00f; // 1.0 = disabled
    float   temp              = 0.80f; // 1.0 = disabled
--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -208,6 +208,14 @@ Top-p sampling, also known as nucleus sampling, is another text generation metho

 Example usage: `--top-p 0.95`

+### Min P Sampling
+
+-   `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.05).
+
+The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out.
+
+Example usage: `--min-p 0.05`
+
 ### Tail Free Sampling (TFS)

 -   `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -149,6 +149,7 @@ struct task_server {
    task_type type;
    json data;
    bool infill_mode = false;
+    bool embedding_mode = false;
 };

 struct task_result {
@ -371,6 +372,7 @@ struct llama_client_slot
    std::vector<completion_token_output> generated_token_probs;

    bool infill = false;
+    bool embedding = false;
    bool has_next_token = true;
    bool truncated = false;
    bool stopped_eos = false;
@ -1244,13 +1246,14 @@ struct llama_server_context
        queue_results.push_back(res);
    }

-    int request_completion(json data, bool infill)
+    int request_completion(json data, bool infill, bool embedding)
    {
        std::lock_guard<std::mutex> lock(mutex_tasks);
        task_server task;
        task.id = id_gen++;
        task.data = data;
        task.infill_mode = infill;
+        task.embedding_mode = embedding;
        task.type = COMPLETION_TASK;
        queue_tasks.push_back(task);
        return task.id;
@ -1376,7 +1379,7 @@ struct llama_server_context
                    {
                        LOG_TEE("slot unavailable\n");
                        // send error result
-                        send_error(task.id, "slot unavaliable");
+                        send_error(task.id, "slot unavailable");
                        return;
                    }

@ -1388,6 +1391,7 @@ struct llama_server_context
                    slot->reset();

                    slot->infill = task.infill_mode;
+                    slot->embedding = task.embedding_mode;
                    slot->task_id = task.id;

                    if (!launch_slot_with_data(slot, task.data))
@ -1695,7 +1699,7 @@ struct llama_server_context
                }

                // prompt evaluated for embedding
-                if (params.embedding)
+                if (slot.embedding)
                {
                    send_embedding(slot);
                    slot.release();
@ -2274,7 +2278,7 @@ int main(int argc, char **argv)
    svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
            {
                json data = json::parse(req.body);
-                const int task_id = llama.request_completion(data, false);
+                const int task_id = llama.request_completion(data, false, false);
                if (!json_value(data, "stream", false)) {
                    std::string completion_text;
                    task_result result = llama.next_result(task_id);
@ -2329,7 +2333,7 @@ int main(int argc, char **argv)
    svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
            {
                json data = json::parse(req.body);
-                const int task_id = llama.request_completion(data, true);
+                const int task_id = llama.request_completion(data, true, false);
                if (!json_value(data, "stream", false)) {
                    std::string completion_text;
                    task_result result = llama.next_result(task_id);
@ -2433,7 +2437,7 @@ int main(int argc, char **argv)
                {
                    prompt = "";
                }
-                const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false);
+                const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true);
                task_result result = llama.next_result(task_id);
                return res.set_content(result.result_json.dump(), "application/json");
            });
--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -238,14 +238,17 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
    // load kernels
    {
        NSError * error = nil;
-#define GGML_METAL_ADD_KERNEL(name) \
-        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
-        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
+
+        /*
        GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
                (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
                (int) ctx->pipeline_##name.threadExecutionWidth); \
+        */
+#define GGML_METAL_ADD_KERNEL(name) \
+        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
+        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
        if (error) { \
-          GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
+            GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
            return NULL; \
        }

--- a/ggml.h
+++ b/ggml.h
@ -716,7 +716,7 @@ extern "C" {
    // Context tensor enumeration and lookup
    GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
    GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+    GGML_API struct ggml_tensor * ggml_get_tensor      (struct ggml_context * ctx, const char * name);

    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
    GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
--- a/llama.cpp
+++ b/llama.cpp
--- a/llama.h
+++ b/llama.h
@ -600,6 +600,13 @@ extern "C" {
                           float   p,
                          size_t   min_keep);

+    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+    LLAMA_API void llama_sample_min_p(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+                           float   p,
+                          size_t   min_keep);
+
    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
    LLAMA_API void llama_sample_tail_free(
            struct llama_context * ctx,
--- a/scripts/server-llm.sh
+++ b/scripts/server-llm.sh
@ -0,0 +1,391 @@
+#!/bin/bash
+#
+# Helper script for deploying llama.cpp server with a single Bash command
+#
+# - Works on Linux and macOS
+# - Supports: CPU, CUDA, Metal, OpenCL
+# - Can run all GGUF models from HuggingFace
+# - Can serve requests in parallel
+# - Always builds latest llama.cpp from GitHub
+#
+# Limitations
+#
+# - Chat templates are poorly supported (base models recommended)
+# - Might be unstable!
+#
+# Usage:
+#   ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]
+#
+#   --port:       port number, default is 8888
+#   --repo:       path to a repo containing GGUF model files
+#   --wtype:      weights type (f16, q8_0, q4_0, q4_1), default is user-input
+#   --backend:    cpu, cuda, metal, opencl, depends on the OS
+#   --gpu-id:     gpu id, default is 0
+#   --n-parallel: number of parallel requests, default is 8
+#   --n-kv:       KV cache size, default is 4096
+#   --verbose:    verbose output
+#
+# Example:
+#
+#   bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
+#
+
+set -e
+
+# required utils: curl, git, make
+if ! command -v curl &> /dev/null; then
+    printf "[-] curl not found\n"
+    exit 1
+fi
+if ! command -v git &> /dev/null; then
+    printf "[-] git not found\n"
+    exit 1
+fi
+if ! command -v make &> /dev/null; then
+    printf "[-] make not found\n"
+    exit 1
+fi
+
+# parse arguments
+port=8888
+repo=""
+wtype=""
+backend="cpu"
+
+# if macOS, use metal backend by default
+if [[ "$OSTYPE" == "darwin"* ]]; then
+    backend="metal"
+elif command -v nvcc &> /dev/null; then
+    backend="cuda"
+fi
+
+gpu_id=0
+n_parallel=8
+n_kv=4096
+verbose=0
+
+function print_usage {
+    printf "Usage:\n"
+    printf "  ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n"
+    printf "  --port:       port number, default is 8888\n"
+    printf "  --repo:       path to a repo containing GGUF model files\n"
+    printf "  --wtype:      weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
+    printf "  --backend:    cpu, cuda, metal, opencl, depends on the OS\n"
+    printf "  --gpu-id:     gpu id, default is 0\n"
+    printf "  --n-parallel: number of parallel requests, default is 8\n"
+    printf "  --n-kv:       KV cache size, default is 4096\n"
+    printf "  --verbose:    verbose output\n\n"
+    printf "Example:\n\n"
+    printf '  bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
+}
+
+while [[ $# -gt 0 ]]; do
+    key="$1"
+    case $key in
+        --port)
+            port="$2"
+            shift
+            shift
+            ;;
+        --repo)
+            repo="$2"
+            shift
+            shift
+            ;;
+        --wtype)
+            wtype="$2"
+            shift
+            shift
+            ;;
+        --backend)
+            backend="$2"
+            shift
+            shift
+            ;;
+        --gpu-id)
+            gpu_id="$2"
+            shift
+            shift
+            ;;
+        --n-parallel)
+            n_parallel="$2"
+            shift
+            shift
+            ;;
+        --n-kv)
+            n_kv="$2"
+            shift
+            shift
+            ;;
+        --verbose)
+            verbose=1
+            shift
+            ;;
+        --help)
+            print_usage
+            exit 0
+            ;;
+        *)
+            echo "Unknown argument: $key"
+            print_usage
+            exit 1
+            ;;
+    esac
+done
+
+# available weights types
+wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")
+
+wfiles=()
+for wt in "${wtypes[@]}"; do
+    wfiles+=("")
+done
+
+# sample repos
+repos=(
+    "https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
+    "https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
+    "https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
+    "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
+    "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
+    "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
+    "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
+    "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
+    "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
+    "https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
+)
+
+printf "\n"
+printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
+printf "    Based on the options that follow, the script might download a model file\n"
+printf "    from the internet, which can be a few GBs in size. The script will also\n"
+printf "    build the latest llama.cpp source code from GitHub, which can be unstable.\n"
+printf "\n"
+printf "    Upon success, an HTTP server will be started and it will serve the selected\n"
+printf "    model using llama.cpp for demonstration purposes.\n"
+printf "\n"
+printf "    Please note:\n"
+printf "\n"
+printf "    - All new data will be stored in the current folder\n"
+printf "    - The server will be listening on all network interfaces\n"
+printf "    - The server will run with default settings which are not always optimal\n"
+printf "    - Do not judge the quality of a model based on the results from this script\n"
+printf "    - Do not use this script to benchmark llama.cpp\n"
+printf "    - Do not use this script in production\n"
+printf "    - This script is only for demonstration purposes\n"
+printf "\n"
+printf "    If you don't know what you are doing, please press Ctrl-C to abort now\n"
+printf "\n"
+printf "    Press Enter to continue ...\n\n"
+
+read
+
+if [[ -z "$repo" ]]; then
+    printf "[+] No repo provided from the command line\n"
+    printf "    Please select a number from the list below or enter an URL:\n\n"
+
+    is=0
+    for r in "${repos[@]}"; do
+        printf "    %2d) %s\n" $is "$r"
+        is=$((is+1))
+    done
+
+    # ask for repo until index of sample repo is provided or an URL
+    while [[ -z "$repo" ]]; do
+        printf "\n    Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
+        read -p "[+] Select repo: " repo
+
+        # check if the input is a number
+        if [[ "$repo" =~ ^[0-9]+$ ]]; then
+            if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
+                repo="${repos[$repo]}"
+            else
+                printf "[-] Invalid repo index: %s\n" "$repo"
+                repo=""
+            fi
+        elif [[ "$repo" =~ ^https?:// ]]; then
+            repo="$repo"
+        else
+            printf "[-] Invalid repo URL: %s\n" "$repo"
+            repo=""
+        fi
+    done
+fi
+
+# remove suffix
+repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')
+
+printf "[+] Checking for GGUF model files in %s\n" "$repo"
+
+# find GGUF files in the source
+# TODO: better logic
+model_tree="${repo%/}/tree/main"
+model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
+
+# list all files in the provided git repo
+printf "[+] Model files:\n\n"
+for file in $model_files; do
+    # determine iw by grepping the filename with wtypes
+    iw=-1
+    is=0
+    for wt in "${wtypes[@]}"; do
+        # uppercase
+        ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')
+        if [[ "$ufile" =~ "$wt" ]]; then
+            iw=$is
+            break
+        fi
+        is=$((is+1))
+    done
+
+    if [[ $iw -eq -1 ]]; then
+        continue
+    fi
+
+    wfiles[$iw]="$file"
+
+    have=" "
+    if [[ -f "$file" ]]; then
+        have="*"
+    fi
+
+    printf "    %2d) %s %s\n" $iw "$have" "$file"
+done
+
+# ask for weights type until provided and available
+while [[ -z "$wtype" ]]; do
+    printf "\n"
+    read -p "[+] Select weight type: " wtype
+    wfile="${wfiles[$wtype]}"
+
+    if [[ -z "$wfile" ]]; then
+        printf "[-] Invalid weight type: %s\n" "$wtype"
+        wtype=""
+    fi
+done
+
+printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"
+
+url="${repo%/}/resolve/main/$wfile"
+
+# check file if the model has been downloaded before
+chk="$wfile.chk"
+
+# check if we should download the file
+# - if $wfile does not exist
+# - if $wfile exists but $chk does not exist
+# - if $wfile exists and $chk exists but $wfile is newer than $chk
+# TODO: better logic using git lfs info
+
+do_download=0
+
+if [[ ! -f "$wfile" ]]; then
+    do_download=1
+elif [[ ! -f "$chk" ]]; then
+    do_download=1
+elif [[ "$wfile" -nt "$chk" ]]; then
+    do_download=1
+fi
+
+if [[ $do_download -eq 1 ]]; then
+    printf "[+] Downloading weights from %s\n" "$url"
+
+    # download the weights file
+    curl -o "$wfile" -# -L "$url"
+
+    # create a check file if successful
+    if [[ $? -eq 0 ]]; then
+        printf "[+] Creating check file %s\n" "$chk"
+        touch "$chk"
+    fi
+else
+    printf "[+] Using cached weights %s\n" "$wfile"
+fi
+
+# get latest llama.cpp and build
+
+printf "[+] Downloading latest llama.cpp\n"
+
+llama_cpp_dir="__llama_cpp_port_${port}__"
+
+if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
+    # if the dir exists and there isn't a file "__ggml_script__" in it, abort
+    printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
+    printf "[-] Please remove it and try again\n"
+    exit 1
+elif [[ -d "$llama_cpp_dir" ]]; then
+    printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
+    printf "[+] Using cached llama.cpp\n"
+
+    cd "$llama_cpp_dir"
+    git reset --hard
+    git fetch
+    git checkout origin/master
+
+    cd ..
+else
+    printf "[+] Cloning llama.cpp\n"
+
+    git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
+fi
+
+# mark that that the directory is made by this script
+touch "$llama_cpp_dir/__ggml_script__"
+
+if [[ $verbose -eq 1 ]]; then
+    set -x
+fi
+
+# build
+cd "$llama_cpp_dir"
+
+make clean
+
+log="--silent"
+if [[ $verbose -eq 1 ]]; then
+    log=""
+fi
+
+if [[ "$backend" == "cuda" ]]; then
+    printf "[+] Building with CUDA backend\n"
+    LLAMA_CUBLAS=1 make -j server $log
+elif [[ "$backend" == "cpu" ]]; then
+    printf "[+] Building with CPU backend\n"
+    make -j server $log
+elif [[ "$backend" == "metal" ]]; then
+    printf "[+] Building with Metal backend\n"
+    make -j server $log
+elif [[ "$backend" == "opencl" ]]; then
+    printf "[+] Building with OpenCL backend\n"
+    LLAMA_CLBLAST=1 make -j server $log
+else
+    printf "[-] Unknown backend: %s\n" "$backend"
+    exit 1
+fi
+
+# run the server
+
+printf "[+] Running server\n"
+
+args=""
+if [[ "$backend" == "cuda" ]]; then
+    export CUDA_VISIBLE_DEVICES=$gpu_id
+    args="-ngl 999"
+elif [[ "$backend" == "cpu" ]]; then
+    args="-ngl 0"
+elif [[ "$backend" == "metal" ]]; then
+    args="-ngl 999"
+elif [[ "$backend" == "opencl" ]]; then
+    args="-ngl 999"
+else
+    printf "[-] Unknown backend: %s\n" "$backend"
+    exit 1
+fi
+
+if [[ $verbose -eq 1 ]]; then
+    args="$args --verbose"
+fi
+
+./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args
+
+exit 0