Merge and update

commit 28046d1e52
Henri Vasserman, 2023-08-09 00:36:11 +03:00
37 changed files with 6347 additions and 2237 deletions


@ -163,7 +163,7 @@ node .
`content`: Set the text to tokenize.
-Note that the special `BOS` token is not added in fron of the text and also a space character is not inserted automatically as it is for `/completion`.
+Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
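
For illustration, both endpoints take a JSON body from the shell as below; the address is the server's default, the sample text is a placeholder, and the `/embedding` body assumes the endpoint accepts the same `content` field:

curl --silent --request POST --url http://127.0.0.1:8080/tokenize \
    --header "Content-Type: application/json" \
    --data '{"content": "Hello, world!"}'

curl --silent --request POST --url http://127.0.0.1:8080/embedding \
    --header "Content-Type: application/json" \
    --data '{"content": "Hello, world!"}'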


@ -0,0 +1,109 @@
#!/bin/bash

API_URL="${API_URL:-http://127.0.0.1:8080}"

CHAT=(
    "Hello, Assistant."
    "Hello. How may I help you today?"
)

INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."

# Remove leading and trailing whitespace.
trim() {
    shopt -s extglob
    set -- "${1##+([[:space:]])}"
    printf "%s" "${1%%+([[:space:]])}"
}

# Remove trailing whitespace only.
trim_trailing() {
    shopt -s extglob
    printf "%s" "${1%%+([[:space:]])}"
}

# Build the next prompt in the Llama 2 [INST] format.
format_prompt() {
    if [[ "${#CHAT[@]}" -eq 0 ]]; then
        echo -n "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>"
    else
        LAST_INDEX=$(( ${#CHAT[@]} - 1 ))
        echo -n "${CHAT[$LAST_INDEX]}\n[INST] $1 [/INST]"
    fi
}

# Count the tokens of a string using the server's /tokenize endpoint.
tokenize() {
    curl \
        --silent \
        --request POST \
        --url "${API_URL}/tokenize" \
        --header "Content-Type: application/json" \
        --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
    | jq '.tokens[]'
}

# Number of system-prompt tokens; passed as n_keep so the server retains
# them when the context fills up.
N_KEEP=$(tokenize "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>" | wc -l)

chat_completion() {
    PROMPT="$(trim_trailing "$(format_prompt "$1")")"
    DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{
        prompt: .,
        temperature: 0.2,
        top_k: 40,
        top_p: 0.9,
        n_keep: $n_keep,
        n_predict: 1024,
        stop: ["[INST]"],
        stream: true
    }')"

    # Create a temporary file to hold the Python output
    TEMPFILE=$(mktemp)

    exec 3< <(curl \
        --silent \
        --no-buffer \
        --request POST \
        --url "${API_URL}/completion" \
        --header "Content-Type: application/json" \
        --data-raw "${DATA}")

    # Parse the server-sent event stream and print tokens as they arrive.
    python -c "
import json
import sys

answer = ''
while True:
    line = sys.stdin.readline()
    if not line:
        break
    if line.startswith('data: '):
        json_content = line[6:].strip()
        content = json.loads(json_content)['content']
        sys.stdout.write(content)
        sys.stdout.flush()
        answer += content

answer = answer.rstrip('\n')

# Write the answer to the temporary file
with open('$TEMPFILE', 'w') as f:
    f.write(answer)
" <&3

    exec 3<&-

    # Read the answer from the temporary file
    ANSWER=$(cat "$TEMPFILE")

    # Clean up the temporary file
    rm "$TEMPFILE"

    printf "\n"

    CHAT+=("$1" "$(trim "$ANSWER")")
}

while true; do
    echo -en "\033[0;32m" # Green color
    read -r -e -p "> " QUESTION
    echo -en "\033[0m" # Reset color

    chat_completion "${QUESTION}"
done
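
A quick way to try the script, assuming the server is already built; the model path and the script filename here are placeholders:

# terminal 1: start the server
./server -m models/llama-2-7b-chat.ggmlv3.q4_0.bin -c 2048

# terminal 2: run the chat script, optionally pointing it at another address
API_URL="http://127.0.0.1:8080" ./chat-llama2.sh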

File diff suppressed because it is too large.


@ -3,12 +3,11 @@
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
  <meta name="color-scheme" content="light dark">
  <title>llama.cpp - chat</title>
  <style>
    body {
      background-color: #fff;
      color: #000;
      font-family: system-ui;
      font-size: 90%;
      max-width: 600px;
@ -103,6 +102,36 @@
      margin: 0;
    }
    fieldset.two {
      display: grid;
      grid-template: "a a";
      gap: 1em;
    }
    fieldset.three {
      display: grid;
      grid-template: "a a a";
      gap: 1em;
    }
    details {
      border: 1px solid #aaa;
      border-radius: 4px;
      padding: 0.5em 0.5em 0;
      margin-top: 0.5em;
    }
    summary {
      font-weight: bold;
      margin: -0.5em -0.5em 0;
      padding: 0.5em;
      cursor: pointer;
    }
    details[open] {
      padding: 0.5em;
    }
    textarea {
      padding: 5px;
      flex-grow: 1;
@ -122,7 +151,7 @@
    fieldset label {
      margin: 0.5em 0;
-     /*display: block;*/
+     display: block;
    }

    header, footer {
@ -152,21 +181,21 @@
    })

    const params = signal({
-     n_predict: 400,
-     top_k: 40,
-     top_p: 0.95,
-     tfs_z: 1.0,
-     typical_p: 1.0,
-     temperature: 0.7,
-     repeat_penalty: 1.18,
-     frequency_penalty: 0.0,
-     presence_penalty: 0.0,
-     repeat_last_n: 256,
-     mirostat: 0,
-     mirostat_tau: 5.0,
-     mirostat_eta: 0.1,
-     cfg_scale: 4.0,
+     frequency_penalty: 0.0, // 0.0 = disabled
+     mirostat_eta: 0.1, // learning rate
+     mirostat_tau: 5, // target entropy
+     mirostat: 0, // 0/1/2
+     n_predict: 400,
+     penalize_nl: true,
+     presence_penalty: 0.0, // 0.0 = disabled
+     repeat_last_n: 256, // 0 = disable penalty, -1 = context size
+     repeat_penalty: 1.18, // 1.0 = disabled
+     temperature: 0.7,
+     tfs_z: 1.0, // 1.0 = disabled
+     top_k: 40, // <= 0 to use vocab size
+     top_p: 0.5, // 1.0 = disabled
+     typical_p: 1.0, // 1.0 = disabled
    })

    const llamaStats = signal(null)
@ -305,29 +334,48 @@
      `
    }

    const updateSession = (el) => session.value = { ...session.value, [el.target.name]: el.target.value }
    const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
    const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
    const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }

    const updateArray = (el) => {
      const [name, index] = el.target.name.split(".")
      const newarr = session.value[name].map((v, i) => i == index ? el.target.value : v).filter(x => x !== "")
      session.value = { ...session.value, [name]: newarr }
    }
    const appendArray = () => session.value = { ...session.value, stop: [...session.value.stop, ""] }

    const ParamSlider = ({param, min, max, step, children}) => {
      const updateParamsFloat = (el) => params.value = { ...params.value, [param]: parseFloat(el.target.value) }
      return html`
        <div>
          <label for="${param}"><code>${param}</code></label>
          <label for="${param}">${children}</label>
          <input type="range" id="${param}" min="${min}" max="${max}" step="${step}" name="${param}" value="${params.value[param]}" oninput=${updateParamsFloat} />
          <span>${params.value[param]}</span>
          <span>${children}</span>
        </div>
      `
    }

    const ConfigForm = (props) => {
      const updateSession = (el) => session.value = { ...session.value, [el.target.name]: el.target.value }
      const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
      const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
      const appendArray = () => session.value = { ...session.value, stop: [...session.value.stop, ""] }
      const updateArray = (el) => {
        const [name, index] = el.target.name.split(".")
        const newarr = session.value[name].map((v, i) => i == index ? el.target.value : v).filter(x => x !== "")
        session.value = { ...session.value, [name]: newarr }
      }

      const FloatField = ({label, max, min, name, step, value}) => {
        return html`
          <div>
            <label for="${name}">${label}</label>
            <input type="range" id="${name}" min="${min}" max="${max}" step="${step}" name="${name}" value="${value}" oninput=${updateParamsFloat} />
            <span>${value}</span>
          </div>
        `
      };

      const IntField = ({label, max, min, name, value}) => {
        return html`
          <div>
            <label for="${name}">${label}</label>
            <input type="range" id="${name}" min="${min}" max="${max}" name="${name}" value="${value}" oninput=${updateParamsInt} />
            <span>${value}</span>
          </div>
        `
      };

    const ConfigForm = (props) => {
      return html`
        <form>
          <fieldset>
@ -356,7 +404,9 @@
              <textarea type="text" name="system_cfg" value="${session.value.system_cfg}" rows=4 oninput=${updateSession}/>
            </div>
          `}
          </fieldset>

          <fieldset class="two">
            <${ParamSlider} min=1 max=1000 step=1 param=n_predict>Predict N tokens<//>
            <${ParamSlider} min=0 max=1000 step=1 param=repeat_last_n>Penalize last N tokens<//>
            ${params.value.repeat_last_n > 0 && html`
@ -365,18 +415,26 @@
            <${ParamSlider} min=0 max=4 step=0.01 param=presence_penalty>Penalize tokens not present in prompt<//>
          `}
          <${ParamSlider} min=0 max=2 step=0.01 param=temperature>Temperature<//>
          ${params.value.temperature > 0 && html`
          <div>
            <input id=mirostat_0 type=radio name=mirostat checked=${params.value.mirostat == 0} value=0 oninput=${updateParamsFloat} />
            <label for=mirostat_0>Temperature</label>
          </fieldset>
            <input id=mirostat_1 type=radio name=mirostat checked=${params.value.mirostat == 1} value=1 oninput=${updateParamsFloat} />
            <label for=mirostat_1>Mirostat v1</label>
          ${params.value.temperature > 0 && html`
          <fieldset class="three">
            <label>
              <input type=radio name=mirostat checked=${params.value.mirostat == 0} value=0 oninput=${updateParamsFloat} />
              Temperature
            </label>
            <input id=mirostat_2 type=radio name=mirostat checked=${params.value.mirostat == 2} value=2 oninput=${updateParamsFloat} />
            <label for=mirostat_2>Mirostat v2</label>
          </div>
            <label><input type=radio name=mirostat checked=${params.value.mirostat == 1} value=1 oninput=${updateParamsFloat} />
              Mirostat v1
            </label>
            <label>
              <input type=radio name=mirostat checked=${params.value.mirostat == 2} value=2 oninput=${updateParamsFloat} />
              Mirostat v2
            </label>
          </fieldset>

          <fieldset class="two">
            ${params.value.mirostat == 0 && html`
            <${ParamSlider} min=1 max=1000 step=1 param=top_k>Top K<//>
            <${ParamSlider} min=0 max=1 step=0.01 param=tfs_z>Tail free sampling<//>
@ -387,14 +445,17 @@
            <${ParamSlider} min=0 max=1 step=0.01 param=mirostat_eta>Mirostat eta, learning rate<//>
            <${ParamSlider} min=0 max=1000 step=1 param=mirostat_tau>Mirostat tau, target entropy<//>
          `}
          </fieldset>
          `}
          </fieldset>
        </form>
      `
    }

    // poor man's markdown replacement
    const Markdownish = (params) => {
      const md = params.text
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/^#{1,6} (.*)$/gim, '<h3>$1</h3>')
        .replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
        .replace(/__(.*?)__/g, '<strong>$1</strong>')
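
For reference, the UI builds its POST body to `/completion` from these params plus the prompt; a hand-rolled request using a few of the new defaults above might look like this sketch (the prompt text is a placeholder):

curl --silent --request POST --url http://127.0.0.1:8080/completion \
    --header "Content-Type: application/json" \
    --data '{"prompt": "Hello", "n_predict": 400, "temperature": 0.7, "top_k": 40, "top_p": 0.5, "repeat_penalty": 1.18}'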


@ -613,6 +613,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
@ -632,6 +633,9 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
#endif
fprintf(stdout, " -m FNAME, --model FNAME\n");
fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
@ -708,6 +712,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
            break;
        }
        params.n_gqa = std::stoi(argv[i]);
    } else if (arg == "-eps" || arg == "--rms-norm-eps") {
        if (++i >= argc) {
            invalid_param = true;
            break;
        }
        params.rms_norm_eps = std::stof(argv[i]);
    } else if (arg == "--rope-freq-base") {
        if (++i >= argc) {
            invalid_param = true;
@ -769,13 +779,19 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
            }
        }
#else
-       LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.", {});
+       LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
#endif // GGML_USE_CUBLAS
        } else if (arg == "--low-vram" || arg == "-lv") {
#ifdef GGML_USE_CUBLAS
            params.low_vram = true;
#else
-           fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
+           LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
#endif // GGML_USE_CUBLAS
        } else if (arg == "--mul-mat-q" || arg == "-mmq") {
#ifdef GGML_USE_CUBLAS
            params.mul_mat_q = true;
#else
            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {});
#endif // GGML_USE_CUBLAS
        } else if (arg == "--main-gpu" || arg == "-mg") {
            if (++i >= argc) {