Merge branch 'master' into name-metadata-fix

commit 6324c528d1
Brian, 2024-01-10 10:50:47 +11:00, committed by GitHub
6 changed files with 780 additions and 334 deletions


@@ -137,6 +137,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [semperai/amica](https://github.com/semperai/amica)
 - [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
 - [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
+- [iohub/collama](https://github.com/iohub/coLLaMA)
 ---

(File diff suppressed because it is too large.)


@@ -243,6 +243,9 @@ int main(int argc, char ** argv) {
     }

     auto image_embed = load_image(ctx_llava, &params);
+    if (!image_embed) {
+        return 1;
+    }

     // process the prompt
     process_prompt(ctx_llava, image_embed, &params, params.prompt);
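The added guard makes the example exit with a non-zero status when the image cannot be loaded, instead of passing a null `image_embed` into `process_prompt`.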


@@ -175,35 +175,44 @@ node index.js

 `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

-*Result JSON:*
+### Result JSON:

-Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
+* Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.

-`content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
-
-`stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
-
-`generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
-
-`model`: The path to the model loaded with `-m`
-
-`prompt`: The provided `prompt`
-
-`stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
-
-`stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered
-
-`stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided
-
-`stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)
-
-`timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
-
-`tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
-
-`tokens_evaluated`: Number of tokens evaluated in total from the prompt
-
-`truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
+- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:
+
+```
+{
+  "content": "<the token selected by the model>",
+  "probs": [
+    {
+      "prob": float,
+      "tok_str": "<most likely token>"
+    },
+    {
+      "prob": float,
+      "tok_str": "<second most likely token>"
+    },
+    ...
+  ]
+},
+```
+Notice that each `probs` is an array of length `n_probs`.
+
+- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
+- `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
+- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
+- `model`: The path to the model loaded with `-m`
+- `prompt`: The provided `prompt`
+- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
+- `stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered
+- `stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided
+- `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)
+- `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
+- `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
+- `tokens_evaluated`: Number of tokens evaluated in total from the prompt
+- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)

 - **POST** `/tokenize`: Tokenize a given text.
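A minimal sketch of consuming the new `completion_probabilities` field from Python, using only the standard library. The host, port, and prompt below are assumptions for illustration; adjust the URL to wherever the server is actually listening.

```
import json
import urllib.request

# Hypothetical endpoint: assumes a server already listening on 127.0.0.1:8080.
req = urllib.request.Request(
    "http://127.0.0.1:8080/completion",
    data=json.dumps({
        "prompt": "Building a website can be done in 10 simple steps:",
        "n_predict": 8,
        "n_probs": 3,  # request the top-3 candidates for each generated token
    }).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)

with urllib.request.urlopen(req) as resp:
    result = json.load(resp)

print(result["content"])

# Each entry pairs the selected token with its `n_probs` most likely candidates.
for item in result.get("completion_probabilities", []):
    top = ", ".join(f"{p['tok_str']!r}: {p['prob']:.3f}" for p in item["probs"])
    print(f"{item['content']!r} <- [{top}]")
```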


@@ -3841,8 +3841,8 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
     uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4];
     int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2)
                               : (scale_2&kmask2) | ((scale_1&kmask1) << 4);
-    half dl = il<8 ? d_all * (dl_int - 32.h) : d_all * (dl_int / 16.h - 32.h);
-    const half ml = 4.h * dl;
+    float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f);
+    const float ml = 4.f * dl;
     il = (il/2) & 3;
     const half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
@@ -3909,7 +3909,7 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
     uint8_t ul = 1 << (il/2);
     il = il & 3;
     const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
-    const float d = il < 2 ? xb->d : xb->d / 16.h;
+    const float d = il < 2 ? xb->d : xb->d / 16.f;
     const float min = xb->dmin;
     const float dl = d * sc[0];
     const float ml = min * sc[1];
@@ -3942,17 +3942,17 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg
 #if QK_K == 256
     ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
     qh = qh + 32*(il/8) + 16*(il&1);
-    half sc = scales[(il%2) + 2 * ((il/2))];
+    float sc = scales[(il%2) + 2 * ((il/2))];
     il = (il/2) & 3;
 #else
     ql = ql + 16 * (il&1);
-    half sc = scales[il];
+    float sc = scales[il];
 #endif
     const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
     const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F;
-    const half coef = il>1 ? 1.f/16.h : 1.h;
-    const half ml = d_all * sc * 32.h;
-    const half dl = d_all * sc * coef;
+    const float coef = il>1 ? 1.f/16.f : 1.f;
+    const float ml = d_all * sc * 32.f;
+    const float dl = d_all * sc * coef;
     for (int i = 0; i < 16; ++i) {
         const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2))
                             : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4));
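For context on the `half` to `float` promotions above: IEEE-754 binary16 carries only 11 significand bits and its largest finite value is about 65504, so intermediate scale products such as `d_all * sc * 32` can round coarsely or overflow. A rough sketch of the effect in Python (not Metal), with made-up example values:

```
import struct

def as_half(x: float) -> float:
    # round-trip a value through IEEE-754 binary16 ('e' format, Python >= 3.6)
    return struct.unpack("e", struct.pack("e", x))[0]

d_all, sc = 0.0087, 117  # made-up block scale and sub-scale

ml_f32 = d_all * sc * 32.0  # the float path
ml_f16 = as_half(as_half(as_half(d_all) * as_half(sc)) * as_half(32.0))  # round to half after each step

print(f"float32 : {ml_f32:.6f}")
print(f"binary16: {ml_f16:.6f} (error {abs(ml_f32 - ml_f16):.6f})")

# Range is the other hazard: binary16 cannot represent values beyond ~65504.
try:
    print(as_half(70000.0))
except OverflowError:
    print("70000.0 does not fit in binary16")
```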

scripts/get-pg.sh (new executable file, 70 lines)

@@ -0,0 +1,70 @@
#!/bin/bash

function usage {
    echo "usage: $0 <n>"
    echo "note: n is the number of essays to download"
    echo "for specific n, the resulting pg.txt file will have the following number of tokens:"
    echo "n   | tokens"
    echo "--- | ---"
    echo "1   | 6230"
    echo "2   | 23619"
    echo "5   | 25859"
    echo "10  | 36888"
    echo "15  | 50188"
    echo "20  | 59094"
    echo "25  | 88764"
    echo "30  | 103121"
    echo "32  | 108338"
    echo "35  | 113403"
    echo "40  | 127699"
    echo "45  | 135896"
    exit 1
}

function has_cmd {
    if ! [ -x "$(command -v "$1")" ]; then
        echo "error: $1 is not available" >&2
        exit 1
    fi
}

# check for: curl, html2text, tail, sed, fmt
has_cmd curl
has_cmd html2text
has_cmd tail
has_cmd sed
has_cmd fmt

if [ $# -ne 1 ]; then
    usage
fi

n=$1

# extract the first n essay URLs from the RSS feed
urls="$(curl http://www.aaronsw.com/2002/feeds/pgessays.rss | grep html | sed -e "s/.*http/http/" | sed -e "s/html.*/html/" | head -n "$n")"

printf "urls:\n%s\n" "$urls"

if [ -f pg.txt ]; then
    rm pg.txt
fi

c=1
for url in $urls; do
    echo "processing $url"

    cc=$(printf "%03d" "$c")

    # convert the HTML to plain text, drop the page header, and wrap at 80 columns
    curl -L "$url" | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> "pg-$cc-one.txt"

    # append this essay to the combined corpus and snapshot the result
    cat "pg-$cc-one.txt" >> pg.txt
    cp -v pg.txt "pg-$cc-all.txt"

    c=$((c+1))

    # don't flood the server
    sleep 1
done

echo "done. data in pg.txt"

exit 0
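As a usage sketch: `./scripts/get-pg.sh 10` fetches the first ten essays listed in the RSS feed (about 36.9k tokens of text, per the table above), writes each essay to `pg-NNN-one.txt`, snapshots the running concatenation to `pg-NNN-all.txt`, and leaves the combined corpus in `pg.txt`.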