Merge branch 'master' into name-metadata-fix
This commit is contained in:
commit 6324c528d1
6 changed files with 780 additions and 334 deletions

README.md
@@ -137,6 +137,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [semperai/amica](https://github.com/semperai/amica)
 - [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
 - [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
+- [iohub/collama](https://github.com/iohub/coLLaMA)
 
 ---
 
convert.py (967 lines)
File diff suppressed because it is too large.

examples/llava/llava-cli.cpp
@@ -243,6 +243,9 @@ int main(int argc, char ** argv) {
     }
 
     auto image_embed = load_image(ctx_llava, &params);
+    if (!image_embed) {
+        return 1;
+    }
 
     // process the prompt
     process_prompt(ctx_llava, image_embed, &params, params.prompt);

examples/server/README.md
@@ -175,35 +175,44 @@ node index.js
 
 `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
 
-*Result JSON:*
+### Result JSON:
 
-Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
+* Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
 
-`content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
-
-`stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
-
-`generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
-
-`model`: The path to the model loaded with `-m`
-
-`prompt`: The provided `prompt`
-
-`stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
-
-`stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered
-
-`stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided
-
-`stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)
-
-`timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
-
-`tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
-
-`tokens_evaluated`: Number of tokens evaluated in total from the prompt
-
-`truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens_predicted`) exceeded the context size (`n_ctx`)
+- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:
+
+```
+{
+  "content": "<the token selected by the model>",
+  "probs": [
+    {
+      "prob": float,
+      "tok_str": "<most likely token>"
+    },
+    {
+      "prob": float,
+      "tok_str": "<second most likely token>"
+    },
+    ...
+  ]
+},
+```
+Notice that each `probs` is an array of length `n_probs`.
+
+- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
+- `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
+- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
+- `model`: The path to the model loaded with `-m`
+- `prompt`: The provided `prompt`
+- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
+- `stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered
+- `stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided
+- `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)
+- `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
+- `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
+- `tokens_evaluated`: Number of tokens evaluated in total from the prompt
+- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens_predicted`) exceeded the context size (`n_ctx`)
 
 - **POST** `/tokenize`: Tokenize a given text.
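
To make the fields above concrete, here is a minimal Python sketch (an editorial example, not part of this diff) that exercises `/completion` with `n_probs` set, then `/tokenize`. It assumes a server already running on the default `localhost:8080` and the third-party `requests` package; the `/tokenize` field names (`content` in, `tokens` out) are an assumption worth double-checking against the full README.

```python
import requests  # third-party: pip install requests

BASE = "http://localhost:8080"  # assumption: default server host/port

# /completion: n_probs > 0 asks for the top-n token probabilities per position.
resp = requests.post(f"{BASE}/completion", json={
    "prompt": "Building a website can be done in 10 simple steps:",
    "n_predict": 16,
    "n_probs": 2,
})
resp.raise_for_status()
data = resp.json()

print("content:", data["content"])
print("stop:", data["stop"], "| stopped_eos:", data["stopped_eos"])

# Each entry mirrors the structure documented above: the chosen token plus
# an n_probs-long list of {"prob": ..., "tok_str": ...} candidates.
for item in data.get("completion_probabilities", []):
    top = item["probs"][0]
    print(f"{item['content']!r} -> top candidate {top['tok_str']!r} (p={top['prob']:.3f})")

# /tokenize: returns the token ids for a given text.
toks = requests.post(f"{BASE}/tokenize", json={"content": "Hello, world!"}).json()
print("tokens:", toks["tokens"])
```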

ggml-metal.metal
@@ -3841,8 +3841,8 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
     uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4];
     int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2)
                               : (scale_2&kmask2) | ((scale_1&kmask1) << 4);
-    half dl = il<8 ? d_all * (dl_int - 32.h) : d_all * (dl_int / 16.h - 32.h);
-    const half ml = 4.h * dl;
+    float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f);
+    const float ml = 4.f * dl;
 
     il = (il/2) & 3;
     const half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);

@@ -3909,7 +3909,7 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
     uint8_t ul = 1 << (il/2);
     il = il & 3;
     const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
-    const float d = il < 2 ? xb->d : xb->d / 16.h;
+    const float d = il < 2 ? xb->d : xb->d / 16.f;
     const float min = xb->dmin;
     const float dl = d * sc[0];
     const float ml = min * sc[1];

@@ -3942,17 +3942,17 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg
 #if QK_K == 256
     ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
     qh = qh + 32*(il/8) + 16*(il&1);
-    half sc = scales[(il%2) + 2 * ((il/2))];
+    float sc = scales[(il%2) + 2 * ((il/2))];
     il = (il/2) & 3;
 #else
     ql = ql + 16 * (il&1);
-    half sc = scales[il];
+    float sc = scales[il];
 #endif
     const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
     const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F;
-    const half coef = il>1 ? 1.f/16.h : 1.h;
-    const half ml = d_all * sc * 32.h;
-    const half dl = d_all * sc * coef;
+    const float coef = il>1 ? 1.f/16.f : 1.f;
+    const float ml = d_all * sc * 32.f;
+    const float dl = d_all * sc * coef;
     for (int i = 0; i < 16; ++i) {
         const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2))
                             : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4));
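
All three hunks above make the same change: the per-block scale arithmetic in the dequantization kernels moves from `half` to `float`, presumably to avoid fp16 rounding in expressions like `d_all * sc * 32`. A small numpy sketch (the magnitudes are made up for illustration; `d_all` and `sc` mirror the kernel's names) shows the kind of drift fp16 intermediates introduce:

```python
import numpy as np

# Made-up but plausible magnitudes for a q6_K block scale and sub-scale.
d_all = 0.0011
sc = 113.0

# fp16 path: every intermediate product is rounded to half precision.
ml_half = np.float16(d_all) * np.float16(sc) * np.float16(32.0)
# fp32 path: what the patched kernel computes.
ml_float = np.float32(d_all) * np.float32(sc) * np.float32(32.0)

print(ml_half, ml_float)
print("abs error:", abs(np.float32(ml_half) - ml_float))
```

The two results differ in the low-order digits, since fp16 carries only about three decimal digits of precision; accumulated over a whole tensor, that is the sort of error the switch to `float` sidesteps.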

scripts/get-pg.sh (new executable file, 70 lines)
@@ -0,0 +1,70 @@
+#!/bin/bash
+
+function usage {
+    echo "usage: $0 <n>"
+    echo "note: n is the number of essays to download"
+    echo "for specific n, the resulting pg.txt file will have the following number of tokens:"
+    echo "n | tokens"
+    echo "--- | ---"
+    echo "1 | 6230"
+    echo "2 | 23619"
+    echo "5 | 25859"
+    echo "10 | 36888"
+    echo "15 | 50188"
+    echo "20 | 59094"
+    echo "25 | 88764"
+    echo "30 | 103121"
+    echo "32 | 108338"
+    echo "35 | 113403"
+    echo "40 | 127699"
+    echo "45 | 135896"
+    exit 1
+}
+
+function has_cmd {
+    if ! [ -x "$(command -v $1)" ]; then
+        echo "error: $1 is not available" >&2
+        exit 1
+    fi
+}
+
+# check for: curl, html2text, tail, sed, fmt
+has_cmd curl
+has_cmd html2text
+has_cmd tail
+has_cmd sed
+
+if [ $# -ne 1 ]; then
+    usage
+fi
+
+n=$1
+
+# get urls
+urls="$(curl http://www.aaronsw.com/2002/feeds/pgessays.rss | grep html | sed -e "s/.*http/http/" | sed -e "s/html.*/html/" | head -n $n)"
+
+printf "urls:\n%s\n" "$urls"
+
+if [ -f pg.txt ]; then
+    rm pg.txt
+fi
+
+c=1
+for url in $urls; do
+    echo "processing $url"
+
+    cc=$(printf "%03d" $c)
+
+    curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg-$cc-one.txt
+    cat pg-$cc-one.txt >> pg.txt
+
+    cp -v pg.txt pg-$cc-all.txt
+    c=$((c+1))
+
+    # don't flood the server
+    sleep 1
+done
+
+echo "done. data in pg.txt"
+
+exit 0
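
A usage note (editorial, not part of the diff): judging by the table printed in `usage`, running `./scripts/get-pg.sh 10` downloads the first ten essays from the feed and leaves roughly 37k tokens of plain text in `pg.txt`, along with per-essay files `pg-NNN-one.txt` and cumulative snapshots `pg-NNN-all.txt`.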