Merge branch 'master' into name-metadata-fix

commit 6324c528d1
Brian, 2024-01-10 10:50:47 +11:00, committed by GitHub
6 changed files with 780 additions and 334 deletions


@@ -137,6 +137,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [semperai/amica](https://github.com/semperai/amica)
 - [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
 - [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
+- [iohub/collama](https://github.com/iohub/coLLaMA)
 ---

(File diff suppressed because it is too large.)


@@ -243,6 +243,9 @@ int main(int argc, char ** argv) {
     }

     auto image_embed = load_image(ctx_llava, &params);
+    if (!image_embed) {
+        return 1;
+    }

     // process the prompt
     process_prompt(ctx_llava, image_embed, &params, params.prompt);
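The added guard makes the example exit with a non-zero status when the image cannot be loaded, instead of passing a null `image_embed` into `process_prompt`.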


@@ -175,35 +175,44 @@ node index.js

 `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

-*Result JSON:*
+### Result JSON:

-Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
+* Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.

-`content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
-
-`stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
-
-`generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
-
-`model`: The path to the model loaded with `-m`
-
-`prompt`: The provided `prompt`
-
-`stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
-
-`stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered
-
-`stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided
-
-`stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)
-
-`timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
-
-`tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
-
-`tokens_evaluated`: Number of tokens evaluated in total from the prompt
-
-`truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
+- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:
+
+```
+{
+  "content": "<the token selected by the model>",
+  "probs": [
+    {
+      "prob": float,
+      "tok_str": "<most likely token>"
+    },
+    {
+      "prob": float,
+      "tok_str": "<second most likely token>"
+    },
+    ...
+  ]
+},
+```
+Notice that each `probs` is an array of length `n_probs`.
+
+- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
+- `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
+- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
+- `model`: The path to the model loaded with `-m`
+- `prompt`: The provided `prompt`
+- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
+- `stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered
+- `stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided
+- `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)
+- `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
+- `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
+- `tokens_evaluated`: Number of tokens evaluated in total from the prompt
+- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)

 - **POST** `/tokenize`: Tokenize a given text.
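A minimal sketch of consuming the new `completion_probabilities` field from Python, using only the standard library. The host, port, and prompt below are assumptions for illustration; adjust the URL to wherever the server is actually listening.

```
import json
import urllib.request

# Hypothetical endpoint: assumes a server already listening on 127.0.0.1:8080.
req = urllib.request.Request(
    "http://127.0.0.1:8080/completion",
    data=json.dumps({
        "prompt": "Building a website can be done in 10 simple steps:",
        "n_predict": 8,
        "n_probs": 3,  # request the top-3 candidates for each generated token
    }).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)

with urllib.request.urlopen(req) as resp:
    result = json.load(resp)

print(result["content"])

# Each entry pairs the selected token with its `n_probs` most likely candidates.
for item in result.get("completion_probabilities", []):
    top = ", ".join(f"{p['tok_str']!r}: {p['prob']:.3f}" for p in item["probs"])
    print(f"{item['content']!r} <- [{top}]")
```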


@@ -3841,8 +3841,8 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
     uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4];
     int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2)
                               : (scale_2&kmask2) | ((scale_1&kmask1) << 4);
-    half dl = il<8 ? d_all * (dl_int - 32.h) : d_all * (dl_int / 16.h - 32.h);
-    const half ml = 4.h * dl;
+    float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f);
+    const float ml = 4.f * dl;
     il = (il/2) & 3;
     const half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
@@ -3909,7 +3909,7 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
     uint8_t ul = 1 << (il/2);
     il = il & 3;
     const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
-    const float d = il < 2 ? xb->d : xb->d / 16.h;
+    const float d = il < 2 ? xb->d : xb->d / 16.f;
     const float min = xb->dmin;
     const float dl = d * sc[0];
     const float ml = min * sc[1];
@@ -3942,17 +3942,17 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg
 #if QK_K == 256
     ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
     qh = qh + 32*(il/8) + 16*(il&1);
-    half sc = scales[(il%2) + 2 * ((il/2))];
+    float sc = scales[(il%2) + 2 * ((il/2))];
     il = (il/2) & 3;
 #else
     ql = ql + 16 * (il&1);
-    half sc = scales[il];
+    float sc = scales[il];
 #endif
     const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
     const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F;
-    const half coef = il>1 ? 1.f/16.h : 1.h;
-    const half ml = d_all * sc * 32.h;
-    const half dl = d_all * sc * coef;
+    const float coef = il>1 ? 1.f/16.f : 1.f;
+    const float ml = d_all * sc * 32.f;
+    const float dl = d_all * sc * coef;
     for (int i = 0; i < 16; ++i) {
         const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2))
                             : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4));
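For context on the `half` to `float` promotions above: IEEE-754 binary16 carries only 11 significand bits and its largest finite value is about 65504, so intermediate scale products such as `d_all * sc * 32` can round coarsely or overflow. A rough sketch of the effect in Python (not Metal), with made-up example values:

```
import struct

def as_half(x: float) -> float:
    # round-trip a value through IEEE-754 binary16 ('e' format, Python >= 3.6)
    return struct.unpack("e", struct.pack("e", x))[0]

d_all, sc = 0.0087, 117  # made-up block scale and sub-scale

ml_f32 = d_all * sc * 32.0  # the float path
ml_f16 = as_half(as_half(as_half(d_all) * as_half(sc)) * as_half(32.0))  # round to half after each step

print(f"float32 : {ml_f32:.6f}")
print(f"binary16: {ml_f16:.6f} (error {abs(ml_f32 - ml_f16):.6f})")

# Range is the other hazard: binary16 cannot represent values beyond ~65504.
try:
    print(as_half(70000.0))
except OverflowError:
    print("70000.0 does not fit in binary16")
```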

scripts/get-pg.sh (new executable file, 70 lines)

@@ -0,0 +1,70 @@
#!/bin/bash

function usage {
    echo "usage: $0 <n>"
    echo "note: n is the number of essays to download"
    echo "for specific n, the resulting pg.txt file will have the following number of tokens:"
    echo "n   | tokens"
    echo "--- | ---"
    echo "1   | 6230"
    echo "2   | 23619"
    echo "5   | 25859"
    echo "10  | 36888"
    echo "15  | 50188"
    echo "20  | 59094"
    echo "25  | 88764"
    echo "30  | 103121"
    echo "32  | 108338"
    echo "35  | 113403"
    echo "40  | 127699"
    echo "45  | 135896"
    exit 1
}

function has_cmd {
    if ! [ -x "$(command -v "$1")" ]; then
        echo "error: $1 is not available" >&2
        exit 1
    fi
}

# check for: curl, html2text, tail, sed, fmt
has_cmd curl
has_cmd html2text
has_cmd tail
has_cmd sed
has_cmd fmt

if [ $# -ne 1 ]; then
    usage
fi

n=$1

# extract the first n essay URLs from the RSS feed
urls="$(curl http://www.aaronsw.com/2002/feeds/pgessays.rss | grep html | sed -e "s/.*http/http/" | sed -e "s/html.*/html/" | head -n "$n")"

printf "urls:\n%s\n" "$urls"

if [ -f pg.txt ]; then
    rm pg.txt
fi

c=1
for url in $urls; do
    echo "processing $url"

    cc=$(printf "%03d" "$c")

    # convert the HTML to plain text, drop the page header, and wrap at 80 columns
    curl -L "$url" | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> "pg-$cc-one.txt"

    # append this essay to the combined corpus and snapshot the result
    cat "pg-$cc-one.txt" >> pg.txt
    cp -v pg.txt "pg-$cc-all.txt"

    c=$((c+1))

    # don't flood the server
    sleep 1
done

echo "done. data in pg.txt"

exit 0
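As a usage sketch: `./scripts/get-pg.sh 10` fetches the first ten essays listed in the RSS feed (about 36.9k tokens of text, per the table above), writes each essay to `pg-NNN-one.txt`, snapshots the running concatenation to `pg-NNN-all.txt`, and leaves the combined corpus in `pg.txt`.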