Merge and update

commit 28046d1e52
Henri Vasserman, 2023-08-09 00:36:11 +03:00
37 changed files with 6347 additions and 2237 deletions


@ -163,7 +163,7 @@ node .
`content`: Set the text to tokenize.
-Note that the special `BOS` token is not added in fron of the text and also a space character is not inserted automatically as it is for `/completion`.
+Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
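
For illustration, both endpoints take a JSON body from the shell as below; the address is the server's default, the sample text is a placeholder, and the `/embedding` body assumes the endpoint accepts the same `content` field:

curl --silent --request POST --url http://127.0.0.1:8080/tokenize \
    --header "Content-Type: application/json" \
    --data '{"content": "Hello, world!"}'

curl --silent --request POST --url http://127.0.0.1:8080/embedding \
    --header "Content-Type: application/json" \
    --data '{"content": "Hello, world!"}'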


@ -0,0 +1,109 @@
#!/bin/bash

API_URL="${API_URL:-http://127.0.0.1:8080}"

CHAT=(
    "Hello, Assistant."
    "Hello. How may I help you today?"
)

INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."

# Remove leading and trailing whitespace.
trim() {
    shopt -s extglob
    set -- "${1##+([[:space:]])}"
    printf "%s" "${1%%+([[:space:]])}"
}

# Remove trailing whitespace only.
trim_trailing() {
    shopt -s extglob
    printf "%s" "${1%%+([[:space:]])}"
}

# Build the next prompt in the Llama 2 [INST] format.
format_prompt() {
    if [[ "${#CHAT[@]}" -eq 0 ]]; then
        echo -n "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>"
    else
        LAST_INDEX=$(( ${#CHAT[@]} - 1 ))
        echo -n "${CHAT[$LAST_INDEX]}\n[INST] $1 [/INST]"
    fi
}

# Count the tokens of a string using the server's /tokenize endpoint.
tokenize() {
    curl \
        --silent \
        --request POST \
        --url "${API_URL}/tokenize" \
        --header "Content-Type: application/json" \
        --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
    | jq '.tokens[]'
}

# Number of system-prompt tokens; passed as n_keep so the server retains
# them when the context fills up.
N_KEEP=$(tokenize "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>" | wc -l)

chat_completion() {
    PROMPT="$(trim_trailing "$(format_prompt "$1")")"
    DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{
        prompt: .,
        temperature: 0.2,
        top_k: 40,
        top_p: 0.9,
        n_keep: $n_keep,
        n_predict: 1024,
        stop: ["[INST]"],
        stream: true
    }')"

    # Create a temporary file to hold the Python output
    TEMPFILE=$(mktemp)

    exec 3< <(curl \
        --silent \
        --no-buffer \
        --request POST \
        --url "${API_URL}/completion" \
        --header "Content-Type: application/json" \
        --data-raw "${DATA}")

    # Parse the server-sent event stream and print tokens as they arrive.
    python -c "
import json
import sys

answer = ''
while True:
    line = sys.stdin.readline()
    if not line:
        break
    if line.startswith('data: '):
        json_content = line[6:].strip()
        content = json.loads(json_content)['content']
        sys.stdout.write(content)
        sys.stdout.flush()
        answer += content

answer = answer.rstrip('\n')

# Write the answer to the temporary file
with open('$TEMPFILE', 'w') as f:
    f.write(answer)
" <&3

    exec 3<&-

    # Read the answer from the temporary file
    ANSWER=$(cat "$TEMPFILE")

    # Clean up the temporary file
    rm "$TEMPFILE"

    printf "\n"

    CHAT+=("$1" "$(trim "$ANSWER")")
}

while true; do
    echo -en "\033[0;32m" # Green color
    read -r -e -p "> " QUESTION
    echo -en "\033[0m" # Reset color

    chat_completion "${QUESTION}"
done
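
A quick way to try the script, assuming the server is already built; the model path and the script filename here are placeholders:

# terminal 1: start the server
./server -m models/llama-2-7b-chat.ggmlv3.q4_0.bin -c 2048

# terminal 2: run the chat script, optionally pointing it at another address
API_URL="http://127.0.0.1:8080" ./chat-llama2.sh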

File diff suppressed because it is too large.


@ -3,12 +3,11 @@
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
  <meta name="color-scheme" content="light dark">
  <title>llama.cpp - chat</title>
  <style>
    body {
      background-color: #fff;
      color: #000;
      font-family: system-ui;
      font-size: 90%;
      max-width: 600px;
@ -103,6 +102,36 @@
      margin: 0;
    }
    fieldset.two {
      display: grid;
      grid-template: "a a";
      gap: 1em;
    }
    fieldset.three {
      display: grid;
      grid-template: "a a a";
      gap: 1em;
    }
    details {
      border: 1px solid #aaa;
      border-radius: 4px;
      padding: 0.5em 0.5em 0;
      margin-top: 0.5em;
    }
    summary {
      font-weight: bold;
      margin: -0.5em -0.5em 0;
      padding: 0.5em;
      cursor: pointer;
    }
    details[open] {
      padding: 0.5em;
    }
    textarea {
      padding: 5px;
      flex-grow: 1;
@ -122,7 +151,7 @@
    fieldset label {
      margin: 0.5em 0;
-     /*display: block;*/
+     display: block;
    }

    header, footer {
@ -152,21 +181,21 @@
    })

    const params = signal({
-     n_predict: 400,
-     top_k: 40,
-     top_p: 0.95,
-     tfs_z: 1.0,
-     typical_p: 1.0,
-     temperature: 0.7,
-     repeat_penalty: 1.18,
-     frequency_penalty: 0.0,
-     presence_penalty: 0.0,
-     repeat_last_n: 256,
-     mirostat: 0,
-     mirostat_tau: 5.0,
-     mirostat_eta: 0.1,
-     cfg_scale: 4.0,
+     frequency_penalty: 0.0, // 0.0 = disabled
+     mirostat_eta: 0.1, // learning rate
+     mirostat_tau: 5, // target entropy
+     mirostat: 0, // 0/1/2
+     n_predict: 400,
+     penalize_nl: true,
+     presence_penalty: 0.0, // 0.0 = disabled
+     repeat_last_n: 256, // 0 = disable penalty, -1 = context size
+     repeat_penalty: 1.18, // 1.0 = disabled
+     temperature: 0.7,
+     tfs_z: 1.0, // 1.0 = disabled
+     top_k: 40, // <= 0 to use vocab size
+     top_p: 0.5, // 1.0 = disabled
+     typical_p: 1.0, // 1.0 = disabled
    })

    const llamaStats = signal(null)
@ -305,29 +334,48 @@
      `
    }

    const updateSession = (el) => session.value = { ...session.value, [el.target.name]: el.target.value }
    const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
    const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
    const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }

    const updateArray = (el) => {
      const [name, index] = el.target.name.split(".")
      const newarr = session.value[name].map((v, i) => i == index ? el.target.value : v).filter(x => x !== "")
      session.value = { ...session.value, [name]: newarr }
    }
    const appendArray = () => session.value = { ...session.value, stop: [...session.value.stop, ""] }

    const ParamSlider = ({param, min, max, step, children}) => {
      const updateParamsFloat = (el) => params.value = { ...params.value, [param]: parseFloat(el.target.value) }
      return html`
        <div>
          <label for="${param}"><code>${param}</code></label>
          <label for="${param}">${children}</label>
          <input type="range" id="${param}" min="${min}" max="${max}" step="${step}" name="${param}" value="${params.value[param]}" oninput=${updateParamsFloat} />
          <span>${params.value[param]}</span>
          <span>${children}</span>
        </div>
      `
    }

    const ConfigForm = (props) => {
      const updateSession = (el) => session.value = { ...session.value, [el.target.name]: el.target.value }
      const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
      const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
      const appendArray = () => session.value = { ...session.value, stop: [...session.value.stop, ""] }
      const updateArray = (el) => {
        const [name, index] = el.target.name.split(".")
        const newarr = session.value[name].map((v, i) => i == index ? el.target.value : v).filter(x => x !== "")
        session.value = { ...session.value, [name]: newarr }
      }

      const FloatField = ({label, max, min, name, step, value}) => {
        return html`
          <div>
            <label for="${name}">${label}</label>
            <input type="range" id="${name}" min="${min}" max="${max}" step="${step}" name="${name}" value="${value}" oninput=${updateParamsFloat} />
            <span>${value}</span>
          </div>
        `
      };

      const IntField = ({label, max, min, name, value}) => {
        return html`
          <div>
            <label for="${name}">${label}</label>
            <input type="range" id="${name}" min="${min}" max="${max}" name="${name}" value="${value}" oninput=${updateParamsInt} />
            <span>${value}</span>
          </div>
        `
      };

    const ConfigForm = (props) => {
      return html`
        <form>
          <fieldset>
@ -356,7 +404,9 @@
              <textarea type="text" name="system_cfg" value="${session.value.system_cfg}" rows=4 oninput=${updateSession}/>
            </div>
          `}
          </fieldset>

          <fieldset class="two">
            <${ParamSlider} min=1 max=1000 step=1 param=n_predict>Predict N tokens<//>
            <${ParamSlider} min=0 max=1000 step=1 param=repeat_last_n>Penalize last N tokens<//>
            ${params.value.repeat_last_n > 0 && html`
@ -365,18 +415,26 @@
            <${ParamSlider} min=0 max=4 step=0.01 param=presence_penalty>Penalize tokens not present in prompt<//>
          `}
          <${ParamSlider} min=0 max=2 step=0.01 param=temperature>Temperature<//>
          ${params.value.temperature > 0 && html`
          <div>
            <input id=mirostat_0 type=radio name=mirostat checked=${params.value.mirostat == 0} value=0 oninput=${updateParamsFloat} />
            <label for=mirostat_0>Temperature</label>
          </fieldset>
            <input id=mirostat_1 type=radio name=mirostat checked=${params.value.mirostat == 1} value=1 oninput=${updateParamsFloat} />
            <label for=mirostat_1>Mirostat v1</label>
          ${params.value.temperature > 0 && html`
          <fieldset class="three">
            <label>
              <input type=radio name=mirostat checked=${params.value.mirostat == 0} value=0 oninput=${updateParamsFloat} />
              Temperature
            </label>
            <input id=mirostat_2 type=radio name=mirostat checked=${params.value.mirostat == 2} value=2 oninput=${updateParamsFloat} />
            <label for=mirostat_2>Mirostat v2</label>
          </div>
            <label><input type=radio name=mirostat checked=${params.value.mirostat == 1} value=1 oninput=${updateParamsFloat} />
              Mirostat v1
            </label>
            <label>
              <input type=radio name=mirostat checked=${params.value.mirostat == 2} value=2 oninput=${updateParamsFloat} />
              Mirostat v2
            </label>
          </fieldset>

          <fieldset class="two">
            ${params.value.mirostat == 0 && html`
            <${ParamSlider} min=1 max=1000 step=1 param=top_k>Top K<//>
            <${ParamSlider} min=0 max=1 step=0.01 param=tfs_z>Tail free sampling<//>
@ -387,14 +445,17 @@
            <${ParamSlider} min=0 max=1 step=0.01 param=mirostat_eta>Mirostat eta, learning rate<//>
            <${ParamSlider} min=0 max=1000 step=1 param=mirostat_tau>Mirostat tau, target entropy<//>
          `}
          </fieldset>
          `}
          </fieldset>
        </form>
      `
    }

    // poor man's markdown replacement
    const Markdownish = (params) => {
      const md = params.text
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/^#{1,6} (.*)$/gim, '<h3>$1</h3>')
        .replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
        .replace(/__(.*?)__/g, '<strong>$1</strong>')
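
For reference, the UI builds its POST body to `/completion` from these params plus the prompt; a hand-rolled request using a few of the new defaults above might look like this sketch (the prompt text is a placeholder):

curl --silent --request POST --url http://127.0.0.1:8080/completion \
    --header "Content-Type: application/json" \
    --data '{"prompt": "Hello", "n_predict": 400, "temperature": 0.7, "top_k": 40, "top_p": 0.5, "repeat_penalty": 1.18}'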


@ -613,6 +613,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
@ -632,6 +633,9 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
#endif
fprintf(stdout, " -m FNAME, --model FNAME\n");
fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
@ -708,6 +712,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
            break;
        }
        params.n_gqa = std::stoi(argv[i]);
    } else if (arg == "-eps" || arg == "--rms-norm-eps") {
        if (++i >= argc) {
            invalid_param = true;
            break;
        }
        params.rms_norm_eps = std::stof(argv[i]);
    } else if (arg == "--rope-freq-base") {
        if (++i >= argc) {
            invalid_param = true;
@ -769,13 +779,19 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
            }
        }
#else
-       LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.", {});
+       LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
#endif // GGML_USE_CUBLAS
        } else if (arg == "--low-vram" || arg == "-lv") {
#ifdef GGML_USE_CUBLAS
            params.low_vram = true;
#else
-           fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
+           LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
#endif // GGML_USE_CUBLAS
        } else if (arg == "--mul-mat-q" || arg == "-mmq") {
#ifdef GGML_USE_CUBLAS
            params.mul_mat_q = true;
#else
            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {});
#endif // GGML_USE_CUBLAS
        } else if (arg == "--main-gpu" || arg == "-mg") {
            if (++i >= argc) {