Merge and update
This commit is contained in:
commit
28046d1e52
37 changed files with 6347 additions and 2237 deletions
|
@ -163,7 +163,7 @@ node .
|
|||
|
||||
`content`: Set the text to tokenize.
|
||||
|
||||
Note that the special `BOS` token is not added in fron of the text and also a space character is not inserted automatically as it is for `/completion`.
|
||||
Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
|
||||
|
||||
- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
|
||||
|
||||
|
|
109
examples/server/chat-llama2.sh
Normal file
109
examples/server/chat-llama2.sh
Normal file
|
@ -0,0 +1,109 @@
|
|||
#!/bin/bash
|
||||
|
||||
API_URL="${API_URL:-http://127.0.0.1:8080}"
|
||||
|
||||
CHAT=(
|
||||
"Hello, Assistant."
|
||||
"Hello. How may I help you today?"
|
||||
)
|
||||
|
||||
INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
|
||||
|
||||
trim() {
|
||||
shopt -s extglob
|
||||
set -- "${1##+([[:space:]])}"
|
||||
printf "%s" "${1%%+([[:space:]])}"
|
||||
}
|
||||
|
||||
trim_trailing() {
|
||||
shopt -s extglob
|
||||
printf "%s" "${1%%+([[:space:]])}"
|
||||
}
|
||||
|
||||
format_prompt() {
|
||||
if [[ "${#CHAT[@]}" -eq 0 ]]; then
|
||||
echo -n "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>"
|
||||
else
|
||||
LAST_INDEX=$(( ${#CHAT[@]} - 1 ))
|
||||
echo -n "${CHAT[$LAST_INDEX]}\n[INST] $1 [/INST]"
|
||||
fi
|
||||
}
|
||||
|
||||
tokenize() {
|
||||
curl \
|
||||
--silent \
|
||||
--request POST \
|
||||
--url "${API_URL}/tokenize" \
|
||||
--header "Content-Type: application/json" \
|
||||
--data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
|
||||
| jq '.tokens[]'
|
||||
}
|
||||
|
||||
N_KEEP=$(tokenize "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>" | wc -l)
|
||||
|
||||
chat_completion() {
|
||||
PROMPT="$(trim_trailing "$(format_prompt "$1")")"
|
||||
DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{
|
||||
prompt: .,
|
||||
temperature: 0.2,
|
||||
top_k: 40,
|
||||
top_p: 0.9,
|
||||
n_keep: $n_keep,
|
||||
n_predict: 1024,
|
||||
stop: ["[INST]"],
|
||||
stream: true
|
||||
}')"
|
||||
|
||||
# Create a temporary file to hold the Python output
|
||||
TEMPFILE=$(mktemp)
|
||||
|
||||
exec 3< <(curl \
|
||||
--silent \
|
||||
--no-buffer \
|
||||
--request POST \
|
||||
--url "${API_URL}/completion" \
|
||||
--header "Content-Type: application/json" \
|
||||
--data-raw "${DATA}")
|
||||
|
||||
python -c "
|
||||
import json
|
||||
import sys
|
||||
|
||||
answer = ''
|
||||
while True:
|
||||
line = sys.stdin.readline()
|
||||
if not line:
|
||||
break
|
||||
if line.startswith('data: '):
|
||||
json_content = line[6:].strip()
|
||||
content = json.loads(json_content)['content']
|
||||
sys.stdout.write(content)
|
||||
sys.stdout.flush()
|
||||
answer += content
|
||||
|
||||
answer = answer.rstrip('\n')
|
||||
|
||||
# Write the answer to the temporary file
|
||||
with open('$TEMPFILE', 'w') as f:
|
||||
f.write(answer)
|
||||
" <&3
|
||||
|
||||
exec 3<&-
|
||||
|
||||
# Read the answer from the temporary file
|
||||
ANSWER=$(cat $TEMPFILE)
|
||||
|
||||
# Clean up the temporary file
|
||||
rm $TEMPFILE
|
||||
|
||||
printf "\n"
|
||||
|
||||
CHAT+=("$1" "$(trim "$ANSWER")")
|
||||
}
|
||||
|
||||
while true; do
|
||||
echo -en "\033[0;32m" # Green color
|
||||
read -r -e -p "> " QUESTION
|
||||
echo -en "\033[0m" # Reset color
|
||||
chat_completion "${QUESTION}"
|
||||
done
|
File diff suppressed because it is too large
Load diff
|
@ -3,12 +3,11 @@
|
|||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
|
||||
<meta name="color-scheme" content="light dark">
|
||||
<title>llama.cpp - chat</title>
|
||||
|
||||
<style>
|
||||
body {
|
||||
background-color: #fff;
|
||||
color: #000;
|
||||
font-family: system-ui;
|
||||
font-size: 90%;
|
||||
max-width: 600px;
|
||||
|
@ -103,6 +102,36 @@
|
|||
margin: 0;
|
||||
}
|
||||
|
||||
fieldset.two {
|
||||
display: grid;
|
||||
grid-template: "a a";
|
||||
gap: 1em;
|
||||
}
|
||||
|
||||
fieldset.three {
|
||||
display: grid;
|
||||
grid-template: "a a a";
|
||||
gap: 1em;
|
||||
}
|
||||
|
||||
details {
|
||||
border: 1px solid #aaa;
|
||||
border-radius: 4px;
|
||||
padding: 0.5em 0.5em 0;
|
||||
margin-top: 0.5em;
|
||||
}
|
||||
|
||||
summary {
|
||||
font-weight: bold;
|
||||
margin: -0.5em -0.5em 0;
|
||||
padding: 0.5em;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
details[open] {
|
||||
padding: 0.5em;
|
||||
}
|
||||
|
||||
textarea {
|
||||
padding: 5px;
|
||||
flex-grow: 1;
|
||||
|
@ -122,7 +151,7 @@
|
|||
|
||||
fieldset label {
|
||||
margin: 0.5em 0;
|
||||
/*display: block;*/
|
||||
display: block;
|
||||
}
|
||||
|
||||
header, footer {
|
||||
|
@ -152,21 +181,21 @@
|
|||
})
|
||||
|
||||
const params = signal({
|
||||
n_predict: 400,
|
||||
top_k: 40,
|
||||
top_p: 0.95,
|
||||
tfs_z: 1.0,
|
||||
typical_p: 1.0,
|
||||
temperature: 0.7,
|
||||
repeat_penalty: 1.18,
|
||||
frequency_penalty: 0.0,
|
||||
presence_penalty: 0.0,
|
||||
repeat_last_n: 256,
|
||||
mirostat: 0,
|
||||
mirostat_tau: 5.0,
|
||||
mirostat_eta: 0.1,
|
||||
cfg_scale: 4.0,
|
||||
frequency_penalty: 0.0, // 0.0 = disabled
|
||||
mirostat_eta: 0.1, // learning rate
|
||||
mirostat_tau: 5, // target entropy
|
||||
mirostat: 0, // 0/1/2
|
||||
n_predict: 400,
|
||||
penalize_nl: true,
|
||||
presence_penalty: 0.0, // 0.0 = disabled
|
||||
repeat_last_n: 256, // 0 = disable penalty, -1 = context size
|
||||
repeat_penalty: 1.18, // 1.0 = disabled
|
||||
temperature: 0.7,
|
||||
tfs_z: 1.0, // 1.0 = disabled
|
||||
top_k: 40, // <= 0 to use vocab size
|
||||
top_p: 0.5, // 1.0 = disabled
|
||||
typical_p: 1.0, // 1.0 = disabled
|
||||
})
|
||||
|
||||
const llamaStats = signal(null)
|
||||
|
@ -305,29 +334,48 @@
|
|||
`
|
||||
}
|
||||
|
||||
const updateSession = (el) => session.value = { ...session.value, [el.target.name]: el.target.value }
|
||||
const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
|
||||
const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
|
||||
const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }
|
||||
const updateArray = (el) => {
|
||||
const [name, index] = el.target.name.split(".")
|
||||
const newarr = session.value[name].map((v, i) => i == index ? el.target.value : v).filter(x => x !== "")
|
||||
session.value = { ...session.value, [name]: newarr }
|
||||
}
|
||||
const appendArray = () => session.value = { ...session.value, stop: [...session.value.stop, ""] }
|
||||
|
||||
const ParamSlider = ({param, min, max, step, children}) => {
|
||||
const updateParamsFloat = (el) => params.value = { ...params.value, [param]: parseFloat(el.target.value) }
|
||||
return html`
|
||||
<div>
|
||||
<label for="${param}"><code>${param}</code></label>
|
||||
<label for="${param}">${children}</label>
|
||||
<input type="range" id="${param}" min="${min}" max="${max}" step="${step}" name="${param}" value="${params.value[param]}" oninput=${updateParamsFloat} />
|
||||
<span>${params.value[param]}</span>
|
||||
<span>${children}</span>
|
||||
</div>
|
||||
`
|
||||
}
|
||||
|
||||
const ConfigForm = (props) => {
|
||||
const updateSession = (el) => session.value = { ...session.value, [el.target.name]: el.target.value }
|
||||
const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
|
||||
const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
|
||||
const appendArray = () => session.value = { ...session.value, stop: [...session.value.stop, ""] }
|
||||
const updateArray = (el) => {
|
||||
const [name, index] = el.target.name.split(".")
|
||||
const newarr = session.value[name].map((v, i) => i == index ? el.target.value : v).filter(x => x !== "")
|
||||
session.value = { ...session.value, [name]: newarr }
|
||||
}
|
||||
const FloatField = ({label, max, min, name, step, value}) => {
|
||||
return html`
|
||||
<div>
|
||||
<label for="${name}">${label}</label>
|
||||
<input type="range" id="${name}" min="${min}" max="${max}" step="${step}" name="${name}" value="${value}" oninput=${updateParamsFloat} />
|
||||
<span>${value}</span>
|
||||
</div>
|
||||
`
|
||||
};
|
||||
|
||||
const IntField = ({label, max, min, name, value}) => {
|
||||
return html`
|
||||
<div>
|
||||
<label for="${name}">${label}</label>
|
||||
<input type="range" id="${name}" min="${min}" max="${max}" name="${name}" value="${value}" oninput=${updateParamsInt} />
|
||||
<span>${value}</span>
|
||||
</div>
|
||||
`
|
||||
};
|
||||
|
||||
const ConfigForm = (props) => {
|
||||
return html`
|
||||
<form>
|
||||
<fieldset>
|
||||
|
@ -356,7 +404,9 @@
|
|||
<textarea type="text" name="system_cfg" value="${session.value.system_cfg}" rows=4 oninput=${updateSession}/>
|
||||
</div>
|
||||
`}
|
||||
</fieldset>
|
||||
|
||||
<fieldset class="two">
|
||||
<${ParamSlider} min=1 max=1000 step=1 param=n_predict>Predict N tokens<//>
|
||||
<${ParamSlider} min=0 max=1000 step=1 param=repeat_last_n>Penalize last N tokens<//>
|
||||
${params.value.repeat_last_n > 0 && html`
|
||||
|
@ -365,18 +415,26 @@
|
|||
<${ParamSlider} min=0 max=4 step=0.01 param=presence_penalty>Penalize tokens not present in prompt<//>
|
||||
`}
|
||||
<${ParamSlider} min=0 max=2 step=0.01 param=temperature>Temperature<//>
|
||||
${params.value.temperature > 0 && html`
|
||||
<div>
|
||||
<input id=mirostat_0 type=radio name=mirostat checked=${params.value.mirostat == 0} value=0 oninput=${updateParamsFloat} />
|
||||
<label for=mirostat_0>Temperature</label>
|
||||
</fieldset>
|
||||
|
||||
<input id=mirostat_1 type=radio name=mirostat checked=${params.value.mirostat == 1} value=1 oninput=${updateParamsFloat} />
|
||||
<label for=mirostat_1>Mirostat v1</label>
|
||||
${params.value.temperature > 0 && html`
|
||||
<fieldset class="three">
|
||||
<label>
|
||||
<input type=radio name=mirostat checked=${params.value.mirostat == 0} value=0 oninput=${updateParamsFloat} />
|
||||
Temperature
|
||||
</label>
|
||||
|
||||
<input id=mirostat_2 type=radio name=mirostat checked=${params.value.mirostat == 2} value=2 oninput=${updateParamsFloat} />
|
||||
<label for=mirostat_2>Mirostat v2</label>
|
||||
</div>
|
||||
<label><input type=radio name=mirostat checked=${params.value.mirostat == 1} value=1 oninput=${updateParamsFloat} />
|
||||
Mirostat v1
|
||||
</label>
|
||||
|
||||
<label>
|
||||
<input type=radio name=mirostat checked=${params.value.mirostat == 2} value=2 oninput=${updateParamsFloat} />
|
||||
Mirostat v2
|
||||
</label>
|
||||
</fieldset>
|
||||
|
||||
<fieldset class="two">
|
||||
${params.value.mirostat == 0 && html`
|
||||
<${ParamSlider} min=1 max=1000 step=1 param=top_k>Top K<//>
|
||||
<${ParamSlider} min=0 max=1 step=0.01 param=tfs_z>Tail free sampling<//>
|
||||
|
@ -387,14 +445,17 @@
|
|||
<${ParamSlider} min=0 max=1 step=0.01 param=mirostat_eta>Mirostat eta, learning rate<//>
|
||||
<${ParamSlider} min=0 max=1000 step=1 param=mirostat_tau>Mirostat tau, target entropy<//>
|
||||
`}
|
||||
</fieldset>
|
||||
`}
|
||||
</fieldset>
|
||||
</form>
|
||||
`
|
||||
}
|
||||
// poor mans markdown replacement
|
||||
const Markdownish = (params) => {
|
||||
const md = params.text
|
||||
.replace(/&/g, '&')
|
||||
.replace(/</g, '<')
|
||||
.replace(/>/g, '>')
|
||||
.replace(/^#{1,6} (.*)$/gim, '<h3>$1</h3>')
|
||||
.replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
|
||||
.replace(/__(.*?)__/g, '<strong>$1</strong>')
|
||||
|
|
|
@ -613,6 +613,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
|||
fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
|
||||
fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
|
||||
fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
|
||||
fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
|
||||
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
|
@ -632,6 +633,9 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
|||
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
||||
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
|
||||
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
|
||||
fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
|
||||
fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
|
||||
fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
|
||||
#endif
|
||||
fprintf(stdout, " -m FNAME, --model FNAME\n");
|
||||
fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
|
||||
|
@ -708,6 +712,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
|
|||
break;
|
||||
}
|
||||
params.n_gqa = std::stoi(argv[i]);
|
||||
} else if (arg == "-eps" || arg == "--rms-norm-eps") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rms_norm_eps = std::stof(argv[i]);
|
||||
} else if (arg == "--rope-freq-base") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
@ -769,13 +779,19 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
|
|||
}
|
||||
}
|
||||
#else
|
||||
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.", {});
|
||||
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
|
||||
#endif // GGML_USE_CUBLAS
|
||||
} else if (arg == "--low-vram" || arg == "-lv") {
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
params.low_vram = true;
|
||||
#else
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
|
||||
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
|
||||
#endif // GGML_USE_CUBLAS
|
||||
} else if (arg == "--mul-mat-q" || arg == "-mmq") {
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
params.mul_mat_q = true;
|
||||
#else
|
||||
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {});
|
||||
#endif // GGML_USE_CUBLAS
|
||||
} else if (arg == "--main-gpu" || arg == "-mg") {
|
||||
if (++i >= argc) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue