main: update refs -> llama
fix examples/main ref
This commit is contained in:
parent
f5f19a236f
commit
8b7c734473
42 changed files with 101 additions and 101 deletions
|
@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
|
|||
GEN_OPTIONS+=(--threads "$N_THREAD")
|
||||
fi
|
||||
|
||||
./main "${GEN_OPTIONS[@]}" \
|
||||
./llama "${GEN_OPTIONS[@]}" \
|
||||
--model "$MODEL" \
|
||||
--in-prefix " " \
|
||||
--in-suffix "${AI_NAME}:" \
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
|
||||
./llama -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
|
||||
--color \
|
||||
-f ./prompts/alpaca.txt \
|
||||
--ctx_size 2048 \
|
||||
|
|
|
@ -58,4 +58,4 @@ echo "$2
|
|||
model=$1
|
||||
|
||||
# generate the most likely continuation until the string "===" is found
|
||||
./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
|
||||
./llama -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
|
||||
|
|
|
@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
|
|||
$PROMPT_TEMPLATE > $PROMPT_FILE
|
||||
|
||||
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
|
||||
./main $GEN_OPTIONS \
|
||||
./llama $GEN_OPTIONS \
|
||||
--model "$MODEL" \
|
||||
--threads "$N_THREAD" \
|
||||
--n_predict "$N_PREDICTS" \
|
||||
|
|
|
@ -62,7 +62,7 @@ fi
|
|||
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
|
||||
echo 'Prompt cache does not exist, building...'
|
||||
# Default batch_size to 64 here for better user feedback during initial prompt processing
|
||||
./main 2>>"$LOG" \
|
||||
./llama 2>>"$LOG" \
|
||||
--batch_size 64 \
|
||||
"${OPTS[@]}" \
|
||||
--prompt-cache "$PROMPT_CACHE_FILE" \
|
||||
|
@ -109,13 +109,13 @@ while read -e line; do
|
|||
|
||||
printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
|
||||
|
||||
./main 2>>"$LOG" "${OPTS[@]}" \
|
||||
./llama 2>>"$LOG" "${OPTS[@]}" \
|
||||
--prompt-cache "$CUR_PROMPT_CACHE" \
|
||||
--prompt-cache-all \
|
||||
--file "$CUR_PROMPT_FILE" \
|
||||
--reverse-prompt "${USER_NAME}:" \
|
||||
--n_predict "$n_predict" |
|
||||
skip_bytes 1 | # skip BOS token added by ./main
|
||||
skip_bytes 1 | # skip BOS token added by ./llama
|
||||
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
|
||||
skip_bytes "$n_prompt_len_pre" # print generation
|
||||
|
||||
|
@ -133,7 +133,7 @@ while read -e line; do
|
|||
# TODO get both messages in one go
|
||||
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
|
||||
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
|
||||
echo >&2 "Couldn't get number of tokens from ./main output!"
|
||||
echo >&2 "Couldn't get number of tokens from ./llama output!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
@ -144,7 +144,7 @@ while read -e line; do
|
|||
fi
|
||||
|
||||
# Update cache for next prompt in background, ideally during user input
|
||||
./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
|
||||
./llama >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
|
||||
--prompt-cache "$NEXT_PROMPT_CACHE" \
|
||||
--file "$NEXT_PROMPT_FILE" \
|
||||
--n_predict 1 &
|
||||
|
|
|
@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
|
|||
$PROMPT_TEMPLATE > $PROMPT_FILE
|
||||
|
||||
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
|
||||
./bin/main $GEN_OPTIONS \
|
||||
./bin/llama $GEN_OPTIONS \
|
||||
--model "$MODEL" \
|
||||
--threads "$N_THREAD" \
|
||||
--n_predict "$N_PREDICTS" \
|
||||
|
|
|
@ -11,6 +11,6 @@ cd ..
|
|||
#
|
||||
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
|
||||
#
|
||||
./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
|
||||
./llama -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
|
||||
--repeat_penalty 1.0 --color -i \
|
||||
-r "User:" -f prompts/chat-with-bob.txt
|
||||
|
|
|
@ -25,4 +25,4 @@ Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.b
|
|||
|
||||
Now you can use the model with a command like:
|
||||
|
||||
`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
|
||||
`$ ./llama -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
|
||||
|
|
|
@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
|
|||
--use-checkpointing
|
||||
|
||||
# predict
|
||||
./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
|
||||
./bin/llama -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
|
||||
```
|
||||
|
||||
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
|
||||
|
@ -45,7 +45,7 @@ In `main` you can also load multiple LORA adapters, which will then be mixed tog
|
|||
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
|
||||
|
||||
```bash
|
||||
./bin/main -m open-llama-3b-v2-q8_0.gguf \
|
||||
./bin/llama -m open-llama-3b-v2-q8_0.gguf \
|
||||
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
|
||||
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
|
||||
```
|
||||
|
@ -55,7 +55,7 @@ You can change how strong each LORA adapter is applied to the base model by usin
|
|||
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
|
||||
|
||||
```bash
|
||||
./bin/main -m open-llama-3b-v2-q8_0.gguf \
|
||||
./bin/llama -m open-llama-3b-v2-q8_0.gguf \
|
||||
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
|
||||
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
|
||||
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
|
||||
|
|
|
@ -19,7 +19,7 @@ fi
|
|||
set -x
|
||||
|
||||
SPLIT=$1/gguf-split
|
||||
MAIN=$1/main
|
||||
MAIN=$1/llama
|
||||
WORK_PATH=$TMP_DIR/gguf-split
|
||||
ROOT_DIR=$(realpath $(dirname $0)/../../)
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./main --color --instruct --threads 4 \
|
||||
./llama --color --instruct --threads 4 \
|
||||
--model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
|
||||
--file ./prompts/alpaca.txt \
|
||||
--batch_size 8 --ctx_size 2048 -n -1 \
|
||||
|
|
|
@ -21,7 +21,7 @@ counter=1
|
|||
echo 'Running'
|
||||
while IFS= read -r question
|
||||
do
|
||||
exe_cmd="./main -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
|
||||
exe_cmd="./llama -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
|
||||
echo $counter
|
||||
echo "Current Question: $question"
|
||||
eval "$exe_cmd"
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# llama.cpp/example/llama-bench
|
||||
# llama.cpp/examples/llama-bench
|
||||
|
||||
Performance testing tool for llama.cpp.
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./main -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
|
||||
./llama -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
|
||||
--color \
|
||||
--ctx_size 2048 \
|
||||
-n -1 \
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./main -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
|
||||
./llama -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
|
||||
--color \
|
||||
--ctx_size 2048 \
|
||||
-n -1 \
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# llama.cpp/example/main
|
||||
# llama.cpp/examples/main
|
||||
|
||||
This example program allows you to use various LLaMA language models in an easy and efficient way. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
|
||||
|
||||
|
@ -20,13 +20,13 @@ To get started right away, run the following command, making sure to use the cor
|
|||
#### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
|
||||
./llama -m models/7B/ggml-model.bin --prompt "Once upon a time"
|
||||
```
|
||||
|
||||
#### Windows:
|
||||
|
||||
```powershell
|
||||
main.exe -m models\7B\ggml-model.bin --prompt "Once upon a time"
|
||||
llama.exe -m models\7B\ggml-model.bin --prompt "Once upon a time"
|
||||
```
|
||||
|
||||
For an interactive experience, try this command:
|
||||
|
@ -34,7 +34,7 @@ For an interactive experience, try this command:
|
|||
#### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
|
||||
./llama -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
|
||||
'User: Hi
|
||||
AI: Hello. I am an AI chatbot. Would you like to talk?
|
||||
User: Sure!
|
||||
|
@ -45,7 +45,7 @@ User:'
|
|||
#### Windows:
|
||||
|
||||
```powershell
|
||||
main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
|
||||
llama.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
|
||||
```
|
||||
|
||||
The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
|
||||
|
@ -53,13 +53,13 @@ The following command generates "infinite" text from a starting prompt (you can
|
|||
#### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./main -m models/7B/ggml-model.bin --ignore-eos -n -1
|
||||
./llama -m models/7B/ggml-model.bin --ignore-eos -n -1
|
||||
```
|
||||
|
||||
#### Windows:
|
||||
|
||||
```powershell
|
||||
main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1
|
||||
llama.exe -m models\7B\ggml-model.bin --ignore-eos -n -1
|
||||
```
|
||||
|
||||
## Common Options
|
||||
|
@ -107,7 +107,7 @@ To overcome this limitation, you can use the `--in-prefix` flag to add a space o
|
|||
The `--in-prefix` flag is used to add a prefix to your input; primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag:
|
||||
|
||||
```sh
|
||||
./main -r "User:" --in-prefix " "
|
||||
./llama -r "User:" --in-prefix " "
|
||||
```
|
||||
|
||||
### In-Suffix
|
||||
|
@ -115,7 +115,7 @@ The `--in-prefix` flag is used to add a prefix to your input, primarily, this is
|
|||
The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag:
|
||||
|
||||
```sh
|
||||
./main -r "User:" --in-prefix " " --in-suffix "Assistant:"
|
||||
./llama -r "User:" --in-prefix " " --in-suffix "Assistant:"
|
||||
```
|
||||
|
||||
## Context Management
|
||||
|
|
|
@ -20,7 +20,7 @@ set -x
|
|||
|
||||
SPLIT=$1/gguf-split
|
||||
QUANTIZE=$1/quantize
|
||||
MAIN=$1/main
|
||||
MAIN=$1/llama
|
||||
WORK_PATH=$TMP_DIR/quantize
|
||||
ROOT_DIR=$(realpath $(dirname $0)/../../)
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@ if [ "$1" == "-m" ]; then
|
|||
MODEL="-m $2 "
|
||||
fi
|
||||
|
||||
./main $MODEL --color \
|
||||
./llama $MODEL --color \
|
||||
-f ./prompts/reason-act.txt \
|
||||
-i --interactive-first \
|
||||
--top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \
|
||||
|
|
|
@ -70,5 +70,5 @@ cmake --build . --config Release
|
|||
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
|
||||
|
||||
```bash
|
||||
$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
|
||||
$ bin/llama -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
|
||||
```
|
||||
|
|
|
@ -23,15 +23,15 @@ fi
|
|||
if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
|
||||
echo "use $GGML_SYCL_DEVICE as main GPU"
|
||||
#use single GPU only
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
|
||||
else
|
||||
#use multiple GPUs with same max compute units
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
fi
|
||||
|
||||
#use main GPU only
|
||||
#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
|
||||
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
|
||||
|
||||
#use multiple GPUs with same max compute units
|
||||
#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
|
||||
|
|
|
@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
|
|||
--no-checkpointing
|
||||
|
||||
# predict
|
||||
./bin/main -m ggml-shakespeare-256x16-f32.gguf
|
||||
./bin/llama -m ggml-shakespeare-256x16-f32.gguf
|
||||
```
|
||||
|
||||
Output files will be saved every N iterations (config with `--save-every N`).
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue