main: update refs -> llama

fix examples/main ref
Olivier Chafik 2024-06-06 15:44:05 +01:00
parent f5f19a236f
commit 8b7c734473
42 changed files with 101 additions and 101 deletions
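
The change itself is mechanical: every `./main`, `./bin/main` and `main.exe` reference in the example scripts and docs becomes `./llama`, `./bin/llama` or `llama.exe`. As a minimal, illustrative sketch (not taken from this commit; it assumes GNU sed, a git checkout, and a hypothetical `examples` path argument), such a repo-wide update could be scripted roughly like this:

```bash
# Illustrative sketch only -- not part of this commit.
# Rewrite references to the old binary name across the examples tree,
# then review the result with `git diff` before committing.
git grep -lE '(\./|bin/)main|main\.exe' -- examples \
  | xargs sed -i \
      -e 's|\./main|./llama|g' \
      -e 's|bin/main|bin/llama|g' \
      -e 's|main\.exe|llama.exe|g'
```

The hunks below show exactly this kind of pairwise replacement.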


@@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
GEN_OPTIONS+=(--threads "$N_THREAD")
fi
-./main "${GEN_OPTIONS[@]}" \
+./llama "${GEN_OPTIONS[@]}" \
--model "$MODEL" \
--in-prefix " " \
--in-suffix "${AI_NAME}:" \


@@ -7,7 +7,7 @@
cd `dirname $0`
cd ..
-./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
+./llama -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
--color \
-f ./prompts/alpaca.txt \
--ctx_size 2048 \


@@ -58,4 +58,4 @@ echo "$2
model=$1
# generate the most likely continuation until the string "===" is found
-./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
+./llama -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs


@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./main $GEN_OPTIONS \
+./llama $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \


@@ -62,7 +62,7 @@ fi
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
echo 'Prompt cache does not exist, building...'
# Default batch_size to 64 here for better user feedback during initial prompt processing
-./main 2>>"$LOG" \
+./llama 2>>"$LOG" \
--batch_size 64 \
"${OPTS[@]}" \
--prompt-cache "$PROMPT_CACHE_FILE" \
@@ -109,13 +109,13 @@ while read -e line; do
printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
-./main 2>>"$LOG" "${OPTS[@]}" \
+./llama 2>>"$LOG" "${OPTS[@]}" \
--prompt-cache "$CUR_PROMPT_CACHE" \
--prompt-cache-all \
--file "$CUR_PROMPT_FILE" \
--reverse-prompt "${USER_NAME}:" \
--n_predict "$n_predict" |
-skip_bytes 1 | # skip BOS token added by ./main
+skip_bytes 1 | # skip BOS token added by ./llama
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
skip_bytes "$n_prompt_len_pre" # print generation
@@ -133,7 +133,7 @@ while read -e line; do
# TODO get both messages in one go
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
-echo >&2 "Couldn't get number of tokens from ./main output!"
+echo >&2 "Couldn't get number of tokens from ./llama output!"
exit 1
fi
@@ -144,7 +144,7 @@ while read -e line; do
fi
# Update cache for next prompt in background, ideally during user input
-./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
+./llama >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
--prompt-cache "$NEXT_PROMPT_CACHE" \
--file "$NEXT_PROMPT_FILE" \
--n_predict 1 &


@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./bin/main $GEN_OPTIONS \
+./bin/llama $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \


@@ -11,6 +11,6 @@ cd ..
#
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
#
-./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
+./llama -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
--repeat_penalty 1.0 --color -i \
-r "User:" -f prompts/chat-with-bob.txt


@@ -25,4 +25,4 @@ Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.b
Now you can use the model with a command like:
-`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
+`$ ./llama -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`


@@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
--use-checkpointing
# predict
-./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
+./bin/llama -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
```
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
@@ -45,7 +45,7 @@ In `main` you can also load multiple LORA adapters, which will then be mixed tog
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
```bash
-./bin/main -m open-llama-3b-v2-q8_0.gguf \
+./bin/llama -m open-llama-3b-v2-q8_0.gguf \
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
```
@@ -55,7 +55,7 @@ You can change how strong each LORA adapter is applied to the base model by usin
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
```bash
-./bin/main -m open-llama-3b-v2-q8_0.gguf \
+./bin/llama -m open-llama-3b-v2-q8_0.gguf \
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin


@@ -19,7 +19,7 @@ fi
set -x
SPLIT=$1/gguf-split
-MAIN=$1/main
+MAIN=$1/llama
WORK_PATH=$TMP_DIR/gguf-split
ROOT_DIR=$(realpath $(dirname $0)/../../)


@@ -7,7 +7,7 @@
cd `dirname $0`
cd ..
-./main --color --instruct --threads 4 \
+./llama --color --instruct --threads 4 \
--model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
--file ./prompts/alpaca.txt \
--batch_size 8 --ctx_size 2048 -n -1 \


@@ -21,7 +21,7 @@ counter=1
echo 'Running'
while IFS= read -r question
do
-exe_cmd="./main -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
+exe_cmd="./llama -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
echo $counter
echo "Current Question: $question"
eval "$exe_cmd"


@@ -1,4 +1,4 @@
-# llama.cpp/example/llama-bench
+# llama.cpp/examples/llama-bench
Performance testing tool for llama.cpp.


@@ -7,7 +7,7 @@
cd `dirname $0`
cd ..
-./main -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
+./llama -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
-n -1 \


@@ -7,7 +7,7 @@
cd `dirname $0`
cd ..
-./main -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
+./llama -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
-n -1 \


@@ -1,4 +1,4 @@
-# llama.cpp/example/main
+# llama.cpp/examples/main
This example program allows you to use various LLaMA language models in an easy and efficient way. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
@@ -20,13 +20,13 @@ To get started right away, run the following command, making sure to use the cor
#### Unix-based systems (Linux, macOS, etc.):
```bash
-./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
+./llama -m models/7B/ggml-model.bin --prompt "Once upon a time"
```
#### Windows:
```powershell
-main.exe -m models\7B\ggml-model.bin --prompt "Once upon a time"
+llama.exe -m models\7B\ggml-model.bin --prompt "Once upon a time"
```
For an interactive experience, try this command:
@@ -34,7 +34,7 @@ For an interactive experience, try this command:
#### Unix-based systems (Linux, macOS, etc.):
```bash
-./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
+./llama -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
'User: Hi
AI: Hello. I am an AI chatbot. Would you like to talk?
User: Sure!
@@ -45,7 +45,7 @@ User:'
#### Windows:
```powershell
-main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
+llama.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
```
The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
@@ -53,13 +53,13 @@ The following command generates "infinite" text from a starting prompt (you can
#### Unix-based systems (Linux, macOS, etc.):
```bash
-./main -m models/7B/ggml-model.bin --ignore-eos -n -1
+./llama -m models/7B/ggml-model.bin --ignore-eos -n -1
```
#### Windows:
```powershell
-main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1
+llama.exe -m models\7B\ggml-model.bin --ignore-eos -n -1
```
## Common Options
@@ -107,7 +107,7 @@ To overcome this limitation, you can use the `--in-prefix` flag to add a space o
The `--in-prefix` flag is used to add a prefix to your input, primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag:
```sh
-./main -r "User:" --in-prefix " "
+./llama -r "User:" --in-prefix " "
```
### In-Suffix
@@ -115,7 +115,7 @@ The `--in-prefix` flag is used to add a prefix to your input, primarily, this is
The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag:
```sh
-./main -r "User:" --in-prefix " " --in-suffix "Assistant:"
+./llama -r "User:" --in-prefix " " --in-suffix "Assistant:"
```
## Context Management


@@ -20,7 +20,7 @@ set -x
SPLIT=$1/gguf-split
QUANTIZE=$1/quantize
-MAIN=$1/main
+MAIN=$1/llama
WORK_PATH=$TMP_DIR/quantize
ROOT_DIR=$(realpath $(dirname $0)/../../)


@@ -8,7 +8,7 @@ if [ "$1" == "-m" ]; then
MODEL="-m $2 "
fi
-./main $MODEL --color \
+./llama $MODEL --color \
-f ./prompts/reason-act.txt \
-i --interactive-first \
--top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \


@@ -70,5 +70,5 @@ cmake --build . --config Release
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
```bash
-$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
+$ bin/llama -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
```


@@ -23,15 +23,15 @@ fi
if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
echo "use $GGML_SYCL_DEVICE as main GPU"
#use signle GPU only
-ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
else
#use multiple GPUs with same max compute units
-ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
fi
#use main GPU only
-#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+#ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
#use multiple GPUs with same max compute units
-#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+#ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0


@@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
--no-checkpointing
# predict
-./bin/main -m ggml-shakespeare-256x16-f32.gguf
+./bin/llama -m ggml-shakespeare-256x16-f32.gguf
```
Output files will be saved every N iterations (config with `--save-every N`).