rename llama|main -> llama-cli; consistent RPM bin prefixes

Olivier Chafik 2024-06-10 15:34:14 +01:00
parent 347f30803f
commit 5265c15d4c
51 changed files with 142 additions and 144 deletions
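
Only the binary names change in this commit; flags and behavior stay the same. A minimal before/after sketch of an invocation (the model path is illustrative, mirroring the README examples updated below):

```sh
# before this commit: the CLI binary was named `llama` (formerly `main`)
./llama -m models/7B/ggml-model.bin --prompt "Once upon a time"

# after this commit: the same invocation with the renamed binary
./llama-cli -m models/7B/ggml-model.bin --prompt "Once upon a time"
```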

View file

@@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
GEN_OPTIONS+=(--threads "$N_THREAD")
fi
./llama "${GEN_OPTIONS[@]}" \
./llama-cli "${GEN_OPTIONS[@]}" \
--model "$MODEL" \
--in-prefix " " \
--in-suffix "${AI_NAME}:" \

View file

@@ -7,7 +7,7 @@
cd `dirname $0`
cd ..
./llama -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
./llama-cli -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
--color \
-f ./prompts/alpaca.txt \
--ctx_size 2048 \

View file

@@ -1,4 +1,4 @@
set(TARGET llama-baby)
set(TARGET llama-baby-llama)
add_executable(${TARGET} baby-llama.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

View file

@@ -58,4 +58,4 @@ echo "$2
model=$1
# generate the most likely continuation until the string "===" is found
./llama -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs

View file

@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./llama $GEN_OPTIONS \
./llama-cli $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \

View file

@@ -62,7 +62,7 @@ fi
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
echo 'Prompt cache does not exist, building...'
# Default batch_size to 64 here for better user feedback during initial prompt processing
./llama 2>>"$LOG" \
./llama-cli 2>>"$LOG" \
--batch_size 64 \
"${OPTS[@]}" \
--prompt-cache "$PROMPT_CACHE_FILE" \
@@ -109,13 +109,13 @@ while read -e line; do
printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
./llama 2>>"$LOG" "${OPTS[@]}" \
./llama-cli 2>>"$LOG" "${OPTS[@]}" \
--prompt-cache "$CUR_PROMPT_CACHE" \
--prompt-cache-all \
--file "$CUR_PROMPT_FILE" \
--reverse-prompt "${USER_NAME}:" \
--n_predict "$n_predict" |
skip_bytes 1 | # skip BOS token added by ./llama
skip_bytes 1 | # skip BOS token added by ./llama-cli
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
skip_bytes "$n_prompt_len_pre" # print generation
@@ -133,7 +133,7 @@ while read -e line; do
# TODO get both messages in one go
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
echo >&2 "Couldn't get number of tokens from ./llama output!"
echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
exit 1
fi
@@ -144,7 +144,7 @@ while read -e line; do
fi
# Update cache for next prompt in background, ideally during user input
./llama >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
--prompt-cache "$NEXT_PROMPT_CACHE" \
--file "$NEXT_PROMPT_FILE" \
--n_predict 1 &

View file

@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./bin/llama $GEN_OPTIONS \
./bin/llama-cli $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \

View file

@@ -11,6 +11,6 @@ cd ..
#
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
#
./llama -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
--repeat_penalty 1.0 --color -i \
-r "User:" -f prompts/chat-with-bob.txt

View file

@@ -25,4 +25,4 @@ Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.b
Now you can use the model with a command like:
`$ ./llama -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
`$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`

View file

@@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
--use-checkpointing
# predict
./bin/llama -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
```
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
@@ -45,7 +45,7 @@ In `main` you can also load multiple LORA adapters, which will then be mixed tog
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
```bash
./bin/llama -m open-llama-3b-v2-q8_0.gguf \
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
```
@@ -55,7 +55,7 @@ You can change how strong each LORA adapter is applied to the base model by usin
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
```bash
./bin/llama -m open-llama-3b-v2-q8_0.gguf \
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin

View file

@@ -19,7 +19,7 @@ fi
set -x
SPLIT=$1/gguf-split
MAIN=$1/llama
MAIN=$1/llama-cli
WORK_PATH=$TMP_DIR/gguf-split
ROOT_DIR=$(realpath $(dirname $0)/../../)

View file

@@ -7,7 +7,7 @@
cd `dirname $0`
cd ..
./llama --color --instruct --threads 4 \
./llama-cli --color --instruct --threads 4 \
--model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
--file ./prompts/alpaca.txt \
--batch_size 8 --ctx_size 2048 -n -1 \

View file

@@ -21,7 +21,7 @@ counter=1
echo 'Running'
while IFS= read -r question
do
exe_cmd="./llama -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
exe_cmd="./llama-cli -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
echo $counter
echo "Current Question: $question"
eval "$exe_cmd"

View file

@@ -524,7 +524,7 @@ class SchemaConverter:
def main(args_in = None):
parser = argparse.ArgumentParser(
description='''
Generates a grammar (suitable for use in ./llama) that produces JSON conforming to a
Generates a grammar (suitable for use in ./llama-cli) that produces JSON conforming to a
given JSON schema. Only a subset of JSON schema features are supported; more may be
added in the future.
''',

View file

@@ -7,7 +7,7 @@
cd `dirname $0`
cd ..
./llama -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
./llama-cli -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
-n -1 \

View file

@@ -7,7 +7,7 @@
cd `dirname $0`
cd ..
./llama -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
./llama-cli -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
-n -1 \

View file

@@ -30,9 +30,9 @@ if(TARGET BUILD_INFO)
add_dependencies(llava BUILD_INFO)
endif()
set(TARGET llama-llava)
set(TARGET llama-llava-cli)
add_executable(${TARGET} llava-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@@ -9,12 +9,12 @@ The implementation is based on llava, and is compatible with llava and mobileVLM
Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.
## Usage
Build with cmake or run `make llama-llava` to build it.
Build with cmake or run `make llama-llava-cli` to build it.
After building, run: `./llama-llava` to see the usage. For example:
After building, run: `./llama-llava-cli` to see the usage. For example:
```sh
./llama-llava -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
--mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
--image path/to/an/image.jpg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
@@ -82,7 +82,7 @@ refer to `android/adb_run.sh`, modify resources' `name` and `path`
### case 1
**input**
```sh
/data/local/tmp/llama-llava \
/data/local/tmp/llama-llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
@@ -102,7 +102,7 @@ llama_print_timings: total time = 34731.93 ms
### case 2
**input**
```sh
/data/local/tmp/llama-llava \
/data/local/tmp/llama-llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
@@ -126,7 +126,7 @@ llama_print_timings: total time = 34570.79 ms
#### llava-cli release-b2005
**input**
```sh
/data/local/tmp/llama-llava \
/data/local/tmp/llama-llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
@@ -200,7 +200,7 @@ make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
### case 1
**input**
```sh
./llama-llava \
./llama-llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
--image /data/local/tmp/demo.jpeg \
@@ -224,7 +224,7 @@ llama_print_timings: total time = 1352.63 ms / 252 tokens
### case 2
**input**
```sh
./llama-llava \
./llama-llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \

View file

@@ -11,12 +11,12 @@ For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](h
After API is confirmed, more models will be supported / uploaded.
## Usage
Build with cmake or run `make llama-llava` to build it.
Build with cmake or run `make llama-llava-cli` to build it.
After building, run: `./llama-llava` to see the usage. For example:
After building, run: `./llama-llava-cli` to see the usage. For example:
```sh
./llama-llava -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
./llama-llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
```
**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
@@ -97,7 +97,7 @@ python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknow
7) And finally we can run the llava cli using the 1.6 model version:
```console
./llama-llava -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
./llama-llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
```
**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)

View file

@@ -10,7 +10,7 @@ prompt="A chat between a curious user and an artificial intelligence assistant.
# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
program_dir="build_64/bin"
binName="llama-llava"
binName="llama-llava-cli"
n_threads=4

View file

@@ -1,6 +1,5 @@
set(TARGET llama-cli)
add_executable(${TARGET} main.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@@ -20,7 +20,7 @@ To get started right away, run the following command, making sure to use the cor
#### Unix-based systems (Linux, macOS, etc.):
```bash
./llama -m models/7B/ggml-model.bin --prompt "Once upon a time"
./llama-cli -m models/7B/ggml-model.bin --prompt "Once upon a time"
```
#### Windows:
@@ -34,7 +34,7 @@ For an interactive experience, try this command:
#### Unix-based systems (Linux, macOS, etc.):
```bash
./llama -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
./llama-cli -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
'User: Hi
AI: Hello. I am an AI chatbot. Would you like to talk?
User: Sure!
@@ -53,7 +53,7 @@ The following command generates "infinite" text from a starting prompt (you can
#### Unix-based systems (Linux, macOS, etc.):
```bash
./llama -m models/7B/ggml-model.bin --ignore-eos -n -1
./llama-cli -m models/7B/ggml-model.bin --ignore-eos -n -1
```
#### Windows:
@@ -107,7 +107,7 @@ To overcome this limitation, you can use the `--in-prefix` flag to add a space o
The `--in-prefix` flag is used to add a prefix to your input, primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag:
```sh
./llama -r "User:" --in-prefix " "
./llama-cli -r "User:" --in-prefix " "
```
### In-Suffix
@@ -115,7 +115,7 @@ The `--in-prefix` flag is used to add a prefix to your input, primarily, this is
The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag:
```sh
./llama -r "User:" --in-prefix " " --in-suffix "Assistant:"
./llama-cli -r "User:" --in-prefix " " --in-suffix "Assistant:"
```
## Context Management

View file

@@ -20,7 +20,7 @@ set -x
SPLIT=$1/gguf-split
QUANTIZE=$1/quantize
MAIN=$1/llama
MAIN=$1/llama-cli
WORK_PATH=$TMP_DIR/quantize
ROOT_DIR=$(realpath $(dirname $0)/../../)

View file

@@ -8,7 +8,7 @@ if [ "$1" == "-m" ]; then
MODEL="-m $2 "
fi
./llama $MODEL --color \
./llama-cli $MODEL --color \
-f ./prompts/reason-act.txt \
-i --interactive-first \
--top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \

View file

@@ -70,5 +70,5 @@ cmake --build . --config Release
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
```bash
$ bin/llama -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
$ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
```

View file

@@ -23,15 +23,15 @@ fi
if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
echo "use $GGML_SYCL_DEVICE as main GPU"
#use single GPU only
ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
else
#use multiple GPUs with same max compute units
ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
fi
#use main GPU only
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
#use multiple GPUs with same max compute units
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0

View file

@@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
--no-checkpointing
# predict
./bin/llama -m ggml-shakespeare-256x16-f32.gguf
./bin/llama-cli -m ggml-shakespeare-256x16-f32.gguf
```
Output files will be saved every N iterations (config with `--save-every N`).