rename llama|main -> llama-cli; consistent RPM bin prefixes
This commit is contained in:
parent
347f30803f
commit
5265c15d4c
51 changed files with 142 additions and 144 deletions
|
@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
|
|||
GEN_OPTIONS+=(--threads "$N_THREAD")
|
||||
fi
|
||||
|
||||
./llama "${GEN_OPTIONS[@]}" \
|
||||
./llama-cli "${GEN_OPTIONS[@]}" \
|
||||
--model "$MODEL" \
|
||||
--in-prefix " " \
|
||||
--in-suffix "${AI_NAME}:" \
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./llama -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
|
||||
./llama-cli -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
|
||||
--color \
|
||||
-f ./prompts/alpaca.txt \
|
||||
--ctx_size 2048 \
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set(TARGET llama-baby)
|
||||
set(TARGET llama-baby-llama)
|
||||
add_executable(${TARGET} baby-llama.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
|
|
@ -58,4 +58,4 @@ echo "$2
|
|||
model=$1
|
||||
|
||||
# generate the most likely continuation until the string "===" is found
|
||||
./llama -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
|
||||
./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
|
||||
|
|
|
@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
|
|||
$PROMPT_TEMPLATE > $PROMPT_FILE
|
||||
|
||||
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
|
||||
./llama $GEN_OPTIONS \
|
||||
./llama-cli $GEN_OPTIONS \
|
||||
--model "$MODEL" \
|
||||
--threads "$N_THREAD" \
|
||||
--n_predict "$N_PREDICTS" \
|
||||
|
|
|
@ -62,7 +62,7 @@ fi
|
|||
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
|
||||
echo 'Prompt cache does not exist, building...'
|
||||
# Default batch_size to 64 here for better user feedback during initial prompt processing
|
||||
./llama 2>>"$LOG" \
|
||||
./llama-cli 2>>"$LOG" \
|
||||
--batch_size 64 \
|
||||
"${OPTS[@]}" \
|
||||
--prompt-cache "$PROMPT_CACHE_FILE" \
|
||||
|
@ -109,13 +109,13 @@ while read -e line; do
|
|||
|
||||
printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
|
||||
|
||||
./llama 2>>"$LOG" "${OPTS[@]}" \
|
||||
./llama-cli 2>>"$LOG" "${OPTS[@]}" \
|
||||
--prompt-cache "$CUR_PROMPT_CACHE" \
|
||||
--prompt-cache-all \
|
||||
--file "$CUR_PROMPT_FILE" \
|
||||
--reverse-prompt "${USER_NAME}:" \
|
||||
--n_predict "$n_predict" |
|
||||
skip_bytes 1 | # skip BOS token added by ./llama
|
||||
skip_bytes 1 | # skip BOS token added by ./llama-cli
|
||||
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
|
||||
skip_bytes "$n_prompt_len_pre" # print generation
|
||||
|
||||
|
@ -133,7 +133,7 @@ while read -e line; do
|
|||
# TODO get both messages in one go
|
||||
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
|
||||
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
|
||||
echo >&2 "Couldn't get number of tokens from ./llama output!"
|
||||
echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
@ -144,7 +144,7 @@ while read -e line; do
|
|||
fi
|
||||
|
||||
# Update cache for next prompt in background, ideally during user input
|
||||
./llama >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
|
||||
./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
|
||||
--prompt-cache "$NEXT_PROMPT_CACHE" \
|
||||
--file "$NEXT_PROMPT_FILE" \
|
||||
--n_predict 1 &
|
||||
|
|
|
@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
|
|||
$PROMPT_TEMPLATE > $PROMPT_FILE
|
||||
|
||||
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
|
||||
./bin/llama $GEN_OPTIONS \
|
||||
./bin/llama-cli $GEN_OPTIONS \
|
||||
--model "$MODEL" \
|
||||
--threads "$N_THREAD" \
|
||||
--n_predict "$N_PREDICTS" \
|
||||
|
|
|
@ -11,6 +11,6 @@ cd ..
|
|||
#
|
||||
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
|
||||
#
|
||||
./llama -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
|
||||
./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
|
||||
--repeat_penalty 1.0 --color -i \
|
||||
-r "User:" -f prompts/chat-with-bob.txt
|
||||
|
|
|
@ -25,4 +25,4 @@ Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.b
|
|||
|
||||
Now you can use the model with a command like:
|
||||
|
||||
`$ ./llama -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
|
||||
`$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
|
||||
|
|
|
@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
|
|||
--use-checkpointing
|
||||
|
||||
# predict
|
||||
./bin/llama -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
|
||||
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
|
||||
```
|
||||
|
||||
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
|
||||
|
@ -45,7 +45,7 @@ In `main` you can also load multiple LORA adapters, which will then be mixed tog
|
|||
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
|
||||
|
||||
```bash
|
||||
./bin/llama -m open-llama-3b-v2-q8_0.gguf \
|
||||
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
|
||||
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
|
||||
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
|
||||
```
|
||||
|
@ -55,7 +55,7 @@ You can change how strong each LORA adapter is applied to the base model by usin
|
|||
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
|
||||
|
||||
```bash
|
||||
./bin/llama -m open-llama-3b-v2-q8_0.gguf \
|
||||
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
|
||||
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
|
||||
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
|
||||
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
|
||||
|
|
|
@ -19,7 +19,7 @@ fi
|
|||
set -x
|
||||
|
||||
SPLIT=$1/gguf-split
|
||||
MAIN=$1/llama
|
||||
MAIN=$1/llama-cli
|
||||
WORK_PATH=$TMP_DIR/gguf-split
|
||||
ROOT_DIR=$(realpath $(dirname $0)/../../)
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./llama --color --instruct --threads 4 \
|
||||
./llama-cli --color --instruct --threads 4 \
|
||||
--model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
|
||||
--file ./prompts/alpaca.txt \
|
||||
--batch_size 8 --ctx_size 2048 -n -1 \
|
||||
|
|
|
@ -21,7 +21,7 @@ counter=1
|
|||
echo 'Running'
|
||||
while IFS= read -r question
|
||||
do
|
||||
exe_cmd="./llama -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
|
||||
exe_cmd="./llama-cli -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
|
||||
echo $counter
|
||||
echo "Current Question: $question"
|
||||
eval "$exe_cmd"
|
||||
|
|
|
@ -524,7 +524,7 @@ class SchemaConverter:
|
|||
def main(args_in = None):
|
||||
parser = argparse.ArgumentParser(
|
||||
description='''
|
||||
Generates a grammar (suitable for use in ./llama) that produces JSON conforming to a
|
||||
Generates a grammar (suitable for use in ./llama-cli) that produces JSON conforming to a
|
||||
given JSON schema. Only a subset of JSON schema features are supported; more may be
|
||||
added in the future.
|
||||
''',
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./llama -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
|
||||
./llama-cli -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
|
||||
--color \
|
||||
--ctx_size 2048 \
|
||||
-n -1 \
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./llama -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
|
||||
./llama-cli -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
|
||||
--color \
|
||||
--ctx_size 2048 \
|
||||
-n -1 \
|
||||
|
|
|
@ -30,9 +30,9 @@ if(TARGET BUILD_INFO)
|
|||
add_dependencies(llava BUILD_INFO)
|
||||
endif()
|
||||
|
||||
set(TARGET llama-llava)
|
||||
set(TARGET llama-llava-cli)
|
||||
add_executable(${TARGET} llava-cli.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
|
|
@ -9,12 +9,12 @@ The implementation is based on llava, and is compatible with llava and mobileVLM
|
|||
Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.
|
||||
|
||||
## Usage
|
||||
Build with cmake or run `make llama-llava` to build it.
|
||||
Build with cmake or run `make llama-llava-cli` to build it.
|
||||
|
||||
After building, run: `./llama-llava` to see the usage. For example:
|
||||
After building, run: `./llama-llava-cli` to see the usage. For example:
|
||||
|
||||
```sh
|
||||
./llama-llava -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
|
||||
./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
|
||||
--mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
|
||||
--image path/to/an/image.jpg \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
|
||||
|
@ -82,7 +82,7 @@ refer to `android/adb_run.sh`, modify resources' `name` and `path`
|
|||
### case 1
|
||||
**input**
|
||||
```sh
|
||||
/data/local/tmp/llama-llava \
|
||||
/data/local/tmp/llama-llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-t 4 \
|
||||
|
@ -102,7 +102,7 @@ llama_print_timings: total time = 34731.93 ms
|
|||
### case 2
|
||||
**input**
|
||||
```sh
|
||||
/data/local/tmp/llama-llava \
|
||||
/data/local/tmp/llama-llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-t 4 \
|
||||
|
@ -126,7 +126,7 @@ llama_print_timings: total time = 34570.79 ms
|
|||
#### llava-cli release-b2005
|
||||
**input**
|
||||
```sh
|
||||
/data/local/tmp/llama-llava \
|
||||
/data/local/tmp/llama-llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-t 4 \
|
||||
|
@ -200,7 +200,7 @@ make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
|
|||
### case 1
|
||||
**input**
|
||||
```sh
|
||||
./llama-llava \
|
||||
./llama-llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
--image /data/local/tmp/demo.jpeg \
|
||||
|
@ -224,7 +224,7 @@ llama_print_timings: total time = 1352.63 ms / 252 tokens
|
|||
### case 2
|
||||
**input**
|
||||
```sh
|
||||
./llama-llava \
|
||||
./llama-llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \
|
||||
|
|
|
@ -11,12 +11,12 @@ For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](h
|
|||
After API is confirmed, more models will be supported / uploaded.
|
||||
|
||||
## Usage
|
||||
Build with cmake or run `make llama-llava` to build it.
|
||||
Build with cmake or run `make llama-llava-cli` to build it.
|
||||
|
||||
After building, run: `./llama-llava` to see the usage. For example:
|
||||
After building, run: `./llama-llava-cli` to see the usage. For example:
|
||||
|
||||
```sh
|
||||
./llama-llava -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
|
||||
./llama-llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
|
||||
```
|
||||
|
||||
**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
|
||||
|
@ -97,7 +97,7 @@ python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknow
|
|||
|
||||
7) And finally we can run the llava cli using the 1.6 model version:
|
||||
```console
|
||||
./llama-llava -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
|
||||
./llama-llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
|
||||
```
|
||||
|
||||
**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
|
||||
|
|
|
@ -10,7 +10,7 @@ prompt="A chat between a curious user and an artificial intelligence assistant.
|
|||
# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
|
||||
|
||||
program_dir="build_64/bin"
|
||||
binName="llama-llava"
|
||||
binName="llama-llava-cli"
|
||||
n_threads=4
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
set(TARGET llama-cli)
|
||||
add_executable(${TARGET} main.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
|
|
@ -20,7 +20,7 @@ To get started right away, run the following command, making sure to use the cor
|
|||
#### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./llama -m models/7B/ggml-model.bin --prompt "Once upon a time"
|
||||
./llama-cli -m models/7B/ggml-model.bin --prompt "Once upon a time"
|
||||
```
|
||||
|
||||
#### Windows:
|
||||
|
@ -34,7 +34,7 @@ For an interactive experience, try this command:
|
|||
#### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./llama -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
|
||||
./llama-cli -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
|
||||
'User: Hi
|
||||
AI: Hello. I am an AI chatbot. Would you like to talk?
|
||||
User: Sure!
|
||||
|
@ -53,7 +53,7 @@ The following command generates "infinite" text from a starting prompt (you can
|
|||
#### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./llama -m models/7B/ggml-model.bin --ignore-eos -n -1
|
||||
./llama-cli -m models/7B/ggml-model.bin --ignore-eos -n -1
|
||||
```
|
||||
|
||||
#### Windows:
|
||||
|
@ -107,7 +107,7 @@ To overcome this limitation, you can use the `--in-prefix` flag to add a space o
|
|||
The `--in-prefix` flag is used to add a prefix to your input, primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag:
|
||||
|
||||
```sh
|
||||
./llama -r "User:" --in-prefix " "
|
||||
./llama-cli -r "User:" --in-prefix " "
|
||||
```
|
||||
|
||||
### In-Suffix
|
||||
|
@ -115,7 +115,7 @@ The `--in-prefix` flag is used to add a prefix to your input, primarily, this is
|
|||
The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag:
|
||||
|
||||
```sh
|
||||
./llama -r "User:" --in-prefix " " --in-suffix "Assistant:"
|
||||
./llama-cli -r "User:" --in-prefix " " --in-suffix "Assistant:"
|
||||
```
|
||||
|
||||
## Context Management
|
||||
|
|
|
@ -20,7 +20,7 @@ set -x
|
|||
|
||||
SPLIT=$1/gguf-split
|
||||
QUANTIZE=$1/quantize
|
||||
MAIN=$1/llama
|
||||
MAIN=$1/llama-cli
|
||||
WORK_PATH=$TMP_DIR/quantize
|
||||
ROOT_DIR=$(realpath $(dirname $0)/../../)
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@ if [ "$1" == "-m" ]; then
|
|||
MODEL="-m $2 "
|
||||
fi
|
||||
|
||||
./llama $MODEL --color \
|
||||
./llama-cli $MODEL --color \
|
||||
-f ./prompts/reason-act.txt \
|
||||
-i --interactive-first \
|
||||
--top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \
|
||||
|
|
|
@ -70,5 +70,5 @@ cmake --build . --config Release
|
|||
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
|
||||
|
||||
```bash
|
||||
$ bin/llama -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
|
||||
$ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
|
||||
```
|
||||
|
|
|
@ -23,15 +23,15 @@ fi
|
|||
if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
|
||||
echo "use $GGML_SYCL_DEVICE as main GPU"
|
||||
#use signle GPU only
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
|
||||
else
|
||||
#use multiple GPUs with same max compute units
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
fi
|
||||
|
||||
#use main GPU only
|
||||
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
|
||||
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
|
||||
|
||||
#use multiple GPUs with same max compute units
|
||||
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
|
||||
|
|
|
@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
|
|||
--no-checkpointing
|
||||
|
||||
# predict
|
||||
./bin/llama -m ggml-shakespeare-256x16-f32.gguf
|
||||
./bin/llama-cli -m ggml-shakespeare-256x16-f32.gguf
|
||||
```
|
||||
|
||||
Output files will be saved every N iterations (config with `--save-every N`).
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue