rename llama|main -> llama-cli; consistent RPM bin prefixes

Olivier Chafik 2024-06-10 15:34:14 +01:00
parent 347f30803f
commit 5265c15d4c
51 changed files with 142 additions and 144 deletions
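
Only the binary names change in this commit; flags and behavior stay the same. A minimal before/after sketch of an invocation (the model path is illustrative, mirroring the README examples updated below):

```sh
# before this commit: the CLI binary was named `llama` (formerly `main`)
./llama -m models/7B/ggml-model.bin --prompt "Once upon a time"

# after this commit: the same invocation with the renamed binary
./llama-cli -m models/7B/ggml-model.bin --prompt "Once upon a time"
```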

View file

@@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
GEN_OPTIONS+=(--threads "$N_THREAD")
fi
./llama "${GEN_OPTIONS[@]}" \
./llama-cli "${GEN_OPTIONS[@]}" \
--model "$MODEL" \
--in-prefix " " \
--in-suffix "${AI_NAME}:" \

View file

@@ -7,7 +7,7 @@
cd `dirname $0`
cd ..
./llama -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
./llama-cli -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
--color \
-f ./prompts/alpaca.txt \
--ctx_size 2048 \

View file

@@ -1,4 +1,4 @@
set(TARGET llama-baby)
set(TARGET llama-baby-llama)
add_executable(${TARGET} baby-llama.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

View file

@@ -58,4 +58,4 @@ echo "$2
model=$1
# generate the most likely continuation until the string "===" is found
./llama -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs

View file

@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./llama $GEN_OPTIONS \
./llama-cli $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \

View file

@@ -62,7 +62,7 @@ fi
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
echo 'Prompt cache does not exist, building...'
# Default batch_size to 64 here for better user feedback during initial prompt processing
./llama 2>>"$LOG" \
./llama-cli 2>>"$LOG" \
--batch_size 64 \
"${OPTS[@]}" \
--prompt-cache "$PROMPT_CACHE_FILE" \
@@ -109,13 +109,13 @@ while read -e line; do
printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
./llama 2>>"$LOG" "${OPTS[@]}" \
./llama-cli 2>>"$LOG" "${OPTS[@]}" \
--prompt-cache "$CUR_PROMPT_CACHE" \
--prompt-cache-all \
--file "$CUR_PROMPT_FILE" \
--reverse-prompt "${USER_NAME}:" \
--n_predict "$n_predict" |
skip_bytes 1 | # skip BOS token added by ./llama
skip_bytes 1 | # skip BOS token added by ./llama-cli
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
skip_bytes "$n_prompt_len_pre" # print generation
@@ -133,7 +133,7 @@ while read -e line; do
# TODO get both messages in one go
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
echo >&2 "Couldn't get number of tokens from ./llama output!"
echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
exit 1
fi
@@ -144,7 +144,7 @@ while read -e line; do
fi
# Update cache for next prompt in background, ideally during user input
./llama >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
--prompt-cache "$NEXT_PROMPT_CACHE" \
--file "$NEXT_PROMPT_FILE" \
--n_predict 1 &

View file

@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./bin/llama $GEN_OPTIONS \
./bin/llama-cli $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \

View file

@@ -11,6 +11,6 @@ cd ..
#
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
#
./llama -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
--repeat_penalty 1.0 --color -i \
-r "User:" -f prompts/chat-with-bob.txt

View file

@@ -25,4 +25,4 @@ Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.b
Now you can use the model with a command like:
`$ ./llama -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
`$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`

View file

@@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
--use-checkpointing
# predict
./bin/llama -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
```
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
@@ -45,7 +45,7 @@ In `main` you can also load multiple LORA adapters, which will then be mixed tog
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
```bash
./bin/llama -m open-llama-3b-v2-q8_0.gguf \
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
```
@@ -55,7 +55,7 @@ You can change how strong each LORA adapter is applied to the base model by usin
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
```bash
./bin/llama -m open-llama-3b-v2-q8_0.gguf \
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin

View file

@@ -19,7 +19,7 @@ fi
set -x
SPLIT=$1/gguf-split
MAIN=$1/llama
MAIN=$1/llama-cli
WORK_PATH=$TMP_DIR/gguf-split
ROOT_DIR=$(realpath $(dirname $0)/../../)

View file

@@ -7,7 +7,7 @@
cd `dirname $0`
cd ..
./llama --color --instruct --threads 4 \
./llama-cli --color --instruct --threads 4 \
--model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
--file ./prompts/alpaca.txt \
--batch_size 8 --ctx_size 2048 -n -1 \

View file

@@ -21,7 +21,7 @@ counter=1
echo 'Running'
while IFS= read -r question
do
exe_cmd="./llama -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
exe_cmd="./llama-cli -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
echo $counter
echo "Current Question: $question"
eval "$exe_cmd"

View file

@@ -524,7 +524,7 @@ class SchemaConverter:
def main(args_in = None):
parser = argparse.ArgumentParser(
description='''
Generates a grammar (suitable for use in ./llama) that produces JSON conforming to a
Generates a grammar (suitable for use in ./llama-cli) that produces JSON conforming to a
given JSON schema. Only a subset of JSON schema features are supported; more may be
added in the future.
''',

View file

@@ -7,7 +7,7 @@
cd `dirname $0`
cd ..
./llama -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
./llama-cli -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
-n -1 \

View file

@@ -7,7 +7,7 @@
cd `dirname $0`
cd ..
./llama -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
./llama-cli -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
-n -1 \

View file

@@ -30,9 +30,9 @@ if(TARGET BUILD_INFO)
add_dependencies(llava BUILD_INFO)
endif()
set(TARGET llama-llava)
set(TARGET llama-llava-cli)
add_executable(${TARGET} llava-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@@ -9,12 +9,12 @@ The implementation is based on llava, and is compatible with llava and mobileVLM
Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.
## Usage
Build with cmake or run `make llama-llava` to build it.
Build with cmake or run `make llama-llava-cli` to build it.
After building, run: `./llama-llava` to see the usage. For example:
After building, run: `./llama-llava-cli` to see the usage. For example:
```sh
./llama-llava -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
--mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
--image path/to/an/image.jpg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
@@ -82,7 +82,7 @@ refer to `android/adb_run.sh`, modify resources' `name` and `path`
### case 1
**input**
```sh
/data/local/tmp/llama-llava \
/data/local/tmp/llama-llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
@@ -102,7 +102,7 @@ llama_print_timings: total time = 34731.93 ms
### case 2
**input**
```sh
/data/local/tmp/llama-llava \
/data/local/tmp/llama-llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
@@ -126,7 +126,7 @@ llama_print_timings: total time = 34570.79 ms
#### llava-cli release-b2005
**input**
```sh
/data/local/tmp/llama-llava \
/data/local/tmp/llama-llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
@@ -200,7 +200,7 @@ make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
### case 1
**input**
```sh
./llama-llava \
./llama-llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
--image /data/local/tmp/demo.jpeg \
@@ -224,7 +224,7 @@ llama_print_timings: total time = 1352.63 ms / 252 tokens
### case 2
**input**
```sh
./llama-llava \
./llama-llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \

View file

@@ -11,12 +11,12 @@ For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](h
After API is confirmed, more models will be supported / uploaded.
## Usage
Build with cmake or run `make llama-llava` to build it.
Build with cmake or run `make llama-llava-cli` to build it.
After building, run: `./llama-llava` to see the usage. For example:
After building, run: `./llama-llava-cli` to see the usage. For example:
```sh
./llama-llava -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
./llama-llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
```
**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
@@ -97,7 +97,7 @@ python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknow
7) And finally we can run the llava cli using the 1.6 model version:
```console
./llama-llava -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
./llama-llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
```
**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)

View file

@@ -10,7 +10,7 @@ prompt="A chat between a curious user and an artificial intelligence assistant.
# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
program_dir="build_64/bin"
binName="llama-llava"
binName="llama-llava-cli"
n_threads=4

View file

@@ -1,6 +1,5 @@
set(TARGET llama-cli)
add_executable(${TARGET} main.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@@ -20,7 +20,7 @@ To get started right away, run the following command, making sure to use the cor
#### Unix-based systems (Linux, macOS, etc.):
```bash
./llama -m models/7B/ggml-model.bin --prompt "Once upon a time"
./llama-cli -m models/7B/ggml-model.bin --prompt "Once upon a time"
```
#### Windows:
@@ -34,7 +34,7 @@ For an interactive experience, try this command:
#### Unix-based systems (Linux, macOS, etc.):
```bash
./llama -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
./llama-cli -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
'User: Hi
AI: Hello. I am an AI chatbot. Would you like to talk?
User: Sure!
@@ -53,7 +53,7 @@ The following command generates "infinite" text from a starting prompt (you can
#### Unix-based systems (Linux, macOS, etc.):
```bash
./llama -m models/7B/ggml-model.bin --ignore-eos -n -1
./llama-cli -m models/7B/ggml-model.bin --ignore-eos -n -1
```
#### Windows:
@@ -107,7 +107,7 @@ To overcome this limitation, you can use the `--in-prefix` flag to add a space o
The `--in-prefix` flag is used to add a prefix to your input, primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag:
```sh
./llama -r "User:" --in-prefix " "
./llama-cli -r "User:" --in-prefix " "
```
### In-Suffix
@@ -115,7 +115,7 @@ The `--in-prefix` flag is used to add a prefix to your input, primarily, this is
The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag:
```sh
./llama -r "User:" --in-prefix " " --in-suffix "Assistant:"
./llama-cli -r "User:" --in-prefix " " --in-suffix "Assistant:"
```
## Context Management

View file

@@ -20,7 +20,7 @@ set -x
SPLIT=$1/gguf-split
QUANTIZE=$1/quantize
MAIN=$1/llama
MAIN=$1/llama-cli
WORK_PATH=$TMP_DIR/quantize
ROOT_DIR=$(realpath $(dirname $0)/../../)

View file

@@ -8,7 +8,7 @@ if [ "$1" == "-m" ]; then
MODEL="-m $2 "
fi
./llama $MODEL --color \
./llama-cli $MODEL --color \
-f ./prompts/reason-act.txt \
-i --interactive-first \
--top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \

View file

@@ -70,5 +70,5 @@ cmake --build . --config Release
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
```bash
$ bin/llama -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
$ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
```

View file

@@ -23,15 +23,15 @@ fi
if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
echo "use $GGML_SYCL_DEVICE as main GPU"
#use single GPU only
ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
else
#use multiple GPUs with same max compute units
ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
fi
#use main GPU only
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
#use multiple GPUs with same max compute units
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0

View file

@@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
--no-checkpointing
# predict
./bin/llama -m ggml-shakespeare-256x16-f32.gguf
./bin/llama-cli -m ggml-shakespeare-256x16-f32.gguf
```
Output files will be saved every N iterations (config with `--save-every N`).