main: update refs -> llama

fix examples/main ref
Olivier Chafik 2024-06-06 15:44:05 +01:00
parent f5f19a236f
commit 8b7c734473
42 changed files with 101 additions and 101 deletions
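
The change itself is mechanical: every `./main`, `./bin/main` and `main.exe` reference in the example scripts and docs becomes `./llama`, `./bin/llama` or `llama.exe`. As a minimal, illustrative sketch (not taken from this commit; it assumes GNU sed, a git checkout, and a hypothetical `examples` path argument), such a repo-wide update could be scripted roughly like this:

```bash
# Illustrative sketch only -- not part of this commit.
# Rewrite references to the old binary name across the examples tree,
# then review the result with `git diff` before committing.
git grep -lE '(\./|bin/)main|main\.exe' -- examples \
  | xargs sed -i \
      -e 's|\./main|./llama|g' \
      -e 's|bin/main|bin/llama|g' \
      -e 's|main\.exe|llama.exe|g'
```

The hunks below show exactly this kind of pairwise replacement.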


@@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
GEN_OPTIONS+=(--threads "$N_THREAD")
fi
-./main "${GEN_OPTIONS[@]}" \
+./llama "${GEN_OPTIONS[@]}" \
--model "$MODEL" \
--in-prefix " " \
--in-suffix "${AI_NAME}:" \


@@ -7,7 +7,7 @@
cd `dirname $0`
cd ..
-./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
+./llama -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
--color \
-f ./prompts/alpaca.txt \
--ctx_size 2048 \


@@ -58,4 +58,4 @@ echo "$2
model=$1
# generate the most likely continuation until the string "===" is found
-./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
+./llama -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs


@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./main $GEN_OPTIONS \
+./llama $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \


@@ -62,7 +62,7 @@ fi
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
echo 'Prompt cache does not exist, building...'
# Default batch_size to 64 here for better user feedback during initial prompt processing
-./main 2>>"$LOG" \
+./llama 2>>"$LOG" \
--batch_size 64 \
"${OPTS[@]}" \
--prompt-cache "$PROMPT_CACHE_FILE" \
@@ -109,13 +109,13 @@ while read -e line; do
printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
-./main 2>>"$LOG" "${OPTS[@]}" \
+./llama 2>>"$LOG" "${OPTS[@]}" \
--prompt-cache "$CUR_PROMPT_CACHE" \
--prompt-cache-all \
--file "$CUR_PROMPT_FILE" \
--reverse-prompt "${USER_NAME}:" \
--n_predict "$n_predict" |
-skip_bytes 1 | # skip BOS token added by ./main
+skip_bytes 1 | # skip BOS token added by ./llama
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
skip_bytes "$n_prompt_len_pre" # print generation
@@ -133,7 +133,7 @@ while read -e line; do
# TODO get both messages in one go
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
-echo >&2 "Couldn't get number of tokens from ./main output!"
+echo >&2 "Couldn't get number of tokens from ./llama output!"
exit 1
fi
@@ -144,7 +144,7 @@ while read -e line; do
fi
# Update cache for next prompt in background, ideally during user input
-./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
+./llama >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
--prompt-cache "$NEXT_PROMPT_CACHE" \
--file "$NEXT_PROMPT_FILE" \
--n_predict 1 &


@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./bin/main $GEN_OPTIONS \
+./bin/llama $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \


@@ -11,6 +11,6 @@ cd ..
#
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
#
-./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
+./llama -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
--repeat_penalty 1.0 --color -i \
-r "User:" -f prompts/chat-with-bob.txt


@@ -25,4 +25,4 @@ Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.b
Now you can use the model with a command like:
-`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
+`$ ./llama -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`


@@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
--use-checkpointing
# predict
-./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
+./bin/llama -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
```
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
@@ -45,7 +45,7 @@ In `main` you can also load multiple LORA adapters, which will then be mixed tog
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
```bash
-./bin/main -m open-llama-3b-v2-q8_0.gguf \
+./bin/llama -m open-llama-3b-v2-q8_0.gguf \
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
```
@@ -55,7 +55,7 @@ You can change how strong each LORA adapter is applied to the base model by usin
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
```bash
-./bin/main -m open-llama-3b-v2-q8_0.gguf \
+./bin/llama -m open-llama-3b-v2-q8_0.gguf \
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin


@@ -19,7 +19,7 @@ fi
set -x
SPLIT=$1/gguf-split
-MAIN=$1/main
+MAIN=$1/llama
WORK_PATH=$TMP_DIR/gguf-split
ROOT_DIR=$(realpath $(dirname $0)/../../)


@@ -7,7 +7,7 @@
cd `dirname $0`
cd ..
-./main --color --instruct --threads 4 \
+./llama --color --instruct --threads 4 \
--model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
--file ./prompts/alpaca.txt \
--batch_size 8 --ctx_size 2048 -n -1 \


@@ -21,7 +21,7 @@ counter=1
echo 'Running'
while IFS= read -r question
do
-exe_cmd="./main -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
+exe_cmd="./llama -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
echo $counter
echo "Current Question: $question"
eval "$exe_cmd"


@@ -1,4 +1,4 @@
-# llama.cpp/example/llama-bench
+# llama.cpp/examples/llama-bench
Performance testing tool for llama.cpp.


@@ -7,7 +7,7 @@
cd `dirname $0`
cd ..
-./main -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
+./llama -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
-n -1 \


@@ -7,7 +7,7 @@
cd `dirname $0`
cd ..
-./main -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
+./llama -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
-n -1 \


@@ -1,4 +1,4 @@
-# llama.cpp/example/main
+# llama.cpp/examples/main
This example program allows you to use various LLaMA language models in an easy and efficient way. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
@@ -20,13 +20,13 @@ To get started right away, run the following command, making sure to use the cor
#### Unix-based systems (Linux, macOS, etc.):
```bash
-./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
+./llama -m models/7B/ggml-model.bin --prompt "Once upon a time"
```
#### Windows:
```powershell
-main.exe -m models\7B\ggml-model.bin --prompt "Once upon a time"
+llama.exe -m models\7B\ggml-model.bin --prompt "Once upon a time"
```
For an interactive experience, try this command:
@@ -34,7 +34,7 @@ For an interactive experience, try this command:
#### Unix-based systems (Linux, macOS, etc.):
```bash
-./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
+./llama -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
'User: Hi
AI: Hello. I am an AI chatbot. Would you like to talk?
User: Sure!
@@ -45,7 +45,7 @@ User:'
#### Windows:
```powershell
-main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
+llama.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
```
The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
@@ -53,13 +53,13 @@ The following command generates "infinite" text from a starting prompt (you can
#### Unix-based systems (Linux, macOS, etc.):
```bash
-./main -m models/7B/ggml-model.bin --ignore-eos -n -1
+./llama -m models/7B/ggml-model.bin --ignore-eos -n -1
```
#### Windows:
```powershell
-main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1
+llama.exe -m models\7B\ggml-model.bin --ignore-eos -n -1
```
## Common Options
@@ -107,7 +107,7 @@ To overcome this limitation, you can use the `--in-prefix` flag to add a space o
The `--in-prefix` flag is used to add a prefix to your input, primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag:
```sh
-./main -r "User:" --in-prefix " "
+./llama -r "User:" --in-prefix " "
```
### In-Suffix
@@ -115,7 +115,7 @@ The `--in-prefix` flag is used to add a prefix to your input, primarily, this is
The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag:
```sh
-./main -r "User:" --in-prefix " " --in-suffix "Assistant:"
+./llama -r "User:" --in-prefix " " --in-suffix "Assistant:"
```
## Context Management


@@ -20,7 +20,7 @@ set -x
SPLIT=$1/gguf-split
QUANTIZE=$1/quantize
-MAIN=$1/main
+MAIN=$1/llama
WORK_PATH=$TMP_DIR/quantize
ROOT_DIR=$(realpath $(dirname $0)/../../)


@@ -8,7 +8,7 @@ if [ "$1" == "-m" ]; then
MODEL="-m $2 "
fi
-./main $MODEL --color \
+./llama $MODEL --color \
-f ./prompts/reason-act.txt \
-i --interactive-first \
--top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \


@@ -70,5 +70,5 @@ cmake --build . --config Release
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
```bash
-$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
+$ bin/llama -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
```


@@ -23,15 +23,15 @@ fi
if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
echo "use $GGML_SYCL_DEVICE as main GPU"
#use signle GPU only
-ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
else
#use multiple GPUs with same max compute units
-ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
fi
#use main GPU only
-#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+#ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
#use multiple GPUs with same max compute units
-#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+#ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0


@@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
--no-checkpointing
# predict
-./bin/main -m ggml-shakespeare-256x16-f32.gguf
+./bin/llama -m ggml-shakespeare-256x16-f32.gguf
```
Output files will be saved every N iterations (config with `--save-every N`).