feat: include all-in-one command tool & update readme.md

2023-03-17 10:31:53 +01:00 · 2023-03-17 10:31:53 +01:00 · 79a48d9876
commit 79a48d9876
parent 50fa1a006e
4 changed files with 130 additions and 6 deletions
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@ -1,4 +1,5 @@
 #!/bin/bash
 set -e
 # Read the first argument into a variable
 arg1="$1"
@ -12,13 +13,34 @@ arg2="$@"
 if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
    python3 ./convert-pth-to-ggml.py $arg2
 elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
-    /app/quantize $arg2
+    ./quantize $arg2
 elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
-    /app/main $arg2
+    ./main $arg2
 elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
    python3 ./download-pth.py $arg2
 elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
    # Full pipeline: download the PTH weights, convert them to ggml, then quantize.
    # "$1" and "$2" are forwarded exactly as download-pth.py expects them
    # (models directory, then model type; usage example: "/models/" 7B).
    echo "Downloading model..."
    python3 ./download-pth.py "$1" "$2"
    echo "Converting PTH to GGML..."
    # The conversion step was announced but never executed, leaving nothing to
    # quantize. The trailing 1 follows the --convert usage example; presumably it
    # selects the f16 output that the loop below looks for (confirm against
    # convert-pth-to-ggml.py).
    python3 ./convert-pth-to-ggml.py "$1/$2" 1
    echo "Quantizing model..."
    # Glob directly instead of parsing `ls` output so paths with spaces stay intact.
    for i in "$1/$2"/ggml-model-f16.bin*; do
        # An unmatched glob is left as the literal pattern; skip it.
        [ -e "$i" ] || continue
        if [ -f "${i/f16/q4_0}" ]; then
            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
        else
            echo "Quantizing: $i into ${i/f16/q4_0}..."
            ./quantize "$i" "${i/f16/q4_0}" 2
        fi
    done
 else
    echo "Unknown command: $arg1"
    echo "Available commands: "
-    echo "  --run (-r)"
+    echo "  --run (-r): Run a model previously converted into ggml"
-    echo "  --convert (-c)"
+    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -t 8 -n 512"
-    echo "  --quantize (-q)"
+    echo "  --convert (-c): Convert a llama model into ggml"
    echo "              ex: \"/models/7B/\" 1"
    echo "  --quantize (-q): Optimize with quantization process ggml"
    echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
    echo "  --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/"
    echo "              ex: \"/models/\" 7B"
    echo "  --all-in-one (-a): Execute --download, --convert & --quantize"
    echo "              ex: \"/models/\" 7B"
 fi
--- a/README.md
+++ b/README.md
@ -32,6 +32,7 @@ Supported platforms:
 - [X] Mac OS
 - [X] Linux
 - [X] Windows (via CMake)
 - [X] Docker
 ---
@ -194,6 +195,37 @@ Finally, copy the `llama` binary and the model files to your device storage. Her
 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
 ### Docker
 #### Prerequisites
 * Docker must be installed and running on your system.
 * Create a folder to store the large model files and intermediate files (the examples below use `/llama/models`).
 #### Images
 We have two Docker images available for this project:
 1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and quantize them to 4 bits.
 2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file.
 #### Usage
 The easiest way to download the models, convert them to ggml and optimize them is with the `--all-in-one` command, which is included in the full Docker image.
 ```bash
 docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
 ```
 Once it completes, you are ready to play!
 ```bash
 docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
 ```
 Or with the light image:
 ```bash
 docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
 ```
 ## Limitations
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@ -16,7 +16,7 @@
 # At the start of the ggml file we write the model parameters
 # and vocabulary.
 #
-
+import os
 import sys
 import json
 import struct
@ -64,6 +64,10 @@ if len(sys.argv) > 2:
        sys.exit(1)
    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
 if os.path.exists(fname_out):
    print(f"Skip conversion, it already exists: {fname_out}")
    sys.exit(0)
 with open(fname_hparams, "r") as f:
    hparams = json.load(f)
--- a/download-pth.py
+++ b/download-pth.py
@ -0,0 +1,66 @@
 import os
 import sys
 from tqdm import tqdm
 import requests
 # Command-line script: download the original LLaMA weights for one model size.
 #   usage: download-pth.py dir-model model-type
 # Per-model files go to <dir-model>/<model-type>/, tokenizer files to <dir-model>/.
 # Files that already exist at their destination are skipped.
 #
 # Seconds to wait for the server to connect or send data before giving up.
 REQUEST_TIMEOUT = 60
 # Stream in 1 MiB chunks; the files are large, so tiny chunks only add overhead.
 CHUNK_SIZE = 1024 * 1024
 def download_file(url, dest_path, desc):
    """Stream *url* into *dest_path* with a tqdm progress bar labelled *desc*.

    The data is first written to ``dest_path + ".part"`` and renamed only once
    the transfer finished, so an interrupted or failed download is never
    mistaken for a complete file by the "already exists" check on the next run.

    Raises requests.RequestException on connection errors or a non-2xx status,
    and OSError if the file cannot be written.
    """
    part_path = dest_path + ".part"
    with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as response:
        # Without this check an HTTP error page would be saved as the model file.
        response.raise_for_status()
        with open(part_path, 'wb') as f:
            with tqdm(unit='B', unit_scale=True, miniters=1, desc=desc) as t:
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    if chunk:
                        f.write(chunk)
                        t.update(len(chunk))
    os.replace(part_path, dest_path)
 if len(sys.argv) < 3:
    print("Usage: download-pth.py dir-model model-type\n")
    print("  model-type: Available models 7B, 13B, 30B or 65B")
    sys.exit(1)
 modelsDir = sys.argv[1]
 model = sys.argv[2]
 # Number of consolidated.NN.pth parts that make up each model size.
 num = {
    "7B": 1,
    "13B": 2,
    "30B": 4,
    "65B": 8,
 }
 if model not in num:
    print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
    sys.exit(1)
 print(f"Downloading model {model}")
 files = ["checklist.chk", "params.json"]
 files.extend(f"consolidated.0{i}.pth" for i in range(num[model]))
 files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
 resolved_path = os.path.abspath(os.path.join(modelsDir, model))
 # Creating <dir-model>/<model-type> also creates <dir-model> for the tokenizer files.
 os.makedirs(resolved_path, exist_ok=True)
 base_url = "https://agi.gpt4.org/llama/LLaMA"
 # (url, destination path, display name) for every file, per-model files first.
 downloads = [
    (f"{base_url}/{model}/{file}", os.path.join(resolved_path, file), file)
    for file in files
 ]
 downloads += [
    (f"{base_url}/{file}", os.path.join(modelsDir, file), file)
    for file in files2
 ]
 for url, dest_path, file in downloads:
    if os.path.exists(dest_path):
        print(f"Skip file download, it already exists: {file}")
        continue
    try:
        download_file(url, dest_path, file)
    except (requests.RequestException, OSError) as err:
        # Exit non-zero so callers such as tools.sh (set -e) stop the pipeline.
        print(f"Error: failed to download {url}: {err}")
        sys.exit(1)