Fix CI, scripts, readme files

2024-05-21 19:06:24 +02:00 · 2024-05-21 19:06:24 +02:00 · 85e4e2b777
commit 85e4e2b777
parent dc24e7ef67
9 changed files with 30 additions and 127 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1305,7 +1305,7 @@ set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}
 install(TARGETS llama LIBRARY PUBLIC_HEADER)

 install(
-    FILES convert.py
+    FILES convert-hf-to-gguf.py
    PERMISSIONS
        OWNER_READ
        OWNER_WRITE
--- a/README.md
+++ b/README.md
@ -690,7 +690,8 @@ Building the program with BLAS support may lead to some performance improvements

 To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.

-Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
+Note: `convert.py` has been moved to `examples/convert-no-torch.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
+It does not support LLaMA 3; you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.

 ```bash
 # obtain the official LLaMA model weights and place them in ./models
@ -707,10 +708,10 @@ ls ./models
 python3 -m pip install -r requirements.txt

 # convert the model to ggml FP16 format
-python3 convert.py models/mymodel/
+python3 convert-hf-to-gguf.py models/mymodel/

 # [Optional] for models using BPE tokenizers
-python convert.py models/mymodel/ --vocab-type bpe
+python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe

 # quantize the model to 4-bits (using Q4_K_M method)
 ./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
--- a/ci/run.sh
+++ b/ci/run.sh
@ -282,7 +282,7 @@ function gg_run_open_llama_3b_v2 {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                                             ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert.py ${path_models}
+    python3 ../examples/convert-no-torch.py ${path_models}

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@ -417,7 +417,7 @@ function gg_run_open_llama_7b_v2 {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert.py ${path_models}
+    python3 ../examples/convert-no-torch.py ${path_models}

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
--- a/docs/HOWTO-add-model.md
+++ b/docs/HOWTO-add-model.md
@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
 ### 1. Convert the model to GGUF

 This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
-Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).
+Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-no-torch.py](../examples/convert-no-torch.py) (for `llama/llama2` models in `.pth` format).

 The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.

--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@ -54,10 +54,10 @@ python ./examples/llava/convert-image-encoder-to-gguf \
    --projector-type ldpv2
 ```

-4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+4. Use `examples/convert-no-torch.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
-python ./convert.py path/to/MobileVLM-1.7B
+python ./examples/convert-no-torch.py path/to/MobileVLM-1.7B
 ```

 5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@ -50,10 +50,10 @@ python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
 python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
 ```

-5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+5. Use `examples/convert-no-torch.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
-python ./convert.py ../llava-v1.5-7b --skip-unknown
+python ./examples/convert-no-torch.py ../llava-v1.5-7b --skip-unknown
 ```

 Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
@ -92,7 +92,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projecto

 6) Then convert the model to gguf format:
 ```console
-python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
+python ./examples/convert-no-torch.py ../llava-v1.6-vicuna-7b/ --skip-unknown
 ```

 7) And finally we can run the llava-cli using the 1.6 model version:
--- a/examples/make-ggml.py
+++ b/examples/make-ggml.py
@ -1,98 +0,0 @@
-#!/usr/bin/env python3
-"""
-This script converts Hugging Face Llama, StarCoder, Falcon, Baichuan, and GPT-NeoX models to GGUF and quantizes them.
-
-Usage:
-python make-ggml.py {model_dir_or_hf_repo_name} --model_type {model_type} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
-
-Arguments:
- model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
- --model_type: (Required) The type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.
- --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
- --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
- --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
- --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.
-
-Old quant types (some base model types require these):
- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
-
-New quant types (recommended):
- Q2_K: smallest, extreme quality loss - not recommended
- Q3_K: alias for Q3_K_M
- Q3_K_S: very small, very high quality loss
- Q3_K_M: very small, very high quality loss
- Q3_K_L: small, substantial quality loss
- Q4_K: alias for Q4_K_M
- Q4_K_S: small, significant quality loss
- Q4_K_M: medium, balanced quality - recommended
- Q5_K: alias for Q5_K_M
- Q5_K_S: large, low quality loss - recommended
- Q5_K_M: large, very low quality loss - recommended
- Q6_K: very large, extremely low quality loss
- Q8_0: very large, extremely low quality loss - not recommended
- F16: extremely large, virtually no quality loss - not recommended
- F32: absolutely huge, lossless - not recommended
-"""
-import subprocess
-subprocess.run(f"pip install huggingface-hub==0.16.4", shell=True, check=True)
-
-import argparse
-import os
-from huggingface_hub import snapshot_download
-
-def main(model, model_type, outname, outdir, quants, keep_fp16):
-    if not os.path.isdir(model):
-        print(f"Model not found at {model}. Downloading...")
-        try:
-            if outname is None:
-                outname = model.split('/')[-1]
-            model = snapshot_download(repo_id=model, cache_dir='../models/hf_cache')
-        except Exception as e:
-            raise Exception(f"Could not download the model: {e}")
-
-    if outdir is None:
-        outdir = f'../models/{outname}'
-
-    if not os.path.isfile(f"{model}/config.json"):
-        raise Exception(f"Could not find config.json in {model}")
-
-    os.makedirs(outdir, exist_ok=True)
-
-    print("Building llama.cpp")
-    subprocess.run(f"cd .. && make quantize", shell=True, check=True)
-
-    fp16 = f"{outdir}/{outname}.gguf.fp16.bin"
-
-    print(f"Making unquantised GGUF at {fp16}")
-    if not os.path.isfile(fp16):
-        if model_type != "llama":
-            subprocess.run(f"python3 ../convert-{model_type}-hf-to-gguf.py {model} 1 --outfile {fp16}", shell=True, check=True)
-        else:
-            subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
-    else:
-        print(f"Unquantised GGML already exists at: {fp16}")
-
-    print("Making quants")
-    for type in quants:
-        outfile = f"{outdir}/{outname}.gguf.{type}.bin"
-        print(f"Making {type} : {outfile}")
-        subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)
-
-    if not keep_fp16:
-        os.remove(fp16)
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Convert/Quantize HF models to GGUF. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
-    parser.add_argument('model', help='Downloaded model dir or Hugging Face model repo name')
-    parser.add_argument('--model_type', required=True, choices=['llama', 'starcoder', 'falcon', 'baichuan', 'gptneox'], help='Type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.')
-    parser.add_argument('--outname', default=None, help='Output model(s) name')
-    parser.add_argument('--outdir', default=None, help='Output directory')
-    parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
-    parser.add_argument('--keep_fp16', action='store_true', help='Keep fp16 model', default=False)
-
-    args = parser.parse_args()
-
-    main(args.model, args.model_type, args.outname, args.outdir, args.quants, args.keep_fp16)
--- a/scripts/convert-gg.sh
+++ b/scripts/convert-gg.sh
@ -3,20 +3,20 @@
 set -e

 # LLaMA v1
-python3 convert.py ../llama1/7B  --outfile models/llama-7b/ggml-model-f16.gguf  --outtype f16
-python3 convert.py ../llama1/13B --outfile models/llama-13b/ggml-model-f16.gguf --outtype f16
-python3 convert.py ../llama1/30B --outfile models/llama-30b/ggml-model-f16.gguf --outtype f16
-python3 convert.py ../llama1/65B --outfile models/llama-65b/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-no-torch.py ../llama1/7B  --outfile models/llama-7b/ggml-model-f16.gguf  --outtype f16
+python3 examples/convert-no-torch.py ../llama1/13B --outfile models/llama-13b/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-no-torch.py ../llama1/30B --outfile models/llama-30b/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-no-torch.py ../llama1/65B --outfile models/llama-65b/ggml-model-f16.gguf --outtype f16

 # LLaMA v2
-python3 convert.py ../llama2/llama-2-7b  --outfile models/llama-7b-v2/ggml-model-f16.gguf  --outtype f16
-python3 convert.py ../llama2/llama-2-13b --outfile models/llama-13b-v2/ggml-model-f16.gguf --outtype f16
-python3 convert.py ../llama2/llama-2-70b --outfile models/llama-70b-v2/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-no-torch.py ../llama2/llama-2-7b  --outfile models/llama-7b-v2/ggml-model-f16.gguf  --outtype f16
+python3 examples/convert-no-torch.py ../llama2/llama-2-13b --outfile models/llama-13b-v2/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-no-torch.py ../llama2/llama-2-70b --outfile models/llama-70b-v2/ggml-model-f16.gguf --outtype f16

 # Code Llama
-python3 convert.py ../codellama/CodeLlama-7b/  --outfile models/codellama-7b/ggml-model-f16.gguf  --outtype f16
-python3 convert.py ../codellama/CodeLlama-13b/ --outfile models/codellama-13b/ggml-model-f16.gguf --outtype f16
-python3 convert.py ../codellama/CodeLlama-34b/ --outfile models/codellama-34b/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-no-torch.py ../codellama/CodeLlama-7b/  --outfile models/codellama-7b/ggml-model-f16.gguf  --outtype f16
+python3 examples/convert-no-torch.py ../codellama/CodeLlama-13b/ --outfile models/codellama-13b/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-no-torch.py ../codellama/CodeLlama-34b/ --outfile models/codellama-34b/ggml-model-f16.gguf --outtype f16

 # Falcon
 python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-7b  1
--- a/scripts/pod-llama.sh
+++ b/scripts/pod-llama.sh
@ -75,7 +75,7 @@ if [ "$1" -eq "1" ]; then

    cd /workspace/llama.cpp

-    python3 convert.py ./models/tinyllama-1b  --outfile ./models/tinyllama-1b/ggml-model-f16.gguf  --outtype f16
+    python3 examples/convert-no-torch.py ./models/tinyllama-1b  --outfile ./models/tinyllama-1b/ggml-model-f16.gguf  --outtype f16

    ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0
    ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k
@ -90,7 +90,7 @@ if [ "$1" -eq "2" ]; then

    cd /workspace/llama.cpp

-    python3 convert.py ./models/codellama-7b  --outfile ./models/codellama-7b/ggml-model-f16.gguf  --outtype f16
+    python3 examples/convert-no-torch.py ./models/codellama-7b  --outfile ./models/codellama-7b/ggml-model-f16.gguf  --outtype f16

    ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0
    ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k
@ -105,7 +105,7 @@ if [ "$1" -eq "3" ]; then

    cd /workspace/llama.cpp

-    python3 convert.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16
+    python3 examples/convert-no-torch.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16

    ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0
    ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k
@ -120,7 +120,7 @@ if [ "$1" -eq "4" ]; then

    cd /workspace/llama.cpp

-    python3 convert.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16
+    python3 examples/convert-no-torch.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16

    ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0
    ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k
@ -135,7 +135,7 @@ if [ "$1" -eq "5" ]; then

    cd /workspace/llama.cpp

-    python3 convert.py ./models/codellama-7b-instruct  --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf  --outtype f16
+    python3 examples/convert-no-torch.py ./models/codellama-7b-instruct  --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf  --outtype f16

    ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0
    ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k
@ -150,7 +150,7 @@ if [ "$1" -eq "6" ]; then

    cd /workspace/llama.cpp

-    python3 convert.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16
+    python3 examples/convert-no-torch.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16

    ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0
    ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k
@ -165,7 +165,7 @@ if [ "$1" -eq "7" ]; then

    cd /workspace/llama.cpp

-    python3 convert.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16
+    python3 examples/convert-no-torch.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16

    ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0
    ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k