update more files

2024-03-25 17:31:05 +01:00 · 2024-03-25 17:31:05 +01:00 · fe080353ff
commit fe080353ff
parent 4a60e88065
14 changed files with 40 additions and 40 deletions
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@ -26,8 +26,8 @@ COPY . .

 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1

 RUN make

--- a/.devops/llama-cpp-cublas.srpm.spec
+++ b/.devops/llama-cpp-cublas.srpm.spec
@ -12,7 +12,7 @@
 # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
 #    It is up to the user to install the correct vendor-specific support.

-Name:           llama.cpp-cublas
+Name:           llama.cpp-cuda
 Version:        %( date "+%%Y%%m%%d" )
 Release:        1%{?dist}
 Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
@ -36,12 +36,12 @@ make -j LLAMA_CUDA=1

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamacppcublas
-cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
-cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
+cp -p main %{buildroot}%{_bindir}/llamacppcuda
+cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
+cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple

 mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamacublas.service
+%{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamacuda.service
 [Unit]
 Description=Llama.cpp server, CPU only (no GPU support in this build).
 After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
+ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@ -67,10 +67,10 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llamacppcublas
-%{_bindir}/llamacppcublasserver
-%{_bindir}/llamacppcublassimple
-/usr/lib/systemd/system/llamacublas.service
+%{_bindir}/llamacppcuda
+%{_bindir}/llamacppcudaserver
+%{_bindir}/llamacppcudasimple
+/usr/lib/systemd/system/llamacuda.service
 %config /etc/sysconfig/llama

 %pre
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -728,13 +728,13 @@ jobs:
          path: |
            llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip

-  windows-latest-cmake-cublas:
+  windows-latest-cmake-cuda:
    runs-on: windows-latest

    strategy:
      matrix:
        cuda: ['12.2.0', '11.7.1']
-        build: ['cublas']
+        build: ['cuda']

    steps:
      - name: Clone
@ -755,7 +755,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
+          cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON
          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

      - name: Determine tag name
@ -911,7 +911,7 @@ jobs:
      - macOS-latest-make
      - macOS-latest-cmake
      - windows-latest-cmake
-      - windows-latest-cmake-cublas
+      - windows-latest-cmake-cuda
      - macOS-latest-cmake-arm64
      - macOS-latest-cmake-x64

--- a/2
+++ b/2
@ -390,7 +390,7 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS

 ifdef LLAMA_CUBLAS
-#	$(error LLAMA_CUBLAS is deprecated. Use LLAMA_CUDA instead.)
+# LLAMA_CUBLAS is deprecated and will be removed in the future
 	LLAMA_CUDA := 1
 endif

--- a/docs/token_generation_performance_tips.md
+++ b/docs/token_generation_performance_tips.md
@ -1,7 +1,7 @@
 # Token generation performance troubleshooting

-## Verifying that the model is running on the GPU with cuBLAS
-Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
+## Verifying that the model is running on the GPU with CUDA
+Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
 ```shell
 ./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
 ```
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@ -124,7 +124,7 @@ llama_print_timings:       total time =   34570.79 ms
 ## Orin compile and run
 ### compile
 ```sh
-make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
+make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
 ```

 ### run on Orin
--- a/examples/main-cmake-pkg/README.md
+++ b/examples/main-cmake-pkg/README.md
@ -8,7 +8,7 @@ Because this example is "outside of the source tree", it is important to first b

 ### Considerations

-When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
+When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.

 ### Build llama.cpp and install to C:\LlamaCPP directory

--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -316,8 +316,8 @@ These options provide extra functionality and customization when running the LLa

 -   `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
 -   `--verbose-prompt`: Print the prompt before generating text.
-   `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
-   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
-   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+-   `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+-   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
+-   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
 -   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 -   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -25,9 +25,9 @@ The project is under active development, and we are [looking for feedback and co
 - `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused).
 - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
 - `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`.
 - `-ub N`, `--ubatch-size N`: physical maximum batch size. Default: `512`.
 - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
--- a/scripts/LlamaConfig.cmake.in
+++ b/scripts/LlamaConfig.cmake.in
@ -3,7 +3,7 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
 set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
 set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
 set(LLAMA_BLAS @LLAMA_BLAS@)
-set(LLAMA_CUBLAS @LLAMA_CUBLAS@)
+set(LLAMA_CUDA @LLAMA_CUDA@)
 set(LLAMA_METAL @LLAMA_METAL@)
 set(LLAMA_MPI @LLAMA_MPI@)
 set(LLAMA_CLBLAST @LLAMA_CLBLAST@)
@ -27,7 +27,7 @@ if (LLAMA_BLAS)
    find_package(BLAS REQUIRED)
 endif()

-if (LLAMA_CUBLAS)
+if (LLAMA_CUDA)
    find_package(CUDAToolkit REQUIRED)
 endif()

--- a/scripts/compare-commits.sh
+++ b/scripts/compare-commits.sh
@ -23,7 +23,7 @@ fi
 make_opts=""

 if [[ "$backend" == "cuda" ]]; then
-    make_opts="LLAMA_CUBLAS=1"
+    make_opts="LLAMA_CUDA=1"
 fi

 git checkout $1
--- a/scripts/pod-llama.sh
+++ b/scripts/pod-llama.sh
@ -42,7 +42,7 @@ git clone https://github.com/ggerganov/llama.cpp

 cd llama.cpp

-LLAMA_CUBLAS=1 make -j
+LLAMA_CUDA=1 make -j

 ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3  ./models/tinyllama-1b
 ln -sfn /workspace/CodeLlama-7b-hf           ./models/codellama-7b
@ -60,7 +60,7 @@ cd /workspace/llama.cpp
 mkdir build-cublas
 cd build-cublas

-cmake -DLLAMA_CUBLAS=1 ../
+cmake -DLLAMA_CUDA=1 ../
 make -j

 if [ "$1" -eq "0" ]; then
@ -186,17 +186,17 @@ if [ "$1" -eq "1" ]; then
    # batched
    cd /workspace/llama.cpp

-    LLAMA_CUBLAS=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999
+    LLAMA_CUDA=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999

    # batched-bench
    cd /workspace/llama.cpp

-    LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32
+    LLAMA_CUDA=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32

    # parallel
    cd /workspace/llama.cpp

-    LLAMA_CUBLAS=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb
+    LLAMA_CUDA=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb

 fi

@ -204,10 +204,10 @@ fi
 #if [ "$1" -eq "7" ]; then
 #    cd /workspace/llama.cpp
 #
-#    LLAMA_CUBLAS=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
+#    LLAMA_CUDA=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
 #fi

 # more benches
-#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf  4096 1 99 1 512,3200 128,128,800 1
-#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
+#LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf  4096 1 99 1 512,3200 128,128,800 1
+#LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1

--- a/scripts/server-llm.sh
+++ b/scripts/server-llm.sh
@ -380,7 +380,7 @@ fi

 if [[ "$backend" == "cuda" ]]; then
    printf "[+] Building with CUDA backend\n"
-    LLAMA_CUBLAS=1 make -j server $log
+    LLAMA_CUDA=1 make -j server $log
 elif [[ "$backend" == "cpu" ]]; then
    printf "[+] Building with CPU backend\n"
    make -j server $log