diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile index 77a9ddc14..8cc1480d3 100644 --- a/.devops/full-cuda.Dockerfile +++ b/.devops/full-cuda.Dockerfile @@ -26,8 +26,8 @@ COPY . . # Set nvcc architecture ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} -# Enable cuBLAS -ENV LLAMA_CUBLAS=1 +# Enable CUDA +ENV LLAMA_CUDA=1 RUN make diff --git a/.devops/llama-cpp-cublas.srpm.spec b/.devops/llama-cpp-cuda.srpm.spec similarity index 82% rename from .devops/llama-cpp-cublas.srpm.spec rename to .devops/llama-cpp-cuda.srpm.spec index 6cd6e4c72..66bdc871e 100644 --- a/.devops/llama-cpp-cublas.srpm.spec +++ b/.devops/llama-cpp-cuda.srpm.spec @@ -12,7 +12,7 @@ # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries. # It is up to the user to install the correct vendor-specific support. -Name: llama.cpp-cublas +Name: llama.cpp-cuda Version: %( date "+%%Y%%m%%d" ) Release: 1%{?dist} Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL) @@ -36,12 +36,12 @@ make -j LLAMA_CUDA=1 %install mkdir -p %{buildroot}%{_bindir}/ -cp -p main %{buildroot}%{_bindir}/llamacppcublas -cp -p server %{buildroot}%{_bindir}/llamacppcublasserver -cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple +cp -p main %{buildroot}%{_bindir}/llamacppcuda +cp -p server %{buildroot}%{_bindir}/llamacppcudaserver +cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple mkdir -p %{buildroot}/usr/lib/systemd/system -%{__cat} < %{buildroot}/usr/lib/systemd/system/llamacublas.service +%{__cat} < %{buildroot}/usr/lib/systemd/system/llamacuda.service [Unit] Description=Llama.cpp server, CPU only (no GPU support in this build). 
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target @@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t [Service] Type=simple EnvironmentFile=/etc/sysconfig/llama -ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS +ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS ExecReload=/bin/kill -s HUP $MAINPID Restart=never @@ -67,10 +67,10 @@ rm -rf %{buildroot} rm -rf %{_builddir}/* %files -%{_bindir}/llamacppcublas -%{_bindir}/llamacppcublasserver -%{_bindir}/llamacppcublassimple -/usr/lib/systemd/system/llamacublas.service +%{_bindir}/llamacppcuda +%{_bindir}/llamacppcudaserver +%{_bindir}/llamacppcudasimple +/usr/lib/systemd/system/llamacuda.service %config /etc/sysconfig/llama %pre diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0e7643bba..9329b94ee 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -728,13 +728,13 @@ jobs: path: | llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip - windows-latest-cmake-cublas: + windows-latest-cmake-cuda: runs-on: windows-latest strategy: matrix: cuda: ['12.2.0', '11.7.1'] - build: ['cublas'] + build: ['cuda'] steps: - name: Clone @@ -755,7 +755,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON + cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} - name: Determine tag name @@ -911,7 +911,7 @@ jobs: - macOS-latest-make - macOS-latest-cmake - windows-latest-cmake - - windows-latest-cmake-cublas + - windows-latest-cmake-cuda - macOS-latest-cmake-arm64 - macOS-latest-cmake-x64 diff --git a/Makefile b/Makefile index 35bdb1eb7..1741151eb 100644 --- a/Makefile +++ b/Makefile @@ -390,7 +390,7 @@ ifdef LLAMA_BLIS endif # LLAMA_BLIS ifdef LLAMA_CUBLAS -# $(error LLAMA_CUBLAS is deprecated. 
Use LLAMA_CUDA instead.) +# LLAMA_CUBLAS is deprecated and will be removed in the future LLAMA_CUDA := 1 endif diff --git a/docs/token_generation_performance_tips.md b/docs/token_generation_performance_tips.md index d7e863dff..3c4343147 100644 --- a/docs/token_generation_performance_tips.md +++ b/docs/token_generation_performance_tips.md @@ -1,7 +1,7 @@ # Token generation performance troubleshooting -## Verifying that the model is running on the GPU with cuBLAS -Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: +## Verifying that the model is running on the GPU with CUDA +Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. 
For example: ```shell ./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some " ``` diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md index 4d5fef020..b3b66331f 100644 --- a/examples/llava/MobileVLM-README.md +++ b/examples/llava/MobileVLM-README.md @@ -124,7 +124,7 @@ llama_print_timings: total time = 34570.79 ms ## Orin compile and run ### compile ```sh -make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32 +make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32 ``` ### run on Orin diff --git a/examples/main-cmake-pkg/README.md b/examples/main-cmake-pkg/README.md index 6d665f28f..f599fbaec 100644 --- a/examples/main-cmake-pkg/README.md +++ b/examples/main-cmake-pkg/README.md @@ -8,7 +8,7 @@ Because this example is "outside of the source tree", it is important to first b ### Considerations -When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_. +When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_. ### Build llama.cpp and install to C:\LlamaCPP directory diff --git a/examples/main/README.md b/examples/main/README.md index 6a8d1e1c5..9c83fd3bf 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -316,8 +316,8 @@ These options provide extra functionality and customization when running the LLa - `-h, --help`: Display a help message showing all available options and their default values. 
This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated. - `--verbose-prompt`: Print the prompt before generating text. -- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. -- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. -- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. +- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance. +- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. +- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. 
For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. diff --git a/examples/server/README.md b/examples/server/README.md index 49121a460..aadc73b4b 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -25,9 +25,9 @@ The project is under active development, and we are [looking for feedback and co - `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused). - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096. -- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. -- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. -- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. 
`SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. +- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance. +- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. +- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. - `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`. - `-ub N`, `--ubatch-size N`: physical maximum batch size. Default: `512`. - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended. 
diff --git a/ggml.h b/ggml.h index efb5177d7..c670caa6a 100644 --- a/ggml.h +++ b/ggml.h @@ -2354,7 +2354,7 @@ extern "C" { GGML_API int ggml_cpu_has_fp16_va (void); GGML_API int ggml_cpu_has_wasm_simd (void); GGML_API int ggml_cpu_has_blas (void); - GGML_API int ggml_cpu_has_cuda (void); + GGML_API int ggml_cpu_has_cuda (void); GGML_API int ggml_cpu_has_clblast (void); GGML_API int ggml_cpu_has_vulkan (void); GGML_API int ggml_cpu_has_kompute (void); diff --git a/scripts/LlamaConfig.cmake.in b/scripts/LlamaConfig.cmake.in index 6a6d8e39e..f842c7137 100644 --- a/scripts/LlamaConfig.cmake.in +++ b/scripts/LlamaConfig.cmake.in @@ -3,7 +3,7 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@) set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@) set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) set(LLAMA_BLAS @LLAMA_BLAS@) -set(LLAMA_CUBLAS @LLAMA_CUBLAS@) +set(LLAMA_CUDA @LLAMA_CUDA@) set(LLAMA_METAL @LLAMA_METAL@) set(LLAMA_MPI @LLAMA_MPI@) set(LLAMA_CLBLAST @LLAMA_CLBLAST@) @@ -27,7 +27,7 @@ if (LLAMA_BLAS) find_package(BLAS REQUIRED) endif() -if (LLAMA_CUBLAS) +if (LLAMA_CUDA) find_package(CUDAToolkit REQUIRED) endif() diff --git a/scripts/compare-commits.sh b/scripts/compare-commits.sh index 331c4b9ce..d1272506c 100755 --- a/scripts/compare-commits.sh +++ b/scripts/compare-commits.sh @@ -23,7 +23,7 @@ fi make_opts="" if [[ "$backend" == "cuda" ]]; then - make_opts="LLAMA_CUBLAS=1" + make_opts="LLAMA_CUDA=1" fi git checkout $1 diff --git a/scripts/pod-llama.sh b/scripts/pod-llama.sh index 6cf1ab4f3..2058ceabf 100644 --- a/scripts/pod-llama.sh +++ b/scripts/pod-llama.sh @@ -42,7 +42,7 @@ git clone https://github.com/ggerganov/llama.cpp cd llama.cpp -LLAMA_CUBLAS=1 make -j +LLAMA_CUDA=1 make -j ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3 ./models/tinyllama-1b ln -sfn /workspace/CodeLlama-7b-hf ./models/codellama-7b @@ -60,7 +60,7 @@ cd /workspace/llama.cpp mkdir build-cublas cd build-cublas -cmake -DLLAMA_CUBLAS=1 ../ +cmake -DLLAMA_CUDA=1 ../ make -j if [ "$1" -eq "0" ]; then @@ -186,17 
+186,17 @@ if [ "$1" -eq "1" ]; then # batched cd /workspace/llama.cpp - LLAMA_CUBLAS=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999 + LLAMA_CUDA=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999 # batched-bench cd /workspace/llama.cpp - LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32 + LLAMA_CUDA=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32 # parallel cd /workspace/llama.cpp - LLAMA_CUBLAS=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb + LLAMA_CUDA=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb fi @@ -204,10 +204,10 @@ fi #if [ "$1" -eq "7" ]; then # cd /workspace/llama.cpp # -# LLAMA_CUBLAS=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0 +# LLAMA_CUDA=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0 #fi # more benches -#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 -#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 +#LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 
4096 1 99 1 512,3200 128,128,800 1 +#LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 diff --git a/scripts/server-llm.sh b/scripts/server-llm.sh index 30bbac321..eb6ce458e 100644 --- a/scripts/server-llm.sh +++ b/scripts/server-llm.sh @@ -380,7 +380,7 @@ fi if [[ "$backend" == "cuda" ]]; then printf "[+] Building with CUDA backend\n" - LLAMA_CUBLAS=1 make -j server $log + LLAMA_CUDA=1 make -j server $log elif [[ "$backend" == "cpu" ]]; then printf "[+] Building with CPU backend\n" make -j server $log