update more files

This commit is contained in:
slaren 2024-03-25 17:31:05 +01:00
parent 4a60e88065
commit fe080353ff
14 changed files with 40 additions and 40 deletions

View file

@ -26,8 +26,8 @@ COPY . .
# Set nvcc architecture # Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS # Enable CUDA
ENV LLAMA_CUBLAS=1 ENV LLAMA_CUDA=1
RUN make RUN make

View file

@ -12,7 +12,7 @@
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries. # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
# It is up to the user to install the correct vendor-specific support. # It is up to the user to install the correct vendor-specific support.
Name: llama.cpp-cublas Name: llama.cpp-cuda
Version: %( date "+%%Y%%m%%d" ) Version: %( date "+%%Y%%m%%d" )
Release: 1%{?dist} Release: 1%{?dist}
Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL) Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
@ -36,12 +36,12 @@ make -j LLAMA_CUDA=1
%install %install
mkdir -p %{buildroot}%{_bindir}/ mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamacppcublas cp -p main %{buildroot}%{_bindir}/llamacppcuda
cp -p server %{buildroot}%{_bindir}/llamacppcublasserver cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple
mkdir -p %{buildroot}/usr/lib/systemd/system mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacublas.service %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
[Unit] [Unit]
Description=Llama.cpp server, CPU only (no GPU support in this build). Description=Llama.cpp server, CPU only (no GPU support in this build).
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
[Service] [Service]
Type=simple Type=simple
EnvironmentFile=/etc/sysconfig/llama EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID ExecReload=/bin/kill -s HUP $MAINPID
Restart=never Restart=never
@ -67,10 +67,10 @@ rm -rf %{buildroot}
rm -rf %{_builddir}/* rm -rf %{_builddir}/*
%files %files
%{_bindir}/llamacppcublas %{_bindir}/llamacppcuda
%{_bindir}/llamacppcublasserver %{_bindir}/llamacppcudaserver
%{_bindir}/llamacppcublassimple %{_bindir}/llamacppcudasimple
/usr/lib/systemd/system/llamacublas.service /usr/lib/systemd/system/llamacuda.service
%config /etc/sysconfig/llama %config /etc/sysconfig/llama
%pre %pre

View file

@ -728,13 +728,13 @@ jobs:
path: | path: |
llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
windows-latest-cmake-cublas: windows-latest-cmake-cuda:
runs-on: windows-latest runs-on: windows-latest
strategy: strategy:
matrix: matrix:
cuda: ['12.2.0', '11.7.1'] cuda: ['12.2.0', '11.7.1']
build: ['cublas'] build: ['cuda']
steps: steps:
- name: Clone - name: Clone
@ -755,7 +755,7 @@ jobs:
run: | run: |
mkdir build mkdir build
cd build cd build
cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Determine tag name - name: Determine tag name
@ -911,7 +911,7 @@ jobs:
- macOS-latest-make - macOS-latest-make
- macOS-latest-cmake - macOS-latest-cmake
- windows-latest-cmake - windows-latest-cmake
- windows-latest-cmake-cublas - windows-latest-cmake-cuda
- macOS-latest-cmake-arm64 - macOS-latest-cmake-arm64
- macOS-latest-cmake-x64 - macOS-latest-cmake-x64

View file

@ -390,7 +390,7 @@ ifdef LLAMA_BLIS
endif # LLAMA_BLIS endif # LLAMA_BLIS
ifdef LLAMA_CUBLAS ifdef LLAMA_CUBLAS
# $(error LLAMA_CUBLAS is deprecated. Use LLAMA_CUDA instead.) # LLAMA_CUBLAS is deprecated and will be removed in the future
LLAMA_CUDA := 1 LLAMA_CUDA := 1
endif endif

View file

@ -1,7 +1,7 @@
# Token generation performance troubleshooting # Token generation performance troubleshooting
## Verifying that the model is running on the GPU with cuBLAS ## Verifying that the model is running on the GPU with CUDA
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
```shell ```shell
./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some " ./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
``` ```

View file

@ -124,7 +124,7 @@ llama_print_timings: total time = 34570.79 ms
## Orin compile and run ## Orin compile and run
### compile ### compile
```sh ```sh
make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32 make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
``` ```
### run on Orin ### run on Orin

View file

@ -8,7 +8,7 @@ Because this example is "outside of the source tree", it is important to first b
### Considerations ### Considerations
When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_. When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
### Build llama.cpp and install to C:\LlamaCPP directory ### Build llama.cpp and install to C:\LlamaCPP directory

View file

@ -316,8 +316,8 @@ These options provide extra functionality and customization when running the LLa
- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated. - `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
- `--verbose-prompt`: Print the prompt before generating text. - `--verbose-prompt`: Print the prompt before generating text.
- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. - `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.

View file

@ -25,9 +25,9 @@ The project is under active development, and we are [looking for feedback and co
- `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused). - `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused).
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096. - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. - `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`. - `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`.
- `-ub N`, `--ubatch-size N`: physical maximum batch size. Default: `512`. - `-ub N`, `--ubatch-size N`: physical maximum batch size. Default: `512`.
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended. - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.

2
ggml.h
View file

@ -2354,7 +2354,7 @@ extern "C" {
GGML_API int ggml_cpu_has_fp16_va (void); GGML_API int ggml_cpu_has_fp16_va (void);
GGML_API int ggml_cpu_has_wasm_simd (void); GGML_API int ggml_cpu_has_wasm_simd (void);
GGML_API int ggml_cpu_has_blas (void); GGML_API int ggml_cpu_has_blas (void);
GGML_API int ggml_cpu_has_cuda (void); GGML_API int ggml_cpu_has_cuda (void);
GGML_API int ggml_cpu_has_clblast (void); GGML_API int ggml_cpu_has_clblast (void);
GGML_API int ggml_cpu_has_vulkan (void); GGML_API int ggml_cpu_has_vulkan (void);
GGML_API int ggml_cpu_has_kompute (void); GGML_API int ggml_cpu_has_kompute (void);

View file

@ -3,7 +3,7 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@) set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
set(LLAMA_BLAS @LLAMA_BLAS@) set(LLAMA_BLAS @LLAMA_BLAS@)
set(LLAMA_CUBLAS @LLAMA_CUBLAS@) set(LLAMA_CUDA @LLAMA_CUDA@)
set(LLAMA_METAL @LLAMA_METAL@) set(LLAMA_METAL @LLAMA_METAL@)
set(LLAMA_MPI @LLAMA_MPI@) set(LLAMA_MPI @LLAMA_MPI@)
set(LLAMA_CLBLAST @LLAMA_CLBLAST@) set(LLAMA_CLBLAST @LLAMA_CLBLAST@)
@ -27,7 +27,7 @@ if (LLAMA_BLAS)
find_package(BLAS REQUIRED) find_package(BLAS REQUIRED)
endif() endif()
if (LLAMA_CUBLAS) if (LLAMA_CUDA)
find_package(CUDAToolkit REQUIRED) find_package(CUDAToolkit REQUIRED)
endif() endif()

View file

@ -23,7 +23,7 @@ fi
make_opts="" make_opts=""
if [[ "$backend" == "cuda" ]]; then if [[ "$backend" == "cuda" ]]; then
make_opts="LLAMA_CUBLAS=1" make_opts="LLAMA_CUDA=1"
fi fi
git checkout $1 git checkout $1

View file

@ -42,7 +42,7 @@ git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp cd llama.cpp
LLAMA_CUBLAS=1 make -j LLAMA_CUDA=1 make -j
ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3 ./models/tinyllama-1b ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3 ./models/tinyllama-1b
ln -sfn /workspace/CodeLlama-7b-hf ./models/codellama-7b ln -sfn /workspace/CodeLlama-7b-hf ./models/codellama-7b
@ -60,7 +60,7 @@ cd /workspace/llama.cpp
mkdir build-cublas mkdir build-cublas
cd build-cublas cd build-cublas
cmake -DLLAMA_CUBLAS=1 ../ cmake -DLLAMA_CUDA=1 ../
make -j make -j
if [ "$1" -eq "0" ]; then if [ "$1" -eq "0" ]; then
@ -186,17 +186,17 @@ if [ "$1" -eq "1" ]; then
# batched # batched
cd /workspace/llama.cpp cd /workspace/llama.cpp
LLAMA_CUBLAS=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999 LLAMA_CUDA=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999
# batched-bench # batched-bench
cd /workspace/llama.cpp cd /workspace/llama.cpp
LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32 LLAMA_CUDA=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32
# parallel # parallel
cd /workspace/llama.cpp cd /workspace/llama.cpp
LLAMA_CUBLAS=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb LLAMA_CUDA=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb
fi fi
@ -204,10 +204,10 @@ fi
#if [ "$1" -eq "7" ]; then #if [ "$1" -eq "7" ]; then
# cd /workspace/llama.cpp # cd /workspace/llama.cpp
# #
# LLAMA_CUBLAS=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0 # LLAMA_CUDA=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
#fi #fi
# more benches # more benches
#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 #LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 #LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1

View file

@ -380,7 +380,7 @@ fi
if [[ "$backend" == "cuda" ]]; then if [[ "$backend" == "cuda" ]]; then
printf "[+] Building with CUDA backend\n" printf "[+] Building with CUDA backend\n"
LLAMA_CUBLAS=1 make -j server $log LLAMA_CUDA=1 make -j server $log
elif [[ "$backend" == "cpu" ]]; then elif [[ "$backend" == "cpu" ]]; then
printf "[+] Building with CPU backend\n" printf "[+] Building with CPU backend\n"
make -j server $log make -j server $log