update more files
This commit is contained in:
parent 4a60e88065
commit fe080353ff
14 changed files with 40 additions and 40 deletions
@@ -26,8 +26,8 @@ COPY . .
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1
 
 RUN make
 
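For context, a minimal sketch of building an image from a CUDA Dockerfile like the one above; the Dockerfile path is not shown in this view, and it is an assumption that `CUDA_DOCKER_ARCH` is declared as a build argument:

```sh
# Sketch only: the Dockerfile path is a placeholder; sm_87 is the Orin
# architecture used later in this commit's docs.
docker build \
  --build-arg CUDA_DOCKER_ARCH=sm_87 \
  -f path/to/cuda.Dockerfile \
  -t llama-cpp-cuda .
```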
@@ -12,7 +12,7 @@
 # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
 # It is up to the user to install the correct vendor-specific support.
 
-Name: llama.cpp-cublas
+Name: llama.cpp-cuda
 Version: %( date "+%%Y%%m%%d" )
 Release: 1%{?dist}
 Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
@@ -36,12 +36,12 @@ make -j LLAMA_CUDA=1
 
 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamacppcublas
-cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
-cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
+cp -p main %{buildroot}%{_bindir}/llamacppcuda
+cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
+cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple
 
 mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacublas.service
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
 [Unit]
 Description=Llama.cpp server, CPU only (no GPU support in this build).
 After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
@@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
+ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never
 
@@ -67,10 +67,10 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*
 
 %files
-%{_bindir}/llamacppcublas
-%{_bindir}/llamacppcublasserver
-%{_bindir}/llamacppcublassimple
-/usr/lib/systemd/system/llamacublas.service
+%{_bindir}/llamacppcuda
+%{_bindir}/llamacppcudaserver
+%{_bindir}/llamacppcudasimple
+/usr/lib/systemd/system/llamacuda.service
 %config /etc/sysconfig/llama
 
 %pre
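For reference, a minimal sketch of using the renamed service after installing this package; the contents of `LLAMA_ARGS` are an assumption, since the spec only wires the variable into `ExecStart`:

```sh
# Sketch only: model path and arguments are placeholders.
echo 'LLAMA_ARGS="-m /path/to/model.gguf -c 2048"' | sudo tee /etc/sysconfig/llama
sudo systemctl daemon-reload
sudo systemctl enable --now llamacuda.service
```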
.github/workflows/build.yml (vendored): 8 changes
@@ -728,13 +728,13 @@ jobs:
 path: |
 llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
 
-windows-latest-cmake-cublas:
+windows-latest-cmake-cuda:
 runs-on: windows-latest
 
 strategy:
 matrix:
 cuda: ['12.2.0', '11.7.1']
-build: ['cublas']
+build: ['cuda']
 
 steps:
 - name: Clone
@@ -755,7 +755,7 @@ jobs:
 run: |
 mkdir build
 cd build
-cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
+cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON
 cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
 
 - name: Determine tag name
@@ -911,7 +911,7 @@ jobs:
 - macOS-latest-make
 - macOS-latest-cmake
 - windows-latest-cmake
-- windows-latest-cmake-cublas
+- windows-latest-cmake-cuda
 - macOS-latest-cmake-arm64
 - macOS-latest-cmake-x64
 
Makefile: 2 changes
@@ -390,7 +390,7 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS
 
 ifdef LLAMA_CUBLAS
-# $(error LLAMA_CUBLAS is deprecated. Use LLAMA_CUDA instead.)
+# LLAMA_CUBLAS is deprecated and will be removed in the future
 LLAMA_CUDA := 1
 endif
 
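With the compatibility shim above, either spelling of the flag enables the CUDA backend:

```sh
# Both commands build the CUDA backend; the old flag is only an alias
# (mapped to LLAMA_CUDA := 1 above) and may be removed in the future.
make LLAMA_CUDA=1 -j
make LLAMA_CUBLAS=1 -j   # deprecated spelling, still accepted for now
```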
@@ -1,7 +1,7 @@
 # Token generation performance troubleshooting
 
-## Verifying that the model is running on the GPU with cuBLAS
-Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
+## Verifying that the model is running on the GPU with CUDA
+Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
 ```shell
 ./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
 ```
@@ -124,7 +124,7 @@ llama_print_timings: total time = 34570.79 ms
 ## Orin compile and run
 ### compile
 ```sh
-make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
+make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
 ```
 
 ### run on Orin
@@ -8,7 +8,7 @@ Because this example is "outside of the source tree", it is important to first b
 
 ### Considerations
 
-When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
+When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
 
 ### Build llama.cpp and install to C:\LlamaCPP directory
 
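A minimal sketch of the configure step that paragraph describes, assuming the `C:\LlamaCPP` install prefix from the heading above and a CLBlast install at `C:/CLBlast` (the latter path is an assumption):

```sh
# Sketch only: find_package() searches each prefix on CMAKE_PREFIX_PATH
# for the Llama and CLBlast package configuration files.
cmake .. -DCMAKE_PREFIX_PATH="C:/LlamaCPP;C:/CLBlast"
cmake --build . --config Release
```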
@@ -316,8 +316,8 @@ These options provide extra functionality and customization when running the LLa
 
 - `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
 - `--verbose-prompt`: Print the prompt before generating text.
-- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
-- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
-- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
 - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
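A hedged usage sketch combining the GPU options documented above (the model path is a placeholder):

```sh
# Offload up to 99 layers, keep small tensors on GPU 0, and split large
# tensors 60/40 across two GPUs, per -ngl, -mg and -ts above.
./main -m path/to/model.gguf -ngl 99 -mg 0 -ts 3,2 -p "Hello"
```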
@@ -25,9 +25,9 @@ The project is under active development, and we are [looking for feedback and co
 - `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused).
 - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
-- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
-- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
-- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
 - `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`.
 - `-ub N`, `--ubatch-size N`: physical maximum batch size. Default: `512`.
 - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
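A hedged usage sketch for the server options above (the model path is a placeholder):

```sh
# Start the server with a 4096-token context and offload up to 99 layers
# to the GPU, per the -c and -ngl options documented above.
./server -m path/to/model.gguf -c 4096 -ngl 99
```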
ggml.h: 2 changes
@@ -2354,7 +2354,7 @@ extern "C" {
 GGML_API int ggml_cpu_has_fp16_va (void);
 GGML_API int ggml_cpu_has_wasm_simd (void);
 GGML_API int ggml_cpu_has_blas (void);
-GGML_API int ggml_cpu_has_cublas (void);
+GGML_API int ggml_cpu_has_cuda (void);
 GGML_API int ggml_cpu_has_clblast (void);
 GGML_API int ggml_cpu_has_vulkan (void);
 GGML_API int ggml_cpu_has_kompute (void);
@@ -3,7 +3,7 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
 set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
 set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
 set(LLAMA_BLAS @LLAMA_BLAS@)
-set(LLAMA_CUBLAS @LLAMA_CUBLAS@)
+set(LLAMA_CUDA @LLAMA_CUDA@)
 set(LLAMA_METAL @LLAMA_METAL@)
 set(LLAMA_MPI @LLAMA_MPI@)
 set(LLAMA_CLBLAST @LLAMA_CLBLAST@)
@@ -27,7 +27,7 @@ if (LLAMA_BLAS)
 find_package(BLAS REQUIRED)
 endif()
 
-if (LLAMA_CUBLAS)
+if (LLAMA_CUDA)
 find_package(CUDAToolkit REQUIRED)
 endif()
 
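For reference, a sketch of the producer side that generates and installs this package config with the renamed option; the build directory and install prefix are assumptions:

```sh
# Sketch only: build llama.cpp with the CUDA backend and install it so the
# generated LlamaConfig.cmake lands under the chosen prefix.
cmake -B build -DLLAMA_CUDA=ON
cmake --build build --config Release
cmake --install build --prefix /opt/llama.cpp
```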
@@ -23,7 +23,7 @@ fi
 make_opts=""
 
 if [[ "$backend" == "cuda" ]]; then
-make_opts="LLAMA_CUBLAS=1"
+make_opts="LLAMA_CUDA=1"
 fi
 
 git checkout $1
@@ -42,7 +42,7 @@ git clone https://github.com/ggerganov/llama.cpp
 
 cd llama.cpp
 
-LLAMA_CUBLAS=1 make -j
+LLAMA_CUDA=1 make -j
 
 ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3 ./models/tinyllama-1b
 ln -sfn /workspace/CodeLlama-7b-hf ./models/codellama-7b
@@ -60,7 +60,7 @@ cd /workspace/llama.cpp
 mkdir build-cublas
 cd build-cublas
 
-cmake -DLLAMA_CUBLAS=1 ../
+cmake -DLLAMA_CUDA=1 ../
 make -j
 
 if [ "$1" -eq "0" ]; then
@@ -186,17 +186,17 @@ if [ "$1" -eq "1" ]; then
 # batched
 cd /workspace/llama.cpp
 
-LLAMA_CUBLAS=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999
+LLAMA_CUDA=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999
 
 # batched-bench
 cd /workspace/llama.cpp
 
-LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32
+LLAMA_CUDA=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32
 
 # parallel
 cd /workspace/llama.cpp
 
-LLAMA_CUBLAS=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb
+LLAMA_CUDA=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb
 
 fi
 
@@ -204,10 +204,10 @@ fi
 #if [ "$1" -eq "7" ]; then
 # cd /workspace/llama.cpp
 #
-# LLAMA_CUBLAS=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
+# LLAMA_CUDA=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
 #fi
 
 # more benches
-#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
-#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
+#LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
+#LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
 
@@ -380,7 +380,7 @@ fi
 
 if [[ "$backend" == "cuda" ]]; then
 printf "[+] Building with CUDA backend\n"
-LLAMA_CUBLAS=1 make -j server $log
+LLAMA_CUDA=1 make -j server $log
 elif [[ "$backend" == "cpu" ]]; then
 printf "[+] Building with CPU backend\n"
 make -j server $log