Merge remote-tracking branch 'origin/master' into opencl-dev
commit 49aaf08387
17 changed files with 766 additions and 90 deletions
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build

 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip
+    apt-get install -y build-essential python3 python3-pip git

 COPY requirements.txt requirements.txt

@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build

 RUN apt-get update && \
-    apt-get install -y build-essential
+    apt-get install -y build-essential git

 WORKDIR /app

.github/workflows/build.yml (vendored, 20 changes)

@@ -10,10 +10,10 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']

 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}

@@ -157,15 +157,15 @@ jobs:
       matrix:
         include:
           - build: 'avx2'
-            defines: ''
+            defines: '-DLLAMA_BUILD_SERVER=ON'
           - build: 'avx'
-            defines: '-DLLAMA_AVX2=OFF'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
           - build: 'avx512'
-            defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'clblast'
-            defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
           - build: 'openblas'
-            defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include"'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'

     steps:
       - name: Clone

@@ -187,7 +187,7 @@ jobs:
           curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
           curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
           7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
-          rename-item $env:RUNNER_TEMP/clblast_release_dir clblast
+          rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
           foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
             $txt = Get-Content -Path $f -Raw
             $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8

@@ -213,7 +213,6 @@ jobs:
           cd build
           cmake .. ${{ matrix.defines }}
           cmake --build . --config Release
-          cp ../LICENSE ./bin/Release/llama.cpp.txt

       - name: Add clblast.dll
         id: add_clblast_dll

@@ -258,6 +257,7 @@ jobs:
         id: pack_artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         run: |
+          Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*

       - name: Upload artifacts

@@ -292,7 +292,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_CUBLAS=ON
+          cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
           cmake --build . --config Release

       - name: Get commit hash
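Every entry in the CMake matrix above now passes `-DLLAMA_BUILD_SERVER=ON`. A minimal local equivalent of the CI configure and build steps, assuming a repository checkout with CMake available:

```sh
# local equivalent of the CI steps shown above (flags taken from the workflow diff)
mkdir build
cd build
cmake .. -DLLAMA_BUILD_SERVER=ON
cmake --build . --config Release
```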
@@ -66,7 +66,7 @@ endif()
 # 3rd party libs
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
 option(LLAMA_BLAS "llama: use BLAS" OFF)
-option(LLAMA_BLAS_VENDOR "llama: BLA_VENDOR from https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" Generic)
+set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
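Since `LLAMA_BLAS_VENDOR` is now a string cache variable rather than an `option()`, a vendor name can be passed directly on the command line. A sketch using the OpenBLAS vendor value that appears in the workflow matrix above:

```sh
# select a BLAS vendor via the new cache variable (vendor value as used in build.yml)
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
cmake --build . --config Release
```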
Makefile (19 changes)

@@ -1,5 +1,11 @@
 # Define the default target now so that it is always the first target
-default: main quantize quantize-stats perplexity embedding vdot
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot
+
+ifdef LLAMA_BUILD_SERVER
+    BUILD_TARGETS += server
+endif
+
+default: $(BUILD_TARGETS)

 ifndef UNAME_S
 UNAME_S := $(shell uname -s)

@@ -38,7 +44,11 @@ CFLAGS = -I. -O3 -std=c11 -fPIC
 CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
 LDFLAGS =

-ifndef LLAMA_DEBUG
+ifdef LLAMA_DEBUG
+    CFLAGS += -O0 -g
+    CXXFLAGS += -O0 -g
+    LDFLAGS += -g
+else
 CFLAGS += -DNDEBUG
 CXXFLAGS += -DNDEBUG
 endif

@@ -210,7 +220,7 @@ libllama.so: llama.o ggml.o $(OBJS)
     $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-    rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h
+    rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h

 #
 # Examples

@@ -237,6 +247,9 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
+    $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
     @sh scripts/build-info.sh > $@.tmp
     @if ! cmp -s $@.tmp $@; then \
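With these Makefile changes the `server` target is opt-in and debug builds switch from `ifndef` to `ifdef`. A usage sketch based only on the variables introduced above:

```sh
# build the default targets plus the new server example
make LLAMA_BUILD_SERVER=1

# opt into a debug build: -O0 -g and no -DNDEBUG (per the ifdef LLAMA_DEBUG branch)
make clean && make LLAMA_DEBUG=1
```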
README.md (85 changes)

@@ -240,11 +240,11 @@ In order to build llama.cpp you have three different options.

 Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:

-- Accelerate Framework:
+- **Accelerate Framework**:

   This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.

-- OpenBLAS:
+- **OpenBLAS**:

   This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.

@@ -278,11 +278,11 @@ Building the program with BLAS support may lead to some performance improvements
   cmake --build . --config Release
   ```

-- BLIS
+- **BLIS**

   Check [BLIS.md](BLIS.md) for more information.

-- Intel MKL
+- **Intel MKL**

   By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. You may also specify it by:

@@ -293,7 +293,7 @@ Building the program with BLAS support may lead to some performance improvements
   cmake --build . -config Release
   ```

-- cuBLAS
+- **cuBLAS**

   This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
   - Using `make`:

@@ -308,8 +308,81 @@ Building the program with BLAS support may lead to some performance improvements
   cmake .. -DLLAMA_CUBLAS=ON
   cmake --build . --config Release
   ```
+  Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.

-Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.
+- **CLBlast**
+
+  OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
+
+  You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
+    - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
+
+  - <details>
+    <summary>Installing the OpenCL SDK from source</summary>
+
+    ```sh
+    git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
+    mkdir OpenCL-SDK/build
+    cd OpenCL-SDK/build
+    cmake .. -DBUILD_DOCS=OFF \
+      -DBUILD_EXAMPLES=OFF \
+      -DBUILD_TESTING=OFF \
+      -DOPENCL_SDK_BUILD_SAMPLES=OFF \
+      -DOPENCL_SDK_TEST_SAMPLES=OFF
+    cmake --build . --config Release
+    cmake --install . --prefix /some/path
+    ```
+  </details>
+
+  Installing CLBlast: it may be found in your operating system's packages.
+
+  - <details>
+    <summary>If not, then installing from source:</summary>
+
+    ```sh
+    git clone https://github.com/CNugteren/CLBlast.git
+    mkdir CLBlast/build
+    cd CLBLast/build
+    cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
+    cmake --build . --config Release
+    cmake --install . --prefix /some/path
+    ```
+
+    Where `/some/path` is where the built library will be installed (default is `/usr/loca`l`).
+  </details>
+
+  Building:
+
+  - Build with make:
+    ```sh
+    make LLAMA_CLBLAST=1
+    ```
+  - CMake:
+    ```sh
+    mkdir build
+    cd build
+    cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
+    cmake --build . --config Release
+    ```
+
+  Running:
+
+  The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
+
+  To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
+  The selection can be a number (starting from 0) or a text string to search:
+
+  ```sh
+  GGML_OPENCL_PLATFORM=1 ./main ...
+  GGML_OPENCL_DEVICE=2 ./main ...
+  GGML_OPENCL_PLATFORM=Intel ./main ...
+  GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ...
+  ```
+
+  The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful.
+  Using the variables it is possible to select a CPU-based driver as well, if so desired.
+
+  You can get a list of platforms and devices from the `clinfo -l` command, etc.
+
 ### Prepare Data & Run

@@ -251,6 +251,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.model = argv[i];
+        } else if (arg == "-a" || arg == "--alias") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model_alias = argv[i];
         } else if (arg == "--lora") {
             if (++i >= argc) {
                 invalid_param = true;

@@ -283,7 +289,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
             params.n_gpu_layers = std::stoi(argv[i]);
+#else
+            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
         } else if (arg == "--mtest") {

@@ -410,7 +421,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
     fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     fprintf(stderr, "  --no-penalize-nl      do not penalize newline token\n");
-    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value\n");
+    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
     fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
     fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");

@@ -421,8 +433,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     if (llama_mmap_supported()) {
         fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, "                        number of layers to store in VRAM\n");
+#endif
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
     fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
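The option-parsing hunks above gate `-ngl/--n-gpu-layers` behind `LLAMA_SUPPORTS_GPU_OFFLOAD`; a CPU-only build now accepts the flag but warns and ignores it. An illustrative invocation (the model path is the default from `gpt_params`, not a file shipped with the repository):

```sh
./main -m models/7B/ggml-model.bin -ngl 32
# on a build without GPU offload support this prints:
#   warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored
#   warning: see main README.md for information on enabling GPU BLAS support
```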
@@ -45,6 +45,7 @@ struct gpt_params {
     float mirostat_eta = 0.10f; // learning rate

     std::string model = "models/7B/ggml-model.bin"; // model path
+    std::string model_alias = "unknown"; // model alias
     std::string prompt = "";
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
     std::string input_prefix = ""; // string to prefix user inputs with
@@ -69,8 +69,8 @@ In this section, we cover the most commonly used options for running the `main`
 - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
 - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 - `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
-- `-n N, --n_predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
-- `-c N, --ctx_size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
+- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.

 ## Input Prompts

@@ -136,9 +136,9 @@ During text generation, LLaMA models have a limited context size, which means th

 ### Context Size

-The `--ctx_size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
+The `--ctx-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.

-- `-c N, --ctx_size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
+- `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.

 ### Keep Prompt

@@ -146,7 +146,7 @@ The `--keep` option allows users to retain the original prompt when the model ru

 - `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.

-By utilizing context management options like `--ctx_size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
+By utilizing context management options like `--ctx-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.

 ## Generation Flags

@@ -154,11 +154,11 @@ The following options allow you to control the text generation process and fine-

 ### Number of Tokens to Predict

-- `-n N, --n_predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
+- `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).

-The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
+The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.

-It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
+It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n-predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.

 ### Temperature

@@ -170,33 +170,33 @@ Example usage: `--temp 0.5`

 ### Repeat Penalty

-- `--repeat_penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
-- `--repeat_last_n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx_size).
+- `--repeat-penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
+- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
 - `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.

-The `repeat_penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.
+The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.

-The `repeat_last_n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx_size`).
+The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`).

 Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases.

-Example usage: `--repeat_penalty 1.15 --repeat_last_n 128 --no-penalize-nl`
+Example usage: `--repeat-penalty 1.15 --repeat-last-n 128 --no-penalize-nl`

 ### Top-K Sampling

-- `--top_k N`: Limit the next token selection to the K most probable tokens (default: 40).
+- `--top-k N`: Limit the next token selection to the K most probable tokens (default: 40).

-Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40.
+Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top-k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40.

-Example usage: `--top_k 30`
+Example usage: `--top-k 30`

 ### Top-P Sampling

-- `--top_p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
+- `--top-p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).

-Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9.
+Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top-p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9.

-Example usage: `--top_p 0.95`
+Example usage: `--top-p 0.95`

 ### Tail Free Sampling (TFS)

@@ -217,16 +217,16 @@ Example usage: `--typical 0.9`
 ### Mirostat Sampling

 - `--mirostat N`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
-- `--mirostat_lr N`: Set the Mirostat learning rate, parameter eta (default: 0.1).
-- `--mirostat_ent N`: Set the Mirostat target entropy, parameter tau (default: 5.0).
+- `--mirostat-lr N`: Set the Mirostat learning rate, parameter eta (default: 0.1).
+- `--mirostat-ent N`: Set the Mirostat target entropy, parameter tau (default: 5.0).

 Mirostat is an algorithm that actively maintains the quality of generated text within a desired range during text generation. It aims to strike a balance between coherence and diversity, avoiding low-quality output caused by excessive repetition (boredom traps) or incoherence (confusion traps).

-The `--mirostat_lr` option sets the Mirostat learning rate (eta). The learning rate influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. The default value is `0.1`.
+The `--mirostat-lr` option sets the Mirostat learning rate (eta). The learning rate influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. The default value is `0.1`.

-The `--mirostat_ent` option sets the Mirostat target entropy (tau), which represents the desired perplexity value for the generated text. Adjusting the target entropy allows you to control the balance between coherence and diversity in the generated text. A lower value will result in more focused and coherent text, while a higher value will lead to more diverse and potentially less coherent text. The default value is `5.0`.
+The `--mirostat-ent` option sets the Mirostat target entropy (tau), which represents the desired perplexity value for the generated text. Adjusting the target entropy allows you to control the balance between coherence and diversity in the generated text. A lower value will result in more focused and coherent text, while a higher value will lead to more diverse and potentially less coherent text. The default value is `5.0`.

-Example usage: `--mirostat 2 --mirostat_lr 0.05 --mirostat_ent 3.0`
+Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0`

 ### Logit Bias

@@ -264,11 +264,11 @@ These options help improve the performance and memory usage of the LLaMA models.

 ### Memory Float 32

-- `--memory_f32`: Use 32-bit floats instead of 16-bit floats for memory key+value, allowing higher quality inference at the cost of higher memory usage.
+- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.

 ### Batch Size

-- `-b N, --batch_size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
+- `-b N, --batch-size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.

 ### Prompt Caching

@@ -285,5 +285,6 @@ These options provide extra functionality and customization when running the LLa
 - `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
 - `--verbose-prompt`: Print the prompt before generating text.
 - `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
+- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
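The documentation hunks above settle on hyphenated flag spellings for the `main` example. A combined invocation using only flags documented above (model path illustrative):

```sh
./main -m models/7B/ggml-model.bin -c 2048 -n 256 \
  --repeat-penalty 1.15 --repeat-last-n 128 --top-k 30 --top-p 0.95
```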
@@ -360,6 +360,12 @@ int main(int argc, char ** argv) {
                 }
             }
             if (i > 0) {
+                // check if we've used up all the prompt but not all cached tokens
+                if (embd.size() == i && n_session_consumed < (int) session_tokens.size()) {
+                    // force revaluation of the last token to recalculate logits
+                    i--;
+                    n_past--;
+                }
                 embd.erase(embd.begin(), embd.begin() + i);
             }
         }
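The hunk above forces re-evaluation of the last cached token whenever a saved session already covers the entire prompt, so the logits needed for sampling are recomputed. A sketch of exercising that path; the `--prompt-cache` flag name is an assumption taken from the surrounding project (`path_prompt_cache` in `gpt_params`), not from this diff:

```sh
# first run saves the prompt evaluation state; the second run reuses it and,
# with this change, re-evaluates only the final prompt token to recover logits
./main -m models/7B/ggml-model.bin --prompt-cache prompt.bin -p "Once upon a time"
./main -m models/7B/ggml-model.bin --prompt-cache prompt.bin -p "Once upon a time"
```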
@@ -285,7 +285,8 @@ Test();
 ## Common Options

 - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
-- `-c N, --ctx_size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 - `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**.
 - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`;
 - `--port`: Set the port to listen. Default: `8080`.

@@ -304,7 +305,7 @@ The RNG seed is used to initialize the random number generator that influences t

 ### Memory Float 32

-- `--memory_f32`: Use 32-bit floats instead of 16-bit floats for memory key+value, allowing higher quality inference at the cost of higher memory usage.
+- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement but does not appear to increase generation quality in a measurable way. Not recommended.

 ## Limitations:

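Putting the server options documented above together; the model path and alias value are illustrative, and `-a/--alias` comes from the argument-parsing changes earlier in this commit:

```sh
./server -m models/7B/ggml-model.bin -a my-7b -c 2048 -ngl 32 --host 127.0.0.1 --port 8080
```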
@@ -61,7 +61,7 @@ struct llama_server_context
         std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
         // compare the evaluated prompt with the new prompt
         int new_prompt_len = 0;
-        for (int i = 0;i < prompt_tokens.size(); i++) {
+        for (size_t i = 0; i < prompt_tokens.size(); i++) {
             if (i < processed_tokens.size() &&
                 processed_tokens[i] == prompt_tokens[i])
             {

@@ -71,7 +71,7 @@ struct llama_server_context
             {
                 embd_inp.push_back(prompt_tokens[i]);
                 if(new_prompt_len == 0) {
-                    if(i - 1 < n_past) {
+                    if(int32_t(i) - 1 < n_past) {
                         processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end());
                     }
                     // Evaluate the new fragment prompt from the last token processed.

@@ -136,7 +136,7 @@ struct llama_server_context
             {
                 // out of user input, sample next token
                 const float temp = params.temp;
-                const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+                // const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
                 const float top_p = params.top_p;
                 const float tfs_z = params.tfs_z;
                 const float typical_p = params.typical_p;

@@ -306,12 +306,12 @@ struct llama_server_context
         // Avoid add the no show words to the response
         for (std::vector<llama_token> word_tokens : no_show_words)
         {
-            int match_token = 1;
+            size_t match_token = 1;
             if (tokens_predicted.front() == word_tokens.front())
             {
                 bool execute_matching = true;
                 if (tokens_predicted.size() > 1) { // if previus tokens had been tested
-                    for (int i = 1; i < word_tokens.size(); i++)
+                    for (size_t i = 1; i < word_tokens.size(); i++)
                     {
                         if (i >= tokens_predicted.size()) {
                             match_token = i;

@@ -385,7 +385,9 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help            show this help message and exit\n");
     fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
-    fprintf(stderr, "  --memory_f32          use f32 instead of f16 for memory key+value\n");
+    fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
     fprintf(stderr, "  --embedding           enable embedding mode\n");
     fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
     if (llama_mlock_supported())

@@ -396,12 +398,16 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
     {
         fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, "                        number of layers to store in VRAM\n");
+#endif
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "  -host                 ip address to listen (default 127.0.0.1)\n");
-    fprintf(stderr, "  -port PORT            port to listen (default 8080)\n");
+    fprintf(stderr, "  -a ALIAS, --alias ALIAS\n");
+    fprintf(stderr, "                        set an alias for the model, will be added as `model` field in completion response\n");
+    fprintf(stderr, "  --host                ip address to listen (default 127.0.0.1)\n");
+    fprintf(stderr, "  --port PORT           port to listen (default 8080)\n");
     fprintf(stderr, "\n");
 }

@@ -453,6 +459,15 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
         }
         params.model = argv[i];
     }
+    else if (arg == "-a" || arg == "--alias")
+    {
+        if (++i >= argc)
+        {
+            invalid_param = true;
+            break;
+        }
+        params.model_alias = argv[i];
+    }
     else if (arg == "--embedding")
     {
         params.embedding = true;

@@ -462,7 +477,7 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
         server_print_usage(argc, argv, default_params);
         exit(0);
     }
-    else if (arg == "-c" || arg == "--ctx_size")
+    else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size")
     {
         if (++i >= argc)
         {

@@ -471,7 +486,7 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
         }
         params.n_ctx = std::stoi(argv[i]);
     }
-    else if (arg == "--memory_f32")
+    else if (arg == "--memory-f32" || arg == "--memory_f32")
     {
         params.memory_f16 = false;
     }

@@ -482,7 +497,12 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
             invalid_param = true;
             break;
         }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
         params.n_gpu_layers = std::stoi(argv[i]);
+#else
+        fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+        fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
     }
     else
     {

@@ -601,7 +621,7 @@ int main(int argc, char **argv)

     Server svr;

-    svr.Get("/", [](const Request &req, Response &res)
+    svr.Get("/", [](const Request &, Response &res)
             { res.set_content("<h1>llama.cpp server works</h1>", "text/html"); });

     svr.Post("/completion", [&llama](const Request &req, Response &res)

@@ -645,11 +665,12 @@ int main(int argc, char **argv)
             try
             {
                 json data = {
+                    {"model", llama.params.model_alias },
                     {"content", llama.generated_text },
                     {"tokens_predicted", llama.num_tokens_predicted}};
                 return res.set_content(data.dump(), "application/json");
             }
-            catch (json::exception e)
+            catch (const json::exception &e)
             {
                 // Some tokens have bad UTF-8 strings, the json parser is very sensitive
                 json data = {

@@ -701,7 +722,7 @@ int main(int argc, char **argv)
                     {"content", result },
                     {"stop", !llama.has_next_token }};
                 return res.set_content(data.dump(), "application/json");
-            } catch (json::exception e) {
+            } catch (const json::exception &e) {
                 // Some tokens have bad UTF-8 strings, the json parser is very sensitive
                 json data = {
                     {"content", "" },
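With the alias plumbed through, `/completion` responses gain a `model` field alongside `content` and `tokens_predicted`. A hedged request sketch; the response keys come from the `json data` initializers above, while the request body shape is an assumption not shown in this diff:

```sh
# request shape is assumed; only the response fields are confirmed by the diff above
curl -s -X POST http://127.0.0.1:8080/completion \
  -H 'Content-Type: application/json' \
  -d '{"prompt": "Hello"}'
# => {"content": " ...", "model": "my-7b", "tokens_predicted": 12}
```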
@ -497,16 +497,11 @@ void ggml_cl_init(void) {
|
||||||
|
|
||||||
size_t ext_str_size;
|
size_t ext_str_size;
|
||||||
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
|
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
|
||||||
char* ext_buffer = (char*) malloc(sizeof(char) * ext_str_size);
|
char *ext_buffer = (char *)alloca(ext_str_size + 1);
|
||||||
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
|
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
|
||||||
|
ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
|
||||||
// Check if ext_buffer contains cl_khr_fp16
|
// Check if ext_buffer contains cl_khr_fp16
|
||||||
for (size_t i = 0; i < ext_str_size - 12; i++) {
|
fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
|
||||||
if (memcmp(ext_buffer + i, "cl_khr_fp16", 11) == 0) {
|
|
||||||
fp16_support = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
free(ext_buffer);
|
|
||||||
fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
|
fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
|
||||||
|
|
||||||
cl_context_properties properties[] = {
|
cl_context_properties properties[] = {
|
||||||
@@ -795,7 +790,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_CL) {
-        d_X = *(cl_mem*) src0->data;
+        d_X = (cl_mem) src0->data;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
     }
@@ -871,7 +866,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_CL) {
-        d_X = *(cl_mem*) src0->data;
+        d_X = (cl_mem) src0->data;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
     }
@@ -998,7 +993,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             events.emplace_back();
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
         } else if (src0->backend == GGML_BACKEND_CL) {
-            d_Q = *(cl_mem*) src0->data;
+            d_Q = (cl_mem) src0->data;
         } else {
             GGML_ASSERT(false);
         }
@@ -1145,14 +1140,13 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
     const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);

     size_t q_size;
-    cl_mem* dst = (cl_mem*) malloc(sizeof(cl_mem));
-    *dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);

     // copy tensor to device
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
             int i = i3*ne2 + i2;
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, *dst, i*ne0*ne1, tensor, i3, i2, NULL));
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
         }
     }
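The hunks above stop dereferencing a heap-allocated cl_mem* and instead keep the cl_mem handle itself in tensor->data, and ggml_cl_transform_tensor now stores it the same way. A small sketch of that convention, assuming the ggml and OpenCL headers; the helper names are illustrative only:

    #include <CL/cl.h>
    #include "ggml.h"

    // Illustrative only: a cl_mem handle is a pointer-sized opaque object, so it can be
    // placed in tensor->data directly and cast back where the tensor is consumed.
    static void store_cl_buffer(struct ggml_tensor * tensor, cl_mem buf) {
        tensor->data = (void *) buf;   // no separate cl_mem* allocation needed
    }

    static cl_mem load_cl_buffer(const struct ggml_tensor * tensor) {
        return (cl_mem) tensor->data;  // mirrors "d_X = (cl_mem) src0->data;" above
    }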
542 ggml.c

@@ -186,10 +186,12 @@ typedef double ggml_float;
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
+#if !defined(__riscv)
 #include <immintrin.h>
 #endif
 #endif
 #endif
+#endif

 #ifdef __F16C__

@@ -3494,7 +3496,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
 };
 static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");

-static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
+static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "NONE",

     "DUP",
@@ -3749,6 +3751,9 @@ const char * ggml_type_name(enum ggml_type type) {
     return GGML_TYPE_NAME[type];
 }

+const char * ggml_op_name(enum ggml_op op) {
+    return GGML_OP_NAME[op];
+}

 size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return GGML_TYPE_SIZE[tensor->type];
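ggml_op_name() exposes the same string table that the graph printer uses, alongside the existing ggml_type_name(). A minimal usage sketch; the helper name is hypothetical:

    #include <stdio.h>
    #include "ggml.h"

    // Hypothetical helper: report what a tensor is, using the newly exported accessor.
    static void print_tensor_kind(const struct ggml_tensor * t) {
        printf("%s of type %s\n", ggml_op_name(t->op), ggml_type_name(t->type));
    }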
@@ -3805,6 +3810,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
     return wtype;
 }

+size_t ggml_tensor_overhead(void) {
+    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
+}
+
 static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
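ggml_tensor_overhead() reports the fixed per-tensor bookkeeping cost (object header plus tensor struct plus alignment slack), so callers can size a context instead of guessing. A sketch under the assumption that n_tensors and data_bytes are caller-supplied estimates:

    #include "ggml.h"

    // Sketch: size a context for up to n_tensors tensors plus their payload bytes.
    static struct ggml_context * make_sized_ctx(size_t n_tensors, size_t data_bytes) {
        struct ggml_init_params params = {
            .mem_size   = n_tensors*ggml_tensor_overhead() + data_bytes,
            .mem_buffer = NULL,
            .no_alloc   = false,
        };
        return ggml_init(params);
    }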
@@ -4017,6 +4026,18 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
     return result;
 }

+void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
+    ctx->no_alloc = no_alloc;
+}
+
+void * ggml_get_mem_buffer(struct ggml_context * ctx) {
+    return ctx->mem_buffer;
+}
+
+size_t ggml_get_mem_size(struct ggml_context * ctx) {
+    return ctx->mem_size;
+}
+
 // IMPORTANT:
 // when creating "opt" tensors, always save and load the scratch buffer
 // this is an error prone process, but it is necessary to support inplace
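The new no_alloc switch and memory accessors let a caller create tensor metadata without reserving data space and inspect the context's backing pool, which is what ggml_graph_import() further down relies on. A rough sketch; the demo function is hypothetical:

    #include <stdio.h>
    #include "ggml.h"

    // Hypothetical demo: create a metadata-only tensor, then inspect the context pool.
    static void no_alloc_demo(struct ggml_context * ctx) {
        ggml_set_no_alloc(ctx, true);    // tensors created from here on get no data buffer
        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
        // ... caller points t->data at externally managed storage here ...
        ggml_set_no_alloc(ctx, false);

        printf("pool at %p, %zu bytes\n", ggml_get_mem_buffer(ctx), ggml_get_mem_size(ctx));
        (void) t;
    }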
@@ -4061,7 +4082,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
     struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);

     if (ctx->scratch.data == NULL || data != NULL) {
-        size_needed += sizeof(struct ggml_tensor);
+        size_needed += GGML_TENSOR_SIZE;

         if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
             GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
@@ -4077,14 +4098,15 @@ struct ggml_tensor * ggml_new_tensor_impl(
         };
     } else {
         if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
-            GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
+            GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+                    __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
             assert(false);
             return NULL;
         }

-        if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {
+        if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
             GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
-                    __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);
+                    __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
             assert(false);
             return NULL;
         }
@@ -4093,7 +4115,7 @@ struct ggml_tensor * ggml_new_tensor_impl(

     *obj_new = (struct ggml_object) {
         .offs = cur_end + GGML_OBJECT_SIZE,
-        .size = sizeof(struct ggml_tensor),
+        .size = GGML_TENSOR_SIZE,
         .next = NULL,
     };

@@ -4509,6 +4531,23 @@ struct ggml_tensor * ggml_view_tensor(
     return result;
 }

+struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
+    struct ggml_object * obj = ctx->objects_begin;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
+        if (strcmp(cur->name, name) == 0) {
+            return cur;
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
 ////////////////////////////////////////////////////////////////////////////////

 // ggml_dup
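ggml_get_tensor() walks the context's object list and returns the first tensor whose name matches. A usage sketch, assuming the tensor was named with the library's existing ggml_set_name(); the demo function itself is hypothetical:

    #include <stdio.h>
    #include "ggml.h"

    // Hypothetical demo: name a tensor, then retrieve it from the context by name.
    static void lookup_demo(struct ggml_context * ctx) {
        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
        ggml_set_name(a, "embeddings");

        struct ggml_tensor * found = ggml_get_tensor(ctx, "embeddings");
        printf("found: %s\n", found ? found->name : "(not found)");
    }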
@@ -6303,7 +6342,7 @@ struct ggml_tensor * ggml_alibi(

     ggml_scratch_save(ctx);

-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);

     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_head;
@@ -13799,11 +13838,19 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         // reached a leaf node, not part of the gradient graph (e.g. a constant)
         GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);

+        if (strlen(node->name) == 0) {
+            snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
+        }
+
         cgraph->leafs[cgraph->n_leafs] = node;
         cgraph->n_leafs++;
     } else {
         GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);

+        if (strlen(node->name) == 0) {
+            snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
+        }
+
         cgraph->nodes[cgraph->n_nodes] = node;
         cgraph->grads[cgraph->n_nodes] = node->grad;
         cgraph->n_nodes++;
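With the change above, tensors that were never named receive a generated "leaf_%d" or "node_%d" name when the graph is built, so every graph entry can later be found by name. A sketch of that lookup; the demo function is hypothetical and the generated names depend on visit order:

    #include <stdio.h>
    #include "ggml.h"

    // Hypothetical demo: unnamed graph entries become addressable through generated names.
    static void auto_name_demo(struct ggml_context * ctx) {
        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        struct ggml_tensor * c = ggml_add(ctx, a, b);

        struct ggml_cgraph gf = ggml_build_forward(c);

        // the only unnamed node here becomes "node_0"; a and b become "leaf_0" and "leaf_1"
        struct ggml_tensor * n0 = ggml_graph_get_tensor(&gf, "node_0");
        printf("node_0 is a %s\n", n0 ? ggml_op_name(n0->op) : "(missing)");
    }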
@@ -14517,6 +14564,481 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
     }
 }

+struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
+    for (int i = 0; i < cgraph->n_leafs; i++) {
+        struct ggml_tensor * leaf = cgraph->leafs[i];
+
+        if (strcmp(leaf->name, name) == 0) {
+            return leaf;
+        }
+    }
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        struct ggml_tensor * node = cgraph->nodes[i];
+
+        if (strcmp(node->name, name) == 0) {
+            return node;
+        }
+    }
+
+    return NULL;
+}
+
+static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) {
+    const int64_t * ne = tensor->ne;
+    const size_t  * nb = tensor->nb;
+
+    fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n",
+            ggml_type_name(tensor->type),
+            ggml_op_name  (tensor->op),
+            tensor->n_dims,
+            ne[0], ne[1], ne[2], ne[3],
+            nb[0], nb[1], nb[2], nb[3],
+            tensor->data,
+            tensor->name);
+}
+
+static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) {
+    const int64_t * ne = tensor->ne;
+    const size_t  * nb = tensor->nb;
+
+    fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n",
+            arg,
+            ggml_type_name(tensor->type),
+            ggml_op_name  (tensor->op),
+            tensor->n_dims,
+            ne[0], ne[1], ne[2], ne[3],
+            nb[0], nb[1], nb[2], nb[3],
+            tensor->n_tasks,
+            tensor->data,
+            tensor->name);
+}
+
+void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
+    assert(cgraph->work      == NULL);
+    assert(cgraph->work_size == 0);
+
+    uint64_t size_eval = 0;
+
+    // compute size of intermediate results
+    // TODO: does not take into account scratch buffers !!!!
+    for (int i = 0; i < cgraph->n_nodes; ++i) {
+        size_eval += ggml_nbytes(cgraph->nodes[i]);
+    }
+
+    // print
+    {
+        FILE * fout = stdout;
+
+        fprintf(fout, "\n");
+        fprintf(fout, "%-16s %8x\n",   "magic",   GGML_FILE_MAGIC);
+        fprintf(fout, "%-16s %8d\n",   "version", GGML_FILE_VERSION);
+        fprintf(fout, "%-16s %8d\n",   "leafs",   cgraph->n_leafs);
+        fprintf(fout, "%-16s %8d\n",   "nodes",   cgraph->n_nodes);
+        fprintf(fout, "%-16s %8llu\n", "eval",    size_eval);
+
+        // header
+        fprintf(fout, "\n");
+        fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n",
+                "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME");
+
+        for (int i = 0; i < cgraph->n_leafs; ++i) {
+            ggml_graph_export_leaf(cgraph->leafs[i], fout);
+
+            GGML_ASSERT(cgraph->leafs[i]->op   == GGML_OP_NONE);
+            GGML_ASSERT(cgraph->leafs[i]->src0 == NULL);
+            GGML_ASSERT(cgraph->leafs[i]->src1 == NULL);
+        }
+
+        // header
+        fprintf(fout, "\n");
+        fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n",
+                "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME");
+
+        for (int i = 0; i < cgraph->n_nodes; ++i) {
+            ggml_graph_export_node(cgraph->nodes[i], "DST", fout);
+
+            if (cgraph->nodes[i]->src0) {
+                ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout);
+            }
+
+            if (cgraph->nodes[i]->src1) {
+                ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout);
+            }
+
+            for (int j = 0; j < GGML_MAX_OPT; ++j) {
+                if (cgraph->nodes[i]->opt[j]) {
+                    ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout);
+                }
+            }
+
+            fprintf(fout, "\n");
+        }
+
+        fprintf(fout, "\n");
+    }
+
+    // write binary data
+    {
+        FILE * fout = fopen(fname, "wb");
+
+        if (!fout) {
+            fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+            return;
+        }
+
+        // header
+        {
+            const uint32_t magic   = GGML_FILE_MAGIC;
+            const uint32_t version = GGML_FILE_VERSION;
+            const uint32_t n_leafs = cgraph->n_leafs;
+            const uint32_t nodes   = cgraph->n_nodes;
+
+            fwrite(&magic,     sizeof(uint32_t), 1, fout);
+            fwrite(&version,   sizeof(uint32_t), 1, fout);
+            fwrite(&n_leafs,   sizeof(uint32_t), 1, fout);
+            fwrite(&nodes,     sizeof(uint32_t), 1, fout);
+            fwrite(&size_eval, sizeof(uint64_t), 1, fout);
+        }
+
+        // leafs
+        {
+            for (int i = 0; i < cgraph->n_leafs; ++i) {
+                const struct ggml_tensor * tensor = cgraph->leafs[i];
+
+                const uint32_t type   = tensor->type;
+                const uint32_t op     = tensor->op;
+                const uint32_t n_dims = tensor->n_dims;
+
+                fwrite(&type,   sizeof(uint32_t), 1, fout);
+                fwrite(&op,     sizeof(uint32_t), 1, fout);
+                fwrite(&n_dims, sizeof(uint32_t), 1, fout);
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    const uint64_t ne = tensor->ne[j];
+                    const uint64_t nb = tensor->nb[j];
+
+                    fwrite(&ne, sizeof(uint64_t), 1, fout);
+                    fwrite(&nb, sizeof(uint64_t), 1, fout);
+                }
+
+                // store the pointer address
+                {
+                    const uint64_t ptr = (uint64_t) tensor->data;
+
+                    fwrite(&ptr, sizeof(uint64_t), 1, fout);
+                }
+
+                fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+
+                // dump the data
+                // TODO: pad this to 32 byte boundary
+                {
+                    const size_t size = ggml_nbytes(tensor);
+
+                    fwrite(tensor->data, sizeof(char), size, fout);
+                }
+            }
+        }
+
+        // nodes
+        {
+            for (int i = 0; i < cgraph->n_nodes; ++i) {
+                const struct ggml_tensor * tensor = cgraph->nodes[i];
+
+                const uint32_t type   = tensor->type;
+                const uint32_t op     = tensor->op;
+                const uint32_t n_dims = tensor->n_dims;
+
+                fwrite(&type,   sizeof(uint32_t), 1, fout);
+                fwrite(&op,     sizeof(uint32_t), 1, fout);
+                fwrite(&n_dims, sizeof(uint32_t), 1, fout);
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    const uint64_t ne = tensor->ne[j];
+                    const uint64_t nb = tensor->nb[j];
+
+                    fwrite(&ne, sizeof(uint64_t), 1, fout);
+                    fwrite(&nb, sizeof(uint64_t), 1, fout);
+                }
+
+                // store the pointer address
+                {
+                    const uint64_t ptr = (uint64_t) tensor->data;
+
+                    fwrite(&ptr, sizeof(uint64_t), 1, fout);
+                }
+
+                fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+
+                // output the op arguments
+                {
+                    struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
+
+                    args[0] = tensor->src0;
+                    args[1] = tensor->src1;
+
+                    for (int j = 0; j < GGML_MAX_OPT; ++j) {
+                        args[2 + j] = tensor->opt[j];
+                    }
+
+                    for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+                        if (args[j]) {
+                            int32_t idx = -1;
+
+                            // check if leaf
+                            {
+                                for (int k = 0; k < cgraph->n_leafs; ++k) {
+                                    if (args[j] == cgraph->leafs[k]) {
+                                        idx = k;
+                                        break;
+                                    }
+                                }
+                            }
+
+                            // check if node
+                            if (idx == -1) {
+                                for (int k = 0; k < cgraph->n_nodes; ++k) {
+                                    if (args[j] == cgraph->nodes[k]) {
+                                        idx = GGML_MAX_NODES + k;
+                                        break;
+                                    }
+                                }
+                            }
+
+                            if (idx == -1) {
+                                fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
+                                return;
+                            }
+
+                            fwrite(&idx, sizeof(int32_t), 1, fout);
+                        } else {
+                            const int32_t nul = -1;
+
+                            fwrite(&nul, sizeof(int32_t), 1, fout);
+                        }
+                    }
+                }
+            }
+        }
+
+        fclose(fout);
+    }
+}
+
+struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
+    assert(*ctx_data == NULL);
+    assert(*ctx_eval == NULL);
+
+    struct ggml_cgraph result = { 0 };
+
+    struct ggml_tensor * data = NULL;
+
+    // read file into data
+    {
+        FILE * fin = fopen(fname, "rb");
+
+        if (!fin) {
+            fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+            return result;
+        }
+
+        size_t fsize = 0;
+
+        fseek(fin, 0, SEEK_END);
+        fsize = ftell(fin);
+        fseek(fin, 0, SEEK_SET);
+
+        // create the data context
+        {
+            const size_t overhead = 1*ggml_tensor_overhead();
+
+            struct ggml_init_params params = {
+                .mem_size   = fsize + overhead,
+                .mem_buffer = NULL,
+                .no_alloc   = false,
+            };
+
+            *ctx_data = ggml_init(params);
+
+            if (!*ctx_data) {
+                fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+                return result;
+            }
+        }
+
+        data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
+
+        fread(data->data, sizeof(char), fsize, fin);
+
+        fclose(fin);
+    }
+
+    // populate result
+    {
+        char * ptr = (char *) data->data;
+
+        const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic);
+
+        if (magic != GGML_FILE_MAGIC) {
+            fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic);
+            return result;
+        }
+
+        const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version);
+
+        if (version != GGML_FILE_VERSION) {
+            fprintf(stderr, "%s: invalid version number\n", __func__);
+            return result;
+        }
+
+        const uint32_t n_leafs   = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
+        const uint32_t n_nodes   = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
+        const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
+
+        result.n_leafs = n_leafs;
+        result.n_nodes = n_nodes;
+
+        // create the data context
+        {
+            const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead();
+
+            struct ggml_init_params params = {
+                .mem_size   = size_eval + overhead,
+                .mem_buffer = NULL,
+                .no_alloc   = true,
+            };
+
+            *ctx_eval = ggml_init(params);
+
+            if (!*ctx_eval) {
+                fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+                return result;
+            }
+        }
+
+        // leafs
+        {
+            uint32_t type;
+            uint32_t op;
+            uint32_t n_dims;
+
+            for (uint32_t i = 0; i < n_leafs; ++i) {
+                type   = *(const uint32_t *) ptr; ptr += sizeof(type);
+                op     = *(const uint32_t *) ptr; ptr += sizeof(op);
+                n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
+
+                int64_t ne[GGML_MAX_DIMS];
+                size_t  nb[GGML_MAX_DIMS];
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    uint64_t ne_cur;
+                    uint64_t nb_cur;
+
+                    ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
+                    nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
+
+                    ne[j] = ne_cur;
+                    nb[j] = nb_cur;
+                }
+
+                struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+                tensor->op = (enum ggml_op) op;
+
+                uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+
+                memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+
+                tensor->data = (void *) ptr;
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    tensor->nb[j] = nb[j];
+                }
+
+                result.leafs[i] = tensor;
+
+                ptr += ggml_nbytes(tensor);
+
+                fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+            }
+        }
+
+        ggml_set_no_alloc(*ctx_eval, false);
+
+        // nodes
+        {
+            uint32_t type;
+            uint32_t op;
+            uint32_t n_dims;
+
+            for (uint32_t i = 0; i < n_nodes; ++i) {
+                type   = *(const uint32_t *) ptr; ptr += sizeof(type);
+                op     = *(const uint32_t *) ptr; ptr += sizeof(op);
+                n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
+
+                int64_t ne[GGML_MAX_DIMS];
+                size_t  nb[GGML_MAX_DIMS];
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    uint64_t ne_cur;
+                    uint64_t nb_cur;
+
+                    ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
+                    nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
+
+                    ne[j] = ne_cur;
+                    nb[j] = nb_cur;
+                }
+
+                struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+                tensor->op = (enum ggml_op) op;
+
+                uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+
+                memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    tensor->nb[j] = nb[j];
+                }
+
+                // parse args
+                {
+                    struct ggml_tensor ** args[2 + GGML_MAX_OPT] = {
+                        &tensor->src0,
+                        &tensor->src1,
+                    };
+
+                    for (int j = 0; j < GGML_MAX_OPT; ++j) {
+                        args[2 + j] = &tensor->opt[j];
+                    }
+
+                    for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+                        const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx);
+
+                        if (arg_idx == -1) {
+                            continue;
+                        }
+
+                        if (arg_idx < GGML_MAX_NODES) {
+                            *args[j] = result.leafs[arg_idx];
+                        } else {
+                            *args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
+                        }
+                    }
+                }
+
+                result.nodes[i] = tensor;
+
+                fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+            }
+        }
+    }
+
+    return result;
+}
+
 void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};

@@ -14534,7 +15056,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
                 i,
                 node->ne[0], node->ne[1], node->ne[2],
-                GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
+                GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
                 (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
                 (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
                 (double) node->perf_time_us / 1000.0,
@@ -14548,7 +15070,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
                 i,
                 node->ne[0], node->ne[1],
-                GGML_OP_LABEL[node->op]);
+                GGML_OP_NAME[node->op]);
     }

     for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -14556,7 +15078,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
             continue;
         }

-        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0);
+        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
     }

     GGML_PRINT("========================================\n");
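The new exporter prints a text summary to stdout and writes a binary dump of the graph, and the importer rebuilds that dump into two fresh contexts (one holding the raw file, one holding the reconstructed tensors). A round-trip sketch; the file name is an arbitrary example and gf is assumed to be a graph built with ggml_build_forward() that has not been given a work buffer, since the exporter expects cgraph->work to be unset:

    #include <stdio.h>
    #include "ggml.h"

    // Hypothetical demo of the new export/import API.
    static void export_import_demo(struct ggml_cgraph * gf) {
        ggml_graph_export(gf, "graph.ggml");

        struct ggml_context * ctx_data = NULL;
        struct ggml_context * ctx_eval = NULL;

        struct ggml_cgraph gf_in = ggml_graph_import("graph.ggml", &ctx_data, &ctx_eval);
        printf("imported %d leafs, %d nodes\n", gf_in.n_leafs, gf_in.n_nodes);

        ggml_free(ctx_eval);
        ggml_free(ctx_data);
    }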
22 ggml.h

@@ -198,6 +198,7 @@
 #define GGML_MAX_PARAMS 256
 #define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_OPT 4
+#define GGML_MAX_NAME 32
 #define GGML_DEFAULT_N_THREADS 4

 #define GGML_ASSERT(x) \
@@ -372,11 +373,13 @@ extern "C" {

         void * data;

-        char name[32];
+        char name[GGML_MAX_NAME];

         char padding[16];
     };

+    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
@@ -429,6 +432,7 @@ extern "C" {
     GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float

     GGML_API const char * ggml_type_name(enum ggml_type type);
+    GGML_API const char * ggml_op_name (enum ggml_op op);

     GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);

@@ -437,6 +441,9 @@ extern "C" {
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);

+    // use this to compute the memory overhead of a tensor
+    GGML_API size_t ggml_tensor_overhead(void);
+
     // main

     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
@@ -444,7 +451,11 @@ extern "C" {

     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);

-    GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
+
+    GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
+    GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);

     GGML_API struct ggml_tensor * ggml_new_tensor(
             struct ggml_context * ctx,
@@ -484,6 +495,8 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);

+    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
     GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
@@ -970,6 +983,11 @@ extern "C" {
     GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
     GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);

+    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+
+    GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
+    GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
+
     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
llama.cpp

@@ -42,6 +42,7 @@
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_3B,
     MODEL_7B,
     MODEL_13B,
     MODEL_30B,
@@ -58,6 +59,7 @@ static const size_t MB = 1024*1024;
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 128ull * MB },
         { MODEL_7B, 512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
@@ -69,6 +71,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 128ull * MB },
         { MODEL_7B, 512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
@@ -81,6 +84,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 682ull * MB },
         { MODEL_7B, 1026ull * MB },
         { MODEL_13B, 1608ull * MB },
         { MODEL_30B, 3124ull * MB },
@@ -94,6 +98,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 512ull * MB },
         { MODEL_7B, 768ull * MB },
         { MODEL_13B, 1024ull * MB },
         { MODEL_30B, 1280ull * MB },
@@ -899,6 +904,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {

 static const char *llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_3B: return "3B";
         case MODEL_7B: return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
@@ -932,6 +938,7 @@ static void llama_model_load_internal(

     {
         switch (hparams.n_layer) {
+            case 26: model.type = e_model::MODEL_3B; break;
             case 32: model.type = e_model::MODEL_7B; break;
             case 40: model.type = e_model::MODEL_13B; break;
             case 60: model.type = e_model::MODEL_30B; break;
5 llama.h

@@ -31,6 +31,11 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1

+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+#define LLAMA_SUPPORTS_GPU_OFFLOAD
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
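llama.h now advertises GPU offload support at compile time whenever the library is built with cuBLAS or CLBlast. A sketch of how a caller might key off the new macro; the layer count of 32 is an arbitrary example, not a recommendation:

    #include "llama.h"

    // Hypothetical helper: choose a default n_gpu_layers based on the compile-time flag.
    static int default_gpu_layers(void) {
    #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
        return 32;
    #else
        return 0;
    #endif
    }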