Merge remote-tracking branch 'origin/master' into json-fixes

commit 6dcf856259

41 changed files with 6046 additions and 4648 deletions
.github/workflows/build.yml (vendored) | 68

@@ -98,40 +98,40 @@ jobs:
           cd build
           ctest -L main --verbose --timeout 900

-  ubuntu-latest-cmake-sanitizer:
-    runs-on: ubuntu-latest
-
-    continue-on-error: true
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug, Release]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-
-      - name: Build
-        id: cmake_build
-        run: |
-          mkdir build
-          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
+  # ubuntu-latest-cmake-sanitizer:
+  #   runs-on: ubuntu-latest
+  #
+  #   continue-on-error: true
+  #
+  #   strategy:
+  #     matrix:
+  #       sanitizer: [ADDRESS, THREAD, UNDEFINED]
+  #       build_type: [Debug, Release]
+  #
+  #   steps:
+  #     - name: Clone
+  #       id: checkout
+  #       uses: actions/checkout@v3
+  #
+  #     - name: Dependencies
+  #       id: depends
+  #       run: |
+  #         sudo apt-get update
+  #         sudo apt-get install build-essential
+  #
+  #     - name: Build
+  #       id: cmake_build
+  #       run: |
+  #         mkdir build
+  #         cd build
+  #         cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+  #         cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
+  #
+  #     - name: Test
+  #       id: cmake_test
+  #       run: |
+  #         cd build
+  #         ctest -L main --verbose --timeout 900

   ubuntu-latest-cmake-mpi:
     runs-on: ubuntu-latest
.github/workflows/close-issue.yml (vendored) | 2

@@ -12,10 +12,10 @@ jobs:
     steps:
       - uses: actions/stale@v5
         with:
+          exempt-issue-labels: "refactor,help wanted,good first issue,research"
           days-before-issue-stale: 30
           days-before-issue-close: 14
           stale-issue-label: "stale"
-          stale-issue-message: "This issue is stale because it has been open for 30 days with no activity."
           close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
           days-before-pr-stale: -1
           days-before-pr-close: -1
.github/workflows/server.yml (vendored) | 6

@@ -24,13 +24,13 @@ jobs:

     strategy:
       matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        # TODO: temporary disabled due to linux kernel issues
+        #sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        sanitizer: [UNDEFINED]
         build_type: [Debug]
         include:
           - build_type: Release
             sanitizer: ""
-          - build_type: Debug
-            sanitizer: THREAD
             disabled_on_pr: true
       fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
.gitignore (vendored) | 2

@@ -13,6 +13,8 @@
 *.bat
 *.tmp
 *.metallib
+*.etag
+*.lastModified
 .DS_Store
 .build/
 .cache/
Makefile | 4

@@ -757,6 +757,10 @@ gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
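With the new Makefile target in place, the tool builds like the other example binaries:

```sh
# from the repository root
make gguf-split
```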
README-sycl.md | 129

@@ -29,6 +29,7 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
 ## News

 - 2024.3
+  - New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
   - Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing.
   - Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE.
   - Support detecting all GPUs with level-zero and same top **Max compute units**.

@@ -81,7 +82,7 @@ For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, rec
 |-|-|-|
 |Ampere Series| Support| A100|

-### oneMKL
+### oneMKL for CUDA

 The current oneMKL release does not contain the oneMKL cuBlas backend.
 As a result for Nvidia GPU's oneMKL must be built from source.

@@ -254,16 +255,16 @@ Run without parameter:
 Check the ID in startup log, like:

 ```
-found 4 SYCL devices:
-Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
-max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
-Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
-max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
-Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
-max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
-Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
-max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
+found 6 SYCL devices:
+|  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
+|ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|
+|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
+| 0|[level_zero:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       1.3|        512|    1024|     32|    16225243136|
+| 1|[level_zero:gpu:1]|                    Intel(R) UHD Graphics 770|       1.3|         32|     512|     32|    53651849216|
+| 2|    [opencl:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       3.0|        512|    1024|     32|    16225243136|
+| 3|    [opencl:gpu:1]|                    Intel(R) UHD Graphics 770|       3.0|         32|     512|     32|    53651849216|
+| 4|    [opencl:cpu:0]|         13th Gen Intel(R) Core(TM) i7-13700K|       3.0|         24|    8192|     64|    67064815616|
+| 5|    [opencl:acc:0]|               Intel(R) FPGA Emulation Device|       1.2|         24|67108864|     64|    67064815616|
 ```

 |Attribute|Note|

@@ -271,12 +272,35 @@ found 4 SYCL devices:
 |compute capability 1.3|Level-zero running time, recommended |
 |compute capability 3.0|OpenCL running time, slower than level-zero in most cases|

-4. Set device ID and execute llama.cpp
+4. Device selection and execution of llama.cpp

-Set device ID = 0 by **GGML_SYCL_DEVICE=0**
+There are two device selection modes:
+
+- Single device: Use one device assigned by user.
+- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
+
+|Device selection|Parameter|
+|-|-|
+|Single device|--split-mode none --main-gpu DEVICE_ID |
+|Multiple devices|--split-mode layer (default)|
+
+Examples:
+
+- Use device 0:

 ```sh
-GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
+ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
+```
+or run by script:
+
+```sh
+./examples/sycl/run_llama2.sh 0
+```
+
+- Use multiple devices:
+
+```sh
+ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
 ```
 or run by script:

@@ -289,12 +313,18 @@ Note:
 - By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.


-5. Check the device ID in output
+5. Verify the device ID in output

-Like:
+Verify to see if the selected GPU is shown in the output, like:
+
 ```
-Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
+detect 1 SYCL GPUs: [0] with top Max compute units:512
 ```
+Or
+```
+use 1 SYCL GPUs: [0] with Max compute units:512
+```

 ## Windows

@@ -355,7 +385,7 @@ a. Download & install cmake for Windows: https://cmake.org/download/

 b. Download & install mingw-w64 make for Windows provided by w64devkit

-- Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
+- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).

 - Extract `w64devkit` on your pc.

@@ -430,15 +460,16 @@ build\bin\main.exe
 Check the ID in startup log, like:

 ```
-found 4 SYCL devices:
-Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
-max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
-Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
-max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
-Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
-max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
-Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
-max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
+found 6 SYCL devices:
+|  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
+|ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|
+|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
+| 0|[level_zero:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       1.3|        512|    1024|     32|    16225243136|
+| 1|[level_zero:gpu:1]|                    Intel(R) UHD Graphics 770|       1.3|         32|     512|     32|    53651849216|
+| 2|    [opencl:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       3.0|        512|    1024|     32|    16225243136|
+| 3|    [opencl:gpu:1]|                    Intel(R) UHD Graphics 770|       3.0|         32|     512|     32|    53651849216|
+| 4|    [opencl:cpu:0]|         13th Gen Intel(R) Core(TM) i7-13700K|       3.0|         24|    8192|     64|    67064815616|
+| 5|    [opencl:acc:0]|               Intel(R) FPGA Emulation Device|       1.2|         24|67108864|     64|    67064815616|
+
 ```

@@ -447,13 +478,31 @@ found 4 SYCL devices:
 |compute capability 1.3|Level-zero running time, recommended |
 |compute capability 3.0|OpenCL running time, slower than level-zero in most cases|

-4. Set device ID and execute llama.cpp
-
-Set device ID = 0 by **set GGML_SYCL_DEVICE=0**
+4. Device selection and execution of llama.cpp
+
+There are two device selection modes:
+
+- Single device: Use one device assigned by user.
+- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
+
+|Device selection|Parameter|
+|-|-|
+|Single device|--split-mode none --main-gpu DEVICE_ID |
+|Multiple devices|--split-mode layer (default)|
+
+Examples:
+
+- Use device 0:

 ```
-set GGML_SYCL_DEVICE=0
-build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0
+build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
+```
+
+- Use multiple devices:
+
+```
+build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
 ```
 or run by script:

@@ -466,11 +515,17 @@ Note:
 - By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.


-5. Check the device ID in output
-
-Like:
+5. Verify the device ID in output
+
+Verify to see if the selected GPU is shown in the output, like:
+
 ```
-Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
+detect 1 SYCL GPUs: [0] with top Max compute units:512
+```
+Or
+```
+use 1 SYCL GPUs: [0] with Max compute units:512
 ```

 ## Environment Variable

@@ -489,7 +544,6 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device

 |Name|Value|Function|
 |-|-|-|
-|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
 |GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
 |ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer|

@@ -507,6 +561,9 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 ## Q&A

+Note: please add prefix **[SYCL]** in issue title, so that we will check it as soon as possible.
+
 - Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.

   Miss to enable oneAPI running environment.

@@ -538,4 +595,4 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 ## Todo

-- Support multiple cards.
+- Support row layer split for multiple card runs.
common/common.cpp | 2058 (file diff suppressed because it is too large)
common/json.hpp | 1630 (file diff suppressed because it is too large)
common/sampling.h

@@ -32,13 +32,13 @@ typedef struct llama_sampling_params {
     float   dynatemp_range    = 0.00f; // 0.0 = disabled
     float   dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
     int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   penalty_repeat    = 1.10f; // 1.0 = disabled
+    float   penalty_repeat    = 1.00f; // 1.0 = disabled
     float   penalty_freq      = 0.00f; // 0.0 = disabled
     float   penalty_present   = 0.00f; // 0.0 = disabled
     int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float   mirostat_tau      = 5.00f; // target entropy
     float   mirostat_eta      = 0.10f; // learning rate
-    bool    penalize_nl       = true;  // consider newlines as a repeatable token
+    bool    penalize_nl       = false; // consider newlines as a repeatable token

     std::vector<llama_sampler_type> samplers_sequence = {
         llama_sampler_type::TOP_K,
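In effect, repetition penalties are now off by default. They can still be enabled per run; a sketch assuming main's existing sampling flags --repeat-penalty and --penalize-nl (the model path is hypothetical):

```sh
# restore the previous defaults for a single run
./main -m models/llama-2-7b.Q4_0.gguf --repeat-penalty 1.1 --penalize-nl
```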
convert-hf-to-gguf.py

@@ -1634,7 +1634,7 @@ in chat mode so that the conversation can end normally.")
         self.post_write_tensors(tensor_map, name, data_torch)


-@Model.register("BertModel")
+@Model.register("BertModel", "CamembertModel")
 class BertModel(Model):
     model_arch = gguf.MODEL_ARCH.BERT
convert.py

@@ -1167,9 +1167,9 @@ class OutputFile:
 def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
     wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type

-    if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
+    if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
         return GGMLFileType.AllF32
-    if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
+    if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
         return GGMLFileType.MostlyF16
     if output_type_str == "q8_0":
         return GGMLFileType.MostlyQ8_0
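The practical effect: a checkpoint whose attention weights are stored as BF16 now defaults to f32 output instead of f16, since BF16 does not round-trip losslessly through f16. The smaller f16 file can still be requested explicitly; a sketch assuming convert.py's --outtype option (the model path is hypothetical):

```sh
# force f16 output for a BF16 checkpoint instead of the new f32 default
python ./convert.py path/to/bf16-model --outtype f16
```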
examples/CMakeLists.txt

@@ -21,6 +21,7 @@ else()
     add_subdirectory(embedding)
     add_subdirectory(finetune)
     add_subdirectory(gritlm)
+    add_subdirectory(gguf-split)
     add_subdirectory(infill)
     add_subdirectory(llama-bench)
     add_subdirectory(llava)
examples/gguf-split/CMakeLists.txt (new file) | 5

set(TARGET gguf-split)
add_executable(${TARGET} gguf-split.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/gguf-split/README.md (new file) | 9

## GGUF split Example

CLI to split / merge GGUF files.

**Command line options:**

- `--split`: split GGUF to multiple GGUF, default operation.
- `--split-max-tensors`: maximum tensors in each split: default(128)
- `--merge`: merge multiple GGUF to a single GGUF.
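A typical invocation, following the `gguf-split [options] GGUF_IN GGUF_OUT` usage string printed by the tool (the model paths here are hypothetical):

```sh
# split: GGUF_OUT is used as a prefix; shards are written as
# <prefix>-00001-of-0000N.gguf, <prefix>-00002-of-0000N.gguf, ...
./gguf-split --split --split-max-tensors 128 models/llama-2-7b.Q4_0.gguf models/llama-2-7b.Q4_0

# merge: pass the first shard as GGUF_IN
./gguf-split --merge models/llama-2-7b.Q4_0-00001-of-00003.gguf models/llama-2-7b.Q4_0-merged.gguf
```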
examples/gguf-split/gguf-split.cpp (new file) | 489

#include "llama.h"
#include "ggml.h"
#include "common.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <ios>
#include <string>
#include <vector>

#include <stdio.h>
#include <fcntl.h>
#include <string.h>

enum split_operation : uint8_t {
    SPLIT_OP_SPLIT,
    SPLIT_OP_MERGE,
};

static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "general.split";
static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "general.split_count";

static const int SPLIT_FILENAME_MAX = 256;

static const char * const SPLIT_FILENAME_FORMAT = "%s-%05d-of-%05d.gguf";

struct split_params {
    split_operation operation = SPLIT_OP_SPLIT;
    int n_split_tensors = 128;
    std::string input;
    std::string output;
};

static void split_print_usage(const char * executable) {
    const split_params default_params;
    printf("\n");
    printf("usage: %s [options] GGUF_IN GGUF_OUT\n", executable);
    printf("\n");
    printf("Apply a GGUF operation on IN to OUT.");
    printf("\n");
    printf("options:\n");
    printf("  -h, --help              show this help message and exit\n");
    printf("  --version               show version and build info\n");
    printf("  --split                 split GGUF to multiple GGUF (default)\n");
    printf("  --split-max-tensors     max tensors in each split: default(%d)\n", default_params.n_split_tensors);
    printf("  --merge                 merge multiple GGUF to a single GGUF\n");
    printf("\n");
}

static bool split_params_parse_ex(int argc, const char ** argv, split_params & params) {
    std::string arg;
    const std::string arg_prefix = "--";
    bool invalid_param = false;

    int arg_idx = 1;
    for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
        arg = argv[arg_idx];
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }

        bool arg_found = false;
        if (arg == "-h" || arg == "--help") {
            split_print_usage(argv[0]);
            exit(0);
        }
        if (arg == "--version") {
            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
            exit(0);
        }

        if (arg == "--merge") {
            arg_found = true;
            params.operation = SPLIT_OP_MERGE;
        }
        if (arg == "--split") {
            arg_found = true;
            params.operation = SPLIT_OP_SPLIT;
        }
        if (arg == "--split-max-tensors") {
            if (++arg_idx >= argc) {
                invalid_param = true;
                break;
            }
            arg_found = true;
            params.n_split_tensors = atoi(argv[arg_idx]);
        }

        if (!arg_found) {
            throw std::invalid_argument("error: unknown argument: " + arg);
        }
    }

    if (invalid_param) {
        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
    }

    if (argc - arg_idx < 2) {
        printf("%s: bad arguments\n", argv[0]);
        split_print_usage(argv[0]);
        return false;
    }

    params.input = argv[arg_idx++];
    params.output = argv[arg_idx++];

    return true;
}

static bool split_params_parse(int argc, const char ** argv, split_params & params) {
    bool result = true;
    try {
        if (!split_params_parse_ex(argc, argv, params)) {
            split_print_usage(argv[0]);
            exit(1);
        }
    }
    catch (const std::invalid_argument & ex) {
        fprintf(stderr, "%s\n", ex.what());
        split_print_usage(argv[0]);
        exit(1);
    }
    return result;
}

static void zeros(std::ofstream & file, size_t n) {
    char zero = 0;
    for (size_t i = 0; i < n; ++i) {
        file.write(&zero, 1);
    }
}

static std::string split_file_name(const std::string & path, int i_split, int n_split) {
    char f_split[SPLIT_FILENAME_MAX] = {0};
    snprintf(f_split, sizeof(f_split), SPLIT_FILENAME_FORMAT, path.c_str(), i_split + 1, n_split);
    return std::string(f_split);
}

struct split_strategy {
    const split_params params;
    std::ifstream & f_input;
    struct gguf_context * ctx_gguf;
    struct ggml_context * ctx_meta = NULL;
    const int n_tensors;

    const int n_split;
    int i_split = 0;

    int i_tensor = 0;

    std::vector<uint8_t> read_data;

    struct gguf_context * ctx_out;
    std::ofstream fout;

    split_strategy(const split_params & params,
            std::ifstream & f_input,
            struct gguf_context * ctx_gguf,
            struct ggml_context * ctx_meta) :
        params(params),
        f_input(f_input),
        ctx_gguf(ctx_gguf),
        ctx_meta(ctx_meta),
        n_tensors(gguf_get_n_tensors(ctx_gguf)),
        n_split(std::ceil(1. * n_tensors / params.n_split_tensors)) {
    }

    bool should_split() const {
        return i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0;
    }

    void split_start() {
        ctx_out = gguf_init_empty();

        // Save all metadata in first split only
        if (i_split == 0) {
            gguf_set_kv(ctx_out, ctx_gguf);
        }
        gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split);
        gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split);

        // populate the original tensors, so we get an initial metadata
        for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) {
            struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
            gguf_add_tensor(ctx_out, meta);
        }

        auto split_name = split_file_name(params.output, i_split, n_split);

        fprintf(stderr, "%s: %s ...", __func__, split_name.c_str());
        fout = std::ofstream(split_name, std::ios::binary);
        fout.exceptions(std::ofstream::failbit); // fail fast on write errors

        auto meta_size = gguf_get_meta_size(ctx_out);

        // placeholder for the meta data
        ::zeros(fout, meta_size);

        i_split++;
    }

    void next_tensor() {
        const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
        struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
        auto n_bytes = ggml_nbytes(t);

        if (read_data.size() < n_bytes) {
            read_data.resize(n_bytes);
        }

        auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor);
        f_input.seekg(offset);
        f_input.read((char *)read_data.data(), n_bytes);

        t->data = read_data.data();

        // write tensor data + padding
        fout.write((const char *)t->data, n_bytes);
        zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);

        i_tensor++;
    }

    void split_end() {
        // go back to beginning of file and write the updated metadata
        fout.seekp(0);
        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
        gguf_get_meta_data(ctx_out, data.data());
        fout.write((const char *)data.data(), data.size());

        fout.close();
        gguf_free(ctx_out);

        fprintf(stderr, "\033[3Ddone\n");
    }
};

static void gguf_split(const split_params & split_params) {
    struct ggml_context * ctx_meta = NULL;

    struct gguf_init_params params = {
        /*.no_alloc = */ true,
        /*.ctx      = */ &ctx_meta,
    };

    std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
    if (!f_input.is_open()) {
        fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str());
        exit(1);
    }

    auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params);
    if (!ctx_gguf) {
        fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
        exit(1);
    }

    split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta);
    fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n",
            __func__, split_params.input.c_str(),
            split_file_name(split_params.output, strategy.i_split, strategy.n_split).c_str(),
            split_params.n_split_tensors);

    strategy.split_start();

    while (strategy.i_tensor < strategy.n_tensors) {
        strategy.next_tensor();
        if (strategy.should_split()) {
            strategy.split_end();
            strategy.split_start();
        }
    }
    strategy.split_end();

    gguf_free(ctx_gguf);
    f_input.close();

    fprintf(stderr, "%s: %d gguf split written with a total of %d tensors.\n",
            __func__, strategy.n_split, strategy.n_tensors);
}

static void gguf_merge(const split_params & split_params) {
    fprintf(stderr, "%s: %s -> %s\n",
            __func__, split_params.input.c_str(),
            split_params.output.c_str());
    int n_split = 1;
    int total_tensors = 0;

    auto * ctx_out = gguf_init_empty();
    std::ofstream fout(split_params.output.c_str(), std::ios::binary);
    fout.exceptions(std::ofstream::failbit); // fail fast on write errors

    std::vector<uint8_t> read_data;
    std::vector<ggml_context *> ctx_metas;
    std::vector<gguf_context *> ctx_ggufs;

    std::string split_prefix;

    // First pass to find KV and tensors metadata
    for (int i_split = 0; i_split < n_split; i_split++) {
        struct ggml_context * ctx_meta = NULL;

        struct gguf_init_params params = {
            /*.no_alloc = */ true,
            /*.ctx      = */ &ctx_meta,
        };

        auto split_name = split_params.input;
        if (i_split > 0) {
            split_name = split_file_name(split_prefix, i_split, n_split);
        }
        fprintf(stderr, "%s: reading metadata %s ...", __func__, split_name.c_str());

        auto * ctx_gguf = gguf_init_from_file(split_name.c_str(), params);
        if (!ctx_gguf) {
            fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
            exit(1);
        }
        ctx_ggufs.push_back(ctx_gguf);
        ctx_metas.push_back(ctx_meta);

        if (i_split == 0) {
            auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT);
            if (key_n_split < 0) {
                fprintf(stderr,
                        "\n%s: input file does not contain %s metadata\n",
                        __func__,
                        LLM_KV_GENERAL_SPLIT_N_SPLIT);
                gguf_free(ctx_gguf);
                gguf_free(ctx_out);
                fout.close();
                exit(1);
            }

            n_split = gguf_get_val_u8(ctx_gguf, key_n_split);
            if (n_split < 1) {
                fprintf(stderr,
                        "\n%s: input file does not contain a valid split count %d\n",
                        __func__,
                        n_split);
                gguf_free(ctx_gguf);
                gguf_free(ctx_out);
                fout.close();
                exit(1);
            }

            // Do not trigger merge if we try to merge again the output
            gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0);

            // Set metadata from the first split
            gguf_set_kv(ctx_out, ctx_gguf);
        }

        // Verify the file naming
        {
            int i_split_file = 0;
            int n_split_file = 0;
            const char * i_split_format = "-00000-of-00000.gguf";

            if (split_name.size() < strlen(i_split_format)) {
                fprintf(stderr, "\n%s: unexpected input file name: %s\n", __func__, split_params.input.c_str());
                for (auto * _ctx_gguf : ctx_ggufs) {
                    gguf_free(_ctx_gguf);
                }
                gguf_free(ctx_out);
                fout.close();
                exit(1);
            }

            split_prefix = split_name.substr(0, split_name.size() - strlen(i_split_format));

            const char * split_name_c_str = split_name.c_str();
            int n_part = sscanf(&split_name_c_str[0] + split_prefix.size(), "-%d-of-%d", &i_split_file, &n_split_file);

            if (n_part != 2 || i_split_file - 1 != i_split || n_split_file != n_split) {
                fprintf(stderr, "\n%s: unexpected input file name: %s"
                        " i_split=%d i_split_file=%d"
                        " n_split=%d n_split_file=%d\n", __func__,
                        split_params.input.c_str(),
                        i_split, i_split_file,
                        n_split, n_split_file);
                for (auto * _ctx_gguf : ctx_ggufs) {
                    gguf_free(_ctx_gguf);
                }
                gguf_free(ctx_out);
                fout.close();
                exit(1);
            }
        }

        auto n_tensors = gguf_get_n_tensors(ctx_gguf);
        for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
            const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
            gguf_add_tensor(ctx_out, t);
        }
        total_tensors += n_tensors;

        fprintf(stderr, "\033[3Ddone\n");
    }

    // placeholder for the meta data
    {
        auto meta_size = gguf_get_meta_size(ctx_out);
        ::zeros(fout, meta_size);
    }

    // Write tensors data
    for (int i_split = 0; i_split < n_split; i_split++) {
        auto split_name = split_file_name(split_prefix, i_split, n_split);
        std::ifstream f_input(split_name.c_str(), std::ios::binary);
        if (!f_input.is_open()) {
            fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_name.c_str());
            for (auto * _ctx_gguf : ctx_ggufs) {
                gguf_free(_ctx_gguf);
            }
            gguf_free(ctx_out);
            fout.close();
            exit(1);
        }
        fprintf(stderr, "%s: writing tensors %s ...", __func__, split_name.c_str());

        auto * ctx_gguf = ctx_ggufs[i_split];
        auto * ctx_meta = ctx_metas[i_split];

        auto n_tensors = gguf_get_n_tensors(ctx_gguf);
        for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
            const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);

            auto n_bytes = ggml_nbytes(t);

            if (read_data.size() < n_bytes) {
                read_data.resize(n_bytes);
            }

            auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor);
            f_input.seekg(offset);
            f_input.read((char *)read_data.data(), n_bytes);

            // write tensor data + padding
            fout.write((const char *)read_data.data(), n_bytes);
            zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
        }

        gguf_free(ctx_gguf);
        ggml_free(ctx_meta);
        f_input.close();
        fprintf(stderr, "\033[3Ddone\n");
    }

    {
        // go back to beginning of file and write the updated metadata
        fout.seekp(0);
        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
        gguf_get_meta_data(ctx_out, data.data());
        fout.write((const char *)data.data(), data.size());

        fout.close();
        gguf_free(ctx_out);
    }

    fprintf(stderr, "%s: %s merged from %d split with %d tensors.\n",
            __func__, split_params.output.c_str(), n_split, total_tensors);
}

int main(int argc, const char ** argv) {
    if (argc < 3) {
        split_print_usage(argv[0]);
    }

    split_params params;
    split_params_parse(argc, argv, params);

    switch (params.operation) {
        case SPLIT_OP_SPLIT: gguf_split(params);
            break;
        case SPLIT_OP_MERGE: gguf_merge(params);
            break;
        default: split_print_usage(argv[0]);
            exit(1);
    }

    return 0;
}
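Note how `gguf_merge` recovers the shard prefix: it strips a fixed-width `-00000-of-00000.gguf` suffix from the first input name and validates each shard with `sscanf("-%d-of-%d")`, so merging only works on files that keep the exact `%s-%05d-of-%05d.gguf` naming produced by the split pass, e.g. (the `ggml-model` prefix is hypothetical):

```sh
# shard layout the merge pass expects; pass the first file as GGUF_IN
ls models/
# ggml-model-00001-of-00003.gguf
# ggml-model-00002-of-00003.gguf
# ggml-model-00003-of-00003.gguf
```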
examples/imatrix/imatrix.cpp

@@ -56,13 +56,31 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     const struct ggml_tensor * src0 = t->src[0];
     const struct ggml_tensor * src1 = t->src[1];

+    std::string wname;
+    {
+        // remove any prefix and suffixes from the name
+        // CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight
+        const char * p = strchr(src0->name, '#');
+        if (p != NULL) {
+            p = p + 1;
+            const char * q = strchr(p, '#');
+            if (q != NULL) {
+                wname = std::string(p, q - p);
+            } else {
+                wname = p;
+            }
+        } else {
+            wname = src0->name;
+        }
+    }
+
     // when ask is true, the scheduler wants to know if we are interested in data from this tensor
     // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
     if (ask) {
         if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
         if (t->op != GGML_OP_MUL_MAT) return false;
         if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
-        if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false;
+        if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
         return true;
     }

@@ -94,12 +112,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         // this is necessary to guarantee equal number of "ncall" for each tensor
         for (int ex = 0; ex < n_as; ++ex) {
             src0 = t->src[2 + ex];
-            auto& e = m_stats[src0->name];
+            auto& e = m_stats[wname];
             if (e.values.empty()) {
                 e.values.resize(src1->ne[0], 0);
             }
             else if (e.values.size() != (size_t)src1->ne[0]) {
-                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
+                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
                 exit(1); //GGML_ASSERT(false);
             }
             // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger

@@ -107,7 +125,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
             ++e.ncall;
             if (m_params.verbosity > 1) {
-                printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+                printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
             }
             for (int row = 0; row < (int)src1->ne[1]; ++row) {
                 const int excur = m_ids[row*n_as + idx];

@@ -129,17 +147,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             }
         }
     } else {
-        auto& e = m_stats[src0->name];
+        auto& e = m_stats[wname];
        if (e.values.empty()) {
             e.values.resize(src1->ne[0], 0);
         }
         else if (e.values.size() != (size_t)src1->ne[0]) {
-            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
+            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
             exit(1); //GGML_ASSERT(false);
         }
         ++e.ncall;
         if (m_params.verbosity > 1) {
-            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
         }
         for (int row = 0; row < (int)src1->ne[1]; ++row) {
             const float * x = data + row * src1->ne[0];
examples/llama-bench/llama-bench.cpp

@@ -114,10 +114,10 @@ static std::string get_cpu_info() {
 static std::string get_gpu_info() {
     std::string id;
 #ifdef GGML_USE_CUBLAS
-    int count = ggml_cuda_get_device_count();
+    int count = ggml_backend_cuda_get_device_count();
     for (int i = 0; i < count; i++) {
         char buf[128];
-        ggml_cuda_get_device_description(i, buf, sizeof(buf));
+        ggml_backend_cuda_get_device_description(i, buf, sizeof(buf));
         id += buf;
         if (i < count - 1) {
             id += "/";
examples/llava/MobileVLM-README.md

@@ -1,11 +1,13 @@
 # MobileVLM

-Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.
+Currently this implementation supports [MobileVLM-1.7B](https://huggingface.co/mtgv/MobileVLM-1.7B) / [MobileVLM_V2-1.7B](https://huggingface.co/mtgv/MobileVLM_V2-1.7B) variants.

 for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM)

 The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.

+Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using MobiVLM as an example, the different conversion step will be shown.
+
 ## Usage
 Build with cmake or run `make llava-cli` to build it.

@@ -34,7 +36,7 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
 ```

-3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
+3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** the arg is `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:

 ```sh
 python ./examples/llava/convert-image-encoder-to-gguf \

@@ -44,6 +46,14 @@ python ./examples/llava/convert-image-encoder-to-gguf \
     --projector-type ldp
 ```

+```sh
+python ./examples/llava/convert-image-encoder-to-gguf \
+    -m path/to/clip-vit-large-patch14-336 \
+    --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
+    --output-dir path/to/MobileVLM-1.7B_V2 \
+    --projector-type ldpv2
+```
+
 4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
examples/llava/clip.cpp

@@ -119,6 +119,7 @@ static std::string format(const char * fmt, ...) {
 #define TN_LLAVA_PROJ        "mm.%d.%s"
 #define TN_MVLM_PROJ_MLP     "mm.model.mlp.%d.%s"
 #define TN_MVLM_PROJ_BLOCK   "mm.model.mb_block.%d.block.%d.%s"
+#define TN_MVLM_PROJ_PEG     "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE     "model.image_newline"

@@ -126,12 +127,14 @@ enum projector_type {
     PROJECTOR_TYPE_MLP,
     PROJECTOR_TYPE_MLP_NORM,
     PROJECTOR_TYPE_LDP,
+    PROJECTOR_TYPE_LDPV2,
     PROJECTOR_TYPE_UNKNOWN,
 };

 static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_MLP, "mlp" },
     { PROJECTOR_TYPE_LDP, "ldp" },
+    { PROJECTOR_TYPE_LDPV2, "ldpv2"},
 };

@@ -475,6 +478,14 @@ struct clip_vision_model {
     struct ggml_tensor * mm_model_block_2_block_2_0_w;
     struct ggml_tensor * mm_model_block_2_block_2_1_w;
     struct ggml_tensor * mm_model_block_2_block_2_1_b;
+
+    // MobileVLM_V2 projection
+    struct ggml_tensor * mm_model_mlp_0_w;
+    struct ggml_tensor * mm_model_mlp_0_b;
+    struct ggml_tensor * mm_model_mlp_2_w;
+    struct ggml_tensor * mm_model_mlp_2_b;
+    struct ggml_tensor * mm_model_peg_0_w;
+    struct ggml_tensor * mm_model_peg_0_b;
 };

 struct clip_ctx {

@@ -497,7 +508,6 @@ struct clip_ctx {

     // memory buffers to evaluate the model
     ggml_backend_buffer_t params_buffer  = NULL;
-    ggml_backend_buffer_t compute_buffer = NULL;

     ggml_backend_t backend       = NULL;
     ggml_gallocr_t compute_alloc = NULL;

@@ -808,6 +818,29 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         }
         embeddings = block_1;
     }
+    else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
+    {
+        int n_patch = 24;
+        struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+        mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
+        mlp_0 = ggml_gelu(ctx0, mlp_0);
+        struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
+        mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
+        // mlp_2 ne = [2048, 576, 1, 1]
+        // // AVG Pool Layer 2*2, strides = 2
+        mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
+        // mlp_2 ne = [576, 2048, 1, 1]
+        mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
+        // mlp_2 ne [24, 24, 2048, 1]
+        mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
+        // weight ne = [3, 3, 2048, 1]
+        struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+        peg_0 = ggml_add(ctx0, peg_0, mlp_2);
+        peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
+        peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
+        peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
+        embeddings = peg_0;
+    }
     else {
         GGML_ASSERT(false);
     }

@@ -1178,7 +1211,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
         vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
         vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
-    } else {
+    }
+    else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2)
+    {
+        // MobilVLM_V2 projection
+        vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight"));
+        vision_model.mm_model_mlp_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "bias"));
+        vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "weight"));
+        vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias"));
+        vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
+        vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
+    }
+    else {
         std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
         throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
     }

@@ -1676,6 +1720,9 @@ void clip_free(clip_ctx * ctx) {
     ggml_free(ctx->ctx_data);
     gguf_free(ctx->ctx_gguf);

+    ggml_backend_buffer_free(ctx->params_buffer);
+    ggml_backend_free(ctx->backend);
+    ggml_gallocr_free(ctx->compute_alloc);
     delete ctx;
 }

@@ -1964,6 +2011,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
         return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
     }
+    if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
+        return ctx->vision_model.mm_model_peg_0_b->ne[0];
+    }
     if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
         return ctx->vision_model.mm_2_b->ne[0];
     }
@ -1,6 +1,7 @@
import argparse
import os
import json
import re

import torch
import numpy as np

@ -38,9 +39,11 @@ def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: b
def get_tensor_name(name: str) -> str:
    if "projection" in name:
        return name

    if "mm_projector" in name:
        return name.replace("model.mm_projector", "mm")
        name = name.replace("model.mm_projector", "mm")
        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
        return name

    return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
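Note: as a concrete illustration (hypothetical tensor names following the MobileVLM_V2 checkpoint layout), these substitutions would map model.mm_projector.mlp.mlp.0.weight to mm.model.mlp.0.weight and model.mm_projector.peg.peg.0.weight to mm.model.peg.0.weight, which lines up with the TN_MVLM_PROJ_MLP / TN_MVLM_PROJ_PEG names the C++ loader above expects.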
@ -83,7 +86,7 @@ ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
                help="The clip model is from openclip (for ViT-SO400M type))")
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5

File diff suppressed because it is too large
@ -5,15 +5,14 @@ import sys
import time
import traceback
from contextlib import closing
from subprocess import TimeoutExpired
import psutil


def before_scenario(context, scenario):
    context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON'
    if context.debug:
        print("DEBUG=ON\n")
        print("DEBUG=ON")
    print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m\n")
    print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m")
    port = 8080
    if 'PORT' in os.environ:
        port = int(os.environ['PORT'])

@ -27,60 +26,40 @@ def after_scenario(context, scenario):
        return
    if scenario.status == "failed":
        if 'GITHUB_ACTIONS' in os.environ:
            print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
            print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n")
            if os.path.isfile('llama.log'):
                with closing(open('llama.log', 'r')) as f:
                    for line in f:
                        print(line)
        if not is_server_listening(context.server_fqdn, context.server_port):
            print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n")
            print("\x1b[33;101mERROR: Server stopped listening\x1b[0m")

    if not pid_exists(context.server_process.pid):
    if context.server_process.poll() is not None:
        assert False, f"Server not running pid={context.server_process.pid} ..."

    server_graceful_shutdown(context)
    server_graceful_shutdown(context)  # SIGINT

    # Wait few for socket to free up
    try:
    time.sleep(0.05)
        context.server_process.wait(0.5)
    except TimeoutExpired:
        print(f"server still alive after 500ms, force-killing pid={context.server_process.pid} ...")
        context.server_process.kill()  # SIGKILL
        context.server_process.wait()

    attempts = 0
    while is_server_listening(context.server_fqdn, context.server_port):
    while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port):
        server_kill(context)
        time.sleep(0.1)
        attempts += 1
    except Exception:
        if attempts > 5:
        print("ignoring error in after_scenario:")
            server_kill_hard(context)
        traceback.print_exc(file=sys.stdout)
    except:
        exc = sys.exception()
        print("error in after scenario: \n")
        print(exc)
        print("*** print_tb: \n")
        traceback.print_tb(exc.__traceback__, file=sys.stdout)


def server_graceful_shutdown(context):
    print(f"shutting down server pid={context.server_process.pid} ...\n")
    print(f"shutting down server pid={context.server_process.pid} ...")
    if os.name == 'nt':
        os.kill(context.server_process.pid, signal.CTRL_C_EVENT)
        interrupt = signal.CTRL_C_EVENT
    else:
        os.kill(context.server_process.pid, signal.SIGINT)
        interrupt = signal.SIGINT
    context.server_process.send_signal(interrupt)


def server_kill(context):
    print(f"killing server pid={context.server_process.pid} ...\n")
    context.server_process.kill()


def server_kill_hard(context):
    pid = context.server_process.pid
    path = context.server_path

    print(f"Server dangling exits, hard killing force {pid}={path}...\n")
    try:
        psutil.Process(pid).kill()
    except psutil.NoSuchProcess:
        return False
    return True


def is_server_listening(server_fqdn, server_port):

@ -88,14 +67,5 @@ def is_server_listening(server_fqdn, server_port):
    result = sock.connect_ex((server_fqdn, server_port))
    _is_server_listening = result == 0
    if _is_server_listening:
        print(f"server is listening on {server_fqdn}:{server_port}...\n")
        print(f"server is listening on {server_fqdn}:{server_port}...")
    return _is_server_listening


def pid_exists(pid):
    try:
        psutil.Process(pid)
    except psutil.NoSuchProcess:
        return False
    return True
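Note: the net effect of the rewritten teardown is that the server first gets a SIGINT via send_signal, is given 0.5 s to exit, is force-killed with SIGKILL on timeout, and the hook then polls the socket until the port actually frees up; any exception raised along the way is printed and swallowed so it cannot fail the scenario itself. This removes the need for psutil-based pid polling entirely, which is why psutil also disappears from the test requirements below.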
@ -37,7 +37,7 @@ Feature: llama.cpp server
    Examples: Prompts
      | prompt                                                                    | n_predict | re_content                    | n_prompt | n_predicted | truncated |
      | I believe the meaning of life is                                          | 8         | (read\|going)+                | 18       | 8           | not       |
      | Write a joke about AI from a very long prompt which will not be truncated | 256       | (princesses\|everyone\|kids)+ | 46       | 64          | not       |
      | Write a joke about AI from a very long prompt which will not be truncated | 256       | (princesses\|everyone\|kids\|Anna\|forest)+ | 46 | 64 | not |

  Scenario: Completion prompt truncated
    Given a prompt:

@ -48,7 +48,7 @@ Feature: llama.cpp server
    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
    """
    And a completion request with no api error
    Then 64 tokens are predicted matching fun|Annaks|popcorns|pictry
    Then 64 tokens are predicted matching fun|Annaks|popcorns|pictry|bowl
    And the completion is truncated
    And 109 prompt tokens are processed

@ -67,7 +67,7 @@ Feature: llama.cpp server
    Examples: Prompts
      | model        | system_prompt               | user_prompt                          | max_tokens | re_content             | n_prompt | n_predicted | enable_streaming | truncated |
      | llama-2      | Book                        | What is the best book                | 8          | (Here\|what)+          | 77       | 8           | disabled         | not       |
      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128        | (thanks\|happy\|bird)+ | -1       | 64          | enabled          |           |
      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128        | (thanks\|happy\|bird\|Annabyear)+ | -1 | 64 | enabled |  |

  Scenario Outline: OAI Compatibility w/ response format
@ -24,12 +24,16 @@ from prometheus_client import parser
def step_server_config(context, server_fqdn, server_port):
    context.server_fqdn = server_fqdn
    context.server_port = int(server_port)
    context.n_gpu_layer = None
    if 'PORT' in os.environ:
        context.server_port = int(os.environ['PORT'])
        print(f"$PORT set, overriding server port with to {context.server_port}")
    if 'FQDN' in os.environ:
        context.server_fqdn = os.environ['FQDN']
        print(f"$FQDN set, overriding server fqdn with to {context.server_fqdn}")
    if 'N_GPU_LAYERS' in os.environ:
        context.n_gpu_layer = int(os.environ['N_GPU_LAYERS'])
        print(f"$N_GPU_LAYERS set, overriding n_gpu_layer with to {context.n_gpu_layer}")

    context.base_url = f'http://{context.server_fqdn}:{context.server_port}'

@ -41,7 +45,6 @@ def step_server_config(context, server_fqdn, server_port):
    context.n_ctx = None
    context.n_ga = None
    context.n_ga_w = None
    context.n_gpu_layer = None
    context.n_predict = None
    context.n_prompts = 0
    context.n_server_predict = None

@ -67,7 +70,7 @@ def step_server_config(context, server_fqdn, server_port):
def step_download_hf_model(context, hf_file, hf_repo):
    context.model_file = hf_hub_download(repo_id=hf_repo, filename=hf_file)
    if context.debug:
        print(f"model file: {context.model_file}\n")
        print(f"model file: {context.model_file}")


@step('a model file {model_file}')

@ -138,9 +141,12 @@ def step_start_server(context):
    if 'GITHUB_ACTIONS' in os.environ:
        max_attempts *= 2

    addrs = socket.getaddrinfo(context.server_fqdn, context.server_port, type=socket.SOCK_STREAM)
    family, typ, proto, _, sockaddr = addrs[0]

    while True:
        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
        with closing(socket.socket(family, typ, proto)) as sock:
            result = sock.connect_ex((context.server_fqdn, context.server_port))
            result = sock.connect_ex(sockaddr)
            if result == 0:
                print("\x1b[33;46mserver started!\x1b[0m")
                return

@ -210,7 +216,7 @@ async def step_request_completion(context, api_error):
                                      user_api_key=context.user_api_key)
    context.tasks_result.append(completion)
    if context.debug:
        print(f"Completion response: {completion}\n")
        print(f"Completion response: {completion}")
    if expect_api_error:
        assert completion == 401, f"completion must be an 401 status code: {completion}"

@ -360,7 +366,7 @@ def step_prompt_passkey(context, passkey, i_pos):
    prompt += context.prompt_junk_suffix
    if context.debug:
        passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m"
        print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```\n")
        print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```")
    context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix)
    context.n_prompts = len(context.prompts)

@ -369,7 +375,7 @@ def step_prompt_passkey(context, passkey, i_pos):
@async_run_until_complete
async def step_oai_chat_completions(context, api_error):
    if context.debug:
        print(f"Submitting OAI compatible completions request...\n")
        print(f"Submitting OAI compatible completions request...")
    expect_api_error = api_error == 'raised'
    completion = await oai_chat_completions(context.prompts.pop(),
                                            context.system_prompt,

@ -521,12 +527,12 @@ async def step_all_embeddings_are_the_same(context):
            embedding1 = np.array(embeddings[i])
            embedding2 = np.array(embeddings[j])
            if context.debug:
                print(f"embedding1: {embedding1[-8:]}\n")
                print(f"embedding1: {embedding1[-8:]}")
                print(f"embedding2: {embedding2[-8:]}\n")
                print(f"embedding2: {embedding2[-8:]}")
            similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
            msg = f"Similarity between {i} and {j}: {similarity:.10f}"
            if context.debug:
                print(f"{msg}\n")
                print(f"{msg}")
            assert np.isclose(similarity, 1.0, rtol=1e-05, atol=1e-08, equal_nan=False), msg

@ -643,7 +649,7 @@ async def step_prometheus_metrics_exported(context):
    metrics_raw = await metrics_response.text()
    metric_exported = False
    if context.debug:
        print(f"/metrics answer:\n{metrics_raw}\n")
        print(f"/metrics answer:\n{metrics_raw}")
    context.metrics = {}
    for metric in parser.text_string_to_metric_families(metrics_raw):
        match metric.name:

@ -949,7 +955,7 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
        last_match = end
    highlighted += content[last_match:]
    if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
        print(f"Checking completion response: {highlighted}\n")
        print(f"Checking completion response: {highlighted}")
    assert last_match > 0, f'/{re_content}/ must match ```{highlighted}```'
    if expected_predicted_n and expected_predicted_n > 0:
        assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'

@ -959,7 +965,7 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
async def gather_tasks_results(context):
    n_tasks = len(context.concurrent_tasks)
    if context.debug:
        print(f"Waiting for all {n_tasks} tasks results...\n")
        print(f"Waiting for all {n_tasks} tasks results...")
    for task_no in range(n_tasks):
        context.tasks_result.append(await context.concurrent_tasks.pop())
    n_completions = len(context.tasks_result)

@ -976,7 +982,7 @@ async def wait_for_health_status(context,
                                 slots_processing=None,
                                 expected_slots=None):
    if context.debug:
        print(f"Starting checking for health for expected_health_status={expected_health_status}\n")
        print(f"Starting checking for health for expected_health_status={expected_health_status}")
    interval = 0.5
    counter = 0
    if 'GITHUB_ACTIONS' in os.environ:

@ -1065,8 +1071,6 @@ def start_server_background(context):
    if 'LLAMA_SERVER_BIN_PATH' in os.environ:
        context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
    server_listen_addr = context.server_fqdn
    if os.name == 'nt':
        server_listen_addr = '0.0.0.0'
    server_args = [
        '--host', server_listen_addr,
        '--port', context.server_port,

@ -1105,7 +1109,7 @@ def start_server_background(context):
        server_args.append('--verbose')
    if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
        server_args.extend(['--log-format', "text"])
    print(f"starting server with: {context.server_path} {server_args}\n")
    print(f"starting server with: {context.server_path} {server_args}")
    flags = 0
    if 'nt' == os.name:
        flags |= subprocess.DETACHED_PROCESS
@ -3,5 +3,4 @@ behave~=1.2.6
huggingface_hub~=0.20.3
numpy~=1.24.4
openai~=0.25.0
psutil~=5.9.8
prometheus-client~=0.20.0
@ -371,6 +371,7 @@ static json oaicompat_completion_params_parse(
    llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
    llama_params["ignore_eos"]    = json_value(body, "ignore_eos", false);
    llama_params["tfs_z"]         = json_value(body, "tfs_z", default_sparams.tfs_z);
    llama_params["n_keep"]        = json_value(body, "n_keep", 0);

    if (body.contains("grammar")) {
        llama_params["grammar"] = json_value(body, "grammar", json::object());
@ -6,8 +6,6 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

set GGML_SYCL_DEVICE=0
rem set GGML_SYCL_DEBUG=1
.\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
6
flake.lock
generated
@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1709703039,
        "lastModified": 1710451336,
        "narHash": "sha256-6hqgQ8OK6gsMu1VtcGKBxKQInRLHtzulDo9Z5jxHEFY=",
        "narHash": "sha256-pP86Pcfu3BrAvRO7R64x7hs+GaQrjFes+mEPowCfkxY=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "9df3e30ce24fd28c7b3e2de0d986769db5d6225d",
        "rev": "d691274a972b3165335d261cc4671335f5c67de9",
        "type": "github"
      },
      "original": {
10
ggml-alloc.c
@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];

        if (ggml_is_view(node)) {
        // TODO: better way to add external dependencies
        // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
        // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
        // itself is never used and should not be considered a dependency
        if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
            struct ggml_tensor * view_src = node->view_src;
            ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
        }

@ -565,8 +569,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
            ggml_gallocr_hash_get(galloc, src)->n_children += 1;

            // allocate explicit inputs and leafs
            // allocate explicit inputs
            if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
            }
        }
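Note: for context, here is a minimal sketch of the dependency-only node this check now excludes, the same pattern ggml-backend uses further below when wiring split inputs: a view tensor whose op stays GGML_OP_NONE and whose src[0] merely keeps the source alive:

    // hypothetical illustration, not part of the patch
    struct ggml_tensor * dep = ggml_view_tensor(ctx, input); // dep->op == GGML_OP_NONE
    dep->src[0] = input; // dependency only: `dep` itself is never computed

Without the node->op != GGML_OP_NONE guard, such a node would be counted as a real view of `input` and distort the allocator's n_views bookkeeping.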
@ -103,6 +103,11 @@ extern "C" {
        // check if the backend supports an operation
        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
        // these should be expensive operations with large batch sizes that may benefit from running on this backend
        // even if the weight has to be copied from the CPU temporarily
        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);

        // (optional) event synchronization
        ggml_backend_event_t (*GGML_CALL event_new)  (ggml_backend_t backend);
        void                 (*GGML_CALL event_free) (ggml_backend_event_t event);
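Note: as a sketch of what an implementation of this hook might look like (hypothetical backend and threshold; the actual policy is up to each backend), a GPU backend could claim large-batch matrix multiplications and decline everything else:

    // hypothetical illustration, not part of the patch
    GGML_CALL static bool my_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
        const int64_t min_batch_size = 32; // assumed cutoff where copying the weight pays off
        return op->op == GGML_OP_MUL_MAT && op->ne[1] >= min_batch_size;
        GGML_UNUSED(backend);
    }

The scheduler consults this hook only when a weight lives in the lowest-priority (CPU) backend's buffer, as the ggml-backend.c changes below show.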
276
ggml-backend.c
@ -278,7 +278,7 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
    return err;
}

bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    return backend->iface.graph_compute(backend, cgraph);
}

@ -286,6 +286,13 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
    return backend->iface.supports_op(backend, op);
}

bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    if (backend->iface.offload_op != NULL) {
        return backend->iface.offload_op(backend, op);
    }
    return false;
}

// backend copy

static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {

@ -761,6 +768,10 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg

    if (cpu_plan->cplan.work_size > 0) {
        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
        if (cpu_plan->cplan.work_data == NULL) {
            free(cpu_plan);
            return NULL;
        }
    }

    cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;

@ -834,6 +845,7 @@ static struct ggml_backend_i cpu_backend_i = {
    /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
    /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
    /* .supports_op             = */ ggml_backend_cpu_supports_op,
    /* .offload_op              = */ NULL,
    /* .event_new               = */ NULL,
    /* .event_free              = */ NULL,
    /* .event_record            = */ NULL,

@ -999,11 +1011,11 @@ static bool ggml_is_view_op(enum ggml_op op) {
#endif

#ifndef GGML_SCHED_MAX_SPLITS
#define GGML_SCHED_MAX_SPLITS 256
#define GGML_SCHED_MAX_SPLITS 2048
#endif

#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
#define GGML_SCHED_MAX_SPLIT_INPUTS 16
#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
#endif

#ifndef GGML_SCHED_MAX_COPIES
@ -1043,8 +1055,9 @@ struct ggml_backend_sched {
    struct ggml_cgraph * graph;

    // graph splits
    struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
    struct ggml_backend_sched_split * splits;
    int n_splits;
    int splits_capacity;

    // pipeline parallelism support
    int n_copies;
@ -1114,40 +1127,48 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
    // TODO: use supports_op to check if the backend supports the op

    // assign pre-allocated nodes to their backend
    // dst
    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor);
    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
    if (cur_backend != -1) {
    if (cur_backend_id != -1) {
        SET_CAUSE(tensor, "1.dst");
        return cur_backend;
        return cur_backend_id;
    }

    // view_src
    if (tensor->view_src != NULL) {
        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
        if (cur_backend != -1) {
        if (cur_backend_id != -1) {
            SET_CAUSE(tensor, "1.vsrc");
            return cur_backend;
            return cur_backend_id;
        }
    }

    // input
    // graph input
    if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
        cur_backend = sched->n_backends - 1; // last backend (assumed CPU)
        cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
        SET_CAUSE(tensor, "1.inp");
        return cur_backend;
        return cur_backend_id;
    }

    // assign nodes that use weights to the backend of the weights
    // operations with weights are preferably run on the same backend as the weights
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        const struct ggml_tensor * src = tensor->src[i];
        if (src == NULL) {
            continue;
        }
        if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src);
            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
            // operations with weights are always run on the same backend as the weights
            // check if a backend with higher prio wants to offload the op
            if (src_backend_id == sched->n_backends - 1) {
                for (int b = 0; b < src_backend_id; b++) {
                    if (ggml_backend_offload_op(sched->backends[b], tensor)) {
                        SET_CAUSE(tensor, "1.off");
                        return b;
                    }
                }
            }
            SET_CAUSE(tensor, "1.wgt%d", i);
            return src_backend;
            return src_backend_id;
        }
    }
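Note: for example (hypothetical two-backend setup), with backends ordered {CUDA0, CPU}, a matmul whose weight was left in a CPU buffer used to be pinned to CPU; now, if ggml_backend_offload_op(CUDA0, node) returns true, the node is assigned to CUDA0 with cause "1.off" and the weight is copied over for the duration of the split.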
@ -1227,28 +1248,31 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
    // pass 1: assign backends to ops with pre-allocated inputs
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        if (tensor_backend_id(leaf) != -1) {
        int * leaf_backend_id = &tensor_backend_id(leaf);
        if (*leaf_backend_id != -1) {
            // do not overwrite user assignments
            continue;
        }
        tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf);
        *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
    }

    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        if (tensor_backend_id(node) != -1) {
        int * node_backend_id = &tensor_backend_id(node);
        if (*node_backend_id != -1) {
            // do not overwrite user assignments
            continue;
        }
        tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node);
        *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
        // src
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }
            if (tensor_backend_id(src) == -1) {
                tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
            int * src_backend_id = &tensor_backend_id(src);
            if (*src_backend_id == -1) {
                *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
            }
        }
    }

@ -1270,21 +1294,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            if (ggml_is_view_op(node->op)) {
                continue;
            }
            int tensor_backend_id = tensor_backend_id(node);
            int * node_backend_id = &tensor_backend_id(node);
            if (tensor_backend_id != -1) {
            if (*node_backend_id != -1) {
                if (tensor_backend_id == sched->n_backends - 1) {
                if (*node_backend_id == sched->n_backends - 1) {
                    // skip cpu (lowest prio backend)
                    cur_backend_id = -1;
                } else {
                    cur_backend_id = tensor_backend_id;
                    cur_backend_id = *node_backend_id;
                }
            } else {
                tensor_backend_id(node) = cur_backend_id;
                *node_backend_id = cur_backend_id;
                SET_CAUSE(node, "2.2");
            }
        }
    }

    // pass 2.1 expand gpu up
    {
        int cur_backend_id = -1;

@ -1293,22 +1316,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            if (ggml_is_view_op(node->op)) {
                continue;
            }
            int tensor_backend_id = tensor_backend_id(node);
            int * node_backend_id = &tensor_backend_id(node);
            if (tensor_backend_id != -1) {
            if (*node_backend_id != -1) {
                if (tensor_backend_id == sched->n_backends - 1) {
                if (*node_backend_id == sched->n_backends - 1) {
                    // skip cpu (lowest prio backend)
                    cur_backend_id = -1;
                } else {
                    cur_backend_id = tensor_backend_id;
                    cur_backend_id = *node_backend_id;
                }
            } else {
                tensor_backend_id(node) = cur_backend_id;
                *node_backend_id = cur_backend_id;
                SET_CAUSE(node, "2.1");
            }
        }
    }

    // pass 2.4 expand rest down
    {
        int cur_backend_id = -1;
@ -1317,11 +1338,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            if (ggml_is_view_op(node->op)) {
                continue;
            }
            int tensor_backend_id = tensor_backend_id(node);
            int * node_backend_id = &tensor_backend_id(node);
            if (tensor_backend_id != -1) {
            if (*node_backend_id != -1) {
                cur_backend_id = tensor_backend_id;
                cur_backend_id = *node_backend_id;
            } else {
                tensor_backend_id(node) = cur_backend_id;
                *node_backend_id = cur_backend_id;
                SET_CAUSE(node, "2.4");
            }
        }
    }

@ -1334,11 +1355,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            if (ggml_is_view_op(node->op)) {
                continue;
            }
            int tensor_backend_id = tensor_backend_id(node);
            int * node_backend_id = &tensor_backend_id(node);
            if (tensor_backend_id != -1) {
            if (*node_backend_id != -1) {
                cur_backend_id = tensor_backend_id;
                cur_backend_id = *node_backend_id;
            } else {
                tensor_backend_id(node) = cur_backend_id;
                *node_backend_id = cur_backend_id;
                SET_CAUSE(node, "2.3");
            }
        }
    }

@ -1351,9 +1372,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
    // pass 3: assign backends to remaining src from dst and view_src
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        int cur_backend_id = tensor_backend_id(node);
        int * cur_backend_id = &tensor_backend_id(node);
        if (node->view_src != NULL && cur_backend_id == -1) {
        if (node->view_src != NULL && *cur_backend_id == -1) {
            cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src);
            *cur_backend_id = tensor_backend_id(node->view_src);
            SET_CAUSE(node, "3.vsrc");
        }
        for (int j = 0; j < GGML_MAX_SRC; j++) {

@ -1361,14 +1382,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            if (src == NULL) {
                continue;
            }
            int src_backend_id = tensor_backend_id(src);
            int * src_backend_id = &tensor_backend_id(src);
            if (src_backend_id == -1) {
            if (*src_backend_id == -1) {
                if (src->view_src != NULL) {
                    // views are always on the same backend as the source
                    tensor_backend_id(src) = tensor_backend_id(src->view_src);
                    *src_backend_id = tensor_backend_id(src->view_src);
                    SET_CAUSE(src, "3.vsrc");
                } else {
                    tensor_backend_id(src) = cur_backend_id;
                    *src_backend_id = *cur_backend_id;
                    SET_CAUSE(src, "3.cur");
                }
            }
        }

@ -1380,19 +1401,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

    // pass 4: split graph, find tensors that need to be copied
    {
        int cur_split = 0;
        int i_split = 0;
        struct ggml_backend_sched_split * split = &sched->splits[0];
        // find the backend of the first split, skipping view ops
        for (int i = 0; i < graph->n_nodes; i++) {
            struct ggml_tensor * node = graph->nodes[i];
            if (!ggml_is_view_op(node->op)) {
                sched->splits[0].backend_id = tensor_backend_id(node);
                split->backend_id = tensor_backend_id(node);
                break;
            }
        }
        sched->splits[0].i_start = 0;
        split->i_start = 0;
        sched->splits[0].n_inputs = 0;
        split->n_inputs = 0;
        memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
        memset(split->inputs, 0, sizeof(split->inputs)); //HACK
        int cur_backend_id = sched->splits[0].backend_id;
        int cur_backend_id = split->backend_id;
        for (int i = 0; i < graph->n_nodes; i++) {
            struct ggml_tensor * node = graph->nodes[i];
@ -1400,18 +1422,54 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                continue;
            }

            int tensor_backend_id = tensor_backend_id(node);
            const int node_backend_id = tensor_backend_id(node);

            GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now
            GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now

            if (tensor_backend_id != cur_backend_id) {
                sched->splits[cur_split].i_end = i;
                cur_split++;
                GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
                sched->splits[cur_split].backend_id = tensor_backend_id;
                sched->splits[cur_split].i_start = i;
                sched->splits[cur_split].n_inputs = 0;
                cur_backend_id = tensor_backend_id;
            // check if we should start a new split based on the sources of the current node
            bool need_new_split = false;
            if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
                for (int j = 0; j < GGML_MAX_SRC; j++) {
                    struct ggml_tensor * src = node->src[j];
                    if (src == NULL) {
                        continue;
                    }
                    // check if a weight is on a different backend
                    // by starting a new split, the memory of the previously offloaded weights can be reused
                    if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
                        int src_backend_id = tensor_backend_id(src);
                        if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
                            need_new_split = true;
                            break;
                        }
                    }
                    // check if the split has too many inputs
                    if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
                        const size_t id = hash_id(src);
                        int src_backend_id = sched->tensor_backend_id[id];
                        if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
                            //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                            need_new_split = true;
                            break;
                        }
                    }
                }
            }

            if (node_backend_id != cur_backend_id || need_new_split) {
                split->i_end = i;
                i_split++;
                if (i_split >= sched->splits_capacity) {
                    sched->splits_capacity *= 2;
                    sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
                    GGML_ASSERT(sched->splits != NULL);
                }
                GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
                split = &sched->splits[i_split];
                split->backend_id = node_backend_id;
                split->i_start = i;
                split->n_inputs = 0;
                cur_backend_id = node_backend_id;
            }

            // find inputs that are not on the same backend
@ -1421,10 +1479,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                    continue;
                }

                int src_backend_id = tensor_backend_id(src);
                const int src_backend_id = tensor_backend_id(src);
                assert(src_backend_id != -1); // all inputs should be assigned by now

                if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                    size_t id = hash_id(src);
                    if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
                        ggml_backend_t backend = sched->backends[src_backend_id];

@ -1441,7 +1499,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                            ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                        }
                        sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
                        tensor_backend_id(tensor_copy) = src_backend_id;
                        SET_CAUSE(tensor_copy, "4.cpy");
                    }
                    int n_graph_inputs = sched->n_graph_inputs++;

@ -1450,9 +1507,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                    }
                }

                if (src_backend_id != tensor_backend_id) {
                if (src_backend_id != node_backend_id) {
                    // create a copy of the input in the split's backend
                    size_t id = hash_id(src);
                    const size_t id = hash_id(src);
                    if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
                        ggml_backend_t backend = sched->backends[cur_backend_id];
                        for (int c = 0; c < sched->n_copies; c++) {
@ -1463,76 +1520,42 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                            ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                        }
                        sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
                        tensor_backend_id(tensor_copy) = cur_backend_id;
                        SET_CAUSE(tensor_copy, "4.cpy");
                    }
                    int n_inputs = sched->splits[cur_split].n_inputs++;
                    int n_inputs = split->n_inputs++;
                    GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                    sched->splits[cur_split].inputs[n_inputs] = src;
                    split->inputs[n_inputs] = src;
                }
                node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
            }
        }
    }
    sched->splits[cur_split].i_end = graph->n_nodes;
    split->i_end = graph->n_nodes;
    sched->n_splits = cur_split + 1;
    sched->n_splits = i_split + 1;
}
#ifdef DEBUG_PASS4
    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
#endif

#ifndef NDEBUG
    // sanity check: all sources should have the same backend as the node
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
        if (tensor_backend == NULL) {
            fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
        }
        if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
            fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
                node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
                node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
                    ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
        }
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }
            ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
            if (src_backend != tensor_backend /* && src_backend != NULL */) {
                fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
                    node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
                    j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
            }
            if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
                fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
                    src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
                    src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
                        ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
            }
        }
    }
    fflush(stderr);
#endif

    // create copies of the graph for each split
    // TODO: avoid this copy
    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
    for (int i = 0; i < sched->n_splits; i++) {
        struct ggml_backend_sched_split * split = &sched->splits[i];
        split->graph = ggml_graph_view(graph, split->i_start, split->i_end);

        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
        for (int j = 0; j < split->n_inputs; j++) {
            assert(graph_copy->size > (graph_copy->n_nodes + 1));

            struct ggml_tensor * input = split->inputs[j];
            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];
            const size_t input_id = hash_id(input);
            struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];

            // add a dependency to the input source so that it is not freed before the copy is done
            struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
            input_dep->src[0] = input;
            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
            sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
            graph_copy->nodes[graph_copy->n_nodes++] = input_dep;

            // add a dependency to the input copy so that it is allocated at the start of the split
@ -1541,6 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
        }

        for (int j = split->i_start; j < split->i_end; j++) {
            assert(graph_copy->size > graph_copy->n_nodes);
            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
            graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
        }
@ -1625,13 +1649,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||||
}
|
}
|
||||||
ggml_backend_tensor_copy(input, input_cpy);
|
ggml_backend_tensor_copy(input, input_cpy);
|
||||||
} else {
|
} else {
|
||||||
|
// wait for the split backend to finish using the input before overwriting it
|
||||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||||
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
|
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
|
||||||
} else {
|
} else {
|
||||||
ggml_backend_synchronize(split_backend);
|
ggml_backend_synchronize(split_backend);
|
||||||
ggml_backend_synchronize(input_backend);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
|
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
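The branch kept above is the cheap path: when the backend supports events, the wait is enqueued on the backend's queue instead of stalling the host in ggml_backend_synchronize. A hedged sketch of the record/wait pairing this relies on (the record side is outside this excerpt; signatures follow the ggml-backend API of this vintage and may differ):

    // one event per (backend, copy slot), created once at scheduler init
    ggml_backend_event_t ev = ggml_backend_event_new(split_backend);

    // after the split backend has consumed copy slot cur_copy:
    ggml_backend_event_record(ev);

    // before that slot is overwritten for the next micro-batch:
    ggml_backend_event_wait(split_backend, ev);  // queue-level wait, no host stall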
@@ -1701,17 +1724,21 @@ ggml_backend_sched_t ggml_backend_sched_new(
     struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);

     // initialize hash table
-    sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
+    sched->hash_set = ggml_hash_set_new(graph_size);
     sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
     sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
-    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
-    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);
+
+    const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
+    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);

     sched->n_backends = n_backends;

     sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;

-    GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
+    const int initial_splits_capacity = 16;
+    sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
+    sched->splits_capacity = initial_splits_capacity;

     for (int b = 0; b < n_backends; b++) {
         sched->backends[b] = backends[b];
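splits was previously a fixed-size array; the new splits_capacity field implies it now grows on demand instead of being capped at GGML_SCHED_MAX_SPLITS. The growth step itself is not in this excerpt; it presumably looks something like:

    // hypothetical growth step inside ggml_backend_sched_split_graph():
    if (sched->n_splits == sched->splits_capacity) {
        sched->splits_capacity *= 2;
        sched->splits = realloc(sched->splits,
                sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
        GGML_ASSERT(sched->splits != NULL);
    }
    struct ggml_backend_sched_split * split = &sched->splits[sched->n_splits++];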
@@ -1742,6 +1769,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     }
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
+    free(sched->splits);
     free(sched->hash_set.keys);
     free(sched->tensor_backend_id);
     free(sched->tensor_copies);
@@ -1762,6 +1790,8 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
 }

 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
+
     ggml_backend_sched_split_graph(sched, measure_graph);

     // TODO: extract this to a separate function

@@ -1776,7 +1806,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 }

 bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);

     ggml_backend_sched_split_graph(sched, graph);
ggml-backend.h

@@ -70,11 +70,11 @@ extern "C" {
     GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);

-    GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
     GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    GGML_API bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);

     // tensor copy between different backends
     GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
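ggml_backend_graph_compute_async now reports a ggml_status like its synchronous counterpart instead of a bare bool. A minimal usage sketch (assumes an initialized backend and a built graph gf):

    enum ggml_status st = ggml_backend_graph_compute_async(backend, gf);
    if (st != GGML_STATUS_SUCCESS) {
        fprintf(stderr, "graph compute failed with status %d\n", (int) st);
    }
    // the call may return before the work finishes; synchronize before
    // reading any results back on the host
    ggml_backend_synchronize(backend);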
ggml-cuda.cu (3674 changed lines): diff suppressed because it is too large.
ggml-cuda.h (21 changed lines)

@@ -17,29 +17,17 @@ extern "C" {

 #define GGML_CUDA_MAX_DEVICES 16

-// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
-GGML_API GGML_CALL void ggml_init_cublas(void);
-
-// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
-GGML_API GGML_CALL bool ggml_cublas_loaded(void);
-
-GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
-GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr);
-
-GGML_API GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-
-GGML_API GGML_CALL int ggml_cuda_get_device_count(void);
-GGML_API GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
-
 // backend API
 GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);

 GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);

+// device buffer
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);

 // split tensor buffer that splits matrices by rows across multiple devices
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);

 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

@@ -47,6 +35,9 @@ GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
 GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
 GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);

+GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
+GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
+
 #ifdef __cplusplus
 }
 #endif
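The two new entry points let callers pin an existing host allocation (for example an mmap-ed model file) so CUDA can DMA from it asynchronously; the llama.cpp hunks later in this commit use them exactly that way. A usage sketch, assuming buf is a CPU-backend buffer:

    void * base = ggml_backend_buffer_get_base(buf);
    size_t size = ggml_backend_buffer_get_size(buf);

    // returns false when registration is not possible (no CUDA device,
    // unsupported memory range, ...); copies still work, just slower
    if (!ggml_backend_cuda_register_host_buffer(base, size)) {
        // fall back silently
    }

    // ... use the buffer ...

    ggml_backend_cuda_unregister_host_buffer(base);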
ggml-kompute.cpp

@@ -1951,6 +1951,7 @@ static struct ggml_backend_i kompute_backend_i = {
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_kompute_graph_compute,
     /* .supports_op             = */ ggml_backend_kompute_supports_op,
+    /* .offload_op              = */ NULL,
     /* .event_new               = */ NULL,
     /* .event_free              = */ NULL,
     /* .event_record            = */ NULL,
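offload_op is the new backend-interface hook behind this commit's large-batch offloading: the scheduler asks the backend whether an op is worth running there even though its weights live in host memory. Kompute, and the Metal, SYCL and Vulkan backends below, opt out with NULL. A hedged sketch of what an implementation might look like (the real CUDA version sits in the suppressed ggml-cuda.cu diff; the threshold and the GET_ROWS exclusion are assumptions):

    static bool ggml_backend_example_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
        const int min_batch_size = 32;  // assumed: offloading only pays off for large batches
        // ne[1] is the batch dimension for matrix multiplication operands
        return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
        GGML_UNUSED(backend);
    }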
ggml-metal.m

@@ -2837,6 +2837,7 @@ static struct ggml_backend_i ggml_backend_metal_i = {
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_metal_graph_compute,
     /* .supports_op             = */ ggml_backend_metal_supports_op,
+    /* .offload_op              = */ NULL,
     /* .event_new               = */ NULL,
     /* .event_free              = */ NULL,
     /* .event_record            = */ NULL,
ggml-sycl.cpp

@@ -17390,6 +17390,7 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_sycl_graph_compute,
     /* .supports_op             = */ ggml_backend_sycl_supports_op,
+    /* .offload_op              = */ NULL,
     /* .event_new               = */ NULL,
     /* .event_free              = */ NULL,
     /* .event_record            = */ NULL,
ggml-sycl.h

@@ -13,7 +13,7 @@
 extern "C" {
 #endif

-#define GGML_SYCL_MAX_DEVICES 16
+#define GGML_SYCL_MAX_DEVICES 48
 #define GGML_SYCL_NAME "SYCL"

 GGML_API void ggml_init_sycl(void);
ggml-vulkan.cpp

@@ -5699,6 +5699,7 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_vk_graph_compute,
     /* .supports_op             = */ ggml_backend_vk_supports_op,
+    /* .offload_op              = */ NULL,
     /* .event_new               = */ NULL,
     /* .event_free              = */ NULL,
     /* .event_record            = */ NULL,
ggml.c (19 changed lines)

@@ -282,8 +282,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #else
 #include <cblas.h>
 #endif
-#elif defined(GGML_USE_CUBLAS)
-#include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
 #elif defined(GGML_USE_VULKAN)

@@ -2640,9 +2638,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
     }

-#if defined(GGML_USE_CUBLAS)
-        ggml_init_cublas();
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
         ggml_cl_init();
 #elif defined(GGML_USE_VULKAN)
         ggml_vk_init_cpu_assist();

@@ -11105,7 +11101,6 @@ static void ggml_compute_forward_out_prod_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows

-    // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
     // TODO: #if defined(GGML_USE_CLBLAST)

 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)

@@ -11305,7 +11300,6 @@ static void ggml_compute_forward_out_prod_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows

-    // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
     // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)

     if (params->type == GGML_TASK_TYPE_INIT) {

@@ -16051,14 +16045,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
         return;
     }

-#ifdef GGML_USE_CUBLAS
-    bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
-    if (skip_cpu) {
-        return;
-    }
-    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
-    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#elif defined(GGML_USE_VULKAN)
+#if defined(GGML_USE_VULKAN)
     const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
 #ifdef GGML_VULKAN_CHECK_RESULTS
     if (skip_cpu) {

@@ -16070,7 +16057,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     }
     GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
     GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_VULKAN

 #ifdef GGML_USE_SYCL
     bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
llama.cpp (40 changed lines)

@@ -540,6 +540,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
     {
         { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+        { LLM_TENSOR_OUTPUT, "output" },
         { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
         { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
         { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
@@ -2040,6 +2041,11 @@ struct llama_model {
         ggml_free(ctx);
     }
     for (ggml_backend_buffer_t buf : bufs) {
+#ifdef GGML_USE_CUBLAS
+        if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
+            ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
+        }
+#endif
         ggml_backend_buffer_free(buf);
     }
 }
@@ -4295,9 +4301,9 @@ static bool llm_load_tensors(
         {
             model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
             model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
-            if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) {
-                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
-            } else {
+            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+            if (!model.output) {
                 model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                 ml.n_created--; // artificial tensor
                 ml.size_data += ggml_nbytes(model.output);
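The gguf_find_tensor probe is gone in favor of the loader's optional-tensor path: the trailing false (assumed to be a required flag) makes create_tensor return a null tensor instead of failing when the weight is absent, and the caller falls back to the tied token embedding, as the next hunk repeats for another architecture. A sketch of the signature this implies (hypothetical, inferred from the call sites):

    // llama_model_loader::create_tensor, as the new call sites suggest:
    // when required == false and the tensor is missing from the GGUF,
    // return nullptr instead of throwing
    struct ggml_tensor * create_tensor(ggml_context * ctx, const std::string & name,
                                       const std::vector<int64_t> & ne, bool required = true);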
@@ -4502,11 +4508,13 @@ static bool llm_load_tensors(
             model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
             model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);

-            // same as tok_embd, duplicated to allow offloading
-            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+            if (!model.output) {
+                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                 ml.n_created--; // artificial tensor
                 ml.size_data += ggml_nbytes(model.output);
+            }
         }

         for (int i = 0; i < n_layer; ++i) {
             ggml_context * ctx_layer = ctx_for_layer(i);
@@ -5033,6 +5041,13 @@ static bool llm_load_tensors(
             size_t first, last;
             ml.get_mapping_range(&first, &last, ctx);
             buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
+#ifdef GGML_USE_CUBLAS
+            if (n_layer >= n_gpu_layers) {
+                ggml_backend_cuda_register_host_buffer(
+                    ggml_backend_buffer_get_base(buf),
+                    ggml_backend_buffer_get_size(buf));
+            }
+#endif
         }
 #ifdef GGML_USE_METAL
         else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
@@ -8231,7 +8246,6 @@ struct llm_build_context {
             cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
                     Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-            cb(cur, "kqv_out", il);
         }

         struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -8601,7 +8615,9 @@ static struct ggml_cgraph * llama_build_graph(
         }

         // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
-        // to fix this, we assign the norm layer manually to the backend of its layer
+        // FIXME: fix in ggml_backend_sched
+        const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
+        if (batch.n_tokens < 32 || full_offload) {
             if (il != -1 && strcmp(name, "norm") == 0) {
                 for (auto * backend : lctx.backends) {
                     if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {

@@ -8610,6 +8626,7 @@ static struct ggml_cgraph * llama_build_graph(
                 }
             }
         }
+        }
     };

     struct ggml_cgraph * result = NULL;
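The manual norm pinning is now gated: it only applies for small batches or when every layer is already on the GPU. A reading of the new condition with the reasoning spelled out in comments (the link to a 32-token offload threshold is an assumption, not stated in this diff):

    const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
    if (batch.n_tokens < 32 || full_offload) {
        // small batch, or fully offloaded model: pin the norm to its layer's
        // backend so a cheap op does not force a cross-backend round trip
        // ... (assignment loop as in the hunk above)
    }
    // otherwise (large batch, partial offload) the scheduler stays free to
    // move the norm together with its neighbours, presumably in concert with
    // the new ggml_backend_offload_op() hook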
@@ -13107,9 +13124,8 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(ctx->backend_metal);
         }
 #elif defined(GGML_USE_CUBLAS)
-    if (model->n_gpu_layers > 0) {
-        // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
             ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);

@@ -13129,7 +13145,6 @@ struct llama_context * llama_new_context_with_model(
                 ctx->backends.push_back(backend);
             }
         }
-    }
 #elif defined(GGML_USE_VULKAN)
     if (model->n_gpu_layers > 0) {
         for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
@@ -13285,14 +13300,17 @@ struct llama_context * llama_new_context_with_model(
             ggml_backend_t backend = ctx->backends[i];
             ggml_backend_buffer_type_t buft = backend_buft[i];
             size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
+            if (size > 1) {
                 LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
                         ggml_backend_buft_name(buft),
                         size / 1024.0 / 1024.0);
+            }
         }

         // note: the number of splits during measure is higher than during inference due to the kv shift
         int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
-        LLAMA_LOG_INFO("%s: graph splits: %d\n", __func__, n_splits);
+        LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes);
+        LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
     }
 }