Merge branch 'master' into fix_memcpy_crash

Commit 7e54166562

24 changed files with 469 additions and 152 deletions
.github/workflows/build.yml (22 changes)

@@ -52,7 +52,7 @@ jobs:
         id: cmake_test
         run: |
           cd build
-          ctest -L main --verbose --timeout 900
+          ctest -L 'main|curl' --verbose --timeout 900

       - name: Determine tag name
         id: tag
@@ -101,7 +101,9 @@ jobs:
           sysctl -a
           mkdir build
           cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
+          # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

       - name: Test
@@ -209,21 +211,21 @@ jobs:
         id: depends
         run: |
           sudo apt-get update
-          sudo apt-get install build-essential
+          sudo apt-get install build-essential libcurl4-openssl-dev

       - name: Build
         id: cmake_build
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
           cmake --build . --config Release -j $(nproc)

       - name: Test
         id: cmake_test
         run: |
           cd build
-          ctest -L main --verbose --timeout 900
+          ctest -L 'main|curl' --verbose --timeout 900

       - name: Test llama2c conversion
         id: llama2c_test
@@ -938,6 +940,12 @@ jobs:
       - name: Download artifacts
         id: download-artifact
         uses: actions/download-artifact@v4
+        with:
+          path: ./artifact
+
+      - name: Move artifacts
+        id: move_artifacts
+        run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release

       - name: Create release
         id: create_release
@@ -956,7 +964,7 @@ jobs:
             const path = require('path');
             const fs = require('fs');
             const release_id = '${{ steps.create_release.outputs.id }}';
-            for (let file of await fs.readdirSync('./artifact')) {
+            for (let file of await fs.readdirSync('./artifact/release')) {
               if (path.extname(file) === '.zip') {
                 console.log('uploadReleaseAsset', file);
                 await github.repos.uploadReleaseAsset({
@@ -964,7 +972,7 @@ jobs:
                   repo: context.repo.repo,
                   release_id: release_id,
                   name: file,
-                  data: await fs.readFileSync(`./artifact/${file}`)
+                  data: await fs.readFileSync(`./artifact/release/${file}`)
                 });
               }
             }
.gitignore (1 change)

@@ -48,6 +48,7 @@ models-mnt
 /convert-llama2c-to-ggml
 /embd-input-test
 /embedding
+/eval-callback
 /gguf
 /gguf-llama-simple
 /gguf-split
Makefile (8 changes)

@@ -1,7 +1,7 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
+	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
 	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o

 # Binaries only useful for tests
@@ -646,7 +646,7 @@ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])'
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
 ifndef CUDA_DOCKER_ARCH
 ifndef CUDA_POWER_ARCH
-$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
 endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
@@ -800,6 +800,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
README-sycl.md

@@ -8,9 +8,9 @@
 - [Linux](#linux)
 - [Windows](#windows)
 - [Environment Variable](#environment-variable)
-- [Known Issue](#known-issue)
-- [Q&A](#q&a)
-- [Todo](#todo)
+- [Known Issue](#known-issues)
+- [Q&A](#qa)
+- [TODO](#todo)

 ## Background

@@ -54,10 +54,10 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,

 ## OS

-|OS|Status|Verified|
-|-|-|-|
-|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39|
-|Windows|Support|Windows 11|
+| OS      | Status  | Verified                           |
+|---------|---------|------------------------------------|
+| Linux   | Support | Ubuntu 22.04, Fedora Silverblue 39 |
+| Windows | Support | Windows 11                         |


 ## Hardware

@@ -66,13 +66,13 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,

 **Verified devices**

-|Intel GPU| Status | Verified Model|
-|-|-|-|
-|Intel Data Center Max Series| Support| Max 1550, 1100|
-|Intel Data Center Flex Series| Support| Flex 170|
-|Intel Arc Series| Support| Arc 770, 730M|
-|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
-|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
+| Intel GPU                     | Status  | Verified Model                        |
+|-------------------------------|---------|---------------------------------------|
+| Intel Data Center Max Series  | Support | Max 1550, 1100                        |
+| Intel Data Center Flex Series | Support | Flex 170                              |
+| Intel Arc Series              | Support | Arc 770, 730M                         |
+| Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake       |
+| Intel iGPU                    | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |

 *Notes:*

@@ -88,10 +88,10 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,

 **Verified devices**

-|Nvidia GPU| Status | Verified Model|
-|-|-|-|
-|Ampere Series| Support| A100, A4000|
-|Ampere Series *(Mobile)*| Support| RTX 40 Series|
+| Nvidia GPU               | Status  | Verified Model |
+|--------------------------|---------|----------------|
+| Ampere Series            | Support | A100, A4000    |
+| Ampere Series *(Mobile)* | Support | RTX 40 Series  |

 ## Docker
 The docker build option is currently limited to *intel GPU* targets.
@@ -161,7 +161,7 @@ Platform #0: Intel(R) OpenCL HD Graphics

 - **Nvidia GPU**

-  In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cublas)-* are installed.
+  In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.

 2. **Install Intel® oneAPI Base toolkit**

@@ -295,10 +295,10 @@ found 6 SYCL devices:
 | 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
 ```

-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero driver/runtime, recommended |
-|compute capability 3.0|OpenCL driver/runtime, slower than level-zero in most cases|
+| Attribute              | Note                                                        |
+|------------------------|-------------------------------------------------------------|
+| compute capability 1.3 | Level-zero driver/runtime, recommended                      |
+| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |

 4. Launch inference

@@ -307,10 +307,10 @@ There are two device selection modes:
 - Single device: Use one device target specified by the user.
 - Multiple devices: Automatically select the devices with the same largest Max compute-units.

-|Device selection|Parameter|
-|-|-|
-|Single device|--split-mode none --main-gpu DEVICE_ID |
-|Multiple devices|--split-mode layer (default)|
+| Device selection | Parameter                              |
+|------------------|----------------------------------------|
+| Single device    | --split-mode none --main-gpu DEVICE_ID |
+| Multiple devices | --split-mode layer (default)           |

 Examples:

@@ -468,10 +468,10 @@ found 6 SYCL devices:

 ```

-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero running time, recommended |
-|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
+| Attribute              | Note                                                      |
+|------------------------|-----------------------------------------------------------|
+| compute capability 1.3 | Level-zero running time, recommended                      |
+| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |


 4. Launch inference
@@ -481,10 +481,10 @@ There are two device selection modes:
 - Single device: Use one device assigned by user.
 - Multiple devices: Automatically choose the devices with the same biggest Max compute units.

-|Device selection|Parameter|
-|-|-|
-|Single device|--split-mode none --main-gpu DEVICE_ID |
-|Multiple devices|--split-mode layer (default)|
+| Device selection | Parameter                              |
+|------------------|----------------------------------------|
+| Single device    | --split-mode none --main-gpu DEVICE_ID |
+| Multiple devices | --split-mode layer (default)           |

 Examples:

@@ -522,20 +522,20 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 #### Build

-|Name|Value|Function|
-|-|-|-|
-|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path.|
-|LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA|Set the SYCL target device type.|
-|LLAMA_SYCL_F16|OFF *(default)* \|ON *(optional)*|Enable FP16 build with SYCL code path.|
-|CMAKE_C_COMPILER|icx|Set *icx* compiler for SYCL code path.|
-|CMAKE_CXX_COMPILER|icpx *(Linux)*, icx *(Windows)*|Set `icpx/icx` compiler for SYCL code path.|
+| Name               | Value                             | Function                                    |
+|--------------------|-----------------------------------|---------------------------------------------|
+| LLAMA_SYCL         | ON (mandatory)                    | Enable build with SYCL code path.           |
+| LLAMA_SYCL_TARGET  | INTEL *(default)* \| NVIDIA       | Set the SYCL target device type.            |
+| LLAMA_SYCL_F16     | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path.      |
+| CMAKE_C_COMPILER   | icx                               | Set *icx* compiler for SYCL code path.      |
+| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |

 #### Runtime

-|Name|Value|Function|
-|-|-|-|
-|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
-|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer|
+| Name              | Value            | Function                                                                                                                  |
+|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
+| GGML_SYCL_DEBUG   | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG                                                                             |
+| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |

 ## Known Issues

@@ -567,6 +567,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 ### **GitHub contribution**:
 Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.

-## Todo
+## TODO

 - Support row layer split for multiple card runs.
README.md (38 changes)

@@ -485,14 +485,14 @@ Building the program with BLAS support may lead to some performance improvements

 The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:

 | Option                         | Legal values           | Default | Description |
-|--------------------------------|------------------------|---------|-------------|
+|--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | LLAMA_CUDA_FORCE_DMMV          | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_X              | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
 | LLAMA_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
 | LLAMA_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
 | LLAMA_CUDA_KQUANTS_ITER        | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
 | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |

 - #### hipBLAS

@@ -534,11 +534,11 @@ Building the program with BLAS support may lead to some performance improvements
   If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
   The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):

   | Option                  | Legal values           | Default | Description |
-  |-------------------------|------------------------|---------|-------------|
+  |-------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
   | LLAMA_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
   | LLAMA_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
   | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |

 - #### CLBlast

@@ -746,11 +746,11 @@ From the unzipped folder, open a terminal/cmd window here and place a pre-conver
 As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.

 | Model | Original size | Quantized size (Q4_0) |
-|------:|--------------:|-----------------------:|
+|------:|--------------:|----------------------:|
 |    7B |         13 GB |                3.9 GB |
 |   13B |         24 GB |                7.8 GB |
 |   30B |         60 GB |               19.5 GB |
 |   65B |        120 GB |               38.5 GB |

 ### Quantization

@@ -758,7 +758,7 @@ Several quantization methods are supported. They differ in the resulting model d

 *(outdated)*

 | Model | Measure      |    F16 |   Q4_0 |   Q4_1 |   Q5_0 |   Q5_1 |   Q8_0 |
 |------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
 |    7B | perplexity   | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
 |    7B | file size    |  13.0G |   3.5G |   3.9G |   4.3G |   4.7G |   6.7G |
SECURITY.md

@@ -49,11 +49,11 @@ If you intend to run multiple models in parallel with shared memory, it is your

 1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and malicious users from sending graphs to execute under another tenant's identity.

-1. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.
+2. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.

-1. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
+3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.

-1. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
+4. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.

 ## Reporting a vulnerability
common/common.cpp

@@ -1745,6 +1745,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.yarn_orig_ctx = params.yarn_orig_ctx;
     cparams.pooling_type = params.pooling_type;
     cparams.defrag_thold = params.defrag_thold;
+    cparams.cb_eval = params.cb_eval;
+    cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv = !params.no_kv_offload;

     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
@@ -2192,7 +2194,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
     }

-    {
+    if (params.warmup) {
         LOG("warming up the model with an empty run\n");

         std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
common/common.h

@@ -80,6 +80,9 @@ struct gpt_params {
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = -1.0f; // KV cache defragmentation threshold

+    ggml_backend_sched_eval_callback cb_eval = nullptr;
+    void * cb_eval_user_data = nullptr;
+
     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

     llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
@@ -156,6 +159,7 @@ struct gpt_params {
     bool infill = false; // use infill mode
     bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
+    bool warmup = true; // warmup run

     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V
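The plumbing above is small but easy to misread in diff form: the callback and its user-data pointer are stored on `gpt_params` and forwarded verbatim into the context params, while `warmup` gates the throwaway decode. Below is a minimal, self-contained C++ sketch of that flow; every struct and field name here is a simplified stand-in for illustration, not the real llama.cpp declarations.

```cpp
#include <cstdio>

// NOTE: all names below are stand-ins, not the real llama.cpp types.
using eval_callback_t = bool (*)(void * tensor, bool ask, void * user_data);

struct gpt_params_sketch {                      // stand-in for gpt_params
    eval_callback_t cb_eval           = nullptr;
    void *          cb_eval_user_data = nullptr;
    bool            warmup            = true;   // mirrors the new 'bool warmup' field
};

struct context_params_sketch {                  // stand-in for llama_context_params
    eval_callback_t cb_eval           = nullptr;
    void *          cb_eval_user_data = nullptr;
};

// mirrors llama_context_params_from_gpt_params: both fields forwarded verbatim
static context_params_sketch from_gpt_params(const gpt_params_sketch & p) {
    context_params_sketch c;
    c.cb_eval           = p.cb_eval;
    c.cb_eval_user_data = p.cb_eval_user_data;
    return c;
}

int main() {
    gpt_params_sketch params;
    params.warmup = false; // callback-driven tools skip the warmup decode
    context_params_sketch cparams = from_gpt_params(params);
    std::printf("callback installed: %s, warmup: %s\n",
                cparams.cb_eval ? "yes" : "no", params.warmup ? "on" : "off");
    return 0;
}
```

The imatrix and eval-callback diffs below follow exactly this shape, and both set `warmup = false`, presumably so the callback is not invoked on the dummy warmup batch.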
docs/HOWTO-add-model.md

@@ -100,6 +100,8 @@ Have a look to existing implementation like `build_llama`, `build_dbrx` or `buil

 When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support of missing backend operations can be added in another PR.

+Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).
+
 ## GGUF specification

 https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
examples/CMakeLists.txt

@@ -19,6 +19,7 @@ else()
     add_subdirectory(benchmark)
     add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(embedding)
+    add_subdirectory(eval-callback)
     add_subdirectory(finetune)
     add_subdirectory(gritlm)
     add_subdirectory(gguf-split)
examples/eval-callback/CMakeLists.txt (new file, 9 lines)

@@ -0,0 +1,9 @@
+set(TARGET eval-callback)
+add_executable(${TARGET} eval-callback.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+set(TEST_TARGET test-eval-callback)
+add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
examples/eval-callback/README.md (new file, 95 lines)

@@ -0,0 +1,95 @@
+# llama.cpp/examples/eval-callback
+
+A simple example which demonstrates how to use callback during the inference.
+It simply prints to the console all operations and tensor data.
+
+Usage:
+
+```shell
+eval-callback \
+  --hf-repo ggml-org/models \
+  --hf-file phi-2/ggml-model-q4_0.gguf \
+  --model phi-2-q4_0.gguf \
+  --prompt hello \
+  --seed 42 \
+  -ngl 33
+```
+
+Will print:
+
+```shell
+llm_load_tensors: offloaded 33/33 layers to GPU
+...
+llama_new_context_with_model: n_ctx = 512
+...
+llama_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB
+llama_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB
+llama_new_context_with_model: graph nodes = 1225
+llama_new_context_with_model: graph splits = 2
+ggml_debug: inp_embd = (f32) GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1}
+[
+ [
+  [ -0.0181, 0.0272, 0.0272, ...],
+ ],
+]
+ggml_debug: norm-0 = (f32) NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
+[
+ [
+  [ -0.6989, 1.0636, 1.0636, ...],
+ ],
+]
+ggml_debug: norm_w-0 = (f32) MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
+[
+ [
+  [ -0.1800, 0.2817, 0.2632, ...],
+ ],
+]
+ggml_debug: attn_norm-0 = (f32) ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
+[
+ [
+  [ -0.1863, 0.2970, 0.2604, ...],
+ ],
+]
+ggml_debug: wqkv-0 = (f32) MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1}
+[
+ [
+  [ -1.1238, 1.2876, -1.8086, ...],
+ ],
+]
+ggml_debug: bqkv-0 = (f32) ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1}
+[
+ [
+  [ -1.1135, 1.4604, -1.9226, ...],
+ ],
+]
+ggml_debug: bqkv-0 (view) = (f32) VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1}
+[
+ [
+  [ -1.1135, 1.4604, -1.9226, ...],
+ ],
+]
+ggml_debug: Qcur-0 = (f32) CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
+[
+ [
+  [ -1.1135, 1.4604, -1.9226, ...],
+ ],
+]
+ggml_debug: Qcur-0 (reshaped) = (f32) RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1}
+[
+ [
+  [ -1.1135, 1.4604, -1.9226, ...],
+  [ -0.3608, 0.5076, -1.8866, ...],
+  [ 1.7643, 0.0273, -2.1065, ...],
+  ...
+ ],
+]
+ggml_debug: Qcur-0 = (f32) ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1}
+[
+ [
+  [ -1.1135, 1.4604, -1.9226, ...],
+  [ -0.3608, 0.5076, -1.8866, ...],
+  [ 1.7643, 0.0273, -2.1065, ...],
+  ...
+ ],
+]
+```
examples/eval-callback/eval-callback.cpp (new file, 185 lines; the tensor-value reads below are fixed to index the byte buffer, `*(float *) &data[i]`, instead of the original `*(float *) data + i`, which added `i` to the dereferenced value)

@@ -0,0 +1,185 @@
+#include "common.h"
+#include "llama.h"
+#include "ggml.h"
+
+#include <cstdio>
+#include <random>
+#include <string>
+#include <tuple>
+#include <vector>
+
+/**
+ * This is the arbitrary data which will be passed to each callback.
+ * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
+ */
+struct callback_data {
+    std::vector<uint8_t> data;
+};
+
+static std::string ggml_ne_string(const ggml_tensor * t) {
+    std::string str;
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        str += std::to_string(t->ne[i]);
+        if (i + 1 < GGML_MAX_DIMS) {
+            str += ", ";
+        }
+    }
+    return str;
+}
+
+static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
+    float sum = 0;
+    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
+        printf(" [\n");
+        for (int64_t i2 = 0; i2 < ne[2] && i2 < n; i2++) {
+            printf("  [\n");
+            for (int64_t i1 = 0; i1 < ne[1] && i1 < n; i1++) {
+                printf("   [");
+                for (int64_t i0 = 0; i0 < ne[0] && i0 < n; i0++) {
+                    // nb[] strides are in bytes, so i is a byte offset into data
+                    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
+                    float v;
+                    if (type == GGML_TYPE_F16) {
+                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
+                    } else if (type == GGML_TYPE_F32) {
+                        v = *(float *) &data[i];
+                    } else if (type == GGML_TYPE_I32) {
+                        v = (float) *(int32_t *) &data[i];
+                    } else if (type == GGML_TYPE_I16) {
+                        v = (float) *(int16_t *) &data[i];
+                    } else if (type == GGML_TYPE_I8) {
+                        v = (float) *(int8_t *) &data[i];
+                    } else {
+                        GGML_ASSERT(false);
+                    }
+                    printf("%8.4f", v);
+                    sum += v;
+                    if (i0 < ne[0] - 1 && i0 < n - 1) printf(", ");
+                }
+                if (ne[0] > n) printf(", ...");
+                printf("],\n");
+            }
+            if (ne[1] > n) printf("   ...\n");
+            printf("  ],\n");
+        }
+        if (ne[2] > n) printf("  ...\n");
+        printf(" ]\n");
+        printf(" sum = %f\n", sum);
+    }
+}
+
+/**
+ * GGML operations callback during the graph execution.
+ *
+ * @param t current tensor
+ * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
+ *            if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
+ *            see ggml_backend_sched_eval_callback
+ * @param user_data user data to pass at each call back
+ * @return true to receive data or continue the graph, false otherwise
+ */
+static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
+    auto * cb_data = (callback_data *) user_data;
+
+    const struct ggml_tensor * src0 = t->src[0];
+    const struct ggml_tensor * src1 = t->src[1];
+
+    if (ask) {
+        return true; // Always retrieve data
+    }
+
+    char src1_str[128] = {0};
+    if (src1) {
+        sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
+    }
+
+    printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
+           t->name, ggml_type_name(t->type), ggml_op_desc(t),
+           src0->name, ggml_ne_string(src0).c_str(),
+           src1 ? src1_str : "",
+           ggml_ne_string(t).c_str());
+
+    // copy the data from the GPU memory if needed
+    const bool is_host = ggml_backend_buffer_is_host(t->buffer);
+
+    if (!is_host) {
+        auto n_bytes = ggml_nbytes(t);
+        cb_data->data.resize(n_bytes);
+        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
+    }
+
+    if (!ggml_is_quantized(t->type)) {
+        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
+        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
+    }
+
+    return true;
+}
+
+static bool run(llama_context * ctx, const gpt_params & params) {
+    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+        fprintf(stderr, "%s : failed to eval\n", __func__);
+        return false;
+    }
+
+    return true;
+}
+
+int main(int argc, char ** argv) {
+    callback_data cb_data;
+
+    gpt_params params;
+    if (!gpt_params_parse(argc, argv, params)) {
+        return 1;
+    }
+
+    print_build_info();
+
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // pass the callback to the backend scheduler
+    // it will be executed for each node during the graph computation
+    params.cb_eval = ggml_debug;
+    params.cb_eval_user_data = &cb_data;
+    params.warmup = false;
+
+    // init
+    llama_model * model;
+    llama_context * ctx;
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == nullptr || ctx == nullptr) {
+        fprintf(stderr, "%s : failed to init\n", __func__);
+        return 1;
+    }
+
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s\n", get_system_info(params).c_str());
+    }
+
+    bool OK = run(ctx, params);
+    if (!OK) {
+        return 1;
+    }
+
+    llama_print_timings(ctx);
+
+    llama_free(ctx);
+    llama_free_model(model);
+
+    llama_backend_free();
+
+    return 0;
+}
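The `ask` parameter in `ggml_debug` above follows a two-phase contract, documented on `ggml_backend_sched_eval_callback`: the scheduler first asks whether the callback wants a tensor, and only on a second call with `ask == false` is the data actually available. Below is a minimal, self-contained emulation of that contract; the callback type and the integer tensor id are stand-ins for the real scheduler and `ggml_tensor *`.

```cpp
#include <cstdio>

// Stand-in for ggml_backend_sched_eval_callback: the real scheduler passes a
// ggml_tensor *, but an int id is enough to show the control flow.
using eval_callback = bool (*)(int tensor_id, bool ask, void * user_data);

static bool demo_cb(int tensor_id, bool ask, void * user_data) {
    int * collected = static_cast<int *>(user_data);
    if (ask) {
        // phase 1: the scheduler asks whether we want this tensor's data
        return true;
    }
    // phase 2: ask == false, the data is now available to read
    ++*collected;
    std::printf("collected tensor %d\n", tensor_id);
    return true; // returning true lets the graph keep running
}

int main() {
    int collected = 0;
    eval_callback cb = demo_cb;
    for (int id = 0; id < 3; ++id) {
        if (cb(id, /*ask=*/true, &collected)) {  // scheduler: interest check
            cb(id, /*ask=*/false, &collected);   // scheduler: data delivery
        }
    }
    return collected == 3 ? 0 : 1;
}
```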
examples/gbnf-validator/gbnf-validator.cpp

@@ -17,7 +17,7 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st
     size_t pos = 0;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         auto prev_stacks = grammar->stacks;
-        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+        llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
         if (grammar->stacks.empty()) {
             error_pos = pos;
             error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
examples/imatrix/imatrix.cpp

@@ -107,9 +107,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *

         // the top-k selected expert ids are stored in the ids tensor
         // for simplicity, always copy ids to host, because it is small
-        // take into account that ids is not contiguous!
         GGML_ASSERT(ids->ne[1] == src1->ne[1]);
-        GGML_ASSERT(n_as*ggml_nrows(ids)*sizeof(int) == GGML_PAD(ggml_nbytes(ids), n_as*sizeof(int)));
         m_ids.resize(ggml_nbytes(ids)/sizeof(int));
         ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));

@@ -597,24 +595,18 @@ int main(int argc, char ** argv) {
     llama_backend_init();
     llama_numa_init(params.numa);

-    llama_model_params mparams = llama_model_params_from_gpt_params(params);
-
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
-    if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
-        return 1;
-    }
-
-    llama_context_params cparams = llama_context_params_from_gpt_params(params);
-
     // pass the callback to the backend scheduler
     // it will be executed for each node during the graph computation
-    cparams.cb_eval = ik_collect_imatrix;
-    cparams.cb_eval_user_data = NULL;
+    params.cb_eval = ik_collect_imatrix;
+    params.cb_eval_user_data = NULL;
+    params.warmup = false;

-    llama_context * ctx = llama_new_context_with_model(model, cparams);
-    if (ctx == NULL) {
-        fprintf(stderr, "%s: error: unable to create context\n", __func__);
+    // init
+    llama_model * model;
+    llama_context * ctx;
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == nullptr || ctx == nullptr) {
+        fprintf(stderr, "%s : failed to init\n", __func__);
         return 1;
     }

examples/llava/MobileVLM-README.md

@@ -22,7 +22,7 @@ After building, run: `./llava-cli` to see the usage. For example:

 ## Model conversion

-- Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
+1. Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:

 ```sh
 git clone https://huggingface.co/mtgv/MobileVLM-1.7B
examples/llava/README.md

@@ -24,7 +24,7 @@ After building, run: `./llava-cli` to see the usage. For example:

 ## LLaVA 1.5

-- Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
+1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:

 ```sh
 git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
examples/main/README.md

@@ -310,7 +310,7 @@ These options help improve the performance and memory usage of the LLaMA models.

 ### Quantization

-For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).
+For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize).

 ## Additional Options
examples/perplexity/README.md

@@ -3,19 +3,18 @@
 TODO

 ## Llama 2 70B Scorechart
-Quantization | Model size (GiB) | Perplexity | Delta to fp16
--- | -- | -- | --
-Q4_0 | 36.20 | 3.5550 | 3.61%
-Q4_1 | 40.20 | 3.5125 | 2.37%
-Q5_0 | 44.20 | 3.4744 | 1.26%
-Q2_K | 27.27 | 3.7339 | 8.82%
-Q3_K_S | 27.86 | 3.7019 | 7.89%
-Q3_K_M | 30.83 | 3.5932 | 4.72%
-Q3_K_L | 33.67 | 3.5617 | 3.80%
-Q4_K_S | 36.39 | 3.4852 | 1.57%
-Q4_K_M | 38.54 | 3.4725 | 1.20%
-Q5_K_S | 44.20 | 3.4483 | 0.50%
-Q5_K_M | 45.41 | 3.4451 | 0.40%
-Q6_K | 52.70 | 3.4367 | 0.16%
-fp16 | 128.5 | 3.4313 | -
+| Quantization | Model size (GiB) | Perplexity | Delta to fp16 |
+|--------------|------------------|------------|---------------|
+| Q4_0         | 36.20            | 3.5550     | 3.61%         |
+| Q4_1         | 40.20            | 3.5125     | 2.37%         |
+| Q5_0         | 44.20            | 3.4744     | 1.26%         |
+| Q2_K         | 27.27            | 3.7339     | 8.82%         |
+| Q3_K_S       | 27.86            | 3.7019     | 7.89%         |
+| Q3_K_M       | 30.83            | 3.5932     | 4.72%         |
+| Q3_K_L       | 33.67            | 3.5617     | 3.80%         |
+| Q4_K_S       | 36.39            | 3.4852     | 1.57%         |
+| Q4_K_M       | 38.54            | 3.4725     | 1.20%         |
+| Q5_K_S       | 44.20            | 3.4483     | 0.50%         |
+| Q5_K_M       | 45.41            | 3.4451     | 0.40%         |
+| Q6_K         | 52.70            | 3.4367     | 0.16%         |
+| fp16         | 128.5            | 3.4313     | -             |
-
examples/quantize/README.md

@@ -4,17 +4,17 @@ TODO

 ## Llama 2 7B

-Quantization | Bits per Weight (BPW)
--- | --
-Q2_K | 3.35
-Q3_K_S | 3.50
-Q3_K_M | 3.91
-Q3_K_L | 4.27
-Q4_K_S | 4.58
-Q4_K_M | 4.84
-Q5_K_S | 5.52
-Q5_K_M | 5.68
-Q6_K | 6.56
+| Quantization | Bits per Weight (BPW) |
+|--------------|-----------------------|
+| Q2_K         | 3.35                  |
+| Q3_K_S       | 3.50                  |
+| Q3_K_M       | 3.91                  |
+| Q3_K_L       | 4.27                  |
+| Q4_K_S       | 4.58                  |
+| Q4_K_M       | 4.84                  |
+| Q5_K_S       | 5.52                  |
+| Q5_K_M       | 5.68                  |
+| Q6_K         | 6.56                  |

 ## Llama 2 13B
 Quantization | Bits per Weight (BPW)
llama.cpp (33 changes)

@@ -1638,17 +1638,17 @@ static size_t llama_get_device_memory(int device) {
 #if defined(GGML_USE_CUDA)
     size_t total;
     size_t free;
-    ggml_backend_cuda_get_device_memory(device, &total, &free);
+    ggml_backend_cuda_get_device_memory(device, &free, &total);
     return free;
 #elif defined(GGML_USE_SYCL)
     size_t total;
     size_t free;
-    ggml_backend_sycl_get_device_memory(device, &total, &free);
+    ggml_backend_sycl_get_device_memory(device, &free, &total);
     return free;
 #elif defined(GGML_USE_VULKAN)
     size_t total;
     size_t free;
-    ggml_backend_vk_get_device_memory(device, &total, &free);
+    ggml_backend_vk_get_device_memory(device, &free, &total);
     return free;
 #else
     return 1;
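The hunk above is a classic out-parameter swap: judging from the direction of the fix, all three backend getters take the free-memory pointer before the total-memory pointer, and the old code passed them reversed, so `free` silently received the device total. A small stand-in sketch (not the real ggml API) showing the corrected call shape:

```cpp
#include <cstddef>
#include <cstdio>

// Stand-in for ggml_backend_cuda_get_device_memory and friends: per the fix
// above, the free-memory out-parameter comes first, then the total.
static void get_device_memory(int /*device*/, size_t * free_mem, size_t * total_mem) {
    *total_mem = 8ull << 30; // pretend the device has 8 GiB
    *free_mem  = 6ull << 30; // pretend 6 GiB of it is still free
}

int main() {
    size_t total    = 0;
    size_t free_mem = 0;
    get_device_memory(0, &free_mem, &total); // &free first, &total second
    std::printf("free = %zu, total = %zu\n", free_mem, total);
    // with the arguments swapped (the old bug), free_mem would report the total
    return free_mem <= total ? 0 : 1;
}
```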
@@ -11121,7 +11121,7 @@ struct llm_tokenizer_bpe {
             add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
         }

-        // add the fnished tokens to the final list keeping correct order for next and prev
+        // add the finished tokens to the final list keeping correct order for next and prev
         for (auto & sym : symbols) {
             if (sym.n > 0) {
                 sym.prev = final_prev_index;
@@ -11861,7 +11861,9 @@ static void llama_grammar_advance_stack(
         std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {

     if (stack.empty()) {
-        new_stacks.emplace_back(stack);
+        if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+            new_stacks.emplace_back(stack);
+        }
         return;
     }

@@ -11898,7 +11900,10 @@ static void llama_grammar_advance_stack(
         }
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
-            new_stacks.emplace_back(stack);
+            if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+                // only add the stack if it's not a duplicate of one we already have
+                new_stacks.emplace_back(stack);
+            }
             break;
         default:
             // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -11912,12 +11917,13 @@ static void llama_grammar_advance_stack(
 // be positioned at a character range (see `llama_grammar_advance_stack`), and
 // produces the N possible stacks if the given char is accepted at those
 // positions
-std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
+void llama_grammar_accept(
         const std::vector<std::vector<llama_grammar_element>>         & rules,
         const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-        const uint32_t chr) {
-
-    std::vector<std::vector<const llama_grammar_element *>> new_stacks;
+        const uint32_t chr,
+        std::vector<std::vector<const llama_grammar_element *>>       & new_stacks) {
+    new_stacks.clear();

     for (const auto & stack : stacks) {
         if (stack.empty()) {
@@ -11936,8 +11942,6 @@ std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
             llama_grammar_advance_stack(rules, new_stack, new_stacks);
         }
     }
-
-    return new_stacks;
 }

 static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
@@ -11951,6 +11955,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
         const std::vector<llama_grammar_candidate> & candidates) {

     std::vector<llama_grammar_candidate> rejects;
+    rejects.reserve(candidates.size());

     if (stack.empty()) {
         for (const auto & tok : candidates) {
@@ -11964,6 +11969,8 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
     const llama_grammar_element * stack_pos = stack.back();

     std::vector<llama_grammar_candidate> next_candidates;
+    next_candidates.reserve(candidates.size());

     for (const auto & tok : candidates) {
         if (*tok.code_points == 0) {
             // reached end of full codepoints in token, reject iff it ended in a partial sequence
@@ -12771,8 +12778,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece, grammar->partial_utf8);
     const auto & code_points = decoded.first;
+    std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+        llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
+        grammar->stacks = tmp_new_stacks;
     }
     grammar->partial_utf8 = decoded.second;
     GGML_ASSERT(!grammar->stacks.empty());
llama.h (5 changes)

@@ -1097,10 +1097,11 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
         struct llama_context * ctx
 );

-std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
+void llama_grammar_accept(
         const std::vector<std::vector<llama_grammar_element>>         & rules,
         const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-        const uint32_t chr);
+        const uint32_t chr,
+        std::vector<std::vector<const llama_grammar_element *>>       & new_stacks);

 std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const std::string & src,
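Across llama.cpp, llama.h, and the callers above, the refactor turns `llama_grammar_accept` from return-by-value into fill-an-out-parameter, letting callers reuse one buffer across the per-code-point loop instead of allocating a fresh vector each iteration. Below is a self-contained toy analogue of that pattern; `int` stands in for a grammar stack and the accept test is a placeholder, not the real grammar logic.

```cpp
#include <initializer_list>
#include <utility>
#include <vector>

// Toy analogue of the refactor: the callee clears and refills a caller-owned
// out-parameter rather than returning a fresh vector, so the loop in main
// reuses one allocation across all code points.
static void accept_step(const std::vector<int> & stacks, int chr,
                        std::vector<int> & new_stacks) {
    new_stacks.clear(); // mirrors new_stacks.clear() in llama_grammar_accept
    for (int s : stacks) {
        if (s + chr > 0) { // placeholder for the real grammar accept check
            new_stacks.push_back(s + chr);
        }
    }
}

int main() {
    std::vector<int> stacks = {1, 2, 3};
    std::vector<int> tmp; // reused buffer, like tmp_new_stacks in llama_grammar_accept_token
    for (int chr : {4, 5, 6}) {
        accept_step(stacks, chr, tmp);
        std::swap(stacks, tmp); // keep the advanced stacks, recycle the old buffer
    }
    return stacks.empty() ? 1 : 0; // some stack should survive
}
```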
scripts/hf.sh

@@ -3,9 +3,9 @@
 # Shortcut for downloading HF models
 #
 # Usage:
-#   ./main -m $(./examples/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
-#   ./main -m $(./examples/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
-#   ./main -m $(./examples/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf)
+#   ./main -m $(./scripts/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
+#   ./main -m $(./scripts/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
+#   ./main -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf)
 #

 # all logs go to stderr
@@ -14,7 +14,7 @@ function log {
 }

 function usage {
-    log "Usage: $0 [[--url] <url>] [--repo <repo>] [--file <file>] [-h|--help]"
+    log "Usage: $0 [[--url] <url>] [--repo <repo>] [--file <file>] [--outdir <dir> [-h|--help]"
     exit 1
 }

@@ -26,9 +26,9 @@ function has_cmd {
 }

 if has_cmd wget; then
-    cmd="wget -q --show-progress -c -O %s %s"
+    cmd="wget -q --show-progress -c -O %s/%s %s"
 elif has_cmd curl; then
-    cmd="curl -C - -f -o %s -L %s"
+    cmd="curl -C - -f --output-dir %s -o %s -L %s"
 else
     log "[E] curl or wget not found"
     exit 1
@@ -37,6 +37,7 @@ fi
 url=""
 repo=""
 file=""
+outdir="."

 # parse args
 while [[ $# -gt 0 ]]; do
@@ -53,6 +54,10 @@ while [[ $# -gt 0 ]]; do
             file="$2"
             shift 2
             ;;
+        --outdir)
+            outdir="$2"
+            shift 2
+            ;;
         -h|--help)
             usage
             ;;
@@ -94,10 +99,10 @@ basename=$(basename $url)
 log "[+] attempting to download $basename"

 if [ -n "$cmd" ]; then
-    cmd=$(printf "$cmd" "$basename" "$url")
+    cmd=$(printf "$cmd" "$outdir" "$basename" "$url")
     log "[+] $cmd"
     if $cmd; then
-        echo $basename
+        echo $outdir/$basename
         exit 0
     fi
 fi
tests/test-grammar-integration.cpp

@@ -38,7 +38,7 @@ number ::= [0-9]+)""";

     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         auto prev_stacks = grammar->stacks;
-        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+        llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
         assert(!grammar->stacks.empty());
     }

@@ -138,7 +138,7 @@ ws ::= [ \t\n\r]?)""";
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         ++pos;
         auto prev_stacks = grammar->stacks;
-        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+        llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);

         // Expect that each code point will not cause the grammar to fail
         if (grammar->stacks.empty()) {
@@ -173,7 +173,7 @@ ws ::= [ \t\n\r]?)""";

     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         auto prev_stacks = grammar->stacks;
-        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+        llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
         if (grammar->stacks.empty()) {
             parse_failed = true;
             break;