Merge branch 'master' into sycl_fix_non_intel_fp16
This commit is contained in:
commit
f746e7074e
19 changed files with 1050 additions and 213 deletions
|
@ -24,7 +24,7 @@
|
||||||
useOpenCL
|
useOpenCL
|
||||||
useRocm
|
useRocm
|
||||||
useVulkan
|
useVulkan
|
||||||
],
|
] && blas.meta.available,
|
||||||
useCuda ? config.cudaSupport,
|
useCuda ? config.cudaSupport,
|
||||||
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
|
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
|
||||||
useMpi ? false, # Increases the runtime closure size by ~700M
|
useMpi ? false, # Increases the runtime closure size by ~700M
|
||||||
|
@ -67,10 +67,15 @@ let
|
||||||
strings.optionalString (suffices != [ ])
|
strings.optionalString (suffices != [ ])
|
||||||
", accelerated with ${strings.concatStringsSep ", " suffices}";
|
", accelerated with ${strings.concatStringsSep ", " suffices}";
|
||||||
|
|
||||||
|
executableSuffix = effectiveStdenv.hostPlatform.extensions.executable;
|
||||||
|
|
||||||
# TODO: package the Python in this repository in a Nix-like way.
|
# TODO: package the Python in this repository in a Nix-like way.
|
||||||
# It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
|
# It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
|
||||||
# is PEP 517-compatible, and ensure the correct .dist-info is generated.
|
# is PEP 517-compatible, and ensure the correct .dist-info is generated.
|
||||||
# https://peps.python.org/pep-0517/
|
# https://peps.python.org/pep-0517/
|
||||||
|
#
|
||||||
|
# TODO: Package up each Python script or service appropriately, by making
|
||||||
|
# them into "entrypoints"
|
||||||
llama-python = python3.withPackages (
|
llama-python = python3.withPackages (
|
||||||
ps: [
|
ps: [
|
||||||
ps.numpy
|
ps.numpy
|
||||||
|
@ -159,11 +164,6 @@ effectiveStdenv.mkDerivation (
|
||||||
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
|
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
|
||||||
substituteInPlace ./ggml-metal.m \
|
substituteInPlace ./ggml-metal.m \
|
||||||
--replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
|
--replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
|
||||||
|
|
||||||
# TODO: Package up each Python script or service appropriately.
|
|
||||||
# If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
|
|
||||||
# we could make those *.py into setuptools' entrypoints
|
|
||||||
substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
|
|
||||||
'';
|
'';
|
||||||
|
|
||||||
# With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
|
# With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
|
||||||
|
@ -244,8 +244,8 @@ effectiveStdenv.mkDerivation (
|
||||||
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
|
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
|
||||||
# if they haven't been added yet.
|
# if they haven't been added yet.
|
||||||
postInstall = ''
|
postInstall = ''
|
||||||
mv $out/bin/main $out/bin/llama
|
mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix}
|
||||||
mv $out/bin/server $out/bin/llama-server
|
mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix}
|
||||||
mkdir -p $out/include
|
mkdir -p $out/include
|
||||||
cp $src/llama.h $out/include/
|
cp $src/llama.h $out/include/
|
||||||
'';
|
'';
|
||||||
|
|
280
.github/workflows/bench.yml
vendored
Normal file
280
.github/workflows/bench.yml
vendored
Normal file
|
@ -0,0 +1,280 @@
|
||||||
|
# Benchmark
|
||||||
|
name: Benchmark
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch:
|
||||||
|
inputs:
|
||||||
|
gpu-series:
|
||||||
|
description: 'Azure GPU series to run with'
|
||||||
|
required: true
|
||||||
|
type: choice
|
||||||
|
options:
|
||||||
|
- Standard_NC4as_T4_v3
|
||||||
|
- Standard_NC24ads_A100_v4
|
||||||
|
- Standard_NC80adis_H100_v5
|
||||||
|
sha:
|
||||||
|
description: 'Commit SHA1 to build'
|
||||||
|
required: false
|
||||||
|
type: string
|
||||||
|
duration:
|
||||||
|
description: 'Duration of the bench'
|
||||||
|
type: string
|
||||||
|
default: 10m
|
||||||
|
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- master
|
||||||
|
paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
|
||||||
|
pull_request:
|
||||||
|
types: [opened, synchronize, reopened]
|
||||||
|
paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
|
||||||
|
schedule:
|
||||||
|
- cron: '04 2 * * *'
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.ref }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
bench-server-baseline:
|
||||||
|
runs-on: Standard_NC4as_T4_v3
|
||||||
|
env:
|
||||||
|
RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
|
||||||
|
N_USERS: 8
|
||||||
|
DURATION: 10m
|
||||||
|
if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||||
|
|
||||||
|
- name: Install python env
|
||||||
|
id: pipenv
|
||||||
|
run: |
|
||||||
|
cd examples/server/bench
|
||||||
|
python3 -m venv venv
|
||||||
|
source venv/bin/activate
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
- name: Prometheus
|
||||||
|
id: install_prometheus
|
||||||
|
run: |
|
||||||
|
wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
|
||||||
|
tar xzf prometheus*.tar.gz --strip-components=1
|
||||||
|
./prometheus --config.file=examples/server/bench/prometheus.yml &
|
||||||
|
while ! nc -z localhost 9090; do
|
||||||
|
sleep 0.1
|
||||||
|
done
|
||||||
|
|
||||||
|
- name: Install k6
|
||||||
|
id: k6_installation
|
||||||
|
run: |
|
||||||
|
cd examples/server/bench
|
||||||
|
wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
|
||||||
|
tar xzf k6*.tar.gz --strip-components=1
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
id: cmake_build
|
||||||
|
run: |
|
||||||
|
set -eux
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
cmake .. \
|
||||||
|
-DLLAMA_NATIVE=OFF \
|
||||||
|
-DLLAMA_BUILD_SERVER=ON \
|
||||||
|
-DLLAMA_CURL=ON \
|
||||||
|
-DLLAMA_CUBLAS=ON \
|
||||||
|
-DCUDAToolkit_ROOT=/usr/local/cuda \
|
||||||
|
-DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
|
||||||
|
-DCMAKE_CUDA_ARCHITECTURES=75 \
|
||||||
|
-DLLAMA_FATAL_WARNINGS=OFF \
|
||||||
|
-DLLAMA_ALL_WARNINGS=OFF \
|
||||||
|
-DCMAKE_BUILD_TYPE=Release;
|
||||||
|
cmake --build . --config Release -j $(nproc) --target server
|
||||||
|
|
||||||
|
- name: Download the dataset
|
||||||
|
id: download_dataset
|
||||||
|
run: |
|
||||||
|
cd examples/server/bench
|
||||||
|
wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
|
|
||||||
|
- name: Server bench
|
||||||
|
id: server_bench
|
||||||
|
run: |
|
||||||
|
set -eux
|
||||||
|
|
||||||
|
cd examples/server/bench
|
||||||
|
source venv/bin/activate
|
||||||
|
BENCH_K6_BIN_PATH=./k6 python bench.py \
|
||||||
|
--runner-label ${{ env.RUNNER_LABEL }} \
|
||||||
|
--name ${{ github.job }} \
|
||||||
|
--branch ${{ github.head_ref || github.ref_name }} \
|
||||||
|
--commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
|
||||||
|
--scenario script.js \
|
||||||
|
--duration ${{ github.event.inputs.duration || env.DURATION }} \
|
||||||
|
--hf-repo ggml-org/models \
|
||||||
|
--hf-file phi-2/ggml-model-q4_0.gguf \
|
||||||
|
--model-path-prefix /models \
|
||||||
|
--parallel ${{ env.N_USERS }} \
|
||||||
|
-ngl 33 \
|
||||||
|
--batch-size 2048 \
|
||||||
|
--ubatch-size 256 \
|
||||||
|
--ctx-size 16384 \
|
||||||
|
--n-prompts 1000 \
|
||||||
|
--max-prompt-tokens 1024 \
|
||||||
|
--max-tokens 2048
|
||||||
|
|
||||||
|
cat results.github.env >> $GITHUB_ENV
|
||||||
|
|
||||||
|
# Remove dataset as we do not want it in the artefact
|
||||||
|
rm ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
|
|
||||||
|
- uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: benchmark-results
|
||||||
|
compression-level: 9
|
||||||
|
path: |
|
||||||
|
examples/server/bench/*.jpg
|
||||||
|
examples/server/bench/*.json
|
||||||
|
examples/server/bench/*.log
|
||||||
|
|
||||||
|
- name: Commit status
|
||||||
|
uses: Sibz/github-status-action@v1
|
||||||
|
continue-on-error: true # If not authorized on external repo
|
||||||
|
with:
|
||||||
|
authToken: ${{secrets.GITHUB_TOKEN}}
|
||||||
|
sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
|
||||||
|
context: bench-server-baseline
|
||||||
|
description: |
|
||||||
|
${{ env.BENCH_RESULTS }}
|
||||||
|
state: 'success'
|
||||||
|
|
||||||
|
- name: Upload benchmark images
|
||||||
|
uses: devicons/public-upload-to-imgur@v2.2.2
|
||||||
|
continue-on-error: true # Important as it looks unstable: 503
|
||||||
|
id: imgur_step
|
||||||
|
with:
|
||||||
|
client_id: ${{secrets.IMGUR_CLIENT_ID}}
|
||||||
|
path: |
|
||||||
|
examples/server/bench/prompt_tokens_seconds.jpg
|
||||||
|
examples/server/bench/predicted_tokens_seconds.jpg
|
||||||
|
examples/server/bench/kv_cache_usage_ratio.jpg
|
||||||
|
examples/server/bench/requests_processing.jpg
|
||||||
|
|
||||||
|
- name: Extract mermaid
|
||||||
|
id: set_mermaid
|
||||||
|
run: |
|
||||||
|
set -eux
|
||||||
|
|
||||||
|
cd examples/server/bench
|
||||||
|
PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
|
||||||
|
echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
|
||||||
|
echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
|
||||||
|
echo "EOF" >> $GITHUB_ENV
|
||||||
|
|
||||||
|
PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
|
||||||
|
echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
|
||||||
|
echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
|
||||||
|
echo "EOF" >> $GITHUB_ENV
|
||||||
|
|
||||||
|
KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
|
||||||
|
echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
|
||||||
|
echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
|
||||||
|
echo "EOF" >> $GITHUB_ENV
|
||||||
|
|
||||||
|
REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
|
||||||
|
echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
|
||||||
|
echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
|
||||||
|
echo "EOF" >> $GITHUB_ENV
|
||||||
|
|
||||||
|
- name: Extract image url
|
||||||
|
id: extract_image_url
|
||||||
|
continue-on-error: true
|
||||||
|
run: |
|
||||||
|
set -eux
|
||||||
|
|
||||||
|
echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
|
||||||
|
echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
|
||||||
|
echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
|
||||||
|
echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
|
||||||
|
|
||||||
|
- name: Comment PR
|
||||||
|
uses: mshick/add-pr-comment@v2
|
||||||
|
id: comment_pr
|
||||||
|
if: ${{ github.event.pull_request != '' }}
|
||||||
|
with:
|
||||||
|
message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
|
||||||
|
message: |
|
||||||
|
📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
|
||||||
|
|
||||||
|
- Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
|
||||||
|
- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
|
||||||
|
- Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
|
||||||
|
- Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
|
||||||
|
- ${{ env.BENCH_GRAPH_XLABEL }}
|
||||||
|
|
||||||
|
<details>
|
||||||
|
|
||||||
|
<summary>Time series</summary>
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
|
||||||
|
<img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
|
||||||
|
|
||||||
|
<details>
|
||||||
|
|
||||||
|
<summary>More</summary>
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
${{ env.PROMPT_TOKENS_SECONDS }}
|
||||||
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>More</summary>
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
${{ env.PREDICTED_TOKENS_SECONDS }}
|
||||||
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
|
||||||
|
<summary>Details</summary>
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
|
||||||
|
<img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>More</summary>
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
${{ env.KV_CACHE_USAGE_RATIO }}
|
||||||
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>More</summary>
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
${{ env.REQUESTS_PROCESSING }}
|
||||||
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
</p>
|
||||||
|
</details>
|
||||||
|
</details>
|
1
Makefile
1
Makefile
|
@ -556,6 +556,7 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
|
||||||
endif # LLAMA_CUDA_NO_PEER_COPY
|
endif # LLAMA_CUDA_NO_PEER_COPY
|
||||||
OBJS += ggml-cuda.o
|
OBJS += ggml-cuda.o
|
||||||
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
|
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
|
||||||
|
|
||||||
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
|
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
|
||||||
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
|
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
|
||||||
|
|
||||||
|
|
|
@ -23,7 +23,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
||||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||||
import gguf
|
import gguf
|
||||||
|
|
||||||
from convert import HfVocab
|
from convert import LlamaHfVocab
|
||||||
|
|
||||||
|
|
||||||
###### MODEL DEFINITIONS ######
|
###### MODEL DEFINITIONS ######
|
||||||
|
@ -230,7 +230,7 @@ class Model(ABC):
|
||||||
def _set_vocab_gpt2(self):
|
def _set_vocab_gpt2(self):
|
||||||
dir_model = self.dir_model
|
dir_model = self.dir_model
|
||||||
hparams = self.hparams
|
hparams = self.hparams
|
||||||
tokens: list[bytearray] = []
|
tokens: list[str] = []
|
||||||
toktypes: list[int] = []
|
toktypes: list[int] = []
|
||||||
|
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
|
@ -243,8 +243,7 @@ class Model(ABC):
|
||||||
|
|
||||||
for i in range(vocab_size):
|
for i in range(vocab_size):
|
||||||
if i not in reverse_vocab:
|
if i not in reverse_vocab:
|
||||||
pad_token = f"[PAD{i}]".encode('utf-8')
|
tokens.append(f"[PAD{i}]")
|
||||||
tokens.append(bytearray(pad_token))
|
|
||||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||||
elif reverse_vocab[i] in added_vocab:
|
elif reverse_vocab[i] in added_vocab:
|
||||||
tokens.append(reverse_vocab[i])
|
tokens.append(reverse_vocab[i])
|
||||||
|
@ -266,7 +265,7 @@ class Model(ABC):
|
||||||
def _set_vocab_qwen(self):
|
def _set_vocab_qwen(self):
|
||||||
dir_model = self.dir_model
|
dir_model = self.dir_model
|
||||||
hparams = self.hparams
|
hparams = self.hparams
|
||||||
tokens: list[bytearray] = []
|
tokens: list[str] = []
|
||||||
toktypes: list[int] = []
|
toktypes: list[int] = []
|
||||||
|
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
|
@ -291,8 +290,7 @@ class Model(ABC):
|
||||||
|
|
||||||
for i in range(vocab_size):
|
for i in range(vocab_size):
|
||||||
if i not in reverse_vocab:
|
if i not in reverse_vocab:
|
||||||
pad_token = f"[PAD{i}]".encode("utf-8")
|
tokens.append(f"[PAD{i}]")
|
||||||
tokens.append(bytearray(pad_token))
|
|
||||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||||
elif reverse_vocab[i] in added_vocab:
|
elif reverse_vocab[i] in added_vocab:
|
||||||
tokens.append(reverse_vocab[i])
|
tokens.append(reverse_vocab[i])
|
||||||
|
@ -372,12 +370,8 @@ class Model(ABC):
|
||||||
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||||
special_vocab.add_to_gguf(self.gguf_writer)
|
special_vocab.add_to_gguf(self.gguf_writer)
|
||||||
|
|
||||||
def _set_vocab_hf(self):
|
def _set_vocab_llama_hf(self):
|
||||||
path = self.dir_model
|
vocab = LlamaHfVocab(self.dir_model)
|
||||||
added_tokens_path = self.dir_model
|
|
||||||
vocab = HfVocab(
|
|
||||||
path, added_tokens_path if added_tokens_path.exists() else None
|
|
||||||
)
|
|
||||||
tokens = []
|
tokens = []
|
||||||
scores = []
|
scores = []
|
||||||
toktypes = []
|
toktypes = []
|
||||||
|
@ -1099,7 +1093,7 @@ class MiniCPMModel(Model):
|
||||||
self.gguf_writer.add_file_type(self.ftype)
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
self._set_vocab_hf()
|
self._set_vocab_llama_hf()
|
||||||
|
|
||||||
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
|
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
|
||||||
if n_kv_head is not None and n_head != n_kv_head:
|
if n_kv_head is not None and n_head != n_kv_head:
|
||||||
|
@ -1700,11 +1694,8 @@ class BertModel(Model):
|
||||||
self.gguf_writer.add_pooling_type(pooling_type)
|
self.gguf_writer.add_pooling_type(pooling_type)
|
||||||
|
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
path = self.dir_model
|
|
||||||
added_tokens_path = self.dir_model if self.dir_model.exists() else None
|
|
||||||
|
|
||||||
# use huggingface vocab to get all tokens
|
# use huggingface vocab to get all tokens
|
||||||
vocab = HfVocab(path, added_tokens_path)
|
vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
|
||||||
tokens, scores, toktypes = zip(*vocab.all_tokens())
|
tokens, scores, toktypes = zip(*vocab.all_tokens())
|
||||||
assert len(tokens) == vocab.vocab_size
|
assert len(tokens) == vocab.vocab_size
|
||||||
self.vocab_size = vocab.vocab_size
|
self.vocab_size = vocab.vocab_size
|
||||||
|
|
|
@ -106,12 +106,12 @@ def main():
|
||||||
tensor_map = gguf.get_tensor_name_map(arch, block_count)
|
tensor_map = gguf.get_tensor_name_map(arch, block_count)
|
||||||
print(tensor_map)
|
print(tensor_map)
|
||||||
for name in tensors.keys():
|
for name in tensors.keys():
|
||||||
data = tensors[name]
|
data_torch = tensors[name]
|
||||||
if name.endswith(".self_attention.rotary_emb.inv_freq"):
|
if name.endswith(".self_attention.rotary_emb.inv_freq"):
|
||||||
continue
|
continue
|
||||||
old_dtype = data.dtype
|
old_dtype = data_torch.dtype
|
||||||
# TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
|
# TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
|
||||||
data = data.to(torch.float32).squeeze().numpy()
|
data = data_torch.to(torch.float32).squeeze().numpy()
|
||||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||||
if new_name is None:
|
if new_name is None:
|
||||||
print("Can not map tensor '" + name + "'")
|
print("Can not map tensor '" + name + "'")
|
||||||
|
|
341
convert.py
341
convert.py
|
@ -16,13 +16,14 @@ import re
|
||||||
import signal
|
import signal
|
||||||
import struct
|
import struct
|
||||||
import sys
|
import sys
|
||||||
|
import textwrap
|
||||||
import time
|
import time
|
||||||
import zipfile
|
import zipfile
|
||||||
from abc import ABCMeta, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
|
from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sentencepiece import SentencePieceProcessor
|
from sentencepiece import SentencePieceProcessor
|
||||||
|
@ -43,6 +44,9 @@ ARCH = gguf.MODEL_ARCH.LLAMA
|
||||||
|
|
||||||
DEFAULT_CONCURRENCY = 8
|
DEFAULT_CONCURRENCY = 8
|
||||||
|
|
||||||
|
ADDED_TOKENS_FILE = 'added_tokens.json'
|
||||||
|
FAST_TOKENIZER_FILE = 'tokenizer.json'
|
||||||
|
|
||||||
#
|
#
|
||||||
# data types
|
# data types
|
||||||
#
|
#
|
||||||
|
@ -188,8 +192,10 @@ class Params:
|
||||||
n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
|
n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
|
||||||
|
|
||||||
if n_layer < 1:
|
if n_layer < 1:
|
||||||
raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
|
msg = """\
|
||||||
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
|
failed to guess 'n_layer'. This model is unknown or unsupported.
|
||||||
|
Suggestion: provide 'config.json' of the model in the same directory containing model files."""
|
||||||
|
raise KeyError(textwrap.dedent(msg))
|
||||||
|
|
||||||
n_head = n_embd // 128 # guessed
|
n_head = n_embd // 128 # guessed
|
||||||
n_mult = 256 # guessed
|
n_mult = 256 # guessed
|
||||||
|
@ -211,7 +217,8 @@ class Params:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
|
def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
|
||||||
config = json.load(open(config_path))
|
with open(config_path) as f:
|
||||||
|
config = json.load(f)
|
||||||
|
|
||||||
rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
|
rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
|
||||||
rope_scaling = config.get("rope_scaling")
|
rope_scaling = config.get("rope_scaling")
|
||||||
|
@ -233,8 +240,10 @@ class Params:
|
||||||
elif "max_position_embeddings" in config:
|
elif "max_position_embeddings" in config:
|
||||||
n_ctx = config["max_position_embeddings"]
|
n_ctx = config["max_position_embeddings"]
|
||||||
else:
|
else:
|
||||||
raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
|
msg = """\
|
||||||
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
|
failed to guess 'n_ctx'. This model is unknown or unsupported.
|
||||||
|
Suggestion: provide 'config.json' of the model in the same directory containing model files."""
|
||||||
|
raise KeyError(textwrap.dedent(msg))
|
||||||
|
|
||||||
n_experts = None
|
n_experts = None
|
||||||
n_experts_used = None
|
n_experts_used = None
|
||||||
|
@ -265,7 +274,8 @@ class Params:
|
||||||
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
|
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
|
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
|
||||||
config = json.load(open(config_path))
|
with open(config_path) as f:
|
||||||
|
config = json.load(f)
|
||||||
|
|
||||||
n_experts = None
|
n_experts = None
|
||||||
n_experts_used = None
|
n_experts_used = None
|
||||||
|
@ -331,47 +341,86 @@ class Params:
|
||||||
# vocab
|
# vocab
|
||||||
#
|
#
|
||||||
|
|
||||||
class BpeVocab:
|
@runtime_checkable
|
||||||
|
class BaseVocab(Protocol):
|
||||||
|
tokenizer_model: ClassVar[str]
|
||||||
|
name: ClassVar[str]
|
||||||
|
|
||||||
|
|
||||||
|
class NoVocab(BaseVocab):
|
||||||
|
tokenizer_model = "no_vocab"
|
||||||
|
name = "no_vocab"
|
||||||
|
|
||||||
|
def __repr__(self) -> str:
|
||||||
|
return "<NoVocab for a model without integrated vocabulary>"
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
|
||||||
|
class Vocab(BaseVocab, Protocol):
|
||||||
|
vocab_size: int
|
||||||
|
added_tokens_dict: dict[str, int]
|
||||||
|
added_tokens_list: list[str]
|
||||||
|
fname_tokenizer: Path
|
||||||
|
|
||||||
|
def __init__(self, base_path: Path): ...
|
||||||
|
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
|
||||||
|
|
||||||
|
|
||||||
|
class BpeVocab(Vocab):
|
||||||
tokenizer_model = "gpt2"
|
tokenizer_model = "gpt2"
|
||||||
name = "bpe"
|
name = "bpe"
|
||||||
|
|
||||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
def __init__(self, base_path: Path):
|
||||||
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
|
added_tokens: dict[str, int] = {}
|
||||||
if isinstance(self.bpe_tokenizer.get('model'), dict):
|
|
||||||
self.vocab = self.bpe_tokenizer["model"]["vocab"]
|
|
||||||
else:
|
|
||||||
self.vocab = self.bpe_tokenizer
|
|
||||||
added_tokens: dict[str, int]
|
|
||||||
if fname_added_tokens is not None:
|
|
||||||
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
|
|
||||||
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
|
|
||||||
else:
|
|
||||||
# Fall back to trying to find the added tokens in tokenizer.json
|
|
||||||
tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
|
|
||||||
if not tokenizer_json_file.is_file():
|
|
||||||
added_tokens = {}
|
|
||||||
else:
|
|
||||||
tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
|
|
||||||
added_tokens = dict(
|
|
||||||
(item['content'], item['id'])
|
|
||||||
for item in tokenizer_json.get('added_tokens', [])
|
|
||||||
# Added tokens here can be duplicates of the main vocabulary.
|
|
||||||
if item['content'] not in self.bpe_tokenizer)
|
|
||||||
|
|
||||||
vocab_size: int = len(self.vocab)
|
if (fname_tokenizer := base_path / 'vocab.json').exists():
|
||||||
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
# "slow" tokenizer
|
||||||
actual_ids = sorted(added_tokens.values())
|
with open(fname_tokenizer, encoding="utf-8") as f:
|
||||||
|
self.vocab = json.load(f)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
|
||||||
|
with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
|
||||||
|
added_tokens = json.load(f)
|
||||||
|
except FileNotFoundError:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
# "fast" tokenizer
|
||||||
|
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
|
||||||
|
|
||||||
|
# if this fails, FileNotFoundError propagates to caller
|
||||||
|
with open(fname_tokenizer, encoding="utf-8") as f:
|
||||||
|
tokenizer_json = json.load(f)
|
||||||
|
|
||||||
|
tokenizer_model: dict[str, Any] = tokenizer_json['model']
|
||||||
|
if (
|
||||||
|
tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
|
||||||
|
or tokenizer_json['decoder']['type'] != 'ByteLevel'
|
||||||
|
):
|
||||||
|
raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
|
||||||
|
|
||||||
|
self.vocab = tokenizer_model["vocab"]
|
||||||
|
|
||||||
|
if (added := tokenizer_json.get('added_tokens')) is not None:
|
||||||
|
# Added tokens here can be duplicates of the main vocabulary.
|
||||||
|
added_tokens = {item['content']: item['id']
|
||||||
|
for item in added
|
||||||
|
if item['content'] not in self.vocab}
|
||||||
|
|
||||||
|
vocab_size = len(self.vocab)
|
||||||
|
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
||||||
|
actual_ids = sorted(added_tokens.values())
|
||||||
if expected_ids != actual_ids:
|
if expected_ids != actual_ids:
|
||||||
expected_end_id = vocab_size + len(actual_ids) - 1
|
expected_end_id = vocab_size + len(actual_ids) - 1
|
||||||
raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
|
raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
|
||||||
|
f"{vocab_size} - {expected_end_id}; got {actual_ids}")
|
||||||
|
|
||||||
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
||||||
self.added_tokens_dict = added_tokens
|
self.added_tokens_dict = added_tokens
|
||||||
self.added_tokens_list = [text for (text, idx) in items]
|
self.added_tokens_list = [text for (text, idx) in items]
|
||||||
self.vocab_size_base: int = vocab_size
|
self.vocab_size_base = vocab_size
|
||||||
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
|
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
||||||
self.fname_tokenizer = fname_tokenizer
|
self.fname_tokenizer = fname_tokenizer
|
||||||
self.fname_added_tokens = fname_added_tokens
|
|
||||||
|
|
||||||
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
|
reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
|
||||||
|
@ -392,19 +441,25 @@ class BpeVocab:
|
||||||
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||||
|
|
||||||
|
|
||||||
class SentencePieceVocab:
|
class SentencePieceVocab(Vocab):
|
||||||
tokenizer_model = "llama"
|
tokenizer_model = "llama"
|
||||||
name = "spm"
|
name = "spm"
|
||||||
|
|
||||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
def __init__(self, base_path: Path):
|
||||||
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
|
added_tokens: dict[str, int] = {}
|
||||||
added_tokens: dict[str, int]
|
if (fname_tokenizer := base_path / 'tokenizer.model').exists():
|
||||||
if fname_added_tokens is not None:
|
# normal location
|
||||||
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
|
try:
|
||||||
else:
|
with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
|
||||||
added_tokens = {}
|
added_tokens = json.load(f)
|
||||||
|
except FileNotFoundError:
|
||||||
|
pass
|
||||||
|
elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
|
||||||
|
# not found in alternate location either
|
||||||
|
raise FileNotFoundError('Cannot find tokenizer.model')
|
||||||
|
|
||||||
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
|
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
|
||||||
|
vocab_size = self.sentencepiece_tokenizer.vocab_size()
|
||||||
|
|
||||||
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
|
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
|
||||||
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
|
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
|
||||||
|
@ -414,18 +469,17 @@ class SentencePieceVocab:
|
||||||
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
|
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
|
||||||
|
|
||||||
# Token pieces that were added to the base vocabulary.
|
# Token pieces that were added to the base vocabulary.
|
||||||
self.added_tokens_dict = added_tokens
|
self.added_tokens_dict = added_tokens
|
||||||
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
|
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
|
||||||
self.vocab_size_base = vocab_size
|
self.vocab_size_base = vocab_size
|
||||||
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
||||||
self.fname_tokenizer = fname_tokenizer
|
self.fname_tokenizer = fname_tokenizer
|
||||||
self.fname_added_tokens = fname_added_tokens
|
|
||||||
|
|
||||||
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||||
tokenizer = self.sentencepiece_tokenizer
|
tokenizer = self.sentencepiece_tokenizer
|
||||||
for i in range(tokenizer.vocab_size()):
|
for i in range(tokenizer.vocab_size()):
|
||||||
piece = tokenizer.id_to_piece(i)
|
piece = tokenizer.id_to_piece(i)
|
||||||
text: bytes = piece.encode("utf-8")
|
text = piece.encode("utf-8")
|
||||||
score: float = tokenizer.get_score(i)
|
score: float = tokenizer.get_score(i)
|
||||||
|
|
||||||
toktype = gguf.TokenType.NORMAL
|
toktype = gguf.TokenType.NORMAL
|
||||||
|
@ -458,27 +512,42 @@ class SentencePieceVocab:
|
||||||
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||||
|
|
||||||
|
|
||||||
class HfVocab:
|
class LlamaHfVocab(Vocab):
|
||||||
tokenizer_model = "llama"
|
tokenizer_model = "llama"
|
||||||
name = "hfft"
|
name = "hfft"
|
||||||
|
|
||||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
|
def __init__(self, base_path: Path, ignore_nonllama: bool = False):
|
||||||
|
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
|
||||||
|
# if this fails, FileNotFoundError propagates to caller
|
||||||
|
with open(fname_tokenizer, encoding='utf-8') as f:
|
||||||
|
tokenizer_json = json.load(f)
|
||||||
|
|
||||||
|
# pre-check so we know if we need transformers
|
||||||
|
tokenizer_model: dict[str, Any] = tokenizer_json['model']
|
||||||
|
if ignore_nonllama:
|
||||||
|
pass # workaround incorrect use of this class for WordPiece
|
||||||
|
elif (
|
||||||
|
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
|
||||||
|
or tokenizer_json['decoder']['type'] != 'Sequence'
|
||||||
|
):
|
||||||
|
raise FileNotFoundError('Cannot find Llama BPE tokenizer')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"To use HfVocab, please install the `transformers` package. "
|
"To use LlamaHfVocab, please install the `transformers` package. "
|
||||||
"You can install it with `pip install transformers`."
|
"You can install it with `pip install transformers`."
|
||||||
) from e
|
) from e
|
||||||
|
|
||||||
print("fname_tokenizer:", fname_tokenizer)
|
|
||||||
# Allow the tokenizer to default to slow or fast versions.
|
# Allow the tokenizer to default to slow or fast versions.
|
||||||
# Explicitly set tokenizer to use local paths.
|
# Explicitly set tokenizer to use local paths.
|
||||||
self.tokenizer = AutoTokenizer.from_pretrained(
|
self.tokenizer = AutoTokenizer.from_pretrained(
|
||||||
fname_tokenizer,
|
base_path,
|
||||||
cache_dir=fname_tokenizer,
|
cache_dir=base_path,
|
||||||
local_files_only=True,
|
local_files_only=True,
|
||||||
)
|
)
|
||||||
|
assert self.tokenizer.is_fast # assume tokenizer.json is used
|
||||||
|
|
||||||
# Initialize lists and dictionaries for added tokens
|
# Initialize lists and dictionaries for added tokens
|
||||||
self.added_tokens_list = []
|
self.added_tokens_list = []
|
||||||
|
@ -506,8 +575,7 @@ class HfVocab:
|
||||||
self.vocab_size_base = self.tokenizer.vocab_size
|
self.vocab_size_base = self.tokenizer.vocab_size
|
||||||
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
||||||
|
|
||||||
self.fname_tokenizer = fname_tokenizer
|
self.fname_tokenizer = fname_tokenizer
|
||||||
self.fname_added_tokens = fname_added_tokens
|
|
||||||
|
|
||||||
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||||
reverse_vocab = {
|
reverse_vocab = {
|
||||||
|
@ -559,18 +627,7 @@ class HfVocab:
|
||||||
yield from self.added_tokens()
|
yield from self.added_tokens()
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
def __repr__(self) -> str:
|
||||||
return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||||
|
|
||||||
|
|
||||||
class NoVocab:
|
|
||||||
tokenizer_model = "no_vocab"
|
|
||||||
name = "no_vocab"
|
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
|
||||||
return "<NoVocab for a model without integrated vocabulary>"
|
|
||||||
|
|
||||||
|
|
||||||
Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab | NoVocab"
|
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
|
@ -588,7 +645,7 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
|
||||||
.reshape(weights.shape))
|
.reshape(weights.shape))
|
||||||
|
|
||||||
|
|
||||||
class Tensor(metaclass=ABCMeta):
|
class Tensor(ABC):
|
||||||
data_type: DataType
|
data_type: DataType
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
|
@ -610,7 +667,7 @@ def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
|
||||||
|
|
||||||
|
|
||||||
class UnquantizedTensor(Tensor):
|
class UnquantizedTensor(Tensor):
|
||||||
def __init__(self, ndarray: NDArray) -> None:
|
def __init__(self, ndarray: NDArray):
|
||||||
assert isinstance(ndarray, np.ndarray)
|
assert isinstance(ndarray, np.ndarray)
|
||||||
self.ndarray = ndarray
|
self.ndarray = ndarray
|
||||||
self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
|
self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
|
||||||
|
@ -689,7 +746,7 @@ class ModelPlus:
|
||||||
model: LazyModel
|
model: LazyModel
|
||||||
paths: list[Path] # Where this was read from.
|
paths: list[Path] # Where this was read from.
|
||||||
format: Literal['ggml', 'torch', 'safetensors', 'none']
|
format: Literal['ggml', 'torch', 'safetensors', 'none']
|
||||||
vocab: Vocab | None # For GGML models (which have vocab built in), the vocab.
|
vocab: BaseVocab | None # For GGML models (which have vocab built in), the vocab.
|
||||||
|
|
||||||
|
|
||||||
def merge_sharded(models: list[LazyModel]) -> LazyModel:
|
def merge_sharded(models: list[LazyModel]) -> LazyModel:
|
||||||
|
@ -698,7 +755,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
|
||||||
names = {name: None for model in models for name in model}
|
names = {name: None for model in models for name in model}
|
||||||
|
|
||||||
def convert(name: str) -> LazyTensor:
|
def convert(name: str) -> LazyTensor:
|
||||||
lazy_tensors: list[LazyTensor] = [model[name] for model in models]
|
lazy_tensors = [model[name] for model in models]
|
||||||
if len(lazy_tensors) == 1:
|
if len(lazy_tensors) == 1:
|
||||||
# only one file; don't go through this procedure since there might
|
# only one file; don't go through this procedure since there might
|
||||||
# be quantized tensors
|
# be quantized tensors
|
||||||
|
@ -719,7 +776,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
|
||||||
|
|
||||||
def load() -> UnquantizedTensor:
|
def load() -> UnquantizedTensor:
|
||||||
ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
|
ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
|
||||||
concatenated: NDArray = np.concatenate(ndarrays, axis=axis)
|
concatenated = np.concatenate(ndarrays, axis=axis)
|
||||||
return UnquantizedTensor(concatenated)
|
return UnquantizedTensor(concatenated)
|
||||||
description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
|
description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
|
||||||
return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
|
return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
|
||||||
|
@ -807,10 +864,10 @@ class LazyUnpickler(pickle.Unpickler):
|
||||||
|
|
||||||
def load(offset: int, elm_count: int) -> NDArray:
|
def load(offset: int, elm_count: int) -> NDArray:
|
||||||
dtype = data_type.dtype
|
dtype = data_type.dtype
|
||||||
fp = self.zip_file.open(info)
|
with self.zip_file.open(info) as fp:
|
||||||
fp.seek(offset * dtype.itemsize)
|
fp.seek(offset * dtype.itemsize)
|
||||||
size = elm_count * dtype.itemsize
|
size = elm_count * dtype.itemsize
|
||||||
data = fp.read(size)
|
data = fp.read(size)
|
||||||
assert len(data) == size
|
assert len(data) == size
|
||||||
return np.frombuffer(data, dtype)
|
return np.frombuffer(data, dtype)
|
||||||
description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
|
description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
|
||||||
|
@ -831,7 +888,7 @@ class LazyUnpickler(pickle.Unpickler):
|
||||||
def rebuild_from_type_v2(func, new_type, args, state):
|
def rebuild_from_type_v2(func, new_type, args, state):
|
||||||
return func(*args)
|
return func(*args)
|
||||||
|
|
||||||
CLASSES: dict[tuple[str, str], Any] = {
|
CLASSES = {
|
||||||
# getattr used here as a workaround for mypy not being smart enough to determine
|
# getattr used here as a workaround for mypy not being smart enough to determine
|
||||||
# the staticmethods have a __func__ attribute.
|
# the staticmethods have a __func__ attribute.
|
||||||
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
|
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
|
||||||
|
@ -890,7 +947,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
|
||||||
def must_read(fp: IO[bytes], length: int) -> bytes:
|
def must_read(fp: IO[bytes], length: int) -> bytes:
|
||||||
ret = fp.read(length)
|
ret = fp.read(length)
|
||||||
if len(ret) < length:
|
if len(ret) < length:
|
||||||
raise Exception("unexpectedly reached end of file")
|
raise EOFError("unexpectedly reached end of file")
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
|
||||||
|
@ -948,13 +1005,14 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
|
||||||
yield result
|
yield result
|
||||||
|
|
||||||
|
|
||||||
def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
|
def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False) -> None:
|
||||||
# Handle special case where the model's vocab size is not set
|
# Handle special case where the model's vocab size is not set
|
||||||
if params.n_vocab == -1:
|
if params.n_vocab == -1:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"The model's vocab size is set to -1 in params.json. Please update it manually.{f' Maybe {vocab.vocab_size}?' if hasattr(vocab, 'vocab_size') else ''}"
|
"The model's vocab size is set to -1 in params.json. Please update it manually."
|
||||||
|
+ (f" Maybe {vocab.vocab_size}?" if isinstance(vocab, Vocab) else ""),
|
||||||
)
|
)
|
||||||
if isinstance(vocab, NoVocab):
|
if not isinstance(vocab, Vocab):
|
||||||
return # model has no vocab
|
return # model has no vocab
|
||||||
|
|
||||||
# Check for a vocab size mismatch
|
# Check for a vocab size mismatch
|
||||||
|
@ -979,11 +1037,11 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
|
||||||
if vocab.vocab_size < params.n_vocab:
|
if vocab.vocab_size < params.n_vocab:
|
||||||
msg += " Add the --pad-vocab option and try again."
|
msg += " Add the --pad-vocab option and try again."
|
||||||
|
|
||||||
raise Exception(msg)
|
raise ValueError(msg)
|
||||||
|
|
||||||
|
|
||||||
class OutputFile:
|
class OutputFile:
|
||||||
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
|
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
|
||||||
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
|
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
|
||||||
|
|
||||||
def add_meta_arch(self, params: Params) -> None:
|
def add_meta_arch(self, params: Params) -> None:
|
||||||
|
@ -1034,8 +1092,6 @@ class OutputFile:
|
||||||
self.gguf.add_file_type(params.ftype)
|
self.gguf.add_file_type(params.ftype)
|
||||||
|
|
||||||
def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
|
def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
|
||||||
assert not isinstance(vocab, NoVocab)
|
|
||||||
|
|
||||||
tokens = []
|
tokens = []
|
||||||
scores = []
|
scores = []
|
||||||
toktypes = []
|
toktypes = []
|
||||||
|
@ -1135,7 +1191,7 @@ class OutputFile:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def write_all(
|
def write_all(
|
||||||
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
|
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
|
||||||
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
|
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
|
||||||
pad_vocab: bool = False,
|
pad_vocab: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
@ -1145,11 +1201,11 @@ class OutputFile:
|
||||||
|
|
||||||
# meta data
|
# meta data
|
||||||
of.add_meta_arch(params)
|
of.add_meta_arch(params)
|
||||||
if isinstance(vocab, NoVocab):
|
if isinstance(vocab, Vocab):
|
||||||
of.gguf.add_tokenizer_model(vocab.tokenizer_model)
|
|
||||||
else:
|
|
||||||
of.add_meta_vocab(vocab)
|
of.add_meta_vocab(vocab)
|
||||||
of.add_meta_special_vocab(svocab)
|
of.add_meta_special_vocab(svocab)
|
||||||
|
else: # NoVocab
|
||||||
|
of.gguf.add_tokenizer_model(vocab.tokenizer_model)
|
||||||
|
|
||||||
# tensor info
|
# tensor info
|
||||||
for name, lazy_tensor in model.items():
|
for name, lazy_tensor in model.items():
|
||||||
|
@ -1176,7 +1232,7 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
|
||||||
|
|
||||||
name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
|
name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
|
||||||
|
|
||||||
raise Exception(f"Unexpected combination of types: {name_to_type}")
|
raise ValueError(f"Unexpected combination of types: {name_to_type}")
|
||||||
|
|
||||||
|
|
||||||
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
|
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
|
||||||
|
@ -1186,7 +1242,7 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM
|
||||||
|
|
||||||
def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel:
|
def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel:
|
||||||
tmap = gguf.TensorNameMap(ARCH, params.n_layer)
|
tmap = gguf.TensorNameMap(ARCH, params.n_layer)
|
||||||
should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
|
should_skip = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
|
||||||
|
|
||||||
tmp = model
|
tmp = model
|
||||||
|
|
||||||
|
@ -1213,8 +1269,7 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
|
||||||
if skip_unknown:
|
if skip_unknown:
|
||||||
print(f"Unexpected tensor name: {name} - skipping")
|
print(f"Unexpected tensor name: {name} - skipping")
|
||||||
continue
|
continue
|
||||||
else:
|
raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
|
||||||
raise Exception(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
|
|
||||||
|
|
||||||
if tensor_type in should_skip:
|
if tensor_type in should_skip:
|
||||||
print(f"skipping tensor {name_new}")
|
print(f"skipping tensor {name_new}")
|
||||||
|
@ -1231,7 +1286,7 @@ def nth_multifile_path(path: Path, n: int) -> Path | None:
|
||||||
the nth path in the model.
|
the nth path in the model.
|
||||||
'''
|
'''
|
||||||
# Support the following patterns:
|
# Support the following patterns:
|
||||||
patterns: list[tuple[str, str]] = [
|
patterns = [
|
||||||
# - x.00.pth, x.01.pth, etc.
|
# - x.00.pth, x.01.pth, etc.
|
||||||
(r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
|
(r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
|
||||||
# - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
|
# - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
|
||||||
|
@ -1277,9 +1332,9 @@ def load_some_model(path: Path) -> ModelPlus:
|
||||||
globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
|
globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
|
||||||
files = [file for glob in globs for file in path.glob(glob)]
|
files = [file for glob in globs for file in path.glob(glob)]
|
||||||
if not files:
|
if not files:
|
||||||
raise Exception(f"Can't find model in directory {path}")
|
raise FileNotFoundError(f"Can't find model in directory {path}")
|
||||||
if len(files) > 1:
|
if len(files) > 1:
|
||||||
raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}")
|
raise ValueError(f"Found multiple models in {path}, not sure which to pick: {files}")
|
||||||
path = files[0]
|
path = files[0]
|
||||||
|
|
||||||
paths = find_multifile_paths(path)
|
paths = find_multifile_paths(path)
|
||||||
|
@ -1293,36 +1348,14 @@ def load_some_model(path: Path) -> ModelPlus:
|
||||||
|
|
||||||
|
|
||||||
class VocabFactory:
|
class VocabFactory:
|
||||||
_FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"}
|
_VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, LlamaHfVocab]
|
||||||
|
|
||||||
def __init__(self, path: Path):
|
def __init__(self, path: Path):
|
||||||
self.path = path
|
self.path = path
|
||||||
self.file_paths = self._detect_files()
|
|
||||||
print(f"Found vocab files: {self.file_paths}")
|
|
||||||
|
|
||||||
def _detect_files(self) -> dict[str, Path | None]:
|
def _create_special_vocab(self, vocab: BaseVocab, model_parent_path: Path) -> gguf.SpecialVocab:
|
||||||
def locate(file: str) -> Path | None:
|
|
||||||
if (path := self.path / file).exists():
|
|
||||||
return path
|
|
||||||
if (path := self.path.parent / file).exists():
|
|
||||||
return path
|
|
||||||
return None
|
|
||||||
|
|
||||||
return {vt: locate(f) for vt, f in self._FILES.items()}
|
|
||||||
|
|
||||||
def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]:
|
|
||||||
for vtype in vocab_types:
|
|
||||||
try:
|
|
||||||
path = self.file_paths[vtype]
|
|
||||||
except KeyError:
|
|
||||||
raise ValueError(f"Unsupported vocabulary type {vtype}") from None
|
|
||||||
if path is not None:
|
|
||||||
return vtype, path
|
|
||||||
raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
|
|
||||||
|
|
||||||
def _create_special_vocab(self, vocab: Vocab, model_parent_path: Path) -> gguf.SpecialVocab:
|
|
||||||
load_merges = vocab.name == "bpe"
|
load_merges = vocab.name == "bpe"
|
||||||
n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
|
n_vocab = vocab.vocab_size if isinstance(vocab, Vocab) else None
|
||||||
return gguf.SpecialVocab(
|
return gguf.SpecialVocab(
|
||||||
model_parent_path,
|
model_parent_path,
|
||||||
load_merges=load_merges,
|
load_merges=load_merges,
|
||||||
|
@ -1331,27 +1364,29 @@ class VocabFactory:
|
||||||
)
|
)
|
||||||
|
|
||||||
def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
|
def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
|
||||||
vocab_type, path = self._select_file(vocab_types)
|
vocab_classes: dict[str, type[Vocab]] = {cls.name: cls for cls in self._VOCAB_CLASSES}
|
||||||
print(f"Loading vocab file {path!r}, type {vocab_type!r}")
|
selected_vocabs: dict[str, type[Vocab]] = {}
|
||||||
|
for vtype in vocab_types:
|
||||||
|
try:
|
||||||
|
selected_vocabs[vtype] = vocab_classes[vtype]
|
||||||
|
except KeyError:
|
||||||
|
raise ValueError(f"Unsupported vocabulary type {vtype}") from None
|
||||||
|
|
||||||
added_tokens_path = path.parent / "added_tokens.json"
|
for vtype, cls in selected_vocabs.items():
|
||||||
if vocab_type == "bpe":
|
try:
|
||||||
return BpeVocab(
|
vocab = cls(self.path)
|
||||||
path, added_tokens_path if added_tokens_path.exists() else None
|
break
|
||||||
)
|
except FileNotFoundError:
|
||||||
if vocab_type == "spm":
|
pass # ignore unavailable tokenizers
|
||||||
return SentencePieceVocab(
|
else:
|
||||||
path, added_tokens_path if added_tokens_path.exists() else None
|
raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
|
||||||
)
|
|
||||||
if vocab_type == "hfft":
|
|
||||||
return HfVocab(
|
|
||||||
path.parent, added_tokens_path if added_tokens_path.exists() else None
|
|
||||||
)
|
|
||||||
raise ValueError(vocab_type)
|
|
||||||
|
|
||||||
def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
|
print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
|
||||||
vocab: Vocab
|
return vocab
|
||||||
if len(vocab_types) == 1 and "no_vocab" in vocab_types:
|
|
||||||
|
def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
|
||||||
|
vocab: BaseVocab
|
||||||
|
if vocab_types is None:
|
||||||
vocab = NoVocab()
|
vocab = NoVocab()
|
||||||
else:
|
else:
|
||||||
vocab = self._create_vocab_by_path(vocab_types)
|
vocab = self._create_vocab_by_path(vocab_types)
|
||||||
|
@ -1408,10 +1443,8 @@ def main(args_in: list[str] | None = None) -> None:
|
||||||
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
|
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
|
||||||
|
|
||||||
args = parser.parse_args(args_in)
|
args = parser.parse_args(args_in)
|
||||||
if args.no_vocab:
|
if args.no_vocab and args.vocab_only:
|
||||||
if args.vocab_only:
|
raise ValueError("--vocab-only does not make sense with --no-vocab")
|
||||||
raise ValueError("no need to specify --vocab-only if using --no-vocab")
|
|
||||||
args.vocab_type = "no_vocab"
|
|
||||||
|
|
||||||
if args.dump_single:
|
if args.dump_single:
|
||||||
model_plus = lazy_load_file(args.model)
|
model_plus = lazy_load_file(args.model)
|
||||||
|
@ -1433,10 +1466,12 @@ def main(args_in: list[str] | None = None) -> None:
|
||||||
params = Params.load(model_plus)
|
params = Params.load(model_plus)
|
||||||
if params.n_ctx == -1:
|
if params.n_ctx == -1:
|
||||||
if args.ctx is None:
|
if args.ctx is None:
|
||||||
raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
|
msg = """\
|
||||||
"Please specify one with --ctx:\n"
|
The model doesn't have a context size, and you didn't specify one with --ctx
|
||||||
" - LLaMA v1: --ctx 2048\n"
|
Please specify one with --ctx:
|
||||||
" - LLaMA v2: --ctx 4096\n")
|
- LLaMA v1: --ctx 2048
|
||||||
|
- LLaMA v2: --ctx 4096"""
|
||||||
|
parser.error(textwrap.dedent(msg))
|
||||||
params.n_ctx = args.ctx
|
params.n_ctx = args.ctx
|
||||||
|
|
||||||
if args.outtype:
|
if args.outtype:
|
||||||
|
@ -1451,9 +1486,11 @@ def main(args_in: list[str] | None = None) -> None:
|
||||||
model_parent_path = model_plus.paths[0].parent
|
model_parent_path = model_plus.paths[0].parent
|
||||||
vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
|
vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
|
||||||
vocab_factory = VocabFactory(vocab_path)
|
vocab_factory = VocabFactory(vocab_path)
|
||||||
vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type.split(","), model_parent_path)
|
vocab_types = None if args.no_vocab else args.vocab_type.split(",")
|
||||||
|
vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)
|
||||||
|
|
||||||
if args.vocab_only:
|
if args.vocab_only:
|
||||||
|
assert isinstance(vocab, Vocab)
|
||||||
if not args.outfile:
|
if not args.outfile:
|
||||||
raise ValueError("need --outfile if using --vocab-only")
|
raise ValueError("need --outfile if using --vocab-only")
|
||||||
outfile = args.outfile
|
outfile = args.outfile
|
||||||
|
|
|
@ -6,7 +6,7 @@ for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com
|
||||||
|
|
||||||
The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.
|
The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.
|
||||||
|
|
||||||
Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using MobiVLM as an example, the different conversion step will be shown.
|
Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
Build with cmake or run `make llava-cli` to build it.
|
Build with cmake or run `make llava-cli` to build it.
|
||||||
|
@ -36,7 +36,7 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
|
||||||
python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
|
python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
|
||||||
```
|
```
|
||||||
|
|
||||||
3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** the arg is `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
|
3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
python ./examples/llava/convert-image-encoder-to-gguf \
|
python ./examples/llava/convert-image-encoder-to-gguf \
|
||||||
|
@ -78,7 +78,7 @@ cd examples/llava/android/build_64
|
||||||
### run on Android
|
### run on Android
|
||||||
refer to `android/adb_run.sh`, modify resources' `name` and `path`
|
refer to `android/adb_run.sh`, modify resources' `name` and `path`
|
||||||
|
|
||||||
## some result on Android with `Snapdragon 888` chip
|
## Some result on Android with `Snapdragon 888` chip
|
||||||
### case 1
|
### case 1
|
||||||
**input**
|
**input**
|
||||||
```sh
|
```sh
|
||||||
|
@ -109,7 +109,6 @@ llama_print_timings: total time = 34731.93 ms
|
||||||
--image /data/local/tmp/cat.jpeg \
|
--image /data/local/tmp/cat.jpeg \
|
||||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
|
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
|
||||||
```
|
```
|
||||||
|
|
||||||
**output**
|
**output**
|
||||||
```sh
|
```sh
|
||||||
encode_image_with_clip: image encoded in 21149.51 ms by CLIP ( 146.87 ms per image patch)
|
encode_image_with_clip: image encoded in 21149.51 ms by CLIP ( 146.87 ms per image patch)
|
||||||
|
@ -121,12 +120,82 @@ llama_print_timings: eval time = 1279.03 ms / 18 runs ( 71.06 m
|
||||||
llama_print_timings: total time = 34570.79 ms
|
llama_print_timings: total time = 34570.79 ms
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Some results on Android with `Snapdragon 778G` chip
|
||||||
|
### MobileVLM-1.7B case
|
||||||
|
#### llava-cli release-b2005
|
||||||
|
**input**
|
||||||
|
```sh
|
||||||
|
/data/local/tmp/llava-cli \
|
||||||
|
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||||
|
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||||
|
-t 4 \
|
||||||
|
--image /data/local/tmp/many_llamas.jpeg \
|
||||||
|
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:"
|
||||||
|
```
|
||||||
|
**output**
|
||||||
|
```sh
|
||||||
|
encode_image_with_clip: image encoded in 18728.52 ms by CLIP ( 130.06 ms per image patch)
|
||||||
|
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
|
||||||
|
user_prompt: \nWhat's that? ASSISTANT:
|
||||||
|
|
||||||
|
A group of llamas are standing in a green pasture.
|
||||||
|
|
||||||
|
llama_print_timings: load time = 20357.33 ms
|
||||||
|
llama_print_timings: sample time = 2.96 ms / 14 runs ( 0.21 ms per token, 4734.53 tokens per second)
|
||||||
|
llama_print_timings: prompt eval time = 8119.49 ms / 191 tokens ( 42.51 ms per token, 23.52 tokens per second)
|
||||||
|
llama_print_timings: eval time = 1005.75 ms / 14 runs ( 71.84 ms per token, 13.92 tokens per second)
|
||||||
|
llama_print_timings: total time = 28038.34 ms / 205 tokens
|
||||||
|
```
|
||||||
|
#### llava-cli latest-version
|
||||||
|
**input**
|
||||||
|
|
||||||
|
Just the same as above.
|
||||||
|
|
||||||
|
**output** (seems to be much slower)
|
||||||
|
```sh
|
||||||
|
encode_image_with_clip: image embedding created: 144 tokens
|
||||||
|
|
||||||
|
encode_image_with_clip: image encoded in 288268.88 ms by CLIP ( 2001.87 ms per image patch)
|
||||||
|
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
|
||||||
|
user_prompt: \nWhat's that? ASSISTANT:
|
||||||
|
|
||||||
|
It is a group of sheep standing together in a grass field.
|
||||||
|
|
||||||
|
llama_print_timings: load time = 818120.91 ms
|
||||||
|
llama_print_timings: sample time = 3.44 ms / 14 runs ( 0.25 ms per token, 4067.40 tokens per second)
|
||||||
|
llama_print_timings: prompt eval time = 529274.69 ms / 191 tokens ( 2771.07 ms per token, 0.36 tokens per second)
|
||||||
|
llama_print_timings: eval time = 43894.02 ms / 13 runs ( 3376.46 ms per token, 0.30 tokens per second)
|
||||||
|
llama_print_timings: total time = 865441.76 ms / 204 tokens
|
||||||
|
```
|
||||||
|
### MobileVLM_V2-1.7B case
|
||||||
|
#### llava-cli release-b2005
|
||||||
|
**input**
|
||||||
|
|
||||||
|
Just the same as above.
|
||||||
|
|
||||||
|
**output**
|
||||||
|
```sh
|
||||||
|
encode_image_with_clip: image encoded in 20609.61 ms by CLIP ( 143.12 ms per image patch)
|
||||||
|
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
|
||||||
|
user_prompt: \nWhat's that? ASSISTANT:
|
||||||
|
|
||||||
|
This image captures a lively scene of 20 llamas in motion on an expansive, grassy field. The llama is scattered across the landscape with some standing and others sitting down as if taking rest or observing their surroundings from different vantage points within this verdant setting.
|
||||||
|
|
||||||
|
The background offers glimpses into a picturesque town nestled amidst hills under an overcast sky, adding depth to the scene while also emphasizing that distance between these llama and human-made structures like houses or roads in which they roam freely without any barriers around them. The image is framed by text at both right angles on white backgrounds against a contrasting blue backdrop with green foliage, further drawing attention to the llamas amidst their natural habitat while also inviting viewers into this picturesque landscape within town limits of Alta Llama
|
||||||
|
|
||||||
|
llama_print_timings: load time = 22406.77 ms
|
||||||
|
llama_print_timings: sample time = 49.26 ms / 186 runs ( 0.26 ms per token, 3776.27 tokens per second)
|
||||||
|
llama_print_timings: prompt eval time = 9044.54 ms / 191 tokens ( 47.35 ms per token, 21.12 tokens per second)
|
||||||
|
llama_print_timings: eval time = 14497.49 ms / 186 runs ( 77.94 ms per token, 12.83 tokens per second)
|
||||||
|
llama_print_timings: total time = 44411.01 ms / 377 tokens
|
||||||
|
```
|
||||||
|
|
||||||
## Orin compile and run
|
## Orin compile and run
|
||||||
### compile
|
### compile
|
||||||
```sh
|
```sh
|
||||||
make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
|
make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
|
||||||
```
|
```
|
||||||
|
|
||||||
### run on Orin
|
### run on Orin
|
||||||
### case 1
|
### case 1
|
||||||
**input**
|
**input**
|
||||||
|
@ -175,8 +244,121 @@ llama_print_timings: eval time = 166.65 ms / 11 runs ( 15.15 m
|
||||||
llama_print_timings: total time = 1365.47 ms / 243 tokens
|
llama_print_timings: total time = 1365.47 ms / 243 tokens
|
||||||
```
|
```
|
||||||
|
|
||||||
## Minor shortcomings
|
## Running on Intel(R) Core(TM) i7-10750H
|
||||||
The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quickly, we uniformly modified `clip_n_patches` function to a quarter. when counting the time consumption, the calculated time will be 4 times bigger than the real cost.
|
### Operating system
|
||||||
|
Ubuntu 22.04
|
||||||
|
### compile
|
||||||
|
```sh
|
||||||
|
make -j32
|
||||||
|
```
|
||||||
|
### MobileVLM-1.7B case
|
||||||
|
**input**
|
||||||
|
```sh
|
||||||
|
-m /path/to/ggml-model-q4_k.gguf \
|
||||||
|
--mmproj /path/to/mmproj-model-f16.gguf \
|
||||||
|
--image /path/to/many_llamas.jpeg \
|
||||||
|
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:"
|
||||||
|
```
|
||||||
|
**output**
|
||||||
|
```sh
|
||||||
|
encode_image_with_clip: image embedding created: 144 tokens
|
||||||
|
|
||||||
|
encode_image_with_clip: image encoded in 2730.94 ms by CLIP ( 18.96 ms per image patch)
|
||||||
|
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
|
||||||
|
user_prompt: \nWhat's that?ASSISTANT:
|
||||||
|
|
||||||
|
A group of llamas are walking together in a field.
|
||||||
|
|
||||||
|
llama_print_timings: load time = 5506.60 ms
|
||||||
|
llama_print_timings: sample time = 0.44 ms / 13 runs ( 0.03 ms per token, 29545.45 tokens per second)
|
||||||
|
llama_print_timings: prompt eval time = 2031.58 ms / 190 tokens ( 10.69 ms per token, 93.52 tokens per second)
|
||||||
|
llama_print_timings: eval time = 438.92 ms / 12 runs ( 36.58 ms per token, 27.34 tokens per second)
|
||||||
|
llama_print_timings: total time = 5990.25 ms / 202 tokens
|
||||||
|
```
|
||||||
|
|
||||||
|
### MobileVLM_V2-1.7B case
|
||||||
|
**input**
|
||||||
|
|
||||||
|
Just the same as above.
|
||||||
|
|
||||||
|
**output**
|
||||||
|
```sh
|
||||||
|
encode_image_with_clip: image embedding created: 144 tokens
|
||||||
|
|
||||||
|
encode_image_with_clip: image encoded in 3223.89 ms by CLIP ( 22.39 ms per image patch)
|
||||||
|
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
|
||||||
|
user_prompt: \nWhat's that?ASSISTANT:
|
||||||
|
|
||||||
|
The image captures a tranquil scene in a park, where a group of approximately 20 llamas are gathered. The llamas, a mix of white and black, are standing in a line, their black and white patterns contrasting with the lush green grass of the park. The lamas are arranged in a line, suggesting a social order.
|
||||||
|
|
||||||
|
The park itself is lush and green, with trees dotting the landscape in the background. A sign reading "Llamas Tico Ana" is also visible in the image, possibly indicating the location or the breed of the llamas. The image seems to be taken from a distance, providing a wide view of the scene and the surrounding environment.
|
||||||
|
|
||||||
|
The llamas' positions relative to each other, the sign, and the trees create a harmonious composition. The image does not contain any discernible text. The overall scene is one of peace and natural beauty, with the llamas in their natural habitat, surrounded by the vibrant colors and lush greenery of the park.
|
||||||
|
|
||||||
|
llama_print_timings: load time = 6642.61 ms
|
||||||
|
llama_print_timings: sample time = 8.15 ms / 223 runs ( 0.04 ms per token, 27358.61 tokens per second)
|
||||||
|
llama_print_timings: prompt eval time = 2475.07 ms / 190 tokens ( 13.03 ms per token, 76.77 tokens per second)
|
||||||
|
llama_print_timings: eval time = 8760.60 ms / 222 runs ( 39.46 ms per token, 25.34 tokens per second)
|
||||||
|
llama_print_timings: total time = 15513.95 ms / 412 tokens
|
||||||
|
```
|
||||||
|
|
||||||
|
## Run on Intel(R) Core(TM) Ultra7 115H
|
||||||
|
### Operating system
|
||||||
|
Windows 11
|
||||||
|
### compile
|
||||||
|
```sh
|
||||||
|
make -j32
|
||||||
|
```
|
||||||
|
### MobileVLM-1.7B case
|
||||||
|
**input**
|
||||||
|
```sh
|
||||||
|
-m /path/to/ggml-model-q4_k.gguf \
|
||||||
|
--mmproj /path/to/tmp/mmproj-model-f16.gguf \
|
||||||
|
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:"
|
||||||
|
```
|
||||||
|
**output**
|
||||||
|
```sh
|
||||||
|
encode_image_with_clip: image encoded in 4902.81 ms by CLIP ( 34.05 ms per image patch)
|
||||||
|
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
|
||||||
|
user_prompt: \nWhat's that? ASSISTANT:
|
||||||
|
|
||||||
|
The image features a group of brown and white llamas standing in a grassy field.
|
||||||
|
|
||||||
|
llama_print_timings: load time = 7441.06 ms
|
||||||
|
llama_print_timings: sample time = 0.72 ms / 19 runs ( 0.04 ms per token, 26279.39 tokens per second)
|
||||||
|
llama_print_timings: prompt eval time = 2090.71 ms / 191 tokens ( 10.95 ms per token, 91.36 tokens per second)
|
||||||
|
llama_print_timings: eval time = 512.35 ms / 18 runs ( 28.46 ms per token, 35.13 tokens per second)
|
||||||
|
llama_print_timings: total time = 7987.23 ms / 209 tokens
|
||||||
|
```
|
||||||
|
|
||||||
|
### MobileVLM_V2-1.7B case
|
||||||
|
**input**
|
||||||
|
|
||||||
|
Just the same as above.
|
||||||
|
|
||||||
|
**output**
|
||||||
|
```sh
|
||||||
|
encode_image_with_clip: image encoded in 4682.44 ms by CLIP ( 32.52 ms per image patch)
|
||||||
|
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
|
||||||
|
user_prompt: \nWhat's that? ASSISTANT:
|
||||||
|
|
||||||
|
This image captures a lively scene of a group of 14 llamas in a grassy field. The llamas, with their distinctive black and white coats, are standing and walking in a line, seemingly engaged in a social activity. One
|
||||||
|
of them, possibly the first in the line, has its back turned, perhaps observing something in the distance.
|
||||||
|
|
||||||
|
The llama in the front of the line stands out due to its black and white coloring, which is quite unusual for llama patterns. The llama in the front also seems to be more aware of its surroundings, as it faces the camera, giving a sense of engagement with the viewer.
|
||||||
|
|
||||||
|
The image is taken from the side of the llama, providing a clear view of the llama in the front and its companions. The lameness in the llama in
|
||||||
|
front is not visible, indicating that it might not be the main focus of the photo.
|
||||||
|
|
||||||
|
The background of the image features a grassy field, with a fence and a tree visible in the distance. The tree appears to be bare, suggesting that it might be during a time of year when most trees are dormant or have shed their leaves.
|
||||||
|
|
||||||
|
|
||||||
|
llama_print_timings: load time = 7015.35 ms
|
||||||
|
llama_print_timings: sample time = 10.61 ms / 256 runs ( 0.04 ms per token, 24119.09 tokens per second)
|
||||||
|
llama_print_timings: prompt eval time = 2052.45 ms / 191 tokens ( 10.75 ms per token, 93.06 tokens per second)
|
||||||
|
llama_print_timings: eval time = 7259.43 ms / 255 runs ( 28.47 ms per token, 35.13 tokens per second)
|
||||||
|
llama_print_timings: total time = 14371.19 ms / 446 tokens
|
||||||
|
```
|
||||||
|
|
||||||
## TODO
|
## TODO
|
||||||
|
|
||||||
|
@ -191,5 +373,5 @@ The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quic
|
||||||
|
|
||||||
## contributor
|
## contributor
|
||||||
```sh
|
```sh
|
||||||
zhangjidong05, yangyang260, huyiming03, chenxiaotao03
|
zhangjidong05, yangyang260, huyiming03, chenxiaotao03, ZiangWu-77
|
||||||
```
|
```
|
||||||
|
|
|
@ -835,9 +835,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
|
mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
|
||||||
// weight ne = [3, 3, 2048, 1]
|
// weight ne = [3, 3, 2048, 1]
|
||||||
struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
|
struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
|
||||||
peg_0 = ggml_add(ctx0, peg_0, mlp_2);
|
|
||||||
peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
|
peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
|
||||||
peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
|
peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
|
||||||
|
mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
|
||||||
|
peg_0 = ggml_add(ctx0, peg_0, mlp_2);
|
||||||
peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
|
peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
|
||||||
embeddings = peg_0;
|
embeddings = peg_0;
|
||||||
}
|
}
|
||||||
|
@ -1755,7 +1756,7 @@ int clip_n_patches(const struct clip_ctx * ctx) {
|
||||||
|
|
||||||
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
|
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
|
||||||
|
|
||||||
if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
|
if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
|
||||||
n_patches /= 4;
|
n_patches /= 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -296,7 +296,9 @@ These options help improve the performance and memory usage of the LLaMA models.
|
||||||
|
|
||||||
### Batch Size
|
### Batch Size
|
||||||
|
|
||||||
- `-b N, --batch-size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
|
- `-b N, --batch-size N`: Set the batch size for prompt processing (default: `2048`). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
|
||||||
|
|
||||||
|
- `-ub N`, `--ubatch-size N`: physical maximum batch size. This is for pipeline parallelization. Default: `512`.
|
||||||
|
|
||||||
### Prompt Caching
|
### Prompt Caching
|
||||||
|
|
||||||
|
|
303
examples/server/bench/bench.py
Normal file
303
examples/server/bench/bench.py
Normal file
|
@ -0,0 +1,303 @@
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import signal
|
||||||
|
import socket
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
import traceback
|
||||||
|
from contextlib import closing
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import matplotlib
|
||||||
|
import matplotlib.dates
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
def main(args_in: list[str] | None = None) -> None:
    """Run one server benchmark end to end and export its results.

    Steps, in order:
      1. parse the command line;
      2. start the server (``start_server``) and run the k6 scenario
         (``start_benchmark``);
      3. copy every numeric value of the k6 summary (``k6-results.json``) into
         ``results.github.env``;
      4. interrupt the server and wait until its port is free again;
      5. if something is listening on port 9090, query it as a Prometheus
         server for a fixed list of ``llamacpp:*`` metrics and save each one as
         ``<metric>.json``, ``<metric>.jpg`` and ``<metric>.mermaid``;
      6. append the summary variables (``BENCH_RESULTS``, ``BENCH_ITERATIONS``,
         ``BENCH_GRAPH_TITLE``, ``BENCH_GRAPH_XLABEL``) to ``results.github.env``.

    Args:
        args_in: argument list handed to ``ArgumentParser.parse_args``; with
            ``None`` argparse falls back to ``sys.argv``.

    The process exits with status 1 when the server cannot be started or when
    no k6 results could be loaded.
    """
    parser = argparse.ArgumentParser(description="Start server benchmark scenario")
    parser.add_argument("--name", type=str, help="Bench name", required=True)
    parser.add_argument("--runner-label", type=str, help="Runner label", required=True)
    parser.add_argument("--branch", type=str, help="Branch name", default="detached")
    parser.add_argument("--commit", type=str, help="Commit name", default="dirty")
    parser.add_argument("--host", type=str, help="Server listen host", default="0.0.0.0")
    parser.add_argument("--port", type=int, help="Server listen port", default=8080)
    parser.add_argument("--model-path-prefix", type=str, help="Prefix where to store the model files", default="models")
    parser.add_argument("--n-prompts", type=int,
                        help="SERVER_BENCH_N_PROMPTS: total prompts to randomly select in the benchmark", required=True)
    parser.add_argument("--max-prompt-tokens", type=int,
                        help="SERVER_BENCH_MAX_PROMPT_TOKENS: maximum prompt tokens to filter out in the dataset",
                        required=True)
    parser.add_argument("--max-tokens", type=int,
                        help="SERVER_BENCH_MAX_CONTEXT: maximum context size of the completions request to filter out in the dataset: prompt + predicted tokens",
                        required=True)
    parser.add_argument("--hf-repo", type=str, help="Hugging Face model repository", required=True)
    parser.add_argument("--hf-file", type=str, help="Hugging Face model file", required=True)
    parser.add_argument("-ngl", "--n-gpu-layers", type=int, help="layers to the GPU for computation", required=True)
    parser.add_argument("--ctx-size", type=int, help="Set the size of the prompt context", required=True)
    parser.add_argument("--parallel", type=int, help="Set the number of slots for process requests", required=True)
    parser.add_argument("--batch-size", type=int, help="Set the batch size for prompt processing", required=True)
    parser.add_argument("--ubatch-size", type=int, help="physical maximum batch size", required=True)
    parser.add_argument("--scenario", type=str, help="Scenario to run", required=True)
    parser.add_argument("--duration", type=str, help="Bench scenario", required=True)

    args = parser.parse_args(args_in)

    start_time = time.time()

    # Start the server and performance scenario
    try:
        server_process = start_server(args)
    except Exception:
        print("bench: server start error :")
        traceback.print_exc(file=sys.stdout)
        sys.exit(1)

    # Bound before the try block so that a failing benchmark cannot leave them
    # undefined for the reporting code further down.
    data = None
    iterations = 0

    # start the benchmark
    try:
        start_benchmark(args)

        with open("results.github.env", 'w') as github_env:
            # parse output
            with open('k6-results.json', 'r') as bench_results:
                # Load JSON data from file
                data = json.load(bench_results)
                for metric_name in data['metrics']:
                    for metric_metric in data['metrics'][metric_name]:
                        value = data['metrics'][metric_name][metric_metric]
                        if isinstance(value, (float, int)):
                            value = round(value, 2)
                            data['metrics'][metric_name][metric_metric] = value
                            github_env.write(
                                f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n")
                iterations = data['root_group']['checks']['success completion']['passes']

    except Exception:
        print("bench: error :")
        traceback.print_exc(file=sys.stdout)

    # Stop the server
    if server_process:
        try:
            print(f"bench: shutting down server pid={server_process.pid} ...")
            if os.name == 'nt':
                interrupt = signal.CTRL_C_EVENT
            else:
                interrupt = signal.SIGINT
            server_process.send_signal(interrupt)
            server_process.wait(0.5)

        except subprocess.TimeoutExpired:
            print(f"server still alive after 500ms, force-killing pid={server_process.pid} ...")
            server_process.kill()  # SIGKILL
            server_process.wait()

        # Wait until the listen port is actually released.
        while is_server_listening(args.host, args.port):
            time.sleep(0.1)

    if data is None:
        # Without the k6 summary there is nothing to report; fail explicitly
        # instead of tripping over an unbound name below.
        print("bench: no k6 results available, aborting")
        sys.exit(1)

    title = (f"llama.cpp {args.name} on {args.runner_label}\n "
             f"duration={args.duration} {iterations} iterations")
    xlabel = (f"{args.hf_repo}/{args.hf_file}\n"
              f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size} pp={args.max_prompt_tokens} pp+tg={args.max_tokens}\n"
              f"branch={args.branch} commit={args.commit}")

    # Prometheus
    end_time = time.time()
    if is_server_listening("0.0.0.0", 9090):
        metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds',
                   'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred']

        for metric in metrics:
            resp = requests.get("http://localhost:9090/api/v1/query_range",
                                params={'query': 'llamacpp:' + metric, 'start': start_time, 'end': end_time, 'step': 2})

            # The raw response is kept on disk whatever its status.
            with open(f"{metric}.json", 'w') as metric_json:
                metric_json.write(resp.text)

            if resp.status_code != 200:
                print(f"bench: unable to extract prometheus metric {metric}: {resp.text}")
                continue

            metric_data = resp.json()
            result = metric_data['data']['result']
            if not result:
                # Nothing was scraped for this metric in the time window.
                print(f"bench: no prometheus samples for metric {metric}")
                continue

            values = result[0]['values']
            timestamps, metric_values = zip(*values)
            metric_values = [float(value) for value in metric_values]
            timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps]
            plt.figure(figsize=(16, 10), dpi=80)
            plt.plot(timestamps_dt, metric_values, label=metric)
            plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7)
            plt.yticks(fontsize=12, alpha=.7)

            ylabel = f"llamacpp:{metric}"
            plt.title(title,
                      fontsize=14, wrap=True)
            plt.grid(axis='both', alpha=.3)
            plt.ylabel(ylabel, fontsize=22)
            plt.xlabel(xlabel, fontsize=14, wrap=True)
            plt.gca().xaxis.set_major_locator(matplotlib.dates.MinuteLocator())
            plt.gca().xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m-%d %H:%M:%S"))
            plt.gcf().autofmt_xdate()

            # Remove borders
            plt.gca().spines["top"].set_alpha(0.0)
            plt.gca().spines["bottom"].set_alpha(0.3)
            plt.gca().spines["right"].set_alpha(0.0)
            plt.gca().spines["left"].set_alpha(0.3)

            # Save the plot as a jpg image
            plt.savefig(f'{metric}.jpg', dpi=60)
            plt.close()

            # Mermaid format in case images upload failed
            # NOTE(review): the nesting of the front-matter keys is reconstructed
            # from their names — confirm against the upstream file.
            chart_points = ', '.join(str(round(float(value), 2)) for value in metric_values)
            mermaid = "\n".join([
                "---",
                "config:",
                "    xyChart:",
                "        titleFontSize: 12",
                "        width: 900",
                "        height: 600",
                "    themeVariables:",
                "        xyChart:",
                '            titleColor: "#000000"',
                "---",
                "xychart-beta",
                f'    title "{title}"',
                f'    y-axis "llamacpp:{metric}"',
                f'    x-axis "llamacpp:{metric}" {int(min(timestamps))} --> {int(max(timestamps))}',
                f"    line [{chart_points}]",
                "",
            ])
            with open(f"{metric}.mermaid", 'w') as mermaid_f:
                mermaid_f.write(mermaid)

    # 140 chars max for commit status description
    bench_results = {
        "req": {
            "p90": data['metrics']["http_req_duration"]["p(90)"],
            "avg": data['metrics']["http_req_duration"]["avg"],
        },
        "pp": {
            "p90": data['metrics']["llamacpp_prompt_tokens"]["p(90)"],
            "avg": data['metrics']["llamacpp_prompt_tokens"]["avg"],
        },
        "tg": {
            "p90": data['metrics']["llamacpp_tokens_second"]["p(90)"],
            "avg": data['metrics']["llamacpp_tokens_second"]["avg"],
        },
    }
    with open("results.github.env", 'a') as github_env:
        github_env.write(f"BENCH_RESULTS={json.dumps(bench_results, indent=None, separators=(',', ':') )}\n")
        github_env.write(f"BENCH_ITERATIONS={iterations}\n")

        # One variable per line: flatten the multi-line plot labels.
        title = title.replace('\n', ' ')
        xlabel = xlabel.replace('\n', ' ')
        github_env.write(f"BENCH_GRAPH_TITLE={title}\n")
        github_env.write(f"BENCH_GRAPH_XLABEL={xlabel}\n")
|
||||||
|
|
||||||
|
|
||||||
|
def start_benchmark(args):
    """Run the k6 scenario and raise if it does not finish successfully.

    The k6 binary is ``k6`` unless the ``BENCH_K6_BIN_PATH`` environment
    variable names another one. The scenario receives its settings through the
    ``SERVER_BENCH_N_PROMPTS``, ``SERVER_BENCH_MAX_PROMPT_TOKENS`` and
    ``SERVER_BENCH_MAX_CONTEXT`` environment variables, and k6 is asked to
    export its summary to ``k6-results.json``.

    Args:
        args: parsed command line; reads ``scenario``, ``duration``,
            ``n_prompts``, ``parallel``, ``max_prompt_tokens`` and ``max_tokens``.

    Raises:
        Exception: if k6 cannot be launched or exits with a non-zero status.
    """
    k6_path = os.environ.get('BENCH_K6_BIN_PATH', 'k6')
    k6_cmd = [str(part) for part in [
        k6_path,
        'run', args.scenario,
        '--no-color',
        '--duration', args.duration,
        '--iterations', args.n_prompts,
        '--vus', args.parallel,
        '--summary-export', 'k6-results.json',
    ]]
    bench_env = {
        'SERVER_BENCH_N_PROMPTS': str(args.n_prompts),
        'SERVER_BENCH_MAX_PROMPT_TOKENS': str(args.max_prompt_tokens),
        'SERVER_BENCH_MAX_CONTEXT': str(args.max_tokens),
    }

    env_prefix = ' '.join(f"{key}={value}" for key, value in bench_env.items())
    print(f"bench: starting k6 with: {env_prefix} {' '.join(k6_cmd)}")

    # An argv list plus env= avoids the shell entirely: no quoting problems
    # with paths and no dependency on the POSIX "VAR=value command" syntax.
    try:
        k6_completed = subprocess.run(k6_cmd, env={**os.environ, **bench_env},
                                      stdout=sys.stdout, stderr=sys.stderr)
    except OSError as err:
        raise Exception("bench: unable to run k6") from err
    if k6_completed.returncode != 0:
        raise Exception("bench: unable to run k6")
|
||||||
|
|
||||||
|
|
||||||
|
def start_server(args):
    """Launch the server and block until its listen port accepts connections.

    The port is polled every 0.5 s, at most 20 times (40 when the
    ``GITHUB_ACTIONS`` environment variable is set).

    Args:
        args: parsed command line; ``host`` and ``port`` are read here, the
            rest is consumed by ``start_server_background``.

    Returns:
        The process object returned by ``start_server_background``.

    Raises:
        AssertionError: if the port is still closed after the last attempt.
            The spawned process is killed first so it is not left behind.
    """
    server_process = start_server_background(args)

    max_attempts = 20
    if 'GITHUB_ACTIONS' in os.environ:
        max_attempts *= 2

    attempts = 0
    while not is_server_listening(args.host, args.port):
        attempts += 1
        if attempts > max_attempts:
            # Do not leak the child process when giving up.
            server_process.kill()
            server_process.wait()
            # Raised explicitly: an `assert` statement is removed under -O.
            raise AssertionError("server not started")
        print("bench: waiting for server to start ...")
        time.sleep(0.5)

    print("bench: server started.")
    return server_process
|
||||||
|
|
||||||
|
|
||||||
|
def start_server_background(args):
    """Spawn the server as a child process and forward its output.

    The binary is ``../../../build/bin/server`` unless the
    ``LLAMA_SERVER_BIN_PATH`` environment variable overrides it. The directory
    that will hold the model file is created if missing. Two threads copy the
    child's stdout and stderr to this process's stdout and stderr.

    Args:
        args: parsed command line; reads ``host``, ``port``,
            ``model_path_prefix``, ``hf_repo``, ``hf_file``, ``n_gpu_layers``,
            ``ctx_size``, ``parallel``, ``batch_size``, ``ubatch_size`` and
            ``max_tokens``.

    Returns:
        The ``subprocess.Popen`` object of the started server.
    """
    # Start the server
    server_path = os.environ.get('LLAMA_SERVER_BIN_PATH', '../../../build/bin/server')

    model_file = args.model_path_prefix + os.path.sep + args.hf_file
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(os.path.dirname(model_file), exist_ok=True)

    server_args = [
        '--host', args.host,
        '--port', args.port,
        '--model', model_file,
        '--hf-repo', args.hf_repo,
        '--hf-file', args.hf_file,
        '--n-gpu-layers', args.n_gpu_layers,
        '--ctx-size', args.ctx_size,
        '--parallel', args.parallel,
        '--batch-size', args.batch_size,
        '--ubatch-size', args.ubatch_size,
        '--n-predict', args.max_tokens * 2,
        '--defrag-thold', "0.1",
        '--cont-batching',
        '--metrics',
        '--log-format', "text",
    ]
    command = [str(arg) for arg in [server_path, *server_args]]
    print(f"bench: starting server with: {' '.join(command)}")

    server_process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)

    def server_log(in_stream, out_stream):
        # errors='replace' keeps the forwarding thread alive on bytes that are
        # not valid UTF-8; if it died, the pipe would stop being drained.
        for line in iter(in_stream.readline, b''):
            print(line.decode('utf-8', errors='replace'), end='', file=out_stream)

    thread_stdout = threading.Thread(target=server_log, args=(server_process.stdout, sys.stdout))
    thread_stdout.start()
    thread_stderr = threading.Thread(target=server_log, args=(server_process.stderr, sys.stderr))
    thread_stderr.start()

    return server_process
|
||||||
|
|
||||||
|
|
||||||
|
def is_server_listening(server_fqdn, server_port):
    """Tell whether a TCP connection to ``server_fqdn:server_port`` succeeds.

    A message is printed each time the connection succeeds.

    Args:
        server_fqdn: host name or IPv4 address to connect to.
        server_port: TCP port number.

    Returns:
        True if ``connect_ex`` reports success (0), False otherwise.
    """
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
        # connect_ex returns an error code instead of raising on refusal.
        result = sock.connect_ex((server_fqdn, server_port))
        _is_server_listening = result == 0
        if _is_server_listening:
            print(f"server is listening on {server_fqdn}:{server_port}...")
        return _is_server_listening
|
||||||
|
|
||||||
|
|
||||||
|
def escape_metric_name(metric_name):
    """Upper-case ``metric_name`` and replace every character outside A-Z / 0-9 with ``_``."""
    return re.sub('[^A-Z0-9]', '_', metric_name.upper())
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run the benchmark with the process's command-line arguments.
if __name__ == '__main__':
    main()
|
9
examples/server/bench/prometheus.yml
Normal file
9
examples/server/bench/prometheus.yml
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
global:
|
||||||
|
scrape_interval: 10s
|
||||||
|
external_labels:
|
||||||
|
llamacpp: 'server'
|
||||||
|
|
||||||
|
scrape_configs:
|
||||||
|
- job_name: 'llama.cpp server'
|
||||||
|
static_configs:
|
||||||
|
- targets: ['localhost:8080']
|
2
examples/server/bench/requirements.txt
Normal file
2
examples/server/bench/requirements.txt
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
matplotlib
|
||||||
|
requests
|
|
@ -3566,6 +3566,7 @@ int main(int argc, char ** argv) {
|
||||||
sigemptyset (&sigint_action.sa_mask);
|
sigemptyset (&sigint_action.sa_mask);
|
||||||
sigint_action.sa_flags = 0;
|
sigint_action.sa_flags = 0;
|
||||||
sigaction(SIGINT, &sigint_action, NULL);
|
sigaction(SIGINT, &sigint_action, NULL);
|
||||||
|
sigaction(SIGTERM, &sigint_action, NULL);
|
||||||
#elif defined (_WIN32)
|
#elif defined (_WIN32)
|
||||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||||
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
|
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
|
||||||
|
|
|
@ -1114,7 +1114,10 @@ def start_server_background(context):
|
||||||
server_args.append('--verbose')
|
server_args.append('--verbose')
|
||||||
if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
|
if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
|
||||||
server_args.extend(['--log-format', "text"])
|
server_args.extend(['--log-format', "text"])
|
||||||
print(f"starting server with: {context.server_path} {server_args}")
|
|
||||||
|
args = [str(arg) for arg in [context.server_path, *server_args]]
|
||||||
|
print(f"bench: starting server with: {' '.join(args)}")
|
||||||
|
|
||||||
flags = 0
|
flags = 0
|
||||||
if 'nt' == os.name:
|
if 'nt' == os.name:
|
||||||
flags |= subprocess.DETACHED_PROCESS
|
flags |= subprocess.DETACHED_PROCESS
|
||||||
|
@ -1130,16 +1133,14 @@ def start_server_background(context):
|
||||||
[str(arg) for arg in [context.server_path, *server_args]],
|
[str(arg) for arg in [context.server_path, *server_args]],
|
||||||
**pkwargs)
|
**pkwargs)
|
||||||
|
|
||||||
def log_stdout(process):
|
def server_log(in_stream, out_stream):
|
||||||
for line in iter(process.stdout.readline, b''):
|
for line in iter(in_stream.readline, b''):
|
||||||
print(line.decode('utf-8'), end='')
|
print(line.decode('utf-8'), end='', file=out_stream)
|
||||||
thread_stdout = threading.Thread(target=log_stdout, args=(context.server_process,))
|
|
||||||
|
thread_stdout = threading.Thread(target=server_log, args=(context.server_process.stdout, sys.stdout))
|
||||||
thread_stdout.start()
|
thread_stdout.start()
|
||||||
|
|
||||||
def log_stderr(process):
|
thread_stderr = threading.Thread(target=server_log, args=(context.server_process.stderr, sys.stderr))
|
||||||
for line in iter(process.stderr.readline, b''):
|
|
||||||
print(line.decode('utf-8'), end='', file=sys.stderr)
|
|
||||||
thread_stderr = threading.Thread(target=log_stderr, args=(context.server_process,))
|
|
||||||
thread_stderr.start()
|
thread_stderr.start()
|
||||||
|
|
||||||
print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")
|
print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")
|
||||||
|
|
13
flake.nix
13
flake.nix
|
@ -145,6 +145,7 @@
|
||||||
# the same path you would with an overlay.
|
# the same path you would with an overlay.
|
||||||
legacyPackages = {
|
legacyPackages = {
|
||||||
llamaPackages = pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
llamaPackages = pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
||||||
|
llamaPackagesWindows = pkgs.pkgsCross.mingwW64.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
||||||
llamaPackagesCuda = pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
llamaPackagesCuda = pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
||||||
llamaPackagesRocm = pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
llamaPackagesRocm = pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
||||||
};
|
};
|
||||||
|
@ -155,6 +156,7 @@
|
||||||
{
|
{
|
||||||
default = config.legacyPackages.llamaPackages.llama-cpp;
|
default = config.legacyPackages.llamaPackages.llama-cpp;
|
||||||
vulkan = config.packages.default.override { useVulkan = true; };
|
vulkan = config.packages.default.override { useVulkan = true; };
|
||||||
|
windows = config.legacyPackages.llamaPackagesWindows.llama-cpp;
|
||||||
}
|
}
|
||||||
// lib.optionalAttrs pkgs.stdenv.isLinux {
|
// lib.optionalAttrs pkgs.stdenv.isLinux {
|
||||||
opencl = config.packages.default.override { useOpenCL = true; };
|
opencl = config.packages.default.override { useOpenCL = true; };
|
||||||
|
@ -168,9 +170,14 @@
|
||||||
};
|
};
|
||||||
|
|
||||||
# Packages exposed in `.#checks` will be built by the CI and by
|
# Packages exposed in `.#checks` will be built by the CI and by
|
||||||
# `nix flake check`. Currently we expose all packages, but we could
|
# `nix flake check`.
|
||||||
# make more granular choices
|
#
|
||||||
checks = config.packages;
|
# We could test all outputs e.g. as `checks = config.packages`.
|
||||||
|
#
|
||||||
|
# TODO: Build more once https://github.com/ggerganov/llama.cpp/issues/6346 has been addressed
|
||||||
|
checks = {
|
||||||
|
inherit (config.packages) default vulkan;
|
||||||
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
|
@ -2951,7 +2951,7 @@ namespace dpct
|
||||||
#include "ggml-common.h"
|
#include "ggml-common.h"
|
||||||
|
|
||||||
static int g_ggml_sycl_debug=0;
|
static int g_ggml_sycl_debug=0;
|
||||||
#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0)
|
#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) fprintf(stderr, __VA_ARGS__);}while(0)
|
||||||
|
|
||||||
#define CHECK_TRY_ERROR(expr) \
|
#define CHECK_TRY_ERROR(expr) \
|
||||||
[&]() { \
|
[&]() { \
|
||||||
|
@ -12851,6 +12851,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_sycl_print_sycl_devices() {
|
void ggml_backend_sycl_print_sycl_devices() {
|
||||||
|
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
|
||||||
int device_count = dpct::dev_mgr::instance().device_count();
|
int device_count = dpct::dev_mgr::instance().device_count();
|
||||||
std::map<std::string, size_t> DeviceNums;
|
std::map<std::string, size_t> DeviceNums;
|
||||||
fprintf(stderr, "found %d SYCL devices:\n", device_count);
|
fprintf(stderr, "found %d SYCL devices:\n", device_count);
|
||||||
|
@ -12908,7 +12909,9 @@ static void ggml_init_sycl() try {
|
||||||
static bool initialized = false;
|
static bool initialized = false;
|
||||||
|
|
||||||
if (!initialized) {
|
if (!initialized) {
|
||||||
|
fprintf(stderr, "[SYCL] call ggml_init_sycl\n");
|
||||||
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
|
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
|
||||||
|
|
||||||
fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
|
fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
|
||||||
|
|
||||||
#if defined(GGML_SYCL_F16)
|
#if defined(GGML_SYCL_F16)
|
||||||
|
@ -16022,6 +16025,7 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
|
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
|
||||||
|
GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
|
||||||
for(int i=0;i<max_len;i++) id_list[i] = -1;
|
for(int i=0;i<max_len;i++) id_list[i] = -1;
|
||||||
|
|
||||||
if (!g_sycl_gpu_mgr) {
|
if (!g_sycl_gpu_mgr) {
|
||||||
|
@ -16056,6 +16060,7 @@ catch (sycl::exception const &exc) {
|
||||||
|
|
||||||
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
|
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
|
||||||
size_t description_size) try {
|
size_t description_size) try {
|
||||||
|
GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
|
||||||
dpct::device_info prop;
|
dpct::device_info prop;
|
||||||
int device_id = g_sycl_gpu_mgr->gpus[device];
|
int device_id = g_sycl_gpu_mgr->gpus[device];
|
||||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
||||||
|
@ -16070,6 +16075,7 @@ catch (sycl::exception const &exc) {
|
||||||
|
|
||||||
GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
|
GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
|
||||||
size_t *total) try {
|
size_t *total) try {
|
||||||
|
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
|
||||||
ggml_sycl_set_device(device);
|
ggml_sycl_set_device(device);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -16421,7 +16427,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
|
||||||
};
|
};
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
|
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
|
||||||
ggml_init_sycl();
|
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
|
||||||
|
|
||||||
if (device_index>=g_device_count or device_index<0) {
|
if (device_index>=g_device_count or device_index<0) {
|
||||||
printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
|
printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
|
||||||
device_index, g_device_count-1);
|
device_index, g_device_count-1);
|
||||||
|
@ -16791,6 +16798,7 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
|
||||||
};
|
};
|
||||||
|
|
||||||
GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
|
GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
|
||||||
|
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n");
|
||||||
ggml_init_sycl();
|
ggml_init_sycl();
|
||||||
// FIXME: this is not thread safe
|
// FIXME: this is not thread safe
|
||||||
static std::map<std::array<float, GGML_SYCL_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
|
static std::map<std::array<float, GGML_SYCL_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
|
||||||
|
@ -16863,6 +16871,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
|
ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
|
||||||
|
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_host_buffer_type\n");
|
||||||
static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = {
|
static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = {
|
||||||
/* .iface = */ {
|
/* .iface = */ {
|
||||||
/* .get_name = */ ggml_backend_sycl_host_buffer_type_name,
|
/* .get_name = */ ggml_backend_sycl_host_buffer_type_name,
|
||||||
|
@ -17159,6 +17168,7 @@ static ggml_guid_t ggml_backend_sycl_guid() {
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
|
GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
|
||||||
|
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
|
||||||
ggml_init_sycl();
|
ggml_init_sycl();
|
||||||
|
|
||||||
check_allow_gpu_index(device);
|
check_allow_gpu_index(device);
|
||||||
|
@ -17185,6 +17195,7 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) {
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL int ggml_backend_sycl_get_device_count() {
|
GGML_CALL int ggml_backend_sycl_get_device_count() {
|
||||||
|
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
|
||||||
if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
|
if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
|
||||||
return g_sycl_gpu_mgr->get_gpu_count();
|
return g_sycl_gpu_mgr->get_gpu_count();
|
||||||
}
|
}
|
||||||
|
@ -17197,16 +17208,21 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params,
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
|
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
|
||||||
|
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_index\n");
|
||||||
return g_sycl_gpu_mgr->get_index(device_id);
|
return g_sycl_gpu_mgr->get_index(device_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
|
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
|
||||||
|
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_id\n");
|
||||||
return g_sycl_gpu_mgr->gpus[device_index];
|
return g_sycl_gpu_mgr->gpus[device_index];
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
|
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
|
||||||
GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
|
ggml_init_sycl();
|
||||||
|
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_single_device_mode\n");
|
||||||
fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
|
fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
|
||||||
|
GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
|
||||||
|
|
||||||
if (g_sycl_gpu_mgr) {
|
if (g_sycl_gpu_mgr) {
|
||||||
delete g_sycl_gpu_mgr;
|
delete g_sycl_gpu_mgr;
|
||||||
}
|
}
|
||||||
|
@ -17217,6 +17233,9 @@ GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
|
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
|
||||||
|
ggml_init_sycl();
|
||||||
|
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_mul_device_mode\n");
|
||||||
|
|
||||||
if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
|
if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
2
ggml.c
2
ggml.c
|
@ -2938,7 +2938,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
||||||
data_size *= ne[i];
|
data_size *= ne[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
|
GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src));
|
||||||
|
|
||||||
void * data = view_src != NULL ? view_src->data : NULL;
|
void * data = view_src != NULL ? view_src->data : NULL;
|
||||||
if (data != NULL) {
|
if (data != NULL) {
|
||||||
|
|
|
@ -9152,8 +9152,9 @@ struct llm_build_context {
|
||||||
if (il == n_layer - 1) {
|
if (il == n_layer - 1) {
|
||||||
// skip computing output for unused tokens
|
// skip computing output for unused tokens
|
||||||
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||||
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
||||||
|
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * attn_out = cur;
|
struct ggml_tensor * attn_out = cur;
|
||||||
|
|
6
llama.h
6
llama.h
|
@ -60,9 +60,9 @@ extern "C" {
|
||||||
|
|
||||||
enum llama_vocab_type {
|
enum llama_vocab_type {
|
||||||
LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
|
LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
|
||||||
LLAMA_VOCAB_TYPE_SPM = 1, // SentencePiece
|
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
|
||||||
LLAMA_VOCAB_TYPE_BPE = 2, // Byte Pair Encoding
|
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
|
||||||
LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece
|
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
|
||||||
};
|
};
|
||||||
|
|
||||||
// note: these values should be synchronized with ggml_rope
|
// note: these values should be synchronized with ggml_rope
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue