Merge branch 'master' into sycl_fix_non_intel_fp16

2024-03-28 15:45:06 +00:00 · 2024-03-28 15:45:06 +00:00 · f746e7074e
commit f746e7074e
parent 4070423210 be55134a53
19 changed files with 1050 additions and 213 deletions
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@ -24,7 +24,7 @@
    useOpenCL
    useRocm
    useVulkan
-  ],
+  ] && blas.meta.available,
  useCuda ? config.cudaSupport,
  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
  useMpi ? false, # Increases the runtime closure size by ~700M
@ -67,10 +67,15 @@ let
    strings.optionalString (suffices != [ ])
      ", accelerated with ${strings.concatStringsSep ", " suffices}";
  executableSuffix = effectiveStdenv.hostPlatform.extensions.executable;
  # TODO: package the Python in this repository in a Nix-like way.
  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
  # https://peps.python.org/pep-0517/
  #
  # TODO: Package up each Python script or service appropriately, by making
  # them into "entrypoints"
  llama-python = python3.withPackages (
    ps: [
      ps.numpy
@ -159,11 +164,6 @@ effectiveStdenv.mkDerivation (
        --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
      substituteInPlace ./ggml-metal.m \
        --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
      # TODO: Package up each Python script or service appropriately.
      # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
      # we could make those *.py into setuptools' entrypoints
      substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
    '';
    # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
@ -244,8 +244,8 @@ effectiveStdenv.mkDerivation (
    # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
    # if they haven't been added yet.
    postInstall = ''
-      mv $out/bin/main $out/bin/llama
+      mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix}
-      mv $out/bin/server $out/bin/llama-server
+      mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix}
      mkdir -p $out/include
      cp $src/llama.h $out/include/
    '';
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@ -0,0 +1,280 @@
 # Benchmark
 name: Benchmark
 on:
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC80adis_H100_v5
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      duration:
        description: 'Duration of the bench'
        type: string
        default: 10m
  push:
    branches:
      - master
    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
  pull_request:
    types: [opened, synchronize, reopened]
    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
  schedule:
    -  cron: '04 2 * * *'
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3
    env:
      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME: could not find a way to avoid duplicating the runs-on value
      N_USERS: 8
      DURATION: 10m
    if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
      - name: Install python env
        id: pipenv
        run: |
          cd examples/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt
      - name: Prometheus
        id: install_prometheus
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
          ./prometheus --config.file=examples/server/bench/prometheus.yml &
          while ! nc -z localhost 9090; do
            sleep 0.1
          done
      - name: Install k6
        id: k6_installation
        run: |
          cd examples/server/bench
          wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
          tar xzf k6*.tar.gz --strip-components=1
      - name: Build
        id: cmake_build
        run: |
          set -eux
          mkdir build
          cd build
          cmake .. \
              -DLLAMA_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
              -DLLAMA_CUBLAS=ON \
              -DCUDAToolkit_ROOT=/usr/local/cuda \
              -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
              -DCMAKE_CUDA_ARCHITECTURES=75 \
              -DLLAMA_FATAL_WARNINGS=OFF \
              -DLLAMA_ALL_WARNINGS=OFF \
              -DCMAKE_BUILD_TYPE=Release;
          cmake --build . --config Release -j $(nproc) --target server
      - name: Download the dataset
        id: download_dataset
        run: |
          cd examples/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
      - name: Server bench
        id: server_bench
        run: |
          set -eux
          cd examples/server/bench
          source venv/bin/activate
          BENCH_K6_BIN_PATH=./k6 python bench.py \
              --runner-label ${{ env.RUNNER_LABEL }} \
              --name ${{ github.job }} \
              --branch ${{ github.head_ref || github.ref_name }} \
              --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
              --scenario script.js \
              --duration ${{ github.event.inputs.duration || env.DURATION }} \
              --hf-repo ggml-org/models	 \
              --hf-file phi-2/ggml-model-q4_0.gguf \
              --model-path-prefix /models \
              --parallel ${{ env.N_USERS }} \
              -ngl 33 \
              --batch-size 2048 \
              --ubatch-size	256 \
              --ctx-size 16384 \
              --n-prompts 1000 \
              --max-prompt-tokens 1024 \
              --max-tokens 2048
          cat results.github.env >> $GITHUB_ENV
          # Remove dataset as we do not want it in the artefact
          rm ShareGPT_V3_unfiltered_cleaned_split.json
      - uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          compression-level: 9
          path: |
            examples/server/bench/*.jpg
            examples/server/bench/*.json
            examples/server/bench/*.log
      - name: Commit status
        uses: Sibz/github-status-action@v1
        continue-on-error: true # If not authorized on external repo
        with:
          authToken: ${{secrets.GITHUB_TOKEN}}
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
          context: bench-server-baseline
          description: |
            ${{ env.BENCH_RESULTS }}
          state: 'success'
      - name: Upload benchmark images
        uses: devicons/public-upload-to-imgur@v2.2.2
        continue-on-error: true # Important: this upload step looks unstable (503 errors)
        id: imgur_step
        with:
          client_id: ${{secrets.IMGUR_CLIENT_ID}}
          path: |
            examples/server/bench/prompt_tokens_seconds.jpg
            examples/server/bench/predicted_tokens_seconds.jpg
            examples/server/bench/kv_cache_usage_ratio.jpg
            examples/server/bench/requests_processing.jpg
      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux
          cd examples/server/bench
          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
      - name: Extract image url
        id: extract_image_url
        continue-on-error: true
        run: |
          set -eux
          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
      - name: Comment PR
        uses: mshick/add-pr-comment@v2
        id: comment_pr
        if: ${{ github.event.pull_request != '' }}
        with:
          message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
          message: |
            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
            - HTTP request          : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms        p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
            - Token generation  (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
            - ${{ env.BENCH_GRAPH_XLABEL }}
            <details>
            <summary>Time series</summary>
            <p align="center">
            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
            <details>
            <summary>More</summary>
            ```mermaid
            ${{ env.PROMPT_TOKENS_SECONDS }}
            ```
            </details>
            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
            <details>
                <summary>More</summary>
            ```mermaid
            ${{ env.PREDICTED_TOKENS_SECONDS }}
            ```
            </details>
            </p>
            <details>
            <summary>Details</summary>
            <p align="center">
            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
            <details>
                <summary>More</summary>
            ```mermaid
            ${{ env.KV_CACHE_USAGE_RATIO }}
            ```
            </details>
            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
            <details>
                <summary>More</summary>
            ```mermaid
            ${{ env.REQUESTS_PROCESSING }}
            ```
            </details>
            </p>
            </details>
            </details>
--- a/1
+++ b/1
@ -556,6 +556,7 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
 endif # LLAMA_CUDA_NO_PEER_COPY
 	OBJS        += ggml-cuda.o
 	OBJS        += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@ -23,7 +23,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
-from convert import HfVocab
+from convert import LlamaHfVocab
 ###### MODEL DEFINITIONS ######
@ -230,7 +230,7 @@ class Model(ABC):
    def _set_vocab_gpt2(self):
        dir_model = self.dir_model
        hparams = self.hparams
-        tokens: list[bytearray] = []
+        tokens: list[str] = []
        toktypes: list[int] = []
        from transformers import AutoTokenizer
@ -243,8 +243,7 @@ class Model(ABC):
        for i in range(vocab_size):
            if i not in reverse_vocab:
-                pad_token = f"[PAD{i}]".encode('utf-8')
+                tokens.append(f"[PAD{i}]")
                tokens.append(bytearray(pad_token))
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
@ -266,7 +265,7 @@ class Model(ABC):
    def _set_vocab_qwen(self):
        dir_model = self.dir_model
        hparams = self.hparams
-        tokens: list[bytearray] = []
+        tokens: list[str] = []
        toktypes: list[int] = []
        from transformers import AutoTokenizer
@ -291,8 +290,7 @@ class Model(ABC):
        for i in range(vocab_size):
            if i not in reverse_vocab:
-                pad_token = f"[PAD{i}]".encode("utf-8")
+                tokens.append(f"[PAD{i}]")
                tokens.append(bytearray(pad_token))
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
@ -372,12 +370,8 @@ class Model(ABC):
        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)
-    def _set_vocab_hf(self):
+    def _set_vocab_llama_hf(self):
-        path = self.dir_model
+        vocab = LlamaHfVocab(self.dir_model)
        added_tokens_path = self.dir_model
        vocab = HfVocab(
            path, added_tokens_path if added_tokens_path.exists() else None
        )
        tokens = []
        scores = []
        toktypes = []
@ -1099,7 +1093,7 @@ class MiniCPMModel(Model):
        self.gguf_writer.add_file_type(self.ftype)
    def set_vocab(self):
-        self._set_vocab_hf()
+        self._set_vocab_llama_hf()
    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
        if n_kv_head is not None and n_head != n_kv_head:
@ -1700,11 +1694,8 @@ class BertModel(Model):
            self.gguf_writer.add_pooling_type(pooling_type)
    def set_vocab(self):
        path = self.dir_model
        added_tokens_path = self.dir_model if self.dir_model.exists() else None
        # use huggingface vocab to get all tokens
-        vocab = HfVocab(path, added_tokens_path)
+        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
        tokens, scores, toktypes = zip(*vocab.all_tokens())
        assert len(tokens) == vocab.vocab_size
        self.vocab_size = vocab.vocab_size
--- a/convert-persimmon-to-gguf.py
+++ b/convert-persimmon-to-gguf.py
@ -106,12 +106,12 @@ def main():
    tensor_map = gguf.get_tensor_name_map(arch, block_count)
    print(tensor_map)
    for name in tensors.keys():
-        data = tensors[name]
+        data_torch = tensors[name]
        if name.endswith(".self_attention.rotary_emb.inv_freq"):
            continue
-        old_dtype = data.dtype
+        old_dtype = data_torch.dtype
        # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
-        data = data.to(torch.float32).squeeze().numpy()
+        data = data_torch.to(torch.float32).squeeze().numpy()
        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
--- a/convert.py
+++ b/convert.py
@ -16,13 +16,14 @@ import re
 import signal
 import struct
 import sys
 import textwrap
 import time
 import zipfile
-from abc import ABCMeta, abstractmethod
+from abc import ABC, abstractmethod
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
+from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable
 import numpy as np
 from sentencepiece import SentencePieceProcessor
@ -43,6 +44,9 @@ ARCH = gguf.MODEL_ARCH.LLAMA
 DEFAULT_CONCURRENCY = 8
 ADDED_TOKENS_FILE = 'added_tokens.json'
 FAST_TOKENIZER_FILE = 'tokenizer.json'
 #
 # data types
 #
@ -188,8 +192,10 @@ class Params:
            n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
        if n_layer < 1:
-            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
+            msg = """\
-                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+                failed to guess 'n_layer'. This model is unknown or unsupported.
                Suggestion: provide 'config.json' of the model in the same directory containing model files."""
            raise KeyError(textwrap.dedent(msg))
        n_head = n_embd // 128 # guessed
        n_mult = 256           # guessed
@ -211,7 +217,8 @@ class Params:
    @staticmethod
    def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
-        config = json.load(open(config_path))
+        with open(config_path) as f:
            config = json.load(f)
        rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
        rope_scaling = config.get("rope_scaling")
@ -233,8 +240,10 @@ class Params:
        elif "max_position_embeddings" in config:
            n_ctx = config["max_position_embeddings"]
        else:
-            raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
+            msg = """\
-                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+                failed to guess 'n_ctx'. This model is unknown or unsupported.
                Suggestion: provide 'config.json' of the model in the same directory containing model files."""
            raise KeyError(textwrap.dedent(msg))
        n_experts      = None
        n_experts_used = None
@ -265,7 +274,8 @@ class Params:
    # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
    @staticmethod
    def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
-        config = json.load(open(config_path))
+        with open(config_path) as f:
            config = json.load(f)
        n_experts      = None
        n_experts_used = None
@ -331,47 +341,86 @@ class Params:
 # vocab
 #
-class BpeVocab:
+@runtime_checkable
 class BaseVocab(Protocol):
    tokenizer_model: ClassVar[str]
    name: ClassVar[str]
 class NoVocab(BaseVocab):
    tokenizer_model = "no_vocab"
    name = "no_vocab"
    def __repr__(self) -> str:
        return "<NoVocab for a model without integrated vocabulary>"
@runtime_checkable
 class Vocab(BaseVocab, Protocol):
    vocab_size: int
    added_tokens_dict: dict[str, int]
    added_tokens_list: list[str]
    fname_tokenizer: Path
    def __init__(self, base_path: Path): ...
    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
 class BpeVocab(Vocab):
    tokenizer_model = "gpt2"
    name = "bpe"
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
+    def __init__(self, base_path: Path):
-        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
+        added_tokens: dict[str, int] = {}
        if isinstance(self.bpe_tokenizer.get('model'), dict):
            self.vocab = self.bpe_tokenizer["model"]["vocab"]
        else:
            self.vocab = self.bpe_tokenizer
        added_tokens: dict[str, int]
        if fname_added_tokens is not None:
            # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
        else:
            # Fall back to trying to find the added tokens in tokenizer.json
            tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
            if not tokenizer_json_file.is_file():
                added_tokens = {}
            else:
                tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
                added_tokens = dict(
                    (item['content'], item['id'])
                    for item in tokenizer_json.get('added_tokens', [])
                    # Added tokens here can be duplicates of the main vocabulary.
                    if item['content'] not in self.bpe_tokenizer)
-        vocab_size: int = len(self.vocab)
+        if (fname_tokenizer := base_path / 'vocab.json').exists():
-        expected_ids    = list(range(vocab_size, vocab_size + len(added_tokens)))
+            # "slow" tokenizer
-        actual_ids      = sorted(added_tokens.values())
+            with open(fname_tokenizer, encoding="utf-8") as f:
                self.vocab = json.load(f)
            try:
                # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
                    added_tokens = json.load(f)
            except FileNotFoundError:
                pass
        else:
            # "fast" tokenizer
            fname_tokenizer = base_path / FAST_TOKENIZER_FILE
            # if this fails, FileNotFoundError propagates to caller
            with open(fname_tokenizer, encoding="utf-8") as f:
                tokenizer_json = json.load(f)
            tokenizer_model: dict[str, Any] = tokenizer_json['model']
            if (
                tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
                or tokenizer_json['decoder']['type'] != 'ByteLevel'
            ):
                raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
            self.vocab = tokenizer_model["vocab"]
            if (added := tokenizer_json.get('added_tokens')) is not None:
                # Added tokens here can be duplicates of the main vocabulary.
                added_tokens = {item['content']: item['id']
                                for item in added
                                if item['content'] not in self.vocab}
        vocab_size   = len(self.vocab)
        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
        actual_ids   = sorted(added_tokens.values())
        if expected_ids != actual_ids:
            expected_end_id = vocab_size + len(actual_ids) - 1
-            raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
+            raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
                             f"{vocab_size} - {expected_end_id}; got {actual_ids}")
        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
        self.added_tokens_dict    = added_tokens
        self.added_tokens_list    = [text for (text, idx) in items]
-        self.vocab_size_base: int = vocab_size
+        self.vocab_size_base      = vocab_size
-        self.vocab_size: int      = self.vocab_size_base + len(self.added_tokens_list)
+        self.vocab_size           = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer      = fname_tokenizer
        self.fname_added_tokens   = fname_added_tokens
    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
@ -392,19 +441,25 @@ class BpeVocab:
        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-class SentencePieceVocab:
+class SentencePieceVocab(Vocab):
    tokenizer_model = "llama"
    name = "spm"
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
+    def __init__(self, base_path: Path):
-        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        added_tokens: dict[str, int] = {}
-        added_tokens: dict[str, int]
+        if (fname_tokenizer := base_path / 'tokenizer.model').exists():
-        if fname_added_tokens is not None:
+            # normal location
-            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
+            try:
-        else:
+                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
-            added_tokens = {}
+                    added_tokens = json.load(f)
            except FileNotFoundError:
                pass
        elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
            # not found in alternate location either
            raise FileNotFoundError('Cannot find tokenizer.model')
-        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
        vocab_size = self.sentencepiece_tokenizer.vocab_size()
        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
@ -414,18 +469,17 @@ class SentencePieceVocab:
            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
        # Token pieces that were added to the base vocabulary.
-        self.added_tokens_dict = added_tokens
+        self.added_tokens_dict  = added_tokens
        self.added_tokens_list  = [new_tokens[id] for id in actual_new_ids]
        self.vocab_size_base    = vocab_size
        self.vocab_size         = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer    = fname_tokenizer
        self.fname_added_tokens = fname_added_tokens
    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        tokenizer = self.sentencepiece_tokenizer
        for i in range(tokenizer.vocab_size()):
            piece = tokenizer.id_to_piece(i)
-            text: bytes = piece.encode("utf-8")
+            text         = piece.encode("utf-8")
            score: float = tokenizer.get_score(i)
            toktype = gguf.TokenType.NORMAL
@ -458,27 +512,42 @@ class SentencePieceVocab:
        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-class HfVocab:
+class LlamaHfVocab(Vocab):
    tokenizer_model = "llama"
    name = "hfft"
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
+    def __init__(self, base_path: Path, ignore_nonllama: bool = False):
        fname_tokenizer = base_path / FAST_TOKENIZER_FILE
        # if this fails, FileNotFoundError propagates to caller
        with open(fname_tokenizer, encoding='utf-8') as f:
            tokenizer_json = json.load(f)
        # pre-check so we know if we need transformers
        tokenizer_model: dict[str, Any] = tokenizer_json['model']
        if ignore_nonllama:
            pass  # work around incorrect use of this class for WordPiece
        elif (
            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
            or tokenizer_json['decoder']['type'] != 'Sequence'
        ):
            raise FileNotFoundError('Cannot find Llama BPE tokenizer')
        try:
            from transformers import AutoTokenizer
        except ImportError as e:
            raise ImportError(
-                "To use HfVocab, please install the `transformers` package. "
+                "To use LlamaHfVocab, please install the `transformers` package. "
                "You can install it with `pip install transformers`."
            ) from e
        print("fname_tokenizer:", fname_tokenizer)
        # Allow the tokenizer to default to slow or fast versions.
        # Explicitly set tokenizer to use local paths.
        self.tokenizer = AutoTokenizer.from_pretrained(
-            fname_tokenizer,
+            base_path,
-            cache_dir=fname_tokenizer,
+            cache_dir=base_path,
            local_files_only=True,
        )
        assert self.tokenizer.is_fast  # assume tokenizer.json is used
        # Initialize lists and dictionaries for added tokens
        self.added_tokens_list = []
@ -506,8 +575,7 @@ class HfVocab:
        self.vocab_size_base = self.tokenizer.vocab_size
        self.vocab_size      = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer    = fname_tokenizer
+        self.fname_tokenizer = fname_tokenizer
        self.fname_added_tokens = fname_added_tokens
    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {
@ -559,18 +627,7 @@ class HfVocab:
        yield from self.added_tokens()
    def __repr__(self) -> str:
-        return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+        return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 class NoVocab:
    tokenizer_model = "no_vocab"
    name = "no_vocab"
    def __repr__(self) -> str:
        return "<NoVocab for a model without integrated vocabulary>"
 Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab | NoVocab"
 #
@ -588,7 +645,7 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
            .reshape(weights.shape))
-class Tensor(metaclass=ABCMeta):
+class Tensor(ABC):
    data_type: DataType
    @abstractmethod
@ -610,7 +667,7 @@ def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
 class UnquantizedTensor(Tensor):
-    def __init__(self, ndarray: NDArray) -> None:
+    def __init__(self, ndarray: NDArray):
        assert isinstance(ndarray, np.ndarray)
        self.ndarray = ndarray
        self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
@ -689,7 +746,7 @@ class ModelPlus:
    model: LazyModel
    paths: list[Path]  # Where this was read from.
    format: Literal['ggml', 'torch', 'safetensors', 'none']
-    vocab: Vocab | None  # For GGML models (which have vocab built in), the vocab.
+    vocab: BaseVocab | None  # For GGML models (which have vocab built in), the vocab.
 def merge_sharded(models: list[LazyModel]) -> LazyModel:
@ -698,7 +755,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
    names = {name: None for model in models for name in model}
    def convert(name: str) -> LazyTensor:
-        lazy_tensors: list[LazyTensor] = [model[name] for model in models]
+        lazy_tensors = [model[name] for model in models]
        if len(lazy_tensors) == 1:
            # only one file; don't go through this procedure since there might
            # be quantized tensors
@ -719,7 +776,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
        def load() -> UnquantizedTensor:
            ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
-            concatenated: NDArray = np.concatenate(ndarrays, axis=axis)
+            concatenated = np.concatenate(ndarrays, axis=axis)
            return UnquantizedTensor(concatenated)
        description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
        return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
@ -807,10 +864,10 @@ class LazyUnpickler(pickle.Unpickler):
        def load(offset: int, elm_count: int) -> NDArray:
            dtype = data_type.dtype
-            fp = self.zip_file.open(info)
+            with self.zip_file.open(info) as fp:
-            fp.seek(offset * dtype.itemsize)
+                fp.seek(offset * dtype.itemsize)
-            size = elm_count * dtype.itemsize
+                size = elm_count * dtype.itemsize
-            data = fp.read(size)
+                data = fp.read(size)
            assert len(data) == size
            return np.frombuffer(data, dtype)
        description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
@ -831,7 +888,7 @@ class LazyUnpickler(pickle.Unpickler):
    def rebuild_from_type_v2(func, new_type, args, state):
        return func(*args)
-    CLASSES: dict[tuple[str, str], Any] = {
+    CLASSES = {
        # getattr used here as a workaround for mypy not being smart enough to determine
        # the staticmethods have a __func__ attribute.
        ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
@ -890,7 +947,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
 def must_read(fp: IO[bytes], length: int) -> bytes:
    ret = fp.read(length)
    if len(ret) < length:
-        raise Exception("unexpectedly reached end of file")
+        raise EOFError("unexpectedly reached end of file")
    return ret
@ -948,13 +1005,14 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
            yield result
-def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
+def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False) -> None:
    # Handle special case where the model's vocab size is not set
    if params.n_vocab == -1:
        raise ValueError(
-            f"The model's vocab size is set to -1 in params.json. Please update it manually.{f' Maybe {vocab.vocab_size}?' if hasattr(vocab, 'vocab_size') else ''}"
+            "The model's vocab size is set to -1 in params.json. Please update it manually."
            + (f" Maybe {vocab.vocab_size}?" if isinstance(vocab, Vocab) else ""),
        )
-    if isinstance(vocab, NoVocab):
+    if not isinstance(vocab, Vocab):
        return  # model has no vocab
    # Check for a vocab size mismatch
@ -979,11 +1037,11 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
    if vocab.vocab_size < params.n_vocab:
        msg += " Add the --pad-vocab option and try again."
-    raise Exception(msg)
+    raise ValueError(msg)
 class OutputFile:
-    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
+    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
    def add_meta_arch(self, params: Params) -> None:
@ -1034,8 +1092,6 @@ class OutputFile:
            self.gguf.add_file_type(params.ftype)
    def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
        assert not isinstance(vocab, NoVocab)
        tokens = []
        scores = []
        toktypes = []
@ -1135,7 +1191,7 @@ class OutputFile:
    @staticmethod
    def write_all(
-        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
+        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
    ) -> None:
@ -1145,11 +1201,11 @@ class OutputFile:
        # meta data
        of.add_meta_arch(params)
-        if isinstance(vocab, NoVocab):
+        if isinstance(vocab, Vocab):
            of.gguf.add_tokenizer_model(vocab.tokenizer_model)
        else:
            of.add_meta_vocab(vocab)
            of.add_meta_special_vocab(svocab)
        else:  # NoVocab
            of.gguf.add_tokenizer_model(vocab.tokenizer_model)
        # tensor info
        for name, lazy_tensor in model.items():
@ -1176,7 +1232,7 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
    name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
-    raise Exception(f"Unexpected combination of types: {name_to_type}")
+    raise ValueError(f"Unexpected combination of types: {name_to_type}")
 def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
@ -1186,7 +1242,7 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM
 def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel:
    tmap = gguf.TensorNameMap(ARCH, params.n_layer)
-    should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
+    should_skip = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
    tmp = model
@ -1213,8 +1269,7 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
            if skip_unknown:
                print(f"Unexpected tensor name: {name} - skipping")
                continue
-            else:
+            raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
                raise Exception(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
        if tensor_type in should_skip:
            print(f"skipping tensor {name_new}")
@ -1231,7 +1286,7 @@ def nth_multifile_path(path: Path, n: int) -> Path | None:
    the nth path in the model.
    '''
    # Support the following patterns:
-    patterns: list[tuple[str, str]] = [
+    patterns = [
        # - x.00.pth, x.01.pth, etc.
        (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
        # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
@ -1277,9 +1332,9 @@ def load_some_model(path: Path) -> ModelPlus:
            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
            files = [file for glob in globs for file in path.glob(glob)]
        if not files:
-            raise Exception(f"Can't find model in directory {path}")
+            raise FileNotFoundError(f"Can't find model in directory {path}")
        if len(files) > 1:
-            raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}")
+            raise ValueError(f"Found multiple models in {path}, not sure which to pick: {files}")
        path = files[0]
    paths = find_multifile_paths(path)
@ -1293,36 +1348,14 @@ def load_some_model(path: Path) -> ModelPlus:
 class VocabFactory:
-    _FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"}
+    _VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, LlamaHfVocab]
    def __init__(self, path: Path):
        self.path = path
        self.file_paths = self._detect_files()
        print(f"Found vocab files: {self.file_paths}")
-    def _detect_files(self) -> dict[str, Path | None]:
+    def _create_special_vocab(self, vocab: BaseVocab, model_parent_path: Path) -> gguf.SpecialVocab:
        def locate(file: str) -> Path | None:
            if (path := self.path / file).exists():
                return path
            if (path := self.path.parent / file).exists():
                return path
            return None
        return {vt: locate(f) for vt, f in self._FILES.items()}
    def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]:
        for vtype in vocab_types:
            try:
                path = self.file_paths[vtype]
            except KeyError:
                raise ValueError(f"Unsupported vocabulary type {vtype}") from None
            if path is not None:
                return vtype, path
        raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
    def _create_special_vocab(self, vocab: Vocab, model_parent_path: Path) -> gguf.SpecialVocab:
        load_merges = vocab.name == "bpe"
-        n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
+        n_vocab = vocab.vocab_size if isinstance(vocab, Vocab) else None
        return gguf.SpecialVocab(
            model_parent_path,
            load_merges=load_merges,
@ -1331,27 +1364,29 @@ class VocabFactory:
        )
    def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
-        vocab_type, path = self._select_file(vocab_types)
+        vocab_classes: dict[str, type[Vocab]] = {cls.name: cls for cls in self._VOCAB_CLASSES}
-        print(f"Loading vocab file {path!r}, type {vocab_type!r}")
+        selected_vocabs: dict[str, type[Vocab]] = {}
        for vtype in vocab_types:
            try:
                selected_vocabs[vtype] = vocab_classes[vtype]
            except KeyError:
                raise ValueError(f"Unsupported vocabulary type {vtype}") from None
-        added_tokens_path = path.parent / "added_tokens.json"
+        for vtype, cls in selected_vocabs.items():
-        if vocab_type == "bpe":
+            try:
-            return BpeVocab(
+                vocab = cls(self.path)
-                path, added_tokens_path if added_tokens_path.exists() else None
+                break
-            )
+            except FileNotFoundError:
-        if vocab_type == "spm":
+                pass  # ignore unavailable tokenizers
-            return SentencePieceVocab(
+        else:
-                path, added_tokens_path if added_tokens_path.exists() else None
+            raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
            )
        if vocab_type == "hfft":
            return HfVocab(
                path.parent, added_tokens_path if added_tokens_path.exists() else None
            )
        raise ValueError(vocab_type)
-    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+        print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
-        vocab: Vocab
+        return vocab
-        if len(vocab_types) == 1 and "no_vocab" in vocab_types:
+
    def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
        vocab: BaseVocab
        if vocab_types is None:
            vocab = NoVocab()
        else:
            vocab = self._create_vocab_by_path(vocab_types)
@ -1408,10 +1443,8 @@ def main(args_in: list[str] | None = None) -> None:
    parser.add_argument("--skip-unknown", action="store_true",    help="skip unknown tensor names instead of failing")
    args = parser.parse_args(args_in)
-    if args.no_vocab:
+    if args.no_vocab and args.vocab_only:
-        if args.vocab_only:
+        raise ValueError("--vocab-only does not make sense with --no-vocab")
            raise ValueError("no need to specify --vocab-only if using --no-vocab")
        args.vocab_type = "no_vocab"
    if args.dump_single:
        model_plus = lazy_load_file(args.model)
@ -1433,10 +1466,12 @@ def main(args_in: list[str] | None = None) -> None:
    params = Params.load(model_plus)
    if params.n_ctx == -1:
        if args.ctx is None:
-            raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
+            msg = """\
-                            "Please specify one with --ctx:\n"
+                The model doesn't have a context size, and you didn't specify one with --ctx
-                            " - LLaMA v1: --ctx 2048\n"
+                Please specify one with --ctx:
-                            " - LLaMA v2: --ctx 4096\n")
+                 - LLaMA v1: --ctx 2048
                 - LLaMA v2: --ctx 4096"""
            parser.error(textwrap.dedent(msg))
        params.n_ctx = args.ctx
    if args.outtype:
@ -1451,9 +1486,11 @@ def main(args_in: list[str] | None = None) -> None:
    model_parent_path = model_plus.paths[0].parent
    vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
    vocab_factory = VocabFactory(vocab_path)
-    vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type.split(","), model_parent_path)
+    vocab_types = None if args.no_vocab else args.vocab_type.split(",")
    vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)
    if args.vocab_only:
        assert isinstance(vocab, Vocab)
        if not args.outfile:
            raise ValueError("need --outfile if using --vocab-only")
        outfile = args.outfile
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@ -6,7 +6,7 @@ for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com
 The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.
-Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion  is a little different. Therefore, using MobiVLM as an example, the different conversion step will be shown.
+Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.
 ## Usage
 Build with cmake or run `make llava-cli` to build it.
@ -36,7 +36,7 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
 ```
-3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** the arg is `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
+3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
 ```sh
 python ./examples/llava/convert-image-encoder-to-gguf \
@ -78,7 +78,7 @@ cd examples/llava/android/build_64
 ### run on Android
 refer to `android/adb_run.sh`, modify resources' `name` and `path`
-## some result on Android with `Snapdragon 888` chip
+## Some result on Android with `Snapdragon 888` chip
 ### case 1
 **input**
 ```sh
@ -109,7 +109,6 @@ llama_print_timings:       total time =   34731.93 ms
    --image /data/local/tmp/cat.jpeg \
    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
 ```
 **output**
 ```sh
 encode_image_with_clip: image encoded in 21149.51 ms by CLIP (  146.87 ms per image patch)
@ -121,12 +120,82 @@ llama_print_timings:        eval time =    1279.03 ms /    18 runs   (   71.06 m
 llama_print_timings:       total time =   34570.79 ms
 ```
 ## Some result on Android with `Snapdragon 778G` chip
 ### MobileVLM-1.7B case
 #### llava-cli release-b2005
 **input**
 ```sh
 /data/local/tmp/llava-cli \
    -m /data/local/tmp/ggml-model-q4_k.gguf \
    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
    -t 4 \
    --image /data/local/tmp/many_llamas.jpeg \
    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:"
 ```
 **output**
 ```sh
 encode_image_with_clip: image encoded in 18728.52 ms by CLIP (  130.06 ms per image patch)
 system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
 user_prompt: \nWhat's that? ASSISTANT:
 A group of llamas are standing in a green pasture.
 llama_print_timings:        load time =   20357.33 ms
 llama_print_timings:      sample time =       2.96 ms /    14 runs   (    0.21 ms per token,  4734.53 tokens per second)
 llama_print_timings: prompt eval time =    8119.49 ms /   191 tokens (   42.51 ms per token,    23.52 tokens per second)
 llama_print_timings:        eval time =    1005.75 ms /    14 runs   (   71.84 ms per token,    13.92 tokens per second)
 llama_print_timings:       total time =   28038.34 ms /   205 tokens
 ```
 #### llava-cli latest-version
 **input**
 Just the same as above.
 **output**(seems to be much slower)
 ```sh
 encode_image_with_clip: image embedding created: 144 tokens
 encode_image_with_clip: image encoded in 288268.88 ms by CLIP ( 2001.87 ms per image patch)
 system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
 user_prompt: \nWhat's that? ASSISTANT:
 It is a group of sheep standing together in a grass field.
 llama_print_timings:        load time =  818120.91 ms
 llama_print_timings:      sample time =       3.44 ms /    14 runs   (    0.25 ms per token,  4067.40 tokens per second)
 llama_print_timings: prompt eval time =  529274.69 ms /   191 tokens ( 2771.07 ms per token,     0.36 tokens per second)
 llama_print_timings:        eval time =   43894.02 ms /    13 runs   ( 3376.46 ms per token,     0.30 tokens per second)
 llama_print_timings:       total time =  865441.76 ms /   204 tokens
 ```
 ### MobileVLM_V2-1.7B case
 #### llava-cli release-b2005
 **input**
 Just the same as above.
 **output**
 ```sh
 encode_image_with_clip: image encoded in 20609.61 ms by CLIP (  143.12 ms per image patch)
 system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
 user_prompt: \nWhat's that? ASSISTANT:
 This image captures a lively scene of 20 llamas in motion on an expansive, grassy field. The llama is scattered across the landscape with some standing and others sitting down as if taking rest or observing their surroundings from different vantage points within this verdant setting.
 The background offers glimpses into a picturesque town nestled amidst hills under an overcast sky, adding depth to the scene while also emphasizing that distance between these llama and human-made structures like houses or roads in which they roam freely without any barriers around them. The image is framed by text at both right angles on white backgrounds against a contrasting blue backdrop with green foliage, further drawing attention to the llamas amidst their natural habitat while also inviting viewers into this picturesque landscape within town limits of Alta Llama
 llama_print_timings:        load time =   22406.77 ms
 llama_print_timings:      sample time =      49.26 ms /   186 runs   (    0.26 ms per token,  3776.27 tokens per second)
 llama_print_timings: prompt eval time =    9044.54 ms /   191 tokens (   47.35 ms per token,    21.12 tokens per second)
 llama_print_timings:        eval time =   14497.49 ms /   186 runs   (   77.94 ms per token,    12.83 tokens per second)
 llama_print_timings:       total time =   44411.01 ms /   377 tokens
 ```
 ## Orin compile and run
 ### compile
 ```sh
 make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
 ```
 ### run on Orin
 ### case 1
 **input**
@ -175,8 +244,121 @@ llama_print_timings:        eval time =     166.65 ms /    11 runs   (   15.15 m
 llama_print_timings:       total time =    1365.47 ms /   243 tokens
 ```
-## Minor shortcomings
+## Running on Intel(R) Core(TM) i7-10750H
-The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quickly, we uniformly modified `clip_n_patches` function to a quarter. when counting the time consumption, the calculated time will be 4 times bigger than the real cost.
+### Operating system
 Ubuntu22.04
 ### compile
 ```sh
 make -j32
 ```
 ### MobileVLM-1.7B case
 **input**
 ```sh
 -m /path/to/ggml-model-q4_k.gguf \
    --mmproj /path/to/mmproj-model-f16.gguf \
    --image /path/to/many_llamas.jpeg \
 -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:"
 ```
 **output**
 ```sh
 encode_image_with_clip: image embedding created: 144 tokens
 encode_image_with_clip: image encoded in  2730.94 ms by CLIP (   18.96 ms per image patch)
 system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
 user_prompt: \nWhat's that?ASSISTANT:
 A group of llamas are walking together in a field.
 llama_print_timings:        load time =    5506.60 ms
 llama_print_timings:      sample time =       0.44 ms /    13 runs   (    0.03 ms per token, 29545.45 tokens per second)
 llama_print_timings: prompt eval time =    2031.58 ms /   190 tokens (   10.69 ms per token,    93.52 tokens per second)
 llama_print_timings:        eval time =     438.92 ms /    12 runs   (   36.58 ms per token,    27.34 tokens per second)
 llama_print_timings:       total time =    5990.25 ms /   202 tokens
 ```
 ### MobileVLM_V2-1.7B case
 **input**
 Just the same as above.
 **output**
 ```sh
 encode_image_with_clip: image embedding created: 144 tokens
 encode_image_with_clip: image encoded in  3223.89 ms by CLIP (   22.39 ms per image patch)
 system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
 user_prompt: \nWhat's that?ASSISTANT:
 The image captures a tranquil scene in a park, where a group of approximately 20 llamas are gathered. The llamas, a mix of white and black, are standing in a line, their black and white patterns contrasting with the lush green grass of the park. The lamas are arranged in a line, suggesting a social order.
 The park itself is lush and green, with trees dotting the landscape in the background. A sign reading "Llamas Tico  Ana" is also visible in the image, possibly indicating the location or the breed of the llamas. The image seems to be taken from a distance, providing a wide view of the scene and the surrounding environment.
 The llamas' positions relative to each other, the sign, and the trees create a harmonious composition. The image does not contain any discernible text. The overall scene is one of peace and natural beauty, with the llamas in their natural habitat, surrounded by the vibrant colors and lush greenery of the park.
 llama_print_timings:        load time =    6642.61 ms
 llama_print_timings:      sample time =       8.15 ms /   223 runs   (    0.04 ms per token, 27358.61 tokens per second)
 llama_print_timings: prompt eval time =    2475.07 ms /   190 tokens (   13.03 ms per token,    76.77 tokens per second)
 llama_print_timings:        eval time =    8760.60 ms /   222 runs   (   39.46 ms per token,    25.34 tokens per second)
 llama_print_timings:       total time =   15513.95 ms /   412 tokens
 ```
 ## Run on Intel(R) Core(TM) Ultra7 115H
 ### Operating system
 Windows11
 ### compile
 ```sh
 make -j32
 ```
 ### MobileVLM-1.7B case
 **input**
 ```sh
 -m /path/to/ggml-model-q4_k.gguf \
    --mmproj /path/to/tmp/mmproj-model-f16.gguf \
 -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:"
 ```
 **output**
 ```sh
 encode_image_with_clip: image encoded in  4902.81 ms by CLIP (   34.05 ms per image patch)
 system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
 user_prompt: \nWhat's that? ASSISTANT:
 The image features a group of brown and white llamas standing in a grassy field.
 llama_print_timings:        load time =    7441.06 ms
 llama_print_timings:      sample time =       0.72 ms /    19 runs   (    0.04 ms per token, 26279.39 tokens per second)
 llama_print_timings: prompt eval time =    2090.71 ms /   191 tokens (   10.95 ms per token,    91.36 tokens per second)
 llama_print_timings:        eval time =     512.35 ms /    18 runs   (   28.46 ms per token,    35.13 tokens per second)
 llama_print_timings:       total time =    7987.23 ms /   209 tokens
 ```
 ### MobileVLM_V2-1.7B case
 **input**
 Just the same as above.
 **output**
 ```sh
 encode_image_with_clip: image encoded in  4682.44 ms by CLIP (   32.52 ms per image patch)
 system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
 user_prompt: \nWhat's that? ASSISTANT:
 This image captures a lively scene of a group of 14 llamas in a grassy field. The llamas, with their distinctive black and white coats, are standing and walking in a line, seemingly engaged in a social activity. One
 of them, possibly the first in the line, has its back turned, perhaps observing something in the distance.
 The llama in the front of the line stands out due to its black and white coloring, which is quite unusual for llama patterns. The llama in the front also seems to be more aware of its surroundings, as it faces the camera, giving a sense of engagement with the viewer.
 The image is taken from the side of the llama, providing a clear view of the llama in the front and its companions. The lameness in the llama in
 front is not visible, indicating that it might not be the main focus of the photo.
 The background of the image features a grassy field, with a fence and a tree visible in the distance. The tree appears to be bare, suggesting that it might be during a time of year when most trees are dormant or have shed their leaves.
 llama_print_timings:        load time =    7015.35 ms
 llama_print_timings:      sample time =      10.61 ms /   256 runs   (    0.04 ms per token, 24119.09 tokens per second)
 llama_print_timings: prompt eval time =    2052.45 ms /   191 tokens (   10.75 ms per token,    93.06 tokens per second)
 llama_print_timings:        eval time =    7259.43 ms /   255 runs   (   28.47 ms per token,    35.13 tokens per second)
 llama_print_timings:       total time =   14371.19 ms /   446 tokens
 ```
 ## TODO
@ -191,5 +373,5 @@ The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quic
 ## contributor
 ```sh
-zhangjidong05, yangyang260, huyiming03, chenxiaotao03
+zhangjidong05, yangyang260, huyiming03, chenxiaotao03, ZiangWu-77
 ```
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@ -835,9 +835,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
            // weight ne = [3, 3, 2048, 1]
            struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
            peg_0 = ggml_add(ctx0, peg_0, mlp_2);
            peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
            peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
            mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
            peg_0 = ggml_add(ctx0, peg_0, mlp_2);
            peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
            embeddings = peg_0;
        }
@ -1755,7 +1756,7 @@ int clip_n_patches(const struct clip_ctx * ctx) {
    int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
-    if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
+    if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
        n_patches /= 4;
    }
--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -296,7 +296,9 @@ These options help improve the performance and memory usage of the LLaMA models.
 ### Batch Size
-   `-b N, --batch-size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
+-   `-b N, --batch-size N`: Set the batch size for prompt processing (default: `2048`). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
 - `-ub N`, `--ubatch-size N`: physical maximum batch size. This is for pipeline parallelization. Default: `512`.
 ### Prompt Caching
--- a/examples/server/bench/bench.py
+++ b/examples/server/bench/bench.py
@ -0,0 +1,303 @@
 import argparse
 import json
 import os
 import re
 import signal
 import socket
 import subprocess
 import sys
 import threading
 import time
 import traceback
 from contextlib import closing
 from datetime import datetime
 import matplotlib
 import matplotlib.dates
 import matplotlib.pyplot as plt
 import requests
 def main(args_in: list[str] | None = None) -> None:
    """Run one server benchmark: start the server, run k6, export the results.

    Steps, in order:
      1. parse the command line (``args_in`` or ``sys.argv`` when it is None);
      2. start the server and wait until it listens (exit status 1 on failure);
      3. run the k6 scenario and write every numeric k6 metric, rounded to two
         decimals, to ``results.github.env``;
      4. stop the server;
      5. if something listens on port 9090, query it as a Prometheus server and
         write ``<metric>.json``, ``<metric>.jpg`` and ``<metric>.mermaid`` for
         each exported metric;
      6. append the benchmark summary to ``results.github.env``.

    The process exits with status 1 when the k6 results could not be produced
    or parsed; the error itself is printed where it happens.
    """
    parser = argparse.ArgumentParser(description="Start server benchmark scenario")
    parser.add_argument("--name", type=str, help="Bench name", required=True)
    parser.add_argument("--runner-label", type=str, help="Runner label", required=True)
    parser.add_argument("--branch", type=str, help="Branch name", default="detached")
    parser.add_argument("--commit", type=str, help="Commit name", default="dirty")
    parser.add_argument("--host", type=str, help="Server listen host", default="0.0.0.0")
    parser.add_argument("--port", type=int, help="Server listen port", default=8080)
    parser.add_argument("--model-path-prefix", type=str, help="Prefix where to store the model files", default="models")
    parser.add_argument("--n-prompts", type=int,
                        help="SERVER_BENCH_N_PROMPTS: total prompts to randomly select in the benchmark", required=True)
    parser.add_argument("--max-prompt-tokens", type=int,
                        help="SERVER_BENCH_MAX_PROMPT_TOKENS: maximum prompt tokens to filter out in the dataset",
                        required=True)
    parser.add_argument("--max-tokens", type=int,
                        help="SERVER_BENCH_MAX_CONTEXT: maximum context size of the completions request to filter out in the dataset: prompt + predicted tokens",
                        required=True)
    parser.add_argument("--hf-repo", type=str, help="Hugging Face model repository", required=True)
    parser.add_argument("--hf-file", type=str, help="Hugging Face model file", required=True)
    parser.add_argument("-ngl", "--n-gpu-layers", type=int, help="layers to the GPU for computation", required=True)
    parser.add_argument("--ctx-size", type=int, help="Set the size of the prompt context", required=True)
    parser.add_argument("--parallel", type=int, help="Set the number of slots for process requests", required=True)
    parser.add_argument("--batch-size", type=int, help="Set the batch size for prompt processing", required=True)
    parser.add_argument("--ubatch-size", type=int, help="physical maximum batch size", required=True)
    parser.add_argument("--scenario", type=str, help="Scenario to run", required=True)
    parser.add_argument("--duration", type=str, help="Bench scenario", required=True)
    args = parser.parse_args(args_in)

    # Lower bound of the Prometheus range query issued further down.
    start_time = time.time()

    # Start the server and performance scenario
    try:
        server_process = start_server(args)
    except Exception:
        print("bench: server start error :")
        traceback.print_exc(file=sys.stdout)
        sys.exit(1)

    # start the benchmark
    # Both names are read after the try block, so they must exist even when
    # k6 fails before any result has been parsed.
    iterations = 0
    bench_results = None
    try:
        start_benchmark(args)

        with open("results.github.env", 'w') as github_env:
            # parse output
            with open('k6-results.json', 'r') as results_file:
                # Load JSON data from file
                data = json.load(results_file)
            for metric_name in data['metrics']:
                for metric_metric in data['metrics'][metric_name]:
                    value = data['metrics'][metric_name][metric_metric]
                    if isinstance(value, (float, int)):
                        value = round(value, 2)
                        # Store the rounded value back: the summary below reads it.
                        data['metrics'][metric_name][metric_metric] = value
                        github_env.write(
                            f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n")

        # 140 chars max for commit status description
        bench_results = {
            "req": {
                "p90": data['metrics']["http_req_duration"]["p(90)"],
                "avg": data['metrics']["http_req_duration"]["avg"],
            },
            "pp": {
                "p90": data['metrics']["llamacpp_prompt_tokens"]["p(90)"],
                "avg": data['metrics']["llamacpp_prompt_tokens"]["avg"],
            },
            "tg": {
                "p90": data['metrics']["llamacpp_tokens_second"]["p(90)"],
                "avg": data['metrics']["llamacpp_tokens_second"]["avg"],
            },
        }
        iterations = data['root_group']['checks']['success completion']['passes']
    except Exception:
        print("bench: error :")
        traceback.print_exc(file=sys.stdout)

    # Stop the server
    if server_process:
        try:
            print(f"bench: shutting down server pid={server_process.pid} ...")
            if os.name == 'nt':
                interrupt = signal.CTRL_C_EVENT
            else:
                interrupt = signal.SIGINT
            server_process.send_signal(interrupt)
            server_process.wait(0.5)

        except subprocess.TimeoutExpired:
            print(f"server still alive after 500ms, force-killing pid={server_process.pid} ...")
            server_process.kill()  # SIGKILL
            server_process.wait()

        # Wait until the port is released.
        while is_server_listening(args.host, args.port):
            time.sleep(0.1)

    title = (f"llama.cpp {args.name} on {args.runner_label}\n "
             f"duration={args.duration} {iterations} iterations")
    xlabel = (f"{args.hf_repo}/{args.hf_file}\n"
              f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size} pp={args.max_prompt_tokens} pp+tg={args.max_tokens}\n"
              f"branch={args.branch} commit={args.commit}")

    # Prometheus
    end_time = time.time()
    if is_server_listening("0.0.0.0", 9090):
        metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds',
                   'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred']

        for metric in metrics:
            resp = requests.get("http://localhost:9090/api/v1/query_range",
                                params={'query': 'llamacpp:' + metric, 'start': start_time, 'end': end_time, 'step': 2})

            # Keep the raw response on disk whatever its status.
            with open(f"{metric}.json", 'w') as metric_json:
                metric_json.write(resp.text)

            if resp.status_code != 200:
                print(f"bench: unable to extract prometheus metric {metric}: {resp.text}")
                continue

            series = resp.json()['data']['result']
            if not series:
                # Empty result list: there is nothing to plot for this metric.
                print(f"bench: no prometheus data for metric {metric}")
                continue

            values = series[0]['values']
            timestamps, metric_values = zip(*values)
            metric_values = [float(value) for value in metric_values]
            timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps]
            plt.figure(figsize=(16, 10), dpi=80)
            plt.plot(timestamps_dt, metric_values, label=metric)
            plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7)
            plt.yticks(fontsize=12, alpha=.7)

            ylabel = f"llamacpp:{metric}"
            plt.title(title,
                      fontsize=14, wrap=True)
            plt.grid(axis='both', alpha=.3)
            plt.ylabel(ylabel, fontsize=22)
            plt.xlabel(xlabel, fontsize=14, wrap=True)
            plt.gca().xaxis.set_major_locator(matplotlib.dates.MinuteLocator())
            plt.gca().xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m-%d %H:%M:%S"))
            plt.gcf().autofmt_xdate()

            # Remove borders
            plt.gca().spines["top"].set_alpha(0.0)
            plt.gca().spines["bottom"].set_alpha(0.3)
            plt.gca().spines["right"].set_alpha(0.0)
            plt.gca().spines["left"].set_alpha(0.3)

            # Save the plot as a jpg image
            plt.savefig(f'{metric}.jpg', dpi=60)
            plt.close()

            # Mermaid format in case images upload failed
            with open(f"{metric}.mermaid", 'w') as mermaid_f:
                mermaid = (
                f"""---
 config:
    xyChart:
        titleFontSize: 12
        width: 900
        height: 600
    themeVariables:
        xyChart:
            titleColor: "#000000"
 ---
 xychart-beta
    title "{title}"
    y-axis "llamacpp:{metric}"
    x-axis "llamacpp:{metric}" {int(min(timestamps))} --> {int(max(timestamps))}
    line [{', '.join([str(round(float(value), 2)) for value in metric_values])}]
                    """)
                mermaid_f.write(mermaid)

    if bench_results is None:
        # The failure was already reported above; fail the run explicitly.
        print("bench: no benchmark results available, see the error above")
        sys.exit(1)

    with open("results.github.env", 'a') as github_env:
        github_env.write(f"BENCH_RESULTS={json.dumps(bench_results, indent=None, separators=(',', ':') )}\n")
        github_env.write(f"BENCH_ITERATIONS={iterations}\n")

        # One "KEY=value" entry per line: flatten the multi-line labels.
        title = title.replace('\n', ' ')
        xlabel = xlabel.replace('\n', ' ')
        github_env.write(f"BENCH_GRAPH_TITLE={title}\n")
        github_env.write(f"BENCH_GRAPH_XLABEL={xlabel}\n")
 def start_benchmark(args):
    """Run the k6 scenario and wait for it to finish.

    The k6 executable is ``k6`` unless the ``BENCH_K6_BIN_PATH`` environment
    variable names another one. The dataset limits are handed to the scenario
    through the ``SERVER_BENCH_*`` environment variables and the summary is
    exported to ``k6-results.json``.

    The command is executed directly from an argument list, with the variables
    passed through ``env``. No shell is involved, so arguments containing
    spaces or shell metacharacters are passed through unchanged and the call
    does not depend on POSIX ``VAR=value command`` syntax.

    Raises:
        Exception: when k6 cannot be started or exits with a non-zero status.
    """
    k6_path = os.environ.get('BENCH_K6_BIN_PATH', 'k6')
    k6_args = [
        'run', args.scenario,
        '--no-color',
        '--duration', args.duration,
        '--iterations', args.n_prompts,
        '--vus', args.parallel,
        '--summary-export', 'k6-results.json',
    ]
    bench_env = {
        'SERVER_BENCH_N_PROMPTS': str(args.n_prompts),
        'SERVER_BENCH_MAX_PROMPT_TOKENS': str(args.max_prompt_tokens),
        'SERVER_BENCH_MAX_CONTEXT': str(args.max_tokens),
    }
    command = [str(arg) for arg in [k6_path, *k6_args]]

    # Log the same copy-pasteable command line as before.
    env_prefix = ' '.join(f"{key}={value}" for key, value in bench_env.items())
    print(f"bench: starting k6 with: {env_prefix} {' '.join(command)}")

    try:
        k6_completed = subprocess.run(command, env={**os.environ, **bench_env},
                                      stdout=sys.stdout, stderr=sys.stderr)
    except OSError as err:
        # e.g. the k6 binary does not exist or is not executable
        raise Exception("bench: unable to run k6") from err
    if k6_completed.returncode != 0:
        raise Exception("bench: unable to run k6")
 def start_server(args):
    """Start the server and block until it accepts connections.

    The port ``args.host:args.port`` is probed every 0.5 s, at most 20 times
    (40 when the ``GITHUB_ACTIONS`` environment variable is set).

    Returns:
        The ``subprocess.Popen`` object of the running server.

    Raises:
        AssertionError: when the server exited or did not start listening in
            time. A server that is still running is killed first so that it is
            not left behind.
    """
    server_process = start_server_background(args)

    attempts = 0
    max_attempts = 20
    if 'GITHUB_ACTIONS' in os.environ:
        max_attempts *= 2

    while not is_server_listening(args.host, args.port):
        attempts += 1
        # poll() returns the exit status once the child has terminated.
        server_exited = server_process.poll() is not None
        if server_exited or attempts > max_attempts:
            if not server_exited:
                server_process.kill()
                server_process.wait()
            # Explicit raise: an `assert` statement is removed under `python -O`.
            raise AssertionError("server not started")
        print("bench:     waiting for server to start ...")
        time.sleep(0.5)

    print("bench: server started.")
    return server_process
 def start_server_background(args):
    """Spawn the server process and forward its output, without waiting for it.

    The binary is ``../../../build/bin/server`` unless the
    ``LLAMA_SERVER_BIN_PATH`` environment variable names another one. The
    directory of the model file is created when missing. Two threads copy the
    child's stdout and stderr to this process' stdout and stderr.

    Returns:
        The ``subprocess.Popen`` object of the spawned server.
    """
    # Start the server
    server_path = os.environ.get('LLAMA_SERVER_BIN_PATH', '../../../build/bin/server')
    server_args = [
        '--host', args.host,
        '--port', args.port,
    ]
    model_file = args.model_path_prefix + os.path.sep + args.hf_file
    # exist_ok avoids the race between an existence check and the creation.
    os.makedirs(os.path.dirname(model_file), exist_ok=True)
    server_args.extend(['--model', model_file])
    server_args.extend(['--hf-repo', args.hf_repo])
    server_args.extend(['--hf-file', args.hf_file])
    server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
    server_args.extend(['--ctx-size', args.ctx_size])
    server_args.extend(['--parallel', args.parallel])
    server_args.extend(['--batch-size', args.batch_size])
    server_args.extend(['--ubatch-size', args.ubatch_size])
    server_args.extend(['--n-predict', args.max_tokens * 2])
    server_args.extend(['--defrag-thold', "0.1"])
    server_args.append('--cont-batching')
    server_args.append('--metrics')
    server_args.extend(['--log-format', "text"])
    command = [str(arg) for arg in [server_path, *server_args]]
    print(f"bench: starting server with: {' '.join(command)}")
    server_process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)

    def server_log(in_stream, out_stream):
        # Copy the stream line by line until EOF. Undecodable bytes are
        # replaced instead of raising: an exception would end this thread, the
        # pipe would stop being drained and could fill up.
        for line in iter(in_stream.readline, b''):
            print(line.decode('utf-8', errors='replace'), end='', file=out_stream)

    thread_stdout = threading.Thread(target=server_log, args=(server_process.stdout, sys.stdout))
    thread_stdout.start()
    thread_stderr = threading.Thread(target=server_log, args=(server_process.stderr, sys.stderr))
    thread_stderr.start()

    return server_process
 def is_server_listening(server_fqdn, server_port):
    """Return True when a TCP connection to server_fqdn:server_port succeeds.

    A line is printed when the connection succeeds. The probing socket is
    always closed before returning.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # connect_ex() reports failure as an error code instead of raising;
        # 0 means the connection was established.
        connected = probe.connect_ex((server_fqdn, server_port)) == 0
        if connected:
            print(f"server is listening on {server_fqdn}:{server_port}...")
        return connected
    finally:
        probe.close()
 def escape_metric_name(metric_name):
    """Upper-case metric_name and replace each character outside A-Z / 0-9 with '_'."""
    allowed = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    return ''.join(char if char in allowed else '_' for char in metric_name.upper())
 # Run the benchmark only when this file is executed as a script, not when it is imported.
 if __name__ == '__main__':
    main()
--- a/examples/server/bench/prometheus.yml
+++ b/examples/server/bench/prometheus.yml
@ -0,0 +1,9 @@
 global:
  scrape_interval:     10s
  external_labels:
    llamacpp: 'server'
 scrape_configs:
  - job_name: 'llama.cpp server'
    static_configs:
      - targets: ['localhost:8080']
--- a/examples/server/bench/requirements.txt
+++ b/examples/server/bench/requirements.txt
@ -0,0 +1,2 @@
 matplotlib
 requests
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -3566,6 +3566,7 @@ int main(int argc, char ** argv) {
    sigemptyset (&sigint_action.sa_mask);
    sigint_action.sa_flags = 0;
    sigaction(SIGINT, &sigint_action, NULL);
    sigaction(SIGTERM, &sigint_action, NULL);
 #elif defined (_WIN32)
    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@ -1114,7 +1114,10 @@ def start_server_background(context):
        server_args.append('--verbose')
    if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
        server_args.extend(['--log-format', "text"])
-    print(f"starting server with: {context.server_path} {server_args}")
+
    args = [str(arg) for arg in [context.server_path, *server_args]]
    print(f"bench: starting server with: {' '.join(args)}")
    flags = 0
    if 'nt' == os.name:
        flags |= subprocess.DETACHED_PROCESS
@ -1130,16 +1133,14 @@ def start_server_background(context):
        [str(arg) for arg in [context.server_path, *server_args]],
        **pkwargs)
-    def log_stdout(process):
+    def server_log(in_stream, out_stream):
-        for line in iter(process.stdout.readline, b''):
+        for line in iter(in_stream.readline, b''):
-            print(line.decode('utf-8'), end='')
+            print(line.decode('utf-8'), end='', file=out_stream)
-    thread_stdout = threading.Thread(target=log_stdout, args=(context.server_process,))
+
    thread_stdout = threading.Thread(target=server_log, args=(context.server_process.stdout, sys.stdout))
    thread_stdout.start()
-    def log_stderr(process):
+    thread_stderr = threading.Thread(target=server_log, args=(context.server_process.stderr, sys.stderr))
        for line in iter(process.stderr.readline, b''):
            print(line.decode('utf-8'), end='', file=sys.stderr)
    thread_stderr = threading.Thread(target=log_stderr, args=(context.server_process,))
    thread_stderr.start()
    print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")
--- a/flake.nix
+++ b/flake.nix
@ -145,6 +145,7 @@
            # the same path you would with an overlay.
            legacyPackages = {
              llamaPackages = pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
              llamaPackagesWindows = pkgs.pkgsCross.mingwW64.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
              llamaPackagesCuda = pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
              llamaPackagesRocm = pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
            };
@ -155,6 +156,7 @@
              {
                default = config.legacyPackages.llamaPackages.llama-cpp;
                vulkan = config.packages.default.override { useVulkan = true; };
                windows = config.legacyPackages.llamaPackagesWindows.llama-cpp;
              }
              // lib.optionalAttrs pkgs.stdenv.isLinux {
                opencl = config.packages.default.override { useOpenCL = true; };
@ -168,9 +170,14 @@
              };
            # Packages exposed in `.#checks` will be built by the CI and by
-            # `nix flake check`. Currently we expose all packages, but we could
+            # `nix flake check`.
-            # make more granular choices
+            #
-            checks = config.packages;
+            # We could test all outputs e.g. as `checks = config.packages`.
            #
            # TODO: Build more once https://github.com/ggerganov/llama.cpp/issues/6346 has been addressed
            checks = {
              inherit (config.packages) default vulkan;
            };
          };
      };
 }
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@ -2951,7 +2951,7 @@ namespace dpct
 #include "ggml-common.h"
 static int g_ggml_sycl_debug=0;
-#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0)
+#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) fprintf(stderr, __VA_ARGS__);}while(0)
 #define CHECK_TRY_ERROR(expr)                                                  \
  [&]() {                                                                      \
@ -12851,6 +12851,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
 }
 void ggml_backend_sycl_print_sycl_devices() {
    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
    int device_count = dpct::dev_mgr::instance().device_count();
    std::map<std::string, size_t> DeviceNums;
    fprintf(stderr, "found %d SYCL devices:\n", device_count);
@ -12908,7 +12909,9 @@ static void ggml_init_sycl() try {
    static bool initialized = false;
    if (!initialized) {
        fprintf(stderr, "[SYCL] call ggml_init_sycl\n");
        g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
        fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
 #if defined(GGML_SYCL_F16)
@ -16022,6 +16025,7 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
 }
 GGML_API GGML_CALL void   ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
    GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
    for(int i=0;i<max_len;i++) id_list[i] = -1;
    if (!g_sycl_gpu_mgr) {
@ -16056,6 +16060,7 @@ catch (sycl::exception const &exc) {
 GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
                                      size_t description_size) try {
    GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
    dpct::device_info prop;
    int device_id = g_sycl_gpu_mgr->gpus[device];
    SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
@ -16070,6 +16075,7 @@ catch (sycl::exception const &exc) {
 GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
                                                   size_t *total) try {
    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
    ggml_sycl_set_device(device);
    /*
@ -16421,7 +16427,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
 };
 ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
-    ggml_init_sycl();
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
    if (device_index>=g_device_count or device_index<0) {
        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
            device_index, g_device_count-1);
@ -16791,6 +16798,7 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
 };
 GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n");
    ggml_init_sycl();
    // FIXME: this is not thread safe
    static std::map<std::array<float, GGML_SYCL_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
@ -16863,6 +16871,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm
 }
 ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_host_buffer_type\n");
    static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = {
        /* .iface    = */ {
            /* .get_name         = */ ggml_backend_sycl_host_buffer_type_name,
@ -17159,6 +17168,7 @@ static ggml_guid_t ggml_backend_sycl_guid() {
 }
 GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
    ggml_init_sycl();
    check_allow_gpu_index(device);
@ -17185,6 +17195,7 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) {
 }
 // Returns the GPU count reported by the global SYCL GPU manager.
 // The manager (g_sycl_gpu_mgr) is created here when it does not exist yet.
 GGML_CALL int ggml_backend_sycl_get_device_count() {
    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
    if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
    return g_sycl_gpu_mgr->get_gpu_count();
 }
@ -17197,16 +17208,21 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params,
 }
 // Returns the result of g_sycl_gpu_mgr->get_index() for the given device id.
 // NOTE(review): g_sycl_gpu_mgr is dereferenced without a null check here;
 // presumably it must have been created before this call - confirm.
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_index\n");
    return g_sycl_gpu_mgr->get_index(device_id);
 }
 // Returns the entry of the GPU manager's `gpus` container at device_index.
 // NOTE(review): neither g_sycl_gpu_mgr nor device_index is validated here;
 // presumably the caller guarantees both - confirm.
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_id\n");
    return g_sycl_gpu_mgr->gpus[device_index];
 }
 GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
-    GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
+    ggml_init_sycl();
    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_single_device_mode\n");
    fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
    GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
    if (g_sycl_gpu_mgr) {
        delete g_sycl_gpu_mgr;
    }
@ -17217,6 +17233,9 @@ GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id
 }
 GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
    ggml_init_sycl();
    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_mul_device_mode\n");
    if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
        return;
    }
--- a/ggml.c
+++ b/ggml.c
@ -2938,7 +2938,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
        data_size *= ne[i];
    }
-    GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
+    GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src));
    void * data = view_src != NULL ? view_src->data : NULL;
    if (data != NULL) {
--- a/llama.cpp
+++ b/llama.cpp
@ -9152,8 +9152,9 @@ struct llm_build_context {
            if (il == n_layer - 1) {
                // skip computing output for unused tokens
                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                cur     = ggml_get_rows(ctx0,     cur, inp_out_ids);
-                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+                inpL    = ggml_get_rows(ctx0,    inpL, inp_out_ids);
                ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
            }
            struct ggml_tensor * attn_out = cur;
--- a/llama.h
+++ b/llama.h
@ -60,9 +60,9 @@ extern "C" {
    enum llama_vocab_type {
        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-        LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
+        LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
-        LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
+        LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
-        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
+        LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
    };
    // note: these values should be synchronized with ggml_rope