Merge remote-tracking branch 'origin/master' into sl/micro-batching

This commit is contained in:
slaren 2024-01-24 21:12:13 +01:00
commit f69ab89520
37 changed files with 2473 additions and 856 deletions

View file

@ -0,0 +1,26 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
ARG UBUNTU_VERSION=22.04
FROM intel/hpckit:$ONEAPI_VERSION as build
RUN apt-get update && \
apt-get install -y git
WORKDIR /app
COPY . .
# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
RUN mkdir build && \
cd build && \
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
cmake --build . --config Release --target main server
FROM ubuntu:$UBUNTU_VERSION as runtime
COPY --from=build /app/build/bin/main /main
COPY --from=build /app/build/bin/server /server
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/main" ]

View file

@ -7,6 +7,18 @@
{ system, ... }:
{
_module.args = {
# Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
# again, the below creates several nixpkgs instances which the
# flake-centric CLI will be forced to evaluate e.g. on `nix flake show`.
#
# This is currently "slow" and "expensive", on a certain scale.
# This also isn't "right" in that this hinders dependency injection at
# the level of flake inputs. This might get removed in the foreseeable
# future.
#
# Note that you can use these expressions without Nix
# (`pkgs.callPackage ./devops/nix/scope.nix { }` is the entry point).
pkgsCuda = import inputs.nixpkgs {
inherit system;
# Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,

View file

@ -73,6 +73,7 @@ let
ps: [
ps.numpy
ps.sentencepiece
ps.tiktoken
ps.torchWithoutCuda
ps.transformers
]
@ -114,14 +115,22 @@ effectiveStdenv.mkDerivation (
pname = "llama-cpp${pnameSuffix}";
version = llamaVersion;
# Note: none of the files discarded here are visible in the sandbox or
# affect the output hash. This also means they can be modified without
# triggering a rebuild.
src = lib.cleanSourceWith {
filter =
name: type:
!(builtins.any (_: _) [
let
noneOf = builtins.all (x: !x);
baseName = baseNameOf name;
in
noneOf [
(lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
(name == "README.md") # Ignore *.md changes whe computing outPaths
(lib.hasPrefix "." name) # Skip hidden files and directories
]);
(lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
(lib.hasPrefix "." baseName) # Skip hidden files and directories
(baseName == "flake.lock")
];
src = lib.cleanSource ../../.;
};
@ -159,7 +168,7 @@ effectiveStdenv.mkDerivation (
cmakeFlags =
[
(cmakeBool "LLAMA_NATIVE" true)
(cmakeBool "LLAMA_NATIVE" false)
(cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" true)
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
@ -216,6 +225,9 @@ effectiveStdenv.mkDerivation (
description = "contains numpy and sentencepiece";
buildInputs = [ llama-python ];
inputsFrom = [ finalAttrs.finalPackage ];
shellHook = ''
addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
'';
};
shell-extra = mkShell {

View file

@ -4,6 +4,10 @@
llamaVersion ? "0.0.0",
}:
# We're using `makeScope` instead of just writing out an attrset
# because it allows users to apply overlays later using `overrideScope'`.
# Cf. https://noogle.dev/f/lib/makeScope
lib.makeScope newScope (
self: {
inherit llamaVersion;

View file

@ -295,7 +295,7 @@ jobs:
OPENBLAS_VERSION: 0.3.23
OPENCL_VERSION: 2023.04.17
CLBLAST_VERSION: 1.6.0
SDE_VERSION: 9.21.1-2023-04-24
SDE_VERSION: 9.33.0-2024-01-07
strategy:
matrix:
@ -400,7 +400,7 @@ jobs:
id: cmake_test_sde
if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
run: |
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz"
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
# for some weird reason windows tar doesn't like sde tar.xz
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar

View file

@ -35,6 +35,7 @@ jobs:
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
steps:
- name: Check out the repo
uses: actions/checkout@v3

View file

@ -2,13 +2,20 @@ name: Nix aarch64 builds
on:
workflow_dispatch: # allows manual triggering
schedule:
# Rebuild daily rather than on every push because QEMU is expensive (e.g.
# 1.5h instead of minutes with the cold cache).
#
# randint(0, 59), randint(0, 23)
- cron: '26 12 * * *'
# But also rebuild if we touched any of the Nix expressions:
push:
branches:
- master
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
paths: ['**/*.nix', 'flake.lock']
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
paths: ['**/*.nix', 'flake.lock']
jobs:
nix-build-aarch64:

View file

@ -5,10 +5,8 @@ on:
push:
branches:
- master
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
jobs:
nix-eval:

View file

@ -108,6 +108,13 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STA
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
# add perf arguments
option(LLAMA_PERF "llama: enable perf" OFF)
if (LLAMA_PERF)
add_definitions(-DGGML_PERF)
endif()
# Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
@ -471,6 +478,11 @@ function(get_flags CCID CCVER)
if (CCVER VERSION_GREATER_EQUAL 8.1.0)
set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi)
endif()
elseif (CCID MATCHES "Intel")
# enable max optimization level when using Intel compiler
set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
add_link_options(-fuse-ld=lld -static-intel)
endif()
set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)

View file

@ -203,6 +203,23 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.prompt_cache_all = true;
} else if (arg == "--prompt-cache-ro") {
params.prompt_cache_ro = true;
} else if (arg == "-bf" || arg == "--binary-file") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::ifstream file(argv[i], std::ios::binary);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
invalid_param = true;
break;
}
// store the external file name in params
params.prompt_file = argv[i];
std::ostringstream ss;
ss << file.rdbuf();
params.prompt = ss.str();
fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
} else if (arg == "-f" || arg == "--file") {
if (++i >= argc) {
invalid_param = true;
@ -659,6 +676,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
if (params.logdir.back() != DIRECTORY_SEPARATOR) {
params.logdir += DIRECTORY_SEPARATOR;
}
} else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.logits_file = argv[i];
} else if (arg == "--perplexity" || arg == "--all-logits") {
params.logits_all = true;
} else if (arg == "--ppl-stride") {
@ -695,6 +718,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.winogrande_tasks = std::stoi(argv[i]);
} else if (arg == "--multiple-choice") {
params.multiple_choice = true;
} else if (arg == "--multiple-choice-tasks") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.multiple_choice_tasks = std::stoi(argv[i]);
} else if (arg == "--kl-divergence") {
params.kl_divergence = true;
} else if (arg == "--ignore-eos") {
params.ignore_eos = true;
} else if (arg == "--no-penalize-nl") {
@ -894,6 +927,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
printf(" -f FNAME, --file FNAME\n");
printf(" prompt file to start generation.\n");
printf(" -bf FNAME, --binary-file FNAME\n");
printf(" binary file containing multiple choice tasks.\n");
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
@ -944,6 +979,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n");
printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n");
printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base");
printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);

View file

@ -92,6 +92,7 @@ struct gpt_params {
std::string input_suffix = ""; // string to suffix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string logdir = ""; // directory in which to save YAML log files
std::string logits_file = ""; // file for saving *all* logits
std::vector<llama_model_kv_override> kv_overrides;
@ -109,6 +110,11 @@ struct gpt_params {
bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
bool kl_divergence = false; // compute KL-divergence
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs

View file

@ -10,7 +10,7 @@ import re
import sys
from enum import IntEnum
from pathlib import Path
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast
import numpy as np
import torch
@ -289,6 +289,58 @@ class Model:
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_qwen(self):
dir_model = self.dir_model
hparams = self.hparams
tokens: list[bytearray] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
vocab_size = hparams["vocab_size"]
assert max(tokenizer.get_vocab().values()) < vocab_size
merges = []
vocab = {}
mergeable_ranks = tokenizer.mergeable_ranks
for token, rank in mergeable_ranks.items():
vocab[QwenModel.token_bytes_to_string(token)] = rank
if len(token) == 1:
continue
merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
assert len(merged) == 2
merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
# for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
added_vocab = tokenizer.special_tokens
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
for i in range(vocab_size):
if i not in reverse_vocab:
pad_token = f"[PAD{i}]".encode("utf-8")
tokens.append(bytearray(pad_token))
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.CONTROL)
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
special_vocab.merges = merges
# only add special tokens when they were not already loaded from config.json
if len(special_vocab.special_token_ids) == 0:
special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
# this one is usually not in config.json anyway
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_sentencepiece(self):
from sentencepiece import SentencePieceProcessor
@ -487,7 +539,8 @@ class MPTModel(Model):
# map tensor names
if "scales" in name:
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
new_name = new_name.replace("scales", "act.scales")
if new_name is not None:
new_name = new_name.replace("scales", "act.scales")
else:
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
@ -876,6 +929,13 @@ class PersimmonModel(Model):
class StableLMModel(Model):
def set_vocab(self):
if (self.dir_model / "tokenizer.json").is_file():
self._set_vocab_gpt2()
else:
# StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
self._set_vocab_qwen()
def set_gguf_parameters(self):
hparams = self.hparams
block_count = hparams["num_hidden_layers"]
@ -904,7 +964,7 @@ class QwenModel(Model):
return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
@staticmethod
def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:
def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
parts = [bytes([b]) for b in token]
while True:
min_idx = None
@ -921,52 +981,7 @@ class QwenModel(Model):
return parts
def set_vocab(self):
dir_model = self.dir_model
hparams = self.hparams
tokens: list[bytearray] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
vocab_size = hparams["vocab_size"]
assert max(tokenizer.get_vocab().values()) < vocab_size
merges = []
vocab = {}
mergeable_ranks = tokenizer.mergeable_ranks
for token, rank in mergeable_ranks.items():
vocab[self.token_bytes_to_string(token)] = rank
if len(token) == 1:
continue
merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
assert len(merged) == 2
merges.append(' '.join(map(self.token_bytes_to_string, merged)))
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in vocab.items()}
added_vocab = tokenizer.special_tokens
for i in range(vocab_size):
if i not in reverse_vocab:
pad_token = f"[PAD{i}]".encode("utf-8")
tokens.append(bytearray(pad_token))
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.CONTROL)
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
special_vocab.merges = merges
special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
self._set_vocab_qwen()
def set_gguf_parameters(self):
self.gguf_writer.add_name("Qwen")
@ -1285,7 +1300,7 @@ def main() -> None:
if args.awq_path:
sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
from awq.apply_awq import add_scale_weights
from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
tmp_model_path = args.model / "weighted_model"
dir_model = tmp_model_path
if tmp_model_path.is_dir():

View file

@ -2,6 +2,7 @@
from __future__ import annotations
import argparse
import os
import struct
import sys
from enum import IntEnum
@ -9,7 +10,6 @@ from pathlib import Path
import numpy as np
import os
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
@ -371,15 +371,11 @@ def handle_metadata(cfg, hp):
params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
else:
raise ValueError('Unable to load metadata')
vocab = convert.load_vocab(
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
cfg.vocabtype)
# FIXME: Respect cfg.vocab_dir?
svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
load_merges = cfg.vocabtype == 'bpe',
n_vocab = vocab.vocab_size)
vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
vocab_factory = convert.VocabFactory(vocab_path)
vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir)
convert.check_vocab_size(params, vocab)
return (params, vocab, svocab)
return params, vocab, special_vocab
def handle_args():

View file

@ -5,17 +5,16 @@ import json
import os
import struct
import sys
from pathlib import Path
from typing import Any, BinaryIO, Sequence
import numpy as np
import torch
from pathlib import Path
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
@ -60,7 +59,14 @@ if __name__ == '__main__':
input_model = os.path.join(sys.argv[1], "adapter_model.bin")
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
model = torch.load(input_model, map_location="cpu")
if os.path.exists(input_model):
model = torch.load(input_model, map_location="cpu")
else:
input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
# lazy import load_file only if lora is in safetensors format.
from safetensors.torch import load_file
model = load_file(input_model, device="cpu")
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
if arch_name not in gguf.MODEL_ARCH_NAMES.values():

View file

@ -1,11 +1,13 @@
#!/usr/bin/env python3
import torch
import os
from pprint import pprint
import sys
import argparse
import os
import sys
from pathlib import Path
from pprint import pprint
import torch
from sentencepiece import SentencePieceProcessor
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
@ -69,7 +71,7 @@ def main():
persimmon_model = torch.load(args.ckpt_path)
hparams = persimmon_model['args']
pprint(hparams)
tensors = {}
tensors: dict[str, torch.Tensor] = {}
_flatten_dict(persimmon_model['model'], tensors, None)
arch = gguf.MODEL_ARCH.PERSIMMON

View file

@ -17,58 +17,28 @@ import signal
import struct
import sys
import time
import warnings
import zipfile
from abc import ABCMeta, abstractmethod
from argparse import ArgumentParser
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import (
IO,
TYPE_CHECKING,
Any,
Callable,
Iterable,
Literal,
Optional,
Tuple,
TypeVar,
)
from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
import numpy as np
from sentencepiece import SentencePieceProcessor
try:
from transformers import AutoTokenizer
except ModuleNotFoundError as e:
warnings.warn(f"Could not import AutoTokenizer from transformers: {e}")
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
# If NO_LOCAL_GGUF is not set, try to import gguf from the local gguf-py directory
if "NO_LOCAL_GGUF" not in os.environ:
# Use absolute path to the gguf-py directory
gguf_py_dir = str(Path(__file__).resolve().parent / "gguf-py")
print(gguf_py_dir) # NOTE: Remove this once path is verified after changes are completed
if gguf_py_dir not in sys.path:
sys.path.insert(1, gguf_py_dir)
if TYPE_CHECKING:
from typing import TypeAlias
# Import gguf module
try:
import gguf
except ModuleNotFoundError as e:
print(f"Could not import gguf: {e}")
sys.exit(1)
if TYPE_CHECKING: # NOTE: This isn't necessary.
from typing import TypeAlias # This can technically be omitted.
if hasattr(faulthandler, "register") and hasattr(signal, "SIGUSR1"):
if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
faulthandler.register(signal.SIGUSR1)
# NOTE: n-dimensional arrays should be directly referenced
NDArray: TypeAlias = "np.ndarray[Any, Any]"
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
# Why is this here? LLAMA and GPT are technically the only compatible ARCHs.
ARCH = gguf.MODEL_ARCH.LLAMA
DEFAULT_CONCURRENCY = 8
@ -78,7 +48,6 @@ DEFAULT_CONCURRENCY = 8
#
# TODO: Clean up and refactor data types
@dataclass(frozen=True)
class DataType:
name: str
@ -183,85 +152,65 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
@dataclass
class Params:
n_vocab: int
n_embd: int
n_layer: int
n_ctx: int
n_ff: int
n_head: int
n_head_kv: int
f_norm_eps: Optional[float] = None
n_experts: Optional[int] = None
n_experts_used: Optional[int] = None
n_vocab: int
n_embd: int
n_layer: int
n_ctx: int
n_ff: int
n_head: int
n_head_kv: int
n_experts: int | None = None
n_experts_used: int | None = None
f_norm_eps: float | None = None
rope_scaling_type: Optional[gguf.RopeScalingType] = None
f_rope_freq_base: Optional[float] = None
f_rope_scale: Optional[float] = None
n_orig_ctx: Optional[int] = None
rope_finetuned: Optional[bool] = None
rope_scaling_type: gguf.RopeScalingType | None = None
f_rope_freq_base: float | None = None
f_rope_scale: float | None = None
n_orig_ctx: int | None = None
rope_finetuned: bool | None = None
ftype: Optional[GGMLFileType] = None
ftype: GGMLFileType | None = None
# path to the directory containing the model files
path_model: Optional[Path] = None
path_model: Path | None = None
@staticmethod
def guessed(model: LazyModel) -> "Params":
def guessed(model: LazyModel) -> Params:
# try transformer naming first
n_vocab, n_embd = (
model["model.embed_tokens.weight"].shape
if "model.embed_tokens.weight" in model
else model["tok_embeddings.weight"].shape
)
n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
# try transformer naming first
if "model.layers.0.self_attn.q_proj.weight" in model:
n_layer = next(
i
for i in itertools.count()
if f"model.layers.{i}.self_attn.q_proj.weight" not in model
)
elif (
"model.layers.0.self_attn.W_pack.weight" in model
): # next: try baichuan naming
n_layer = next(
i
for i in itertools.count()
if f"model.layers.{i}.self_attn.W_pack.weight" not in model
)
n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
else:
n_layer = next(
i
for i in itertools.count()
if f"layers.{i}.attention.wq.weight" not in model
)
n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
if n_layer < 1:
raise Exception(
"failed to guess 'n_layer'. This model is unknown or unsupported.\n"
"Suggestion: provide 'config.json' of the model in the same directory containing model files."
)
raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
n_head = n_embd // 128 # guessed
n_mult = 256 # guessed
n_head = n_embd // 128 # guessed
n_mult = 256 # guessed
# TODO: verify this
n_ff = int(2 * (4 * n_embd) / 3)
n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)
return Params(
n_vocab=n_vocab,
n_embd=n_embd,
n_layer=n_layer,
n_ctx=-1,
n_ff=n_ff,
n_head=n_head,
n_head_kv=n_head,
f_norm_eps=1e-5,
n_vocab = n_vocab,
n_embd = n_embd,
n_layer = n_layer,
n_ctx = -1,
n_ff = n_ff,
n_head = n_head,
n_head_kv = n_head,
f_norm_eps = 1e-5,
)
@staticmethod
def load_transformers_config(model: LazyModel, config_path: Path) -> "Params":
def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path))
rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
@ -274,22 +223,20 @@ class Params:
rope_scaling_type = gguf.RopeScalingType.LINEAR
elif typ == "yarn":
rope_scaling_type = gguf.RopeScalingType.YARN
n_orig_ctx = rope_scaling["original_max_position_embeddings"]
rope_finetuned = rope_scaling["finetuned"]
n_orig_ctx = rope_scaling['original_max_position_embeddings']
rope_finetuned = rope_scaling['finetuned']
else:
raise NotImplementedError(f"Unknown rope scaling type: {typ}")
raise NotImplementedError(f'Unknown rope scaling type: {typ}')
if "max_sequence_length" in config:
n_ctx = config["max_sequence_length"]
elif "max_position_embeddings" in config:
n_ctx = config["max_position_embeddings"]
else:
raise Exception(
"failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
"Suggestion: provide 'config.json' of the model in the same directory containing model files."
)
raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
n_experts = None
n_experts = None
n_experts_used = None
if "num_local_experts" in config:
@ -297,30 +244,30 @@ class Params:
n_experts_used = config["num_experts_per_tok"]
return Params(
n_vocab=config["vocab_size"],
n_embd=config["hidden_size"],
n_layer=config["num_hidden_layers"],
n_ctx=n_ctx,
n_ff=config["intermediate_size"],
n_head=(n_head := config["num_attention_heads"]),
n_head_kv=config.get("num_key_value_heads", n_head),
n_experts=n_experts,
n_experts_used=n_experts_used,
f_norm_eps=config["rms_norm_eps"],
f_rope_freq_base=config.get("rope_theta"),
rope_scaling_type=rope_scaling_type,
f_rope_scale=f_rope_scale,
n_orig_ctx=n_orig_ctx,
rope_finetuned=rope_finetuned,
n_vocab = config["vocab_size"],
n_embd = config["hidden_size"],
n_layer = config["num_hidden_layers"],
n_ctx = n_ctx,
n_ff = config["intermediate_size"],
n_head = (n_head := config["num_attention_heads"]),
n_head_kv = config.get("num_key_value_heads", n_head),
n_experts = n_experts,
n_experts_used = n_experts_used,
f_norm_eps = config["rms_norm_eps"],
f_rope_freq_base = config.get("rope_theta"),
rope_scaling_type = rope_scaling_type,
f_rope_scale = f_rope_scale,
n_orig_ctx = n_orig_ctx,
rope_finetuned = rope_finetuned,
)
# LLaMA v2 70B params.json
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
@staticmethod
def load_torch_params(model: LazyModel, config_path: Path) -> "Params":
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path))
n_experts = None
n_experts = None
n_experts_used = None
f_rope_freq_base = None
@ -343,50 +290,50 @@ class Params:
if config.get("moe"):
n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
n_experts = config["moe"]["num_experts"]
n_experts = config["moe"]["num_experts"]
n_experts_used = config["moe"]["num_experts_per_tok"]
f_rope_freq_base = 1e6
return Params(
n_vocab=model["tok_embeddings.weight"].shape[0],
n_embd=config["dim"],
n_layer=config["n_layers"],
n_ctx=n_ctx,
n_ff=n_ff,
n_head=(n_head := config["n_heads"]),
n_head_kv=config.get("n_kv_heads", n_head),
n_experts=n_experts,
n_experts_used=n_experts_used,
f_norm_eps=config["norm_eps"],
f_rope_freq_base=config.get("rope_theta", f_rope_freq_base),
n_vocab = model["tok_embeddings.weight"].shape[0],
n_embd = config["dim"],
n_layer = config["n_layers"],
n_ctx = n_ctx,
n_ff = n_ff,
n_head = (n_head := config["n_heads"]),
n_head_kv = config.get("n_kv_heads", n_head),
n_experts = n_experts,
n_experts_used = n_experts_used,
f_norm_eps = config["norm_eps"],
f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
)
@staticmethod
def load(model_plus: ModelPlus) -> "Params":
hf_config_path = model_plus.paths[0].parent / "config.json"
def load(model_plus: ModelPlus) -> Params:
hf_config_path = model_plus.paths[0].parent / "config.json"
orig_config_path = model_plus.paths[0].parent / "params.json"
if hf_config_path.exists():
params = Params.load_transformers_config(model_plus.model, hf_config_path)
params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
elif orig_config_path.exists():
params = Params.load_torch_params(model_plus.model, orig_config_path)
elif model_plus.format != "none":
params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
elif model_plus.format != 'none':
params = Params.guessed(model_plus.model)
else:
raise ValueError("Cannot guess params when model format is none")
raise ValueError('Cannot guess params when model format is none')
params.path_model = model_plus.paths[0].parent
return params
class BpeVocab: # GPT
def __init__(
self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]
) -> None:
self.bpe_tokenizer = json.loads(
open(str(fname_tokenizer), encoding="utf-8").read()
)
#
# vocab
#
class BpeVocab:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
self.vocab = self.bpe_tokenizer["model"]["vocab"]
added_tokens: dict[str, int]
if fname_added_tokens is not None:
@ -394,34 +341,31 @@ class BpeVocab: # GPT
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
else:
# Fall back to trying to find the added tokens in tokenizer.json
tokenizer_json_file = fname_tokenizer.parent / "tokenizer.json"
tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
if not tokenizer_json_file.is_file():
added_tokens = {}
else:
tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
added_tokens = dict(
(item["content"], item["id"])
for item in tokenizer_json.get("added_tokens", [])
(item['content'], item['id'])
for item in tokenizer_json.get('added_tokens', [])
# Added tokens here can be duplicates of the main vocabulary.
if item["content"] not in self.bpe_tokenizer
)
if item['content'] not in self.bpe_tokenizer)
vocab_size: int = len(self.vocab)
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids:
expected_end_id = vocab_size + len(actual_ids) - 1
raise Exception(
f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}"
)
raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
self.added_tokens_dict = added_tokens
self.added_tokens_list = [text for (text, idx) in items]
self.added_tokens_dict = added_tokens
self.added_tokens_list = [text for (text, idx) in items]
self.vocab_size_base: int = vocab_size
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
@ -442,10 +386,8 @@ class BpeVocab: # GPT
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
class SentencePieceVocab: # LlaMa
def __init__(
self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]
) -> None:
class SentencePieceVocab:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
added_tokens: dict[str, int]
if fname_added_tokens is not None:
@ -455,23 +397,19 @@ class SentencePieceVocab: # LlaMa
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
new_tokens = {
id: piece for piece, id in added_tokens.items() if id >= vocab_size
}
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
actual_new_ids = sorted(new_tokens.keys())
actual_new_ids = sorted(new_tokens.keys())
if expected_new_ids != actual_new_ids:
raise ValueError(
f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}"
)
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
# Token pieces that were added to the base vocabulary.
self.added_tokens_dict = added_tokens
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
@ -512,11 +450,15 @@ class SentencePieceVocab: # LlaMa
class HfVocab:
def __init__(
self,
fname_tokenizer: Path,
fname_added_tokens: Optional[Path] = None,
) -> None:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
try:
from transformers import AutoTokenizer
except ImportError as e:
raise ImportError(
"To use HfVocab, please install the `transformers` package. "
"You can install it with `pip install transformers`."
) from e
print("fname_tokenizer:", fname_tokenizer)
# Allow the tokenizer to default to slow or fast versions.
# Explicitly set tokenizer to use local paths.
@ -529,7 +471,7 @@ class HfVocab:
# Initialize lists and dictionaries for added tokens
self.added_tokens_list = []
self.added_tokens_dict = dict()
self.added_tokens_ids = set()
self.added_tokens_ids = set()
# Process added tokens
for tok, tokidx in sorted(
@ -550,12 +492,12 @@ class HfVocab:
# Set vocabulary sizes
self.vocab_size_base = self.tokenizer.vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens
def hf_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = {
id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
}
@ -573,11 +515,9 @@ class HfVocab:
token_id, self.special_ids # Reuse already stored special IDs
)
def get_token_type(self, token_id: int, special_ids: set) -> gguf.TokenType:
def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
# Determine token type based on whether it's a special token
return (
gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
)
return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
def get_token_score(self, token_id: int) -> float:
# Placeholder for actual logic to determine the token's score
@ -589,7 +529,6 @@ class HfVocab:
if text in self.specials:
toktype = self.get_token_type(self.specials[text], self.special_ids)
score = self.get_token_score(self.specials[text])
else:
toktype = gguf.TokenType.USER_DEFINED
score = -1000.0
@ -783,7 +722,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
else:
model = merge_sharded([mp.model for mp in models_plus])
return ModelPlus(model, paths, format, vocab)
return ModelPlus(model, paths, format, vocab) # pytype: disable=wrong-arg-types
def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
@ -871,17 +810,13 @@ class LazyUnpickler(pickle.Unpickler):
CLASSES: dict[tuple[str, str], Any] = {
# getattr used here as a workaround for mypy not being smart enough to determine
# the staticmethods have a __func__ attribute.
("torch._tensor", "_rebuild_from_type_v2"): getattr(
rebuild_from_type_v2, "__func__"
),
("torch._utils", "_rebuild_tensor_v2"): getattr(
lazy_rebuild_tensor_v2, "__func__"
),
("torch", "BFloat16Storage"): LazyStorageKind(DT_BF16),
("torch", "HalfStorage"): LazyStorageKind(DT_F16),
("torch", "FloatStorage"): LazyStorageKind(DT_F32),
("torch", "IntStorage"): LazyStorageKind(DT_I32),
("torch", "Tensor"): LazyTensor,
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
('torch', 'IntStorage'): LazyStorageKind(DT_I32),
('torch', 'Tensor'): LazyTensor,
}
def find_class(self, module: str, name: str) -> Any:
@ -968,7 +903,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
executor_class = ProcessPoolExecutor
else:
executor_class = ThreadPoolExecutor
with executor_class(max_workers = max_workers) as executor:
with executor_class(max_workers=max_workers) as executor:
futures: list[concurrent.futures.Future[Out]] = []
done = False
for _ in range(concurrency):
@ -1022,12 +957,8 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
class OutputFile:
def __init__(
self, fname_out: Path, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE
) -> None:
self.gguf = gguf.GGUFWriter(
fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess
)
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
def add_meta_arch(self, params: Params) -> None:
name = "LLaMA"
@ -1036,21 +967,16 @@ class OutputFile:
if params.n_ctx == 4096:
name = "LLaMA v2"
elif params.path_model is not None:
name = str(params.path_model.parent).split("/")[-1]
name = str(params.path_model.parent).split('/')[-1]
self.gguf.add_name(name)
self.gguf.add_context_length(params.n_ctx)
self.gguf.add_embedding_length(params.n_embd)
self.gguf.add_block_count(params.n_layer)
self.gguf.add_feed_forward_length(params.n_ff)
self.gguf.add_name (name)
self.gguf.add_context_length (params.n_ctx)
self.gguf.add_embedding_length (params.n_embd)
self.gguf.add_block_count (params.n_layer)
self.gguf.add_feed_forward_length (params.n_ff)
self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
self.gguf.add_head_count(params.n_head)
self.gguf.add_head_count_kv(params.n_head_kv)
if params.f_norm_eps is None:
raise ValueError("f_norm_eps is None")
self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
self.gguf.add_head_count (params.n_head)
self.gguf.add_head_count_kv (params.n_head_kv)
if params.n_experts:
self.gguf.add_expert_count(params.n_experts)
@ -1058,6 +984,11 @@ class OutputFile:
if params.n_experts_used:
self.gguf.add_expert_used_count(params.n_experts_used)
if params.f_norm_eps:
self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
else:
raise ValueError('f_norm_eps is None')
if params.f_rope_freq_base is not None:
self.gguf.add_rope_freq_base(params.f_rope_freq_base)
@ -1089,7 +1020,7 @@ class OutputFile:
return tokenizer_model
def extract_vocabulary_from_model(self, vocab: Vocab) -> Tuple[list, list, list]:
def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
tokens = []
scores = []
toktypes = []
@ -1124,14 +1055,10 @@ class OutputFile:
def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
n_elements = int(np.prod(tensor.shape))
raw_dtype = getattr(tensor.data_type, "ggml_type", None)
data_type = (
getattr(tensor.data_type, "quantized_type", None) or tensor.data_type.dtype
)
raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
self.gguf.add_tensor_info(
name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype
)
self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype)
def write_meta(self) -> None:
self.gguf.write_header_to_file()
@ -1145,14 +1072,10 @@ class OutputFile:
@staticmethod
def write_vocab_only(
fname_out: Path,
params: Params,
vocab: Vocab,
svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
check_vocab_size(params, vocab, pad_vocab = pad_vocab)
of = OutputFile(fname_out, endianess=endianess)
@ -1180,14 +1103,8 @@ class OutputFile:
@staticmethod
def write_all(
fname_out: Path,
ftype: GGMLFileType,
params: Params,
model: LazyModel,
vocab: Vocab,
svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
@ -1207,26 +1124,19 @@ class OutputFile:
of.write_tensor_info()
# tensor data
ndarrays_inner = bounded_parallel_map(
OutputFile.do_item, model.items(), concurrency=concurrency
)
ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
if ftype == GGMLFileType.MostlyQ8_0:
ndarrays = bounded_parallel_map(
OutputFile.maybe_do_quantize,
ndarrays_inner,
concurrency=concurrency,
max_workers=concurrency,
OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
use_processpool_executor=True,
)
else:
ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
start = time.time()
for i, ((name, lazy_tensor), ndarray) in enumerate(
zip(model.items(), ndarrays)
):
for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
elapsed = time.time() - start
size = " x ".join(f"{dim:6d}" for dim in lazy_tensor.shape)
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
padi = len(str(len(model)))
print(
f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
@ -1363,7 +1273,7 @@ def load_some_model(path: Path) -> ModelPlus:
class VocabFactory:
def __init__(self, path: Path):
self.path = path
self.files = {
self.files: dict[str, Path | None] = {
"tokenizer.model": None,
"vocab.json": None,
"tokenizer.json": None,
@ -1380,24 +1290,18 @@ class VocabFactory:
self.files[file] = parent_file_path
print(f"Found vocab files: {self.files}")
def _select_file(self, vocabtype: Optional[str]) -> Path:
def _select_file(self, vocabtype: str | None) -> Path:
if vocabtype in ["spm", "bpe"]:
for file_key in self.files.keys():
if self.files[file_key]:
return self.files[file_key]
if (file := self.files[file_key]) is not None:
return file
raise FileNotFoundError(f"{vocabtype} vocab not found.")
elif vocabtype == "hfft":
if vocabtype == "hfft":
# For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
return self.path
else:
raise ValueError(f"Unsupported vocabulary type {vocabtype}")
raise ValueError(f"Unsupported vocabulary type {vocabtype}")
def _create_special_vocab(
self,
vocab: Vocab,
vocabtype: str,
model_parent_path: Path,
) -> gguf.SpecialVocab:
def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
load_merges = vocabtype == "bpe"
n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
return gguf.SpecialVocab(
@ -1407,13 +1311,12 @@ class VocabFactory:
n_vocab=n_vocab,
)
def load_vocab(
self, vocabtype: str, model_parent_path: Path
) -> Tuple[Vocab, gguf.SpecialVocab]:
def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
path = self._select_file(vocabtype)
print(f"Loading vocab file '{path}', type '{vocabtype}'")
added_tokens_path = path.parent / "added_tokens.json"
vocab: Vocab
if vocabtype == "bpe":
vocab = BpeVocab(
path, added_tokens_path if added_tokens_path.exists() else None
@ -1428,6 +1331,7 @@ class VocabFactory:
)
else:
raise ValueError(f"Unsupported vocabulary type {vocabtype}")
# FIXME: Respect --vocab-dir?
special_vocab = self._create_special_vocab(
vocab,
vocabtype,
@ -1436,18 +1340,17 @@ class VocabFactory:
return vocab, special_vocab
def default_output_file(model_paths: list[Path], file_type: GGMLFileType) -> Path:
def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
namestr = {
GGMLFileType.AllF32: "f32",
GGMLFileType.AllF32: "f32",
GGMLFileType.MostlyF16: "f16",
GGMLFileType.MostlyQ8_0: "q8_0",
GGMLFileType.MostlyQ8_0:"q8_0",
}[file_type]
ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
if ret in model_paths:
sys.stderr.write(
f"Error: Default output path ({ret}) would overwrite the input. "
"Please explicitly specify a path using --outfile.\n"
)
"Please explicitly specify a path using --outfile.\n")
sys.exit(1)
return ret
@ -1457,111 +1360,34 @@ def do_dump_model(model_plus: ModelPlus) -> None:
print(f"model_plus.format = {model_plus.format!r}")
print(f"model_plus.vocab = {model_plus.vocab!r}")
for name, lazy_tensor in model_plus.model.items():
print(
f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}"
)
print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
def get_argument_parser() -> ArgumentParser:
def main(args_in: list[str] | None = None) -> None:
output_choices = ["f32", "f16"]
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
# We currently only support Q8_0 output on little endian systems.
output_choices.append("q8_0")
vocab_types = ["spm", "bpe", "hfft"]
parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine")
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
parser = argparse.ArgumentParser(
description="Convert a LLaMa model to a GGML compatible file"
)
parser.add_argument(
"model",
type=Path,
help="Directory containing the model file or the model file itself (*.pth, *.pt, *.bin)",
)
parser.add_argument(
"--awq-path",
type=Path,
help="Path to the Activation-aware Weight Quantization cache file",
default=None,
)
parser.add_argument(
"--dump",
action="store_true",
help="Display the model content without converting it",
)
parser.add_argument(
"--dump-single",
action="store_true",
help="Display the content of a single model file without conversion",
)
parser.add_argument(
"--vocab-only",
action="store_true",
help="Extract and output only the vocabulary",
)
parser.add_argument(
"--outtype",
choices=output_choices,
help="Output format - note: q8_0 may be very slow (default: f16 or f32 based on input)",
)
parser.add_argument(
"--vocab-dir",
type=Path,
help="Directory containing the tokenizer.model, if separate from the model file",
)
parser.add_argument(
"--vocab-type",
choices=["spm", "bpe", "hfft"], # hfft: Hugging Face Fast Tokenizer
default="spm",
help="The vocabulary format used to define the tokenizer model (default: spm)",
)
parser.add_argument(
"--pad-vocab",
action="store_true",
help="Add padding tokens when the model's vocabulary size exceeds the tokenizer metadata",
)
parser.add_argument(
"--outfile",
type=Path,
help="Specify the path for the output file (default is based on input)",
)
parser.add_argument(
"--ctx", type=int, help="Model training context (default is based on input)"
)
parser.add_argument(
"--concurrency",
type=int,
help=f"Concurrency used for conversion (default: {DEFAULT_CONCURRENCY})",
default=DEFAULT_CONCURRENCY,
)
parser.add_argument(
"--big-endian",
action="store_true",
help="Indicate that the model is executed on a big-endian machine",
)
return parser
def main(argv: Optional[list[str]] = None) -> None:
parser = get_argument_parser()
args = parser.parse_args(argv)
args = parser.parse_args(args_in)
if args.awq_path:
sys.path.insert(1, str(Path(__file__).resolve().parent / "awq-py"))
from awq.apply_awq import add_scale_weights
sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
tmp_model_path = args.model / "weighted_model"
if tmp_model_path.is_dir():
print(f"{tmp_model_path} exists as a weighted model.")
@ -1580,14 +1406,11 @@ def main(argv: Optional[list[str]] = None) -> None:
if not args.vocab_only:
model_plus = load_some_model(args.model)
else:
model_plus = ModelPlus(
model={}, paths=[args.model / "dummy"], format="none", vocab=None
)
model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
if args.dump:
do_dump_model(model_plus)
return
endianess = gguf.GGUFEndian.LITTLE
if args.big_endian:
endianess = gguf.GGUFEndian.BIG
@ -1595,12 +1418,10 @@ def main(argv: Optional[list[str]] = None) -> None:
params = Params.load(model_plus)
if params.n_ctx == -1:
if args.ctx is None:
raise Exception(
"The model doesn't have a context size, and you didn't specify one with --ctx\n"
"Please specify one with --ctx:\n"
" - LLaMA v1: --ctx 2048\n"
" - LLaMA v2: --ctx 4096\n"
)
raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
"Please specify one with --ctx:\n"
" - LLaMA v1: --ctx 2048\n"
" - LLaMA v2: --ctx 4096\n")
params.n_ctx = args.ctx
if args.outtype:
@ -1621,42 +1442,30 @@ def main(argv: Optional[list[str]] = None) -> None:
if not args.outfile:
raise ValueError("need --outfile if using --vocab-only")
outfile = args.outfile
OutputFile.write_vocab_only(
outfile,
params,
vocab,
special_vocab,
endianess=endianess,
pad_vocab=args.pad_vocab,
)
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
endianess=endianess, pad_vocab=args.pad_vocab)
print(f"Wrote {outfile}")
return
if model_plus.vocab is not None and args.vocab_dir is None:
vocab = model_plus.vocab
model = model_plus.model
model = convert_model_names(model, params)
ftype = pick_output_type(model, args.outtype)
model = convert_to_output_type(model, ftype)
outfile = args.outfile or default_output_file(model_plus.paths, ftype)
print(f"Vocab info: {vocab}")
print(f"Special vocab info: {special_vocab}")
model = model_plus.model
model = convert_model_names(model, params)
ftype = pick_output_type(model, args.outtype)
model = convert_to_output_type(model, ftype)
outfile = args.outfile or default_outfile(model_plus.paths, ftype)
params.ftype = ftype
print(f"Writing {outfile}, format {ftype}")
OutputFile.write_all(
outfile,
ftype,
params,
model,
vocab,
special_vocab,
concurrency=args.concurrency,
endianess=endianess,
pad_vocab=args.pad_vocab,
)
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
print(f"Wrote {outfile}")
if __name__ == "__main__":
main(sys.argv[1:]) # Exclude the first element (script name) from sys.argv
if __name__ == '__main__':
main()

View file

@ -1800,6 +1800,8 @@ int main(int argc, char ** argv) {
std::vector<size_t> train_samples_begin;
std::vector<size_t> train_samples_size;
printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str());
printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ? "true" : "false");
tokenize_file(lctx,
params.common.fn_train_data,
params.common.sample_start,

View file

@ -26,6 +26,7 @@ struct StatParams {
std::string ofile = "imatrix.dat";
int n_output_frequency = 10;
int verbosity = 1;
int keep_every = 0;
bool collect_output_weight = false;
};
@ -42,6 +43,9 @@ private:
int m_last_call = 0;
std::vector<float> m_src1_data;
std::vector<int> m_ids; // the expert ids from ggml_mul_mat_id
//
void save_imatrix(const char * file_name) const;
void keep_imatrix(int ncall) const;
};
bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@ -117,6 +121,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (m_last_call % m_params.n_output_frequency == 0) {
save_imatrix();
}
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
keep_imatrix(m_last_call);
}
}
}
} else {
@ -143,6 +150,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (m_last_call % m_params.n_output_frequency == 0) {
save_imatrix();
}
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
keep_imatrix(m_last_call);
}
}
}
@ -150,7 +160,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
}
void IMatrixCollector::save_imatrix() const {
const char * fname = m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str();
save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
}
void IMatrixCollector::keep_imatrix(int ncall) const {
auto file_name = m_params.ofile;
if (file_name.empty()) file_name = "imatrix.dat";
file_name += ".at_";
file_name += std::to_string(ncall);
save_imatrix(file_name.c_str());
}
void IMatrixCollector::save_imatrix(const char * fname) const {
std::ofstream out(fname, std::ios::binary);
int n_entries = m_stats.size();
out.write((const char*)&n_entries, sizeof(n_entries));
@ -248,7 +269,7 @@ static void process_logits(
}
}
static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
@ -269,10 +290,12 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
}
std::vector<float> logit_history;
logit_history.resize(tokens.size());
std::vector<float> prob_history;
prob_history.resize(tokens.size());
if (compute_ppl) {
logit_history.resize(tokens.size());
prob_history.resize(tokens.size());
}
const int n_chunk_max = tokens.size() / n_ctx;
@ -288,12 +311,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
std::vector<float> logits;
if (compute_ppl && num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab);
}
for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx;
const int end = start + n_ctx;
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
std::vector<float> logits;
const auto t_start = std::chrono::high_resolution_clock::now();
@ -321,8 +349,10 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
// restore the original token in case it was set to BOS
tokens[batch_start] = token_org;
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
if (compute_ppl && num_batches > 1) {
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
}
}
const auto t_end = std::chrono::high_resolution_clock::now();
@ -338,25 +368,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
}
const int first = n_ctx/2;
process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += n_ctx - first - 1;
if (compute_ppl) {
const int first = n_ctx/2;
const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += n_ctx - first - 1;
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
fflush(stdout);
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
fflush(stdout);
logits.clear();
}
}
printf("\n");
nll2 /= count;
nll /= count;
const double ppl = exp(nll);
nll2 -= nll * nll;
if (nll2 > 0) {
nll2 = sqrt(nll2/(count-1));
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
} else {
printf("Unexpected negative standard deviation of log(prob)\n");
if (compute_ppl) {
nll2 /= count;
nll /= count;
const double ppl = exp(nll);
nll2 -= nll * nll;
if (nll2 > 0) {
nll2 = sqrt(nll2/(count-1));
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
} else {
printf("Unexpected negative standard deviation of log(prob)\n");
}
}
return true;
@ -365,6 +402,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
int main(int argc, char ** argv) {
StatParams sparams;
bool compute_ppl = true;
std::vector<char*> args;
args.push_back(argv[0]);
int iarg = 1;
@ -381,12 +419,21 @@ int main(int argc, char ** argv) {
}
else if (arg == "--verbosity") {
sparams.verbosity = std::stoi(argv[++iarg]);
} else if (arg == "--no-ppl") {
compute_ppl = false;
} else if (arg == "--keep-imatrix") {
sparams.keep_every = std::stoi(argv[++iarg]);
} else {
args.push_back(argv[iarg]);
}
}
if (iarg < argc) {
args.push_back(argv[iarg]);
std::string arg{argv[iarg]};
if (arg == "--no-ppl") {
compute_ppl = false;
} else {
args.push_back(argv[iarg]);
}
}
gpt_params params;
@ -448,7 +495,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s\n", get_system_info(params).c_str());
}
bool OK = compute_imatrix(ctx, params);
bool OK = compute_imatrix(ctx, params, compute_ppl);
if (!OK) {
return 1;
}

View file

@ -6,7 +6,7 @@
" Similarly, you could add an insert mode keybind with
" inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
"
" g:llama_api_url and g:llama_overrides can be configured in your .vimrc
" g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc
" let g:llama_api_url = "192.168.1.10:8080"
" llama_overrides can also be set through buffer/window scopes. For instance
" autocmd filetype python let b:llama_overrides = {"temp": 0.2}
@ -82,6 +82,9 @@ func llama#doLlamaGen()
endif
let l:querydata.prompt = join(l:buflines, "\n")
let l:curlcommand = copy(s:curlcommand)
if exists("g:llama_api_key")
call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' .. g:llama_api_key])
endif
let l:curlcommand[2] = json_encode(l:querydata)
let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
endfunction

View file

@ -0,0 +1,131 @@
# MobileVLM
Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.
for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM)
The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.
## Usage
Build with cmake or run `make llava-cli` to build it.
After building, run: `./llava-cli` to see the usage. For example:
```sh
./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
--mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
--image path/to/an/image.jpg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
```
## Model conversion
- Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
```sh
git clone https://huggingface.co/mtgv/MobileVLM-1.7B
git clone https://huggingface.co/openai/clip-vit-large-patch14-336
```
2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
```sh
python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
```
3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
```sh
python ./examples/llava/convert-image-encoder-to-gguf \
-m path/to/clip-vit-large-patch14-336 \
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
--output-dir path/to/MobileVLM-1.7B \
--projector-type ldp
```
4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
python ./convert.py path/to/MobileVLM-1.7B
```
5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
```sh
./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
```
Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.
## Android compile and run
### compile
refer to `examples/llava/android/build_64.sh`
```sh
mkdir examples/llava/android/build_64
cd examples/llava/android/build_64
../build_64.sh
```
### run on Android
refer to `android/adb_run.sh`, modify resources' `name` and `path`
## some result on Android with `Snapdragon 888` chip
### case 1
**input**
```sh
/data/local/tmp/llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
--image /data/local/tmp/demo.jpg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
```
**output**
```sh
encode_image_with_clip: image encoded in 21148.71 ms by CLIP ( 146.87 ms per image patch)
Susan Wise Bauer
llama_print_timings: load time = 23574.72 ms
llama_print_timings: sample time = 1.24 ms / 6 runs ( 0.21 ms per token, 4850.44 tokens per second)
llama_print_timings: prompt eval time = 12460.15 ms / 246 tokens ( 50.65 ms per token, 19.74 tokens per second)
llama_print_timings: eval time = 424.86 ms / 6 runs ( 70.81 ms per token, 14.12 tokens per second)
llama_print_timings: total time = 34731.93 ms
```
### case 2
**input**
```sh
/data/local/tmp/llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
--image /data/local/tmp/cat.jpeg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
```
**output**
```sh
encode_image_with_clip: image encoded in 21149.51 ms by CLIP ( 146.87 ms per image patch)
The image depicts a cat sitting in the grass near some tall green plants.
llama_print_timings: load time = 23257.32 ms
llama_print_timings: sample time = 5.25 ms / 18 runs ( 0.29 ms per token, 3430.53 tokens per second)
llama_print_timings: prompt eval time = 11900.73 ms / 232 tokens ( 51.30 ms per token, 19.49 tokens per second)
llama_print_timings: eval time = 1279.03 ms / 18 runs ( 71.06 ms per token, 14.07 tokens per second)
llama_print_timings: total time = 34570.79 ms
```
## Minor shortcomings
The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quickly, we uniformly modified `clip_n_patches` function to a quarter. when counting the time consumption, the calculated time will be 4 times bigger than the real cost.
## TODO
- [ ] Support non-CPU backend for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
- [ ] Optimize LDP projector performance
- Optimize the structure definition to avoid unnecessary memory rearrangements, to reduce the use of `ggml_permute_cpy`;
- Optimize operator implementation (ARM CPU/NVIDIA GPU): such as depthwise conv, hardswish, hardsigmoid, etc.
- [ ] run MobileVLM on `Jetson Orin`
- [ ] Support more model variants, such as `MobileVLM-3B`.
## contributor
```sh
zhangjidong05, yangyang260, huyiming03, chenxiaotao03
```

View file

@ -0,0 +1,53 @@
#!/bin/bash
model_dir="/Users/cxt/model/llm/mobileVLM/MobileVLM-1.7B_processed"
projector_name="mmproj-model-f16.gguf"
llama_name="ggml-model-q4_k.gguf"
img_dir="/Users/cxt/model/llm"
img_name="demo.jpg"
prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
# img_name="cat.jpeg"
# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
program_dir="build_64/bin"
binName="llava-cli"
n_threads=4
deviceDir="/data/local/tmp"
saveDir="output"
if [ ! -d ${saveDir} ]; then
mkdir ${saveDir}
fi
function android_run() {
# # copy resource into device
# adb push ${model_dir}/${projector_name} ${deviceDir}/${projector_name}
# adb push ${model_dir}/${llama_name} ${deviceDir}/${llama_name}
adb push ${img_dir}/${img_name} ${deviceDir}/${img_name}
# copy program into device
adb push ${program_dir}/${binName} ${deviceDir}/${binName}
adb shell "chmod 0777 ${deviceDir}/${binName}"
# run
adb shell "echo cd ${deviceDir} ${deviceDir}/${binName} \
-m ${deviceDir}/${llama_name} \
--mmproj ${deviceDir}/${projector_name} \
-t ${n_threads} \
--image ${deviceDir}/${img_name} \
-p \"${prompt}\" \
> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt"
adb shell "cd ${deviceDir}; pwd; ${deviceDir}/${binName} \
-m ${deviceDir}/${llama_name} \
--mmproj ${deviceDir}/${projector_name} \
-t ${n_threads} \
--image ${deviceDir}/${img_name} \
-p \"${prompt}\" \
>> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt 2>&1"
adb pull ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt ${saveDir}
}
android_run
echo "android_run is Done!"

View file

@ -0,0 +1,8 @@
#!/bin/bash
cmake ../../../../ \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DCMAKE_BUILD_TYPE=Release \
-DANDROID_ABI="arm64-v8a" \
-DANDROID_PLATFORM=android-23 $1
make -j4

View file

@ -2,17 +2,6 @@
// so there might be still unnecessary artifacts hanging around
// I'll gradually clean and extend it
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <regex>
#include <stdexcept>
#include <vector>
#include "clip.h"
#include "ggml.h"
#include "ggml-alloc.h"
@ -29,6 +18,19 @@
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <regex>
#include <stdexcept>
#include <vector>
#include <sstream>
#include <cinttypes>
static std::string format(const char * fmt, ...) {
va_list ap;
va_list ap2;
@ -67,6 +69,7 @@ static std::string format(const char * fmt, ...) {
#define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std"
#define KEY_PROJ_TYPE "clip.projector_type"
//
// tensor name constants
@ -89,6 +92,21 @@ static std::string format(const char * fmt, ...) {
#define TN_TEXT_PROJ "text_projection.weight"
#define TN_VIS_PROJ "visual_projection.weight"
#define TN_LLAVA_PROJ "mm.%d.%s"
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
enum projector_type {
PROJECTOR_TYPE_MLP,
PROJECTOR_TYPE_LDP,
PROJECTOR_TYPE_UNKNOWN,
};
static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_MLP, "mlp" },
{ PROJECTOR_TYPE_LDP, "ldp" },
};
//
// utilities to get data from a gguf file
@ -129,6 +147,91 @@ static std::string get_ftype(int ftype) {
return ggml_type_name(static_cast<ggml_type>(ftype));
}
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
switch (type) {
case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
default: return format("unknown type %d", type);
}
}
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
std::string result;
for (size_t pos = 0; ; pos += search.length()) {
auto new_pos = s.find(search, pos);
if (new_pos == std::string::npos) {
result += s.substr(pos, s.size() - pos);
break;
}
result += s.substr(pos, new_pos - pos) + replace;
pos = new_pos;
}
s = std::move(result);
}
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
switch (type) {
case GGUF_TYPE_STRING:
return gguf_get_val_str(ctx_gguf, i);
case GGUF_TYPE_ARRAY:
{
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
int arr_n = gguf_get_arr_n(ctx_gguf, i);
const void * data = gguf_get_arr_data(ctx_gguf, i);
std::stringstream ss;
ss << "[";
for (int j = 0; j < arr_n; j++) {
if (arr_type == GGUF_TYPE_STRING) {
std::string val = gguf_get_arr_str(ctx_gguf, i, j);
// escape quotes
replace_all(val, "\\", "\\\\");
replace_all(val, "\"", "\\\"");
ss << '"' << val << '"';
} else if (arr_type == GGUF_TYPE_ARRAY) {
ss << "???";
} else {
ss << gguf_data_to_str(arr_type, data, j);
}
if (j < arr_n - 1) {
ss << ", ";
}
}
ss << "]";
return ss.str();
}
default:
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
}
}
static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") {
size_t tensor_size = ggml_nbytes(tensor);
printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
}
static projector_type clip_projector_type_from_string(const std::string & name) {
for (const auto & kv : PROJECTOR_TYPE_NAMES) { // NOLINT
if (kv.second == name) {
return kv.first;
}
}
return PROJECTOR_TYPE_UNKNOWN;
}
//
// image data
//
@ -205,6 +308,32 @@ struct clip_vision_model {
struct ggml_tensor * mm_0_b;
struct ggml_tensor * mm_2_w;
struct ggml_tensor * mm_2_b;
// MobileVLM projection
struct ggml_tensor * mm_model_mlp_1_w;
struct ggml_tensor * mm_model_mlp_1_b;
struct ggml_tensor * mm_model_mlp_3_w;
struct ggml_tensor * mm_model_mlp_3_b;
struct ggml_tensor * mm_model_block_1_block_0_0_w;
struct ggml_tensor * mm_model_block_1_block_0_1_w;
struct ggml_tensor * mm_model_block_1_block_0_1_b;
struct ggml_tensor * mm_model_block_1_block_1_fc1_w;
struct ggml_tensor * mm_model_block_1_block_1_fc1_b;
struct ggml_tensor * mm_model_block_1_block_1_fc2_w;
struct ggml_tensor * mm_model_block_1_block_1_fc2_b;
struct ggml_tensor * mm_model_block_1_block_2_0_w;
struct ggml_tensor * mm_model_block_1_block_2_1_w;
struct ggml_tensor * mm_model_block_1_block_2_1_b;
struct ggml_tensor * mm_model_block_2_block_0_0_w;
struct ggml_tensor * mm_model_block_2_block_0_1_w;
struct ggml_tensor * mm_model_block_2_block_0_1_b;
struct ggml_tensor * mm_model_block_2_block_1_fc1_w;
struct ggml_tensor * mm_model_block_2_block_1_fc1_b;
struct ggml_tensor * mm_model_block_2_block_1_fc2_w;
struct ggml_tensor * mm_model_block_2_block_1_fc2_b;
struct ggml_tensor * mm_model_block_2_block_2_0_w;
struct ggml_tensor * mm_model_block_2_block_2_1_w;
struct ggml_tensor * mm_model_block_2_block_2_1_b;
};
struct clip_ctx {
@ -213,6 +342,7 @@ struct clip_ctx {
bool has_llava_projector = false;
struct clip_vision_model vision_model;
projector_type proj_type = PROJECTOR_TYPE_MLP;
float image_mean[3];
float image_std[3];
@ -430,16 +560,135 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
free(patches_data);
}
// shape [1, 576, 1024]
// ne is whcn, ne = [1024, 576, 1, 1]
embeddings = ggml_get_rows(ctx0, embeddings, patches);
// mm projection 0
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
// print_tensor_info(embeddings, "embeddings");
embeddings = ggml_gelu(ctx0, embeddings);
// llava projector
if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
embeddings = ggml_gelu(ctx0, embeddings);
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
}
else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projector
int n_patch = 24;
struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
mlp_1 = ggml_gelu(ctx0, mlp_1);
struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
// mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
// block 1
struct ggml_tensor * block_1 = nullptr;
{
// transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
// stride = 1, padding = 1, bias is nullptr
block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
// layer norm
// // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
// block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
// hardswish
struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
// pointwise conv
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
block_1 = ggml_relu(ctx0, block_1);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
block_1 = ggml_hardsigmoid(ctx0, block_1);
// block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
int w = block_1->ne[0], h = block_1->ne[1];
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
// block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
// residual
block_1 = ggml_add(ctx0, mlp_3, block_1);
}
// block_2
{
// stride = 2
block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
// layer norm
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
// hardswish
struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
// not sure the parameters is right for globalAvgPooling
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
// pointwise conv
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
block_1 = ggml_relu(ctx0, block_1);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
block_1 = ggml_hardsigmoid(ctx0, block_1);
// block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
int w = block_1->ne[0], h = block_1->ne[1];
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
// block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
}
embeddings = block_1;
}
else {
GGML_ASSERT(false);
}
}
// build the graph
@ -485,16 +734,47 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
printf("\n");
}
const int n_tensors = gguf_get_n_tensors(ctx);
// kv
if (verbosity >= 3) {
const int n_kv = gguf_get_n_kv(ctx);
const int n_kv = gguf_get_n_kv(ctx);
printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
__func__, n_kv, n_tensors, fname);
{
std::map<enum ggml_type, uint32_t> n_type;
for (int i = 0; i < n_kv; ++i) {
const char * key = gguf_get_key(ctx, i);
for (int i = 0; i < n_tensors; i++) {
enum ggml_type type = gguf_get_tensor_type(ctx, i);
printf("%s: kv[%d]: key = %s\n", __func__, i, key);
n_type[type]++;
}
printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
for (int i = 0; i < n_kv; i++) {
const char * name = gguf_get_key(ctx, i);
const enum gguf_type type = gguf_get_kv_type(ctx, i);
const std::string type_name =
type == GGUF_TYPE_ARRAY
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i))
: gguf_type_name(type);
std::string value = gguf_kv_to_str(ctx, i);
const size_t MAX_VALUE_LEN = 40;
if (value.size() > MAX_VALUE_LEN) {
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
}
replace_all(value, "\n", "\\n");
printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
}
// print type counts
for (auto & kv : n_type) {
if (kv.second == 0) {
continue;
}
printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
}
printf("\n");
}
// data
@ -503,12 +783,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i);
const size_t offset = gguf_get_tensor_offset(ctx, i);
enum ggml_type type = gguf_get_tensor_type(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(meta, name);
size_t tensor_size = ggml_nbytes(cur);
buffer_size += tensor_size;
if (verbosity >= 3) {
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu\n", __func__, i,
ggml_n_dims(cur), cur->name, tensor_size, offset);
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
}
}
}
@ -517,6 +798,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
clip_ctx * new_clip = new clip_ctx;
// update projector type
{
int idx = gguf_find_key(ctx, KEY_PROJ_TYPE);
if (idx != -1) {
const std::string proj_type = gguf_get_val_str(ctx, idx);
new_clip->proj_type = clip_projector_type_from_string(proj_type);
}
else {
new_clip->proj_type = PROJECTOR_TYPE_MLP;
}
}
#ifdef GGML_USE_CUBLAS
new_clip->backend = ggml_backend_cuda_init(0);
printf("%s: CLIP using CUDA backend\n", __func__);
@ -661,10 +954,45 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
// LLaVA projection
if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
}
else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projection
vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias"));
vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight"));
vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias"));
vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
}
else {
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
}
vision_model.layers.resize(hparams.n_layer);
for (int il = 0; il < hparams.n_layer; ++il) {
@ -1100,13 +1428,25 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
}
int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->vision_model.mm_2_b->ne[0];
if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
}
else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
return ctx->vision_model.mm_2_b->ne[0];
}
else {
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
}
}
int clip_n_patches(const struct clip_ctx * ctx) {
auto & params = ctx->vision_model.hparams;
return (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
n_patches /= 4;
}
return n_patches;
}
size_t clip_embd_nbytes(const struct clip_ctx * ctx) {

View file

@ -81,6 +81,7 @@ ap.add_argument("--vision-only", action="store_true", required=False,
ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
@ -174,6 +175,8 @@ elif args.vision_only and not has_llava_projector:
fout.add_description("vision-only CLIP model")
elif has_llava_projector:
fout.add_description("image encoder for LLaVA")
# add projector type
fout.add_string("clip.projector_type", args.projector_type)
else:
fout.add_description("two-tower CLIP model")
@ -218,7 +221,8 @@ if has_llava_projector:
projector = torch.load(args.llava_projector)
for name, data in projector.items():
name = get_tensor_name(name)
if data.ndim == 2:
# pw and dw conv ndim==4
if data.ndim == 2 or data.ndim == 4:
data = data.squeeze().numpy().astype(np.float16)
else:
data = data.squeeze().numpy().astype(np.float32)

View file

@ -112,6 +112,43 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to
return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
}
static inline int nearest_int(float fval) {
//assert(fval <= 4194303.f);
float val = fval + 12582912.f;
int i; memcpy(&i, &val, sizeof(int));
return (i & 0x007fffff) - 0x00400000;
}
static double log_softmax(int n_vocab, const float * logits, uint16_t * log_prob, int tok) {
float max_logit = logits[0];
float min_logit = logits[0];
for (int i = 1; i < n_vocab; ++i) {
max_logit = std::max(max_logit, logits[i]);
min_logit = std::min(min_logit, logits[i]);
}
min_logit = std::max(min_logit, max_logit - 16);
double sum_exp = 0.0;
for (int i = 0; i < n_vocab; ++i) {
sum_exp += expf(logits[i] - max_logit);
}
const float log_sum_exp = log(sum_exp);
const float min_log_prob = min_logit - max_logit - log_sum_exp;
const float scale = (max_logit - min_logit)/65535.f;
float * d = (float *)log_prob;
d[0] = scale;
d[1] = min_log_prob;
log_prob += 4;
if (scale) {
const float inv_scale = 1/scale;
for (int i = 0; i < n_vocab; ++i) {
log_prob[i] = logits[i] > min_logit ? nearest_int(inv_scale*(logits[i] - min_logit)) : 0;
}
} else {
std::memset(log_prob, 0, n_vocab*sizeof(uint16_t));
}
return max_logit + log_sum_exp - logits[tok];
}
static void process_logits(
int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
double & nll, double & nll2, float * logit_history, float * prob_history
@ -147,6 +184,130 @@ static void process_logits(
}
}
static void process_logits(std::ostream& out, int n_vocab, const float * logits, const int * tokens, int n_token,
std::vector<std::thread> & workers, std::vector<uint16_t> & log_probs, double & nll, double & nll2) {
std::mutex mutex;
const int nv = 2*((n_vocab + 1)/2) + 4;
int counter = 0;
auto compute = [&mutex, &counter, &log_probs, &nll, &nll2, n_vocab, logits, tokens, n_token, nv] () {
double local_nll = 0;
double local_nll2 = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int i = counter++;
if (i >= n_token) {
nll += local_nll; nll2 += local_nll2;
break;
}
lock.unlock();
const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
local_nll += v;
local_nll2 += v*v;
}
};
for (auto & w : workers) {
w = std::thread(compute);
}
compute();
for (auto & w : workers) {
w.join();
}
out.write((const char *)log_probs.data(), n_token*nv*sizeof(uint16_t));
}
struct kl_divergence_result {
double sum_nll = 0;
double sum_nll2 = 0;
double sum_kld = 0;
double sum_kld2 = 0;
double sum_nll_diff = 0;
double sum_nll_diff2 = 0;
size_t n_same_top = 0;
size_t count = 0;
};
static double log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
float max_logit = logits[0];
int imax = 0;
for (int i = 1; i < n_vocab; ++i) {
if (logits[i] > max_logit) {
max_logit = logits[i];
imax = i;
}
}
double sum_exp = 0.0;
for (int i = 0; i < n_vocab; ++i) {
sum_exp += expf(logits[i] - max_logit);
}
const float log_sum_exp = log(sum_exp);
const float * d = (const float *)base_log_prob;
const float scale = d[0];
const float min_log_prob = d[1];
base_log_prob += 4;
float nll = max_logit + log_sum_exp - logits[tok];
kld.sum_nll += nll;
kld.sum_nll2 += nll*nll;
nll += (scale*base_log_prob[tok] + min_log_prob);
kld.sum_nll_diff += nll;
kld.sum_nll_diff2 += nll*nll;
max_logit += log_sum_exp;
double sum = 0;
int imax_base = -1;
float p_log_base_max = 0;
for (int i = 0; i < n_vocab; ++i) {
const float p_log_base = scale*base_log_prob[i] + min_log_prob;
if (i == 0 || p_log_base > p_log_base_max) {
p_log_base_max = p_log_base;
imax_base = i;
}
if (p_log_base > -16.f) {
const float p_base = expf(p_log_base);
sum += p_base * (p_log_base - logits[i] + max_logit);
}
}
kld.sum_kld += sum;
kld.sum_kld2 += sum*sum;
++kld.count;
if (imax == imax_base) ++kld.n_same_top;
return sum;
}
static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token,
std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld,
float * kld_values) {
std::mutex mutex;
const int nv = 2*((n_vocab + 1)/2) + 4;
int counter = 0;
auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values] () {
kl_divergence_result local_kld;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int i = counter++;
if (i >= n_token) {
kld.sum_nll += local_kld.sum_nll;
kld.sum_nll2 += local_kld.sum_nll2;
kld.sum_kld += local_kld.sum_kld;
kld.sum_kld2 += local_kld.sum_kld2;
kld.sum_nll_diff += local_kld.sum_nll_diff;
kld.sum_nll_diff2 += local_kld.sum_nll_diff2;
kld.n_same_top += local_kld.n_same_top;
kld.count += local_kld.count;
break;
}
lock.unlock();
double v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
kld_values[i] = (float)v;
}
};
for (auto & w : workers) {
w = std::thread(compute);
}
compute();
for (auto & w : workers) {
w.join();
}
}
static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
@ -294,6 +455,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
std::ofstream logits_stream;
if (!params.logits_file.empty()) {
logits_stream.open(params.logits_file.c_str());
if (!logits_stream.is_open()) {
fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
return {};
}
fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
logits_stream.write("_logits_", 8);
logits_stream.write((const char *)&n_ctx, sizeof(n_ctx));
}
auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
@ -336,6 +509,15 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
std::vector<uint16_t> log_probs;
if (!params.logits_file.empty()) {
logits_stream.write((const char *)&n_vocab, sizeof(n_vocab));
logits_stream.write((const char *)&n_chunk, sizeof(n_chunk));
logits_stream.write((const char *)tokens.data(), n_chunk*n_ctx*sizeof(tokens[0]));
const int nv = 2*((n_vocab + 1)/2) + 4;
log_probs.resize(n_ctx * nv);
}
for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx;
const int end = start + n_ctx;
@ -398,8 +580,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// process the entire prompt.
const int first = n_ctx/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
if (!params.logits_file.empty()) {
process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, log_probs, nll, nll2);
} else {
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
}
count += n_ctx - first - 1;
// perplexity is e^(average negative log-likelihood)
@ -540,14 +727,14 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
// This is needed as usual for LLaMA models
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
// The tasks should be randomized so the score stabilizes quickly.
bool randomize_tasks = true;
// Number of tasks to use when computing the score
if (params.hellaswag_tasks < hs_task_count) {
hs_task_count = params.hellaswag_tasks;
}
// The tasks should be randomized so the score stabilizes quickly.
bool randomize_tasks = true;
// The random seed should not impact the final result if the computation is done over enough tasks, so kept hardcoded for now
std::mt19937 rng(1);
@ -1031,6 +1218,566 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
}
static bool deserialize_string(std::istream & in, std::string & str) {
uint32_t size;
if (!in.read((char *)&size, sizeof(size)).fail()) {
str.resize(size);
if (!in.read((char *)&str[0], size).fail()) return true;
}
return false;
}
struct multiple_choice_answers {
std::vector<std::string> answers;
std::vector<int> labels;
bool deserialize(std::istream& in) {
uint32_t n;
in.read((char *)&n, sizeof(n));
if (in.fail() || n > 100) return false; // 100 as max. number of answers should be good enough for any practical purpose
answers.resize(n);
labels.resize(n);
for (auto& a : answers) {
if (!deserialize_string(in, a)) return false;
}
in.read((char *)labels.data(), n*sizeof(int));
return !in.fail();
}
};
struct multiple_choice_task {
std::string question; // the question (or context that needs to be continued)
multiple_choice_answers mc1; // possible answers (continuations) with a single correct answer
multiple_choice_answers mc2; // possible answers (continuations) with multiple correct answers - not handled yet
bool deserialize(std::istream& in) {
if (!deserialize_string(in, question)) return false;
return mc1.deserialize(in) && mc2.deserialize(in);
}
// For evaluation
size_t i_batch; // starting index in the llama_batch
size_t common_prefix; // max number of initial tokens that are the same in all sentences
size_t required_tokens; // needed number of tokens to evaluate all answers
std::vector<std::vector<llama_token>> seq_tokens;
std::vector<float> log_probs;
};
static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) {
if (task.question.empty() || task.mc1.answers.empty()) {
if (log_error) {
printf("%s: found bad task with empty question and/or answers\n", __func__);
}
return false;
}
task.seq_tokens.reserve(task.mc1.answers.size());
for (auto& answer : task.mc1.answers) {
if (answer.empty()) {
if (log_error) {
printf("%s: found empty answer\n", __func__);
}
return false;
}
task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, add_bos));
}
auto min_len = task.seq_tokens.front().size();
for (auto& seq : task.seq_tokens) {
min_len = std::min(min_len, seq.size());
}
task.common_prefix = 0;
for (size_t k = 0; k < min_len; ++k) {
auto token = task.seq_tokens[0][k];
bool all_same = true;
for (size_t i = 1; i < task.seq_tokens.size(); ++i) {
if (task.seq_tokens[i][k] != token) {
all_same = false;
break;
}
}
if (!all_same) {
break;
}
++task.common_prefix;
}
task.required_tokens = task.common_prefix;
for (auto& seq : task.seq_tokens) {
task.required_tokens += seq.size() - task.common_prefix;
}
return true;
}
//
// Calculates score for multiple choice tasks with single correct answer from prompt.
// Commonly used LLM evaluation metrics of this type are
// * ARC
// * HellaSwag
// * MMLU
// * TruthfulQA
//
// Validation datasets for these 4 tests can be found at
// https://huggingface.co/datasets/ikawrakow/validation-datasets-for-llama.cpp
// The data for these datasets was extracted from
// git@hf.co:datasets/allenai/ai2_arc
// https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
// git@hf.co:datasets/Stevross/mmlu
// https://huggingface.co/datasets/truthful_qa
//
static void multiple_choice_score(llama_context * ctx, const gpt_params & params) {
std::istringstream strstream(params.prompt);
uint32_t n_task;
strstream.read((char *)&n_task, sizeof(n_task));
if (strstream.fail() || n_task == 0) {
printf("%s: no tasks\n", __func__);
return;
}
printf("%s: there are %u tasks in prompt\n", __func__, n_task);
std::vector<uint32_t> task_pos(n_task);
strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
if (strstream.fail()) {
printf("%s: failed to raad task positions from prompt\n", __func__);
return;
}
std::vector<multiple_choice_task> tasks;
if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
// Use all tasks
tasks.resize(n_task);
printf("%s: reading tasks", __func__);
int n_dot = n_task/100;
int i = 0;
for (auto& task : tasks) {
++i;
if (!task.deserialize(strstream)) {
printf("%s: failed to read task %d of %u\n", __func__, i, n_task);
return;
}
if (i%n_dot == 0) printf(".");
}
printf("done\n");
}
else {
printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
std::mt19937 rng(1);
std::vector<int> aux(n_task);
for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
float scale = 1.f/(1.f + (float)std::mt19937::max());
tasks.resize(params.multiple_choice_tasks);
for (auto& task : tasks) {
int j = (int)(scale * rng() * aux.size());
int idx = aux[j];
aux[j] = aux.back();
aux.pop_back();
strstream.seekg(task_pos[idx], std::ios::beg);
if (!task.deserialize(strstream)) {
printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
return;
}
}
n_task = params.multiple_choice_tasks;
}
// This is needed as usual for LLaMA models
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
printf("%s: preparing task data", __func__);
fflush(stdout);
if (n_task > 500) {
printf("...");
fflush(stdout);
std::atomic<int> counter(0);
std::atomic<int> n_bad(0);
auto prepare = [&counter, &n_bad, &tasks, ctx, add_bos] () {
int num_tasks = tasks.size();
int n_bad_local = 0;
while (true) {
int first = counter.fetch_add(K_TOKEN_CHUNK);
if (first >= num_tasks) {
if (n_bad_local > 0) n_bad += n_bad_local;
break;
}
int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
for (int i = first; i < last; ++i) {
if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local;
}
}
};
size_t max_thread = std::thread::hardware_concurrency();
max_thread = std::min(max_thread, (tasks.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK);
std::vector<std::thread> workers(max_thread-1);
for (auto& w : workers) w = std::thread(prepare);
prepare();
for (auto& w : workers) w.join();
printf("done\n");
fflush(stdout);
int nbad = n_bad;
if (nbad > 0) {
printf("%s: found %d malformed tasks\n", __func__, nbad);
return;
}
} else {
int n_dot = n_task/100;
int i_task = 0;
for (auto& task : tasks) {
++i_task;
if (!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) {
return;
}
if (i_task%n_dot == 0) {
printf(".");
fflush(stdout);
}
}
printf("done\n");
}
printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
printf("\ntask\tacc_norm\n");
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
const int n_batch = params.n_batch;
const int max_tasks_per_batch = 32;
const int max_seq = 4*max_tasks_per_batch;
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
std::vector<float> tok_logits(n_vocab);
std::vector<float> batch_logits(n_vocab*n_ctx);
std::vector<std::pair<size_t, llama_token>> eval_pairs;
std::vector<float> eval_results;
std::vector<std::thread> workers(std::thread::hardware_concurrency());
std::vector<int> batch_indeces;
int n_done = 0;
int n_correct = 0;
int n_tot_answers = 0;
for (size_t i0 = 0; i0 < tasks.size(); i0++) {
int n_cur = 0;
size_t i1 = i0;
size_t i_batch = 0; // this tells us where in `llama_batch` we are currently
llama_batch_clear(batch);
// batch as much tasks as possible into the available context
// each task has 4 unique seuqnce ids - one for each ending
// the common prefix is shared among the 4 sequences to save tokens
// we extract logits only from the last common token and from all ending tokens of each sequence
int s0 = 0;
while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) {
auto& cur_task = tasks[i1];
int num_answers = cur_task.seq_tokens.size();
if (s0 + num_answers > max_seq) {
break;
}
if (int(batch_indeces.size()) != num_answers) {
batch_indeces.resize(num_answers);
}
for (int s = 0; s < num_answers; ++s) batch_indeces[s] = s0 + s;
for (size_t i = 0; i < cur_task.common_prefix; ++i) {
//llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
}
batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
for (size_t i = cur_task.common_prefix; i < cur_task.seq_tokens[s].size(); ++i) {
llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, true);
}
}
s0 += num_answers;
cur_task.i_batch = i_batch;
i_batch += cur_task.required_tokens;
n_cur += cur_task.required_tokens;
if (++i1 == tasks.size()) {
break;
}
}
if (i0 == i1) {
fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
return;
}
llama_kv_cache_clear(ctx);
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
fprintf(stderr, "%s: llama_decode() failed\n", __func__);
return;
}
// Compute log-probs in parallel
// First we collect all tasks
eval_pairs.clear();
for (size_t i = i0; i < i1; ++i) {
auto& cur_task = tasks[i];
size_t li = cur_task.common_prefix;
for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
eval_pairs.push_back(std::make_pair(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]));
}
++li;
}
}
// Then we do the actual calculation
compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
size_t ir = 0;
// compute the logprobs for each ending of the decoded tasks
for (size_t i = i0; i < i1; ++i) {
auto & cur_task = tasks[i];
//printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
//for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
// if (cur_task.mc1.labels[j] == 1) {
// printf("%d", j+1);
// }
//}
//printf("\n common_prefix: %zu\n", cur_task.common_prefix);
std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(cur_task.i_batch + cur_task.common_prefix - 1), n_vocab*sizeof(float));
const auto first_probs = softmax(tok_logits);
cur_task.log_probs.resize(cur_task.seq_tokens.size());
for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
size_t count = 1;
float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
//printf(" %zu %g\n", ir, eval_results[ir]);
++count;
log_prob += eval_results[ir++];
}
cur_task.log_probs[s] = log_prob / count;
//printf(" Final: %g\n", log_prob / count);
//printf(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
}
// Find the ending with maximum logprob
size_t logprob_max_idx = 0;
float logprob_max_val = cur_task.log_probs[0];
for (size_t s = 1; s < cur_task.log_probs.size(); s++) {
if (cur_task.log_probs[s] > logprob_max_val) {
logprob_max_val = cur_task.log_probs[s];
logprob_max_idx = s;
}
}
n_tot_answers += cur_task.log_probs.size();
if (cur_task.mc1.labels[logprob_max_idx] == 1) {
++n_correct;
}
++n_done;
// Print the accumulated accuracy mean x 100
printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
fflush(stdout);
}
i0 = i1 - 1;
}
llama_batch_free(batch);
if (n_done < 100) return;
float p = 1.f*n_correct/n_done;
float sigma = sqrt(p*(1-p)/(n_done-1));
printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
p = 1.f*n_done/n_tot_answers;
sigma = sqrt(p*(1-p)/(n_done-1));
printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
printf("\n");
}
static void kl_divergence(llama_context * ctx, const gpt_params & params) {
if (params.logits_file.empty()) {
fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
return;
}
std::ifstream in(params.logits_file.c_str(), std::ios::binary);
if (!in) {
fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str());
return;
}
{
char check[9]; check[8] = 0;
in.read(check, 8);
if (in.fail() || strncmp("_logits_", check, 8) != 0) {
fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
return;
}
}
uint32_t n_ctx;
in.read((char *)&n_ctx, sizeof(n_ctx));
if (n_ctx > llama_n_ctx(ctx)) {
fprintf(stderr, "%s: %s has been computed with %d, while the current context is %d. Increase it with -c and retry\n",
__func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
}
int n_vocab, n_chunk;
in.read((char *)&n_vocab, sizeof(n_vocab));
in.read((char *)&n_chunk, sizeof(n_chunk));
if (in.fail()) {
fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
return;
}
if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
}
std::vector<llama_token> tokens(n_ctx * n_chunk);
if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
return;
}
const int n_batch = params.n_batch;
const int num_batches = (n_ctx + n_batch - 1)/n_batch;
const int nv = 2*((n_vocab + 1)/2) + 4;
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
std::vector<float> logits;
if (num_batches > 1) {
logits.reserve(n_ctx * n_vocab);
}
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
auto mean_and_uncertainty = [] (double sum, double sum2, size_t count) {
if (count < 1) {
return std::make_pair(0., 0.);
}
double f = sum/count;
double df = sum2/count - f*f;
df = df > 0 && count > 10 ? sqrt(df/(count-1)) : 0.;
return std::make_pair(f, df);
};
kl_divergence_result kld;
auto kld_ptr = kld_values.data();
for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx;
const int end = start + n_ctx;
const auto t_start = std::chrono::high_resolution_clock::now();
if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i);
return;
}
// clear the KV cache
llama_kv_cache_clear(ctx);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
// save original token and restore it after eval
const auto token_org = tokens[batch_start];
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
}
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return;
}
// restore the original token in case it was set to BOS
tokens[batch_start] = token_org;
if (num_batches > 1) {
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
}
}
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL-Divergence Same top\n");
}
const int first = n_ctx/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, log_probs_uint16, kld, kld_ptr);
kld_ptr += n_ctx - 1 - first;
auto ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count);
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
auto p_top = 1.*kld.n_same_top/kld.count;
auto d_p_top = sqrt(p_top*(1 - p_top)/(kld.count - 1));
printf("%4d %10.4lf %10.5lf ± %10.5f %10.5f ± %10.5lf %.5f ± %.5f\n", i+1, exp(ppl.first),
log_ppl_ratio.first, log_ppl_ratio.second, kl_div.first, kl_div.second,
p_top, d_p_top);
fflush(stdout);
logits.clear();
}
printf("\n");
if (kld.count < 100) return; // we do not wish to do statistics on so few values
std::sort(kld_values.begin(), kld_values.end());
printf("===== KL-divergence statistics\n");
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
printf("Average: %10.6f ±%10.6lf\n", kl_div.first, kl_div.second);
auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
: kld_values[kld_values.size()/2];
printf("Median : %10.6f\n", kld_median);
auto percentile = [&kld_values] (float fraction) {
if (fraction <= 0) return kld_values.front();
if (fraction >= 1) return kld_values.back();
float p = fraction*(kld_values.size() - 1);
size_t ip = size_t(p); p -= ip;
return (1 - p)*kld_values[ip] + p*kld_values[std::min(ip+1, kld_values.size()-1)];
};
printf("Maximum: %10.6f\n", kld_values.back());
printf("KLD_99 : %10.6f\n", percentile(0.99f));
printf("KLD_95 : %10.6f\n", percentile(0.95f));
printf("KLD_90 : %10.6f\n", percentile(0.90f));
printf("Minimum: %10.6f\n", kld_values.front());
printf("KLD_01 : %10.6f\n", percentile(0.01f));
printf("KLD_05 : %10.6f\n", percentile(0.05f));
printf("KLD_10 : %10.6f\n", percentile(0.10f));
}
int main(int argc, char ** argv) {
gpt_params params;
@ -1091,6 +1838,10 @@ int main(int argc, char ** argv) {
hellaswag_score(ctx, params);
} else if (params.winogrande) {
winogrande_score(ctx, params);
} else if (params.multiple_choice) {
multiple_choice_score(ctx, params);
} else if (params.kl_divergence) {
kl_divergence(ctx, params);
} else {
results = perplexity(ctx, params);
}

View file

@ -26,6 +26,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
{ "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization" , },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },

6
flake.lock generated
View file

@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1705133751,
"narHash": "sha256-rCIsyE80jgiOU78gCWN3A0wE0tR2GI5nH6MlS+HaaSQ=",
"lastModified": 1705677747,
"narHash": "sha256-eyM3okYtMgYDgmYukoUzrmuoY4xl4FUujnsv/P6I/zI=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "9b19f5e77dd906cb52dade0b7bd280339d2a1f3d",
"rev": "bbe7d8f876fbbe7c959c90ba2ae2852220573261",
"type": "github"
},
"original": {

View file

@ -1,3 +1,17 @@
# The flake interface to llama.cpp's Nix expressions. The flake is used as a
# more discoverable entry-point, as well as a way to pin the dependencies and
# expose default outputs, including the outputs built by the CI.
# For more serious applications involving some kind of customization you may
# want to consider consuming the overlay, or instantiating `llamaPackages`
# directly:
#
# ```nix
# pkgs.callPackage ${llama-cpp-root}/.devops/nix/scope.nix { }`
# ```
# Cf. https://jade.fyi/blog/flakes-arent-real/ for a more detailed exposition
# of the relation between Nix and the Nix Flakes.
{
description = "Port of Facebook's LLaMA model in C/C++";

View file

@ -109,8 +109,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
if (block->size >= size) {
best_fit_block = alloc->n_free_blocks - 1;
} else {
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
__func__, size, max_avail);
fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
__func__, tensor->name, size, max_avail);
GGML_ASSERT(!"not enough space in the buffer");
return;
}

View file

@ -13,6 +13,10 @@
#include <map>
#include <array>
// stringize macro for converting __CUDA_ARCH_LIST__ (list of integers) to string
#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
@ -584,13 +588,28 @@ static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0,
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
[[noreturn]]
static __device__ void bad_arch() {
printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
static __device__ void no_device_code(
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
file_name, line, function_name, arch);
(void) arch_list;
#else
printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
file_name, line, function_name, arch, arch_list);
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
__trap();
(void) bad_arch; // suppress unused function warning
(void) no_device_code; // suppress unused function warning
}
#ifdef __CUDA_ARCH__
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else
#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
@ -617,7 +636,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
return a;
#else
(void) a;
bad_arch();
NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
}
@ -638,7 +657,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
return x;
#else
(void) x;
bad_arch();
NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
}
@ -2421,7 +2440,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
}
#else
(void) vx; (void) y; (void) k;
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_PASCAL
}
@ -2452,7 +2471,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
// second part effectively subtracts 8 from each quant value
return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
#else
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@ -2489,7 +2508,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
// scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
#else
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@ -2524,7 +2543,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
// second part effectively subtracts 16 from each quant value
return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
#else
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@ -2569,7 +2588,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
#else
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@ -2590,7 +2609,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
return d8_0*d8_1 * sumi;
#else
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@ -2620,7 +2639,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
// scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
#else
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@ -2655,7 +2674,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
return dm2f.x*sumf_d - dm2f.y*sumf_m;
#else
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@ -2692,7 +2711,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
#else
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@ -2732,7 +2751,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
return d3 * sumf;
#else
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@ -2757,7 +2776,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
return d3*d8 * sumi;
#else
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@ -2790,7 +2809,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
return dm4f.x*sumf_d - dm4f.y*sumf_m;
#else
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@ -2823,7 +2842,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
return dm4f.x*sumf_d - dm4f.y*sumf_m;
#else
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@ -2863,7 +2882,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
return dm5f.x*sumf_d - dm5f.y*sumf_m;
#else
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@ -2896,7 +2915,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
return dm4f.x*sumf_d - dm4f.y*sumf_m;
#else
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@ -2926,7 +2945,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
return d*sumf;
#else
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@ -2957,7 +2976,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
return d6 * sumf_d;
#else
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@ -3823,7 +3842,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
return dall * sumf_d - dmin * sumf_m;
#else
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif
@ -4006,7 +4025,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
return d * sumf_d;
#else
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif
@ -4501,7 +4520,7 @@ template <bool need_check> static __global__ void
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q4_0_q8_1_mul_mat;
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
@ -4570,7 +4589,7 @@ template <bool need_check> static __global__ void
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q4_1_q8_1_mul_mat;
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
@ -4637,7 +4656,7 @@ template <bool need_check> static __global__ void
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q5_0_q8_1_mul_mat;
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
@ -4704,7 +4723,7 @@ mul_mat_q5_1(
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q5_1_q8_1_mul_mat;
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
@ -4771,7 +4790,7 @@ template <bool need_check> static __global__ void
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q8_0_q8_1_mul_mat;
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
@ -4838,7 +4857,7 @@ mul_mat_q2_K(
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q2_K_q8_1_mul_mat;
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
@ -4907,7 +4926,7 @@ template <bool need_check> static __global__ void
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q3_K_q8_1_mul_mat;
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
@ -4976,7 +4995,7 @@ template <bool need_check> static __global__ void
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q4_K_q8_1_mul_mat;
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
@ -5043,7 +5062,7 @@ mul_mat_q5_K(
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q5_K_q8_1_mul_mat;
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
@ -5112,7 +5131,7 @@ template <bool need_check> static __global__ void
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q6_K_q8_1_mul_mat;
bad_arch();
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
@ -5835,7 +5854,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
}
#else
(void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
bad_arch();
NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
}

View file

@ -668,7 +668,8 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
return true;
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
return ctx->support_simdgroup_reduction;
return ctx->support_simdgroup_reduction &&
(op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F32);
case GGML_OP_CPY:
case GGML_OP_DUP:
case GGML_OP_CONT:

340
ggml.c
View file

@ -1418,6 +1418,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
// TODO: optimize performance
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
static const float GELU_COEF_A = 0.044715f;
static const float GELU_QUICK_COEF = -1.702f;
@ -1776,9 +1779,11 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
"GELU",
"GELU_QUICK",
"SILU",
"HARDSWISH",
"HARDSIGMOID",
};
static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10");
static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@ -3946,6 +3951,20 @@ struct ggml_tensor * ggml_silu_back(
return result;
}
// ggml hardswish
struct ggml_tensor * ggml_hardswish(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH);
}
// ggml hardsigmoid
struct ggml_tensor * ggml_hardsigmoid(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID);
}
// ggml_norm
static struct ggml_tensor * ggml_norm_impl(
@ -5345,6 +5364,31 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
return result;
}
// ggml_conv_depthwise
struct ggml_tensor * ggml_conv_depthwise_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1) {
struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW]
struct ggml_tensor * result =
ggml_mul_mat(ctx,
ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1), // [OC1, KH, KW] => [1, OC, 1, KH * KW]
ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
return result;
}
// ggml_conv_2d
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
@ -7765,6 +7809,9 @@ static void ggml_compute_forward_acc_f32(
bool inplace = (bool) ((int32_t *) dst->op_params)[4];
if (!inplace && (params->type == GGML_TASK_INIT)) {
if (params->ith != 0) {
return;
}
// memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase
memcpy(
@ -9334,6 +9381,87 @@ static void ggml_compute_forward_silu_back(
}
}
static void ggml_compute_forward_hardswish_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_hardswish_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_hardswish(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_hardswish_f32(params, src0, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
static void ggml_compute_forward_hardsigmoid_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_hardsigmoid_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_hardsigmoid(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_hardsigmoid_f32(params, src0, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_norm
static void ggml_compute_forward_norm_f32(
@ -9826,11 +9954,30 @@ static void ggml_compute_forward_mul_mat(
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
if (params->ith != 0) {
return;
}
const int64_t ne_plane = ne01*ne00;
const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
UNUSED(desired_wsize);
if (params->type == GGML_TASK_INIT) {
if (type != GGML_TYPE_F32) {
assert(params->wsize >= desired_wsize);
// parallelize by src0 rows
for (int64_t i13 = 0; i13 < ne13; i13++) {
for (int64_t i12 = 0; i12 < ne12; i12++) {
// broadcast src0 into src1 across 2nd,3rd dimension
const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2;
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
ggml_to_float_t const to_float = type_traits[type].to_float;
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
}
}
}
}
return;
}
@ -9838,9 +9985,14 @@ static void ggml_compute_forward_mul_mat(
return;
}
// perform sgemm, parallelization controlled by blas lib
if (ith != 0) {
return;
}
//const int64_t tgemm0 = ggml_perf_time_us();
for (int64_t i13 = 0; i13 < ne13; i13++) {
for (int64_t i12 = 0; i12 < ne12; i12++) {
// broadcast src0 into src1 across 2nd,3rd dimension
const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2;
@ -9849,17 +10001,7 @@ static void ggml_compute_forward_mul_mat(
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
if (type != GGML_TYPE_F32) {
float * const wdata = params->wdata;
ggml_to_float_t const to_float = type_traits[type].to_float;
size_t id = 0;
for (int64_t i01 = 0; i01 < ne01; ++i01) {
to_float((const char *) x + i01*nb01, wdata + id, ne00);
id += ne00;
}
assert(id*sizeof(float) <= params->wsize);
x = wdata;
x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
}
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@ -9869,6 +10011,7 @@ static void ggml_compute_forward_mul_mat(
0.0f, d, ne01);
}
}
//printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
@ -9877,6 +10020,9 @@ static void ggml_compute_forward_mul_mat(
#endif
if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
if (src1->type != vec_dot_type) {
char * wdata = params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@ -10041,6 +10187,9 @@ static void ggml_compute_forward_mul_mat_id(
#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
char * wdata = params->wdata;
if (src1->type != vec_dot_type) {
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@ -10226,6 +10375,9 @@ static void ggml_compute_forward_out_prod_f32(
return;
}
#endif
if (ith != 0) {
return;
}
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
return;
}
@ -10409,6 +10561,9 @@ static void ggml_compute_forward_out_prod_q_f32(
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
return;
}
@ -10593,6 +10748,9 @@ static void ggml_compute_forward_set_f32(
bool inplace = (bool) ((int32_t *) dst->op_params)[4];
if (!inplace && (params->type == GGML_TASK_INIT)) {
if (params->ith != 0) {
return;
}
// memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase
memcpy(
@ -10935,6 +11093,9 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
// ggml_compute_forward_dup_same_cont(params, opt0, dst);
if (params->type == GGML_TASK_INIT) {
if (params->ith != 0) {
return;
}
memset(dst->data, 0, ggml_nbytes(dst));
}
@ -10969,6 +11130,9 @@ static void ggml_compute_forward_get_rows_back_f32(
// ggml_compute_forward_dup_same_cont(params, opt0, dst);
if (params->type == GGML_TASK_INIT) {
if (params->ith != 0) {
return;
}
memset(dst->data, 0, ggml_nbytes(dst));
}
@ -11106,6 +11270,9 @@ static void ggml_compute_forward_diag_mask_f32(
GGML_ASSERT(n_past >= 0);
if (!inplace && (params->type == GGML_TASK_INIT)) {
if (ith != 0) {
return;
}
// memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
@ -12076,6 +12243,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
memset(params->wdata, 0, params->wsize);
// permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@ -12170,6 +12340,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
memset(params->wdata, 0, params->wsize);
// prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@ -12368,6 +12541,7 @@ static void ggml_compute_forward_im2col(
}
}
// ggml_compute_forward_conv_transpose_2d
static void ggml_compute_forward_conv_transpose_2d(
@ -12393,6 +12567,9 @@ static void ggml_compute_forward_conv_transpose_2d(
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
memset(params->wdata, 0, params->wsize);
// permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
@ -13936,6 +14113,14 @@ static void ggml_compute_forward_unary(
{
ggml_compute_forward_silu(params, src0, dst);
} break;
case GGML_UNARY_OP_HARDSWISH:
{
ggml_compute_forward_hardswish(params, src0, dst);
} break;
case GGML_UNARY_OP_HARDSIGMOID:
{
ggml_compute_forward_hardsigmoid(params, src0, dst);
} break;
default:
{
GGML_ASSERT(false);
@ -13999,6 +14184,9 @@ static void ggml_compute_forward_add_rel_pos_f32(
const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
if (!inplace && params->type == GGML_TASK_INIT) {
if (params->ith != 0) {
return;
}
memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
return;
}
@ -16292,8 +16480,9 @@ struct ggml_compute_state_shared {
const int n_threads;
// synchronization primitives
atomic_int n_active; // num active threads
atomic_int node_n; // active graph node
atomic_int n_active; // num active threads
atomic_int node_n; // active graph node
atomic_int node_task; // active graph node task phase
bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
void * abort_callback_data;
@ -16349,6 +16538,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_ELU:
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
{
n_tasks = 1;
} break;
@ -16542,6 +16733,34 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
return n_tasks;
}
static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
// wait for other threads to finish
const int last_node_n = * node_n;
while (true) {
if (do_yield) {
sched_yield();
}
* node_n = atomic_load(&state->shared->node_n);
if (* node_n != last_node_n) break;
}
}
static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
// wait for other threads to finish
const int last_task_phase = * task_phase;
while (true) {
if (do_yield) {
sched_yield();
}
* task_phase = atomic_load(&state->shared->node_task);
if (* task_phase != last_task_phase) break;
}
}
static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
@ -16552,7 +16771,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
set_numa_thread_affinity(state->ith, n_threads);
int node_n = -1;
int node_n = -1;
int task_phase = GGML_TASK_FINALIZE;
while (true) {
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
@ -16584,7 +16804,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
// distribute new work or execute it direct if 1T
while (++node_n < cgraph->n_nodes) {
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
struct ggml_tensor * node = cgraph->nodes[node_n];
const int n_tasks = ggml_get_n_tasks(node, n_threads);
@ -16593,13 +16812,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
params.nth = n_tasks;
/* INIT */
if (GGML_OP_HAS_INIT[node->op]) {
params.type = GGML_TASK_INIT;
ggml_compute_forward(&params, node);
}
if (n_tasks == 1) {
/* INIT */
if (GGML_OP_HAS_INIT[node->op]) {
params.type = GGML_TASK_INIT;
ggml_compute_forward(&params, node);
}
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
// they do something more efficient than spinning (?)
params.type = GGML_TASK_COMPUTE;
@ -16620,38 +16839,24 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
}
}
atomic_store(&state->shared->n_active, n_threads);
atomic_store(&state->shared->node_n, node_n);
task_phase = GGML_TASK_INIT;
atomic_store(&state->shared->n_active, n_threads);
atomic_store(&state->shared->node_n, node_n);
atomic_store(&state->shared->node_task, task_phase);
} else {
// wait for other threads to finish
const int last = node_n;
const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
while (true) {
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
// depending on the workload and the operating system.
// since it is not clear what is the best approach, it should potentially become user-configurable
// ref: https://github.com/ggerganov/ggml/issues/291
// UPD: adding the do_yield flag seems to resolve the issue universally
if (do_yield) {
sched_yield();
}
node_n = atomic_load(&state->shared->node_n);
if (node_n != last) break;
};
ggml_graph_compute_thread_sync_node(&node_n, state, false);
ggml_graph_compute_thread_sync_task(&task_phase, state, false);
}
// check if we should stop
if (node_n >= cgraph->n_nodes) break;
/* COMPUTE */
/* INIT & COMPUTE */
struct ggml_tensor * node = cgraph->nodes[node_n];
const int n_tasks = ggml_get_n_tasks(node, n_threads);
struct ggml_compute_params params = {
/*.type =*/ GGML_TASK_COMPUTE,
/*.type =*/ GGML_TASK_INIT,
/*.ith =*/ state->ith,
/*.nth =*/ n_tasks,
/*.wsize =*/ cplan->work_size,
@ -16659,8 +16864,39 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
};
if (state->ith < n_tasks) {
if (GGML_OP_HAS_INIT[node->op]) {
ggml_compute_forward(&params, node);
}
}
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
task_phase = GGML_TASK_COMPUTE;
atomic_store(&state->shared->n_active, n_threads);
atomic_store(&state->shared->node_task, task_phase);
}
else {
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
// depending on the workload and the operating system.
// since it is not clear what is the best approach, it should potentially become user-configurable
// ref: https://github.com/ggerganov/ggml/issues/291
// UPD: adding the do_yield flag seems to resolve the issue universally
const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
}
if (state->ith < n_tasks) {
params.type = GGML_TASK_COMPUTE;
ggml_compute_forward(&params, node);
}
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
task_phase = GGML_TASK_FINALIZE;
atomic_store(&state->shared->n_active, n_threads);
atomic_store(&state->shared->node_task, task_phase);
}
else {
ggml_graph_compute_thread_sync_task(&task_phase, state, false);
}
}
return GGML_EXIT_SUCCESS;
@ -16717,8 +16953,11 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node)) {
if (node->src[0]->type != GGML_TYPE_F32) {
// here we need memory just for single 2D matrix from src0
cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
// here we need memory for fully dequantized matrix from src0
// take into account that src0 can be broadcasted into src1[2,3]
cur = ggml_type_size(GGML_TYPE_F32)
* node->src[0]->ne[0]*node->src[0]->ne[1]
* node->src[1]->ne[2]*node->src[1]->ne[3];
}
} else
#endif
@ -16872,6 +17111,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
/*.n_threads =*/ n_threads,
/*.n_active =*/ n_threads,
/*.node_n =*/ -1,
/*.node_task =*/ GGML_TASK_FINALIZE,
/*.abort_callback =*/ NULL,
/*.abort_callback_data =*/ NULL,
};

23
ggml.h
View file

@ -489,6 +489,8 @@ extern "C" {
GGML_UNARY_OP_GELU,
GGML_UNARY_OP_GELU_QUICK,
GGML_UNARY_OP_SILU,
GGML_UNARY_OP_HARDSWISH,
GGML_UNARY_OP_HARDSIGMOID,
GGML_UNARY_OP_COUNT,
};
@ -1040,6 +1042,16 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
// hardswish(x) = x * relu6(x + 3) / 6
GGML_API struct ggml_tensor * ggml_hardswish(
struct ggml_context * ctx,
struct ggml_tensor * a);
// hardsigmoid(x) = relu6(x + 3) / 6
GGML_API struct ggml_tensor * ggml_hardsigmoid(
struct ggml_context * ctx,
struct ggml_tensor * a);
// normalize along rows
GGML_API struct ggml_tensor * ggml_norm(
struct ggml_context * ctx,
@ -1491,6 +1503,17 @@ extern "C" {
int d1,
bool is_2D);
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1);
GGML_API struct ggml_tensor * ggml_conv_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,

457
llama.cpp
View file

@ -1325,8 +1325,10 @@ static llama_state g_state;
// available llama models
enum e_model {
MODEL_UNKNOWN,
MODEL_0_5B,
MODEL_1B,
MODEL_3B,
MODEL_4B,
MODEL_7B,
MODEL_8B,
MODEL_13B,
@ -1680,7 +1682,8 @@ struct llama_context {
}
ggml_backend_buffer_free(buf_logits);
ggml_backend_buffer_free(buf_input);
ggml_free(ctx_input);
}
llama_cparams cparams;
@ -1736,8 +1739,14 @@ struct llama_context {
size_t n_compute_bufs = 0;
size_t i_compute_buf = 0;
// temporary buffer for copying data to/from the backend
std::vector<no_init<uint8_t>> buf_copy;
// input tensors
ggml_backend_buffer_t buf_input = nullptr;
ggml_context * ctx_input = nullptr;
struct ggml_tensor * inp_tokens; // I32 [n_batch]
struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
struct ggml_tensor * inp_pos; // I32 [n_batch]
struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
#ifdef GGML_USE_MPI
ggml_mpi_context * ctx_mpi = NULL;
@ -2321,18 +2330,18 @@ struct llama_model_loader {
}
switch (type_max) {
case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break;
case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break;
case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break;
case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break;
case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break;
case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break;
case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
default:
@ -2682,6 +2691,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
default: return "unknown, may not work";
}
@ -2897,6 +2907,7 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
case 24: model.type = e_model::MODEL_1B; break;
case 32: model.type = e_model::MODEL_3B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
@ -2915,9 +2926,9 @@ static void llm_load_hparams(
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 24: model.type = e_model::MODEL_1B; break;
case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
case 32: model.type = e_model::MODEL_7B; break;
case 40: model.type = e_model::MODEL_13B; break;
case 40: model.type = hparams.n_head == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
case 80: model.type = e_model::MODEL_70B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
@ -3721,6 +3732,11 @@ static bool llm_load_tensors(
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
// optional bias tensors, present in Stable LM 2 1.6B
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
@ -4104,22 +4120,24 @@ static struct ggml_tensor * llm_build_inp_embd(
const llama_hparams & hparams,
const llama_batch & batch,
struct ggml_tensor * tok_embd,
struct ggml_tensor * inp_tokens,
struct ggml_tensor * inp_embd,
const llm_build_cb & cb) {
const int64_t n_embd = hparams.n_embd;
struct ggml_tensor * inpL;
if (batch.token) {
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
struct ggml_tensor * inp_tokens_v = ggml_view_1d(ctx, inp_tokens, batch.n_tokens, 0);
cb(inp_tokens, "inp_tokens", -1);
inpL = ggml_get_rows(ctx, tok_embd, inp_tokens);
inpL = ggml_get_rows(ctx, tok_embd, inp_tokens_v);
} else {
#ifdef GGML_USE_MPI
GGML_ASSERT(false && "not implemented");
#endif
inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
inpL = ggml_view_2d(ctx, inp_embd, n_embd, batch.n_tokens, inp_embd->nb[1], 0);
}
return inpL;
@ -4133,6 +4151,7 @@ static void llm_build_k_shift(
const llama_cparams & cparams,
const llama_kv_cache & kv,
struct ggml_cgraph * graph,
struct ggml_tensor * K_shift,
llm_rope_type type,
int64_t n_ctx,
float freq_base,
@ -4149,9 +4168,6 @@ static void llm_build_k_shift(
const float beta_fast = cparams.yarn_beta_fast;
const float beta_slow = cparams.yarn_beta_slow;
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
cb(K_shift, "K_shift", -1);
int rope_type = 0;
switch (type) {
@ -4455,9 +4471,9 @@ static struct ggml_tensor * llm_build_kv(
// these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced
ggml_build_forward_expand(graph, q_cur);
ggml_build_forward_expand(graph, k_cur);
ggml_build_forward_expand(graph, v_cur);
ggml_build_forward_expand(graph, q_cur);
llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
@ -4472,6 +4488,7 @@ static struct ggml_tensor * llm_build_kv(
struct llm_build_context {
const llama_model & model;
const llama_context & lctx;
const llama_hparams & hparams;
const llama_cparams & cparams;
const llama_batch & batch;
@ -4518,6 +4535,7 @@ struct llm_build_context {
const llm_build_cb & cb,
bool worst_case) :
model (lctx.model),
lctx (lctx),
hparams (model.hparams),
cparams (lctx.cparams),
batch (batch),
@ -4578,20 +4596,20 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -4762,20 +4780,20 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -4883,20 +4901,20 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -5005,15 +5023,15 @@ struct llm_build_context {
struct ggml_tensor * pos;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
@ -5102,19 +5120,19 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -5309,11 +5327,11 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
for (int il = 0; il < n_layer; ++il) {
@ -5399,11 +5417,11 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
inpL = llm_build_norm(ctx0, inpL, hparams,
@ -5492,11 +5510,11 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
for (int il = 0; il < n_layer; ++il) {
@ -5588,20 +5606,20 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -5619,12 +5637,24 @@ struct llm_build_context {
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
Qcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
@ -5699,20 +5729,20 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -5813,20 +5843,20 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -5934,20 +5964,20 @@ struct llm_build_context {
struct ggml_tensor * ffn_output;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -6056,20 +6086,20 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -6163,15 +6193,15 @@ struct llm_build_context {
struct ggml_tensor * pos;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
@ -6261,20 +6291,20 @@ struct llm_build_context {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -6368,15 +6398,7 @@ static struct ggml_cgraph * llama_build_graph(
// check if we should build the worst-case graph (for memory measurement)
const bool worst_case = ggml_tallocr_is_measure(lctx.alloc_cpu);
// keep track of the input that has already been allocated
bool alloc_inp_tokens = false;
bool alloc_inp_embd = false;
bool alloc_inp_pos = false;
bool alloc_inp_KQ_mask = false;
bool alloc_inp_K_shift = false;
// this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
// TODO: improve handling of input and output tensors, then replace this with ggml_set_name
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
if (il >= 0) {
ggml_format_name(cur, "%s-%d", name, il);
@ -6384,127 +6406,79 @@ static struct ggml_cgraph * llama_build_graph(
ggml_set_name(cur, name);
}
if (!lctx.cparams.offload_kqv) {
if (strcmp(name, "kqv_merged_cont") == 0) {
// all nodes between the KV store and the attention output are run on the CPU
ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
}
}
//
// allocate input tensors and set input data
//
if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) {
ggml_tallocr_alloc(lctx.alloc_cpu, cur);
if (!ggml_tallocr_is_measure(lctx.alloc_cpu) && batch.token) {
const int64_t n_tokens = cur->ne[0];
ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
}
alloc_inp_tokens = true;
}
if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) {
ggml_tallocr_alloc(lctx.alloc_cpu, cur);
if (!ggml_tallocr_is_measure(lctx.alloc_cpu) && batch.embd) {
const int64_t n_embd = cur->ne[0];
const int64_t n_tokens = cur->ne[1];
ggml_backend_tensor_set(cur, batch.embd, 0, n_tokens*n_embd*ggml_element_size(cur));
}
alloc_inp_embd = true;
}
if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) {
ggml_tallocr_alloc(lctx.alloc_cpu, cur);
if (!ggml_tallocr_is_measure(lctx.alloc_cpu) && batch.pos) {
const int64_t n_tokens = cur->ne[0];
static_assert(std::is_same<llama_pos, int32_t>::value, "llama_pos must be int32_t");
ggml_backend_tensor_set(cur, batch.pos, 0, n_tokens*ggml_element_size(cur));
}
alloc_inp_pos = true;
}
if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
ggml_tallocr_alloc(lctx.alloc_cpu, cur);
if (!ggml_tallocr_is_measure(lctx.alloc_cpu)) {
const int64_t n_kv = cur->ne[0];
const int64_t n_tokens = cur->ne[1];
float * data;
if (ggml_backend_buffer_is_host(cur->buffer)) {
data = (float *) cur->data;
} else {
lctx.buf_copy.resize(ggml_nbytes(cur));
data = (float *) lctx.buf_copy.data();
}
for (int h = 0; h < 1; ++h) {
for (int j = 0; j < n_tokens; ++j) {
const llama_pos pos = batch.pos[j];
const llama_seq_id seq_id = batch.seq_id[j][0];
for (int i = 0; i < n_kv; ++i) {
float f;
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
f = -INFINITY;
} else {
f = 0;
}
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
}
}
}
if (data != cur->data) {
ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
}
}
alloc_inp_KQ_mask = true;
}
if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) {
ggml_tallocr_alloc(lctx.alloc_cpu, cur);
if (!ggml_tallocr_is_measure(lctx.alloc_cpu)) {
const int64_t n_ctx = cur->ne[0];
int32_t * data;
if (ggml_backend_buffer_is_host(cur->buffer)) {
data = (int32_t *) cur->data;
} else {
lctx.buf_copy.resize(ggml_nbytes(cur));
data = (int32_t *) lctx.buf_copy.data();
}
for (int i = 0; i < n_ctx; ++i) {
data[i] = lctx.kv_self.cells[i].delta;
}
if (data != cur->data) {
ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
}
}
alloc_inp_K_shift = true;
}
};
struct ggml_cgraph * result = NULL;
struct llm_build_context llm(lctx, batch, cb, worst_case);
//
// set input data
//
if (!ggml_tallocr_is_measure(lctx.alloc_cpu)) {
if (batch.token) {
const int64_t n_tokens = batch.n_tokens;
ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
}
if (batch.embd) {
const int64_t n_embd = llm.n_embd;
const int64_t n_tokens = batch.n_tokens;
ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
}
if (batch.pos) {
const int64_t n_tokens = batch.n_tokens;
ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
}
{
const int64_t n_kv = llm.n_kv;
const int64_t n_tokens = batch.n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
float * data = (float *) lctx.inp_KQ_mask->data;
for (int h = 0; h < 1; ++h) {
for (int j = 0; j < n_tokens; ++j) {
const llama_pos pos = batch.pos[j];
const llama_seq_id seq_id = batch.seq_id[j][0];
for (int i = 0; i < n_kv; ++i) {
float f;
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
f = -INFINITY;
} else {
f = 0;
}
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
}
}
}
}
if (llm.do_rope_shift) {
const int64_t n_ctx = llm.n_ctx;
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
for (int i = 0; i < n_ctx; ++i) {
data[i] = lctx.kv_self.cells[i].delta;
}
}
}
llm.init();
switch (model.arch) {
@ -8818,9 +8792,13 @@ struct quantize_state_internal {
const llama_model_quantize_params * params;
int n_attention_wv = 0;
int n_feed_forward_w2 = 0;
int n_ffn_down = 0;
int n_ffn_gate = 0;
int n_ffn_up = 0;
int i_attention_wv = 0;
int i_feed_forward_w2 = 0;
int i_ffn_down = 0;
int i_ffn_gate = 0;
int i_ffn_up = 0;
int n_k_quantized = 0;
int n_fallback = 0;
@ -8923,8 +8901,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
++qs.i_attention_wv;
}
else if (name.find("ffn_down") != std::string::npos) {
if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K;
++qs.i_feed_forward_w2;
if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
++qs.i_ffn_down;
}
else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
} else if (name.find("attn_v.weight") != std::string::npos) {
@ -8961,18 +8939,21 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
// TODO: explore better strategies
new_type = GGML_TYPE_Q8_0;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
new_type = GGML_TYPE_Q2_K;
}
} else if (name.find("ffn_down") != std::string::npos) {
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
int i_layer, n_layer;
if (n_expert == 1) {
i_layer = qs.i_feed_forward_w2;
n_layer = qs.n_feed_forward_w2;
i_layer = qs.i_ffn_down;
n_layer = qs.n_ffn_down;
} else {
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
// sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
// for getting the current layer as I initially thought, and we need to resort to parsing the
// tensor name.
n_layer = qs.n_feed_forward_w2 / n_expert;
n_layer = qs.n_ffn_down / n_expert;
if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
}
@ -8981,7 +8962,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
}
}
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@ -9011,11 +8992,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
// same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
}
++qs.i_feed_forward_w2;
++qs.i_ffn_down;
} else if (name.find("attn_output.weight") != std::string::npos) {
if (arch != LLM_ARCH_FALCON) {
if (qs.model.hparams.n_expert == 8) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS ||
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
new_type = GGML_TYPE_Q5_K;
}
@ -9033,6 +9015,20 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
}
else if (name.find("ffn_gate") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_gate, qs.n_ffn_gate)) {
new_type = GGML_TYPE_Q2_K;
}
++qs.i_ffn_gate;
}
else if (name.find("ffn_up") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_up, qs.n_ffn_up)) {
new_type = GGML_TYPE_Q2_K;
}
++qs.i_ffn_up;
}
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
//}
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S
//else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@ -9087,8 +9083,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
// K-quants
case LLAMA_FTYPE_MOSTLY_Q2_K_S:
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@ -9156,12 +9153,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
++qs.n_attention_wv;
}
else if (name.find("ffn_down") != std::string::npos) {
++qs.n_feed_forward_w2;
++qs.n_ffn_down;
}
else if (name.find("ffn_gate") != std::string::npos) {
++qs.n_ffn_gate;
}
else if (name.find("ffn_up") != std::string::npos) {
++qs.n_ffn_up;
}
}
if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
__func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
__func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
}
size_t total_size_org = 0;
@ -9974,6 +9977,35 @@ struct llama_context * llama_new_context_with_model(
ctx->embedding.resize(hparams.n_embd);
}
// graph inputs
{
ggml_init_params init_params = {
/* .mem_size */ ggml_tensor_overhead()*5,
/* .mem_buffer */ nullptr,
/* .no_alloc */ true,
};
ctx->ctx_input = ggml_init(init_params);
ctx->inp_tokens = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
ggml_set_name(ctx->inp_tokens, "inp_tokens");
ggml_set_name(ctx->inp_embd, "inp_embd");
ggml_set_name(ctx->inp_pos, "inp_pos");
ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
ggml_backend_buffer_name(ctx->buf_input),
ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
}
// scheduler and compute buffers
{
// buffer types used for the compute buffer of each backend
std::vector<ggml_backend_buffer_type_t> backend_buft;
@ -10000,9 +10032,6 @@ struct llama_context * llama_new_context_with_model(
// initialize scheduler with the worst-case graph
ggml_backend_sched_init_measure(ctx->sched, gf);
// note: the number of splits during measure is higher than during inference due to the kv shift
int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
ctx->alloc_cpu = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
for (ggml_backend_t backend : ctx->backends) {
@ -10046,6 +10075,10 @@ struct llama_context * llama_new_context_with_model(
LLAMA_LOG_INFO("%s: logits buffer size = %8.2f MiB, type = %s\n", __func__,
ggml_backend_buffer_get_size(ctx->buf_logits) / 1024.0 / 1024.0,
ggml_backend_buffer_name(ctx->buf_logits));
// note: the number of splits during measure is higher than during inference due to the kv shift
int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
}
}

View file

@ -107,6 +107,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};

View file

@ -4,3 +4,4 @@ allow_untyped_calls = true
allow_untyped_defs = true
allow_incomplete_defs = true
disable_error_code = import-untyped
warn_return_any = false

View file

@ -2,8 +2,9 @@
#include <cassert>
#include <stdexcept>
#include <vector>
#include <string>
#include <unordered_map>
#include <vector>
static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
{0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},