Merge branch 'master' into xsn/clean_up_server

This commit is contained in:
ngxson 2024-01-22 21:41:01 +01:00
commit 58fe9cf572
34 changed files with 2271 additions and 701 deletions

View file

@ -7,6 +7,18 @@
{ system, ... }: { system, ... }:
{ {
_module.args = { _module.args = {
# Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
# again, the below creates several nixpkgs instances which the
# flake-centric CLI will be forced to evaluate e.g. on `nix flake show`.
#
# This is currently "slow" and "expensive", on a certain scale.
# This also isn't "right" in that this hinders dependency injection at
# the level of flake inputs. This might get removed in the foreseeable
# future.
#
# Note that you can use these expressions without Nix
# (`pkgs.callPackage ./devops/nix/scope.nix { }` is the entry point).
pkgsCuda = import inputs.nixpkgs { pkgsCuda = import inputs.nixpkgs {
inherit system; inherit system;
# Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc, # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,

View file

@ -73,6 +73,7 @@ let
ps: [ ps: [
ps.numpy ps.numpy
ps.sentencepiece ps.sentencepiece
ps.tiktoken
ps.torchWithoutCuda ps.torchWithoutCuda
ps.transformers ps.transformers
] ]
@ -114,14 +115,22 @@ effectiveStdenv.mkDerivation (
pname = "llama-cpp${pnameSuffix}"; pname = "llama-cpp${pnameSuffix}";
version = llamaVersion; version = llamaVersion;
# Note: none of the files discarded here are visible in the sandbox or
# affect the output hash. This also means they can be modified without
# triggering a rebuild.
src = lib.cleanSourceWith { src = lib.cleanSourceWith {
filter = filter =
name: type: name: type:
!(builtins.any (_: _) [ let
noneOf = builtins.all (x: !x);
baseName = baseNameOf name;
in
noneOf [
(lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
(name == "README.md") # Ignore *.md changes whe computing outPaths (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
(lib.hasPrefix "." name) # Skip hidden files and directories (lib.hasPrefix "." baseName) # Skip hidden files and directories
]); (baseName == "flake.lock")
];
src = lib.cleanSource ../../.; src = lib.cleanSource ../../.;
}; };
@ -159,7 +168,7 @@ effectiveStdenv.mkDerivation (
cmakeFlags = cmakeFlags =
[ [
(cmakeBool "LLAMA_NATIVE" true) (cmakeBool "LLAMA_NATIVE" false)
(cmakeBool "LLAMA_BUILD_SERVER" true) (cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" true) (cmakeBool "BUILD_SHARED_LIBS" true)
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)

View file

@ -4,6 +4,10 @@
llamaVersion ? "0.0.0", llamaVersion ? "0.0.0",
}: }:
# We're using `makeScope` instead of just writing out an attrset
# because it allows users to apply overlays later using `overrideScope'`.
# Cf. https://noogle.dev/f/lib/makeScope
lib.makeScope newScope ( lib.makeScope newScope (
self: { self: {
inherit llamaVersion; inherit llamaVersion;

View file

@ -295,7 +295,7 @@ jobs:
OPENBLAS_VERSION: 0.3.23 OPENBLAS_VERSION: 0.3.23
OPENCL_VERSION: 2023.04.17 OPENCL_VERSION: 2023.04.17
CLBLAST_VERSION: 1.6.0 CLBLAST_VERSION: 1.6.0
SDE_VERSION: 9.21.1-2023-04-24 SDE_VERSION: 9.33.0-2024-01-07
strategy: strategy:
matrix: matrix:
@ -400,7 +400,7 @@ jobs:
id: cmake_test_sde id: cmake_test_sde
if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
run: | run: |
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz" curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
# for some weird reason windows tar doesn't like sde tar.xz # for some weird reason windows tar doesn't like sde tar.xz
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar

View file

@ -2,13 +2,20 @@ name: Nix aarch64 builds
on: on:
workflow_dispatch: # allows manual triggering workflow_dispatch: # allows manual triggering
schedule:
# Rebuild daily rather than on every push because QEMU is expensive (e.g.
# 1.5h instead of minutes with the cold cache).
#
# randint(0, 59), randint(0, 23)
- cron: '26 12 * * *'
# But also rebuild if we touched any of the Nix expressions:
push: push:
branches: branches:
- master - master
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix'] paths: ['**/*.nix', 'flake.lock']
pull_request: pull_request:
types: [opened, synchronize, reopened] types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix'] paths: ['**/*.nix', 'flake.lock']
jobs: jobs:
nix-build-aarch64: nix-build-aarch64:

View file

@ -5,10 +5,8 @@ on:
push: push:
branches: branches:
- master - master
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
pull_request: pull_request:
types: [opened, synchronize, reopened] types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
jobs: jobs:
nix-eval: nix-eval:

View file

@ -47,6 +47,7 @@ option(BUILD_SHARED_LIBS "build shared libraries"
option(LLAMA_STATIC "llama: static link libraries" OFF) option(LLAMA_STATIC "llama: static link libraries" OFF)
option(LLAMA_NATIVE "llama: enable -march=native flag" ON) option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
option(LLAMA_LTO "llama: enable link time optimization" OFF) option(LLAMA_LTO "llama: enable link time optimization" OFF)
option(LLAMA_CCACHE "llama: use ccache if available" ON)
# debug # debug
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
@ -107,6 +108,13 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STA
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ON) option(LLAMA_BUILD_SERVER "llama: build server example" ON)
# add perf arguments
option(LLAMA_PERF "llama: enable perf" OFF)
if (LLAMA_PERF)
add_definitions(-DGGML_PERF)
endif()
# Required for relocatable CMake package # Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
@ -561,6 +569,17 @@ if (LLAMA_LTO)
endif() endif()
endif() endif()
if (LLAMA_CCACHE)
find_program(LLAMA_CCACHE_FOUND ccache)
if (LLAMA_CCACHE_FOUND)
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
set(ENV{CCACHE_SLOPPINESS} time_macros)
message(STATUS "Using ccache")
else()
message(STATUS "Warning: ccache not found - consider installing it or use LLAMA_CCACHE=OFF")
endif ()
endif()
# this version of Apple ld64 is buggy # this version of Apple ld64 is buggy
execute_process( execute_process(
COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v

View file

@ -128,6 +128,7 @@ as the main playground for developing new features for the [ggml](https://github
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn) - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp) - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig) - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
**UI:** **UI:**

View file

@ -203,6 +203,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.prompt_cache_all = true; params.prompt_cache_all = true;
} else if (arg == "--prompt-cache-ro") { } else if (arg == "--prompt-cache-ro") {
params.prompt_cache_ro = true; params.prompt_cache_ro = true;
} else if (arg == "-bf" || arg == "--binary-file") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::ifstream file(argv[i], std::ios::binary);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
invalid_param = true;
break;
}
// store the external file name in params
params.prompt_file = argv[i];
file.seekg(0, std::ios::end);
size_t size = file.tellg();
file.seekg(0, std::ios::beg);
params.prompt.resize(size);
file.read((char *)params.prompt.data(), size);
fprintf(stderr, "Read %zu bytes from binary file %s\n", size, argv[i]);
} else if (arg == "-f" || arg == "--file") { } else if (arg == "-f" || arg == "--file") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -653,6 +672,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
if (params.logdir.back() != DIRECTORY_SEPARATOR) { if (params.logdir.back() != DIRECTORY_SEPARATOR) {
params.logdir += DIRECTORY_SEPARATOR; params.logdir += DIRECTORY_SEPARATOR;
} }
} else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.logits_file = argv[i];
} else if (arg == "--perplexity" || arg == "--all-logits") { } else if (arg == "--perplexity" || arg == "--all-logits") {
params.logits_all = true; params.logits_all = true;
} else if (arg == "--ppl-stride") { } else if (arg == "--ppl-stride") {
@ -689,6 +714,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break; break;
} }
params.winogrande_tasks = std::stoi(argv[i]); params.winogrande_tasks = std::stoi(argv[i]);
} else if (arg == "--multiple-choice") {
params.multiple_choice = true;
} else if (arg == "--multiple-choice-tasks") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.multiple_choice_tasks = std::stoi(argv[i]);
} else if (arg == "--kl-divergence") {
params.kl_divergence = true;
} else if (arg == "--ignore-eos") { } else if (arg == "--ignore-eos") {
params.ignore_eos = true; params.ignore_eos = true;
} else if (arg == "--no-penalize-nl") { } else if (arg == "--no-penalize-nl") {
@ -888,6 +923,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
printf(" -f FNAME, --file FNAME\n"); printf(" -f FNAME, --file FNAME\n");
printf(" prompt file to start generation.\n"); printf(" prompt file to start generation.\n");
printf(" -bf FNAME, --binary-file FNAME\n");
printf(" binary file containing multiple choice tasks.\n");
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
@ -936,6 +973,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n"); printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n");
printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks); printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n");
printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base");
printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);

View file

@ -91,6 +91,7 @@ struct gpt_params {
std::string input_suffix = ""; // string to suffix user inputs with std::string input_suffix = ""; // string to suffix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string logdir = ""; // directory in which to save YAML log files std::string logdir = ""; // directory in which to save YAML log files
std::string logits_file = ""; // file for saving *all* logits
std::vector<llama_model_kv_override> kv_overrides; std::vector<llama_model_kv_override> kv_overrides;
@ -108,6 +109,11 @@ struct gpt_params {
bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
bool kl_divergence = false; // compute KL-divergence
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
bool random_prompt = false; // do not randomize prompt if none provided bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs bool use_color = false; // use color to distinguish generations and inputs

View file

@ -10,7 +10,7 @@ import re
import sys import sys
from enum import IntEnum from enum import IntEnum
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast
import numpy as np import numpy as np
import torch import torch
@ -289,6 +289,58 @@ class Model:
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(self.gguf_writer) special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_qwen(self):
dir_model = self.dir_model
hparams = self.hparams
tokens: list[bytearray] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
vocab_size = hparams["vocab_size"]
assert max(tokenizer.get_vocab().values()) < vocab_size
merges = []
vocab = {}
mergeable_ranks = tokenizer.mergeable_ranks
for token, rank in mergeable_ranks.items():
vocab[QwenModel.token_bytes_to_string(token)] = rank
if len(token) == 1:
continue
merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
assert len(merged) == 2
merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
# for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
added_vocab = tokenizer.special_tokens
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
for i in range(vocab_size):
if i not in reverse_vocab:
pad_token = f"[PAD{i}]".encode("utf-8")
tokens.append(bytearray(pad_token))
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.CONTROL)
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
special_vocab.merges = merges
# only add special tokens when they were not already loaded from config.json
if len(special_vocab.special_token_ids) == 0:
special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
# this one is usually not in config.json anyway
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_sentencepiece(self): def _set_vocab_sentencepiece(self):
from sentencepiece import SentencePieceProcessor from sentencepiece import SentencePieceProcessor
@ -487,7 +539,8 @@ class MPTModel(Model):
# map tensor names # map tensor names
if "scales" in name: if "scales" in name:
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
new_name = new_name.replace("scales", "act.scales") if new_name is not None:
new_name = new_name.replace("scales", "act.scales")
else: else:
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None: if new_name is None:
@ -876,6 +929,13 @@ class PersimmonModel(Model):
class StableLMModel(Model): class StableLMModel(Model):
def set_vocab(self):
if (self.dir_model / "tokenizer.json").is_file():
self._set_vocab_gpt2()
else:
# StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
self._set_vocab_qwen()
def set_gguf_parameters(self): def set_gguf_parameters(self):
hparams = self.hparams hparams = self.hparams
block_count = hparams["num_hidden_layers"] block_count = hparams["num_hidden_layers"]
@ -904,7 +964,7 @@ class QwenModel(Model):
return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
@staticmethod @staticmethod
def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]: def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
parts = [bytes([b]) for b in token] parts = [bytes([b]) for b in token]
while True: while True:
min_idx = None min_idx = None
@ -921,52 +981,7 @@ class QwenModel(Model):
return parts return parts
def set_vocab(self): def set_vocab(self):
dir_model = self.dir_model self._set_vocab_qwen()
hparams = self.hparams
tokens: list[bytearray] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
vocab_size = hparams["vocab_size"]
assert max(tokenizer.get_vocab().values()) < vocab_size
merges = []
vocab = {}
mergeable_ranks = tokenizer.mergeable_ranks
for token, rank in mergeable_ranks.items():
vocab[self.token_bytes_to_string(token)] = rank
if len(token) == 1:
continue
merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
assert len(merged) == 2
merges.append(' '.join(map(self.token_bytes_to_string, merged)))
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in vocab.items()}
added_vocab = tokenizer.special_tokens
for i in range(vocab_size):
if i not in reverse_vocab:
pad_token = f"[PAD{i}]".encode("utf-8")
tokens.append(bytearray(pad_token))
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.CONTROL)
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
special_vocab.merges = merges
special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self): def set_gguf_parameters(self):
self.gguf_writer.add_name("Qwen") self.gguf_writer.add_name("Qwen")
@ -1285,7 +1300,7 @@ def main() -> None:
if args.awq_path: if args.awq_path:
sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
from awq.apply_awq import add_scale_weights from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
tmp_model_path = args.model / "weighted_model" tmp_model_path = args.model / "weighted_model"
dir_model = tmp_model_path dir_model = tmp_model_path
if tmp_model_path.is_dir(): if tmp_model_path.is_dir():

View file

@ -2,6 +2,7 @@
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import os
import struct import struct
import sys import sys
from enum import IntEnum from enum import IntEnum
@ -9,7 +10,6 @@ from pathlib import Path
import numpy as np import numpy as np
import os
if 'NO_LOCAL_GGUF' not in os.environ: if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf import gguf
@ -371,15 +371,11 @@ def handle_metadata(cfg, hp):
params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path) params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
else: else:
raise ValueError('Unable to load metadata') raise ValueError('Unable to load metadata')
vocab = convert.load_vocab( vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, vocab_factory = convert.VocabFactory(vocab_path)
cfg.vocabtype) vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir)
# FIXME: Respect cfg.vocab_dir?
svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
load_merges = cfg.vocabtype == 'bpe',
n_vocab = vocab.vocab_size)
convert.check_vocab_size(params, vocab) convert.check_vocab_size(params, vocab)
return (params, vocab, svocab) return params, vocab, special_vocab
def handle_args(): def handle_args():

View file

@ -5,17 +5,16 @@ import json
import os import os
import struct import struct
import sys import sys
from pathlib import Path
from typing import Any, BinaryIO, Sequence from typing import Any, BinaryIO, Sequence
import numpy as np import numpy as np
import torch import torch
from pathlib import Path
if 'NO_LOCAL_GGUF' not in os.environ: if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf import gguf
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1} NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
@ -60,7 +59,14 @@ if __name__ == '__main__':
input_model = os.path.join(sys.argv[1], "adapter_model.bin") input_model = os.path.join(sys.argv[1], "adapter_model.bin")
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin") output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
model = torch.load(input_model, map_location="cpu") if os.path.exists(input_model):
model = torch.load(input_model, map_location="cpu")
else:
input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
# lazy import load_file only if lora is in safetensors format.
from safetensors.torch import load_file
model = load_file(input_model, device="cpu")
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama" arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
if arch_name not in gguf.MODEL_ARCH_NAMES.values(): if arch_name not in gguf.MODEL_ARCH_NAMES.values():

View file

@ -1,11 +1,13 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import torch
import os
from pprint import pprint
import sys
import argparse import argparse
import os
import sys
from pathlib import Path from pathlib import Path
from pprint import pprint
import torch
from sentencepiece import SentencePieceProcessor from sentencepiece import SentencePieceProcessor
if 'NO_LOCAL_GGUF' not in os.environ: if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf import gguf
@ -69,7 +71,7 @@ def main():
persimmon_model = torch.load(args.ckpt_path) persimmon_model = torch.load(args.ckpt_path)
hparams = persimmon_model['args'] hparams = persimmon_model['args']
pprint(hparams) pprint(hparams)
tensors = {} tensors: dict[str, torch.Tensor] = {}
_flatten_dict(persimmon_model['model'], tensors, None) _flatten_dict(persimmon_model['model'], tensors, None)
arch = gguf.MODEL_ARCH.PERSIMMON arch = gguf.MODEL_ARCH.PERSIMMON

View file

@ -17,58 +17,28 @@ import signal
import struct import struct
import sys import sys
import time import time
import warnings
import zipfile import zipfile
from abc import ABCMeta, abstractmethod from abc import ABCMeta, abstractmethod
from argparse import ArgumentParser
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import ( from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
IO,
TYPE_CHECKING,
Any,
Callable,
Iterable,
Literal,
Optional,
Tuple,
TypeVar,
)
import numpy as np import numpy as np
from sentencepiece import SentencePieceProcessor from sentencepiece import SentencePieceProcessor
try: if 'NO_LOCAL_GGUF' not in os.environ:
from transformers import AutoTokenizer sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
except ModuleNotFoundError as e: import gguf
warnings.warn(f"Could not import AutoTokenizer from transformers: {e}")
# If NO_LOCAL_GGUF is not set, try to import gguf from the local gguf-py directory if TYPE_CHECKING:
if "NO_LOCAL_GGUF" not in os.environ: from typing import TypeAlias
# Use absolute path to the gguf-py directory
gguf_py_dir = str(Path(__file__).resolve().parent / "gguf-py")
print(gguf_py_dir) # NOTE: Remove this once path is verified after changes are completed
if gguf_py_dir not in sys.path:
sys.path.insert(1, gguf_py_dir)
# Import gguf module if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
try:
import gguf
except ModuleNotFoundError as e:
print(f"Could not import gguf: {e}")
sys.exit(1)
if TYPE_CHECKING: # NOTE: This isn't necessary.
from typing import TypeAlias # This can technically be omitted.
if hasattr(faulthandler, "register") and hasattr(signal, "SIGUSR1"):
faulthandler.register(signal.SIGUSR1) faulthandler.register(signal.SIGUSR1)
# NOTE: n-dimensional arrays should be directly referenced NDArray: TypeAlias = 'np.ndarray[Any, Any]'
NDArray: TypeAlias = "np.ndarray[Any, Any]"
# Why is this here? LLAMA and GPT are technically the only compatible ARCHs.
ARCH = gguf.MODEL_ARCH.LLAMA ARCH = gguf.MODEL_ARCH.LLAMA
DEFAULT_CONCURRENCY = 8 DEFAULT_CONCURRENCY = 8
@ -78,7 +48,6 @@ DEFAULT_CONCURRENCY = 8
# #
# TODO: Clean up and refactor data types
@dataclass(frozen=True) @dataclass(frozen=True)
class DataType: class DataType:
name: str name: str
@ -183,85 +152,65 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
@dataclass @dataclass
class Params: class Params:
n_vocab: int n_vocab: int
n_embd: int n_embd: int
n_layer: int n_layer: int
n_ctx: int n_ctx: int
n_ff: int n_ff: int
n_head: int n_head: int
n_head_kv: int n_head_kv: int
f_norm_eps: Optional[float] = None n_experts: int | None = None
n_experts: Optional[int] = None n_experts_used: int | None = None
n_experts_used: Optional[int] = None f_norm_eps: float | None = None
rope_scaling_type: Optional[gguf.RopeScalingType] = None rope_scaling_type: gguf.RopeScalingType | None = None
f_rope_freq_base: Optional[float] = None f_rope_freq_base: float | None = None
f_rope_scale: Optional[float] = None f_rope_scale: float | None = None
n_orig_ctx: Optional[int] = None n_orig_ctx: int | None = None
rope_finetuned: Optional[bool] = None rope_finetuned: bool | None = None
ftype: Optional[GGMLFileType] = None ftype: GGMLFileType | None = None
# path to the directory containing the model files # path to the directory containing the model files
path_model: Optional[Path] = None path_model: Path | None = None
@staticmethod @staticmethod
def guessed(model: LazyModel) -> "Params": def guessed(model: LazyModel) -> Params:
# try transformer naming first # try transformer naming first
n_vocab, n_embd = ( n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
model["model.embed_tokens.weight"].shape
if "model.embed_tokens.weight" in model
else model["tok_embeddings.weight"].shape
)
# try transformer naming first # try transformer naming first
if "model.layers.0.self_attn.q_proj.weight" in model: if "model.layers.0.self_attn.q_proj.weight" in model:
n_layer = next( n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
i elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
for i in itertools.count() n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
if f"model.layers.{i}.self_attn.q_proj.weight" not in model
)
elif (
"model.layers.0.self_attn.W_pack.weight" in model
): # next: try baichuan naming
n_layer = next(
i
for i in itertools.count()
if f"model.layers.{i}.self_attn.W_pack.weight" not in model
)
else: else:
n_layer = next( n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
i
for i in itertools.count()
if f"layers.{i}.attention.wq.weight" not in model
)
if n_layer < 1: if n_layer < 1:
raise Exception( raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
"failed to guess 'n_layer'. This model is unknown or unsupported.\n" "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
"Suggestion: provide 'config.json' of the model in the same directory containing model files."
)
n_head = n_embd // 128 # guessed n_head = n_embd // 128 # guessed
n_mult = 256 # guessed n_mult = 256 # guessed
# TODO: verify this # TODO: verify this
n_ff = int(2 * (4 * n_embd) / 3) n_ff = int(2 * (4 * n_embd) / 3)
n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult) n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)
return Params( return Params(
n_vocab=n_vocab, n_vocab = n_vocab,
n_embd=n_embd, n_embd = n_embd,
n_layer=n_layer, n_layer = n_layer,
n_ctx=-1, n_ctx = -1,
n_ff=n_ff, n_ff = n_ff,
n_head=n_head, n_head = n_head,
n_head_kv=n_head, n_head_kv = n_head,
f_norm_eps=1e-5, f_norm_eps = 1e-5,
) )
@staticmethod @staticmethod
def load_transformers_config(model: LazyModel, config_path: Path) -> "Params": def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path)) config = json.load(open(config_path))
rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
@ -274,22 +223,20 @@ class Params:
rope_scaling_type = gguf.RopeScalingType.LINEAR rope_scaling_type = gguf.RopeScalingType.LINEAR
elif typ == "yarn": elif typ == "yarn":
rope_scaling_type = gguf.RopeScalingType.YARN rope_scaling_type = gguf.RopeScalingType.YARN
n_orig_ctx = rope_scaling["original_max_position_embeddings"] n_orig_ctx = rope_scaling['original_max_position_embeddings']
rope_finetuned = rope_scaling["finetuned"] rope_finetuned = rope_scaling['finetuned']
else: else:
raise NotImplementedError(f"Unknown rope scaling type: {typ}") raise NotImplementedError(f'Unknown rope scaling type: {typ}')
if "max_sequence_length" in config: if "max_sequence_length" in config:
n_ctx = config["max_sequence_length"] n_ctx = config["max_sequence_length"]
elif "max_position_embeddings" in config: elif "max_position_embeddings" in config:
n_ctx = config["max_position_embeddings"] n_ctx = config["max_position_embeddings"]
else: else:
raise Exception( raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
"failed to guess 'n_ctx'. This model is unknown or unsupported.\n" "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
"Suggestion: provide 'config.json' of the model in the same directory containing model files."
)
n_experts = None n_experts = None
n_experts_used = None n_experts_used = None
if "num_local_experts" in config: if "num_local_experts" in config:
@ -297,30 +244,30 @@ class Params:
n_experts_used = config["num_experts_per_tok"] n_experts_used = config["num_experts_per_tok"]
return Params( return Params(
n_vocab=config["vocab_size"], n_vocab = config["vocab_size"],
n_embd=config["hidden_size"], n_embd = config["hidden_size"],
n_layer=config["num_hidden_layers"], n_layer = config["num_hidden_layers"],
n_ctx=n_ctx, n_ctx = n_ctx,
n_ff=config["intermediate_size"], n_ff = config["intermediate_size"],
n_head=(n_head := config["num_attention_heads"]), n_head = (n_head := config["num_attention_heads"]),
n_head_kv=config.get("num_key_value_heads", n_head), n_head_kv = config.get("num_key_value_heads", n_head),
n_experts=n_experts, n_experts = n_experts,
n_experts_used=n_experts_used, n_experts_used = n_experts_used,
f_norm_eps=config["rms_norm_eps"], f_norm_eps = config["rms_norm_eps"],
f_rope_freq_base=config.get("rope_theta"), f_rope_freq_base = config.get("rope_theta"),
rope_scaling_type=rope_scaling_type, rope_scaling_type = rope_scaling_type,
f_rope_scale=f_rope_scale, f_rope_scale = f_rope_scale,
n_orig_ctx=n_orig_ctx, n_orig_ctx = n_orig_ctx,
rope_finetuned=rope_finetuned, rope_finetuned = rope_finetuned,
) )
# LLaMA v2 70B params.json # LLaMA v2 70B params.json
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1} # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
@staticmethod @staticmethod
def load_torch_params(model: LazyModel, config_path: Path) -> "Params": def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path)) config = json.load(open(config_path))
n_experts = None n_experts = None
n_experts_used = None n_experts_used = None
f_rope_freq_base = None f_rope_freq_base = None
@ -343,50 +290,50 @@ class Params:
if config.get("moe"): if config.get("moe"):
n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0] n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
n_experts = config["moe"]["num_experts"] n_experts = config["moe"]["num_experts"]
n_experts_used = config["moe"]["num_experts_per_tok"] n_experts_used = config["moe"]["num_experts_per_tok"]
f_rope_freq_base = 1e6 f_rope_freq_base = 1e6
return Params( return Params(
n_vocab=model["tok_embeddings.weight"].shape[0], n_vocab = model["tok_embeddings.weight"].shape[0],
n_embd=config["dim"], n_embd = config["dim"],
n_layer=config["n_layers"], n_layer = config["n_layers"],
n_ctx=n_ctx, n_ctx = n_ctx,
n_ff=n_ff, n_ff = n_ff,
n_head=(n_head := config["n_heads"]), n_head = (n_head := config["n_heads"]),
n_head_kv=config.get("n_kv_heads", n_head), n_head_kv = config.get("n_kv_heads", n_head),
n_experts=n_experts, n_experts = n_experts,
n_experts_used=n_experts_used, n_experts_used = n_experts_used,
f_norm_eps=config["norm_eps"], f_norm_eps = config["norm_eps"],
f_rope_freq_base=config.get("rope_theta", f_rope_freq_base), f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
) )
@staticmethod @staticmethod
def load(model_plus: ModelPlus) -> "Params": def load(model_plus: ModelPlus) -> Params:
hf_config_path = model_plus.paths[0].parent / "config.json" hf_config_path = model_plus.paths[0].parent / "config.json"
orig_config_path = model_plus.paths[0].parent / "params.json" orig_config_path = model_plus.paths[0].parent / "params.json"
if hf_config_path.exists(): if hf_config_path.exists():
params = Params.load_transformers_config(model_plus.model, hf_config_path) params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
elif orig_config_path.exists(): elif orig_config_path.exists():
params = Params.load_torch_params(model_plus.model, orig_config_path) params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
elif model_plus.format != "none": elif model_plus.format != 'none':
params = Params.guessed(model_plus.model) params = Params.guessed(model_plus.model)
else: else:
raise ValueError("Cannot guess params when model format is none") raise ValueError('Cannot guess params when model format is none')
params.path_model = model_plus.paths[0].parent params.path_model = model_plus.paths[0].parent
return params return params
class BpeVocab: # GPT #
def __init__( # vocab
self, fname_tokenizer: Path, fname_added_tokens: Optional[Path] #
) -> None:
self.bpe_tokenizer = json.loads( class BpeVocab:
open(str(fname_tokenizer), encoding="utf-8").read() def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
) self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
self.vocab = self.bpe_tokenizer["model"]["vocab"] self.vocab = self.bpe_tokenizer["model"]["vocab"]
added_tokens: dict[str, int] added_tokens: dict[str, int]
if fname_added_tokens is not None: if fname_added_tokens is not None:
@ -394,34 +341,31 @@ class BpeVocab: # GPT
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
else: else:
# Fall back to trying to find the added tokens in tokenizer.json # Fall back to trying to find the added tokens in tokenizer.json
tokenizer_json_file = fname_tokenizer.parent / "tokenizer.json" tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
if not tokenizer_json_file.is_file(): if not tokenizer_json_file.is_file():
added_tokens = {} added_tokens = {}
else: else:
tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8")) tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
added_tokens = dict( added_tokens = dict(
(item["content"], item["id"]) (item['content'], item['id'])
for item in tokenizer_json.get("added_tokens", []) for item in tokenizer_json.get('added_tokens', [])
# Added tokens here can be duplicates of the main vocabulary. # Added tokens here can be duplicates of the main vocabulary.
if item["content"] not in self.bpe_tokenizer if item['content'] not in self.bpe_tokenizer)
)
vocab_size: int = len(self.vocab) vocab_size: int = len(self.vocab)
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values()) actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids: if expected_ids != actual_ids:
expected_end_id = vocab_size + len(actual_ids) - 1 expected_end_id = vocab_size + len(actual_ids) - 1
raise Exception( raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}"
)
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
self.added_tokens_dict = added_tokens self.added_tokens_dict = added_tokens
self.added_tokens_list = [text for (text, idx) in items] self.added_tokens_list = [text for (text, idx) in items]
self.vocab_size_base: int = vocab_size self.vocab_size_base: int = vocab_size
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens self.fname_added_tokens = fname_added_tokens
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()} reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
@ -442,10 +386,8 @@ class BpeVocab: # GPT
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>" return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
class SentencePieceVocab: # LlaMa class SentencePieceVocab:
def __init__( def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]
) -> None:
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
added_tokens: dict[str, int] added_tokens: dict[str, int]
if fname_added_tokens is not None: if fname_added_tokens is not None:
@ -455,23 +397,19 @@ class SentencePieceVocab: # LlaMa
vocab_size: int = self.sentencepiece_tokenizer.vocab_size() vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
new_tokens = { new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
id: piece for piece, id in added_tokens.items() if id >= vocab_size
}
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens))) expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
actual_new_ids = sorted(new_tokens.keys()) actual_new_ids = sorted(new_tokens.keys())
if expected_new_ids != actual_new_ids: if expected_new_ids != actual_new_ids:
raise ValueError( raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}"
)
# Token pieces that were added to the base vocabulary. # Token pieces that were added to the base vocabulary.
self.added_tokens_dict = added_tokens self.added_tokens_dict = added_tokens
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
self.vocab_size_base = vocab_size self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens self.fname_added_tokens = fname_added_tokens
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
@ -512,11 +450,15 @@ class SentencePieceVocab: # LlaMa
class HfVocab: class HfVocab:
def __init__( def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
self, try:
fname_tokenizer: Path, from transformers import AutoTokenizer
fname_added_tokens: Optional[Path] = None, except ImportError as e:
) -> None: raise ImportError(
"To use HfVocab, please install the `transformers` package. "
"You can install it with `pip install transformers`."
) from e
print("fname_tokenizer:", fname_tokenizer) print("fname_tokenizer:", fname_tokenizer)
# Allow the tokenizer to default to slow or fast versions. # Allow the tokenizer to default to slow or fast versions.
# Explicitly set tokenizer to use local paths. # Explicitly set tokenizer to use local paths.
@ -529,7 +471,7 @@ class HfVocab:
# Initialize lists and dictionaries for added tokens # Initialize lists and dictionaries for added tokens
self.added_tokens_list = [] self.added_tokens_list = []
self.added_tokens_dict = dict() self.added_tokens_dict = dict()
self.added_tokens_ids = set() self.added_tokens_ids = set()
# Process added tokens # Process added tokens
for tok, tokidx in sorted( for tok, tokidx in sorted(
@ -550,12 +492,12 @@ class HfVocab:
# Set vocabulary sizes # Set vocabulary sizes
self.vocab_size_base = self.tokenizer.vocab_size self.vocab_size_base = self.tokenizer.vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens self.fname_added_tokens = fname_added_tokens
def hf_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]: def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = { reverse_vocab = {
id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items() id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
} }
@ -573,11 +515,9 @@ class HfVocab:
token_id, self.special_ids # Reuse already stored special IDs token_id, self.special_ids # Reuse already stored special IDs
) )
def get_token_type(self, token_id: int, special_ids: set) -> gguf.TokenType: def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
# Determine token type based on whether it's a special token # Determine token type based on whether it's a special token
return ( return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
)
def get_token_score(self, token_id: int) -> float: def get_token_score(self, token_id: int) -> float:
# Placeholder for actual logic to determine the token's score # Placeholder for actual logic to determine the token's score
@ -589,7 +529,6 @@ class HfVocab:
if text in self.specials: if text in self.specials:
toktype = self.get_token_type(self.specials[text], self.special_ids) toktype = self.get_token_type(self.specials[text], self.special_ids)
score = self.get_token_score(self.specials[text]) score = self.get_token_score(self.specials[text])
else: else:
toktype = gguf.TokenType.USER_DEFINED toktype = gguf.TokenType.USER_DEFINED
score = -1000.0 score = -1000.0
@ -783,7 +722,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
else: else:
model = merge_sharded([mp.model for mp in models_plus]) model = merge_sharded([mp.model for mp in models_plus])
return ModelPlus(model, paths, format, vocab) return ModelPlus(model, paths, format, vocab) # pytype: disable=wrong-arg-types
def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor: def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
@ -871,17 +810,13 @@ class LazyUnpickler(pickle.Unpickler):
CLASSES: dict[tuple[str, str], Any] = { CLASSES: dict[tuple[str, str], Any] = {
# getattr used here as a workaround for mypy not being smart enough to determine # getattr used here as a workaround for mypy not being smart enough to determine
# the staticmethods have a __func__ attribute. # the staticmethods have a __func__ attribute.
("torch._tensor", "_rebuild_from_type_v2"): getattr( ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
rebuild_from_type_v2, "__func__" ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
), ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
("torch._utils", "_rebuild_tensor_v2"): getattr( ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
lazy_rebuild_tensor_v2, "__func__" ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
), ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
("torch", "BFloat16Storage"): LazyStorageKind(DT_BF16), ('torch', 'Tensor'): LazyTensor,
("torch", "HalfStorage"): LazyStorageKind(DT_F16),
("torch", "FloatStorage"): LazyStorageKind(DT_F32),
("torch", "IntStorage"): LazyStorageKind(DT_I32),
("torch", "Tensor"): LazyTensor,
} }
def find_class(self, module: str, name: str) -> Any: def find_class(self, module: str, name: str) -> Any:
@ -968,7 +903,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
executor_class = ProcessPoolExecutor executor_class = ProcessPoolExecutor
else: else:
executor_class = ThreadPoolExecutor executor_class = ThreadPoolExecutor
with executor_class(max_workers = max_workers) as executor: with executor_class(max_workers=max_workers) as executor:
futures: list[concurrent.futures.Future[Out]] = [] futures: list[concurrent.futures.Future[Out]] = []
done = False done = False
for _ in range(concurrency): for _ in range(concurrency):
@ -1022,12 +957,8 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
class OutputFile: class OutputFile:
def __init__( def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
self, fname_out: Path, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
) -> None:
self.gguf = gguf.GGUFWriter(
fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess
)
def add_meta_arch(self, params: Params) -> None: def add_meta_arch(self, params: Params) -> None:
name = "LLaMA" name = "LLaMA"
@ -1036,21 +967,16 @@ class OutputFile:
if params.n_ctx == 4096: if params.n_ctx == 4096:
name = "LLaMA v2" name = "LLaMA v2"
elif params.path_model is not None: elif params.path_model is not None:
name = str(params.path_model.parent).split("/")[-1] name = str(params.path_model.parent).split('/')[-1]
self.gguf.add_name(name) self.gguf.add_name (name)
self.gguf.add_context_length(params.n_ctx) self.gguf.add_context_length (params.n_ctx)
self.gguf.add_embedding_length(params.n_embd) self.gguf.add_embedding_length (params.n_embd)
self.gguf.add_block_count(params.n_layer) self.gguf.add_block_count (params.n_layer)
self.gguf.add_feed_forward_length(params.n_ff) self.gguf.add_feed_forward_length (params.n_ff)
self.gguf.add_rope_dimension_count(params.n_embd // params.n_head) self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
self.gguf.add_head_count(params.n_head) self.gguf.add_head_count (params.n_head)
self.gguf.add_head_count_kv(params.n_head_kv) self.gguf.add_head_count_kv (params.n_head_kv)
if params.f_norm_eps is None:
raise ValueError("f_norm_eps is None")
self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
if params.n_experts: if params.n_experts:
self.gguf.add_expert_count(params.n_experts) self.gguf.add_expert_count(params.n_experts)
@ -1058,6 +984,11 @@ class OutputFile:
if params.n_experts_used: if params.n_experts_used:
self.gguf.add_expert_used_count(params.n_experts_used) self.gguf.add_expert_used_count(params.n_experts_used)
if params.f_norm_eps:
self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
else:
raise ValueError('f_norm_eps is None')
if params.f_rope_freq_base is not None: if params.f_rope_freq_base is not None:
self.gguf.add_rope_freq_base(params.f_rope_freq_base) self.gguf.add_rope_freq_base(params.f_rope_freq_base)
@ -1089,7 +1020,7 @@ class OutputFile:
return tokenizer_model return tokenizer_model
def extract_vocabulary_from_model(self, vocab: Vocab) -> Tuple[list, list, list]: def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
tokens = [] tokens = []
scores = [] scores = []
toktypes = [] toktypes = []
@ -1124,14 +1055,10 @@ class OutputFile:
def add_tensor_info(self, name: str, tensor: LazyTensor) -> None: def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
n_elements = int(np.prod(tensor.shape)) n_elements = int(np.prod(tensor.shape))
raw_dtype = getattr(tensor.data_type, "ggml_type", None) raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
data_type = ( data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
getattr(tensor.data_type, "quantized_type", None) or tensor.data_type.dtype
)
data_nbytes = tensor.data_type.elements_to_bytes(n_elements) data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
self.gguf.add_tensor_info( self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype)
name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype
)
def write_meta(self) -> None: def write_meta(self) -> None:
self.gguf.write_header_to_file() self.gguf.write_header_to_file()
@ -1145,14 +1072,10 @@ class OutputFile:
@staticmethod @staticmethod
def write_vocab_only( def write_vocab_only(
fname_out: Path, fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
params: Params, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
vocab: Vocab,
svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
) -> None: ) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab) check_vocab_size(params, vocab, pad_vocab = pad_vocab)
of = OutputFile(fname_out, endianess=endianess) of = OutputFile(fname_out, endianess=endianess)
@ -1180,14 +1103,8 @@ class OutputFile:
@staticmethod @staticmethod
def write_all( def write_all(
fname_out: Path, fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
ftype: GGMLFileType, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
params: Params,
model: LazyModel,
vocab: Vocab,
svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False, pad_vocab: bool = False,
) -> None: ) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab) check_vocab_size(params, vocab, pad_vocab=pad_vocab)
@ -1207,26 +1124,19 @@ class OutputFile:
of.write_tensor_info() of.write_tensor_info()
# tensor data # tensor data
ndarrays_inner = bounded_parallel_map( ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
OutputFile.do_item, model.items(), concurrency=concurrency
)
if ftype == GGMLFileType.MostlyQ8_0: if ftype == GGMLFileType.MostlyQ8_0:
ndarrays = bounded_parallel_map( ndarrays = bounded_parallel_map(
OutputFile.maybe_do_quantize, OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
ndarrays_inner,
concurrency=concurrency,
max_workers=concurrency,
use_processpool_executor=True, use_processpool_executor=True,
) )
else: else:
ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
start = time.time() start = time.time()
for i, ((name, lazy_tensor), ndarray) in enumerate( for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
zip(model.items(), ndarrays)
):
elapsed = time.time() - start elapsed = time.time() - start
size = " x ".join(f"{dim:6d}" for dim in lazy_tensor.shape) size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
padi = len(str(len(model))) padi = len(str(len(model)))
print( print(
f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
@ -1363,7 +1273,7 @@ def load_some_model(path: Path) -> ModelPlus:
class VocabFactory: class VocabFactory:
def __init__(self, path: Path): def __init__(self, path: Path):
self.path = path self.path = path
self.files = { self.files: dict[str, Path | None] = {
"tokenizer.model": None, "tokenizer.model": None,
"vocab.json": None, "vocab.json": None,
"tokenizer.json": None, "tokenizer.json": None,
@ -1380,24 +1290,18 @@ class VocabFactory:
self.files[file] = parent_file_path self.files[file] = parent_file_path
print(f"Found vocab files: {self.files}") print(f"Found vocab files: {self.files}")
def _select_file(self, vocabtype: Optional[str]) -> Path: def _select_file(self, vocabtype: str | None) -> Path:
if vocabtype in ["spm", "bpe"]: if vocabtype in ["spm", "bpe"]:
for file_key in self.files.keys(): for file_key in self.files.keys():
if self.files[file_key]: if (file := self.files[file_key]) is not None:
return self.files[file_key] return file
raise FileNotFoundError(f"{vocabtype} vocab not found.") raise FileNotFoundError(f"{vocabtype} vocab not found.")
elif vocabtype == "hfft": if vocabtype == "hfft":
# For Hugging Face Fast Tokenizer, return the directory path instead of a specific file # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
return self.path return self.path
else: raise ValueError(f"Unsupported vocabulary type {vocabtype}")
raise ValueError(f"Unsupported vocabulary type {vocabtype}")
def _create_special_vocab( def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
self,
vocab: Vocab,
vocabtype: str,
model_parent_path: Path,
) -> gguf.SpecialVocab:
load_merges = vocabtype == "bpe" load_merges = vocabtype == "bpe"
n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
return gguf.SpecialVocab( return gguf.SpecialVocab(
@ -1407,13 +1311,12 @@ class VocabFactory:
n_vocab=n_vocab, n_vocab=n_vocab,
) )
def load_vocab( def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
self, vocabtype: str, model_parent_path: Path
) -> Tuple[Vocab, gguf.SpecialVocab]:
path = self._select_file(vocabtype) path = self._select_file(vocabtype)
print(f"Loading vocab file '{path}', type '{vocabtype}'") print(f"Loading vocab file '{path}', type '{vocabtype}'")
added_tokens_path = path.parent / "added_tokens.json" added_tokens_path = path.parent / "added_tokens.json"
vocab: Vocab
if vocabtype == "bpe": if vocabtype == "bpe":
vocab = BpeVocab( vocab = BpeVocab(
path, added_tokens_path if added_tokens_path.exists() else None path, added_tokens_path if added_tokens_path.exists() else None
@ -1428,6 +1331,7 @@ class VocabFactory:
) )
else: else:
raise ValueError(f"Unsupported vocabulary type {vocabtype}") raise ValueError(f"Unsupported vocabulary type {vocabtype}")
# FIXME: Respect --vocab-dir?
special_vocab = self._create_special_vocab( special_vocab = self._create_special_vocab(
vocab, vocab,
vocabtype, vocabtype,
@ -1436,18 +1340,17 @@ class VocabFactory:
return vocab, special_vocab return vocab, special_vocab
def default_output_file(model_paths: list[Path], file_type: GGMLFileType) -> Path: def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
namestr = { namestr = {
GGMLFileType.AllF32: "f32", GGMLFileType.AllF32: "f32",
GGMLFileType.MostlyF16: "f16", GGMLFileType.MostlyF16: "f16",
GGMLFileType.MostlyQ8_0: "q8_0", GGMLFileType.MostlyQ8_0:"q8_0",
}[file_type] }[file_type]
ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf" ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
if ret in model_paths: if ret in model_paths:
sys.stderr.write( sys.stderr.write(
f"Error: Default output path ({ret}) would overwrite the input. " f"Error: Default output path ({ret}) would overwrite the input. "
"Please explicitly specify a path using --outfile.\n" "Please explicitly specify a path using --outfile.\n")
)
sys.exit(1) sys.exit(1)
return ret return ret
@ -1457,111 +1360,34 @@ def do_dump_model(model_plus: ModelPlus) -> None:
print(f"model_plus.format = {model_plus.format!r}") print(f"model_plus.format = {model_plus.format!r}")
print(f"model_plus.vocab = {model_plus.vocab!r}") print(f"model_plus.vocab = {model_plus.vocab!r}")
for name, lazy_tensor in model_plus.model.items(): for name, lazy_tensor in model_plus.model.items():
print( print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}"
)
def get_argument_parser() -> ArgumentParser: def main(args_in: list[str] | None = None) -> None:
output_choices = ["f32", "f16"] output_choices = ["f32", "f16"]
if np.uint32(1) == np.uint32(1).newbyteorder("<"): if np.uint32(1) == np.uint32(1).newbyteorder("<"):
# We currently only support Q8_0 output on little endian systems. # We currently only support Q8_0 output on little endian systems.
output_choices.append("q8_0") output_choices.append("q8_0")
vocab_types = ["spm", "bpe", "hfft"]
parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine")
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
parser = argparse.ArgumentParser( args = parser.parse_args(args_in)
description="Convert a LLaMa model to a GGML compatible file"
)
parser.add_argument(
"model",
type=Path,
help="Directory containing the model file or the model file itself (*.pth, *.pt, *.bin)",
)
parser.add_argument(
"--awq-path",
type=Path,
help="Path to the Activation-aware Weight Quantization cache file",
default=None,
)
parser.add_argument(
"--dump",
action="store_true",
help="Display the model content without converting it",
)
parser.add_argument(
"--dump-single",
action="store_true",
help="Display the content of a single model file without conversion",
)
parser.add_argument(
"--vocab-only",
action="store_true",
help="Extract and output only the vocabulary",
)
parser.add_argument(
"--outtype",
choices=output_choices,
help="Output format - note: q8_0 may be very slow (default: f16 or f32 based on input)",
)
parser.add_argument(
"--vocab-dir",
type=Path,
help="Directory containing the tokenizer.model, if separate from the model file",
)
parser.add_argument(
"--vocab-type",
choices=["spm", "bpe", "hfft"], # hfft: Hugging Face Fast Tokenizer
default="spm",
help="The vocabulary format used to define the tokenizer model (default: spm)",
)
parser.add_argument(
"--pad-vocab",
action="store_true",
help="Add padding tokens when the model's vocabulary size exceeds the tokenizer metadata",
)
parser.add_argument(
"--outfile",
type=Path,
help="Specify the path for the output file (default is based on input)",
)
parser.add_argument(
"--ctx", type=int, help="Model training context (default is based on input)"
)
parser.add_argument(
"--concurrency",
type=int,
help=f"Concurrency used for conversion (default: {DEFAULT_CONCURRENCY})",
default=DEFAULT_CONCURRENCY,
)
parser.add_argument(
"--big-endian",
action="store_true",
help="Indicate that the model is executed on a big-endian machine",
)
return parser
def main(argv: Optional[list[str]] = None) -> None:
parser = get_argument_parser()
args = parser.parse_args(argv)
if args.awq_path: if args.awq_path:
sys.path.insert(1, str(Path(__file__).resolve().parent / "awq-py")) sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
from awq.apply_awq import add_scale_weights from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
tmp_model_path = args.model / "weighted_model" tmp_model_path = args.model / "weighted_model"
if tmp_model_path.is_dir(): if tmp_model_path.is_dir():
print(f"{tmp_model_path} exists as a weighted model.") print(f"{tmp_model_path} exists as a weighted model.")
@ -1580,14 +1406,11 @@ def main(argv: Optional[list[str]] = None) -> None:
if not args.vocab_only: if not args.vocab_only:
model_plus = load_some_model(args.model) model_plus = load_some_model(args.model)
else: else:
model_plus = ModelPlus( model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
model={}, paths=[args.model / "dummy"], format="none", vocab=None
)
if args.dump: if args.dump:
do_dump_model(model_plus) do_dump_model(model_plus)
return return
endianess = gguf.GGUFEndian.LITTLE endianess = gguf.GGUFEndian.LITTLE
if args.big_endian: if args.big_endian:
endianess = gguf.GGUFEndian.BIG endianess = gguf.GGUFEndian.BIG
@ -1595,12 +1418,10 @@ def main(argv: Optional[list[str]] = None) -> None:
params = Params.load(model_plus) params = Params.load(model_plus)
if params.n_ctx == -1: if params.n_ctx == -1:
if args.ctx is None: if args.ctx is None:
raise Exception( raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
"The model doesn't have a context size, and you didn't specify one with --ctx\n" "Please specify one with --ctx:\n"
"Please specify one with --ctx:\n" " - LLaMA v1: --ctx 2048\n"
" - LLaMA v1: --ctx 2048\n" " - LLaMA v2: --ctx 4096\n")
" - LLaMA v2: --ctx 4096\n"
)
params.n_ctx = args.ctx params.n_ctx = args.ctx
if args.outtype: if args.outtype:
@ -1621,42 +1442,30 @@ def main(argv: Optional[list[str]] = None) -> None:
if not args.outfile: if not args.outfile:
raise ValueError("need --outfile if using --vocab-only") raise ValueError("need --outfile if using --vocab-only")
outfile = args.outfile outfile = args.outfile
OutputFile.write_vocab_only( OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
outfile, endianess=endianess, pad_vocab=args.pad_vocab)
params,
vocab,
special_vocab,
endianess=endianess,
pad_vocab=args.pad_vocab,
)
print(f"Wrote {outfile}") print(f"Wrote {outfile}")
return return
if model_plus.vocab is not None and args.vocab_dir is None: if model_plus.vocab is not None and args.vocab_dir is None:
vocab = model_plus.vocab vocab = model_plus.vocab
model = model_plus.model print(f"Vocab info: {vocab}")
model = convert_model_names(model, params) print(f"Special vocab info: {special_vocab}")
ftype = pick_output_type(model, args.outtype)
model = convert_to_output_type(model, ftype) model = model_plus.model
outfile = args.outfile or default_output_file(model_plus.paths, ftype) model = convert_model_names(model, params)
ftype = pick_output_type(model, args.outtype)
model = convert_to_output_type(model, ftype)
outfile = args.outfile or default_outfile(model_plus.paths, ftype)
params.ftype = ftype params.ftype = ftype
print(f"Writing {outfile}, format {ftype}") print(f"Writing {outfile}, format {ftype}")
OutputFile.write_all( OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
outfile, concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
ftype,
params,
model,
vocab,
special_vocab,
concurrency=args.concurrency,
endianess=endianess,
pad_vocab=args.pad_vocab,
)
print(f"Wrote {outfile}") print(f"Wrote {outfile}")
if __name__ == "__main__": if __name__ == '__main__':
main(sys.argv[1:]) # Exclude the first element (script name) from sys.argv main()

View file

@ -1800,6 +1800,8 @@ int main(int argc, char ** argv) {
std::vector<size_t> train_samples_begin; std::vector<size_t> train_samples_begin;
std::vector<size_t> train_samples_size; std::vector<size_t> train_samples_size;
printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data); printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str());
printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ? "true" : "false");
tokenize_file(lctx, tokenize_file(lctx,
params.common.fn_train_data, params.common.fn_train_data,
params.common.sample_start, params.common.sample_start,

View file

@ -26,6 +26,7 @@ struct StatParams {
std::string ofile = "imatrix.dat"; std::string ofile = "imatrix.dat";
int n_output_frequency = 10; int n_output_frequency = 10;
int verbosity = 1; int verbosity = 1;
int keep_every = 0;
bool collect_output_weight = false; bool collect_output_weight = false;
}; };
@ -42,6 +43,9 @@ private:
int m_last_call = 0; int m_last_call = 0;
std::vector<float> m_src1_data; std::vector<float> m_src1_data;
std::vector<int> m_ids; // the expert ids from ggml_mul_mat_id std::vector<int> m_ids; // the expert ids from ggml_mul_mat_id
//
void save_imatrix(const char * file_name) const;
void keep_imatrix(int ncall) const;
}; };
bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@ -117,6 +121,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (m_last_call % m_params.n_output_frequency == 0) { if (m_last_call % m_params.n_output_frequency == 0) {
save_imatrix(); save_imatrix();
} }
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
keep_imatrix(m_last_call);
}
} }
} }
} else { } else {
@ -143,6 +150,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (m_last_call % m_params.n_output_frequency == 0) { if (m_last_call % m_params.n_output_frequency == 0) {
save_imatrix(); save_imatrix();
} }
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
keep_imatrix(m_last_call);
}
} }
} }
@ -150,7 +160,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
} }
void IMatrixCollector::save_imatrix() const { void IMatrixCollector::save_imatrix() const {
const char * fname = m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(); save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
}
void IMatrixCollector::keep_imatrix(int ncall) const {
auto file_name = m_params.ofile;
if (file_name.empty()) file_name = "imatrix.dat";
file_name += ".at_";
file_name += std::to_string(ncall);
save_imatrix(file_name.c_str());
}
void IMatrixCollector::save_imatrix(const char * fname) const {
std::ofstream out(fname, std::ios::binary); std::ofstream out(fname, std::ios::binary);
int n_entries = m_stats.size(); int n_entries = m_stats.size();
out.write((const char*)&n_entries, sizeof(n_entries)); out.write((const char*)&n_entries, sizeof(n_entries));
@ -248,7 +269,7 @@ static void process_logits(
} }
} }
static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
@ -269,10 +290,12 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
} }
std::vector<float> logit_history; std::vector<float> logit_history;
logit_history.resize(tokens.size());
std::vector<float> prob_history; std::vector<float> prob_history;
prob_history.resize(tokens.size());
if (compute_ppl) {
logit_history.resize(tokens.size());
prob_history.resize(tokens.size());
}
const int n_chunk_max = tokens.size() / n_ctx; const int n_chunk_max = tokens.size() / n_ctx;
@ -288,12 +311,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1); std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
std::vector<float> logits;
if (compute_ppl && num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab);
}
for (int i = 0; i < n_chunk; ++i) { for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx; const int start = i * n_ctx;
const int end = start + n_ctx; const int end = start + n_ctx;
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
std::vector<float> logits; std::vector<float> logits;
const auto t_start = std::chrono::high_resolution_clock::now(); const auto t_start = std::chrono::high_resolution_clock::now();
@ -321,8 +349,10 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
// restore the original token in case it was set to BOS // restore the original token in case it was set to BOS
tokens[batch_start] = token_org; tokens[batch_start] = token_org;
const auto * batch_logits = llama_get_logits(ctx); if (compute_ppl && num_batches > 1) {
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
}
} }
const auto t_end = std::chrono::high_resolution_clock::now(); const auto t_end = std::chrono::high_resolution_clock::now();
@ -338,25 +368,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
} }
const int first = n_ctx/2; if (compute_ppl) {
process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, const int first = n_ctx/2;
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
count += n_ctx - first - 1; process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += n_ctx - first - 1;
printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
fflush(stdout); fflush(stdout);
logits.clear();
}
} }
printf("\n"); printf("\n");
nll2 /= count; if (compute_ppl) {
nll /= count; nll2 /= count;
const double ppl = exp(nll); nll /= count;
nll2 -= nll * nll; const double ppl = exp(nll);
if (nll2 > 0) { nll2 -= nll * nll;
nll2 = sqrt(nll2/(count-1)); if (nll2 > 0) {
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); nll2 = sqrt(nll2/(count-1));
} else { printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
printf("Unexpected negative standard deviation of log(prob)\n"); } else {
printf("Unexpected negative standard deviation of log(prob)\n");
}
} }
return true; return true;
@ -365,6 +402,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
StatParams sparams; StatParams sparams;
bool compute_ppl = true;
std::vector<char*> args; std::vector<char*> args;
args.push_back(argv[0]); args.push_back(argv[0]);
int iarg = 1; int iarg = 1;
@ -381,12 +419,21 @@ int main(int argc, char ** argv) {
} }
else if (arg == "--verbosity") { else if (arg == "--verbosity") {
sparams.verbosity = std::stoi(argv[++iarg]); sparams.verbosity = std::stoi(argv[++iarg]);
} else if (arg == "--no-ppl") {
compute_ppl = false;
} else if (arg == "--keep-imatrix") {
sparams.keep_every = std::stoi(argv[++iarg]);
} else { } else {
args.push_back(argv[iarg]); args.push_back(argv[iarg]);
} }
} }
if (iarg < argc) { if (iarg < argc) {
args.push_back(argv[iarg]); std::string arg{argv[iarg]};
if (arg == "--no-ppl") {
compute_ppl = false;
} else {
args.push_back(argv[iarg]);
}
} }
gpt_params params; gpt_params params;
@ -448,7 +495,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s\n", get_system_info(params).c_str()); fprintf(stderr, "%s\n", get_system_info(params).c_str());
} }
bool OK = compute_imatrix(ctx, params); bool OK = compute_imatrix(ctx, params, compute_ppl);
if (!OK) { if (!OK) {
return 1; return 1;
} }

View file

@ -0,0 +1,131 @@
# MobileVLM
Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.
for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM)
The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.
## Usage
Build with cmake or run `make llava-cli` to build it.
After building, run: `./llava-cli` to see the usage. For example:
```sh
./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
--mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
--image path/to/an/image.jpg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
```
## Model conversion
- Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
```sh
git clone https://huggingface.co/mtgv/MobileVLM-1.7B
git clone https://huggingface.co/openai/clip-vit-large-patch14-336
```
2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
```sh
python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
```
3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
```sh
python ./examples/llava/convert-image-encoder-to-gguf \
-m path/to/clip-vit-large-patch14-336 \
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
--output-dir path/to/MobileVLM-1.7B \
--projector-type ldp
```
4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
python ./convert.py path/to/MobileVLM-1.7B
```
5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
```sh
./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
```
Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.
## Android compile and run
### compile
refer to `examples/llava/android/build_64.sh`
```sh
mkdir examples/llava/android/build_64
cd examples/llava/android/build_64
../build_64.sh
```
### run on Android
refer to `android/adb_run.sh`, modify resources' `name` and `path`
## some result on Android with `Snapdragon 888` chip
### case 1
**input**
```sh
/data/local/tmp/llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
--image /data/local/tmp/demo.jpg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
```
**output**
```sh
encode_image_with_clip: image encoded in 21148.71 ms by CLIP ( 146.87 ms per image patch)
Susan Wise Bauer
llama_print_timings: load time = 23574.72 ms
llama_print_timings: sample time = 1.24 ms / 6 runs ( 0.21 ms per token, 4850.44 tokens per second)
llama_print_timings: prompt eval time = 12460.15 ms / 246 tokens ( 50.65 ms per token, 19.74 tokens per second)
llama_print_timings: eval time = 424.86 ms / 6 runs ( 70.81 ms per token, 14.12 tokens per second)
llama_print_timings: total time = 34731.93 ms
```
### case 2
**input**
```sh
/data/local/tmp/llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
--image /data/local/tmp/cat.jpeg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
```
**output**
```sh
encode_image_with_clip: image encoded in 21149.51 ms by CLIP ( 146.87 ms per image patch)
The image depicts a cat sitting in the grass near some tall green plants.
llama_print_timings: load time = 23257.32 ms
llama_print_timings: sample time = 5.25 ms / 18 runs ( 0.29 ms per token, 3430.53 tokens per second)
llama_print_timings: prompt eval time = 11900.73 ms / 232 tokens ( 51.30 ms per token, 19.49 tokens per second)
llama_print_timings: eval time = 1279.03 ms / 18 runs ( 71.06 ms per token, 14.07 tokens per second)
llama_print_timings: total time = 34570.79 ms
```
## Minor shortcomings
The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quickly, we uniformly modified `clip_n_patches` function to a quarter. when counting the time consumption, the calculated time will be 4 times bigger than the real cost.
## TODO
- [ ] Support non-CPU backend for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
- [ ] Optimize LDP projector performance
- Optimize the structure definition to avoid unnecessary memory rearrangements, to reduce the use of `ggml_permute_cpy`;
- Optimize operator implementation (ARM CPU/NVIDIA GPU): such as depthwise conv, hardswish, hardsigmoid, etc.
- [ ] run MobileVLM on `Jetson Orin`
- [ ] Support more model variants, such as `MobileVLM-3B`.
## contributor
```sh
zhangjidong05, yangyang260, huyiming03, chenxiaotao03
```

View file

@ -0,0 +1,53 @@
#!/bin/bash
model_dir="/Users/cxt/model/llm/mobileVLM/MobileVLM-1.7B_processed"
projector_name="mmproj-model-f16.gguf"
llama_name="ggml-model-q4_k.gguf"
img_dir="/Users/cxt/model/llm"
img_name="demo.jpg"
prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
# img_name="cat.jpeg"
# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
program_dir="build_64/bin"
binName="llava-cli"
n_threads=4
deviceDir="/data/local/tmp"
saveDir="output"
if [ ! -d ${saveDir} ]; then
mkdir ${saveDir}
fi
function android_run() {
# # copy resource into device
# adb push ${model_dir}/${projector_name} ${deviceDir}/${projector_name}
# adb push ${model_dir}/${llama_name} ${deviceDir}/${llama_name}
adb push ${img_dir}/${img_name} ${deviceDir}/${img_name}
# copy program into device
adb push ${program_dir}/${binName} ${deviceDir}/${binName}
adb shell "chmod 0777 ${deviceDir}/${binName}"
# run
adb shell "echo cd ${deviceDir} ${deviceDir}/${binName} \
-m ${deviceDir}/${llama_name} \
--mmproj ${deviceDir}/${projector_name} \
-t ${n_threads} \
--image ${deviceDir}/${img_name} \
-p \"${prompt}\" \
> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt"
adb shell "cd ${deviceDir}; pwd; ${deviceDir}/${binName} \
-m ${deviceDir}/${llama_name} \
--mmproj ${deviceDir}/${projector_name} \
-t ${n_threads} \
--image ${deviceDir}/${img_name} \
-p \"${prompt}\" \
>> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt 2>&1"
adb pull ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt ${saveDir}
}
android_run
echo "android_run is Done!"

View file

@ -0,0 +1,8 @@
#!/bin/bash
cmake ../../../../ \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DCMAKE_BUILD_TYPE=Release \
-DANDROID_ABI="arm64-v8a" \
-DANDROID_PLATFORM=android-23 $1
make -j4

View file

@ -12,6 +12,7 @@
#include <regex> #include <regex>
#include <stdexcept> #include <stdexcept>
#include <vector> #include <vector>
#include <sstream>
#include "clip.h" #include "clip.h"
#include "ggml.h" #include "ggml.h"
@ -67,6 +68,7 @@ static std::string format(const char * fmt, ...) {
#define KEY_PATCH_SIZE "clip.vision.patch_size" #define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean" #define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std" #define KEY_IMAGE_STD "clip.vision.image_std"
#define KEY_PROJ_TYPE "clip.projector_type"
// //
// tensor name constants // tensor name constants
@ -89,6 +91,21 @@ static std::string format(const char * fmt, ...) {
#define TN_TEXT_PROJ "text_projection.weight" #define TN_TEXT_PROJ "text_projection.weight"
#define TN_VIS_PROJ "visual_projection.weight" #define TN_VIS_PROJ "visual_projection.weight"
#define TN_LLAVA_PROJ "mm.%d.%s" #define TN_LLAVA_PROJ "mm.%d.%s"
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
enum projector_type {
PROJECTOR_TYPE_MLP,
PROJECTOR_TYPE_LDP,
PROJECTOR_TYPE_UNKNOWN,
};
static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_MLP, "mlp" },
{ PROJECTOR_TYPE_LDP, "ldp" },
};
// //
// utilities to get data from a gguf file // utilities to get data from a gguf file
@ -129,6 +146,91 @@ static std::string get_ftype(int ftype) {
return ggml_type_name(static_cast<ggml_type>(ftype)); return ggml_type_name(static_cast<ggml_type>(ftype));
} }
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
switch (type) {
case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
default: return format("unknown type %d", type);
}
}
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
std::string result;
for (size_t pos = 0; ; pos += search.length()) {
auto new_pos = s.find(search, pos);
if (new_pos == std::string::npos) {
result += s.substr(pos, s.size() - pos);
break;
}
result += s.substr(pos, new_pos - pos) + replace;
pos = new_pos;
}
s = std::move(result);
}
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
switch (type) {
case GGUF_TYPE_STRING:
return gguf_get_val_str(ctx_gguf, i);
case GGUF_TYPE_ARRAY:
{
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
int arr_n = gguf_get_arr_n(ctx_gguf, i);
const void * data = gguf_get_arr_data(ctx_gguf, i);
std::stringstream ss;
ss << "[";
for (int j = 0; j < arr_n; j++) {
if (arr_type == GGUF_TYPE_STRING) {
std::string val = gguf_get_arr_str(ctx_gguf, i, j);
// escape quotes
replace_all(val, "\\", "\\\\");
replace_all(val, "\"", "\\\"");
ss << '"' << val << '"';
} else if (arr_type == GGUF_TYPE_ARRAY) {
ss << "???";
} else {
ss << gguf_data_to_str(arr_type, data, j);
}
if (j < arr_n - 1) {
ss << ", ";
}
}
ss << "]";
return ss.str();
}
default:
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
}
}
static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") {
size_t tensor_size = ggml_nbytes(tensor);
printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%d, %d, %d, %d], type: %d\n",
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->type);
}
static projector_type clip_projector_type_from_string(const std::string & name) {
for (const auto & kv : PROJECTOR_TYPE_NAMES) { // NOLINT
if (kv.second == name) {
return kv.first;
}
}
return PROJECTOR_TYPE_UNKNOWN;
}
// //
// image data // image data
// //
@ -205,6 +307,32 @@ struct clip_vision_model {
struct ggml_tensor * mm_0_b; struct ggml_tensor * mm_0_b;
struct ggml_tensor * mm_2_w; struct ggml_tensor * mm_2_w;
struct ggml_tensor * mm_2_b; struct ggml_tensor * mm_2_b;
// MobileVLM projection
struct ggml_tensor * mm_model_mlp_1_w;
struct ggml_tensor * mm_model_mlp_1_b;
struct ggml_tensor * mm_model_mlp_3_w;
struct ggml_tensor * mm_model_mlp_3_b;
struct ggml_tensor * mm_model_block_1_block_0_0_w;
struct ggml_tensor * mm_model_block_1_block_0_1_w;
struct ggml_tensor * mm_model_block_1_block_0_1_b;
struct ggml_tensor * mm_model_block_1_block_1_fc1_w;
struct ggml_tensor * mm_model_block_1_block_1_fc1_b;
struct ggml_tensor * mm_model_block_1_block_1_fc2_w;
struct ggml_tensor * mm_model_block_1_block_1_fc2_b;
struct ggml_tensor * mm_model_block_1_block_2_0_w;
struct ggml_tensor * mm_model_block_1_block_2_1_w;
struct ggml_tensor * mm_model_block_1_block_2_1_b;
struct ggml_tensor * mm_model_block_2_block_0_0_w;
struct ggml_tensor * mm_model_block_2_block_0_1_w;
struct ggml_tensor * mm_model_block_2_block_0_1_b;
struct ggml_tensor * mm_model_block_2_block_1_fc1_w;
struct ggml_tensor * mm_model_block_2_block_1_fc1_b;
struct ggml_tensor * mm_model_block_2_block_1_fc2_w;
struct ggml_tensor * mm_model_block_2_block_1_fc2_b;
struct ggml_tensor * mm_model_block_2_block_2_0_w;
struct ggml_tensor * mm_model_block_2_block_2_1_w;
struct ggml_tensor * mm_model_block_2_block_2_1_b;
}; };
struct clip_ctx { struct clip_ctx {
@ -213,6 +341,7 @@ struct clip_ctx {
bool has_llava_projector = false; bool has_llava_projector = false;
struct clip_vision_model vision_model; struct clip_vision_model vision_model;
projector_type proj_type = PROJECTOR_TYPE_MLP;
float image_mean[3]; float image_mean[3];
float image_std[3]; float image_std[3];
@ -430,16 +559,135 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
free(patches_data); free(patches_data);
} }
// shape [1, 576, 1024]
// ne is whcn, ne = [1024, 576, 1, 1]
embeddings = ggml_get_rows(ctx0, embeddings, patches); embeddings = ggml_get_rows(ctx0, embeddings, patches);
// mm projection 0 // print_tensor_info(embeddings, "embeddings");
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
embeddings = ggml_gelu(ctx0, embeddings); // llava projector
if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); embeddings = ggml_gelu(ctx0, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
}
else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projector
int n_patch = 24;
struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
mlp_1 = ggml_gelu(ctx0, mlp_1);
struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
// mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
// block 1
struct ggml_tensor * block_1 = nullptr;
{
// transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
// stride = 1, padding = 1, bias is nullptr
block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, nullptr, 1, 1, 1, 1, 1, 1);
// layer norm
// // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
// block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
// hardswish
struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
// pointwise conv
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
block_1 = ggml_relu(ctx0, block_1);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
block_1 = ggml_hardsigmoid(ctx0, block_1);
// block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
int w = block_1->ne[0], h = block_1->ne[1];
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
// block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
// residual
block_1 = ggml_add(ctx0, mlp_3, block_1);
}
// block_2
{
// stride = 2
block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, nullptr, 2, 2, 1, 1, 1, 1);
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
// layer norm
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
// hardswish
struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
// not sure the parameters is right for globalAvgPooling
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
// pointwise conv
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
block_1 = ggml_relu(ctx0, block_1);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
block_1 = ggml_hardsigmoid(ctx0, block_1);
// block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
int w = block_1->ne[0], h = block_1->ne[1];
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
// block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
}
embeddings = block_1;
}
else {
GGML_ASSERT(false);
}
} }
// build the graph // build the graph
@ -485,16 +733,55 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
printf("\n"); printf("\n");
} }
const int n_tensors = gguf_get_n_tensors(ctx); const int n_tensors = gguf_get_n_tensors(ctx);
// kv // kv
if (verbosity >= 3) { const int n_kv = gguf_get_n_kv(ctx);
const int n_kv = gguf_get_n_kv(ctx); printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
__func__, n_kv, n_tensors, fname);
{
std::map<enum ggml_type, uint32_t> n_type;
for (int i = 0; i < n_kv; ++i) { uint32_t n_type_max = 0;
const char * key = gguf_get_key(ctx, i); enum ggml_type type_max = GGML_TYPE_F32;
printf("%s: kv[%d]: key = %s\n", __func__, i, key); for (int i = 0; i < n_tensors; i++) {
enum ggml_type type = gguf_get_tensor_type(ctx, i);
n_type[type]++;
if (n_type_max < n_type[type]) {
n_type_max = n_type[type];
type_max = type;
}
}
printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
for (int i = 0; i < n_kv; i++) {
const char * name = gguf_get_key(ctx, i);
const enum gguf_type type = gguf_get_kv_type(ctx, i);
const std::string type_name =
type == GGUF_TYPE_ARRAY
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i))
: gguf_type_name(type);
std::string value = gguf_kv_to_str(ctx, i);
const size_t MAX_VALUE_LEN = 40;
if (value.size() > MAX_VALUE_LEN) {
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
}
replace_all(value, "\n", "\\n");
printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
}
// print type counts
for (auto & kv : n_type) {
if (kv.second == 0) {
continue;
}
printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
} }
printf("\n");
} }
// data // data
@ -503,20 +790,35 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
for (int i = 0; i < n_tensors; ++i) { for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i); const char * name = gguf_get_tensor_name(ctx, i);
const size_t offset = gguf_get_tensor_offset(ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i);
enum ggml_type type = gguf_get_tensor_type(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(meta, name); struct ggml_tensor * cur = ggml_get_tensor(meta, name);
size_t tensor_size = ggml_nbytes(cur); size_t tensor_size = ggml_nbytes(cur);
buffer_size += tensor_size; buffer_size += tensor_size;
if (verbosity >= 3) { if (verbosity >= 3) {
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu\n", __func__, i, printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%d, %d, %d, %d], type: %d\n", __func__, i,
ggml_n_dims(cur), cur->name, tensor_size, offset); ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], type);
} }
} }
} }
buffer_size += n_tensors * 128 /* CLIP PADDING */; buffer_size += n_tensors * 128 /* CLIP PADDING */;
clip_ctx * new_clip = new clip_ctx; clip_ctx * new_clip = new clip_ctx;
// update projector type
{
int idx = gguf_find_key(ctx, KEY_PROJ_TYPE);
if (idx != -1) {
const std::string proj_type = gguf_get_val_str(ctx, idx);
new_clip->proj_type = clip_projector_type_from_string(proj_type);
}
else {
new_clip->proj_type = PROJECTOR_TYPE_MLP;
}
}
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS
new_clip->backend = ggml_backend_cuda_init(0); new_clip->backend = ggml_backend_cuda_init(0);
printf("%s: CLIP using CUDA backend\n", __func__); printf("%s: CLIP using CUDA backend\n", __func__);
@ -661,10 +963,45 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias")); // LLaVA projection
vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
}
else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projection
vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias"));
vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight"));
vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias"));
vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
}
else {
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
}
vision_model.layers.resize(hparams.n_layer); vision_model.layers.resize(hparams.n_layer);
for (int il = 0; il < hparams.n_layer; ++il) { for (int il = 0; il < hparams.n_layer; ++il) {
@ -1100,13 +1437,25 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
} }
int clip_n_mmproj_embd(const struct clip_ctx * ctx) { int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->vision_model.mm_2_b->ne[0]; if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
}
else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
return ctx->vision_model.mm_2_b->ne[0];
}
else {
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
}
} }
int clip_n_patches(const struct clip_ctx * ctx) { int clip_n_patches(const struct clip_ctx * ctx) {
auto & params = ctx->vision_model.hparams; auto & params = ctx->vision_model.hparams;
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
return (params.image_size / params.patch_size) * (params.image_size / params.patch_size); if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
n_patches /= 4;
}
return n_patches;
} }
size_t clip_embd_nbytes(const struct clip_ctx * ctx) { size_t clip_embd_nbytes(const struct clip_ctx * ctx) {

View file

@ -81,6 +81,7 @@ ap.add_argument("--vision-only", action="store_true", required=False,
ap.add_argument("--clip_model_is_vision", action="store_true", required=False, ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values") ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values") ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
@ -174,6 +175,8 @@ elif args.vision_only and not has_llava_projector:
fout.add_description("vision-only CLIP model") fout.add_description("vision-only CLIP model")
elif has_llava_projector: elif has_llava_projector:
fout.add_description("image encoder for LLaVA") fout.add_description("image encoder for LLaVA")
# add projector type
fout.add_string("clip.projector_type", args.projector_type)
else: else:
fout.add_description("two-tower CLIP model") fout.add_description("two-tower CLIP model")
@ -218,7 +221,8 @@ if has_llava_projector:
projector = torch.load(args.llava_projector) projector = torch.load(args.llava_projector)
for name, data in projector.items(): for name, data in projector.items():
name = get_tensor_name(name) name = get_tensor_name(name)
if data.ndim == 2: # pw and dw conv ndim==4
if data.ndim == 2 or data.ndim == 4:
data = data.squeeze().numpy().astype(np.float16) data = data.squeeze().numpy().astype(np.float16)
else: else:
data = data.squeeze().numpy().astype(np.float32) data = data.squeeze().numpy().astype(np.float32)

View file

@ -112,6 +112,43 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to
return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp}; return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
} }
static inline int nearest_int(float fval) {
//assert(fval <= 4194303.f);
float val = fval + 12582912.f;
int i; memcpy(&i, &val, sizeof(int));
return (i & 0x007fffff) - 0x00400000;
}
static double log_softmax(int n_vocab, const float * logits, uint16_t * log_prob, int tok) {
float max_logit = logits[0];
float min_logit = logits[0];
for (int i = 1; i < n_vocab; ++i) {
max_logit = std::max(max_logit, logits[i]);
min_logit = std::min(min_logit, logits[i]);
}
min_logit = std::max(min_logit, max_logit - 16);
double sum_exp = 0.0;
for (int i = 0; i < n_vocab; ++i) {
sum_exp += expf(logits[i] - max_logit);
}
const float log_sum_exp = log(sum_exp);
const float min_log_prob = min_logit - max_logit - log_sum_exp;
const float scale = (max_logit - min_logit)/65535.f;
float * d = (float *)log_prob;
d[0] = scale;
d[1] = min_log_prob;
log_prob += 4;
if (scale) {
const float inv_scale = 1/scale;
for (int i = 0; i < n_vocab; ++i) {
log_prob[i] = logits[i] > min_logit ? nearest_int(inv_scale*(logits[i] - min_logit)) : 0;
}
} else {
std::memset(log_prob, 0, n_vocab*sizeof(uint16_t));
}
return max_logit + log_sum_exp - logits[tok];
}
static void process_logits( static void process_logits(
int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers, int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
double & nll, double & nll2, float * logit_history, float * prob_history double & nll, double & nll2, float * logit_history, float * prob_history
@ -147,6 +184,114 @@ static void process_logits(
} }
} }
static void process_logits(std::ostream& out, int n_vocab, const float * logits, const int * tokens, int n_token,
std::vector<std::thread> & workers, std::vector<uint16_t> & log_probs, double & nll, double & nll2) {
std::mutex mutex;
const int nv = 2*((n_vocab + 1)/2) + 4;
int counter = 0;
auto compute = [&mutex, &counter, &log_probs, &nll, &nll2, n_vocab, logits, tokens, n_token, nv] () {
double local_nll = 0;
double local_nll2 = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int i = counter++;
if (i >= n_token) {
nll += local_nll; nll2 += local_nll2;
break;
}
lock.unlock();
const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
local_nll += v;
local_nll2 += v*v;
}
};
for (auto & w : workers) {
w = std::thread(compute);
}
compute();
for (auto & w : workers) {
w.join();
}
out.write((const char *)log_probs.data(), n_token*nv*sizeof(uint16_t));
}
struct kl_divergence_result {
double sum_nll = 0;
double sum_nll2 = 0;
double sum_kld = 0;
double sum_kld2 = 0;
double sum_nll_diff = 0;
double sum_nll_diff2 = 0;
size_t count = 0;
};
static void log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
float max_logit = logits[0];
for (int i = 1; i < n_vocab; ++i) {
max_logit = std::max(max_logit, logits[i]);
}
double sum_exp = 0.0;
for (int i = 0; i < n_vocab; ++i) {
sum_exp += expf(logits[i] - max_logit);
}
const float log_sum_exp = log(sum_exp);
const float * d = (const float *)base_log_prob;
const float scale = d[0];
const float min_log_prob = d[1];
base_log_prob += 4;
float nll = max_logit + log_sum_exp - logits[tok];
kld.sum_nll += nll;
kld.sum_nll2 += nll*nll;
nll += (scale*base_log_prob[tok] + min_log_prob);
kld.sum_nll_diff += nll;
kld.sum_nll_diff2 += nll*nll;
max_logit += log_sum_exp;
double sum = 0;
for (int i = 0; i < n_vocab; ++i) {
const float p_log_base = scale*base_log_prob[i] + min_log_prob;
if (p_log_base > -16.f) {
const float p_base = expf(p_log_base);
sum += p_base * (p_log_base - logits[i] + max_logit);
}
}
kld.sum_kld += sum;
kld.sum_kld2 += sum*sum;
++kld.count;
}
static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token,
std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld) {
std::mutex mutex;
const int nv = 2*((n_vocab + 1)/2) + 4;
int counter = 0;
auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv] () {
kl_divergence_result local_kld;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int i = counter++;
if (i >= n_token) {
kld.sum_nll += local_kld.sum_nll;
kld.sum_nll2 += local_kld.sum_nll2;
kld.sum_kld += local_kld.sum_kld;
kld.sum_kld2 += local_kld.sum_kld2;
kld.sum_nll_diff += local_kld.sum_nll_diff;
kld.sum_nll_diff2 += local_kld.sum_nll_diff2;
kld.count += local_kld.count;
break;
}
lock.unlock();
log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
}
};
for (auto & w : workers) {
w = std::thread(compute);
}
compute();
for (auto & w : workers) {
w.join();
}
}
static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) { static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
@ -294,6 +439,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
std::ofstream logits_stream;
if (!params.logits_file.empty()) {
logits_stream.open(params.logits_file.c_str());
if (!logits_stream.is_open()) {
fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
return {};
}
fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
logits_stream.write("_logits_", 8);
logits_stream.write((const char *)&n_ctx, sizeof(n_ctx));
}
auto tim1 = std::chrono::high_resolution_clock::now(); auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__); fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
@ -336,6 +493,15 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1); std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
std::vector<uint16_t> log_probs;
if (!params.logits_file.empty()) {
logits_stream.write((const char *)&n_vocab, sizeof(n_vocab));
logits_stream.write((const char *)&n_chunk, sizeof(n_chunk));
logits_stream.write((const char *)tokens.data(), n_chunk*n_ctx*sizeof(tokens[0]));
const int nv = 2*((n_vocab + 1)/2) + 4;
log_probs.resize(n_ctx * nv);
}
for (int i = 0; i < n_chunk; ++i) { for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx; const int start = i * n_ctx;
const int end = start + n_ctx; const int end = start + n_ctx;
@ -398,8 +564,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// process the entire prompt. // process the entire prompt.
const int first = n_ctx/2; const int first = n_ctx/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, if (!params.logits_file.empty()) {
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, log_probs, nll, nll2);
} else {
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
}
count += n_ctx - first - 1; count += n_ctx - first - 1;
// perplexity is e^(average negative log-likelihood) // perplexity is e^(average negative log-likelihood)
@ -458,23 +629,24 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
return true; return true;
} }
#define K_TOKEN_CHUNK 4
static void compute_logprobs(const float * batch_logits, int n_vocab, std::vector<std::thread>& workers, static void compute_logprobs(const float * batch_logits, int n_vocab, std::vector<std::thread>& workers,
const std::vector<std::pair<size_t, llama_token>>& eval_pairs, std::vector<float>& eval_results) { const std::vector<std::pair<size_t, llama_token>>& eval_pairs, std::vector<float>& eval_results) {
constexpr int k_token_chunk = 4;
if (eval_results.size() != eval_pairs.size()) { if (eval_results.size() != eval_pairs.size()) {
eval_results.resize(eval_pairs.size()); eval_results.resize(eval_pairs.size());
} }
if (eval_pairs.empty()) return; if (eval_pairs.empty()) return;
size_t max_threads = std::min((eval_pairs.size() + k_token_chunk - 1)/k_token_chunk, workers.size()); size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size());
std::atomic<int> counter(0); std::atomic<int> counter(0);
auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () { auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () {
float local_logprobs[k_token_chunk]; float local_logprobs[K_TOKEN_CHUNK];
while (true) { while (true) {
size_t first = counter.fetch_add(k_token_chunk, std::memory_order_relaxed); size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
if (first >= eval_results.size()) break; if (first >= eval_results.size()) break;
size_t last = std::min(first + k_token_chunk, eval_results.size()); size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
for (size_t i = first; i < last; ++i) { for (size_t i = first; i < last; ++i) {
auto logits = batch_logits + eval_pairs[i].first * n_vocab; auto logits = batch_logits + eval_pairs[i].first * n_vocab;
float max_logit = logits[0]; float max_logit = logits[0];
@ -497,7 +669,6 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
for (size_t it = 0; it < max_threads; ++it) { for (size_t it = 0; it < max_threads; ++it) {
workers[it].join(); workers[it].join();
} }
} }
static void hellaswag_score(llama_context * ctx, const gpt_params & params) { static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
@ -540,14 +711,14 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
// This is needed as usual for LLaMA models // This is needed as usual for LLaMA models
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
// The tasks should be randomized so the score stabilizes quickly.
bool randomize_tasks = true;
// Number of tasks to use when computing the score // Number of tasks to use when computing the score
if (params.hellaswag_tasks < hs_task_count) { if (params.hellaswag_tasks < hs_task_count) {
hs_task_count = params.hellaswag_tasks; hs_task_count = params.hellaswag_tasks;
} }
// The tasks should be randomized so the score stabilizes quickly.
bool randomize_tasks = true;
// The random seed should not impact the final result if the computation is done over enough tasks, so kept hardcoded for now // The random seed should not impact the final result if the computation is done over enough tasks, so kept hardcoded for now
std::mt19937 rng(1); std::mt19937 rng(1);
@ -1031,6 +1202,531 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma); printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
} }
static bool deserialize_string(std::istream& in, std::string& str) {
uint32_t size;
if (!in.read((char *)&size, sizeof(size)).fail()) {
str.resize(size);
if (!in.read((char *)str.data(), size).fail()) return true;
}
return false;
}
struct multiple_choice_answers {
std::vector<std::string> answers;
std::vector<int> labels;
bool deserialize(std::istream& in) {
uint32_t n;
in.read((char *)&n, sizeof(n));
if (in.fail() || n > 100) return false; // 100 as max. number of answers should be good enough for any practical purpose
answers.resize(n);
labels.resize(n);
for (auto& a : answers) {
if (!deserialize_string(in, a)) return false;
}
in.read((char *)labels.data(), n*sizeof(int));
return !in.fail();
}
};
struct multiple_choice_task {
std::string question; // the question (or context that needs to be continued)
multiple_choice_answers mc1; // possible answers (continuations) with a single correct answer
multiple_choice_answers mc2; // possible answers (continuations) with multiple correct answers - not handled yet
bool deserialize(std::istream& in) {
if (!deserialize_string(in, question)) return false;
return mc1.deserialize(in) && mc2.deserialize(in);
}
// For evaluation
size_t i_batch; // starting index in the llama_batch
size_t common_prefix; // max number of initial tokens that are the same in all sentences
size_t required_tokens; // needed number of tokens to evaluate all answers
std::vector<std::vector<llama_token>> seq_tokens;
std::vector<float> log_probs;
};
static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) {
if (task.question.empty() || task.mc1.answers.empty()) {
if (log_error) {
printf("%s: found bad task with empty question and/or answers\n", __func__);
}
return false;
}
task.seq_tokens.reserve(task.mc1.answers.size());
for (auto& answer : task.mc1.answers) {
if (answer.empty()) {
if (log_error) {
printf("%s: found empty answer\n", __func__);
}
return false;
}
task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, add_bos));
}
auto min_len = task.seq_tokens.front().size();
for (auto& seq : task.seq_tokens) {
min_len = std::min(min_len, seq.size());
}
task.common_prefix = 0;
for (size_t k = 0; k < min_len; ++k) {
auto token = task.seq_tokens[0][k];
bool all_same = true;
for (size_t i = 1; i < task.seq_tokens.size(); ++i) {
if (task.seq_tokens[i][k] != token) {
all_same = false;
break;
}
}
if (!all_same) {
break;
}
++task.common_prefix;
}
task.required_tokens = task.common_prefix;
for (auto& seq : task.seq_tokens) {
task.required_tokens += seq.size() - task.common_prefix;
}
return true;
}
//
// Calculates score for multiple choice tasks with single correct answer from prompt.
// Commonly used LLM evaluation metrics of this type are
// * ARC
// * HellaSwag
// * MMLU
// * TruthfulQA
//
// Validation datasets for these 4 tests can be found at
// https://huggingface.co/datasets/ikawrakow/validation-datasets-for-llama.cpp
// The data for these datasets was extracted from
// git@hf.co:datasets/allenai/ai2_arc
// https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
// git@hf.co:datasets/Stevross/mmlu
// https://huggingface.co/datasets/truthful_qa
//
static void multiple_choice_score(llama_context * ctx, const gpt_params & params) {
std::istringstream strstream(params.prompt);
uint32_t n_task;
strstream.read((char *)&n_task, sizeof(n_task));
if (strstream.fail() || n_task == 0) {
printf("%s: no tasks\n", __func__);
return;
}
printf("%s: there are %u tasks in prompt\n", __func__, n_task);
std::vector<uint32_t> task_pos(n_task);
strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
if (strstream.fail()) {
printf("%s: failed to raad task positions from prompt\n", __func__);
return;
}
std::vector<multiple_choice_task> tasks;
if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
// Use all tasks
tasks.resize(n_task);
printf("%s: reading tasks", __func__);
int n_dot = n_task/100;
int i = 0;
for (auto& task : tasks) {
++i;
if (!task.deserialize(strstream)) {
printf("%s: failed to read task %d of %u\n", __func__, i, n_task);
return;
}
if (i%n_dot == 0) printf(".");
}
printf("done\n");
}
else {
printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
std::mt19937 rng(1);
std::vector<int> aux(n_task);
for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
float scale = 1.f/(1.f + (float)std::mt19937::max());
tasks.resize(params.multiple_choice_tasks);
for (auto& task : tasks) {
int j = (int)(scale * rng() * aux.size());
int idx = aux[j];
aux[j] = aux.back();
aux.pop_back();
strstream.seekg(task_pos[idx], std::ios::beg);
if (!task.deserialize(strstream)) {
printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
return;
}
}
n_task = params.multiple_choice_tasks;
}
// This is needed as usual for LLaMA models
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
printf("%s: preparing task data", __func__);
fflush(stdout);
if (n_task > 500) {
printf("...");
fflush(stdout);
std::atomic<int> counter(0);
std::atomic<int> n_bad(0);
auto prepare = [&counter, &n_bad, &tasks, ctx, add_bos] () {
int num_tasks = tasks.size();
int n_bad_local = 0;
while (true) {
int first = counter.fetch_add(K_TOKEN_CHUNK);
if (first >= num_tasks) {
if (n_bad_local > 0) n_bad += n_bad_local;
break;
}
int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
for (int i = first; i < last; ++i) {
if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local;
}
}
};
size_t max_thread = std::thread::hardware_concurrency();
max_thread = std::min(max_thread, (tasks.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK);
std::vector<std::thread> workers(max_thread-1);
for (auto& w : workers) w = std::thread(prepare);
prepare();
for (auto& w : workers) w.join();
printf("done\n");
fflush(stdout);
int nbad = n_bad;
if (nbad > 0) {
printf("%s: found %d malformed tasks\n", __func__, nbad);
return;
}
} else {
int n_dot = n_task/100;
int i_task = 0;
for (auto& task : tasks) {
++i_task;
if (!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) {
return;
}
if (i_task%n_dot == 0) {
printf(".");
fflush(stdout);
}
}
printf("done\n");
}
printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
printf("\ntask\tacc_norm\n");
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
const int n_batch = params.n_batch;
const int max_tasks_per_batch = 32;
const int max_seq = 4*max_tasks_per_batch;
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
std::vector<float> tok_logits(n_vocab);
std::vector<float> batch_logits(n_vocab*n_ctx);
std::vector<std::pair<size_t, llama_token>> eval_pairs;
std::vector<float> eval_results;
std::vector<std::thread> workers(std::thread::hardware_concurrency());
std::vector<int> batch_indeces;
int n_done = 0;
int n_correct = 0;
int n_tot_answers = 0;
for (size_t i0 = 0; i0 < tasks.size(); i0++) {
int n_cur = 0;
size_t i1 = i0;
size_t i_batch = 0; // this tells us where in `llama_batch` we are currently
llama_batch_clear(batch);
// batch as much tasks as possible into the available context
// each task has 4 unique seuqnce ids - one for each ending
// the common prefix is shared among the 4 sequences to save tokens
// we extract logits only from the last common token and from all ending tokens of each sequence
int s0 = 0;
while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) {
auto& cur_task = tasks[i1];
int num_answers = cur_task.seq_tokens.size();
if (s0 + num_answers > max_seq) {
break;
}
if (int(batch_indeces.size()) != num_answers) {
batch_indeces.resize(num_answers);
}
for (int s = 0; s < num_answers; ++s) batch_indeces[s] = s0 + s;
for (size_t i = 0; i < cur_task.common_prefix; ++i) {
//llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
}
batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
for (size_t i = cur_task.common_prefix; i < cur_task.seq_tokens[s].size(); ++i) {
llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, true);
}
}
s0 += num_answers;
cur_task.i_batch = i_batch;
i_batch += cur_task.required_tokens;
n_cur += cur_task.required_tokens;
if (++i1 == tasks.size()) {
break;
}
}
if (i0 == i1) {
fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
return;
}
llama_kv_cache_clear(ctx);
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
fprintf(stderr, "%s: llama_decode() failed\n", __func__);
return;
}
// Compute log-probs in parallel
// First we collect all tasks
eval_pairs.clear();
for (size_t i = i0; i < i1; ++i) {
auto& cur_task = tasks[i];
size_t li = cur_task.common_prefix;
for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
eval_pairs.push_back(std::make_pair(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]));
}
++li;
}
}
// Then we do the actual calculation
compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
size_t ir = 0;
// compute the logprobs for each ending of the decoded tasks
for (size_t i = i0; i < i1; ++i) {
auto & cur_task = tasks[i];
//printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
//for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
// if (cur_task.mc1.labels[j] == 1) {
// printf("%d", j+1);
// }
//}
//printf("\n common_prefix: %zu\n", cur_task.common_prefix);
std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(cur_task.i_batch + cur_task.common_prefix - 1), n_vocab*sizeof(float));
const auto first_probs = softmax(tok_logits);
cur_task.log_probs.resize(cur_task.seq_tokens.size());
for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
size_t count = 1;
float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
//printf(" %zu %g\n", ir, eval_results[ir]);
++count;
log_prob += eval_results[ir++];
}
cur_task.log_probs[s] = log_prob / count;
//printf(" Final: %g\n", log_prob / count);
//printf(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
}
// Find the ending with maximum logprob
size_t logprob_max_idx = 0;
float logprob_max_val = cur_task.log_probs[0];
for (size_t s = 1; s < cur_task.log_probs.size(); s++) {
if (cur_task.log_probs[s] > logprob_max_val) {
logprob_max_val = cur_task.log_probs[s];
logprob_max_idx = s;
}
}
n_tot_answers += cur_task.log_probs.size();
if (cur_task.mc1.labels[logprob_max_idx] == 1) {
++n_correct;
}
++n_done;
// Print the accumulated accuracy mean x 100
printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
fflush(stdout);
}
i0 = i1 - 1;
}
llama_batch_free(batch);
if (n_done < 100) return;
float p = 1.f*n_correct/n_done;
float sigma = sqrt(p*(1-p)/(n_done-1));
printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
p = 1.f*n_done/n_tot_answers;
sigma = sqrt(p*(1-p)/(n_done-1));
printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
printf("\n");
}
static void kl_divergence(llama_context * ctx, const gpt_params & params) {
if (params.logits_file.empty()) {
fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
return;
}
std::ifstream in(params.logits_file.c_str(), std::ios::binary);
if (!in) {
fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str());
return;
}
{
char check[9]; check[8] = 0;
in.read(check, 8);
if (in.fail() || strncmp("_logits_", check, 8) != 0) {
fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
return;
}
}
uint32_t n_ctx;
in.read((char *)&n_ctx, sizeof(n_ctx));
if (n_ctx > llama_n_ctx(ctx)) {
fprintf(stderr, "%s: %s has been computed with %d, while the current context is %d. Increase it with -c and retry\n",
__func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
}
int n_vocab, n_chunk;
in.read((char *)&n_vocab, sizeof(n_vocab));
in.read((char *)&n_chunk, sizeof(n_chunk));
if (in.fail()) {
fprintf(stderr, "%s: failed rwading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
return;
}
if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
}
std::vector<llama_token> tokens(n_ctx * n_chunk);
if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
return;
}
const int n_batch = params.n_batch;
const int num_batches = (n_ctx + n_batch - 1)/n_batch;
const int nv = 2*((n_vocab + 1)/2) + 4;
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
std::vector<float> logits;
if (num_batches > 1) {
logits.reserve(n_ctx * n_vocab);
}
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
auto mean_and_uncertainty = [] (double sum, double sum2, size_t count) {
if (count < 1) {
return std::make_pair(0., 0.);
}
double f = sum/count;
double df = sum2/count - f*f;
df = df > 0 && count > 10 ? sqrt(df/(count-1)) : 0.;
return std::make_pair(f, df);
};
kl_divergence_result kld;
for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx;
const int end = start + n_ctx;
const auto t_start = std::chrono::high_resolution_clock::now();
if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i);
return;
}
// clear the KV cache
llama_kv_cache_clear(ctx);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
// save original token and restore it after eval
const auto token_org = tokens[batch_start];
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
}
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return;
}
// restore the original token in case it was set to BOS
tokens[batch_start] = token_org;
if (num_batches > 1) {
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
}
}
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL-Divergence\n");
}
const int first = n_ctx/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, log_probs_uint16, kld);
auto ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count);
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
printf("%4d %10.4lf %10.5lf ± %10.5f %10.5f ± %10.5lf\n", i+1, exp(ppl.first),
log_ppl_ratio.first, log_ppl_ratio.second, kl_div.first, kl_div.second);
fflush(stdout);
logits.clear();
}
printf("\n");
}
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
@ -1091,6 +1787,10 @@ int main(int argc, char ** argv) {
hellaswag_score(ctx, params); hellaswag_score(ctx, params);
} else if (params.winogrande) { } else if (params.winogrande) {
winogrande_score(ctx, params); winogrande_score(ctx, params);
} else if (params.multiple_choice) {
multiple_choice_score(ctx, params);
} else if (params.kl_divergence) {
kl_divergence(ctx, params);
} else { } else {
results = perplexity(ctx, params); results = perplexity(ctx, params);
} }

View file

@ -26,6 +26,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
{ "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization" , },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", }, { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", }, { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },

6
flake.lock generated
View file

@ -20,11 +20,11 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1705133751, "lastModified": 1705677747,
"narHash": "sha256-rCIsyE80jgiOU78gCWN3A0wE0tR2GI5nH6MlS+HaaSQ=", "narHash": "sha256-eyM3okYtMgYDgmYukoUzrmuoY4xl4FUujnsv/P6I/zI=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "9b19f5e77dd906cb52dade0b7bd280339d2a1f3d", "rev": "bbe7d8f876fbbe7c959c90ba2ae2852220573261",
"type": "github" "type": "github"
}, },
"original": { "original": {

View file

@ -1,3 +1,17 @@
# The flake interface to llama.cpp's Nix expressions. The flake is used as a
# more discoverable entry-point, as well as a way to pin the dependencies and
# expose default outputs, including the outputs built by the CI.
# For more serious applications involving some kind of customization you may
# want to consider consuming the overlay, or instantiating `llamaPackages`
# directly:
#
# ```nix
# pkgs.callPackage ${llama-cpp-root}/.devops/nix/scope.nix { }`
# ```
# Cf. https://jade.fyi/blog/flakes-arent-real/ for a more detailed exposition
# of the relation between Nix and the Nix Flakes.
{ {
description = "Port of Facebook's LLaMA model in C/C++"; description = "Port of Facebook's LLaMA model in C/C++";

View file

@ -1191,6 +1191,24 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
ggml_tallocr_t src_allocr = node_allocr(src); ggml_tallocr_t src_allocr = node_allocr(src);
GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
if (src_allocr != node_allocr) { if (src_allocr != node_allocr) {
// create a copy of the input in the split's backend
size_t id = hash_id(src);
if (sched->node_copies[id][cur_backend_id] == NULL) {
ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
sched->node_copies[id][cur_backend_id] = tensor_copy;
node_allocr(tensor_copy) = cur_allocr;
SET_CAUSE(tensor_copy, "4.cpy");
int n_inputs = sched->splits[cur_split].n_inputs++;
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
sched->splits[cur_split].inputs[n_inputs] = src;
}
node->src[j] = sched->node_copies[id][cur_backend_id];
#if 0
// check if the input is already in the split // check if the input is already in the split
bool found = false; bool found = false;
for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) { for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
@ -1206,19 +1224,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS); GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
sched->splits[cur_split].inputs[n_inputs] = src; sched->splits[cur_split].inputs[n_inputs] = src;
} }
#endif
// create a copy of the input in the split's backend
size_t id = hash_id(src);
if (sched->node_copies[id][cur_backend_id] == NULL) {
ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
sched->node_copies[id][cur_backend_id] = tensor_copy;
node_allocr(tensor_copy) = cur_allocr;
SET_CAUSE(tensor_copy, "4.cpy");
}
node->src[j] = sched->node_copies[id][cur_backend_id];
} }
} }
} }
@ -1333,7 +1339,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
uint64_t compute_start_us = ggml_time_us(); uint64_t compute_start_us = ggml_time_us();
if (!sched->callback_eval) { if (!sched->callback_eval) {
ggml_backend_graph_compute(split_backend, &split->graph); ggml_backend_graph_compute(split_backend, &split->graph);
//ggml_backend_synchronize(split_backend); // necessary to measure compute time //ggml_backend_synchronize(split_backend); // necessary to measure compute time
} else { } else {
// similar to ggml_backend_compare_graph_backend // similar to ggml_backend_compare_graph_backend
for (int j0 = 0; j0 < split->graph.n_nodes; j0++) { for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {

View file

@ -12,9 +12,6 @@
#include <vector> #include <vector>
#include <map> #include <map>
#include <array> #include <array>
#include "ggml-cuda.h"
#include "ggml.h"
#include "ggml-backend-impl.h"
#if defined(GGML_USE_HIPBLAS) #if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h> #include <hip/hip_runtime.h>
@ -118,6 +115,11 @@
#endif // defined(GGML_USE_HIPBLAS) #endif // defined(GGML_USE_HIPBLAS)
// ggml-cuda need half type so keep ggml headers include at last
#include "ggml-cuda.h"
#include "ggml.h"
#include "ggml-backend-impl.h"
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed) #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
#define CC_PASCAL 600 #define CC_PASCAL 600

339
ggml.c
View file

@ -1418,6 +1418,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); } inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
// TODO: optimize performance
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
static const float GELU_COEF_A = 0.044715f; static const float GELU_COEF_A = 0.044715f;
static const float GELU_QUICK_COEF = -1.702f; static const float GELU_QUICK_COEF = -1.702f;
@ -1776,9 +1779,11 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
"GELU", "GELU",
"GELU_QUICK", "GELU_QUICK",
"SILU", "SILU",
"HARDSWISH",
"HARDSIGMOID",
}; };
static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10"); static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@ -3945,6 +3950,20 @@ struct ggml_tensor * ggml_silu_back(
return result; return result;
} }
// ggml hardswish
struct ggml_tensor * ggml_hardswish(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH);
}
// ggml hardsigmoid
struct ggml_tensor * ggml_hardsigmoid(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID);
}
// ggml_norm // ggml_norm
static struct ggml_tensor * ggml_norm_impl( static struct ggml_tensor * ggml_norm_impl(
@ -5344,6 +5363,33 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
return result; return result;
} }
// ggml_conv_depthwise
struct ggml_tensor * ggml_conv_depthwise_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1) {
struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW]
struct ggml_tensor * result =
ggml_mul_mat(ctx,
ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1), // [OC1, KH, KW] => [1, OC, 1, KH * KW]
ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
return result;
}
// ggml_conv_2d // ggml_conv_2d
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
@ -7764,6 +7810,9 @@ static void ggml_compute_forward_acc_f32(
bool inplace = (bool) ((int32_t *) dst->op_params)[4]; bool inplace = (bool) ((int32_t *) dst->op_params)[4];
if (!inplace && (params->type == GGML_TASK_INIT)) { if (!inplace && (params->type == GGML_TASK_INIT)) {
if (params->ith != 0) {
return;
}
// memcpy needs to be synchronized across threads to avoid race conditions. // memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase // => do it in INIT phase
memcpy( memcpy(
@ -9333,6 +9382,87 @@ static void ggml_compute_forward_silu_back(
} }
} }
static void ggml_compute_forward_hardswish_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_hardswish_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_hardswish(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_hardswish_f32(params, src0, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
static void ggml_compute_forward_hardsigmoid_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_hardsigmoid_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_hardsigmoid(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_hardsigmoid_f32(params, src0, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_norm // ggml_compute_forward_norm
static void ggml_compute_forward_norm_f32( static void ggml_compute_forward_norm_f32(
@ -9825,11 +9955,30 @@ static void ggml_compute_forward_mul_mat(
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(dst)) { if (ggml_compute_forward_mul_mat_use_blas(dst)) {
if (params->ith != 0) { const int64_t ne_plane = ne01*ne00;
return; const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
} UNUSED(desired_wsize);
if (params->type == GGML_TASK_INIT) { if (params->type == GGML_TASK_INIT) {
if (type != GGML_TYPE_F32) {
assert(params->wsize >= desired_wsize);
// parallelize by src0 rows
for (int64_t i13 = 0; i13 < ne13; i13++) {
for (int64_t i12 = 0; i12 < ne12; i12++) {
// broadcast src0 into src1 across 2nd,3rd dimension
const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2;
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
ggml_to_float_t const to_float = type_traits[type].to_float;
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
}
}
}
}
return; return;
} }
@ -9837,9 +9986,14 @@ static void ggml_compute_forward_mul_mat(
return; return;
} }
// perform sgemm, parallelization controlled by blas lib
if (ith != 0) {
return;
}
const int64_t tgemm0 = ggml_perf_time_us();
for (int64_t i13 = 0; i13 < ne13; i13++) { for (int64_t i13 = 0; i13 < ne13; i13++) {
for (int64_t i12 = 0; i12 < ne12; i12++) { for (int64_t i12 = 0; i12 < ne12; i12++) {
// broadcast src0 into src1 across 2nd,3rd dimension
const int64_t i03 = i13/r3; const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2; const int64_t i02 = i12/r2;
@ -9848,17 +10002,7 @@ static void ggml_compute_forward_mul_mat(
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
if (type != GGML_TYPE_F32) { if (type != GGML_TYPE_F32) {
float * const wdata = params->wdata; x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
ggml_to_float_t const to_float = type_traits[type].to_float;
size_t id = 0;
for (int64_t i01 = 0; i01 < ne01; ++i01) {
to_float((const char *) x + i01*nb01, wdata + id, ne00);
id += ne00;
}
assert(id*sizeof(float) <= params->wsize);
x = wdata;
} }
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@ -9868,6 +10012,7 @@ static void ggml_compute_forward_mul_mat(
0.0f, d, ne01); 0.0f, d, ne01);
} }
} }
//printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
@ -9876,6 +10021,9 @@ static void ggml_compute_forward_mul_mat(
#endif #endif
if (params->type == GGML_TASK_INIT) { if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
if (src1->type != vec_dot_type) { if (src1->type != vec_dot_type) {
char * wdata = params->wdata; char * wdata = params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10); const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@ -10040,6 +10188,9 @@ static void ggml_compute_forward_mul_mat_id(
#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
if (params->type == GGML_TASK_INIT) { if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
char * wdata = params->wdata; char * wdata = params->wdata;
if (src1->type != vec_dot_type) { if (src1->type != vec_dot_type) {
const size_t row_size = ggml_row_size(vec_dot_type, ne10); const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@ -10225,6 +10376,9 @@ static void ggml_compute_forward_out_prod_f32(
return; return;
} }
#endif #endif
if (ith != 0) {
return;
}
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
return; return;
} }
@ -10408,6 +10562,9 @@ static void ggml_compute_forward_out_prod_q_f32(
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
if (params->type == GGML_TASK_INIT) { if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
return; return;
} }
@ -10592,6 +10749,9 @@ static void ggml_compute_forward_set_f32(
bool inplace = (bool) ((int32_t *) dst->op_params)[4]; bool inplace = (bool) ((int32_t *) dst->op_params)[4];
if (!inplace && (params->type == GGML_TASK_INIT)) { if (!inplace && (params->type == GGML_TASK_INIT)) {
if (params->ith != 0) {
return;
}
// memcpy needs to be synchronized across threads to avoid race conditions. // memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase // => do it in INIT phase
memcpy( memcpy(
@ -10916,6 +11076,9 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
// ggml_compute_forward_dup_same_cont(params, opt0, dst); // ggml_compute_forward_dup_same_cont(params, opt0, dst);
if (params->type == GGML_TASK_INIT) { if (params->type == GGML_TASK_INIT) {
if (params->ith != 0) {
return;
}
memset(dst->data, 0, ggml_nbytes(dst)); memset(dst->data, 0, ggml_nbytes(dst));
} }
@ -10950,6 +11113,9 @@ static void ggml_compute_forward_get_rows_back_f32(
// ggml_compute_forward_dup_same_cont(params, opt0, dst); // ggml_compute_forward_dup_same_cont(params, opt0, dst);
if (params->type == GGML_TASK_INIT) { if (params->type == GGML_TASK_INIT) {
if (params->ith != 0) {
return;
}
memset(dst->data, 0, ggml_nbytes(dst)); memset(dst->data, 0, ggml_nbytes(dst));
} }
@ -11087,6 +11253,9 @@ static void ggml_compute_forward_diag_mask_f32(
GGML_ASSERT(n_past >= 0); GGML_ASSERT(n_past >= 0);
if (!inplace && (params->type == GGML_TASK_INIT)) { if (!inplace && (params->type == GGML_TASK_INIT)) {
if (ith != 0) {
return;
}
// memcpy needs to be synchronized across threads to avoid race conditions. // memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase // => do it in INIT phase
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
@ -12057,6 +12226,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
GGML_ASSERT(nb10 == sizeof(float)); GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) { if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
memset(params->wdata, 0, params->wsize); memset(params->wdata, 0, params->wsize);
// permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@ -12151,6 +12323,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
GGML_ASSERT(nb10 == sizeof(float)); GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) { if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
memset(params->wdata, 0, params->wsize); memset(params->wdata, 0, params->wsize);
// prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@ -12349,6 +12524,7 @@ static void ggml_compute_forward_im2col(
} }
} }
// ggml_compute_forward_conv_transpose_2d // ggml_compute_forward_conv_transpose_2d
static void ggml_compute_forward_conv_transpose_2d( static void ggml_compute_forward_conv_transpose_2d(
@ -12374,6 +12550,9 @@ static void ggml_compute_forward_conv_transpose_2d(
GGML_ASSERT(nb10 == sizeof(float)); GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) { if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
memset(params->wdata, 0, params->wsize); memset(params->wdata, 0, params->wsize);
// permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout) // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
@ -13917,6 +14096,14 @@ static void ggml_compute_forward_unary(
{ {
ggml_compute_forward_silu(params, src0, dst); ggml_compute_forward_silu(params, src0, dst);
} break; } break;
case GGML_UNARY_OP_HARDSWISH:
{
ggml_compute_forward_hardswish(params, src0, dst);
} break;
case GGML_UNARY_OP_HARDSIGMOID:
{
ggml_compute_forward_hardsigmoid(params, src0, dst);
} break;
default: default:
{ {
GGML_ASSERT(false); GGML_ASSERT(false);
@ -13980,6 +14167,9 @@ static void ggml_compute_forward_add_rel_pos_f32(
const bool inplace = (bool) ((int32_t *) dst->op_params)[0]; const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
if (!inplace && params->type == GGML_TASK_INIT) { if (!inplace && params->type == GGML_TASK_INIT) {
if (params->ith != 0) {
return;
}
memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst)); memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
return; return;
} }
@ -16273,8 +16463,9 @@ struct ggml_compute_state_shared {
const int n_threads; const int n_threads;
// synchronization primitives // synchronization primitives
atomic_int n_active; // num active threads atomic_int n_active; // num active threads
atomic_int node_n; // active graph node atomic_int node_n; // active graph node
atomic_int node_task; // active graph node task phase
bool (*abort_callback)(void * data); // abort ggml_graph_compute when true bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
void * abort_callback_data; void * abort_callback_data;
@ -16330,6 +16521,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_UNARY_OP_TANH: case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_ELU: case GGML_UNARY_OP_ELU:
case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
{ {
n_tasks = 1; n_tasks = 1;
} break; } break;
@ -16520,6 +16713,34 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
return n_tasks; return n_tasks;
} }
static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
// wait for other threads to finish
const int last_node_n = * node_n;
while (true) {
if (do_yield) {
sched_yield();
}
* node_n = atomic_load(&state->shared->node_n);
if (* node_n != last_node_n) break;
}
}
static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
// wait for other threads to finish
const int last_task_phase = * task_phase;
while (true) {
if (do_yield) {
sched_yield();
}
* task_phase = atomic_load(&state->shared->node_task);
if (* task_phase != last_task_phase) break;
}
}
static thread_ret_t ggml_graph_compute_thread(void * data) { static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data; struct ggml_compute_state * state = (struct ggml_compute_state *) data;
@ -16530,7 +16751,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
set_numa_thread_affinity(state->ith, n_threads); set_numa_thread_affinity(state->ith, n_threads);
int node_n = -1; int node_n = -1;
int task_phase = GGML_TASK_FINALIZE;
while (true) { while (true) {
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
@ -16562,7 +16784,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
// distribute new work or execute it direct if 1T // distribute new work or execute it direct if 1T
while (++node_n < cgraph->n_nodes) { while (++node_n < cgraph->n_nodes) {
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
struct ggml_tensor * node = cgraph->nodes[node_n]; struct ggml_tensor * node = cgraph->nodes[node_n];
const int n_tasks = ggml_get_n_tasks(node, n_threads); const int n_tasks = ggml_get_n_tasks(node, n_threads);
@ -16571,13 +16792,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
params.nth = n_tasks; params.nth = n_tasks;
/* INIT */
if (GGML_OP_HAS_INIT[node->op]) {
params.type = GGML_TASK_INIT;
ggml_compute_forward(&params, node);
}
if (n_tasks == 1) { if (n_tasks == 1) {
/* INIT */
if (GGML_OP_HAS_INIT[node->op]) {
params.type = GGML_TASK_INIT;
ggml_compute_forward(&params, node);
}
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
// they do something more efficient than spinning (?) // they do something more efficient than spinning (?)
params.type = GGML_TASK_COMPUTE; params.type = GGML_TASK_COMPUTE;
@ -16598,38 +16819,24 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
} }
} }
atomic_store(&state->shared->n_active, n_threads); task_phase = GGML_TASK_INIT;
atomic_store(&state->shared->node_n, node_n); atomic_store(&state->shared->n_active, n_threads);
atomic_store(&state->shared->node_n, node_n);
atomic_store(&state->shared->node_task, task_phase);
} else { } else {
// wait for other threads to finish ggml_graph_compute_thread_sync_node(&node_n, state, false);
const int last = node_n; ggml_graph_compute_thread_sync_task(&task_phase, state, false);
const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
while (true) {
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
// depending on the workload and the operating system.
// since it is not clear what is the best approach, it should potentially become user-configurable
// ref: https://github.com/ggerganov/ggml/issues/291
// UPD: adding the do_yield flag seems to resolve the issue universally
if (do_yield) {
sched_yield();
}
node_n = atomic_load(&state->shared->node_n);
if (node_n != last) break;
};
} }
// check if we should stop // check if we should stop
if (node_n >= cgraph->n_nodes) break; if (node_n >= cgraph->n_nodes) break;
/* COMPUTE */ /* INIT & COMPUTE */
struct ggml_tensor * node = cgraph->nodes[node_n]; struct ggml_tensor * node = cgraph->nodes[node_n];
const int n_tasks = ggml_get_n_tasks(node, n_threads); const int n_tasks = ggml_get_n_tasks(node, n_threads);
struct ggml_compute_params params = { struct ggml_compute_params params = {
/*.type =*/ GGML_TASK_COMPUTE, /*.type =*/ GGML_TASK_INIT,
/*.ith =*/ state->ith, /*.ith =*/ state->ith,
/*.nth =*/ n_tasks, /*.nth =*/ n_tasks,
/*.wsize =*/ cplan->work_size, /*.wsize =*/ cplan->work_size,
@ -16637,8 +16844,39 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
}; };
if (state->ith < n_tasks) { if (state->ith < n_tasks) {
if (GGML_OP_HAS_INIT[node->op]) {
ggml_compute_forward(&params, node);
}
}
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
task_phase = GGML_TASK_COMPUTE;
atomic_store(&state->shared->n_active, n_threads);
atomic_store(&state->shared->node_task, task_phase);
}
else {
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
// depending on the workload and the operating system.
// since it is not clear what is the best approach, it should potentially become user-configurable
// ref: https://github.com/ggerganov/ggml/issues/291
// UPD: adding the do_yield flag seems to resolve the issue universally
const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
}
if (state->ith < n_tasks) {
params.type = GGML_TASK_COMPUTE;
ggml_compute_forward(&params, node); ggml_compute_forward(&params, node);
} }
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
task_phase = GGML_TASK_FINALIZE;
atomic_store(&state->shared->n_active, n_threads);
atomic_store(&state->shared->node_task, task_phase);
}
else {
ggml_graph_compute_thread_sync_task(&task_phase, state, false);
}
} }
return GGML_EXIT_SUCCESS; return GGML_EXIT_SUCCESS;
@ -16695,8 +16933,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node)) { if (ggml_compute_forward_mul_mat_use_blas(node)) {
if (node->src[0]->type != GGML_TYPE_F32) { if (node->src[0]->type != GGML_TYPE_F32) {
// here we need memory just for single 2D matrix from src0 // here we need memory for fully dequantized matrix from src0
cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]); cur = ggml_type_size(GGML_TYPE_F32)*ggml_nelements(node->src[0]);
} }
} else } else
#endif #endif
@ -16850,6 +17088,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
/*.n_threads =*/ n_threads, /*.n_threads =*/ n_threads,
/*.n_active =*/ n_threads, /*.n_active =*/ n_threads,
/*.node_n =*/ -1, /*.node_n =*/ -1,
/*.node_task =*/ GGML_TASK_FINALIZE,
/*.abort_callback =*/ NULL, /*.abort_callback =*/ NULL,
/*.abort_callback_data =*/ NULL, /*.abort_callback_data =*/ NULL,
}; };

24
ggml.h
View file

@ -489,6 +489,8 @@ extern "C" {
GGML_UNARY_OP_GELU, GGML_UNARY_OP_GELU,
GGML_UNARY_OP_GELU_QUICK, GGML_UNARY_OP_GELU_QUICK,
GGML_UNARY_OP_SILU, GGML_UNARY_OP_SILU,
GGML_UNARY_OP_HARDSWISH,
GGML_UNARY_OP_HARDSIGMOID,
GGML_UNARY_OP_COUNT, GGML_UNARY_OP_COUNT,
}; };
@ -1032,6 +1034,16 @@ extern "C" {
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
// hardswish(x) = x * relu6(x + 3) / 6
GGML_API struct ggml_tensor * ggml_hardswish(
struct ggml_context * ctx,
struct ggml_tensor * a);
// hardsigmoid(x) = relu6(x + 3) / 6
GGML_API struct ggml_tensor * ggml_hardsigmoid(
struct ggml_context * ctx,
struct ggml_tensor * a);
// normalize along rows // normalize along rows
GGML_API struct ggml_tensor * ggml_norm( GGML_API struct ggml_tensor * ggml_norm(
struct ggml_context * ctx, struct ggml_context * ctx,
@ -1483,6 +1495,18 @@ extern "C" {
int d1, int d1,
bool is_2D); bool is_2D);
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1);
GGML_API struct ggml_tensor * ggml_conv_1d( GGML_API struct ggml_tensor * ggml_conv_1d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,

231
llama.cpp
View file

@ -1325,8 +1325,10 @@ static llama_state g_state;
// available llama models // available llama models
enum e_model { enum e_model {
MODEL_UNKNOWN, MODEL_UNKNOWN,
MODEL_0_5B,
MODEL_1B, MODEL_1B,
MODEL_3B, MODEL_3B,
MODEL_4B,
MODEL_7B, MODEL_7B,
MODEL_8B, MODEL_8B,
MODEL_13B, MODEL_13B,
@ -2659,6 +2661,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
default: return "unknown, may not work"; default: return "unknown, may not work";
} }
@ -2874,6 +2877,7 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 24: model.type = e_model::MODEL_1B; break;
case 32: model.type = e_model::MODEL_3B; break; case 32: model.type = e_model::MODEL_3B; break;
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
@ -2892,9 +2896,9 @@ static void llm_load_hparams(
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 24: model.type = e_model::MODEL_1B; break; case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
case 32: model.type = e_model::MODEL_7B; break; case 32: model.type = e_model::MODEL_7B; break;
case 40: model.type = e_model::MODEL_13B; break; case 40: model.type = hparams.n_head == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
case 80: model.type = e_model::MODEL_70B; break; case 80: model.type = e_model::MODEL_70B; break;
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
@ -3697,6 +3701,11 @@ static bool llm_load_tensors(
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
// optional bias tensors, present in Stable LM 2 1.6B
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
@ -4315,6 +4324,7 @@ static struct ggml_tensor * llm_build_kqv(
const llama_model & model, const llama_model & model,
const llama_hparams & hparams, const llama_hparams & hparams,
const llama_kv_cache & kv, const llama_kv_cache & kv,
struct ggml_cgraph * graph,
struct ggml_tensor * wo, struct ggml_tensor * wo,
struct ggml_tensor * wo_b, struct ggml_tensor * wo_b,
struct ggml_tensor * q_cur, struct ggml_tensor * q_cur,
@ -4393,6 +4403,8 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens); struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
cb(cur, "kqv_merged_cont", il); cb(cur, "kqv_merged_cont", il);
ggml_build_forward_expand(graph, cur);
cur = ggml_mul_mat(ctx, wo, cur); cur = ggml_mul_mat(ctx, wo, cur);
if (wo_b) { if (wo_b) {
cb(cur, "kqv_wo", il); cb(cur, "kqv_wo", il);
@ -4405,6 +4417,44 @@ static struct ggml_tensor * llm_build_kqv(
return cur; return cur;
} }
static struct ggml_tensor * llm_build_kv(
struct ggml_context * ctx,
const llama_model & model,
const llama_hparams & hparams,
const llama_kv_cache & kv,
struct ggml_cgraph * graph,
struct ggml_tensor * wo,
struct ggml_tensor * wo_b,
struct ggml_tensor * k_cur,
struct ggml_tensor * v_cur,
struct ggml_tensor * q_cur,
struct ggml_tensor * kq_mask,
int64_t n_ctx,
int32_t n_tokens,
int32_t kv_head,
int32_t n_kv,
float max_alibi_bias,
float kq_scale,
const llm_build_cb & cb,
int il) {
// these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced
ggml_build_forward_expand(graph, k_cur);
ggml_build_forward_expand(graph, v_cur);
ggml_build_forward_expand(graph, q_cur);
llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
struct ggml_tensor * cur;
cur = llm_build_kqv(ctx, model, hparams, kv, graph,
wo, wo_b,
q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
cb(cur, "kqv_out", il);
return cur;
}
struct llm_build_context { struct llm_build_context {
const llama_model & model; const llama_model & model;
const llama_hparams & hparams; const llama_hparams & hparams;
@ -4562,12 +4612,6 @@ struct llm_build_context {
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
} }
// these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced
ggml_build_forward_expand(gf, Qcur);
ggml_build_forward_expand(gf, Kcur);
ggml_build_forward_expand(gf, Vcur);
Qcur = ggml_rope_custom( Qcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
@ -4582,11 +4626,9 @@ struct llm_build_context {
); );
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -4763,14 +4805,13 @@ struct llm_build_context {
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
// apply ALiBi for 13B model // apply ALiBi for 13B model
const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f; const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
cur = llm_build_kqv(ctx0, model, hparams, kv_self, cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -4892,11 +4933,9 @@ struct llm_build_context {
); );
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -4993,11 +5032,9 @@ struct llm_build_context {
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -5200,12 +5237,9 @@ struct llm_build_context {
); );
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
// TODO: not tested, could be broken
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -5292,11 +5326,9 @@ struct llm_build_context {
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -5390,11 +5422,9 @@ struct llm_build_context {
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -5485,11 +5515,9 @@ struct llm_build_context {
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -5576,12 +5604,24 @@ struct llm_build_context {
// compute Q and K and RoPE them // compute Q and K and RoPE them
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
Qcur = ggml_rope_custom( Qcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
@ -5597,11 +5637,9 @@ struct llm_build_context {
); );
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -5714,11 +5752,9 @@ struct llm_build_context {
); );
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -5837,11 +5873,9 @@ struct llm_build_context {
); );
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -5966,11 +6000,9 @@ struct llm_build_context {
); );
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il); Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -6071,11 +6103,9 @@ struct llm_build_context {
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
struct ggml_tensor * sa_out = cur; struct ggml_tensor * sa_out = cur;
@ -6172,11 +6202,9 @@ struct llm_build_context {
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -6283,11 +6311,9 @@ struct llm_build_context {
); );
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
} }
@ -6355,6 +6381,14 @@ static struct ggml_cgraph * llama_build_graph(
ggml_set_name(cur, name); ggml_set_name(cur, name);
} }
if (!lctx.cparams.offload_kqv) {
if (strcmp(name, "kqv_merged_cont") == 0) {
// all nodes between the KV store and the attention output are run on the CPU
ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
}
}
// //
// allocate input tensors and set input data // allocate input tensors and set input data
// //
@ -8750,9 +8784,13 @@ struct quantize_state_internal {
const llama_model_quantize_params * params; const llama_model_quantize_params * params;
int n_attention_wv = 0; int n_attention_wv = 0;
int n_feed_forward_w2 = 0; int n_ffn_down = 0;
int n_ffn_gate = 0;
int n_ffn_up = 0;
int i_attention_wv = 0; int i_attention_wv = 0;
int i_feed_forward_w2 = 0; int i_ffn_down = 0;
int i_ffn_gate = 0;
int i_ffn_up = 0;
int n_k_quantized = 0; int n_k_quantized = 0;
int n_fallback = 0; int n_fallback = 0;
@ -8855,8 +8893,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
++qs.i_attention_wv; ++qs.i_attention_wv;
} }
else if (name.find("ffn_down") != std::string::npos) { else if (name.find("ffn_down") != std::string::npos) {
if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K; if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
++qs.i_feed_forward_w2; ++qs.i_ffn_down;
} }
else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K; else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
} else if (name.find("attn_v.weight") != std::string::npos) { } else if (name.find("attn_v.weight") != std::string::npos) {
@ -8893,18 +8931,21 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
// TODO: explore better strategies // TODO: explore better strategies
new_type = GGML_TYPE_Q8_0; new_type = GGML_TYPE_Q8_0;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
new_type = GGML_TYPE_Q2_K;
}
} else if (name.find("ffn_down") != std::string::npos) { } else if (name.find("ffn_down") != std::string::npos) {
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert); const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
int i_layer, n_layer; int i_layer, n_layer;
if (n_expert == 1) { if (n_expert == 1) {
i_layer = qs.i_feed_forward_w2; i_layer = qs.i_ffn_down;
n_layer = qs.n_feed_forward_w2; n_layer = qs.n_ffn_down;
} else { } else {
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
// sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
// for getting the current layer as I initially thought, and we need to resort to parsing the // for getting the current layer as I initially thought, and we need to resort to parsing the
// tensor name. // tensor name.
n_layer = qs.n_feed_forward_w2 / n_expert; n_layer = qs.n_ffn_down / n_expert;
if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) { if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str())); throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
} }
@ -8913,7 +8954,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
} }
} }
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K; if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@ -8943,11 +8984,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
// same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix. // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1; new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
} }
++qs.i_feed_forward_w2; ++qs.i_ffn_down;
} else if (name.find("attn_output.weight") != std::string::npos) { } else if (name.find("attn_output.weight") != std::string::npos) {
if (arch != LLM_ARCH_FALCON) { if (arch != LLM_ARCH_FALCON) {
if (qs.model.hparams.n_expert == 8) { if (qs.model.hparams.n_expert == 8) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS ||
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
new_type = GGML_TYPE_Q5_K; new_type = GGML_TYPE_Q5_K;
} }
@ -8965,6 +9007,20 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
} }
else if (name.find("ffn_gate") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_gate, qs.n_ffn_gate)) {
new_type = GGML_TYPE_Q2_K;
}
++qs.i_ffn_gate;
}
else if (name.find("ffn_up") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_up, qs.n_ffn_up)) {
new_type = GGML_TYPE_Q2_K;
}
++qs.i_ffn_up;
}
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
//}
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
//else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) { //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@ -9019,8 +9075,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break; case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
// K-quants // K-quants
case LLAMA_FTYPE_MOSTLY_Q2_K_S:
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break; case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break; case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
case LLAMA_FTYPE_MOSTLY_Q3_K_S: case LLAMA_FTYPE_MOSTLY_Q3_K_S:
case LLAMA_FTYPE_MOSTLY_Q3_K_M: case LLAMA_FTYPE_MOSTLY_Q3_K_M:
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break; case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@ -9088,12 +9145,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
++qs.n_attention_wv; ++qs.n_attention_wv;
} }
else if (name.find("ffn_down") != std::string::npos) { else if (name.find("ffn_down") != std::string::npos) {
++qs.n_feed_forward_w2; ++qs.n_ffn_down;
}
else if (name.find("ffn_gate") != std::string::npos) {
++qs.n_ffn_gate;
}
else if (name.find("ffn_up") != std::string::npos) {
++qs.n_ffn_up;
} }
} }
if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) { if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n", LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
__func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer); __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
} }
size_t total_size_org = 0; size_t total_size_org = 0;

View file

@ -107,6 +107,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
}; };

View file

@ -4,3 +4,4 @@ allow_untyped_calls = true
allow_untyped_defs = true allow_untyped_defs = true
allow_incomplete_defs = true allow_incomplete_defs = true
disable_error_code = import-untyped disable_error_code = import-untyped
warn_return_any = false

View file

@ -2,8 +2,9 @@
#include <cassert> #include <cassert>
#include <stdexcept> #include <stdexcept>
#include <vector> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector>
static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = { static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
{0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F}, {0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},