Xuan-Son Nguyen 2025-02-10 17:34:13 +08:00 committed by GitHub
commit 8e72ebdc87
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
26 changed files with 3061 additions and 25 deletions


@ -1413,7 +1413,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.image.emplace_back(value);
}
).set_examples({LLAMA_EXAMPLE_LLAVA}));
).set_examples({LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_VISION}));
if (llama_supports_rpc()) {
add_opt(common_arg(
{"--rpc"}, "SERVERS",


@ -80,6 +80,7 @@ enum llama_example {
LLAMA_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_TTS,
LLAMA_EXAMPLE_VISION,
LLAMA_EXAMPLE_COUNT,
};


@ -17,6 +17,7 @@ from hashlib import sha256
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
from itertools import chain
from transformers import AutoConfig
import math
import numpy as np
import torch
@ -66,6 +67,13 @@ class Model:
metadata_override: Path | None
dir_model_card: Path
# for vision model
vision_arch: gguf.MODEL_ARCH | None = None
preprocessor_config: dict[str, Any] | None = None
vparams: dict[str, Any] | None = None
v_tensor_map: gguf.TensorNameMap | None = None
v_tensor_names: set[str] | None
# subclasses should define this!
model_arch: gguf.MODEL_ARCH
@ -126,6 +134,16 @@ class Model:
return None
raise KeyError(f"could not find any of: {keys}")
def find_vparams(self, keys: Iterable[str], optional: bool = False) -> Any:
if self.vparams is None:
raise ValueError("vision model parameters not set")
key = next((k for k in keys if k in self.vparams), None)
if key is not None:
return self.vparams[key]
if optional:
return None
raise KeyError(f"(vision) could not find any of: {keys}")
def set_vocab(self):
self._set_vocab_gpt2()
@ -186,9 +204,10 @@ class Model:
f"Missing tensors: {missing}\n"
f"Extra tensors: {extra}")
def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
if key not in gguf.MODEL_TENSORS[self.model_arch]:
raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}")
def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight", is_vision = False) -> str:
arch = self.vision_arch if is_vision and self.vision_arch is not None else self.model_arch
if key not in gguf.MODEL_TENSORS[arch]:
raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {arch!r}")
name: str = gguf.TENSOR_NAMES[key]
if "{bid}" in name:
assert bid is not None
@ -210,9 +229,13 @@ class Model:
def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
if new_name is None:
new_name_vision = self.v_tensor_map.get_name(key=name, try_suffixes=try_suffixes) if self.v_tensor_map is not None else None
if new_name is not None:
return new_name
elif new_name_vision is not None:
return new_name_vision
else:
raise ValueError(f"Can not map tensor {name!r}")
return new_name
def set_gguf_parameters(self):
self.gguf_writer.add_block_count(self.block_count)
@ -257,6 +280,23 @@ class Model:
self.gguf_writer.add_key_length(head_dim)
self.gguf_writer.add_value_length(head_dim)
# Vision model parameters
if self.vparams is not None and self.preprocessor_config is not None and self.vision_arch is not None:
self.gguf_writer.add_vision_type("vit")
self.gguf_writer.add_vision_image_size(self.vparams["image_size"])
self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"])
self.gguf_writer.add_vision_vit_architecture(gguf.MODEL_ARCH_NAMES[self.vision_arch])
self.gguf_writer.add_vision_vit_block_count(self.vparams["num_hidden_layers"])
self.gguf_writer.add_vision_vit_embedding_length(self.vparams["hidden_size"])
self.gguf_writer.add_vision_vit_feed_forward_length(self.vparams["intermediate_size"])
self.gguf_writer.add_vision_vit_head_count(self.vparams["num_attention_heads"])
self.gguf_writer.add_vision_vit_image_mean(self.preprocessor_config["image_mean"])
self.gguf_writer.add_vision_vit_image_std(self.preprocessor_config["image_std"])
try:
self.gguf_writer.add_vision_vit_select_layer(self.find_hparam(["vision_feature_layer", "mm_vision_select_layer"]))
except KeyError:
self.gguf_writer.add_vision_vit_select_layer(0)
self.gguf_writer.add_file_type(self.ftype)
logger.info(f"gguf: file type = {self.ftype}")
@ -466,7 +506,25 @@ class Model:
@staticmethod
def load_hparams(dir_model: Path):
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
return json.load(f)
hparams = json.load(f)
if "text_config" in hparams:
text_config = hparams["text_config"]
model_id = text_config.get("_name_or_path", None)
# for example, llava-1.5-7b-hf is missing the language model config, so we need to retrieve it via the model ID
if model_id is not None and model_id != "None" and model_id != "":
text_config = AutoConfig.from_pretrained(text_config["_name_or_path"]).to_dict()
hparams = {**text_config, **hparams}
return hparams
@staticmethod
def load_preprocessor_config(dir_model: Path):
# TODO: this varies vastly among models, need to handle more cases in the future
file_path = dir_model / "preprocessor_config.json"
if os.path.exists(file_path):
with open(file_path, "r", encoding="utf-8") as f:
return json.load(f)
else:
raise Exception(f"Preprocessor config not found at {file_path}")
@classmethod
def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
@ -519,7 +577,9 @@ class Model:
toktypes: list[int] = []
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
# DEBIAN_FRONTEND=noninteractive means that the script is running in a non-interactive environment (i.e. CI), so we cannot answer Y/N when it asks for user input
is_cli_non_interactive = os.environ.get("DEBIAN_FRONTEND", "") == "noninteractive"
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=is_cli_non_interactive)
vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
@ -948,6 +1008,29 @@ class Model:
self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])
# TODO: maybe merge this with Model in the future
class VisionModelHelper:
model: Model
tok_embd_tensor: Tensor | None = None
def __init__(self, model: Model):
self.model = model
# TODO: how to do this without reading the whole safetensor file?
for tname, tensor in model.get_tensors():
if tname.endswith("embed_tokens.weight"):
self.tok_embd_tensor = tensor
def get_embd_for_tokens(self, map_token_to_tensor_name: Iterable[tuple[str, gguf.MODEL_TENSOR]], tensor_name_postfix = '.weight') -> Iterable[tuple[str, Tensor]]:
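# yields one (gguf tensor name + postfix, embedding row) pair per special token, by looking the
# token up in the HF tokenizer and slicing that row out of the text model's embed_tokens matrix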
if self.tok_embd_tensor is None:
raise ValueError("Token embedding tensor not found")
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.model.dir_model, trust_remote_code=True)
for token, tensor_name in map_token_to_tensor_name:
tok_id = tokenizer.get_vocab()[token]
row = self.tok_embd_tensor[tok_id]
yield gguf.TENSOR_NAMES[tensor_name] + tensor_name_postfix, row
@Model.register("GPTNeoXForCausalLM")
class GPTNeoXModel(Model):
model_arch = gguf.MODEL_ARCH.GPTNEOX
@ -1560,10 +1643,38 @@ class StableLMModel(Model):
raise ValueError(f"Unprocessed norms: {norms}")
@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration", "MobileLlamaForCausalLM", "Idefics3ForConditionalGeneration")
class LlamaModel(Model):
model_arch = gguf.MODEL_ARCH.LLAMA
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
model_type = self.hparams.get("model_type", None)
self.vision_arch = None
# only tested with https://huggingface.co/llava-hf/llava-1.5-7b-hf
if "vision_config" in self.hparams and model_type == "llava":
self.vparams = self.hparams["vision_config"]
self.preprocessor_config = self.load_preprocessor_config(self.dir_model)
self.vision_arch = gguf.MODEL_ARCH.VISION_LLAVA
# only tested with https://huggingface.co/mtgv/MobileVLM_V2-1.7B
if "mm_vision_tower" in self.hparams and model_type == "mobilevlm":
from transformers import AutoImageProcessor
vision_model_id = self.hparams["mm_vision_tower"]
self.vparams = AutoConfig.from_pretrained(vision_model_id).to_dict()["vision_config"]
self.preprocessor_config = AutoImageProcessor.from_pretrained(vision_model_id).to_dict()
self.vision_arch = gguf.MODEL_ARCH.VISION_MOBILEVLM
if "vision_config" in self.hparams and model_type == "idefics3":
self.vparams = self.hparams["vision_config"]
self.preprocessor_config = self.load_preprocessor_config(self.dir_model)
self.vision_arch = gguf.MODEL_ARCH.VISION_IDEFICS3
if self.vparams is not None and self.vision_arch is not None:
self.v_tensor_map = gguf.get_tensor_name_map(self.vision_arch, self.vparams["num_hidden_layers"])
def set_vocab(self):
try:
self._set_vocab_sentencepiece()
@ -1613,6 +1724,24 @@ class LlamaModel(Model):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
# For vision model
if self.vparams is not None:
max_pos_embd = -1
self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
# TODO: should not hardcode these, but they are currently missing from config.json
if self.vision_arch == gguf.MODEL_ARCH.VISION_LLAVA:
self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.MLP)
max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
if self.vision_arch == gguf.MODEL_ARCH.VISION_MOBILEVLM:
self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.LDPV2)
max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
if self.vision_arch == gguf.MODEL_ARCH.VISION_IDEFICS3:
self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.MLP)
self.gguf_writer.add_vision_vit_scale_factor(self.hparams["scale_factor"])
max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2
self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-05)
self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd)
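# e.g. for llava-1.5-7b-hf (CLIP ViT-L/14-336 tower: image_size=336, patch_size=14) this yields
# (336 // 14)**2 + 1 = 577 positions (576 patches + CLS token); the idefics3 branch omits the "+ 1"
# since its SigLIP encoder has no CLS token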
@staticmethod
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
if n_head_kv is not None and n_head != n_head_kv:
@ -1626,11 +1755,24 @@ class LlamaModel(Model):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")
is_vision_tensor = "vision_tower" in name or "vision_model" in name
if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight", "k_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
if is_vision_tensor:
if name.startswith("model.text_model"):
name = name.replace("text_model.", "") # for SmolVLM
else:
name = name.replace("model.vision_tower.", "")
if "post_layernorm" in name and self.vision_arch != gguf.MODEL_ARCH.VISION_IDEFICS3:
return [] # skip post_layernorm
if not is_vision_tensor:
if name.startswith("language_model"):
# language model tensors, remove the prefix
name = name.replace("language_model.", "")
if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight", "k_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
# process the experts separately
if name.find("block_sparse_moe.experts") != -1:
@ -2234,6 +2376,173 @@ class Qwen2VLModel(Model):
yield name, data
@Model.register("MiniCPMV")
class MiniCPMVModel(Qwen2Model):
# MiniCPM-V 2.5 is Qwen2 and 2.6 is Qwen-2.5
model_arch = gguf.MODEL_ARCH.QWEN2
proj_type: gguf.constants.CLIPProjectorType | None
resampler_n_embd = 0
vhelper: VisionModelHelper | None
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
model_type = self.hparams.get("model_type", None)
# only tested with https://huggingface.co/openbmb/MiniCPM-V-2_6
if "vision_config" in self.hparams and model_type == "minicpmv":
self.vparams = self.hparams["vision_config"]
self.preprocessor_config = self.load_preprocessor_config(self.dir_model)
self.vision_arch = gguf.MODEL_ARCH.VISION_MINICPMV
version = str(self.hparams.get("version", "unknown"))
if version == "2.5":
self.proj_type = gguf.constants.CLIPProjectorType.MINICPMV_2_5
elif version == "2.6":
self.proj_type = gguf.constants.CLIPProjectorType.MINICPMV_2_6
else:
raise ValueError(f"Unsupported MiniCPM-V version: {version}")
self.vhelper = VisionModelHelper(self)
# TODO: how to do this without reading the whole safetensor file?
for tname, tensor in self.get_tensors():
if tname == "resampler.ln_post.bias":
self.resampler_n_embd = tensor.shape[0]
if self.resampler_n_embd < 2:
raise ValueError("Failed to detect resampler embedding size")
else:
raise ValueError("Expected vision_config, but not found")
assert self.vparams is not None
assert self.vision_arch is not None
assert self.preprocessor_config is not None
self.preprocessor_config["image_mean"] = [0.5, 0.5, 0.5]
self.preprocessor_config["image_std"] = [0.5, 0.5, 0.5]
self.hparams["vision_feature_layer"] = 0
self.v_tensor_map = gguf.get_tensor_name_map(self.vision_arch, self.vparams["num_hidden_layers"])
def set_gguf_parameters(self):
super().set_gguf_parameters()
assert self.vparams is not None and self.proj_type is not None
self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
self.gguf_writer.add_vision_vit_projector_type(self.proj_type)
self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-06)
max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2
self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd)
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
# because the model operates exclusively on 70x70 patches for now, we precompute the positional embeddings to gain performance
# in the future, we can do it in cpp if we figure out how to do it efficiently
yield (
self.format_tensor_name(gguf.MODEL_TENSOR.V_RESMPL_POS_EMBD_K, is_vision=True),
torch.from_numpy(self._get_2d_sincos_pos_embed(self.resampler_n_embd, (70, 70)))
)
assert self.vhelper is not None
added_tokens = [
("<image>", gguf.MODEL_TENSOR.V_TOK_EMBD_IMAGE),
("</image>", gguf.MODEL_TENSOR.V_TOK_EMBD_END_IMAGE),
("<slice>", gguf.MODEL_TENSOR.V_TOK_EMBD_SLICE),
("</slice>", gguf.MODEL_TENSOR.V_TOK_EMBD_END_SLICE),
]
for tensor_name, tensor in self.vhelper.get_embd_for_tokens(added_tokens):
yield tensor_name, tensor
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
# for language part
if name.startswith("llm."):
return [(self.map_tensor_name(name.replace("llm.", "")), data_torch)]
# split the resampler.attn.in_proj_(weight|bias) tensors into q, k, v
if name.endswith("in_proj_weight") or name.endswith("in_proj_bias"):
assert data_torch.shape[0] == 3 * self.resampler_n_embd
split_tensor = data_torch.chunk(3, dim=0)
name_q = name.replace("in_proj_", "in_proj_q.") # in_proj_q.(weight|bias)
name_k = name.replace("in_proj_", "in_proj_k.") # in_proj_k.(weight|bias)
name_v = name.replace("in_proj_", "in_proj_v.") # in_proj_v.(weight|bias)
return [
# TODO: permute these
(self.map_tensor_name(name_q), split_tensor[0]),
(self.map_tensor_name(name_k), split_tensor[1]),
(self.map_tensor_name(name_v), split_tensor[2]),
]
# append .weight to these tensors
if name == "resampler.proj" or name == "resampler.query":
name += ".weight"
if name.startswith("resampler.proj"):
data_torch = data_torch.transpose(-1, -2).contiguous()
if "post_layernorm" in name:
return [] # skip post_layernorm
return [(self.map_tensor_name(name), data_torch)]
def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
del name, bid # unused
if "v.resmpl.query" in new_name or "v.resmpl.pos_embd_k" in new_name:
return gguf.GGMLQuantizationType.F32
if "v.resmpl." in new_name:
return gguf.GGMLQuantizationType.F32 if n_dims == 1 else gguf.GGMLQuantizationType.F16
return False
# utils to work with MiniCPM-V resampler
# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
def _get_2d_sincos_pos_embed(self, embed_dim: int, grid_size: tuple[int, int] | int, cls_token=False) -> np.ndarray:
"""
grid_size: int of the grid height and width
return:
pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
"""
if isinstance(grid_size, int):
grid_h_size, grid_w_size = grid_size, grid_size
else:
grid_h_size, grid_w_size = grid_size[0], grid_size[1]
grid_h = np.arange(grid_h_size, dtype=np.float32)
grid_w = np.arange(grid_w_size, dtype=np.float32)
grid = np.meshgrid(grid_w, grid_h) # here w goes first
grid = np.stack(grid, axis=0)
grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
pos_embed = self._get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
if cls_token:
pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
return pos_embed
def _get_2d_sincos_pos_embed_from_grid(self, embed_dim: int, grid: np.ndarray) -> np.ndarray:
assert embed_dim % 2 == 0
# use half of dimensions to encode grid_h
emb_h = self._get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
emb_w = self._get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
return emb
def _get_1d_sincos_pos_embed_from_grid(self, embed_dim: int, pos: np.ndarray) -> np.ndarray:
"""
embed_dim: output dimension for each position
pos: a list of positions to be encoded: size (M,)
out: (M, D)
"""
assert embed_dim % 2 == 0
omega = np.arange(embed_dim // 2, dtype=np.float32)
omega /= embed_dim / 2.
omega = 1. / 10000 ** omega # (D/2,)
pos = pos.reshape(-1) # (M,)
out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
emb_sin = np.sin(out) # (M, D/2)
emb_cos = np.cos(out) # (M, D/2)
emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
return emb
@Model.register("WavTokenizerDec")
class WavTokenizerDecModel(Model):
model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
@ -4949,7 +5258,7 @@ class LazyTorchTensor(gguf.LazyBase):
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Convert a huggingface model to a GGML compatible file")
description="Convert a huggingface model to a GGML compatible file\n\nNote: When converting vision models, this script may use internet connection to download configuration files via Hugging Face.")
parser.add_argument(
"--vocab-only", action="store_true",
help="extract only the vocab",


@ -53,6 +53,7 @@ else()
add_subdirectory(tokenize)
add_subdirectory(tts)
add_subdirectory(gen-docs)
add_subdirectory(vision)
if (NOT GGML_BACKEND_DL)
# these examples use the backends directly and cannot be built with dynamic loading
add_subdirectory(convert-llama2c-to-ggml)


@ -3150,6 +3150,7 @@ struct server_context {
batch.n_seq_id + i,
batch.seq_id + i,
batch.logits + i,
nullptr,
};
const int ret = llama_decode(ctx, batch_view);


@ -0,0 +1,5 @@
set(TARGET llama-vision)
add_executable(${TARGET} vision.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)


@ -0,0 +1,3 @@
# llama.cpp/examples/vision
Minimal demo for the llama.cpp vision API.
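The sketch below is not part of the commit; the helper name and the pre-loaded model, context, and RGB buffer are assumptions for illustration. It shows the call sequence the new vision API expects; vision.cpp below is the complete program.

#include "llama.h"

#include <cstdint>
#include <cstring>

// illustrative helper (not part of this commit): encode one RGB image and feed the
// resulting embeddings into an existing llama_context as a tensor batch
static int encode_and_decode_image(const llama_model * model, llama_context * ctx,
                                   const unsigned char * rgb, uint32_t nx, uint32_t ny, int32_t n_past) {
    llama_vision_context_params vparams = llama_vision_context_default_params();
    vparams.n_threads = llama_n_threads(ctx);
    llama_vision_context * vctx = llama_vision_init_from_model(model, vparams);
    if (!vctx) {
        return -1; // model has no vision encoder
    }
    llama_vision_bitmap * bmp = llama_vision_bitmap_init(nx, ny);
    memcpy(bmp->data, rgb, 3*nx*ny); // data must hold exactly 3*nx*ny bytes
    llama_vision_tokens * toks = llama_vision_tokenize(vctx, bmp);
    int ret = -1;
    if (toks && llama_vision_encode(vctx, toks) == 0) {
        struct ggml_tensor * embd = llama_vision_get_output_tensor(vctx);
        llama_batch batch = llama_batch_get_one_from_tensor(embd, n_past, /*seq_id=*/0);
        ret = llama_decode(ctx, batch);
        llama_batch_free(batch);
    }
    if (toks) {
        llama_vision_tokens_free(toks);
    }
    llama_vision_bitmap_free(bmp);
    llama_vision_free(vctx);
    return ret;
}

Text tokens still go through the regular llama_decode path; the image is injected as a separate embedding batch at the position the prompt reserves for it, which is what the <img_placement> handling in vision.cpp does.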

examples/vision/vision.cpp (new file, 224 lines)

@ -0,0 +1,224 @@
#include "llama.h"
#include "common.h"
#include "arg.h"
#include "log.h"
#include "sampling.h"
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>
#include <fstream>
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
static void print_usage(int, char ** argv) {
printf("\nexample usage:\n");
printf("\n %s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [--image img_path] [-p prompt]\n", argv[0]);
printf("\n");
}
static llama_vision_bitmap * load_image_from_file(const char * fname) {
std::ifstream file(fname, std::ios::binary);
if (!file) {
throw std::runtime_error("Unable to open file");
}
std::vector<char> image_bytes = std::vector<char>(
std::istreambuf_iterator<char>(file),
std::istreambuf_iterator<char>());
// decode image to byte array
int nx, ny, nc;
auto * bytes = (unsigned char *) image_bytes.data();
auto * img = stbi_load_from_memory(bytes, image_bytes.size(), &nx, &ny, &nc, 3);
if (!img) {
throw std::runtime_error("failed to decode image bytes");
}
// printf("nx=%d ny=%d nc=%d\n", nx, ny, nc);
// GGML_ASSERT(nc == 3);
// for (int y = 0; y < ny; y++) {
// for (int x = 0; x < nx; x++) {
// unsigned char * pix = img + x*nc + y*nc*nx;
// printf("%02x%02x%02x ", pix[0], pix[1], pix[2]);
// }
// printf("\n");
// }
// printf("\n");
llama_vision_bitmap * result = llama_vision_bitmap_init(nx, ny);
memcpy(result->data, img, nx*ny*3);
stbi_image_free(img);
return result;
}
// split string by a `std::string delim` instead of `char delim`
static std::vector<std::string> string_split_str(std::string s, const std::string & delimiter) {
std::vector<std::string> tokens;
size_t pos = 0;
std::string token;
while ((pos = s.find(delimiter)) != std::string::npos) {
token = s.substr(0, pos);
tokens.push_back(token);
s.erase(0, pos + delimiter.length());
}
tokens.push_back(s);
return tokens;
}
struct tokenized_part {
llama_tokens tokens;
bool is_image;
};
// TODO: this function is hacky, needs to be improved
// static const llama_token TOKEN_IMG_PLACEMENT = -1000;
static const std::string IMG_PLACEMENT = "<img_placement>";
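// splits `text` on IMG_PLACEMENT and tokenizes each text chunk; an empty part with
// is_image = true is emitted between two chunks to mark where the image embeddings go,
// e.g. "USER: <img_placement>\ndescribe" -> { text part, image marker, text part }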
static std::vector<tokenized_part> tokenize_with_img_placement(
const llama_vocab * vocab,
const std::string & text,
bool add_special,
bool parse_special) {
std::vector<std::string> parts = string_split_str(text, IMG_PLACEMENT);
std::vector<tokenized_part> output;
for (const auto & part : parts) {
//printf("tokenizing part: %s\n", part.c_str());
bool add_bos = &parts.front() == &part;
auto tokens = common_tokenize(vocab, part, add_special && add_bos, parse_special);
if (tokens.empty()) {
continue;
}
output.push_back({std::move(tokens), false});
if (&parts.back() != &part) {
// add an image token between the two parts
output.push_back({{}, true});
}
}
return output;
}
int main(int argc, char ** argv) {
common_params params;
// default prompt for llava 1.5
//params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:<img_placement>\nwhat did you see?\nASSISTANT:";
// default prompt for minicpmv 2.6
params.prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<img_placement>\nwhat do you see?<|im_end|>\n<|im_start|>assistant\n";
params.n_predict = 64;
params.n_batch = 2048;
params.n_ubatch = 1024;
params.n_gpu_layers = 99;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_VISION, print_usage)) {
return 1;
}
common_init();
common_init_result llama_init = common_init_from_params(params);
llama_context * ctx = llama_init.context.get();
const llama_model * model = llama_init.model.get();
if (!model) {
LOG_ERR("failed to load model\n");
return 1;
}
const llama_vocab * vocab = llama_model_get_vocab(model);
llama_vision_context_params vparams = llama_vision_context_default_params();
vparams.n_threads = llama_n_threads(ctx);
llama_vision_context * vctx = llama_vision_init_from_model(model, vparams);
if (!vctx) {
LOG_ERR("model does not have vision encoder\n");
return 1;
}
struct common_sampler * smpl = common_sampler_init(model, params.sampling);
llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
int n_past = 0;
int n_prompt = 0;
// process image
llama_vision_tokens * img_tokens = nullptr;
{
if (params.image.empty() || params.image[0].empty()) {
LOG_ERR("no image path provided\n");
return 1;
}
const char * img_path = params.image[0].c_str();
llama_vision_bitmap * img = load_image_from_file(img_path);
LOG_INF("loaded image %s, size = %d x %d\n", img_path, img->nx, img->ny);
img_tokens = llama_vision_tokenize(vctx, img);
if (!img_tokens) {
LOG_ERR("failed to create image tokens\n");
return 1;
}
if (llama_vision_encode(vctx, img_tokens)) {
LOG_ERR("failed to encode image\n");
return 1;
}
LOG_INF("encoded image\n");
}
// process prompt
{
std::vector<tokenized_part> parts = tokenize_with_img_placement(vocab, params.prompt, true, true);
for (const tokenized_part & part : parts) {
if (!part.is_image) {
common_batch_clear(batch); // start a fresh batch for this text segment
for (const llama_token & token : part.tokens) {
//LOG_INF("%d -> %s\n", token, common_token_to_piece(ctx, token).c_str());
common_batch_add(batch, token, n_past++, {0}, &part == &parts.back());
}
LOG_INF("eval text batch (%d tokens)\n", batch.n_tokens);
if (llama_decode(ctx, batch)) {
LOG_ERR("failed to decode text prompt\n");
return 1;
}
} else {
auto * img_embd = llama_vision_get_output_tensor(vctx);
// std::vector<float> output_debug(ggml_nelements(img_embd));
// ggml_backend_tensor_get(img_embd, output_debug.data(), 0, ggml_nbytes(img_embd));
// for (int row = 0; row < 10; row++) {
// int off = row * img_embd->ne[0];
// printf("... %f %f %f\n", output_debug[off], output_debug[off+1], output_debug[off+2]);
// }
// exit(1);
llama_batch batch_img = llama_batch_get_one_from_tensor(img_embd, n_past, 0);
n_past += batch_img.n_tokens;
LOG_INF("eval image batch (%d embeddings)\n", batch_img.n_tokens);
if (llama_decode(ctx, batch_img)) {
LOG_ERR("failed to decode image prompt\n");
return 1;
}
llama_batch_free(batch_img);
}
}
n_prompt = n_past;
LOG_INF("prompt processed, %d tokens\n", n_prompt);
}
// generate response
while (true) {
int n_generated = n_past - n_prompt;
if (n_generated > params.n_predict) {
printf("\n");
break;
}
llama_token token_id = common_sampler_sample(smpl, ctx, -1);
common_sampler_accept(smpl, token_id, true);
printf("%s", common_token_to_piece(ctx, token_id).c_str());
fflush(stdout);
if (llama_vocab_is_eog(vocab, token_id)) {
printf("\n");
break;
}
// eval the token
common_batch_clear(batch);
common_batch_add(batch, token_id, n_past++, {0}, true);
if (llama_decode(ctx, batch)) {
LOG_ERR("failed to decode token\n");
break;
}
}
return 0;
}


@ -202,6 +202,9 @@ class Keys:
FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id"
FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id"
FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id"
# Vision models
IMAGE_START_ID = "tokenizer.ggml.image_start_token_id"
IMAGE_END_ID = "tokenizer.ggml.image_end_token_id"
# deprecated:
PREFIX_ID = "tokenizer.ggml.prefix_token_id"
SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
@ -211,6 +214,32 @@ class Keys:
TYPE = "adapter.type"
LORA_ALPHA = "adapter.lora.alpha"
class Vision:
# only vision.type = "vit" is supported for now
TYPE = "vision.type"
IMAGE_SIZE = "vision.image_size"
PATCH_SIZE = "vision.patch_size"
IMAGE_MEAN = "vision.image_mean"
IMAGE_STD = "vision.image_std"
class Vit:
ARCHITECTURE = "vision.vit.architecture"
CONTEXT_LENGTH = "vision.vit.context_length"
EMBEDDING_LENGTH = "vision.vit.embedding_length"
BLOCK_COUNT = "vision.vit.block_count"
FEED_FORWARD_LENGTH = "vision.vit.feed_forward_length"
PROJECTION_TYPE = "vision.vit.projection_type"
PROJECTION_DIM = "vision.vit.projection_dim"
USE_GELU = "vision.vit.use_gelu"
MAX_POS_EMBEDDING = "vision.vit.max_position_embeddings"
MAX_SLICES = "vision.vit.max_slices"
PROJECTOR_TYPE = "vision.vit.projector_type"
SELECT_LAYER = "vision.vit.select_layer"
PATCH_MERGE_TYPE = "vision.vit.patch_merge_type"
HEAD_COUNT = "vision.vit.attention.head_count"
LAYERNORM_EPS = "vision.vit.attention.layer_norm_epsilon"
SCALE_FACTOR = "vision.vit.scale_factor" # only used by idefics3 for now
#
# recommended mapping of model tensor names for storage in gguf
#
@ -279,6 +308,11 @@ class MODEL_ARCH(IntEnum):
GRANITE_MOE = auto()
CHAMELEON = auto()
WAVTOKENIZER_DEC = auto()
# vision models
VISION_LLAVA = auto()
VISION_MOBILEVLM = auto()
VISION_MINICPMV = auto()
VISION_IDEFICS3 = auto()
class MODEL_TENSOR(IntEnum):
@ -390,6 +424,7 @@ class MODEL_TENSOR(IntEnum):
ENC_OUTPUT_NORM = auto()
CLS = auto() # classifier
CLS_OUT = auto() # classifier output projection
# wavtokenizer
CONV1D = auto()
CONVNEXT_DW = auto()
CONVNEXT_NORM = auto()
@ -406,6 +441,39 @@ class MODEL_TENSOR(IntEnum):
POSNET_ATTN_K = auto()
POSNET_ATTN_V = auto()
POSNET_ATTN_OUT = auto()
# vision
V_MMPROJ = auto()
V_MMPROJ_FC = auto()
V_MMPROJ_MLP = auto()
V_MMPROJ_PEG = auto()
V_ENC_EMBD_CLS = auto()
V_ENC_EMBD_PATCH = auto()
V_ENC_EMBD_POS = auto()
V_ENC_ATTN_Q = auto()
V_ENC_ATTN_K = auto()
V_ENC_ATTN_V = auto()
V_ENC_INPUT_NORM = auto()
V_ENC_OUTPUT = auto()
V_ENC_OUTPUT_NORM = auto()
V_ENC_FFN_UP = auto()
V_ENC_FFN_DOWN = auto()
V_PRE_NORM = auto()
V_POST_NORM = auto()
V_RESMPL_POS_EMBD_K = auto() # minicpmv
V_RESMPL_ATTN_Q = auto() # minicpmv
V_RESMPL_ATTN_K = auto() # minicpmv
V_RESMPL_ATTN_V = auto() # minicpmv
V_RESMPL_ATTN_OUT = auto() # minicpmv
V_RESMPL_KV = auto() # minicpmv
V_RESMPL_KV_NORM = auto() # minicpmv
V_RESMPL_POST_NORM = auto() # minicpmv
V_RESMPL_Q_NORM = auto() # minicpmv
V_RESMPL_PROJ = auto() # minicpmv
V_RESMPL_QUERY = auto() # minicpmv
V_TOK_EMBD_IMAGE = auto() # embedding for <image> token
V_TOK_EMBD_END_IMAGE = auto() # embedding for </image> token
V_TOK_EMBD_SLICE = auto() # embedding for <slice> token
V_TOK_EMBD_END_SLICE = auto() # embedding for </slice> token
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@ -466,6 +534,11 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.GRANITE_MOE: "granitemoe",
MODEL_ARCH.CHAMELEON: "chameleon",
MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
# vision
MODEL_ARCH.VISION_LLAVA: "llava",
MODEL_ARCH.VISION_MOBILEVLM: "mobilevlm",
MODEL_ARCH.VISION_MINICPMV: "minicpmv",
MODEL_ARCH.VISION_IDEFICS3: "idefics3",
}
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -593,6 +666,39 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k",
MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v",
MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output",
# vision
MODEL_TENSOR.V_MMPROJ: "v.mmproj_{bid}",
MODEL_TENSOR.V_MMPROJ_FC: "v.mmproj.fc",
MODEL_TENSOR.V_MMPROJ_MLP: "v.mmproj.mlp.{bid}",
MODEL_TENSOR.V_MMPROJ_PEG: "v.mmproj.peg.{bid}",
MODEL_TENSOR.V_ENC_EMBD_CLS: "v.enc.embd.cls",
MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.enc.embd.patch",
MODEL_TENSOR.V_ENC_EMBD_POS: "v.enc.embd.pos",
MODEL_TENSOR.V_ENC_ATTN_Q: "v.enc.blk.{bid}.attn_q",
MODEL_TENSOR.V_ENC_ATTN_K: "v.enc.blk.{bid}.attn_k",
MODEL_TENSOR.V_ENC_ATTN_V: "v.enc.blk.{bid}.attn_v",
MODEL_TENSOR.V_ENC_INPUT_NORM: "v.enc.blk.{bid}.input_norm",
MODEL_TENSOR.V_ENC_OUTPUT: "v.enc.blk.{bid}.output",
MODEL_TENSOR.V_ENC_OUTPUT_NORM: "v.enc.blk.{bid}.output_norm",
MODEL_TENSOR.V_ENC_FFN_UP: "v.enc.blk.{bid}.ffn_up",
MODEL_TENSOR.V_ENC_FFN_DOWN: "v.enc.blk.{bid}.ffn_down",
MODEL_TENSOR.V_PRE_NORM: "v.pre_norm",
MODEL_TENSOR.V_POST_NORM: "v.post_norm",
MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "v.resmpl.pos_embd_k",
MODEL_TENSOR.V_RESMPL_ATTN_Q: "v.resmpl.attn_q",
MODEL_TENSOR.V_RESMPL_ATTN_K: "v.resmpl.attn_k",
MODEL_TENSOR.V_RESMPL_ATTN_V: "v.resmpl.attn_v",
MODEL_TENSOR.V_RESMPL_ATTN_OUT: "v.resmpl.attn_out",
MODEL_TENSOR.V_RESMPL_KV: "v.resmpl.kv",
MODEL_TENSOR.V_RESMPL_KV_NORM: "v.resmpl.kv_norm",
MODEL_TENSOR.V_RESMPL_POST_NORM: "v.resmpl.post_norm",
MODEL_TENSOR.V_RESMPL_Q_NORM: "v.resmpl.q_norm",
MODEL_TENSOR.V_RESMPL_PROJ: "v.resmpl.proj",
MODEL_TENSOR.V_RESMPL_QUERY: "v.resmpl.query",
MODEL_TENSOR.V_TOK_EMBD_IMAGE: "v.tok_embd.image",
MODEL_TENSOR.V_TOK_EMBD_END_IMAGE: "v.tok_embd.end_image",
MODEL_TENSOR.V_TOK_EMBD_SLICE: "v.tok_embd.slice",
MODEL_TENSOR.V_TOK_EMBD_END_SLICE: "v.tok_embd.end_slice",
}
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@ -1537,6 +1643,80 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.POSNET_ATTN_V,
MODEL_TENSOR.POSNET_ATTN_OUT,
],
MODEL_ARCH.VISION_LLAVA: [
MODEL_TENSOR.V_MMPROJ,
MODEL_TENSOR.V_ENC_EMBD_CLS,
MODEL_TENSOR.V_ENC_EMBD_PATCH,
MODEL_TENSOR.V_ENC_EMBD_POS,
MODEL_TENSOR.V_ENC_ATTN_Q,
MODEL_TENSOR.V_ENC_ATTN_K,
MODEL_TENSOR.V_ENC_ATTN_V,
MODEL_TENSOR.V_ENC_INPUT_NORM,
MODEL_TENSOR.V_ENC_OUTPUT,
MODEL_TENSOR.V_ENC_OUTPUT_NORM,
MODEL_TENSOR.V_ENC_FFN_UP,
MODEL_TENSOR.V_ENC_FFN_DOWN,
MODEL_TENSOR.V_PRE_NORM,
MODEL_TENSOR.V_POST_NORM,
],
MODEL_ARCH.VISION_MOBILEVLM: [
MODEL_TENSOR.V_MMPROJ_MLP,
MODEL_TENSOR.V_MMPROJ_PEG,
MODEL_TENSOR.V_ENC_EMBD_CLS,
MODEL_TENSOR.V_ENC_EMBD_PATCH,
MODEL_TENSOR.V_ENC_EMBD_POS,
MODEL_TENSOR.V_ENC_ATTN_Q,
MODEL_TENSOR.V_ENC_ATTN_K,
MODEL_TENSOR.V_ENC_ATTN_V,
MODEL_TENSOR.V_ENC_INPUT_NORM,
MODEL_TENSOR.V_ENC_OUTPUT,
MODEL_TENSOR.V_ENC_OUTPUT_NORM,
MODEL_TENSOR.V_ENC_FFN_UP,
MODEL_TENSOR.V_ENC_FFN_DOWN,
MODEL_TENSOR.V_PRE_NORM,
MODEL_TENSOR.V_POST_NORM,
],
MODEL_ARCH.VISION_MINICPMV: [
MODEL_TENSOR.V_ENC_EMBD_PATCH,
MODEL_TENSOR.V_ENC_EMBD_POS,
MODEL_TENSOR.V_ENC_ATTN_Q,
MODEL_TENSOR.V_ENC_ATTN_K,
MODEL_TENSOR.V_ENC_ATTN_V,
MODEL_TENSOR.V_ENC_INPUT_NORM,
MODEL_TENSOR.V_ENC_OUTPUT,
MODEL_TENSOR.V_ENC_OUTPUT_NORM,
MODEL_TENSOR.V_ENC_FFN_UP,
MODEL_TENSOR.V_ENC_FFN_DOWN,
MODEL_TENSOR.V_RESMPL_POS_EMBD_K,
MODEL_TENSOR.V_RESMPL_ATTN_Q,
MODEL_TENSOR.V_RESMPL_ATTN_K,
MODEL_TENSOR.V_RESMPL_ATTN_V,
MODEL_TENSOR.V_RESMPL_ATTN_OUT,
MODEL_TENSOR.V_RESMPL_KV,
MODEL_TENSOR.V_RESMPL_KV_NORM,
MODEL_TENSOR.V_RESMPL_POST_NORM,
MODEL_TENSOR.V_RESMPL_Q_NORM,
MODEL_TENSOR.V_RESMPL_PROJ,
MODEL_TENSOR.V_RESMPL_QUERY,
MODEL_TENSOR.V_TOK_EMBD_IMAGE,
MODEL_TENSOR.V_TOK_EMBD_END_IMAGE,
MODEL_TENSOR.V_TOK_EMBD_SLICE,
MODEL_TENSOR.V_TOK_EMBD_END_SLICE,
],
MODEL_ARCH.VISION_IDEFICS3: [
MODEL_TENSOR.V_MMPROJ_FC,
MODEL_TENSOR.V_ENC_EMBD_PATCH,
MODEL_TENSOR.V_ENC_EMBD_POS,
MODEL_TENSOR.V_ENC_ATTN_Q,
MODEL_TENSOR.V_ENC_ATTN_K,
MODEL_TENSOR.V_ENC_ATTN_V,
MODEL_TENSOR.V_ENC_INPUT_NORM,
MODEL_TENSOR.V_ENC_OUTPUT,
MODEL_TENSOR.V_ENC_OUTPUT_NORM,
MODEL_TENSOR.V_ENC_FFN_UP,
MODEL_TENSOR.V_ENC_FFN_DOWN,
MODEL_TENSOR.V_POST_NORM,
],
# TODO
}
@ -1618,6 +1798,18 @@ class PoolingType(IntEnum):
CLS = 2
class CLIPProjectorType(Enum):
MLP = 'mlp'
LDPV2 = 'ldpv2'
MINICPMV_2_5 = 'minicpmv-2.5' # resampler
MINICPMV_2_6 = 'minicpmv-2.6' # resampler
class CLIPPatchMergeType(Enum):
FLAT = 'flat'
SPATIAL_UNPAD = 'spatial_unpad'
class GGMLQuantizationType(IntEnum):
F32 = 0
F16 = 1


@ -27,6 +27,8 @@ from .constants import (
PoolingType,
TokenType,
ExpertGatingFuncType,
CLIPPatchMergeType,
CLIPProjectorType,
)
from .quants import quant_shape_from_byte_shape
@ -875,6 +877,60 @@ class GGUFWriter:
def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None:
self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap)
def add_vision_type(self, value: str) -> None:
self.add_string(Keys.Vision.TYPE, value)
def add_vision_image_size(self, value: int) -> None:
self.add_uint32(Keys.Vision.IMAGE_SIZE, value)
def add_vision_patch_size(self, value: int) -> None:
self.add_uint32(Keys.Vision.PATCH_SIZE, value)
def add_vision_vit_architecture(self, value: str) -> None:
self.add_string(Keys.Vision.Vit.ARCHITECTURE, value)
def add_vision_vit_context_length(self, value: int) -> None:
self.add_uint32(Keys.Vision.Vit.CONTEXT_LENGTH, value)
def add_vision_vit_embedding_length(self, value: int) -> None:
self.add_uint32(Keys.Vision.Vit.EMBEDDING_LENGTH, value)
def add_vision_vit_block_count(self, value: int) -> None:
self.add_uint32(Keys.Vision.Vit.BLOCK_COUNT, value)
def add_vision_vit_feed_forward_length(self, value: int) -> None:
self.add_uint32(Keys.Vision.Vit.FEED_FORWARD_LENGTH, value)
def add_vision_vit_head_count(self, value: int) -> None:
self.add_uint32(Keys.Vision.Vit.HEAD_COUNT, value)
def add_vision_vit_max_position_embeddings(self, value: int) -> None:
self.add_uint32(Keys.Vision.Vit.MAX_POS_EMBEDDING, value)
def add_vision_vit_projector_type(self, value: CLIPProjectorType) -> None:
self.add_string(Keys.Vision.Vit.PROJECTOR_TYPE, value.value)
def add_vision_vit_max_slices(self, value: int) -> None:
self.add_uint32(Keys.Vision.Vit.MAX_SLICES, value)
def add_vision_vit_select_layer(self, value: int) -> None:
self.add_int32(Keys.Vision.Vit.SELECT_LAYER, value)
def add_vision_vit_patch_merge_type(self, value: CLIPPatchMergeType) -> None:
self.add_string(Keys.Vision.Vit.PATCH_MERGE_TYPE, value.value)
def add_vision_vit_layer_norm_epsilon(self, value: float) -> None:
self.add_float32(Keys.Vision.Vit.LAYERNORM_EPS, value)
def add_vision_vit_image_mean(self, value: Sequence[float]) -> None:
self.add_array(Keys.Vision.IMAGE_MEAN, value)
def add_vision_vit_image_std(self, value: Sequence[float]) -> None:
self.add_array(Keys.Vision.IMAGE_STD, value)
def add_vision_vit_scale_factor(self, value: int) -> None:
self.add_int32(Keys.Vision.Vit.SCALE_FACTOR, value)
def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
if not isinstance(value, str):
template_default = None


@ -787,6 +787,157 @@ class TensorNameMap:
MODEL_TENSOR.POSNET_ATTN_OUT: (
"backbone.posnet.{bid}.proj_out", # wavtokenizer
),
#############################################################################
MODEL_TENSOR.V_MMPROJ: (
"multi_modal_projector.linear_{bid}",
),
MODEL_TENSOR.V_MMPROJ_FC: (
"model.connector.modality_projection.proj", # SmolVLM
),
MODEL_TENSOR.V_MMPROJ_MLP: (
"model.mm_projector.mlp.mlp.{bid}",
),
MODEL_TENSOR.V_MMPROJ_PEG: (
"model.mm_projector.peg.peg.{bid}",
),
MODEL_TENSOR.V_ENC_EMBD_CLS: (
"vision_tower.vision_model.embeddings.class_embedding",
),
MODEL_TENSOR.V_ENC_EMBD_PATCH: (
"vision_tower.vision_model.embeddings.patch_embedding",
"vpm.embeddings.patch_embedding",
"model.vision_model.embeddings.patch_embedding", # SmolVLM
),
MODEL_TENSOR.V_ENC_EMBD_POS: (
"vision_tower.vision_model.embeddings.position_embedding",
"vpm.embeddings.position_embedding",
"model.vision_model.embeddings.position_embedding", # SmolVLM
),
MODEL_TENSOR.V_ENC_ATTN_Q: (
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
"vpm.encoder.layers.{bid}.self_attn.q_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
),
MODEL_TENSOR.V_ENC_ATTN_K: (
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
"vpm.encoder.layers.{bid}.self_attn.k_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
),
MODEL_TENSOR.V_ENC_ATTN_V: (
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
"vpm.encoder.layers.{bid}.self_attn.v_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
),
MODEL_TENSOR.V_ENC_INPUT_NORM: (
"vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
"vpm.encoder.layers.{bid}.layer_norm1",
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
),
MODEL_TENSOR.V_ENC_OUTPUT: (
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
"vpm.encoder.layers.{bid}.self_attn.out_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
),
MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
"vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
"vpm.encoder.layers.{bid}.layer_norm2",
"model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
),
MODEL_TENSOR.V_ENC_FFN_UP: (
"vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
"vpm.encoder.layers.{bid}.mlp.fc1",
"model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM
),
MODEL_TENSOR.V_ENC_FFN_DOWN: (
"vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
"vpm.encoder.layers.{bid}.mlp.fc2",
"model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM
),
MODEL_TENSOR.V_PRE_NORM: (
"vision_tower.vision_model.pre_layrnorm",
),
MODEL_TENSOR.V_POST_NORM: (
"vision_tower.vision_model.post_layernorm",
"model.vision_model.post_layernorm", # SmolVLM
),
MODEL_TENSOR.V_RESMPL_POS_EMBD_K: (
"resampler.pos_embed_k",
),
MODEL_TENSOR.V_RESMPL_ATTN_Q: (
"resampler.attn.in_proj_q", # tensor generated from resampler.attn.in_proj
),
MODEL_TENSOR.V_RESMPL_ATTN_K: (
"resampler.attn.in_proj_k", # tensor generated from resampler.attn.in_proj
),
MODEL_TENSOR.V_RESMPL_ATTN_V: (
"resampler.attn.in_proj_v", # tensor generated from resampler.attn.in_proj
),
MODEL_TENSOR.V_RESMPL_ATTN_OUT: (
"resampler.attn.out_proj",
),
MODEL_TENSOR.V_RESMPL_KV: (
"resampler.kv_proj",
),
MODEL_TENSOR.V_RESMPL_POST_NORM: (
"resampler.ln_post",
),
MODEL_TENSOR.V_RESMPL_KV_NORM: (
"resampler.ln_kv",
),
MODEL_TENSOR.V_RESMPL_Q_NORM: (
"resampler.ln_q",
),
MODEL_TENSOR.V_RESMPL_PROJ: (
"resampler.proj",
),
MODEL_TENSOR.V_RESMPL_QUERY: (
"resampler.query",
),
MODEL_TENSOR.V_TOK_EMBD_IMAGE:(
"v.tok_embd.image", # tensor generated from token embeddings
),
MODEL_TENSOR.V_TOK_EMBD_END_IMAGE:(
"v.tok_embd.end_image", # tensor generated from token embeddings
),
MODEL_TENSOR.V_TOK_EMBD_SLICE:(
"v.tok_embd.slice", # tensor generated from token embeddings
),
MODEL_TENSOR.V_TOK_EMBD_END_SLICE:(
"v.tok_embd.end_slice", # tensor generated from token embeddings
),
}
# architecture-specific block mappings


@ -229,6 +229,20 @@ extern "C" {
bool sorted;
} llama_token_data_array;
struct llama_vision_context;
// Structure representing the basic input unit of the vision model
// This can be a processed image or slices of images under the hood
struct llama_vision_tokens;
// Represents an RGB image
// size of data must be equal to 3*nx*ny
typedef struct llama_vision_bitmap {
uint32_t nx;
uint32_t ny;
unsigned char * data;
} llama_vision_bitmap;
typedef bool (*llama_progress_callback)(float progress, void * user_data);
// Input data for llama_decode
@ -253,6 +267,8 @@ extern "C" {
int32_t * n_seq_id;
llama_seq_id ** seq_id;
int8_t * logits; // TODO: rename this to "output"
struct ggml_tensor * embd_tensor; // non-NULL when the batch carries embeddings as a single ggml tensor (vision API)
} llama_batch;
enum llama_model_kv_override_type {
@ -351,6 +367,10 @@ extern "C" {
void * abort_callback_data;
};
struct llama_vision_context_params {
int32_t n_threads;
};
// model quantization parameters
typedef struct llama_model_quantize_params {
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
@ -388,6 +408,7 @@ extern "C" {
// TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
LLAMA_API struct llama_model_params llama_model_default_params(void);
LLAMA_API struct llama_context_params llama_context_default_params(void);
LLAMA_API struct llama_vision_context_params llama_vision_context_default_params(void);
LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void);
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
@ -845,6 +866,10 @@ extern "C" {
int32_t embd,
int32_t n_seq_max);
// Allocates a batch based on a tensor; only used by the vision API for now
// Unlike llama_batch_get_one, the returned batch must be freed with llama_batch_free after use
LLAMA_API struct llama_batch llama_batch_get_one_from_tensor(struct ggml_tensor * tensor, int32_t p0, int32_t seq_id);
// Frees a batch of tokens allocated with llama_batch_init()
LLAMA_API void llama_batch_free(struct llama_batch batch);
@ -1276,6 +1301,35 @@ extern "C" {
// TODO: extend in the future
//LLAMA_API void llama_decode_with_sampler(struct llama_context * ctx, struct llama_sampler * smpl, struct llama_batch batch, ...);
//
// Vision API
//
// Vision context
LLAMA_API struct llama_vision_context * llama_vision_init_from_model(
const struct llama_model * model,
struct llama_vision_context_params params);
LLAMA_API void llama_vision_free(struct llama_vision_context * ctx);
// Container for RGB bitmap
LLAMA_API struct llama_vision_bitmap * llama_vision_bitmap_init(uint32_t nx, uint32_t ny);
LLAMA_API void llama_vision_bitmap_free(struct llama_vision_bitmap * bmp);
// Create image tokens from the RGB bitmap
LLAMA_API struct llama_vision_tokens * llama_vision_tokenize(
struct llama_vision_context * ctx,
struct llama_vision_bitmap * bmp);
LLAMA_API void llama_vision_tokens_free(struct llama_vision_tokens * img_tokens);
// The user must reserve N tokens in the tokenized text prompt for each image
// LLAMA_API int32_t llama_vision_get_n_tokens(const llama_vision_img_tokens * img_tokens);
// Encode patches into embeddings
LLAMA_API int32_t llama_vision_encode(
struct llama_vision_context * ctx,
struct llama_vision_tokens * img_tokens);
LLAMA_API struct ggml_tensor * llama_vision_get_output_tensor(struct llama_vision_context * ctx);
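// Typical usage: llama_vision_init_from_model -> llama_vision_bitmap_init (fill `data`)
//  -> llama_vision_tokenize -> llama_vision_encode -> llama_vision_get_output_tensor
//  -> llama_batch_get_one_from_tensor -> llama_decode (see examples/vision/vision.cpp)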
//
// Model split
//


@ -24,6 +24,7 @@ add_library(llama
llama-quant.cpp
llama-sampling.cpp
llama-vocab.cpp
llama-vision.cpp
unicode.h
unicode.cpp
unicode-data.cpp


@ -3,6 +3,7 @@
#include "llama-impl.h"
#include <map>
#include <exception>
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" },
@ -62,6 +63,10 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_VISION_LLAVA, "llava" },
{ LLM_ARCH_VISION_MOBILEVLM, "mobilevlm" },
{ LLM_ARCH_VISION_MINICPMV, "minicpmv" },
{ LLM_ARCH_VISION_IDEFICS3, "idefics3" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@ -190,6 +195,28 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ADAPTER_TYPE, "adapter.type" },
{ LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
{ LLM_KV_VISION_TYPE, "vision.type" },
{ LLM_KV_VISION_IMAGE_SIZE, "vision.image_size" },
{ LLM_KV_VISION_PATCH_SIZE, "vision.patch_size" },
{ LLM_KV_VISION_IMAGE_MEAN, "vision.image_mean" },
{ LLM_KV_VISION_IMAGE_STD, "vision.image_std" },
{ LLM_KV_VISION_VIT_ARCHITECTURE, "vision.vit.architecture" },
{ LLM_KV_VISION_VIT_CONTEXT_LENGTH, "vision.vit.context_length" },
{ LLM_KV_VISION_VIT_EMBEDDING_LENGTH, "vision.vit.embedding_length" },
{ LLM_KV_VISION_VIT_BLOCK_COUNT, "vision.vit.block_count" },
{ LLM_KV_VISION_VIT_FEED_FORWARD_LENGTH, "vision.vit.feed_forward_length" },
{ LLM_KV_VISION_VIT_PROJECTION_TYPE, "vision.vit.projection_type" },
{ LLM_KV_VISION_VIT_PROJECTION_DIM, "vision.vit.projection_dim" },
{ LLM_KV_VISION_VIT_USE_GELU, "vision.vit.use_gelu" },
{ LLM_KV_VISION_VIT_MAX_POS_EMBD, "vision.vit.max_position_embeddings" },
{ LLM_KV_VISION_VIT_MAX_SLICES, "vision.vit.max_slices" },
{ LLM_KV_VISION_VIT_PROJECTOR_TYPE, "vision.vit.projector_type" },
{ LLM_KV_VISION_VIT_SELECT_LAYER, "vision.vit.select_layer" },
{ LLM_KV_VISION_VIT_PATCH_MERGE_TYPE, "vision.vit.patch_merge_type" },
{ LLM_KV_VISION_VIT_HEAD_COUNT, "vision.vit.attention.head_count" },
{ LLM_KV_VISION_VIT_LAYERNORM_EPS, "vision.vit.attention.layer_norm_epsilon" },
{ LLM_KV_VISION_VIT_SCALE_FACTOR, "vision.vit.scale_factor" },
// deprecated
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
{ LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
@ -1296,6 +1323,95 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
},
},
// vision
{
LLM_ARCH_VISION_LLAVA,
{
{ LLM_TENSOR_V_MMPROJ, "v.mmproj_%d" },
{ LLM_TENSOR_V_ENC_EMBD_CLS, "v.enc.embd.cls" },
{ LLM_TENSOR_V_ENC_EMBD_PATCH, "v.enc.embd.patch" },
{ LLM_TENSOR_V_ENC_EMBD_POS, "v.enc.embd.pos" },
{ LLM_TENSOR_V_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" },
{ LLM_TENSOR_V_ENC_ATTN_K, "v.enc.blk.%d.attn_k" },
{ LLM_TENSOR_V_ENC_ATTN_V, "v.enc.blk.%d.attn_v" },
{ LLM_TENSOR_V_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" },
{ LLM_TENSOR_V_ENC_OUTPUT, "v.enc.blk.%d.output" },
{ LLM_TENSOR_V_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" },
{ LLM_TENSOR_V_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" },
{ LLM_TENSOR_V_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" },
{ LLM_TENSOR_V_PRE_NORM, "v.pre_norm" },
{ LLM_TENSOR_V_POST_NORM, "v.post_norm" },
}
},
{
LLM_ARCH_VISION_MOBILEVLM,
{
{ LLM_TENSOR_V_MMPROJ_MLP, "v.mmproj.mlp.%d" },
{ LLM_TENSOR_V_MMPROJ_PEG, "v.mmproj.peg.%d" },
{ LLM_TENSOR_V_ENC_EMBD_CLS, "v.enc.embd.cls" },
{ LLM_TENSOR_V_ENC_EMBD_PATCH, "v.enc.embd.patch" },
{ LLM_TENSOR_V_ENC_EMBD_POS, "v.enc.embd.pos" },
{ LLM_TENSOR_V_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" },
{ LLM_TENSOR_V_ENC_ATTN_K, "v.enc.blk.%d.attn_k" },
{ LLM_TENSOR_V_ENC_ATTN_V, "v.enc.blk.%d.attn_v" },
{ LLM_TENSOR_V_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" },
{ LLM_TENSOR_V_ENC_OUTPUT, "v.enc.blk.%d.output" },
{ LLM_TENSOR_V_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" },
{ LLM_TENSOR_V_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" },
{ LLM_TENSOR_V_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" },
{ LLM_TENSOR_V_PRE_NORM, "v.pre_norm" },
{ LLM_TENSOR_V_POST_NORM, "v.post_norm" },
}
},
{
LLM_ARCH_VISION_MINICPMV,
{
{ LLM_TENSOR_V_ENC_EMBD_PATCH, "v.enc.embd.patch" },
{ LLM_TENSOR_V_ENC_EMBD_POS, "v.enc.embd.pos" },
{ LLM_TENSOR_V_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" },
{ LLM_TENSOR_V_ENC_ATTN_K, "v.enc.blk.%d.attn_k" },
{ LLM_TENSOR_V_ENC_ATTN_V, "v.enc.blk.%d.attn_v" },
{ LLM_TENSOR_V_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" },
{ LLM_TENSOR_V_ENC_OUTPUT, "v.enc.blk.%d.output" },
{ LLM_TENSOR_V_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" },
{ LLM_TENSOR_V_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" },
{ LLM_TENSOR_V_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" },
{ LLM_TENSOR_V_RESMPL_POS_EMBD_K, "v.resmpl.pos_embd_k" },
{ LLM_TENSOR_V_RESMPL_ATTN_Q, "v.resmpl.attn_q" },
{ LLM_TENSOR_V_RESMPL_ATTN_K, "v.resmpl.attn_k" },
{ LLM_TENSOR_V_RESMPL_ATTN_V, "v.resmpl.attn_v" },
{ LLM_TENSOR_V_RESMPL_ATTN_OUT, "v.resmpl.attn_out" },
{ LLM_TENSOR_V_RESMPL_KV, "v.resmpl.kv" },
{ LLM_TENSOR_V_RESMPL_KV_NORM, "v.resmpl.kv_norm" },
{ LLM_TENSOR_V_RESMPL_POST_NORM, "v.resmpl.post_norm" },
{ LLM_TENSOR_V_RESMPL_Q_NORM, "v.resmpl.q_norm" },
{ LLM_TENSOR_V_RESMPL_PROJ, "v.resmpl.proj" },
{ LLM_TENSOR_V_RESMPL_QUERY, "v.resmpl.query" },
{ LLM_TENSOR_V_TOK_EMBD_IMAGE, "v.tok_embd.image" },
{ LLM_TENSOR_V_TOK_EMBD_END_IMAGE, "v.tok_embd.end_image" },
{ LLM_TENSOR_V_TOK_EMBD_SLICE, "v.tok_embd.slice" },
{ LLM_TENSOR_V_TOK_EMBD_END_SLICE, "v.tok_embd.end_slice" },
}
},
{
LLM_ARCH_VISION_IDEFICS3,
{
{ LLM_TENSOR_V_MMPROJ_FC, "v.mmproj.fc" },
{ LLM_TENSOR_V_ENC_EMBD_CLS, "v.enc.embd.cls" },
{ LLM_TENSOR_V_ENC_EMBD_PATCH, "v.enc.embd.patch" },
{ LLM_TENSOR_V_ENC_EMBD_POS, "v.enc.embd.pos" },
{ LLM_TENSOR_V_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" },
{ LLM_TENSOR_V_ENC_ATTN_K, "v.enc.blk.%d.attn_k" },
{ LLM_TENSOR_V_ENC_ATTN_V, "v.enc.blk.%d.attn_v" },
{ LLM_TENSOR_V_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" },
{ LLM_TENSOR_V_ENC_OUTPUT, "v.enc.blk.%d.output" },
{ LLM_TENSOR_V_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" },
{ LLM_TENSOR_V_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" },
{ LLM_TENSOR_V_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" },
{ LLM_TENSOR_V_PRE_NORM, "v.pre_norm" },
{ LLM_TENSOR_V_POST_NORM, "v.post_norm" },
}
},
{
LLM_ARCH_UNKNOWN,
{
@ -1445,6 +1561,39 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
// vision
{LLM_TENSOR_V_MMPROJ, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}},
{LLM_TENSOR_V_MMPROJ_MLP, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}},
{LLM_TENSOR_V_MMPROJ_PEG, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}},
{LLM_TENSOR_V_ENC_EMBD_CLS, {LLM_TENSOR_LAYER_INPUT, GGML_OP_ADD}},
{LLM_TENSOR_V_ENC_EMBD_PATCH, {LLM_TENSOR_LAYER_INPUT, GGML_OP_ADD}},
{LLM_TENSOR_V_ENC_EMBD_POS, {LLM_TENSOR_LAYER_INPUT, GGML_OP_ADD}},
{LLM_TENSOR_V_ENC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_V_ENC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_V_ENC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_V_ENC_INPUT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_V_ENC_OUTPUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_V_ENC_OUTPUT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_V_ENC_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_V_ENC_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_V_PRE_NORM, {LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL}},
{LLM_TENSOR_V_POST_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
{LLM_TENSOR_V_RESMPL_POS_EMBD_K, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_ADD}},
{LLM_TENSOR_V_RESMPL_ATTN_Q, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}},
{LLM_TENSOR_V_RESMPL_ATTN_K, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}},
{LLM_TENSOR_V_RESMPL_ATTN_V, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}},
{LLM_TENSOR_V_RESMPL_ATTN_OUT, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}},
{LLM_TENSOR_V_RESMPL_KV, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}},
{LLM_TENSOR_V_RESMPL_KV_NORM, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL}},
{LLM_TENSOR_V_RESMPL_POST_NORM, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL}},
{LLM_TENSOR_V_RESMPL_Q_NORM, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL}},
{LLM_TENSOR_V_RESMPL_PROJ, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}},
{LLM_TENSOR_V_RESMPL_QUERY, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}},
// special token embeddings for image
{LLM_TENSOR_V_TOK_EMBD_IMAGE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_CONCAT}},
{LLM_TENSOR_V_TOK_EMBD_END_IMAGE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_CONCAT}},
{LLM_TENSOR_V_TOK_EMBD_SLICE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_CONCAT}},
{LLM_TENSOR_V_TOK_EMBD_END_SLICE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_CONCAT}},
};
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}


@ -66,6 +66,11 @@ enum llm_arch {
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
LLM_ARCH_WAVTOKENIZER_DEC,
// vision
LLM_ARCH_VISION_LLAVA,
LLM_ARCH_VISION_MOBILEVLM,
LLM_ARCH_VISION_MINICPMV,
LLM_ARCH_VISION_IDEFICS3,
LLM_ARCH_UNKNOWN,
};
@ -194,6 +199,28 @@ enum llm_kv {
LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
LLM_KV_CONVNEXT_BLOCK_COUNT,
LLM_KV_VISION_TYPE,
LLM_KV_VISION_IMAGE_SIZE,
LLM_KV_VISION_PATCH_SIZE,
LLM_KV_VISION_IMAGE_MEAN,
LLM_KV_VISION_IMAGE_STD,
LLM_KV_VISION_VIT_ARCHITECTURE,
LLM_KV_VISION_VIT_CONTEXT_LENGTH,
LLM_KV_VISION_VIT_EMBEDDING_LENGTH,
LLM_KV_VISION_VIT_BLOCK_COUNT,
LLM_KV_VISION_VIT_FEED_FORWARD_LENGTH,
LLM_KV_VISION_VIT_PROJECTION_TYPE,
LLM_KV_VISION_VIT_PROJECTION_DIM,
LLM_KV_VISION_VIT_USE_GELU,
LLM_KV_VISION_VIT_MAX_POS_EMBD,
LLM_KV_VISION_VIT_MAX_SLICES,
LLM_KV_VISION_VIT_PROJECTOR_TYPE,
LLM_KV_VISION_VIT_SELECT_LAYER,
LLM_KV_VISION_VIT_PATCH_MERGE_TYPE,
LLM_KV_VISION_VIT_HEAD_COUNT,
LLM_KV_VISION_VIT_LAYERNORM_EPS,
LLM_KV_VISION_VIT_SCALE_FACTOR,
// deprecated:
LLM_KV_TOKENIZER_PREFIX_ID,
LLM_KV_TOKENIZER_SUFFIX_ID,
@ -327,11 +354,46 @@ enum llm_tensor {
LLM_TENSOR_POS_NET_ATTN_K,
LLM_TENSOR_POS_NET_ATTN_V,
LLM_TENSOR_POS_NET_ATTN_OUT,
// vision
LLM_TENSOR_V_MMPROJ,
LLM_TENSOR_V_MMPROJ_FC,
LLM_TENSOR_V_MMPROJ_MLP,
LLM_TENSOR_V_MMPROJ_PEG,
LLM_TENSOR_V_ENC_EMBD_CLS,
LLM_TENSOR_V_ENC_EMBD_PATCH,
LLM_TENSOR_V_ENC_EMBD_POS,
LLM_TENSOR_V_ENC_ATTN_Q,
LLM_TENSOR_V_ENC_ATTN_K,
LLM_TENSOR_V_ENC_ATTN_V,
LLM_TENSOR_V_ENC_INPUT_NORM,
LLM_TENSOR_V_ENC_OUTPUT,
LLM_TENSOR_V_ENC_OUTPUT_NORM,
LLM_TENSOR_V_ENC_FFN_UP,
LLM_TENSOR_V_ENC_FFN_DOWN,
LLM_TENSOR_V_PRE_NORM,
LLM_TENSOR_V_POST_NORM,
// vision - minicpmv
LLM_TENSOR_V_RESMPL_POS_EMBD_K,
LLM_TENSOR_V_RESMPL_ATTN_Q,
LLM_TENSOR_V_RESMPL_ATTN_K,
LLM_TENSOR_V_RESMPL_ATTN_V,
LLM_TENSOR_V_RESMPL_ATTN_OUT,
LLM_TENSOR_V_RESMPL_KV,
LLM_TENSOR_V_RESMPL_KV_NORM,
LLM_TENSOR_V_RESMPL_POST_NORM,
LLM_TENSOR_V_RESMPL_Q_NORM,
LLM_TENSOR_V_RESMPL_PROJ,
LLM_TENSOR_V_RESMPL_QUERY,
LLM_TENSOR_V_TOK_EMBD_IMAGE,
LLM_TENSOR_V_TOK_EMBD_END_IMAGE,
LLM_TENSOR_V_TOK_EMBD_SLICE,
LLM_TENSOR_V_TOK_EMBD_END_SLICE,
};
enum llm_tensor_layer {
LLM_TENSOR_LAYER_INPUT,
LLM_TENSOR_LAYER_REPEATING,
LLM_TENSOR_LAYER_PROJECTION,
LLM_TENSOR_LAYER_OUTPUT,
};


@ -31,6 +31,7 @@ llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) {
/*n_seq_id =*/ ubatch_n_seq_id.data(),
/*seq_id =*/ ubatch_seq_id.data(),
/*output =*/ ubatch_output.data(),
/*embd_tensor =*/ nullptr,
};
return ubatch;
}
@ -55,7 +56,9 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s
} else {
ubatch.token = nullptr;
}
if (batch->embd) {
if (batch->embd_tensor) {
ubatch.embd_tensor = batch->embd_tensor;
} else if (batch->embd) {
if (ubatch.equal_seqs) {
for (size_t i = 0; i < length; ++i) {
memcpy(
@ -139,7 +142,7 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s
llama_ubatch llama_sbatch::split_simple(size_t n_ubatch) {
n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr || batch->embd_tensor != nullptr);
ubatch.equal_seqs = false;
if (!seq.empty()) {
llama_sbatch_seq & s = seq[0];
@ -152,7 +155,7 @@ llama_ubatch llama_sbatch::split_simple(size_t n_ubatch) {
llama_ubatch llama_sbatch::split_equal(size_t n_ubatch) {
n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr || batch->embd_tensor != nullptr);
if (!seq.empty()) {
size_t length = 0;
size_t n_tokens_in_ubatch = 0;
@@ -179,7 +182,7 @@ llama_ubatch llama_sbatch::split_equal(size_t n_ubatch) {
llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr || batch->embd_tensor != nullptr);
if (!seq.empty()) {
llama_sbatch_seq & s = seq[seq.size() - 1];
size_t length = s.length < n_ubatch ? s.length : n_ubatch;
@@ -320,6 +323,7 @@ struct llama_batch llama_batch_get_one(
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
/*logits =*/ nullptr,
/*embd_tensor =*/ nullptr,
};
}
@@ -332,6 +336,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
/*logits =*/ nullptr,
/*embd_tensor =*/ nullptr,
};
if (embd) {
@@ -353,6 +358,35 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
return batch;
}
struct llama_batch llama_batch_get_one_from_tensor(struct ggml_tensor * tensor, int32_t p0, int32_t seq_id) {
GGML_ASSERT(tensor->ne[2] == 1 && tensor->ne[3] == 1);
int32_t n_tokens = tensor->ne[1];
llama_batch batch = {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ nullptr,
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
/*logits =*/ nullptr,
/*embd_tensor =*/ tensor,
};
batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens);
batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens + 1));
for (int i = 0; i < n_tokens; ++i) {
batch.pos [i] = p0 + i;
batch.seq_id [i] = (llama_seq_id *) malloc(sizeof(llama_seq_id));
batch.seq_id [i][0] = seq_id;
batch.n_seq_id[i] = 1;
}
batch.seq_id[n_tokens] = nullptr;
batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
return batch;
}
void llama_batch_free(struct llama_batch batch) {
if (batch.token) free(batch.token);
if (batch.embd) free(batch.embd);
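As a usage note for the new tensor-backed batch constructor above: the following is a minimal sketch, not code from this commit, showing how llama_batch_get_one_from_tensor() could be combined with llama_vision_get_output_tensor() (declared later in llama-vision.h) to feed image embeddings into the decoder. The helper name decode_image_embeddings is hypothetical, and the vision-encode step that produces the output tensor is assumed to have already run; whether llama_vision_get_output_tensor() is exposed publicly is also an assumption.

// Hedged sketch (assumptions noted above); builds against llama.h plus the
// internal llama-vision.h for llama_vision_get_output_tensor().
static int decode_image_embeddings(llama_context * ctx, llama_pos p0, llama_seq_id seq_id) {
    // pick up the embeddings left behind by a previous vision-encode pass
    struct ggml_tensor * embd = llama_vision_get_output_tensor(ctx);
    if (embd == nullptr) {
        return -1; // no vision output available
    }

    // wrap the [n_embd, n_tokens] tensor as a batch; positions start at p0
    llama_batch batch = llama_batch_get_one_from_tensor(embd, p0, seq_id);

    const int ret = llama_decode(ctx, batch);

    // release the pos/seq_id/logits arrays allocated by the constructor above
    llama_batch_free(batch);
    return ret;
}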

View file

@@ -21,6 +21,8 @@ struct llama_ubatch {
int32_t * n_seq_id; // [n_seqs]
llama_seq_id ** seq_id; // [n_seqs]
int8_t * output; // [n_tokens]
struct ggml_tensor * embd_tensor;
};
struct llama_sbatch_seq {

View file

@@ -73,7 +73,7 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
ggml_backend_tensor_set(lctx.inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
}
if (ubatch.embd) {
if (ubatch.embd && !ubatch.embd_tensor) {
const int64_t n_embd = hparams.n_embd;
const int64_t n_tokens = ubatch.n_tokens;

View file

@@ -6,6 +6,7 @@
#include "llama-model.h"
#include "llama-kv-cache.h"
#include "llama-adapter.h"
#include "llama-vision.h"
#include "ggml-cpp.h"

View file

@@ -96,7 +96,7 @@ struct llama_hparams {
float f_max_alibi_bias = 0.0f;
float f_logit_scale = 0.0f;
// Additional scale factors (Granite/Granite MoE)
// Additional scale factors (Granite/Granite MoE/MiniCPM)
float f_residual_scale = 0.0f;
float f_embedding_scale = 0.0f;
float f_attention_scale = 0.0f;

View file

@@ -375,6 +375,7 @@ namespace GGUFMeta {
template bool llama_model_loader::get_key<bool> (enum llm_kv kid, bool & result, bool required);
template bool llama_model_loader::get_key<float> (enum llm_kv kid, float & result, bool required);
template bool llama_model_loader::get_key<int32_t> (enum llm_kv kid, int32_t & result, bool required);
template bool llama_model_loader::get_key<uint32_t> (enum llm_kv kid, uint32_t & result, bool required);
template bool llama_model_loader::get_key<std::string>(enum llm_kv kid, std::string & result, bool required);
@@ -439,6 +440,7 @@ namespace GGUFMeta {
// TODO: this is not very clever - figure out something better
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<float, 3>>(enum llm_kv kid, std::array<float, 3> & result, uint32_t n, bool required);
llama_model_loader::llama_model_loader(
const std::string & fname,

View file

@@ -2,6 +2,7 @@
#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-vision.h"
#include "llama-model-loader.h"
#include "ggml-cpp.h"
@@ -216,6 +217,11 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
} break;
case GGML_OP_CONCAT:
{
ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
op_tensor = ggml_concat(ctx, w, b, 0);
} break;
default:
GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
}
@@ -1257,6 +1263,56 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
hparams.rope_type = llama_model_rope_type(this);
// vision model
auto & vparams = vit.hparams;
std::string vision_type;
ml.get_key(LLM_KV_VISION_TYPE, vision_type, false);
if (vision_type == "vit") {
LLAMA_LOG_INFO("%s: loading ViT vision model\n", __func__);
has_vision = true;
ml.get_key(LLM_KV_VISION_IMAGE_SIZE, vparams.image_size, true);
ml.get_key(LLM_KV_VISION_PATCH_SIZE, vparams.patch_size, true);
ml.get_key_or_arr(LLM_KV_VISION_IMAGE_MEAN, vparams.image_mean, 3, true);
ml.get_key_or_arr(LLM_KV_VISION_IMAGE_STD, vparams.image_std, 3, true);
ml.get_key(LLM_KV_VISION_VIT_EMBEDDING_LENGTH, vparams.hidden_size, true);
ml.get_key(LLM_KV_VISION_VIT_BLOCK_COUNT, vparams.n_layer, true);
ml.get_key(LLM_KV_VISION_VIT_FEED_FORWARD_LENGTH, vparams.n_intermediate, true);
ml.get_key(LLM_KV_VISION_VIT_HEAD_COUNT, vparams.n_head, true);
ml.get_key(LLM_KV_VISION_VIT_LAYERNORM_EPS, vparams.eps, true);
ml.get_key(LLM_KV_VISION_VIT_SELECT_LAYER, vparams.select_layer, true);
ml.get_key(LLM_KV_VISION_VIT_MAX_POS_EMBD, vparams.max_pos_embd, true);
ml.get_key(LLM_KV_VISION_VIT_SCALE_FACTOR, vparams.scale_factor, false);
{
std::string name;
ml.get_key(LLM_KV_VISION_VIT_PROJECTOR_TYPE, name, true);
vparams.proj_type = vision_projector_type_from_name(name);
if (vparams.proj_type == VISION_PROJECTOR_TYPE_UNKNOWN) {
throw std::runtime_error(format("unsupported clip projector type: %s", name.c_str()));
}
}
{
std::string name;
ml.get_key(LLM_KV_VISION_VIT_PATCH_MERGE_TYPE, name, false);
vparams.mm_patch_merge_type = mm_patch_merge_from_name(name);
}
{
std::string arch;
ml.get_key(LLM_KV_VISION_VIT_ARCHITECTURE, arch, true);
vparams.arch = llm_arch_from_string(arch);
if (vparams.arch == LLM_ARCH_UNKNOWN) {
throw std::runtime_error(format("unsupported vision arch: %s", arch.c_str()));
}
}
} else if (!vision_type.empty()) {
throw std::runtime_error(format("unsupported vision type: %s", vision_type.c_str()));
}
// arch-specific CLIP hparams
// switch (vparams.arch) {
// case VISION_ARCH_LLAVA:
// default: (void)0;
// }
}
void llama_model::load_vocab(llama_model_loader & ml) {
@@ -1434,7 +1490,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
// sanity checks
if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
if (info.layer == LLM_TENSOR_LAYER_PROJECTION) {
// nothing to check
} else if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
if (tn.bid != -1) {
GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
}
@@ -1456,6 +1514,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
case LLM_TENSOR_LAYER_REPEATING:
buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
break;
case LLM_TENSOR_LAYER_PROJECTION:
buft_list = pimpl->dev_layer.back().buft_list;
break;
default:
GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
}
@@ -3425,6 +3486,179 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
__func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
}
// load tensors for vision model
auto & vparams = vit.hparams;
if (has_vision) {
// language params
const int64_t n_embd = hparams.n_embd;
// vision params
const int64_t n_vlayer = vparams.n_layer;
const int64_t n_vembd = vparams.hidden_size;
const int64_t n_vff = vparams.n_intermediate;
const int64_t max_pos_embd = vparams.max_pos_embd;
const int64_t n_channel = 3; // always RGB
const int64_t patch_size = vparams.patch_size;
const auto tn = LLM_TN(vparams.arch);
// TODO: vit is cpu only for now
vit.buft = ggml_backend_cpu_buffer_type();
vit.layers.resize(n_vlayer);
switch (vparams.arch) {
case LLM_ARCH_VISION_LLAVA:
case LLM_ARCH_VISION_MOBILEVLM:
{
if (vparams.arch == LLM_ARCH_VISION_LLAVA) {
vit.mm_1_w = create_tensor(tn(LLM_TENSOR_V_MMPROJ, "weight", 1), {n_vembd, n_vff}, 0);
vit.mm_1_b = create_tensor(tn(LLM_TENSOR_V_MMPROJ, "bias" , 1), {n_vff}, 0);
vit.mm_2_w = create_tensor(tn(LLM_TENSOR_V_MMPROJ, "weight", 2), {n_vff, n_vff}, 0);
vit.mm_2_b = create_tensor(tn(LLM_TENSOR_V_MMPROJ, "bias" , 2), {n_vff}, 0);
} else if (vparams.arch == LLM_ARCH_VISION_MOBILEVLM) {
vit.mm_model_mlp_0_w = create_tensor(tn(LLM_TENSOR_V_MMPROJ_MLP, "weight", 0), {n_vembd, n_embd}, 0);
vit.mm_model_mlp_0_b = create_tensor(tn(LLM_TENSOR_V_MMPROJ_MLP, "bias", 0), {n_embd}, 0);
vit.mm_model_mlp_2_w = create_tensor(tn(LLM_TENSOR_V_MMPROJ_MLP, "weight", 2), {n_embd, n_embd}, 0);
vit.mm_model_mlp_2_b = create_tensor(tn(LLM_TENSOR_V_MMPROJ_MLP, "bias", 2), {n_embd}, 0);
vit.mm_model_peg_0_w = create_tensor(tn(LLM_TENSOR_V_MMPROJ_PEG, "weight", 0), {n_channel, n_channel, 1, n_embd}, 0);
vit.mm_model_peg_0_b = create_tensor(tn(LLM_TENSOR_V_MMPROJ_PEG, "bias", 0), {n_embd}, 0);
}
vit.class_embedding = create_tensor(tn(LLM_TENSOR_V_ENC_EMBD_CLS ), {n_vembd}, 0);
vit.patch_embeddings = create_tensor(tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd}, 0);
vit.position_embeddings = create_tensor(tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd}, 0);
vit.pre_norm_w = create_tensor(tn(LLM_TENSOR_V_PRE_NORM, "weight"), {n_vembd}, 0);
vit.pre_norm_b = create_tensor(tn(LLM_TENSOR_V_PRE_NORM, "bias" ), {n_vembd}, 0);
vit.post_norm_w = create_tensor(tn(LLM_TENSOR_V_POST_NORM, "weight"), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED);
vit.post_norm_b = create_tensor(tn(LLM_TENSOR_V_POST_NORM, "bias" ), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED);
for (int i = 0; i < n_vlayer; ++i) {
auto & layer = vit.layers[i];
layer.k_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd}, 0);
layer.k_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_K, "bias" , i), {n_vembd}, 0);
layer.v_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd}, 0);
layer.v_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_V, "bias" , i), {n_vembd}, 0);
layer.q_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd}, 0);
layer.q_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_Q, "bias" , i), {n_vembd}, 0);
layer.ffn_up_w = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_UP, "weight", i), {n_vembd, n_vff}, 0);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_UP, "bias" , i), {n_vff}, 0);
layer.ffn_down_w = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_DOWN, "bias" , i), {n_vembd}, 0);
layer.norm_in_w = create_tensor(tn(LLM_TENSOR_V_ENC_INPUT_NORM, "weight", i), {n_vembd}, 0);
layer.norm_in_b = create_tensor(tn(LLM_TENSOR_V_ENC_INPUT_NORM, "bias" , i), {n_vembd}, 0);
layer.norm_out_w = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "weight", i), {n_vembd}, 0);
layer.norm_out_b = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "bias" , i), {n_vembd}, 0);
layer.output_w = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd}, 0);
layer.output_b = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "bias" , i), {n_vembd}, 0);
}
} break;
case LLM_ARCH_VISION_MINICPMV:
{
vit.patch_embeddings = create_tensor(tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd}, 0);
vit.patch_bias = create_tensor(tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "bias" ), {n_vembd}, 0);
vit.position_embeddings = create_tensor(tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd}, 0);
// tok embd
vit.mm_tok_embd_image = create_tensor(tn(LLM_TENSOR_V_TOK_EMBD_IMAGE, "weight"), {n_embd}, 0);
vit.mm_tok_embd_end_image = create_tensor(tn(LLM_TENSOR_V_TOK_EMBD_END_IMAGE, "weight"), {n_embd}, 0);
vit.mm_tok_embd_slice = create_tensor(tn(LLM_TENSOR_V_TOK_EMBD_SLICE, "weight"), {n_embd}, 0);
vit.mm_tok_embd_end_slice = create_tensor(tn(LLM_TENSOR_V_TOK_EMBD_END_SLICE, "weight"), {n_embd}, 0);
for (int i = 0; i < n_vlayer; ++i) {
auto & layer = vit.layers[i];
layer.k_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd}, 0);
layer.k_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_K, "bias" , i), {n_vembd}, 0);
layer.v_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd}, 0);
layer.v_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_V, "bias" , i), {n_vembd}, 0);
layer.q_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd}, 0);
layer.q_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_Q, "bias" , i), {n_vembd}, 0);
layer.ffn_up_w = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_UP, "weight", i), {n_vembd, n_vff}, 0);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_UP, "bias" , i), {n_vff}, 0);
layer.ffn_down_w = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_DOWN, "bias" , i), {n_vembd}, 0);
layer.norm_in_w = create_tensor(tn(LLM_TENSOR_V_ENC_INPUT_NORM, "weight", i), {n_vembd}, 0);
layer.norm_in_b = create_tensor(tn(LLM_TENSOR_V_ENC_INPUT_NORM, "bias" , i), {n_vembd}, 0);
layer.norm_out_w = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "weight", i), {n_vembd}, 0);
layer.norm_out_b = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "bias" , i), {n_vembd}, 0);
layer.output_w = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd}, 0);
layer.output_b = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "bias" , i), {n_vembd}, 0);
}
// resampler, we consider it as one layer on top of the encoder
int il = n_vlayer - 1;
int rs_n_embd = llama_vision_n_mmproj_embd(vit);
vit.mm_model_pos_embed_k = create_tensor(tn(LLM_TENSOR_V_RESMPL_POS_EMBD_K, "weight", il), {rs_n_embd, max_pos_embd}, 0);
vit.mm_model_query = create_tensor(tn(LLM_TENSOR_V_RESMPL_QUERY, "weight", il), {rs_n_embd, 64}, 0); // why 64?
vit.mm_model_proj = create_tensor(tn(LLM_TENSOR_V_RESMPL_PROJ, "weight", il), {rs_n_embd, rs_n_embd}, 0);
vit.mm_model_kv_proj = create_tensor(tn(LLM_TENSOR_V_RESMPL_KV, "weight", il), {n_vembd, rs_n_embd}, 0);
vit.mm_model_attn_q_w = create_tensor(tn(LLM_TENSOR_V_RESMPL_ATTN_Q, "weight", il), {rs_n_embd, rs_n_embd}, 0);
vit.mm_model_attn_q_b = create_tensor(tn(LLM_TENSOR_V_RESMPL_ATTN_Q, "bias" , il), {rs_n_embd}, 0);
vit.mm_model_attn_k_w = create_tensor(tn(LLM_TENSOR_V_RESMPL_ATTN_K, "weight", il), {rs_n_embd, rs_n_embd}, 0);
vit.mm_model_attn_k_b = create_tensor(tn(LLM_TENSOR_V_RESMPL_ATTN_K, "bias" , il), {rs_n_embd}, 0);
vit.mm_model_attn_v_w = create_tensor(tn(LLM_TENSOR_V_RESMPL_ATTN_V, "weight", il), {rs_n_embd, rs_n_embd}, 0);
vit.mm_model_attn_v_b = create_tensor(tn(LLM_TENSOR_V_RESMPL_ATTN_V, "bias" , il), {rs_n_embd}, 0);
vit.mm_model_attn_o_w = create_tensor(tn(LLM_TENSOR_V_RESMPL_ATTN_OUT, "weight", il), {rs_n_embd, rs_n_embd}, 0);
vit.mm_model_attn_o_b = create_tensor(tn(LLM_TENSOR_V_RESMPL_ATTN_OUT, "bias" , il), {rs_n_embd}, 0);
vit.mm_model_ln_q_w = create_tensor(tn(LLM_TENSOR_V_RESMPL_Q_NORM, "weight", il), {rs_n_embd}, 0);
vit.mm_model_ln_q_b = create_tensor(tn(LLM_TENSOR_V_RESMPL_Q_NORM, "bias" , il), {rs_n_embd}, 0);
vit.mm_model_ln_kv_w = create_tensor(tn(LLM_TENSOR_V_RESMPL_KV_NORM, "weight", il), {rs_n_embd}, 0);
vit.mm_model_ln_kv_b = create_tensor(tn(LLM_TENSOR_V_RESMPL_KV_NORM, "bias" , il), {rs_n_embd}, 0);
vit.mm_model_ln_post_w = create_tensor(tn(LLM_TENSOR_V_RESMPL_POST_NORM, "weight", il), {rs_n_embd}, 0);
vit.mm_model_ln_post_b = create_tensor(tn(LLM_TENSOR_V_RESMPL_POST_NORM, "bias" , il), {rs_n_embd}, 0);
} break;
case LLM_ARCH_VISION_IDEFICS3:
{
int scale_factor = vit.hparams.scale_factor;
vit.projection = create_tensor(tn(LLM_TENSOR_V_MMPROJ_FC, "weight"), {n_vembd * scale_factor * scale_factor, n_embd}, 0);
vit.patch_embeddings = create_tensor(tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd}, 0);
vit.patch_bias = create_tensor(tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "bias" ), {n_vembd}, 0);
vit.position_embeddings = create_tensor(tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd}, 0);
vit.post_norm_w = create_tensor(tn(LLM_TENSOR_V_POST_NORM, "weight"), {n_vembd}, 0);
vit.post_norm_b = create_tensor(tn(LLM_TENSOR_V_POST_NORM, "bias" ), {n_vembd}, 0);
for (int i = 0; i < n_vlayer; ++i) {
auto & layer = vit.layers[i];
layer.k_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd}, 0);
layer.k_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_K, "bias" , i), {n_vembd}, 0);
layer.v_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd}, 0);
layer.v_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_V, "bias" , i), {n_vembd}, 0);
layer.q_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd}, 0);
layer.q_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_Q, "bias" , i), {n_vembd}, 0);
layer.ffn_up_w = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_UP, "weight", i), {n_vembd, n_vff}, 0);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_UP, "bias" , i), {n_vff}, 0);
layer.ffn_down_w = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_DOWN, "bias" , i), {n_vembd}, 0);
layer.norm_in_w = create_tensor(tn(LLM_TENSOR_V_ENC_INPUT_NORM, "weight", i), {n_vembd}, 0);
layer.norm_in_b = create_tensor(tn(LLM_TENSOR_V_ENC_INPUT_NORM, "bias" , i), {n_vembd}, 0);
layer.norm_out_w = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "weight", i), {n_vembd}, 0);
layer.norm_out_b = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "bias" , i), {n_vembd}, 0);
layer.output_w = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd}, 0);
layer.output_b = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "bias" , i), {n_vembd}, 0);
}
} break;
default:
throw std::runtime_error("unknown vision architecture");
}
if (llama_vision_n_mmproj_embd(vit) != hparams.n_embd) {
throw std::runtime_error("model has vision, but n_mmproj_embd != n_embd");
}
}
}
ml.done_getting_tensors();
@@ -3920,6 +4154,12 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
case LLM_ARCH_QWEN2VL:
return LLAMA_ROPE_TYPE_MROPE;
case LLM_ARCH_VISION_LLAVA:
case LLM_ARCH_VISION_MOBILEVLM:
case LLM_ARCH_VISION_MINICPMV:
case LLM_ARCH_VISION_IDEFICS3:
GGML_ABORT("vision arch does not use RoPE");
// all model arches should be listed explicitly here
case LLM_ARCH_UNKNOWN:
GGML_ABORT("unknown architecture");

View file

@@ -4,6 +4,7 @@
#include "llama-arch.h"
#include "llama-hparams.h"
#include "llama-vocab.h"
#include "llama-vision.h"
#include <memory>
#include <string>
@@ -362,6 +363,10 @@ struct llama_model {
const struct ggml_tensor * get_tensor(const char * name) const;
// vision
bool has_vision = false;
llama_vision_model vit;
private:
struct impl;
std::unique_ptr<impl> pimpl;

src/llama-vision.cpp Normal file (1343 lines): diff suppressed because it is too large

src/llama-vision.h Normal file (195 lines)
View file

@@ -0,0 +1,195 @@
#pragma once
#include "ggml.h"
#include "ggml-cpp.h"
#include "llama.h"
#include "llama-arch.h"
#include <vector>
#include <array>
#define VISION_GRAPH_MAX_NODE 2048
enum vision_projector_type {
VISION_PROJECTOR_TYPE_UNKNOWN,
VISION_PROJECTOR_TYPE_MLP,
VISION_PROJECTOR_TYPE_LDPV2,
VISION_PROJECTOR_TYPE_MINICPMV_2_5,
VISION_PROJECTOR_TYPE_MINICPMV_2_6,
};
enum mm_patch_merge {
MM_PATCH_MERGE_UNKNOWN,
MM_PATCH_MERGE_FLAT,
MM_PATCH_MERGE_SPATIAL_UNPAD,
};
struct llama_vision_model {
struct vision_hparams {
llm_arch arch = LLM_ARCH_UNKNOWN;
uint32_t image_size;
uint32_t patch_size;
uint32_t hidden_size;
uint32_t n_intermediate;
uint32_t projection_dim;
uint32_t n_head;
uint32_t n_layer;
uint32_t max_pos_embd;
int32_t select_layer = 0;
bool use_gelu = false;
float eps;
vision_projector_type proj_type = VISION_PROJECTOR_TYPE_UNKNOWN;
mm_patch_merge mm_patch_merge_type = MM_PATCH_MERGE_UNKNOWN;
std::array<float, 3> image_mean;
std::array<float, 3> image_std;
std::array<int32_t, 32> image_grid_pinpoints; // TODO: should this be array of (x, y) pairs?
int32_t image_crop_resolution;
// idefics3
int scale_factor = 0;
};
struct vision_hparams hparams;
ggml_backend_buffer_type_t buft;
// embeddings
struct ggml_tensor * class_embedding = nullptr;
struct ggml_tensor * patch_embeddings = nullptr;
struct ggml_tensor * patch_bias = nullptr;
struct ggml_tensor * position_embeddings = nullptr;
struct ggml_tensor * pre_norm_w = nullptr;
struct ggml_tensor * pre_norm_b = nullptr;
struct vision_layer {
// attention
struct ggml_tensor * k_w = nullptr;
struct ggml_tensor * k_b = nullptr;
struct ggml_tensor * q_w = nullptr;
struct ggml_tensor * q_b = nullptr;
struct ggml_tensor * v_w = nullptr;
struct ggml_tensor * v_b = nullptr;
struct ggml_tensor * output_w = nullptr;
struct ggml_tensor * output_b = nullptr;
// layernorm 1
struct ggml_tensor * norm_in_w = nullptr;
struct ggml_tensor * norm_in_b = nullptr;
// ff
struct ggml_tensor * ffn_up_w = nullptr;
struct ggml_tensor * ffn_up_b = nullptr;
struct ggml_tensor * ffn_down_w = nullptr;
struct ggml_tensor * ffn_down_b = nullptr;
// layernorm 2
struct ggml_tensor * norm_out_w = nullptr;
struct ggml_tensor * norm_out_b = nullptr;
};
std::vector<vision_layer> layers;
struct ggml_tensor * post_norm_w = nullptr;
struct ggml_tensor * post_norm_b = nullptr;
struct ggml_tensor * projection = nullptr;
// LLaVA projection
struct ggml_tensor * mm_1_w = nullptr;
struct ggml_tensor * mm_1_b = nullptr;
struct ggml_tensor * mm_2_w = nullptr;
struct ggml_tensor * mm_2_b = nullptr;
// MobileVLM_V2 projection
struct ggml_tensor * mm_model_mlp_0_w = nullptr;
struct ggml_tensor * mm_model_mlp_0_b = nullptr;
struct ggml_tensor * mm_model_mlp_2_w = nullptr;
struct ggml_tensor * mm_model_mlp_2_b = nullptr;
struct ggml_tensor * mm_model_peg_0_w = nullptr;
struct ggml_tensor * mm_model_peg_0_b = nullptr;
// MINICPMV projection
struct ggml_tensor * mm_model_pos_embed_k = nullptr;
struct ggml_tensor * mm_model_query = nullptr;
struct ggml_tensor * mm_model_proj = nullptr;
struct ggml_tensor * mm_model_kv_proj = nullptr;
struct ggml_tensor * mm_model_attn_q_w = nullptr;
struct ggml_tensor * mm_model_attn_q_b = nullptr;
struct ggml_tensor * mm_model_attn_k_w = nullptr;
struct ggml_tensor * mm_model_attn_k_b = nullptr;
struct ggml_tensor * mm_model_attn_v_w = nullptr;
struct ggml_tensor * mm_model_attn_v_b = nullptr;
struct ggml_tensor * mm_model_attn_o_w = nullptr;
struct ggml_tensor * mm_model_attn_o_b = nullptr;
struct ggml_tensor * mm_model_ln_q_w = nullptr;
struct ggml_tensor * mm_model_ln_q_b = nullptr;
struct ggml_tensor * mm_model_ln_kv_w = nullptr;
struct ggml_tensor * mm_model_ln_kv_b = nullptr;
struct ggml_tensor * mm_model_ln_post_w = nullptr;
struct ggml_tensor * mm_model_ln_post_b = nullptr;
// special tokens
struct ggml_tensor * mm_tok_embd_image = nullptr;
struct ggml_tensor * mm_tok_embd_end_image = nullptr;
struct ggml_tensor * mm_tok_embd_slice = nullptr;
struct ggml_tensor * mm_tok_embd_end_slice = nullptr;
};
struct llama_vision_context {
// memory buffers used to evaluate the model
std::vector<uint8_t> buf_compute_meta;
ggml_backend_sched_ptr sched;
std::vector<ggml_backend_ptr> backends;
ggml_backend_t backend_cpu;
const llama_vision_model * model;
// temporary output data, to be picked up by llama_decode()
struct ggml_context * ctx_ggml = nullptr;
struct ggml_tensor * output;
};
// for now, this only contains:
// - the instruction for ggml_conv_2d to break the image into patches
// - the pre-processed image data in f32
struct llama_vision_tokens {
uint32_t px; // size of patch
uint32_t py; // size of patch
size_t n_px; // number of patches in x direction
size_t n_py; // number of patches in y direction
// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
std::vector<std::vector<float>> buf; // preprocessed image data
};
inline mm_patch_merge mm_patch_merge_from_name(std::string & name) {
if (name == "flat") {
return MM_PATCH_MERGE_FLAT;
} else if (name == "spatial_unpad") {
return MM_PATCH_MERGE_SPATIAL_UNPAD;
}
return MM_PATCH_MERGE_UNKNOWN;
}
inline vision_projector_type vision_projector_type_from_name(std::string & name) {
if (name == "mlp") {
return VISION_PROJECTOR_TYPE_MLP;
} else if (name == "ldpv2") {
return VISION_PROJECTOR_TYPE_LDPV2;
} else if (name == "minicpmv-2.5") {
return VISION_PROJECTOR_TYPE_MINICPMV_2_5;
} else if (name == "minicpmv-2.6") {
return VISION_PROJECTOR_TYPE_MINICPMV_2_6;
}
return VISION_PROJECTOR_TYPE_UNKNOWN;
}
// only for sanity check: must be equal to n_embd of language model
uint32_t llama_vision_n_mmproj_embd(const llama_vision_model & vmodel);
struct ggml_tensor * llama_vision_get_output_tensor(llama_context * ctx);
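To make the px/py and n_px/n_py fields of llama_vision_tokens above concrete, here is a small self-contained sketch of the patch-grid arithmetic, not code from this commit; the 336-pixel image size and 14-pixel patch size are illustrative values typical of CLIP-style ViT encoders, not values read from this header.

// Illustrative patch-grid arithmetic for an NHWC RGB f32 image slice
// (assumed sizes; compiles standalone).
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const uint32_t image_size = 336; // assumed ViT input resolution
    const uint32_t patch_size = 14;  // assumed ViT patch size

    // number of patches along each axis, as n_px / n_py describe
    const size_t n_px      = image_size / patch_size; // 24
    const size_t n_py      = image_size / patch_size; // 24
    const size_t n_patches = n_px * n_py;             // 576 encoder positions

    // RGB float32 image, memory layout RGBRGBRGB... (one slice of buf)
    std::vector<float> buf(static_cast<size_t>(image_size) * image_size * 3);

    std::printf("patches: %zux%zu = %zu, buffer: %zu floats\n",
                n_px, n_py, n_patches, buf.size());
    return 0;
}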

View file

@@ -138,6 +138,9 @@ static struct ggml_tensor * llm_build_inp_embd(
), scale);
inpL = ggml_add(ctx, inpL, inpL_delta);
}
} else if (ubatch.embd_tensor) {
inpL = ubatch.embd_tensor;
ggml_set_input(ubatch.embd_tensor);
} else {
lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
inpL = lctx.inp_embd;
@@ -8457,7 +8460,9 @@ static int llama_prepare_sbatch(
// this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
GGML_ASSERT((batch.token && !batch.embd && !batch.embd_tensor)
|| (!batch.token && batch.embd && !batch.embd_tensor)
|| (!batch.token && !batch.embd && batch.embd_tensor)); // NOLINT
if (batch.token) {
for (uint32_t i = 0; i < n_tokens_all; ++i) {
if (batch.token[i] < 0 || uint32_t(batch.token[i]) >= model.vocab.n_tokens()) {
@@ -9282,7 +9287,7 @@ static void llama_kv_cache_update_impl(struct llama_context & lctx) {
uint32_t n_seqs = 1; // TODO: worst-case number of sequences
uint32_t n_tokens = std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch);
llama_token token = lctx.model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr};
ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true);
// initialize scheduler with the worst-case graph
@@ -9845,7 +9850,7 @@ struct llama_context * llama_init_from_model(
uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
llama_token token = ctx->model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr};
ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true);
// reserve pp graph first so that buffers are only allocated once
@@ -9854,7 +9859,7 @@ struct llama_context * llama_init_from_model(
int n_nodes_pp = ggml_graph_n_nodes(gf_pp);
// reserve with tg graph to get the number of splits and nodes
llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr};
ggml_cgraph * gf_tg = llama_build_graph(*ctx, ubatch_tg, true);
ggml_backend_sched_reserve(ctx->sched.get(), gf_tg);
int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched.get());