add more kv metadata

This commit is contained in:
Xuan Son Nguyen 2024-10-02 12:37:50 +02:00
parent 6089b0a50a
commit c2ec885264
6 changed files with 76 additions and 18 deletions

View file

@@ -471,7 +471,7 @@ class Model:
text_config = AutoConfig.from_pretrained(text_config["_name_or_path"]).to_dict()
hparams = {**text_config, **hparams}
return hparams
@staticmethod
def load_preprocessor_config(dir_model: Path):
file_path = dir_model / "preprocessor_config.json"
@@ -1590,7 +1590,7 @@ class LlamaModel(Model):
# For vision model
if self.vparams is not None and self.preprocessor_config is not None:
self.gguf_writer.add_vision_type("clip")
self.gguf_writer.add_vision_type("clip-vit")
self.gguf_writer.add_vision_image_size(self.vparams["image_size"])
self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"])
self.gguf_writer.add_vision_clip_architecture("llava")
@@ -1600,6 +1600,8 @@ class LlamaModel(Model):
self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"])
self.gguf_writer.add_vision_clip_image_mean(self.preprocessor_config["image_mean"])
self.gguf_writer.add_vision_clip_image_std(self.preprocessor_config["image_std"])
self.gguf_writer.add_vision_clip_select_layer(self.hparams["vision_feature_layer"])
self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd)
# TODO: should not hardcode these, but they are currently missing from config.json

View file

@@ -173,13 +173,15 @@ class Keys:
MIDDLE_ID = "tokenizer.ggml.middle_token_id"
EOT_ID = "tokenizer.ggml.eot_token_id"
EOM_ID = "tokenizer.ggml.eom_token_id"
IMAGE_START_ID = "tokenizer.ggml.image_start_token_id"
IMAGE_END_ID = "tokenizer.ggml.image_end_token_id"
class Adapter:
    # GGUF KV keys describing an adapter (e.g. LoRA) file
    TYPE       = "adapter.type"        # adapter kind, presumably "lora" — confirm against writers
    LORA_ALPHA = "adapter.lora.alpha"  # LoRA scaling alpha
class Vision:
# only support vision.type = "clip" for now
# only support vision.type = "clip-vit" for now
TYPE = "vision.type"
IMAGE_SIZE = "vision.image_size"
PATCH_SIZE = "vision.patch_size"
@@ -196,7 +198,10 @@ class Keys:
PROJECTION_DIM = "vision.clip.projection_dim"
USE_GELU = "vision.clip.use_gelu"
MAX_POS_EMBEDDING = "vision.clip.max_position_embeddings"
MAX_SLICES = "vision.clip.max_slices"
PROJECTOR_TYPE = "vision.clip.projector_type"
SELECT_LAYER = "vision.clip.select_layer"
PATCH_MERGE_TYPE = "vision.clip.patch_merge_type"
HEAD_COUNT = "vision.clip.attention.head_count"
LAYERNORM_EPS = "vision.clip.attention.layer_norm_epsilon"
@@ -1430,6 +1435,11 @@ class CLIPProjectorType(Enum):
MLP = 'mlp'
class CLIPPatchMergeType(Enum):
    # String values written to the "vision.clip.patch_merge_type" KV
    # (see Keys.Vision.Clip.PATCH_MERGE_TYPE / add_vision_clip_patch_merge_type).
    FLAT = 'flat'
    SPATIAL_UNPAD = 'spatial_unpad'
class GGMLQuantizationType(IntEnum):
F32 = 0
F16 = 1

View file

@@ -27,6 +27,7 @@ from .constants import (
PoolingType,
TokenType,
CLIPProjectorType,
CLIPPatchMergeType,
)
from .quants import quant_shape_from_byte_shape
@@ -848,6 +849,15 @@ class GGUFWriter:
def add_vision_clip_projector_type(self, value: CLIPProjectorType) -> None:
    # Written as the enum's string value (e.g. "mlp") so readers can
    # tolerate projector types added after they were built.
    self.add_string(Keys.Vision.Clip.PROJECTOR_TYPE, value.value)
def add_vision_clip_max_slices(self, value: int) -> None:
    # uint32 KV "vision.clip.max_slices"; presumably the max number of
    # image slices the encoder accepts — confirm against consumers.
    self.add_uint32(Keys.Vision.Clip.MAX_SLICES, value)
def add_vision_clip_select_layer(self, value: int) -> None:
    # Signed on purpose: negative values select relative to the last layer
    # (clip.cpp iterates to n_layer + select_layer). Mirrors HF
    # "vision_feature_layer" as passed by the converter.
    self.add_int32(Keys.Vision.Clip.SELECT_LAYER, value)
def add_vision_clip_patch_merge_type(self, value: CLIPPatchMergeType) -> None:
    # Written as the enum's string value ("flat" / "spatial_unpad").
    self.add_string(Keys.Vision.Clip.PATCH_MERGE_TYPE, value.value)
def add_vision_clip_layer_norm_epsilon(self, value: float) -> None:
    # float32 KV "vision.clip.attention.layer_norm_epsilon".
    self.add_float32(Keys.Vision.Clip.LAYERNORM_EPS, value)

View file

@@ -54,6 +54,22 @@ struct clip_image_f32 {
using clip_image_f32_batch = std::vector<clip_image_f32>;
// NOTE(review): alias says "f8" but the element type is clip_image_u8 —
// presumably this should be clip_image_u8_batch; confirm with callers
// before renaming, since they reference the alias by name.
using clip_image_f8_batch = std::vector<clip_image_u8>;
// Map a "vision.clip.projector_type" metadata string to its enum value.
// Any unrecognized name yields CLIP_PROJECTOR_TYPE_UNKNOWN.
clip_projector_type projector_type_from_name(std::string & name) {
    return name == "mlp"
        ? CLIP_PROJECTOR_TYPE_MLP
        : CLIP_PROJECTOR_TYPE_UNKNOWN;
}
// Map a "vision.clip.patch_merge_type" metadata string to its enum value.
// Any unrecognized name yields MM_PATCH_MERGE_UNKNOWN.
mm_patch_merge mm_patch_merge_from_name(std::string & name) {
    if (name == "flat")          return MM_PATCH_MERGE_FLAT;
    if (name == "spatial_unpad") return MM_PATCH_MERGE_SPATIAL_UNPAD;
    return MM_PATCH_MERGE_UNKNOWN;
}
int clip_n_patches(const clip_context & ctx) {
auto & hparams = ctx.model->hparams;
int n_patches = (hparams.image_size / hparams.patch_size) * (hparams.image_size / hparams.patch_size);
@@ -456,7 +472,7 @@ static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size,
}
// loop over layers
for (int il = 0; il < (int)hparams.n_layer - 2; il++) {
for (int il = 0; il < (int)hparams.n_layer + hparams.select_layer; il++) {
struct ggml_tensor * cur = embeddings;
// layernorm1

View file

@@ -11,10 +11,12 @@ enum vision_arch {
};
// Multimodal projector variant; parsed from the "vision.clip.projector_type"
// KV by projector_type_from_name().
enum clip_projector_type {
    CLIP_PROJECTOR_TYPE_UNKNOWN, // unrecognized / missing metadata string
    CLIP_PROJECTOR_TYPE_MLP,     // "mlp"
};
// Patch-merge strategy; parsed from the "vision.clip.patch_merge_type"
// KV by mm_patch_merge_from_name().
enum mm_patch_merge {
    MM_PATCH_MERGE_UNKNOWN,       // unrecognized metadata string
    MM_PATCH_MERGE_FLAT,          // "flat"
    MM_PATCH_MERGE_SPATIAL_UNPAD, // "spatial_unpad"
};
@@ -30,11 +32,12 @@ struct clip_hparams {
uint32_t n_head;
uint32_t n_layer;
uint32_t max_pos_embd;
int32_t select_layer = 0;
bool use_gelu = false;
float eps;
clip_projector_type proj_type = CLIP_PROJECTOR_TYPE_MLP;
clip_projector_type proj_type = CLIP_PROJECTOR_TYPE_UNKNOWN;
mm_patch_merge mm_patch_merge_type = MM_PATCH_MERGE_FLAT;
std::array<float, 3> image_mean;
@@ -112,6 +115,8 @@ struct clip_context {
std::vector<float> output; // size == n_output * n_embd
};
mm_patch_merge mm_patch_merge_from_name(std::string & name);
clip_projector_type projector_type_from_name(std::string & name);
int clip_n_patches(const clip_context & ctx);
int clip_n_mmproj_embd(const clip_context & ctx);
int clip_n_embd(const clip_context & ctx);

View file

@@ -401,7 +401,10 @@ enum llm_kv {
LLM_KV_VISION_CLIP_PROJECTION_DIM,
LLM_KV_VISION_CLIP_USE_GELU,
LLM_KV_VISION_CLIP_MAX_POS_EMBD,
LLM_KV_VISION_CLIP_MAX_SLICES,
LLM_KV_VISION_CLIP_PROJECTOR_TYPE,
LLM_KV_VISION_CLIP_SELECT_LAYER,
LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE,
LLM_KV_VISION_CLIP_HEAD_COUNT,
LLM_KV_VISION_CLIP_LAYERNORM_EPS,
};
@@ -527,7 +530,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_VISION_CLIP_PROJECTION_DIM, "vision.clip.projection_dim" },
{ LLM_KV_VISION_CLIP_USE_GELU, "vision.clip.use_gelu" },
{ LLM_KV_VISION_CLIP_MAX_POS_EMBD, "vision.clip.max_position_embeddings" },
{ LLM_KV_VISION_CLIP_MAX_SLICES, "vision.clip.max_slices" },
{ LLM_KV_VISION_CLIP_PROJECTOR_TYPE, "vision.clip.projector_type" },
{ LLM_KV_VISION_CLIP_SELECT_LAYER, "vision.clip.select_layer" },
{ LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE, "vision.clip.patch_merge_type" },
{ LLM_KV_VISION_CLIP_HEAD_COUNT, "vision.clip.attention.head_count" },
{ LLM_KV_VISION_CLIP_LAYERNORM_EPS, "vision.clip.attention.layer_norm_epsilon" },
};
@@ -6227,9 +6233,8 @@ static void llm_load_hparams(
auto & vparams = model.clip.hparams;
std::string vision_type;
ml.get_key(LLM_KV_VISION_TYPE, vision_type, false);
if (vision_type == "clip") {
if (vision_type == "clip-vit") {
model.has_vision = true;
std::string proj_type;
ml.get_key(LLM_KV_VISION_IMAGE_SIZE, vparams.image_size, true);
ml.get_key(LLM_KV_VISION_PATCH_SIZE, vparams.patch_size, true);
ml.get_key_or_arr(LLM_KV_VISION_IMAGE_MEAN, vparams.image_mean, 3, true);
@@ -6239,18 +6244,28 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, vparams.n_intermediate, true);
ml.get_key(LLM_KV_VISION_CLIP_HEAD_COUNT, vparams.n_head, true);
ml.get_key(LLM_KV_VISION_CLIP_LAYERNORM_EPS, vparams.eps, true);
ml.get_key(LLM_KV_VISION_CLIP_PROJECTOR_TYPE, proj_type, true);
if (proj_type == "mlp") {
vparams.proj_type = CLIP_PROJECTOR_TYPE_MLP;
} else {
throw std::runtime_error(format("unsupported clip projector type: %s", proj_type.c_str()));
ml.get_key(LLM_KV_VISION_CLIP_SELECT_LAYER, vparams.select_layer, true);
{
std::string name;
ml.get_key(LLM_KV_VISION_CLIP_PROJECTOR_TYPE, name, true);
vparams.proj_type = projector_type_from_name(name);
if (vparams.proj_type == CLIP_PROJECTOR_TYPE_UNKNOWN) {
throw std::runtime_error(format("unsupported clip projector type: %s", name.c_str()));
}
}
std::string arch;
ml.get_key(LLM_KV_VISION_CLIP_ARCHITECTURE, arch, true);
for (auto & it : VISION_ARCH_NAMES) {
if (arch == it.second) {
vparams.arch = it.first;
break;
{
std::string name;
ml.get_key(LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE, name, false);
vparams.mm_patch_merge_type = mm_patch_merge_from_name(name);
}
{
std::string arch;
ml.get_key(LLM_KV_VISION_CLIP_ARCHITECTURE, arch, true);
for (auto & it : VISION_ARCH_NAMES) {
if (arch == it.second) {
vparams.arch = it.first;
break;
}
}
}
} else if (!vision_type.empty()) {