add more kv metadata
This commit is contained in:
parent 6089b0a50a
commit c2ec885264

6 changed files with 76 additions and 18 deletions

@@ -471,7 +471,7 @@ class Model:
             text_config = AutoConfig.from_pretrained(text_config["_name_or_path"]).to_dict()
         hparams = {**text_config, **hparams}
         return hparams

     @staticmethod
     def load_preprocessor_config(dir_model: Path):
         file_path = dir_model / "preprocessor_config.json"

@@ -1590,7 +1590,7 @@ class LlamaModel(Model):

         # For vision model
         if self.vparams is not None and self.preprocessor_config is not None:
-            self.gguf_writer.add_vision_type("clip")
+            self.gguf_writer.add_vision_type("clip-vit")
             self.gguf_writer.add_vision_image_size(self.vparams["image_size"])
             self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"])
             self.gguf_writer.add_vision_clip_architecture("llava")

@@ -1600,6 +1600,8 @@ class LlamaModel(Model):
             self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"])
             self.gguf_writer.add_vision_clip_image_mean(self.preprocessor_config["image_mean"])
             self.gguf_writer.add_vision_clip_image_std(self.preprocessor_config["image_std"])
+            self.gguf_writer.add_vision_clip_select_layer(self.hparams["vision_feature_layer"])
+            self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
             max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
             self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd)
             # TODO: should not hardcode these, but they are currently missing from config.json

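As an aside, a minimal sketch of driving the new writer API directly (the output filename, the arch argument, and the write/close sequence are assumptions about gguf-py's GGUFWriter, not part of this diff):

# sketch only: emit the new vision.clip.* keys added in this commit
import gguf

writer = gguf.GGUFWriter("vision-test.gguf", arch="llama")  # hypothetical path and arch
writer.add_vision_clip_select_layer(-2)                                # -> vision.clip.select_layer (int32)
writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)  # -> vision.clip.patch_merge_type ("flat")
writer.add_vision_clip_max_slices(1)                                   # -> vision.clip.max_slices (uint32)

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()
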
@@ -173,13 +173,15 @@ class Keys:
         MIDDLE_ID      = "tokenizer.ggml.middle_token_id"
         EOT_ID         = "tokenizer.ggml.eot_token_id"
         EOM_ID         = "tokenizer.ggml.eom_token_id"
+        IMAGE_START_ID = "tokenizer.ggml.image_start_token_id"
+        IMAGE_END_ID   = "tokenizer.ggml.image_end_token_id"

     class Adapter:
         TYPE       = "adapter.type"
         LORA_ALPHA = "adapter.lora.alpha"

     class Vision:
-        # only support vision.type = "clip" for now
+        # only support vision.type = "clip-vit" for now
         TYPE       = "vision.type"
         IMAGE_SIZE = "vision.image_size"
         PATCH_SIZE = "vision.patch_size"

@@ -196,7 +198,10 @@ class Keys:
             PROJECTION_DIM    = "vision.clip.projection_dim"
             USE_GELU          = "vision.clip.use_gelu"
             MAX_POS_EMBEDDING = "vision.clip.max_position_embeddings"
+            MAX_SLICES        = "vision.clip.max_slices"
             PROJECTOR_TYPE    = "vision.clip.projector_type"
+            SELECT_LAYER      = "vision.clip.select_layer"
+            PATCH_MERGE_TYPE  = "vision.clip.patch_merge_type"
             HEAD_COUNT        = "vision.clip.attention.head_count"
             LAYERNORM_EPS     = "vision.clip.attention.layer_norm_epsilon"

@@ -1430,6 +1435,11 @@ class CLIPProjectorType(Enum):
     MLP = 'mlp'


+class CLIPPatchMergeType(Enum):
+    FLAT = 'flat'
+    SPATIAL_UNPAD = 'spatial_unpad'
+
+
 class GGMLQuantizationType(IntEnum):
     F32 = 0
     F16 = 1

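Of the two merge types, only FLAT is emitted by the convert script above; SPATIAL_UNPAD is presumably reserved for llava-1.6-style split-image handling.
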
@@ -27,6 +27,7 @@ from .constants import (
     PoolingType,
     TokenType,
     CLIPProjectorType,
+    CLIPPatchMergeType,
 )

 from .quants import quant_shape_from_byte_shape

@@ -848,6 +849,15 @@ class GGUFWriter:
     def add_vision_clip_projector_type(self, value: CLIPProjectorType) -> None:
         self.add_string(Keys.Vision.Clip.PROJECTOR_TYPE, value.value)

+    def add_vision_clip_max_slices(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Clip.MAX_SLICES, value)
+
+    def add_vision_clip_select_layer(self, value: int) -> None:
+        self.add_int32(Keys.Vision.Clip.SELECT_LAYER, value)
+
+    def add_vision_clip_patch_merge_type(self, value: CLIPPatchMergeType) -> None:
+        self.add_string(Keys.Vision.Clip.PATCH_MERGE_TYPE, value.value)
+
     def add_vision_clip_layer_norm_epsilon(self, value: float) -> None:
         self.add_float32(Keys.Vision.Clip.LAYERNORM_EPS, value)

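Note the signedness split above: SELECT_LAYER goes through add_int32 because the layer index may be negative (counted back from the last layer), while MAX_SLICES is an unsigned count. A read-back sketch to verify the round trip (assumes gguf-py's GGUFReader field API, not part of this diff):

# sketch only: read the new keys back from the file written above
from gguf import GGUFReader

reader = GGUFReader("vision-test.gguf")  # hypothetical path
field = reader.get_field("vision.clip.select_layer")
if field is not None:
    # scalar fields keep their payload at parts[data[0]]
    print("select_layer =", int(field.parts[field.data[0]][0]))
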
@@ -54,6 +54,22 @@ struct clip_image_f32 {
 using clip_image_f32_batch = std::vector<clip_image_f32>;
 using clip_image_u8_batch  = std::vector<clip_image_u8>;

+clip_projector_type projector_type_from_name(std::string & name) {
+    if (name == "mlp") {
+        return CLIP_PROJECTOR_TYPE_MLP;
+    }
+    return CLIP_PROJECTOR_TYPE_UNKNOWN;
+}
+
+mm_patch_merge mm_patch_merge_from_name(std::string & name) {
+    if (name == "flat") {
+        return MM_PATCH_MERGE_FLAT;
+    } else if (name == "spatial_unpad") {
+        return MM_PATCH_MERGE_SPATIAL_UNPAD;
+    }
+    return MM_PATCH_MERGE_UNKNOWN;
+}
+
 int clip_n_patches(const clip_context & ctx) {
     auto & hparams = ctx.model->hparams;
     int n_patches = (hparams.image_size / hparams.patch_size) * (hparams.image_size / hparams.patch_size);

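These helpers are the C++ mirror of the CLIPProjectorType and CLIPPatchMergeType enums on the gguf-py side; the string values ("mlp", "flat", "spatial_unpad") must stay in sync between writer and loader, since a projector name that maps to CLIP_PROJECTOR_TYPE_UNKNOWN is rejected when the hyperparameters are loaded below.
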
@@ -456,7 +472,7 @@ static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size,
     }

     // loop over layers
-    for (int il = 0; il < (int)hparams.n_layer - 2; il++) {
+    for (int il = 0; il < (int)hparams.n_layer + hparams.select_layer; il++) {
         struct ggml_tensor * cur = embeddings;

         // layernorm1

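For reference: the convert script stores vision_feature_layer (typically -2 for llava models) into select_layer, so the new bound n_layer + select_layer evaluates to n_layer - 2 and reproduces the previously hardcoded loop.
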
@@ -11,10 +11,12 @@ enum vision_arch {
 };

 enum clip_projector_type {
+    CLIP_PROJECTOR_TYPE_UNKNOWN,
     CLIP_PROJECTOR_TYPE_MLP,
 };

 enum mm_patch_merge {
+    MM_PATCH_MERGE_UNKNOWN,
     MM_PATCH_MERGE_FLAT,
     MM_PATCH_MERGE_SPATIAL_UNPAD,
 };

@@ -30,11 +32,12 @@ struct clip_hparams {
     uint32_t n_head;
     uint32_t n_layer;
     uint32_t max_pos_embd;
+    int32_t select_layer = 0;
     bool use_gelu = false;

     float eps;

-    clip_projector_type proj_type = CLIP_PROJECTOR_TYPE_MLP;
+    clip_projector_type proj_type = CLIP_PROJECTOR_TYPE_UNKNOWN;
+    mm_patch_merge mm_patch_merge_type = MM_PATCH_MERGE_FLAT;

     std::array<float, 3> image_mean;

@@ -112,6 +115,8 @@ struct clip_context {
     std::vector<float> output; // size == n_output * n_embd
 };

+mm_patch_merge mm_patch_merge_from_name(std::string & name);
+clip_projector_type projector_type_from_name(std::string & name);
 int clip_n_patches(const clip_context & ctx);
 int clip_n_mmproj_embd(const clip_context & ctx);
 int clip_n_embd(const clip_context & ctx);

@@ -401,7 +401,10 @@ enum llm_kv {
     LLM_KV_VISION_CLIP_PROJECTION_DIM,
     LLM_KV_VISION_CLIP_USE_GELU,
     LLM_KV_VISION_CLIP_MAX_POS_EMBD,
+    LLM_KV_VISION_CLIP_MAX_SLICES,
     LLM_KV_VISION_CLIP_PROJECTOR_TYPE,
+    LLM_KV_VISION_CLIP_SELECT_LAYER,
+    LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE,
     LLM_KV_VISION_CLIP_HEAD_COUNT,
     LLM_KV_VISION_CLIP_LAYERNORM_EPS,
 };

@@ -527,7 +530,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_VISION_CLIP_PROJECTION_DIM,   "vision.clip.projection_dim" },
     { LLM_KV_VISION_CLIP_USE_GELU,         "vision.clip.use_gelu" },
     { LLM_KV_VISION_CLIP_MAX_POS_EMBD,     "vision.clip.max_position_embeddings" },
+    { LLM_KV_VISION_CLIP_MAX_SLICES,       "vision.clip.max_slices" },
     { LLM_KV_VISION_CLIP_PROJECTOR_TYPE,   "vision.clip.projector_type" },
+    { LLM_KV_VISION_CLIP_SELECT_LAYER,     "vision.clip.select_layer" },
+    { LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE, "vision.clip.patch_merge_type" },
     { LLM_KV_VISION_CLIP_HEAD_COUNT,       "vision.clip.attention.head_count" },
     { LLM_KV_VISION_CLIP_LAYERNORM_EPS,    "vision.clip.attention.layer_norm_epsilon" },
 };

@@ -6227,9 +6233,8 @@ static void llm_load_hparams(
     auto & vparams = model.clip.hparams;
     std::string vision_type;
     ml.get_key(LLM_KV_VISION_TYPE, vision_type, false);
-    if (vision_type == "clip") {
+    if (vision_type == "clip-vit") {
         model.has_vision = true;
-        std::string proj_type;
         ml.get_key(LLM_KV_VISION_IMAGE_SIZE, vparams.image_size, true);
         ml.get_key(LLM_KV_VISION_PATCH_SIZE, vparams.patch_size, true);
         ml.get_key_or_arr(LLM_KV_VISION_IMAGE_MEAN, vparams.image_mean, 3, true);

@@ -6239,18 +6244,28 @@ static void llm_load_hparams(
         ml.get_key(LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, vparams.n_intermediate, true);
         ml.get_key(LLM_KV_VISION_CLIP_HEAD_COUNT, vparams.n_head, true);
         ml.get_key(LLM_KV_VISION_CLIP_LAYERNORM_EPS, vparams.eps, true);
-        ml.get_key(LLM_KV_VISION_CLIP_PROJECTOR_TYPE, proj_type, true);
-        if (proj_type == "mlp") {
-            vparams.proj_type = CLIP_PROJECTOR_TYPE_MLP;
-        } else {
-            throw std::runtime_error(format("unsupported clip projector type: %s", proj_type.c_str()));
+        ml.get_key(LLM_KV_VISION_CLIP_SELECT_LAYER, vparams.select_layer, true);
+        {
+            std::string name;
+            ml.get_key(LLM_KV_VISION_CLIP_PROJECTOR_TYPE, name, true);
+            vparams.proj_type = projector_type_from_name(name);
+            if (vparams.proj_type == CLIP_PROJECTOR_TYPE_UNKNOWN) {
+                throw std::runtime_error(format("unsupported clip projector type: %s", name.c_str()));
+            }
         }
-        std::string arch;
-        ml.get_key(LLM_KV_VISION_CLIP_ARCHITECTURE, arch, true);
-        for (auto & it : VISION_ARCH_NAMES) {
-            if (arch == it.second) {
-                vparams.arch = it.first;
-                break;
+        {
+            std::string name;
+            ml.get_key(LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE, name, false);
+            vparams.mm_patch_merge_type = mm_patch_merge_from_name(name);
+        }
+        {
+            std::string arch;
+            ml.get_key(LLM_KV_VISION_CLIP_ARCHITECTURE, arch, true);
+            for (auto & it : VISION_ARCH_NAMES) {
+                if (arch == it.second) {
+                    vparams.arch = it.first;
+                    break;
+                }
             }
         }
     } else if (!vision_type.empty()) {

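One subtlety in the loader above: the patch merge key is read with required=false, so for a GGUF file that lacks vision.clip.patch_merge_type the name stays empty and mm_patch_merge_from_name returns MM_PATCH_MERGE_UNKNOWN, overwriting the MM_PATCH_MERGE_FLAT default in clip_hparams; files produced by the updated convert script always carry the key, so this appears to affect only older conversions.
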