diff --git a/examples/xgenmm/model_breakdown.ipynb b/examples/xgenmm/model_breakdown.ipynb new file mode 100644 index 000000000..395cff39c --- /dev/null +++ b/examples/xgenmm/model_breakdown.ipynb @@ -0,0 +1,492 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from open_flamingo import create_model_and_transforms\n", + "from omegaconf import OmegaConf\n", + "import torch" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'image_aspect_ratio': 'anyres', 'num_vision_tokens': 128, 'anyres_patch_sampling': True}\n" + ] + } + ], + "source": [ + "cfg = dict(\n", + " model_family = 'kosmos',\n", + " lm_path = 'microsoft/Phi-3-mini-4k-instruct',\n", + " # vision_encoder_path = 'ViT-H-14-378-quickgelu',\n", + " # vision_encoder_pretrained = 'dfn5b',\n", + " vision_encoder_path = 'google/siglip-so400m-patch14-384',\n", + " vision_encoder_pretrained = 'google',\n", + " num_vision_tokens = 128,\n", + " image_aspect_ratio = 'anyres',\n", + " anyres_patch_sampling = True,\n", + " anyres_grids=[[1,2],[2,1],[2,2],[3,1],[1,3]],\n", + " ckpt_pth = '/export/share/manli_shu/models/open-flamingo-dev/anyres_ablation_HFSiglip_patch128-kosmos_non_instruct-phi3_4k_instruct_nq128_pre_V3_5-llava_1p6_ocrmathmix_v4-8x8-ckpt2/checkpoint_0.pt',\n", + ")\n", + "cfg = OmegaConf.create(cfg)\n", + "if cfg.model_family in ['kosmos-instruct', 'kosmos', 'llava']:\n", + " additional_kwargs = {\n", + " \"image_aspect_ratio\": cfg.image_aspect_ratio,\n", + " }\n", + " if cfg.model_family in ['kosmos-instruct', 'kosmos']:\n", + " additional_kwargs.update({\n", + " \"num_vision_tokens\": cfg.num_vision_tokens,\n", + " \"anyres_patch_sampling\": cfg.anyres_patch_sampling,\n", + " })\n", + "print(additional_kwargs)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", + "`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.\n", + "Current `flash-attention` does not support `window_size`.
Either upgrade or use `attn_implementation='eager'`.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "beeb7d9b79894d648f7aa4d27654cd76", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/2 [00:00" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from transformers.models.siglip.modeling_siglip import SiglipVisionTransformer, SiglipVisionConfig\n", + "import json\n", + "import torch\n", + "with open('/export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/vision_encoder/config.json', 'r') as f:\n", + " vision_config = json.load(f)\n", + "vision_config = SiglipVisionConfig(**vision_config)\n", + "vision_encoder = SiglipVisionTransformer(vision_config)\n", + "vit_ckpt = torch.load('/export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/vision_encoder/xgenmm.vision_encoder')\n", + "vision_encoder.load_state_dict(vit_ckpt, strict=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "state_dict = vision_encoder.state_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "odict_keys(['embeddings.patch_embedding.weight', 'embeddings.patch_embedding.bias', 'embeddings.position_embedding.weight', 'encoder.layers.0.self_attn.k_proj.weight', 'encoder.layers.0.self_attn.k_proj.bias', 'encoder.layers.0.self_attn.v_proj.weight', 'encoder.layers.0.self_attn.v_proj.bias', 'encoder.layers.0.self_attn.q_proj.weight', 'encoder.layers.0.self_attn.q_proj.bias', 'encoder.layers.0.self_attn.out_proj.weight', 'encoder.layers.0.self_attn.out_proj.bias', 'encoder.layers.0.layer_norm1.weight', 'encoder.layers.0.layer_norm1.bias', 'encoder.layers.0.mlp.fc1.weight', 'encoder.layers.0.mlp.fc1.bias', 'encoder.layers.0.mlp.fc2.weight', 'encoder.layers.0.mlp.fc2.bias', 'encoder.layers.0.layer_norm2.weight', 'encoder.layers.0.layer_norm2.bias', 'encoder.layers.1.self_attn.k_proj.weight', 'encoder.layers.1.self_attn.k_proj.bias', 'encoder.layers.1.self_attn.v_proj.weight', 'encoder.layers.1.self_attn.v_proj.bias', 'encoder.layers.1.self_attn.q_proj.weight', 'encoder.layers.1.self_attn.q_proj.bias', 'encoder.layers.1.self_attn.out_proj.weight', 'encoder.layers.1.self_attn.out_proj.bias', 'encoder.layers.1.layer_norm1.weight', 'encoder.layers.1.layer_norm1.bias', 'encoder.layers.1.mlp.fc1.weight', 'encoder.layers.1.mlp.fc1.bias', 'encoder.layers.1.mlp.fc2.weight', 'encoder.layers.1.mlp.fc2.bias', 'encoder.layers.1.layer_norm2.weight', 'encoder.layers.1.layer_norm2.bias', 'encoder.layers.2.self_attn.k_proj.weight', 'encoder.layers.2.self_attn.k_proj.bias', 'encoder.layers.2.self_attn.v_proj.weight', 'encoder.layers.2.self_attn.v_proj.bias', 'encoder.layers.2.self_attn.q_proj.weight', 'encoder.layers.2.self_attn.q_proj.bias', 'encoder.layers.2.self_attn.out_proj.weight', 'encoder.layers.2.self_attn.out_proj.bias', 'encoder.layers.2.layer_norm1.weight', 'encoder.layers.2.layer_norm1.bias', 'encoder.layers.2.mlp.fc1.weight', 'encoder.layers.2.mlp.fc1.bias', 'encoder.layers.2.mlp.fc2.weight', 'encoder.layers.2.mlp.fc2.bias', 'encoder.layers.2.layer_norm2.weight', 'encoder.layers.2.layer_norm2.bias', 'encoder.layers.3.self_attn.k_proj.weight', 'encoder.layers.3.self_attn.k_proj.bias', 'encoder.layers.3.self_attn.v_proj.weight', 'encoder.layers.3.self_attn.v_proj.bias', 
'encoder.layers.3.self_attn.q_proj.weight', 'encoder.layers.3.self_attn.q_proj.bias', 'encoder.layers.3.self_attn.out_proj.weight', 'encoder.layers.3.self_attn.out_proj.bias', 'encoder.layers.3.layer_norm1.weight', 'encoder.layers.3.layer_norm1.bias', 'encoder.layers.3.mlp.fc1.weight', 'encoder.layers.3.mlp.fc1.bias', 'encoder.layers.3.mlp.fc2.weight', 'encoder.layers.3.mlp.fc2.bias', 'encoder.layers.3.layer_norm2.weight', 'encoder.layers.3.layer_norm2.bias', 'encoder.layers.4.self_attn.k_proj.weight', 'encoder.layers.4.self_attn.k_proj.bias', 'encoder.layers.4.self_attn.v_proj.weight', 'encoder.layers.4.self_attn.v_proj.bias', 'encoder.layers.4.self_attn.q_proj.weight', 'encoder.layers.4.self_attn.q_proj.bias', 'encoder.layers.4.self_attn.out_proj.weight', 'encoder.layers.4.self_attn.out_proj.bias', 'encoder.layers.4.layer_norm1.weight', 'encoder.layers.4.layer_norm1.bias', 'encoder.layers.4.mlp.fc1.weight', 'encoder.layers.4.mlp.fc1.bias', 'encoder.layers.4.mlp.fc2.weight', 'encoder.layers.4.mlp.fc2.bias', 'encoder.layers.4.layer_norm2.weight', 'encoder.layers.4.layer_norm2.bias', 'encoder.layers.5.self_attn.k_proj.weight', 'encoder.layers.5.self_attn.k_proj.bias', 'encoder.layers.5.self_attn.v_proj.weight', 'encoder.layers.5.self_attn.v_proj.bias', 'encoder.layers.5.self_attn.q_proj.weight', 'encoder.layers.5.self_attn.q_proj.bias', 'encoder.layers.5.self_attn.out_proj.weight', 'encoder.layers.5.self_attn.out_proj.bias', 'encoder.layers.5.layer_norm1.weight', 'encoder.layers.5.layer_norm1.bias', 'encoder.layers.5.mlp.fc1.weight', 'encoder.layers.5.mlp.fc1.bias', 'encoder.layers.5.mlp.fc2.weight', 'encoder.layers.5.mlp.fc2.bias', 'encoder.layers.5.layer_norm2.weight', 'encoder.layers.5.layer_norm2.bias', 'encoder.layers.6.self_attn.k_proj.weight', 'encoder.layers.6.self_attn.k_proj.bias', 'encoder.layers.6.self_attn.v_proj.weight', 'encoder.layers.6.self_attn.v_proj.bias', 'encoder.layers.6.self_attn.q_proj.weight', 'encoder.layers.6.self_attn.q_proj.bias', 'encoder.layers.6.self_attn.out_proj.weight', 'encoder.layers.6.self_attn.out_proj.bias', 'encoder.layers.6.layer_norm1.weight', 'encoder.layers.6.layer_norm1.bias', 'encoder.layers.6.mlp.fc1.weight', 'encoder.layers.6.mlp.fc1.bias', 'encoder.layers.6.mlp.fc2.weight', 'encoder.layers.6.mlp.fc2.bias', 'encoder.layers.6.layer_norm2.weight', 'encoder.layers.6.layer_norm2.bias', 'encoder.layers.7.self_attn.k_proj.weight', 'encoder.layers.7.self_attn.k_proj.bias', 'encoder.layers.7.self_attn.v_proj.weight', 'encoder.layers.7.self_attn.v_proj.bias', 'encoder.layers.7.self_attn.q_proj.weight', 'encoder.layers.7.self_attn.q_proj.bias', 'encoder.layers.7.self_attn.out_proj.weight', 'encoder.layers.7.self_attn.out_proj.bias', 'encoder.layers.7.layer_norm1.weight', 'encoder.layers.7.layer_norm1.bias', 'encoder.layers.7.mlp.fc1.weight', 'encoder.layers.7.mlp.fc1.bias', 'encoder.layers.7.mlp.fc2.weight', 'encoder.layers.7.mlp.fc2.bias', 'encoder.layers.7.layer_norm2.weight', 'encoder.layers.7.layer_norm2.bias', 'encoder.layers.8.self_attn.k_proj.weight', 'encoder.layers.8.self_attn.k_proj.bias', 'encoder.layers.8.self_attn.v_proj.weight', 'encoder.layers.8.self_attn.v_proj.bias', 'encoder.layers.8.self_attn.q_proj.weight', 'encoder.layers.8.self_attn.q_proj.bias', 'encoder.layers.8.self_attn.out_proj.weight', 'encoder.layers.8.self_attn.out_proj.bias', 'encoder.layers.8.layer_norm1.weight', 'encoder.layers.8.layer_norm1.bias', 'encoder.layers.8.mlp.fc1.weight', 'encoder.layers.8.mlp.fc1.bias', 'encoder.layers.8.mlp.fc2.weight', 
'encoder.layers.8.mlp.fc2.bias', 'encoder.layers.8.layer_norm2.weight', 'encoder.layers.8.layer_norm2.bias', 'encoder.layers.9.self_attn.k_proj.weight', 'encoder.layers.9.self_attn.k_proj.bias', 'encoder.layers.9.self_attn.v_proj.weight', 'encoder.layers.9.self_attn.v_proj.bias', 'encoder.layers.9.self_attn.q_proj.weight', 'encoder.layers.9.self_attn.q_proj.bias', 'encoder.layers.9.self_attn.out_proj.weight', 'encoder.layers.9.self_attn.out_proj.bias', 'encoder.layers.9.layer_norm1.weight', 'encoder.layers.9.layer_norm1.bias', 'encoder.layers.9.mlp.fc1.weight', 'encoder.layers.9.mlp.fc1.bias', 'encoder.layers.9.mlp.fc2.weight', 'encoder.layers.9.mlp.fc2.bias', 'encoder.layers.9.layer_norm2.weight', 'encoder.layers.9.layer_norm2.bias', 'encoder.layers.10.self_attn.k_proj.weight', 'encoder.layers.10.self_attn.k_proj.bias', 'encoder.layers.10.self_attn.v_proj.weight', 'encoder.layers.10.self_attn.v_proj.bias', 'encoder.layers.10.self_attn.q_proj.weight', 'encoder.layers.10.self_attn.q_proj.bias', 'encoder.layers.10.self_attn.out_proj.weight', 'encoder.layers.10.self_attn.out_proj.bias', 'encoder.layers.10.layer_norm1.weight', 'encoder.layers.10.layer_norm1.bias', 'encoder.layers.10.mlp.fc1.weight', 'encoder.layers.10.mlp.fc1.bias', 'encoder.layers.10.mlp.fc2.weight', 'encoder.layers.10.mlp.fc2.bias', 'encoder.layers.10.layer_norm2.weight', 'encoder.layers.10.layer_norm2.bias', 'encoder.layers.11.self_attn.k_proj.weight', 'encoder.layers.11.self_attn.k_proj.bias', 'encoder.layers.11.self_attn.v_proj.weight', 'encoder.layers.11.self_attn.v_proj.bias', 'encoder.layers.11.self_attn.q_proj.weight', 'encoder.layers.11.self_attn.q_proj.bias', 'encoder.layers.11.self_attn.out_proj.weight', 'encoder.layers.11.self_attn.out_proj.bias', 'encoder.layers.11.layer_norm1.weight', 'encoder.layers.11.layer_norm1.bias', 'encoder.layers.11.mlp.fc1.weight', 'encoder.layers.11.mlp.fc1.bias', 'encoder.layers.11.mlp.fc2.weight', 'encoder.layers.11.mlp.fc2.bias', 'encoder.layers.11.layer_norm2.weight', 'encoder.layers.11.layer_norm2.bias', 'encoder.layers.12.self_attn.k_proj.weight', 'encoder.layers.12.self_attn.k_proj.bias', 'encoder.layers.12.self_attn.v_proj.weight', 'encoder.layers.12.self_attn.v_proj.bias', 'encoder.layers.12.self_attn.q_proj.weight', 'encoder.layers.12.self_attn.q_proj.bias', 'encoder.layers.12.self_attn.out_proj.weight', 'encoder.layers.12.self_attn.out_proj.bias', 'encoder.layers.12.layer_norm1.weight', 'encoder.layers.12.layer_norm1.bias', 'encoder.layers.12.mlp.fc1.weight', 'encoder.layers.12.mlp.fc1.bias', 'encoder.layers.12.mlp.fc2.weight', 'encoder.layers.12.mlp.fc2.bias', 'encoder.layers.12.layer_norm2.weight', 'encoder.layers.12.layer_norm2.bias', 'encoder.layers.13.self_attn.k_proj.weight', 'encoder.layers.13.self_attn.k_proj.bias', 'encoder.layers.13.self_attn.v_proj.weight', 'encoder.layers.13.self_attn.v_proj.bias', 'encoder.layers.13.self_attn.q_proj.weight', 'encoder.layers.13.self_attn.q_proj.bias', 'encoder.layers.13.self_attn.out_proj.weight', 'encoder.layers.13.self_attn.out_proj.bias', 'encoder.layers.13.layer_norm1.weight', 'encoder.layers.13.layer_norm1.bias', 'encoder.layers.13.mlp.fc1.weight', 'encoder.layers.13.mlp.fc1.bias', 'encoder.layers.13.mlp.fc2.weight', 'encoder.layers.13.mlp.fc2.bias', 'encoder.layers.13.layer_norm2.weight', 'encoder.layers.13.layer_norm2.bias', 'encoder.layers.14.self_attn.k_proj.weight', 'encoder.layers.14.self_attn.k_proj.bias', 'encoder.layers.14.self_attn.v_proj.weight', 'encoder.layers.14.self_attn.v_proj.bias', 
'encoder.layers.14.self_attn.q_proj.weight', 'encoder.layers.14.self_attn.q_proj.bias', 'encoder.layers.14.self_attn.out_proj.weight', 'encoder.layers.14.self_attn.out_proj.bias', 'encoder.layers.14.layer_norm1.weight', 'encoder.layers.14.layer_norm1.bias', 'encoder.layers.14.mlp.fc1.weight', 'encoder.layers.14.mlp.fc1.bias', 'encoder.layers.14.mlp.fc2.weight', 'encoder.layers.14.mlp.fc2.bias', 'encoder.layers.14.layer_norm2.weight', 'encoder.layers.14.layer_norm2.bias', 'encoder.layers.15.self_attn.k_proj.weight', 'encoder.layers.15.self_attn.k_proj.bias', 'encoder.layers.15.self_attn.v_proj.weight', 'encoder.layers.15.self_attn.v_proj.bias', 'encoder.layers.15.self_attn.q_proj.weight', 'encoder.layers.15.self_attn.q_proj.bias', 'encoder.layers.15.self_attn.out_proj.weight', 'encoder.layers.15.self_attn.out_proj.bias', 'encoder.layers.15.layer_norm1.weight', 'encoder.layers.15.layer_norm1.bias', 'encoder.layers.15.mlp.fc1.weight', 'encoder.layers.15.mlp.fc1.bias', 'encoder.layers.15.mlp.fc2.weight', 'encoder.layers.15.mlp.fc2.bias', 'encoder.layers.15.layer_norm2.weight', 'encoder.layers.15.layer_norm2.bias', 'encoder.layers.16.self_attn.k_proj.weight', 'encoder.layers.16.self_attn.k_proj.bias', 'encoder.layers.16.self_attn.v_proj.weight', 'encoder.layers.16.self_attn.v_proj.bias', 'encoder.layers.16.self_attn.q_proj.weight', 'encoder.layers.16.self_attn.q_proj.bias', 'encoder.layers.16.self_attn.out_proj.weight', 'encoder.layers.16.self_attn.out_proj.bias', 'encoder.layers.16.layer_norm1.weight', 'encoder.layers.16.layer_norm1.bias', 'encoder.layers.16.mlp.fc1.weight', 'encoder.layers.16.mlp.fc1.bias', 'encoder.layers.16.mlp.fc2.weight', 'encoder.layers.16.mlp.fc2.bias', 'encoder.layers.16.layer_norm2.weight', 'encoder.layers.16.layer_norm2.bias', 'encoder.layers.17.self_attn.k_proj.weight', 'encoder.layers.17.self_attn.k_proj.bias', 'encoder.layers.17.self_attn.v_proj.weight', 'encoder.layers.17.self_attn.v_proj.bias', 'encoder.layers.17.self_attn.q_proj.weight', 'encoder.layers.17.self_attn.q_proj.bias', 'encoder.layers.17.self_attn.out_proj.weight', 'encoder.layers.17.self_attn.out_proj.bias', 'encoder.layers.17.layer_norm1.weight', 'encoder.layers.17.layer_norm1.bias', 'encoder.layers.17.mlp.fc1.weight', 'encoder.layers.17.mlp.fc1.bias', 'encoder.layers.17.mlp.fc2.weight', 'encoder.layers.17.mlp.fc2.bias', 'encoder.layers.17.layer_norm2.weight', 'encoder.layers.17.layer_norm2.bias', 'encoder.layers.18.self_attn.k_proj.weight', 'encoder.layers.18.self_attn.k_proj.bias', 'encoder.layers.18.self_attn.v_proj.weight', 'encoder.layers.18.self_attn.v_proj.bias', 'encoder.layers.18.self_attn.q_proj.weight', 'encoder.layers.18.self_attn.q_proj.bias', 'encoder.layers.18.self_attn.out_proj.weight', 'encoder.layers.18.self_attn.out_proj.bias', 'encoder.layers.18.layer_norm1.weight', 'encoder.layers.18.layer_norm1.bias', 'encoder.layers.18.mlp.fc1.weight', 'encoder.layers.18.mlp.fc1.bias', 'encoder.layers.18.mlp.fc2.weight', 'encoder.layers.18.mlp.fc2.bias', 'encoder.layers.18.layer_norm2.weight', 'encoder.layers.18.layer_norm2.bias', 'encoder.layers.19.self_attn.k_proj.weight', 'encoder.layers.19.self_attn.k_proj.bias', 'encoder.layers.19.self_attn.v_proj.weight', 'encoder.layers.19.self_attn.v_proj.bias', 'encoder.layers.19.self_attn.q_proj.weight', 'encoder.layers.19.self_attn.q_proj.bias', 'encoder.layers.19.self_attn.out_proj.weight', 'encoder.layers.19.self_attn.out_proj.bias', 'encoder.layers.19.layer_norm1.weight', 'encoder.layers.19.layer_norm1.bias', 'encoder.layers.19.mlp.fc1.weight', 
'encoder.layers.19.mlp.fc1.bias', 'encoder.layers.19.mlp.fc2.weight', 'encoder.layers.19.mlp.fc2.bias', 'encoder.layers.19.layer_norm2.weight', 'encoder.layers.19.layer_norm2.bias', 'encoder.layers.20.self_attn.k_proj.weight', 'encoder.layers.20.self_attn.k_proj.bias', 'encoder.layers.20.self_attn.v_proj.weight', 'encoder.layers.20.self_attn.v_proj.bias', 'encoder.layers.20.self_attn.q_proj.weight', 'encoder.layers.20.self_attn.q_proj.bias', 'encoder.layers.20.self_attn.out_proj.weight', 'encoder.layers.20.self_attn.out_proj.bias', 'encoder.layers.20.layer_norm1.weight', 'encoder.layers.20.layer_norm1.bias', 'encoder.layers.20.mlp.fc1.weight', 'encoder.layers.20.mlp.fc1.bias', 'encoder.layers.20.mlp.fc2.weight', 'encoder.layers.20.mlp.fc2.bias', 'encoder.layers.20.layer_norm2.weight', 'encoder.layers.20.layer_norm2.bias', 'encoder.layers.21.self_attn.k_proj.weight', 'encoder.layers.21.self_attn.k_proj.bias', 'encoder.layers.21.self_attn.v_proj.weight', 'encoder.layers.21.self_attn.v_proj.bias', 'encoder.layers.21.self_attn.q_proj.weight', 'encoder.layers.21.self_attn.q_proj.bias', 'encoder.layers.21.self_attn.out_proj.weight', 'encoder.layers.21.self_attn.out_proj.bias', 'encoder.layers.21.layer_norm1.weight', 'encoder.layers.21.layer_norm1.bias', 'encoder.layers.21.mlp.fc1.weight', 'encoder.layers.21.mlp.fc1.bias', 'encoder.layers.21.mlp.fc2.weight', 'encoder.layers.21.mlp.fc2.bias', 'encoder.layers.21.layer_norm2.weight', 'encoder.layers.21.layer_norm2.bias', 'encoder.layers.22.self_attn.k_proj.weight', 'encoder.layers.22.self_attn.k_proj.bias', 'encoder.layers.22.self_attn.v_proj.weight', 'encoder.layers.22.self_attn.v_proj.bias', 'encoder.layers.22.self_attn.q_proj.weight', 'encoder.layers.22.self_attn.q_proj.bias', 'encoder.layers.22.self_attn.out_proj.weight', 'encoder.layers.22.self_attn.out_proj.bias', 'encoder.layers.22.layer_norm1.weight', 'encoder.layers.22.layer_norm1.bias', 'encoder.layers.22.mlp.fc1.weight', 'encoder.layers.22.mlp.fc1.bias', 'encoder.layers.22.mlp.fc2.weight', 'encoder.layers.22.mlp.fc2.bias', 'encoder.layers.22.layer_norm2.weight', 'encoder.layers.22.layer_norm2.bias', 'encoder.layers.23.self_attn.k_proj.weight', 'encoder.layers.23.self_attn.k_proj.bias', 'encoder.layers.23.self_attn.v_proj.weight', 'encoder.layers.23.self_attn.v_proj.bias', 'encoder.layers.23.self_attn.q_proj.weight', 'encoder.layers.23.self_attn.q_proj.bias', 'encoder.layers.23.self_attn.out_proj.weight', 'encoder.layers.23.self_attn.out_proj.bias', 'encoder.layers.23.layer_norm1.weight', 'encoder.layers.23.layer_norm1.bias', 'encoder.layers.23.mlp.fc1.weight', 'encoder.layers.23.mlp.fc1.bias', 'encoder.layers.23.mlp.fc2.weight', 'encoder.layers.23.mlp.fc2.bias', 'encoder.layers.23.layer_norm2.weight', 'encoder.layers.23.layer_norm2.bias', 'encoder.layers.24.self_attn.k_proj.weight', 'encoder.layers.24.self_attn.k_proj.bias', 'encoder.layers.24.self_attn.v_proj.weight', 'encoder.layers.24.self_attn.v_proj.bias', 'encoder.layers.24.self_attn.q_proj.weight', 'encoder.layers.24.self_attn.q_proj.bias', 'encoder.layers.24.self_attn.out_proj.weight', 'encoder.layers.24.self_attn.out_proj.bias', 'encoder.layers.24.layer_norm1.weight', 'encoder.layers.24.layer_norm1.bias', 'encoder.layers.24.mlp.fc1.weight', 'encoder.layers.24.mlp.fc1.bias', 'encoder.layers.24.mlp.fc2.weight', 'encoder.layers.24.mlp.fc2.bias', 'encoder.layers.24.layer_norm2.weight', 'encoder.layers.24.layer_norm2.bias', 'encoder.layers.25.self_attn.k_proj.weight', 'encoder.layers.25.self_attn.k_proj.bias', 
'encoder.layers.25.self_attn.v_proj.weight', 'encoder.layers.25.self_attn.v_proj.bias', 'encoder.layers.25.self_attn.q_proj.weight', 'encoder.layers.25.self_attn.q_proj.bias', 'encoder.layers.25.self_attn.out_proj.weight', 'encoder.layers.25.self_attn.out_proj.bias', 'encoder.layers.25.layer_norm1.weight', 'encoder.layers.25.layer_norm1.bias', 'encoder.layers.25.mlp.fc1.weight', 'encoder.layers.25.mlp.fc1.bias', 'encoder.layers.25.mlp.fc2.weight', 'encoder.layers.25.mlp.fc2.bias', 'encoder.layers.25.layer_norm2.weight', 'encoder.layers.25.layer_norm2.bias', 'encoder.layers.26.self_attn.k_proj.weight', 'encoder.layers.26.self_attn.k_proj.bias', 'encoder.layers.26.self_attn.v_proj.weight', 'encoder.layers.26.self_attn.v_proj.bias', 'encoder.layers.26.self_attn.q_proj.weight', 'encoder.layers.26.self_attn.q_proj.bias', 'encoder.layers.26.self_attn.out_proj.weight', 'encoder.layers.26.self_attn.out_proj.bias', 'encoder.layers.26.layer_norm1.weight', 'encoder.layers.26.layer_norm1.bias', 'encoder.layers.26.mlp.fc1.weight', 'encoder.layers.26.mlp.fc1.bias', 'encoder.layers.26.mlp.fc2.weight', 'encoder.layers.26.mlp.fc2.bias', 'encoder.layers.26.layer_norm2.weight', 'encoder.layers.26.layer_norm2.bias', 'post_layernorm.weight', 'post_layernorm.bias', 'head.probe', 'head.attention.in_proj_weight', 'head.attention.in_proj_bias', 'head.attention.out_proj.weight', 'head.attention.out_proj.bias', 'head.layernorm.weight', 'head.layernorm.bias', 'head.mlp.fc1.weight', 'head.mlp.fc1.bias', 'head.mlp.fc2.weight', 'head.mlp.fc2.bias'])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "state_dict.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([729, 1152])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "state_dict['embeddings.position_embedding.weight'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([1, 729, 1152])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "state_dict['embeddings.position_embedding.weight'].unsqueeze(0).shape" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "projector = torch.load('/export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/xgenmm.projector')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "latents\n", + "projection.weight\n", + "projection.bias\n", + "layers.0.0.norm_media.weight\n", + "layers.0.0.norm_media.bias\n", + "layers.0.0.norm_latents.weight\n", + "layers.0.0.norm_latents.bias\n", + "layers.0.0.to_q.weight\n", + "layers.0.0.to_kv.weight\n", + "layers.0.0.to_out.weight\n", + "layers.0.1.0.weight\n", + "layers.0.1.0.bias\n", + "layers.0.1.1.weight\n", + "layers.0.1.3.weight\n", + "layers.1.0.norm_media.weight\n", + "layers.1.0.norm_media.bias\n", + "layers.1.0.norm_latents.weight\n", + "layers.1.0.norm_latents.bias\n", + "layers.1.0.to_q.weight\n", + "layers.1.0.to_kv.weight\n", + "layers.1.0.to_out.weight\n", + "layers.1.1.0.weight\n", + "layers.1.1.0.bias\n", + "layers.1.1.1.weight\n", + "layers.1.1.3.weight\n", + "layers.2.0.norm_media.weight\n", + "layers.2.0.norm_media.bias\n", + 
"layers.2.0.norm_latents.weight\n", + "layers.2.0.norm_latents.bias\n", + "layers.2.0.to_q.weight\n", + "layers.2.0.to_kv.weight\n", + "layers.2.0.to_out.weight\n", + "layers.2.1.0.weight\n", + "layers.2.1.0.bias\n", + "layers.2.1.1.weight\n", + "layers.2.1.3.weight\n", + "layers.3.0.norm_media.weight\n", + "layers.3.0.norm_media.bias\n", + "layers.3.0.norm_latents.weight\n", + "layers.3.0.norm_latents.bias\n", + "layers.3.0.to_q.weight\n", + "layers.3.0.to_kv.weight\n", + "layers.3.0.to_out.weight\n", + "layers.3.1.0.weight\n", + "layers.3.1.0.bias\n", + "layers.3.1.1.weight\n", + "layers.3.1.3.weight\n", + "layers.4.0.norm_media.weight\n", + "layers.4.0.norm_media.bias\n", + "layers.4.0.norm_latents.weight\n", + "layers.4.0.norm_latents.bias\n", + "layers.4.0.to_q.weight\n", + "layers.4.0.to_kv.weight\n", + "layers.4.0.to_out.weight\n", + "layers.4.1.0.weight\n", + "layers.4.1.0.bias\n", + "layers.4.1.1.weight\n", + "layers.4.1.3.weight\n", + "layers.5.0.norm_media.weight\n", + "layers.5.0.norm_media.bias\n", + "layers.5.0.norm_latents.weight\n", + "layers.5.0.norm_latents.bias\n", + "layers.5.0.to_q.weight\n", + "layers.5.0.to_kv.weight\n", + "layers.5.0.to_out.weight\n", + "layers.5.1.0.weight\n", + "layers.5.1.0.bias\n", + "layers.5.1.1.weight\n", + "layers.5.1.3.weight\n", + "norm.weight\n", + "norm.bias\n" + ] + } + ], + "source": [ + "for k in projector.keys():\n", + " print(k)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "to_q = projector['layers.0.0.to_q.weight']\n", + "to_kv= projector['layers.0.0.to_kv.weight']\n", + "to_o = projector['layers.0.0.to_out.weight']" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(torch.Size([1536, 1152]), torch.Size([3072, 1152]), torch.Size([1152, 1536]))" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "to_q.shape, to_kv.shape, to_o.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'attn.q.weight'" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import re\n", + "re.sub(\"attn.in_proj_\", \"attn.q.\", 'attn.in_proj_weight')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "on_device", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py b/examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py index 9e5889f21..a550a9f8e 100644 --- a/examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py +++ b/examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py @@ -143,7 +143,7 @@ if __name__ == "__main__": elif args.xgenmm_projector is not None: fname_middle = "mmproj-" has_text_encoder = False - has_xgenmm_projector = False + has_xgenmm_projector = True elif args.vision_only: fname_middle = "vision-" has_text_encoder = False @@ -189,9 +189,13 @@ if __name__ == "__main__": 
fout.add_uint32("clip.vision.projection_dim", 0) fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vision_config["num_attention_heads"]) fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), vision_config["layer_norm_eps"]) - block_count = vision_config["num_hidden_layers"] - 1 if has_xgenmm_projector else vision_config["num_hidden_layers"] + # TODO: check this as it might cause bugs + # original llava implementation: + # block_count = vision_config["num_hidden_layers"] - 1 if has_xgenmm_projector else vision_config["num_hidden_layers"] + # we are different from llava-1.6, which used the second-to-last layer's hidden states as the image features. + block_count = vision_config["num_hidden_layers"] fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) - + print(KEY_BLOCK_COUNT) # xgenmm use anyres with grids configuration # 1*2, 2*1, 2*2, 3*1, 1*3, the same as the llava1.6, we just hard code it here image_grid_pinpoints = [336, 672, 672, 336, 672, 672, 1008, 336, 336, 1008] @@ -206,6 +210,20 @@ if __name__ == "__main__": # TODO: need to check; vision_config["hidden_act"] is gelu_pytorch_tanh use_gelu = "gelu" in vision_config["hidden_act"].lower() fout.add_bool("clip.use_gelu", use_gelu) + + if has_xgenmm_projector: + projector = torch.load(args.xgenmm_projector) + fout.add_uint32("clip.projector.input_dim", projector["input_dim"]) + fout.add_uint32("clip.projector.output_dim", projector["output_dim"]) + fout.add_uint32("clip.projector.num_heads", projector["num_heads"]) + fout.add_uint32("clip.projector.num_layers", projector["num_layers"]) + fout.add_uint32("clip.projector.hidden_dim", projector["hidden_dim"]) + fout.add_float32("clip.projector.dropout", projector["dropout"]) + fout.add_string("clip.projector.activation", projector["activation"]) + fout.add_string("clip.projector.norm", projector["norm"]) + fout.add_string("clip.projector.pooling", projector["pooling"]) + fout.add_string("clip.projector.pooling_norm", projector["pooling_norm"]) + fout.add_string("clip.projector.pooling_activation", projector["pooling_activation"]) fout.write_header_to_file() fout.write_kv_data_to_file()