diff --git a/examples/xgenmm/convert.sh b/examples/xgenmm/convert.sh
index a2829a5d3..c8fe8ba50 100644
--- a/examples/xgenmm/convert.sh
+++ b/examples/xgenmm/convert.sh
@@ -1,16 +1,48 @@
 source /export/share/yutong/miniconda3/bin/activate
 conda activate xgenmm-flamingo
-# which python
-# # step 1: surgery
-# python xgenmm_surgery.py
+which python
 
-# # step 2: convert vit + projector to gguf
+# ======= siglip_kosmos_phi3_4k_instruct =======
 
-# python xgenmm_convert_image_encoder_to_gguf.py \
-#     --surgery_dir /export/share/yutong/xgenmm/llamacpp_wd \
-#     --output_dirname gguf_test \
-#     --version siglip_kosmos_phi3_4k_instruct \
-#     --use_f32
+# # # step 1: surgery
+# # python xgenmm_surgery.py
+
+# # # step 2: convert vit + projector to gguf
+
+# # python xgenmm_convert_image_encoder_to_gguf.py \
+# #     --surgery_dir /export/share/yutong/xgenmm/llamacpp_wd \
+# #     --output_dirname gguf \
+# #     --version siglip_kosmos_phi3_4k_instruct \
+# #     --use_f32
+
+# # step 3: convert llm to gguf
+# # https://github.com/ggerganov/llama.cpp/discussions/7927
+# cd ../../
+# # HF_TOKEN=
+# # downloads the tokenizer models of the specified models from Huggingface; generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
+# # python convert_hf_to_gguf_update.py $HF_TOKEN
+
+
+# LLM_PATH=/export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/llm
+# outtype=f32
+# LLM_OUTPUT_FILE=/export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf/phi3_mini_4k_instruct_$outtype.gguf
+# echo $LLM_OUTPUT_FILE
+# python convert_hf_to_gguf.py $LLM_PATH --outfile $LLM_OUTPUT_FILE --outtype $outtype
+
+
+# ======= siglip_kosmos_phi3_4k_instruct_bf16_patch128 =======
+
+CKPT_PATH=/export/share/manli_shu/models/open-flamingo-dev/fixed_offset-bf16-maxlen2048-newsamplerv1-anyres_patch128-kosmos_non_instruct-phi3_4k_instruct_nq128_pre_V3_6-SFT_v3.6.1.v2-mantis-mix-v0.3.5-continue-8x16-ckpt0/checkpoint_0.pt
+VERSION=siglip_kosmos_phi3_4k_instruct_bf16_patch128
+SAVE_PATH=/export/share/yutong/xgenmm/llamacpp_wd
+# step 1: surgery
+python xgenmm_surgery.py --ckpt_pth $CKPT_PATH --save_pth $SAVE_PATH --version $VERSION
+# step 2: convert vit + projector to gguf
+python xgenmm_convert_image_encoder_to_gguf.py \
+    --surgery_dir $SAVE_PATH\
+    --output_dirname gguf \
+    --version $VERSION \
+    --use_f32
 
 # step 3: convert llm to gguf
 # https://github.com/ggerganov/llama.cpp/discussions/7927
@@ -19,9 +51,8 @@ cd ../../
 # downloads the tokenizer models of the specified models from Huggingface; generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
 # python convert_hf_to_gguf_update.py $HF_TOKEN
 
-
-LLM_PATH=/export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/llm
-outtype=f32
-LLM_OUTPUT_FILE=/export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf/phi3_mini_4k_instruct_$outtype.gguf
-echo $LLM_OUTPUT_FILE
-python convert_hf_to_gguf.py $LLM_PATH --outfile $LLM_OUTPUT_FILE --outtype $outtype
\ No newline at end of file
+# go to llm folder and nano config.json change vocab_size to 32064
+LLM_PATH=$SAVE_PATH/$VERSION/llm
+OUTTYPE=f16
+LLM_OUTPUT_FILE=$SAVE_PATH/$VERSION/gguf/phi3_mini_4k_instruct_$OUTTYPE.gguf
+python convert_hf_to_gguf.py $LLM_PATH --outfile $LLM_OUTPUT_FILE --outtype $OUTTYPE
\ No newline at end of file
diff --git a/examples/xgenmm/run_cli.sh b/examples/xgenmm/run_cli.sh
index dcbee799c..5da2bd376 100644
--- a/examples/xgenmm/run_cli.sh
+++ b/examples/xgenmm/run_cli.sh
@@ -41,17 +41,24 @@ make xgenmm-cli
 # Q="What is card holder's name?"
 # Q="What is the transaction date?"
 # Q="What is the phone number of this resturant?"
-Q="Who is the attendant?"
-# Q="Who is the cashier?"
+# Q="Who is the attendant?"
+Q="Who is the cashier?"
 # Q="Briefly describe this image."
 prompt="<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n\n $Q<|end|>\n<|assistant|>\n"
 
 echo $prompt
-model=/export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf/phi3_mini_4k_instruct_f32.gguf
-# model=/export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf/phi3_mini_4k_instruct_f16.gguf
+# base_path=/export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf
+# # model=$base_path/phi3_mini_4k_instruct_f32.gguf
+# model=$base_path/phi3_mini_4k_instruct_f16.gguf
+# mmproj=$base_path/mmproj-model-f32.gguf
+
+base_path=/export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct_bf16_patch128/gguf
+model=$base_path/phi3_mini_4k_instruct_f16.gguf
+mmproj=$base_path/mmproj-model-f32.gguf
+
 ./xgenmm-cli --model $model\
-    --mmproj /export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf \
+    --mmproj $mmproj \
     --image /export/home/llama.cpp/examples/xgenmm/imgs/receipt.jpg\
     --prompt "$prompt" \
     --seed 42 --ctx-size 4096 --predict 1024 \
-    --temp 0.8 --verbose-prompt --color --ubatch-size 1280
\ No newline at end of file
+    --temp 0.0 --verbose-prompt --color --ubatch-size 1280
\ No newline at end of file
diff --git a/xgenmm-cli b/xgenmm-cli
index 2d1a1e430..ef1407df1 100755
Binary files a/xgenmm-cli and b/xgenmm-cli differ