Avoid dropping last image encoder layer in llava models
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
parent 65935431b4
commit d85580c41c
1 changed file with 9 additions and 7 deletions
@@ -162,7 +162,7 @@ if args.use_f32:
 
 if args.clip_model_is_siglip:
     model = SiglipVisionModel.from_pretrained(dir_model)
-    processor = None # TODO - optionally handle processor to correctly extract image stats etc
+    processor = None
 elif args.clip_model_is_vision or args.clip_model_is_openclip:
     model = CLIPVisionModel.from_pretrained(dir_model)
     processor = None
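A note on the TODO dropped above: for SigLIP towers the script keeps processor = None. Below is a minimal sketch of what that TODO hinted at, assuming the model directory contains a preprocessor_config.json; SiglipImageProcessor and its default stats come from transformers, not from this diff:

from transformers import SiglipImageProcessor

dir_model = "./siglip-model-dir"  # placeholder, stands in for the script's dir_model
processor = SiglipImageProcessor.from_pretrained(dir_model)

# image stats a converter could record instead of hard-coding them
image_mean = processor.image_mean  # SigLIP defaults to [0.5, 0.5, 0.5]
image_std = processor.image_std    # likewise [0.5, 0.5, 0.5]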
@@ -228,10 +228,12 @@ if has_text_encoder:
     fout.add_token_list(tokens)
 
 if has_vision_encoder:
-    # vision feature layer may be an integer or an array.
-    # TODO - it seems like llama cpp may not handle this correctly
-    # normally; check if HF llava next models can run through this converter...
-    if "vision_feature_layer" in v_hparams:
+    ## FIXME Need to pull this out of the overall model config, not just the top one?
+    # TODO or document that vision_feature_layer can be set here, but it's usually in the
+    # llava config and not the vision config itself;
+    # Handle vision feature layers in transformers, where features may be taken
+    # from layers that are not the last. NOTE - these values can be unsigned...
+    if "vision_feature_layer" in config:
         feature_layers = v_hparams["vision_feature_layer"]
         if isinstance(feature_layers, int):
             feature_layers = [feature_layers]
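The comments added above describe the intent: vision_feature_layer usually lives in the top-level llava config rather than the vision config, it can be an int or a list, and its values can be negative. A minimal sketch of that normalization, assuming HF's convention that hidden_states holds the embedding output plus one entry per encoder block; the helper name and the -2 default are assumptions, not code from this repo:

def normalize_feature_layers(config: dict, v_hparams: dict, n_layers: int) -> list[int]:
    """Return vision feature layers as non-negative hidden-state indices."""
    layers = config.get("vision_feature_layer",
                        v_hparams.get("vision_feature_layer", -2))
    if isinstance(layers, int):
        layers = [layers]
    # hidden_states has n_layers + 1 entries (embeddings + one per encoder block),
    # so a negative index l maps to (n_layers + 1) + l
    return [l if l >= 0 else n_layers + 1 + l for l in layers]

# e.g. a 24-block CLIP tower with vision_feature_layer = [-2, -1] -> [23, 24];
# index 24 is only reachable if the last encoder block is not dropped.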
@@ -251,7 +253,7 @@ if has_vision_encoder:
     fout.add_uint32("clip.vision.projection_dim", visual_projection_dim)
     fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
     fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
-    block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
+    block_count = v_hparams["num_hidden_layers"] #- 1 if has_llava_projector else v_hparams["num_hidden_layers"]
     fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
     # /**
     #     "image_grid_pinpoints": [
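A quick worked example of the block_count change above, using the 24 encoder blocks of a CLIP ViT-L/14 vision tower as an assumed size:

num_hidden_layers = 24   # assumed: CLIP ViT-L/14 vision tower
has_llava_projector = True

old_block_count = num_hidden_layers - 1 if has_llava_projector else num_hidden_layers  # 23
new_block_count = num_hidden_layers                                                    # 24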
@@ -319,7 +321,7 @@ fout.add_bool("clip.use_gelu", use_gelu)
 
 
 if has_llava_projector:
-    model.vision_model.encoder.layers.pop(-1)
+    # model.vision_model.encoder.layers.pop(-1)
     projector = torch.load(args.llava_projector)
     for name, data in projector.items():
         name = get_tensor_name(name)
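The pop() commented out above is how the converter used to discard the last encoder block whenever a llava projector was present. Below is a sketch of the idea this commit moves toward, keeping every block and selecting the wanted hidden state at extraction time; the checkpoint name and dummy input are placeholders, and only the standard transformers API is assumed:

import torch
from transformers import CLIPVisionModel

model = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14")
pixel_values = torch.zeros(1, 3, 224, 224)  # dummy image batch

with torch.no_grad():
    out = model(pixel_values=pixel_values, output_hidden_states=True)

# hidden_states[0] is the embedding output and hidden_states[-1] the last block,
# so the penultimate (-2) and final (-1) features both stay available.
penultimate_features = out.hidden_states[-2]
last_features = out.hidden_states[-1]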