diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 3010b70ec..342042ffb 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1484,7 +1484,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
         clip_image_f32_batch batch;
         batch.size = 1;
-        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false); 
+        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
         ggml_gallocr_reserve(new_clip->compute_alloc, gf);
         size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
         LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
diff --git a/examples/llava/minicpmv-convert/minicpmv2_6-convert-image-encoder-to-gguf.py b/examples/llava/minicpmv-convert/minicpmv2_6-convert-image-encoder-to-gguf.py
index fbb425851..b84c0056f 100644
--- a/examples/llava/minicpmv-convert/minicpmv2_6-convert-image-encoder-to-gguf.py
+++ b/examples/llava/minicpmv-convert/minicpmv2_6-convert-image-encoder-to-gguf.py
@@ -110,24 +110,6 @@ class SiglipVisionConfig(PretrainedConfig):
         self.layer_norm_eps = layer_norm_eps
         self.hidden_act = hidden_act
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
-        cls._set_token_in_kwargs(kwargs)
-
-        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
-        # get the vision config dict if we are loading from SiglipConfig
-        if config_dict.get("model_type") == "siglip":
-            config_dict = config_dict["vision_config"]
-
-        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
-            logger.warning(
-                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
-                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
-            )
-
-        return cls.from_dict(config_dict, **kwargs)
-
 
 _CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
 
 SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
@@ -140,7 +122,7 @@ def _get_unpad_data(attention_mask):
     seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
     indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
     max_seqlen_in_batch = seqlens_in_batch.max().item()
-    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
     return (
         indices,
         cu_seqlens,
@@ -199,7 +181,7 @@ def _trunc_normal_(tensor, mean, std, a, b):
 
 def trunc_normal_tf_(
     tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
-) -> torch.Tensor:
+):
     """Fills the input Tensor with values drawn from a truncated normal distribution.
     The values are effectively drawn from the normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
     with values outside :math:`[a, b]` redrawn until they are within the bounds.
@@ -223,6 +205,7 @@ def trunc_normal_tf_(
 
 def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
     fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
+    denom = fan_in
     if mode == "fan_in":
         denom = fan_in
     elif mode == "fan_out":
@@ -253,34 +236,6 @@ def lecun_normal_(tensor):
 
 def default_flax_embed_init(tensor):
     variance_scaling_(tensor, mode="fan_in", distribution="normal")
-
-@dataclass
-# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Siglip
-class SiglipVisionModelOutput(ModelOutput):
-    """
-    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
-    Args:
-        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
-            The image embeddings obtained by applying the projection layer to the pooler_output.
-        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-    """
-
-    image_embeds: Optional[torch.FloatTensor] = None
-    last_hidden_state: torch.FloatTensor = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
-
-
 class SiglipVisionEmbeddings(nn.Module):
     def __init__(self, config: SiglipVisionConfig):
         super().__init__()
diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt
index dfe5fbe62..cbcbf26c9 100644
--- a/examples/llava/requirements.txt
+++ b/examples/llava/requirements.txt
@@ -2,4 +2,4 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 pillow~=10.2.0
 torch~=2.2.1
-torchvision==0.17.1
+torchvision~=0.17.1
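Note on the `_get_unpad_data` hunk: the corrected line builds the cumulative sequence-length offsets (`cu_seqlens`) that variable-length attention kernels consume. A standalone sketch of what the fixed `dtype=torch.int32` cumsum produces, using a made-up padding mask (the mask values below are illustrative, not from the patch):

    import torch
    import torch.nn.functional as F

    # Hypothetical batch of 3 sequences with true lengths 2, 4, 1
    # (1 = real token, 0 = padding).
    attention_mask = torch.tensor([[1, 1, 0, 0],
                                   [1, 1, 1, 1],
                                   [1, 0, 0, 0]])

    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)  # tensor([2, 4, 1])
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    print(cu_seqlens)  # tensor([0, 2, 6, 7], dtype=torch.int32): start offset of each sequence

The old spelling `torch.torch.int32` happened to resolve (the `torch` module re-exports itself as `torch.torch`), so this fix is cosmetic rather than behavioral.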
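Note on the `variance_scaling_` hunk: the added `denom = fan_in` gives the if/elif chain a default, so a `mode` string that matches no branch can no longer leave `denom` unbound when the division runs. A minimal sketch of the resulting behavior, with a simplified fan computation and plain normal sampling (names and shapes here are illustrative, not the patched file):

    import torch

    def variance_scaling_sketch(tensor: torch.Tensor, scale: float = 1.0, mode: str = "fan_in") -> torch.Tensor:
        fan_out, fan_in = tensor.shape  # simplified fans for a 2-D weight
        denom = fan_in                  # default, mirroring the patch
        if mode == "fan_in":
            denom = fan_in
        elif mode == "fan_out":
            denom = fan_out
        elif mode == "fan_avg":
            denom = (fan_in + fan_out) / 2
        variance = scale / denom
        with torch.no_grad():
            return tensor.normal_(std=variance ** 0.5)

    # Before the patch a typo such as mode="fan-avg" raised UnboundLocalError
    # at `scale / denom`; with the default it falls back to fan_in scaling.
    variance_scaling_sketch(torch.empty(256, 128), mode="fan-avg")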