From 54a9727399b84715499b277603e366690ffed301 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 5 Nov 2024 11:10:30 +0400 Subject: [PATCH] add minicpmv support (#972) --- optimum/exporters/openvino/model_configs.py | 271 +++++++++++ optimum/exporters/openvino/model_patcher.py | 212 ++++++++- optimum/exporters/openvino/utils.py | 2 +- .../openvino/modeling_visual_language.py | 429 +++++++++++++++++- tests/openvino/test_modeling.py | 92 +++- tests/openvino/utils_tests.py | 1 + 6 files changed, 977 insertions(+), 30 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index ace5c150d..108deed57 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -75,6 +75,8 @@ JaisModelPatcher, LlamaModelPatcher, LlavaImageEmbeddingModelPatcher, + MiniCPMVImageEmbeddingsModelPatcher, + MiniCPMVResamplerModelPatcher, MistralModelPatcher, MixtralModelPatcher, MPTModelPatcher, @@ -1738,3 +1740,272 @@ def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> ModelPatcher: return FluxTransfromerModelPatcher(self, model, model_kwargs=model_kwargs) + + +class DummyMiniCPMVImageInputGenerator(DummyVisionInputGenerator): + SUPPORTED_INPUT_NAMES = ("pixel_values", "patch_attention_mask", "position_ids") + + def __init__( + self, + task: str, + normalized_config: NormalizedVisionConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], + width: int = DEFAULT_DUMMY_SHAPES["width"], + height: int = DEFAULT_DUMMY_SHAPES["height"], + **kwargs, + ): + super().__init__(task, normalized_config, batch_size, num_channels, width, height) + self.patch_size = normalized_config.config.patch_size + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "pixel_values": + return self.random_float_tensor( + shape=[ + self.batch_size, + self.num_channels, + self.patch_size, + (self.height * self.width) // self.patch_size, + ], + framework=framework, + dtype=float_dtype, + ) + + if input_name == "patch_attention_mask": + return self.random_int_tensor( + shape=[self.batch_size, 1, (self.height // self.patch_size) * (self.width // self.patch_size)], + framework=framework, + dtype=float_dtype, + min_value=0, + max_value=2, + ) + + if input_name == "position_ids": + return self.random_int_tensor( + shape=[self.batch_size, (self.height // self.patch_size) * (self.width // self.patch_size)], + max_value=self.patch_size, + ) + + +class DummyMiniCPMVResampleInputGenerator(DummyVisionInputGenerator): + SUPPORTED_INPUT_NAMES = ("image_feature", "pos_embed", "key_padding_mask") + + def __init__( + self, + task: str, + normalized_config: NormalizedVisionConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], + width: int = DEFAULT_DUMMY_SHAPES["width"], + height: int = DEFAULT_DUMMY_SHAPES["height"], + **kwargs, + ): + super().__init__(task, normalized_config, batch_size, num_channels, width, height) + self.patch_size = normalized_config.config.patch_size + self.hidden_size = normalized_config.config.hidden_size + self.img_hidden_size = normalized_config.config.vision_config.hidden_size + self.feat_size = (normalized_config.config.vision_config.image_size // self.patch_size) * ( + normalized_config.config.vision_config.image_size // 
self.patch_size + ) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "image_feature": + return self.random_float_tensor( + shape=[self.batch_size, self.feat_size, self.img_hidden_size], framework=framework, dtype=float_dtype + ) + + if input_name == "key_padding_mask": + return self.constant_tensor( + shape=[self.batch_size, self.feat_size], + framework=framework, + value=1, + dtype=DTYPE_MAPPER.pt(float_dtype), + ) + + if input_name == "pos_embed": + return self.random_float_tensor(shape=[self.feat_size, self.batch_size, self.hidden_size]) + + +class MiniCPMVConfigBehavior(str, enum.Enum): + RESAMPLER = "resampler" + LANGUAGE = "language" + VISION_EMBEDDINGS = "vision_embeddings" + TEXT_EMBEDDINGS = "text_embeddings" + + +@register_in_tasks_manager("minicpmv", *["image-text-to-text"], library_name="transformers") +class MiniCPMVOpenVINOConfig(OnnxConfig): + SUPPORTED_BEHAVIORS = [model_type.value for model_type in MiniCPMVConfigBehavior] + NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig + DUMMY_INPUT_GENERATOR_CLASSES = () + + def __init__( + self, + config: "PretrainedConfig", + task: str = "feature-extraction", + int_dtype: str = "int64", + float_dtype: str = "fp32", + behavior: MiniCPMVConfigBehavior = MiniCPMVConfigBehavior.VISION_EMBEDDINGS, + preprocessors: Optional[List[Any]] = None, + ): + super().__init__( + config=config, + task=task, + int_dtype=int_dtype, + float_dtype=float_dtype, + preprocessors=preprocessors, + ) + self._behavior = behavior + self._orig_config = config + if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"): + self._config = config.vision_config + self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyMiniCPMVImageInputGenerator,) + if self._behavior == MiniCPMVConfigBehavior.RESAMPLER: + self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyMiniCPMVResampleInputGenerator,) + self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS: + return { + "pixel_values": {0: "batch_size", 2: "height", 3: "width"}, + "patch_attention_mask": {0: "batch_size", 1: "num_patches", 2: "patch_size"}, + "position_ids": {0: "batch_size", 1: "patch_size"}, + } + if self._behavior == MiniCPMVConfigBehavior.RESAMPLER: + return { + "image_feature": {0: "batch_size", 1: "patch_height", 2: "patch_width"}, + "pos_embed": {0: "patch_size", 1: "batch_size", 2: "num_patches"}, + "key_padding_mask": {0: "batch_size", 1: "patch_size"}, + } + return {} + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS: + return {"last_hidden_state": {0: "batch_size", 1: "patch_height", 2: "patch_width"}} + if self._behavior == MiniCPMVConfigBehavior.RESAMPLER: + return {"last_hidden_state": {0: "batch_size"}} + + return {} + + def with_behavior( + self, + behavior: Union[str, MiniCPMVConfigBehavior], + ): + """ + Creates a config for different behaviour. + Args: + behavior ([`ConfigBehavior`]): + The behavior to use for the new instance. 
+ """ + if isinstance(behavior, str) and not isinstance(behavior, MiniCPMVConfigBehavior): + behavior = MiniCPMVConfigBehavior(behavior) + + if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS: + model_type = "qwen2" + model_type = model_type.replace("_", "-") + if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: + raise ValueError( + f"Unsupported language model type provided `{model_type}`. Please define custom export config" + ) + + if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]: + raise ValueError( + f"Export config for text generation for `{model_type}` is not available. Please define custom export config" + ) + internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][ + "text-generation-with-past" + ] + internal_export_config = internal_export_config_class( + self._orig_config, + use_past=True, + use_past_in_inputs=True, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + ) + InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS + export_config = InputEmbedOpenvVINOConfig( + self._orig_config, + task="feature-extraction", + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + ) + return export_config + + if behavior == MiniCPMVConfigBehavior.LANGUAGE: + model_type = "qwen2" + model_type = model_type.replace("_", "-") + + if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: + raise ValueError( + f"Unsupported language model type provided `{model_type}`. Please define custom export config" + ) + + if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]: + raise ValueError( + f"Export config for text generation for `{model_type}` is not available. Please define custom export config" + ) + internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][ + "text-generation-with-past" + ] + internal_export_config = internal_export_config_class( + self._orig_config, + use_past=True, + use_past_in_inputs=True, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + ) + export_config = LMInputEmbedsConfigHelper(internal_export_config) + export_config._normalized_config = internal_export_config._normalized_config + return export_config + + if behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS: + return self.__class__( + self._orig_config, + task=self.task, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + behavior=behavior, + preprocessors=self._preprocessors, + ) + + if behavior == MiniCPMVConfigBehavior.RESAMPLER: + return self.__class__( + self._orig_config, + task=self.task, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + behavior=behavior, + preprocessors=self._preprocessors, + ) + + def get_model_for_behavior(self, model, behavior: Union[str, MiniCPMVConfigBehavior]): + if isinstance(behavior, str) and not isinstance(behavior, MiniCPMVConfigBehavior): + behavior = MiniCPMVConfigBehavior(behavior) + + if behavior == MiniCPMVConfigBehavior.LANGUAGE: + return model.llm + + if behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS: + return model.vpm + + if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS: + text_embedding = model.get_input_embeddings() + text_embedding.config = model.llm.config + return text_embedding + if behavior == MiniCPMVConfigBehavior.RESAMPLER: + model.resampler.config = model.vpm.config + return model.resampler + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], 
model_kwargs: Optional[Dict[str, Any]] = None + ): + model_kwargs = model_kwargs or {} + if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS: + return MiniCPMVImageEmbeddingsModelPatcher(self, model, model_kwargs) + + if self._behavior == MiniCPMVConfigBehavior.RESAMPLER: + return MiniCPMVResamplerModelPatcher(self, model, model_kwargs) + + return super().patch_model_for_export(model, model_kwargs) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 7e5cd76a7..b1aa7eaa9 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -21,7 +21,7 @@ import torch import torch.nn.functional as F -from transformers.modeling_outputs import BaseModelOutputWithPast +from transformers.modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling from transformers.utils import is_tf_available from optimum.exporters.onnx.model_patcher import DecoderModelPatcher, ModelPatcher, override_arguments @@ -2763,3 +2763,213 @@ def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) if hasattr(self._model.pos_embed, "_orig_forward"): self._model.pos_embed.forward = self._model.pos_embed._orig_forward + + +def _minicpmv_resampler_forward(self, image_feature, pos_embed, key_padding_mask): + bs = image_feature.shape[0] + image_feature = self.kv_proj(image_feature) # B * L * D + image_feature = self.ln_kv(image_feature).permute(1, 0, 2) # L * B * D + + q = self.ln_q(self.query) # Q * D + + q_bs = q.unsqueeze(1).repeat(1, bs, 1) + + out = self.attn(q_bs, image_feature + pos_embed, image_feature, key_padding_mask=key_padding_mask)[ + 0 + ] # Q * B * D # L * B * D + L * B * D + # out: Q * B * D + x = out.permute(1, 0, 2) # B * Q * D + + x = self.ln_post(x) + x = x @ self.proj + return x + + +def _minicpmv_siglip_vis_embed_forward( + self, + pixel_values: torch.FloatTensor, + patch_attention_mask: torch.BoolTensor, + tgt_sizes: Optional[torch.IntTensor] = None, + position_ids: Optional[torch.FloatTensor] = None, +) -> torch.Tensor: + patch_embeds = self.patch_embedding(pixel_values) + embeddings = patch_embeds.flatten(2).transpose(1, 2) + + if position_ids is None: + batch_size = pixel_values.size(0) + max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3) + max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size + boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side) + position_ids = torch.full( + size=( + batch_size, + max_nb_patches_h * max_nb_patches_w, + ), + fill_value=0, + ) + + for batch_idx, p_attn_mask in enumerate(patch_attention_mask): + if tgt_sizes is not None: + nb_patches_h = tgt_sizes[batch_idx][0] + nb_patches_w = tgt_sizes[batch_idx][1] + else: + nb_patches_h = p_attn_mask[:, 0].sum() + nb_patches_w = p_attn_mask[0].sum() + + fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h) + fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w) + + bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True) + bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True) + + pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten() + position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids + + position_ids = position_ids.to(self.position_embedding.weight.device) + + embeddings = embeddings + self.position_embedding(position_ids) + return embeddings + + +def 
_minicpmv_siglip_attn_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + batch_size, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, key_states, value_states, attention_mask, is_causal=attention_mask is None + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, None + + +def _minicpmv_siglip_transformer_forward( + self, + pixel_values, + patch_attention_mask: Optional[torch.BoolTensor] = None, + tgt_sizes: Optional[torch.IntTensor] = None, + position_ids: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[Tuple, BaseModelOutputWithPooling]: + from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size = pixel_values.size(0) + if patch_attention_mask is None: + patch_attention_mask = torch.ones( + size=( + batch_size, + pixel_values.size(2) // self.config.patch_size, + pixel_values.size(3) // self.config.patch_size, + ), + dtype=torch.bool, + device=pixel_values.device, + ) + + hidden_states = self.embeddings( + pixel_values=pixel_values, + patch_attention_mask=patch_attention_mask, + tgt_sizes=tgt_sizes, + position_ids=position_ids, + ) + + patch_attention_mask = patch_attention_mask.view(batch_size, -1) + attention_mask = ( + _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype) + if not self._use_flash_attention_2 + else patch_attention_mask + ) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + if not return_dict: + return (last_hidden_state, None) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=None, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class MiniCPMVResamplerModelPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Dict[str, Any], + ): + model.__orig_forward = model.forward + model.forward = types.MethodType(_minicpmv_resampler_forward, model) + + 
super().__init__(config, model, model_kwargs) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + + +class MiniCPMVImageEmbeddingsModelPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Dict[str, Any], + ): + model.__orig_forward = model.forward + model.forward = types.MethodType(_minicpmv_siglip_transformer_forward, model) + + super().__init__(config, model, model_kwargs) + + def __enter__(self): + super().__enter__() + self._model.embeddings._orig_forward = self._model.embeddings.forward + self._model.embeddings.forward = types.MethodType(_minicpmv_siglip_vis_embed_forward, self._model.embeddings) + + if is_torch_version(">=", "2.0.0"): + for layer in self._model.encoder.layers: + layer.self_attn._orig_forward = layer.self_attn.forward + layer.self_attn.forward = types.MethodType(_minicpmv_siglip_attn_forward, layer.self_attn) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + self._model.embeddings.forward = self._model.embeddings._orig_forward + if is_torch_version(">=", "2.0.0"): + for layer in self._model.encoder.layers: + layer.self_attn.forward = layer.self_attn._orig_forward diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 75106fc2b..35e0c3017 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -208,4 +208,4 @@ def get_submodels(model): return custom_export, fn_get_submodels -MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "internvl-chat"] +MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "internvl-chat", "minicpmv"] diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 141abeb87..b071602d9 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -171,6 +171,7 @@ def __init__(self, model: ov.Model, parent_model: OVBaseModel) -> None: super().__init__(model, parent_model, model_name=self._model_name) self.output_dtypes = {key.get_any_name(): key.get_element_type().get_type_name() for key in self.model.outputs} self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.outputs)} + self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)} self.hidden_states_output_names = [] if len(self.model.outputs) > 2: self.hidden_states_output_names = [ @@ -178,7 +179,12 @@ def __init__(self, model: ov.Model, parent_model: OVBaseModel) -> None: ] def forward(self, pixel_values, **kwargs): - result = self.request({"pixel_values": pixel_values}) + inputs = {"pixel_values": pixel_values} + if len(self.input_names) > 1: + for name in self.input_names: + if name in kwargs: + inputs[name] = kwargs[name] + result = self.request(inputs) last_hidden_state = result[0] hidden_states = None pooler_out = None @@ -193,7 +199,22 @@ def forward(self, pixel_values, **kwargs): ) -MODEL_PARTS_CLS_MAPPING = {} +class OVResampler(OVModelPart): + _model_name = "resampler" + + def __init__(self, model: ov.Model, parent_model: OVBaseModel) -> None: + super().__init__(model, parent_model, model_name=self._model_name) + self.output_dtypes = {key.get_any_name(): key.get_element_type().get_type_name() for key in 
self.model.outputs} + self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.outputs)} + + def forward(self, image_feature, pos_embed, key_padding_mask): + result = self.request( + {"image_feature": image_feature, "pos_embed": pos_embed, "key_padding_mask": key_padding_mask} + )[0] + return result + + +MODEL_PARTS_CLS_MAPPING = {"resampler": OVResampler} class OVModelForVisualCausalLM(OVBaseModel, GenerationMixin): @@ -513,7 +534,7 @@ def _from_transformers( ov_config=ov_config, stateful=stateful, ) - config = AutoConfig.from_pretrained(save_dir_path) + config = AutoConfig.from_pretrained(save_dir_path, trust_remote_code=trust_remote_code) return cls._from_pretrained( model_id=save_dir_path, config=config, @@ -553,6 +574,8 @@ def forward( image_sizes=None, attention_mask=None, position_ids=None, + image_bound=None, + tgt_sizes=None, **kwargs, ): inputs_embeds, attention_mask, position_ids = self.get_multimodal_embeddings( @@ -562,6 +585,8 @@ def forward( attention_mask=attention_mask, position_ids=position_ids, past_key_values=past_key_values, + image_bound=image_bound, + tgt_sizes=tgt_sizes, **kwargs, ) return self.language_model.forward( @@ -628,14 +653,14 @@ def prepare_inputs_for_generation( elif past_length < input_ids.shape[1]: input_ids = input_ids[:, past_length:] # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - elif self.config.image_token_index in input_ids: + elif getattr(self.config, "image_token_index", None) in input_ids: input_ids = input_ids[:, input_ids.shape[1] - 1 :] position_ids = kwargs.get("position_ids", None) if attention_mask is not None and position_ids is None: position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: + if past_key_values is not None: position_ids = position_ids[:, -input_ids.shape[1] :] # if `inputs_embeds` are passed, we only want to use them in the 1st generation step @@ -652,6 +677,8 @@ def prepare_inputs_for_generation( "attention_mask": attention_mask, "pixel_values": pixel_values, "image_sizes": image_sizes, + "image_bound": kwargs.get("image_bound"), + "tgt_sizes": kwargs.get("tgt_sizes"), } ) return model_inputs @@ -1123,8 +1150,400 @@ def merge_vision_text_embeddings( return input_embeds, attention_mask, position_ids +class _OVMiniCPMVForCausalLM(OVModelForVisualCausalLM): + additional_parts = ["resampler"] + + def __init__( + self, + language_model: ov.Model, + text_embeddings: ov.Model, + vision_embeddings: ov.Model, + config: PretrainedConfig = None, + device: str = "CPU", + dynamic_shapes: bool = True, + ov_config: Optional[Dict[str, str]] = None, + model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, + **kwargs, + ): + super().__init__( + language_model, + text_embeddings, + vision_embeddings, + config, + device, + dynamic_shapes, + ov_config, + model_save_dir, + quantization_config, + **kwargs, + ) + self.embed_dim = self.language_model.config.hidden_size + max_size = self.config.vision_config.image_size // self.config.vision_config.patch_size + self._pos_embeds = torch.from_numpy(self._get_2d_sincos_pos_embed(self.embed_dim, max_size)).float() + self.max_size = (max_size, max_size) + + def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs): + if input_ids is not None and input_ids.shape[1] == 1: + return None + tgt_sizes = kwargs["tgt_sizes"] + pixel_values_list = pixel_values 
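+        # NOTE: pixel_values comes from the MiniCPM-V processor as a list per batch item, each holding
+        # that sample's image slice tensors; they are flattened and padded below before being passed
+        # to the OpenVINO vision encoder.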
+ vision_hidden_states = [] + all_pixel_values = [] + img_cnt = [] + for pixel_value in pixel_values_list: + img_cnt.append(len(pixel_value)) + all_pixel_values.extend([i.flatten(end_dim=1).permute(1, 0) for i in pixel_value]) + + vision_embedding = None + # exist image + if all_pixel_values: + tgt_sizes = [tgt_size for tgt_size in tgt_sizes if isinstance(tgt_size, torch.Tensor)] + tgt_sizes = torch.vstack(tgt_sizes).type(torch.int32) + + max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1]) + + all_pixel_values = torch.nn.utils.rnn.pad_sequence(all_pixel_values, batch_first=True, padding_value=0.0) + B, L, _ = all_pixel_values.shape + all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L) + + patch_attn_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool) + for i in range(B): + patch_attn_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True + position_ids = self._prepare_vis_position_ids( + all_pixel_values, + patch_attn_mask, + tgt_sizes, + self.config.vision_config.patch_size, + self.config.vision_config.image_size // self.config.patch_size, + ) + vision_embedding = torch.from_numpy( + self.vision_embeddings( + pixel_values=all_pixel_values, patch_attention_mask=patch_attn_mask, position_ids=position_ids + )[0] + ) + vision_embedding = self.resampling(vision_embedding, tgt_sizes) + + start = 0 + for pixel_value in pixel_values_list: + img_cnt = len(pixel_value) + if img_cnt > 0: + vision_hidden_states.append(vision_embedding[start : start + img_cnt]) + start += img_cnt + else: + vision_hidden_states.append([]) + else: # no image + dummy_feature = [] + for _ in range(len(pixel_values_list)): + vision_hidden_states.append(dummy_feature) + return vision_hidden_states + + def resampling(self, x, tgt_sizes): + bs = x.shape[0] + + patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1] + + self._adjust_pos_cache(tgt_sizes) + + max_patch_len = torch.max(patch_len) + key_padding_mask = torch.zeros((bs, max_patch_len), dtype=torch.bool) + + pos_embed = [] + for i in range(bs): + tgt_h, tgt_w = tgt_sizes[i] + pos_embed.append(self._pos_embeds[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1))) # patches * D + key_padding_mask[i, patch_len[i] :] = True + + pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, batch_first=True, padding_value=0.0).permute( + 1, 0, 2 + ) # BLD => L * B * D + res = torch.from_numpy(self.resampler(image_feature=x, pos_embed=pos_embed, key_padding_mask=key_padding_mask)) + return res + + def _set_2d_pos_cache(self, max_size): + pos_embed = torch.from_numpy(self._get_2d_sincos_pos_embed(self.embed_dim, max_size)).float() + self._pos_embed = pos_embed + + def _adjust_pos_cache(self, tgt_sizes): + max_h = torch.max(tgt_sizes[:, 0]) + max_w = torch.max(tgt_sizes[:, 1]) + if max_h > self.max_size[0] or max_w > self.max_size[1]: + self.max_size = [max(max_h, self.max_size[0]), max(max_w, self.max_size[1])] + self._set_2d_pos_cache(self.max_size) + + def _get_2d_sincos_pos_embed(self, embed_dim, image_size): + """ + image_size: image_size or (image_height, image_width) + return: + pos_embed: [image_height, image_width, embed_dim] + """ + if isinstance(image_size, int): + grid_h_size, grid_w_size = image_size, image_size + else: + grid_h_size, grid_w_size = image_size[0], image_size[1] + + grid_h = np.arange(grid_h_size, dtype=np.float32) + grid_w = np.arange(grid_w_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + pos_embed = self._get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + return 
pos_embed + + def _get_2d_sincos_pos_embed_from_grid(self, embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = self._get_1d_sincos_pos_embed_from_grid_new(embed_dim // 2, grid[0]) # (H, W, D/2) + emb_w = self._get_1d_sincos_pos_embed_from_grid_new(embed_dim // 2, grid[1]) # (H, W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=-1) # (H, W, D) + return emb + + def _get_1d_sincos_pos_embed_from_grid_new(self, embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (H, W) + out: (H, W, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2.0 + omega = 1.0 / 10000**omega # (D/2,) + + out = np.einsum("hw,d->hwd", pos, omega) # (H, W, D/2), outer product + + emb_sin = np.sin(out) # (H, W, D/2) + emb_cos = np.cos(out) # (H, W, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=-1) # (H, W, D) + return emb + + def _prepare_vis_position_ids( + self, pixel_values, patch_attention_mask, tgt_sizes, patch_size, num_patches_per_side + ): + batch_size = pixel_values.size(0) + max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3) + max_nb_patches_h, max_nb_patches_w = max_im_h // patch_size, max_im_w // patch_size + boundaries = torch.arange(1 / num_patches_per_side, 1.0, 1 / num_patches_per_side) + position_ids = torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0) + + for batch_idx, p_attn_mask in enumerate(patch_attention_mask): + if tgt_sizes is not None: + nb_patches_h = tgt_sizes[batch_idx][0] + nb_patches_w = tgt_sizes[batch_idx][1] + else: + nb_patches_h = p_attn_mask[:, 0].sum() + nb_patches_w = p_attn_mask[0].sum() + + fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h) + fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w) + + bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True) + bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True) + + pos_ids = (bucket_coords_h[:, None] * num_patches_per_side + bucket_coords_w).flatten() + position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids + + return position_ids + + def merge_vision_text_embeddings( + self, vision_embeds, input_embeds, input_ids, attention_mask, position_ids=None, **kwargs + ): + bs = input_ids.shape[0] + image_bound = kwargs["image_bound"] + vllm_embedding = torch.from_numpy(input_embeds) + for i in range(bs): + cur_vs_hs = vision_embeds[i] + if len(cur_vs_hs) > 0: + cur_vllm_emb = vllm_embedding[i] + cur_image_bound = image_bound[i] + if len(cur_image_bound) > 0: + image_indices = torch.stack([torch.arange(r[0], r[1], dtype=torch.long) for r in cur_image_bound]) + + cur_vllm_emb.scatter_( + 0, + image_indices.view(-1, 1).repeat(1, cur_vllm_emb.shape[-1]), + cur_vs_hs.view(-1, cur_vs_hs.shape[-1]), + ) + return vllm_embedding, attention_mask, position_ids + + +class _OVNanoLlavaForCausalLM(OVModelForVisualCausalLM): + def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs): + if input_ids is not None and input_ids.shape[1] == 1: + return None + if isinstance(pixel_values, list) or pixel_values.ndim == 5: + concat_images = torch.cat(pixel_values, dim=0) if isinstance(pixel_values, list) else pixel_values + image_features = torch.from_numpy(self.vision_embeddings(concat_images).last_hidden_state) + split_sizes = [image.shape[0] for image in pixel_values] + image_features = torch.split(image_features, split_sizes, dim=0) + 
image_features = [x.flatten(0, 1).to(self.device) for x in image_features] + else: + image_features = self.vision_embeddings(pixel_values).last_hidden_state + + return image_features + + def get_multimodal_embeddings( + self, input_ids, pixel_values=None, attention_mask=None, position_ids=None, **kwargs + ): + vision_embeds = None + IGNORE_INDEX = -100 + IMAGE_TOKEN_INDEX = -200 + if pixel_values is not None: + vision_embeds = self.get_vision_embeddings(pixel_values, input_ids=input_ids, **kwargs) + if vision_embeds is None: + inputs_embeds = torch.from_numpy(self.get_text_embeddings(input_ids)) + past_len = self.language_model._get_past_length(kwargs.get("past_key_values")) + if attention_mask is not None and attention_mask.shape[1] < past_len + input_ids.shape[1]: + attention_mask = torch.cat( + [ + attention_mask, + torch.ones(attention_mask.shape[0], past_len + input_ids.shape[1] - attention_mask.shape[1]), + ], + dim=1, + ) + position_ids = None + return inputs_embeds, attention_mask, position_ids + + vision_embeds = torch.from_numpy(vision_embeds) if isinstance(vision_embeds, np.ndarray) else vision_embeds + + if attention_mask is None: + attention_mask = torch.ones_like(input_ids, dtype=torch.long) + if position_ids is None: + position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device) + labels = torch.full_like(input_ids, IGNORE_INDEX) + + # remove the padding using attention_mask -- TODO: double check + input_ids = [ + cur_input_ids[cur_attention_mask] + for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask.bool()) + ] + labels = [ + cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask.bool()) + ] + + new_input_embeds = [] + new_labels = [] + cur_image_idx = 0 + for batch_idx, cur_input_ids in enumerate(input_ids): + num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum() + if num_images == 0: + cur_image_features = vision_embeds[cur_image_idx] + cur_input_embeds_1 = torch.from_numpy(self.get_text_embeddings(cur_input_ids.unsqueeze(0))[0]) + cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0) + new_input_embeds.append(cur_input_embeds) + new_labels.append(labels[batch_idx]) + cur_image_idx += 1 + continue + + image_token_indices = ( + [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]] + ) + cur_input_ids_noim = [] + cur_labels = labels[batch_idx] + cur_labels_noim = [] + for i in range(len(image_token_indices) - 1): + cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1 : image_token_indices[i + 1]]) + cur_labels_noim.append(cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]]) + split_sizes = [x.shape[0] for x in cur_labels_noim] + cur_input_embeds = torch.from_numpy( + self.get_text_embeddings(torch.cat(cur_input_ids_noim).unsqueeze(0))[0] + ) + cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0) + cur_new_input_embeds = [] + cur_new_labels = [] + + for i in range(num_images + 1): + cur_new_input_embeds.append(cur_input_embeds_no_im[i]) + cur_new_labels.append(cur_labels_noim[i]) + if i < num_images: + cur_image_features = vision_embeds[cur_image_idx] + cur_image_idx += 1 + cur_new_input_embeds.append(cur_image_features) + cur_new_labels.append( + torch.full( + (cur_image_features.shape[0],), + IGNORE_INDEX, + device=cur_labels.device, + dtype=cur_labels.dtype, + ) + ) + + cur_new_input_embeds = torch.cat(cur_new_input_embeds) + cur_new_labels = 
torch.cat(cur_new_labels) + + new_input_embeds.append(cur_new_input_embeds) + new_labels.append(cur_new_labels) + + # Truncate sequences to max length as image embeddings can make the sequence longer + tokenizer_model_max_length = getattr(self.config, "tokenizer_model_max_length", None) + if tokenizer_model_max_length is not None: + new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds] + new_labels = [x[:tokenizer_model_max_length] for x in new_labels] + + # Combine them + max_len = max(x.shape[0] for x in new_input_embeds) + batch_size = len(new_input_embeds) + + new_input_embeds_padded = [] + new_labels_padded = torch.full( + (batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device + ) + attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device) + position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device) + + for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)): + cur_len = cur_new_embed.shape[0] + if getattr(self.config, "tokenizer_padding_side", "right") == "left": + new_input_embeds_padded.append( + torch.cat( + ( + torch.zeros( + (max_len - cur_len, cur_new_embed.shape[1]), + dtype=cur_new_embed.dtype, + device=cur_new_embed.device, + ), + cur_new_embed, + ), + dim=0, + ) + ) + if cur_len > 0: + new_labels_padded[i, -cur_len:] = cur_new_labels + attention_mask[i, -cur_len:] = True + position_ids[i, -cur_len:] = torch.arange( + 0, cur_len, dtype=position_ids.dtype, device=position_ids.device + ) + else: + new_input_embeds_padded.append( + torch.cat( + ( + cur_new_embed, + torch.zeros( + (max_len - cur_len, cur_new_embed.shape[1]), + dtype=cur_new_embed.dtype, + device=cur_new_embed.device, + ), + ), + dim=0, + ) + ) + if cur_len > 0: + new_labels_padded[i, :cur_len] = cur_new_labels + attention_mask[i, :cur_len] = True + position_ids[i, :cur_len] = torch.arange( + 0, cur_len, dtype=position_ids.dtype, device=position_ids.device + ) + + new_input_embeds = torch.stack(new_input_embeds_padded, dim=0) + + return new_input_embeds, attention_mask, position_ids + + MODEL_TYPE_TO_CLS_MAPPING = { "llava": _OVLlavaForCausalLM, "llava_next": _OVLlavaNextForCausalLM, "internvl_chat": _OvInternVLForCausalLM, + "minicpmv": _OVMiniCPMVForCausalLM, } diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 8b4258adf..0dcfaac71 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -50,6 +50,7 @@ AutoModelForSpeechSeq2Seq, AutoModelForTokenClassification, AutoModelForVision2Seq, + AutoProcessor, AutoTokenizer, GenerationConfig, Pix2StructForConditionalGeneration, @@ -1876,12 +1877,14 @@ def test_compare_with_and_without_past_key_values(self): class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = [ - "llava", - ] + SUPPORTED_ARCHITECTURES = ["llava"] + + REMOTE_CODE_MODELS = ["minicpmv"] if is_transformers_version(">=", "4.40.0"): SUPPORTED_ARCHITECTURES += ["llava_next"] + if is_transformers_version(">=", "4.45.0"): + SUPPORTED_ARCHITECTURES += ["minicpmv"] TASK = "image-text-to-text" IMAGE = Image.open( @@ -1900,19 +1903,50 @@ def get_transformer_model_class(self, model_arch): from transformers import LlavaNextForConditionalGeneration return LlavaNextForConditionalGeneration - return None + return AutoModelForCausalLM + + def gen_inputs(self, model_arch, base_text_prompt, image=None): + model_id = 
MODEL_NAMES[model_arch]
+        if "llava" in model_arch:
+            prompt = f"<image>\n {base_text_prompt}"
+        elif "minicpmv" in model_arch:
+            prompt = f"<|im_start|>user\n(<image>./</image>)\n {base_text_prompt}<|im_end|>\n<|im_start|>assistant\n"
+        if model_arch != "nanollava":
+            processor = AutoProcessor.from_pretrained(
+                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            inputs = processor(images=[self.IMAGE.resize((600, 600))], text=[prompt], return_tensors="pt")
+        else:
+            config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
+            processor = AutoProcessor.from_pretrained(
+                config.mm_vision_tower, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            image_input = None
+            if image is not None:
+                image_input = processor(images=image, return_tensors="pt")["pixel_values"]
+            text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
+
+            input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
+            attention_mask = torch.ones_like(input_ids, dtype=torch.int64)
+            inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "images": image_input}
+        return inputs
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_compare_to_transformers(self, model_arch):
-        prompt = "<image>\n What is shown in this image?"
         model_id = MODEL_NAMES[model_arch]
-        processor = get_preprocessor(model_id)
-        transformers_model = self.get_transformer_model_class(model_arch).from_pretrained(model_id)
-        inputs = processor(images=self.IMAGE, text=prompt, return_tensors="pt")
-        set_seed(SEED)
-        with torch.no_grad():
-            transformers_outputs = transformers_model(**inputs)
-        ov_model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)
+        transformers_model = self.get_transformer_model_class(model_arch).from_pretrained(
+            model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+        )
+        if "nanollava" in model_arch:
+            transformers_model.get_vision_tower().load_model()
+        inputs = self.gen_inputs(model_arch, "What is shown on this image?", self.IMAGE)
+
+        ov_model = OVModelForVisualCausalLM.from_pretrained(
+            model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+        )
         self.assertIsInstance(ov_model, MODEL_TYPE_TO_CLS_MAPPING[ov_model.config.model_type])
         self.assertIsInstance(ov_model.vision_embeddings, OVVisionEmbedding)
         self.assertIsInstance(ov_model.language_model, OVModelWithEmbedForCausalLM)
@@ -1920,8 +1954,13 @@ def test_compare_to_transformers(self, model_arch):
         self.assertTrue(hasattr(ov_model, additional_part))
         self.assertIsInstance(getattr(ov_model, additional_part), MODEL_PARTS_CLS_MAPPING[additional_part])
         self.assertIsInstance(ov_model.config, PretrainedConfig)
-        ov_outputs = ov_model(**inputs)
-        self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4))
+        # PyTorch minicpmv is not designed to be used via forward
+        if "minicpmv" not in model_arch:
+            set_seed(SEED)
+            with torch.no_grad():
+                transformers_outputs = transformers_model(**inputs)
+            ov_outputs = ov_model(**inputs)
+            self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4))
 
         ov_model.generation_config.eos_token_id = None
         transformers_model.generation_config.eos_token_id = None
@@ -1930,7 +1969,6 @@ def test_compare_to_transformers(self, model_arch):
         gen_config = GenerationConfig(
             max_new_tokens=30,
             min_new_tokens=30,
-            num_beams=3,
             do_sample=False,
             eos_token_id=None,
         )
@@ -1938,6 +1976,9 @@ def test_compare_to_transformers(self, model_arch):
         ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
         set_seed(SEED)
         transformers_outputs = transformers_model.generate(**inputs, generation_config=gen_config)
+        # original minicpmv always skips input tokens in generation results, while the transformers-based approach keeps them
+        if model_arch == "minicpmv":
+            ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
         self.assertTrue(
             torch.equal(ov_outputs, transformers_outputs),
             f"generation config : {gen_config}, transformers output {transformers_outputs}, ov_model output {ov_outputs}",
@@ -1951,20 +1992,25 @@ def test_compare_to_transformers(self, model_arch):
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_generate_utils(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
-        model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)
-        preprocessor = get_preprocessor(model_id)
-        question = "<image>\nDescribe image"
-        inputs = preprocessor(images=self.IMAGE, text=question, return_tensors="pt")
-
+        model = OVModelForVisualCausalLM.from_pretrained(
+            model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
+        inputs = self.gen_inputs(model_arch, "What is shown on this image?", self.IMAGE)
         # General case
         outputs = model.generate(**inputs, max_new_tokens=10)
-        outputs = preprocessor.batch_decode(outputs, skip_special_tokens=True)
+        # filter out the original prompt because it may contain tokens unknown to the tokenizer, e.g. the nanollava text separator -200
+        outputs = outputs[:, inputs["input_ids"].shape[1] :]
+        outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
         self.assertIsInstance(outputs[0], str)
+
         # No input image case
         question = "Hi, how are you?"
-        inputs = preprocessor(images=None, text=question, return_tensors="pt")
+        inputs = self.gen_inputs(model_arch, question, None)
         outputs = model.generate(**inputs, max_new_tokens=10)
-        outputs = preprocessor.batch_decode(outputs, skip_special_tokens=True)
+        # filter out the original prompt because it may contain tokens unknown to the tokenizer, e.g. the nanollava text separator -200
+        outputs = outputs[:, inputs["input_ids"].shape[1] :]
+        outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
         self.assertIsInstance(outputs[0], str)
 
         del model
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index e5a9f73a6..ec0ca3981 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -84,6 +84,7 @@
     "marian": "sshleifer/tiny-marian-en-de",
     "mbart": "hf-internal-testing/tiny-random-mbart",
     "minicpm": "katuni4ka/tiny-random-minicpm",
+    "minicpmv": "katuni4ka/tiny-random-minicpmv-2_6",
     "mistral": "echarlaix/tiny-random-mistral",
     "mistral-nemo": "katuni4ka/tiny-random-mistral-nemo",
     "mixtral": "TitanML/tiny-mixtral",
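
For reference, the snippet below sketches how the MiniCPM-V support added by this patch can be exercised end to end; it mirrors the calls used in the tests above. The checkpoint name (openbmb/MiniCPM-V-2_6), the example image URL, and the chat-prompt string are illustrative assumptions, not part of this patch.

import requests
from PIL import Image
from transformers import AutoProcessor, AutoTokenizer

from optimum.intel import OVModelForVisualCausalLM

# Assumed public checkpoint; the tests above use the tiny clone "katuni4ka/tiny-random-minicpmv-2_6".
model_id = "openbmb/MiniCPM-V-2_6"

# export=True converts the language model, text embeddings, vision encoder and resampler to OpenVINO IR.
model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
prompt = "<|im_start|>user\n(<image>./</image>)\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\n"
inputs = processor(images=[image], text=[prompt], return_tensors="pt")

generated = model.generate(**inputs, max_new_tokens=50)
# generate() returns prompt + completion tokens here, so drop the prompt before decoding.
generated = generated[:, inputs["input_ids"].shape[1] :]
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])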