Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
110 commits
Select commit Hold shift + click to select a range
b3bfa35
Add Ovis2 model and processor implementation
thisisiron Mar 28, 2025
51c9efd
Apply style fixes
thisisiron Mar 28, 2025
9891508
Add unit tests for Ovis2 image processing and processor
thisisiron Mar 29, 2025
fde1b2a
Refactor image processing functions for clarity and efficiency
thisisiron Mar 29, 2025
6b0e5d4
Add Ovis2 ImageProcessorFast
thisisiron Mar 30, 2025
6b8ae7e
Refactor Ovis2 code
thisisiron Mar 31, 2025
91f72b2
Refactor Ovis2 model components and update processor functionality
thisisiron Mar 31, 2025
aacbab3
Fix repo consistency issues for Ovis2: docstring, config cleanup
thisisiron Mar 31, 2025
7305a22
Update Ovis2 model integration tests
thisisiron Mar 31, 2025
355a91c
Update Ovis2 configuration and processing classes for improved docume…
thisisiron Mar 31, 2025
ac232e0
Remove duplicate entry for 'ovis2' in VLM_CLASS_NAMES
thisisiron Apr 1, 2025
16d71f8
Fix conflict
thisisiron Apr 1, 2025
a7b5094
Fix import order
thisisiron Apr 1, 2025
4d56043
Update image processor class names
thisisiron Apr 4, 2025
7f1cbc0
Update Ovis2 model structure
thisisiron May 5, 2025
a4e37e6
Refactor Ovis2 configuration
thisisiron May 5, 2025
11a2a09
Merge remote-tracking branch 'upstream/main' into add-ovis2
thisisiron May 5, 2025
5999659
Fix typos
thisisiron May 5, 2025
f66426c
Refactor Ovis2 model classes and remove unused code
thisisiron May 5, 2025
ae1ea0d
Fix typos
thisisiron May 5, 2025
4e540b5
Refactor Ovis2 model initialization
thisisiron May 5, 2025
83a7cca
Fiix typos
thisisiron May 5, 2025
234edb2
Merge branch 'main' into add-ovis2
thisisiron May 29, 2025
db59777
Remove Ovis2 model mapping from MODEL_MAPPING_NAMES in modeling_auto.py
thisisiron May 29, 2025
b604a70
Add license and update type hints
thisisiron May 29, 2025
f26717d
Refactor token function and update docstring handling
thisisiron May 29, 2025
890abdc
Add license
thisisiron May 30, 2025
97e84a4
Merge branch 'main' into add-ovis2
thisisiron May 30, 2025
67a45ab
Merge branch 'main' into add-ovis2
thisisiron May 31, 2025
764e74f
Merge branch 'main' into add-ovis2
thisisiron Jun 2, 2025
178fc10
Add Ovis2 model support and update documentation
thisisiron Jun 23, 2025
2e278a4
Refactor Ovis2 model structure and enhance multimodal capabilities
thisisiron Jun 23, 2025
17afef9
Update Ovis2 weight mapping for consistency and clarity in key patterns
thisisiron Jun 23, 2025
1a87ab3
Remove unused 'grids' parameter from Ovis2 model and Update processin…
thisisiron Jun 23, 2025
f3c498e
Refactor Ovis2 model test structure to include Ovis2Model
thisisiron Jun 23, 2025
ec0ffd5
Merge branch 'main' into add-ovis2
thisisiron Jun 23, 2025
0f418e8
Add optional disable_grouping param to Ovis2ImageProcessorFast
thisisiron Jun 23, 2025
afd50aa
Refactor type hints in Ovis2 modules
thisisiron Jun 23, 2025
bdbcb22
Add licensing information in Ovis2 modules and tests
thisisiron Jun 25, 2025
cd369a6
Refactor Ovis2 model by removing unused methods
thisisiron Jun 25, 2025
b459f50
Refactor Ovis2 model tests by renaming test classes and removing skip…
thisisiron Jun 25, 2025
4ae2f70
Merge branch 'main' into add-ovis2
thisisiron Jun 25, 2025
57abe35
Refactor Ovis2 model output classes
thisisiron Jun 25, 2025
541dc7f
Refactor Ovis2 weight conversion and Update model embedding classes
thisisiron Jun 28, 2025
5e7846c
Merge branch 'main' into add-ovis2
thisisiron Jun 28, 2025
d13eaea
Refactor Ovis2 model imports and remove unused functions
thisisiron Jun 28, 2025
a10e3db
Enhance vision configuration extraction in Ovis2 weight conversion
thisisiron Jun 28, 2025
0501e0f
Refactor Ovis2 model's forward method to remove interpolation option
thisisiron Jun 28, 2025
c19231f
Update Ovis2 model documentation
thisisiron Jun 28, 2025
6083141
Merge branch 'main' into add-ovis2
thisisiron Jul 3, 2025
c27bf25
Refactor Ovis2 model input handling and tokenizer configuration
thisisiron Jul 4, 2025
58c0c0a
Merge branch 'main' into add-ovis2
thisisiron Jul 4, 2025
94fd529
Update return type hints in Ovis2 model
thisisiron Jul 4, 2025
8402244
Merge branch 'main' into add-ovis2
thisisiron Jul 4, 2025
2cd3837
Remove commented-out code
thisisiron Jul 4, 2025
1a5f6a9
fix config for tests and remove key mappings
Cyrilvallez Jul 8, 2025
e919722
Update tokenizer configuration to use add_special_tokens method
thisisiron Jul 8, 2025
2de5a94
Merge branch 'main' into add-ovis2
thisisiron Jul 8, 2025
e7e2464
Merge branch 'add-ovis2' of https://github.com/thisisiron/transformer…
thisisiron Jul 8, 2025
d9a8599
skip torchscript
Cyrilvallez Jul 8, 2025
94ba3aa
Fix image placeholder generation in Ovis2Processor
thisisiron Jul 8, 2025
8392223
Merge branch 'add-ovis2' of https://github.com/thisisiron/transformer…
thisisiron Jul 8, 2025
0f19c79
Merge branch 'main' into add-ovis2
thisisiron Jul 9, 2025
d335aaa
Refactor Ovis2 model to rename visual_table to visual_embeddings_table
thisisiron Jul 9, 2025
91e924c
Enhance Ovis2 model by adding vision_feature_select_strategy parameter
thisisiron Jul 9, 2025
3b02fe1
Refactor Ovis2 model weights conversion and architecture
thisisiron Jul 9, 2025
7376160
Refactor Ovis2 model by removing vision_feature_select_strategy param…
thisisiron Jul 9, 2025
683d3e9
Merge branch 'main' into add-ovis2
thisisiron Jul 9, 2025
a8ffbd4
Update Ovis2 model examples
thisisiron Jul 9, 2025
432a718
Refactor Ovis2 model
thisisiron Jul 12, 2025
1d4a1e9
Update Ovis2 model
thisisiron Jul 12, 2025
933cadd
Update Ovis2 model configuration
thisisiron Jul 12, 2025
9ecdd76
Merge branch 'main' into add-ovis2
thisisiron Jul 12, 2025
c024a10
Refactor Ovis2 model test setup
thisisiron Jul 12, 2025
5fb7870
Merge branch 'main' into add-ovis2
thisisiron Jul 16, 2025
3fcdb3a
Merge branch 'main' into add-ovis2
thisisiron Jul 27, 2025
a48468a
Refactor flash attention support
thisisiron Jul 28, 2025
5b02165
Merge branch 'main' into add-ovis2
thisisiron Jul 28, 2025
b5b2eb6
Refactor
thisisiron Jul 28, 2025
5e9c276
Fix typo
thisisiron Jul 28, 2025
0f3163a
Refactor
thisisiron Jul 28, 2025
0c13cfc
Refactor model classes
thisisiron Jul 29, 2025
8d495ee
Update expected output in Ovis2
thisisiron Jul 29, 2025
9d995c3
Refactor docstrings
thisisiron Jul 29, 2025
ccfdb43
Fix
thisisiron Jul 29, 2025
192cc10
Merge branch 'main' into add-ovis2
thisisiron Jul 29, 2025
cfe3a3b
Fix
thisisiron Jul 29, 2025
530aad0
Fix
thisisiron Jul 29, 2025
5d92825
Update input in tests
thisisiron Jul 29, 2025
7bb0e2b
Merge branch 'main' into add-ovis2
thisisiron Jul 29, 2025
c4a83b6
Fix
thisisiron Jul 29, 2025
ac31c2a
Merge branch 'main' into add-ovis2
thisisiron Jul 31, 2025
7b78029
Fix get_decoder method
thisisiron Jul 31, 2025
c230e72
Refactor
thisisiron Jul 31, 2025
3b0a94a
Refactor Ovis2
thisisiron Aug 8, 2025
7cff46b
Merge branch 'main' into add-ovis2
thisisiron Aug 8, 2025
9afdbad
Fix
thisisiron Aug 8, 2025
bd69fb5
Fix
thisisiron Aug 8, 2025
3ed0cb6
Fix test
thisisiron Aug 8, 2025
2b0621c
Add get_placeholder_mask
thisisiron Aug 8, 2025
11802a4
Merge branch 'main' into add-ovis2
thisisiron Aug 8, 2025
38b6f15
Merge branch 'main' into add-ovis2
thisisiron Aug 8, 2025
0e7d6ed
Refactor Ovis2 model tests
thisisiron Aug 14, 2025
7ce5c4e
Fix
thisisiron Aug 14, 2025
0c6571d
Refactor
thisisiron Aug 14, 2025
13010fa
Merge branch 'main' into add-ovis2
thisisiron Aug 14, 2025
2773182
Fix
thisisiron Aug 14, 2025
8642f7d
Fix
thisisiron Aug 14, 2025
62f2023
Fix Ovis2 test
thisisiron Aug 18, 2025
dd47f25
Merge branch 'main' into add-ovis2
thisisiron Aug 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Add Ovis2 model and processor implementation
  • Loading branch information
thisisiron committed Apr 1, 2025
commit b3bfa35d537e3d16e9a7fc654086162dd7189b61
2 changes: 2 additions & 0 deletions docs/source/en/_toctree.yml
Original file line number Diff line number Diff line change
Expand Up @@ -977,6 +977,8 @@
title: OneFormer
- local: model_doc/owlvit
title: OWL-ViT
- local: model_doc/ovis2
title: Ovis2
- local: model_doc/owlv2
title: OWLv2
- local: model_doc/paligemma
Expand Down
55 changes: 55 additions & 0 deletions docs/source/en/model_doc/ovis2.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Ovis2

## Overview

[Ovis2](https://github.com/AIDC-AI/Ovis) is an updated version of the [Ovis](https://arxiv.org/abs/2405.20797) model, developed by the AIDC-AI team at Alibaba International Digital Commerce Group.

The abstract of this update is as follows:

*It brings major improvements, including better performance for small models, stronger reasoning ability, advanced video and multi-image processing, wider multilingual OCR support, and improved handling of high-resolution images.*


```python

from PIL import Image
import requests
import torch
from torchvision import io
from typing import Dict
from transformers.image_utils import load_images, load_video
from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoProcessor

model = AutoModelForVision2Seq.from_pretrained(
"thisisiron/Ovis2-2B-hf",
torch_dtype=torch.bfloat16,
).eval().to("cuda:0")
processor = AutoProcessor.from_pretrained("thisisiron/Ovis2-2B-hf")

messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "Describe the image."},
],
},
]
url = "http://images.cocodataset.org/val2014/COCO_val2014_000000537955.jpg"
image = Image.open(requests.get(url, stream=True).raw)
messages = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(messages)

inputs = processor(
images=[image],
text=messages,
return_tensors="pt",
)
inputs = inputs.to("cuda:0")
inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)

with torch.inference_mode():
output_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
output_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
print(output_text)
```
21 changes: 21 additions & 0 deletions src/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -673,6 +673,11 @@
"OpenAIGPTTokenizer",
],
"models.opt": ["OPTConfig"],
"models.ovis2": [
"Ovis2Config",
"Ovis2Processor",
"Ovis2VisionConfig",
],
"models.owlv2": [
"Owlv2Config",
"Owlv2Processor",
Expand Down Expand Up @@ -1303,6 +1308,7 @@
_import_structure["models.mobilevit"].extend(["MobileViTFeatureExtractor", "MobileViTImageProcessor"])
_import_structure["models.nougat"].append("NougatImageProcessor")
_import_structure["models.oneformer"].extend(["OneFormerImageProcessor"])
_import_structure["models.ovis2"].extend(["Ovis2ImageProcessor"])
_import_structure["models.owlv2"].append("Owlv2ImageProcessor")
_import_structure["models.owlvit"].extend(["OwlViTFeatureExtractor", "OwlViTImageProcessor"])
_import_structure["models.perceiver"].extend(["PerceiverFeatureExtractor", "PerceiverImageProcessor"])
Expand Down Expand Up @@ -3231,6 +3237,13 @@
"OPTPreTrainedModel",
]
)
_import_structure["models.ovis2"].extend(
[
"Ovis2ForConditionalGeneration",
"Ovis2VisionModel",
"Ovis2PreTrainedModel",
]
)
_import_structure["models.owlv2"].extend(
[
"Owlv2ForObjectDetection",
Expand Down Expand Up @@ -5933,6 +5946,11 @@
OpenAIGPTTokenizer,
)
from .models.opt import OPTConfig
from .models.ovis2 import (
Ovis2Config,
Ovis2Processor,
Ovis2VisionConfig,
)
from .models.owlv2 import (
Owlv2Config,
Owlv2Processor,
Expand Down Expand Up @@ -8144,6 +8162,9 @@
OPTModel,
OPTPreTrainedModel,
)
from .models.ovis2 import (
Ovis2ForConditionalGeneration,
)
from .models.owlv2 import (
Owlv2ForObjectDetection,
Owlv2Model,
Expand Down
1 change: 1 addition & 0 deletions src/transformers/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@
oneformer,
openai,
opt,
ovis2,
owlv2,
owlvit,
paligemma,
Expand Down
2 changes: 2 additions & 0 deletions src/transformers/models/auto/configuration_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@
("open-llama", "OpenLlamaConfig"),
("openai-gpt", "OpenAIGPTConfig"),
("opt", "OPTConfig"),
("ovis2", "Ovis2Config"),
("owlv2", "Owlv2Config"),
("owlvit", "OwlViTConfig"),
("paligemma", "PaliGemmaConfig"),
Expand Down Expand Up @@ -582,6 +583,7 @@
("open-llama", "OpenLlama"),
("openai-gpt", "OpenAI GPT"),
("opt", "OPT"),
("ovis2", "Ovis2"),
("owlv2", "OWLv2"),
("owlvit", "OWL-ViT"),
("paligemma", "PaliGemma"),
Expand Down
1 change: 1 addition & 0 deletions src/transformers/models/auto/image_processing_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@
("nat", ("ViTImageProcessor", "ViTImageProcessorFast")),
("nougat", ("NougatImageProcessor",)),
("oneformer", ("OneFormerImageProcessor",)),
("ovis2", "Ovis2ImageProcessor"),
("owlv2", ("Owlv2ImageProcessor",)),
("owlvit", ("OwlViTImageProcessor",)),
("paligemma", ("SiglipImageProcessor", "SiglipImageProcessorFast")),
Expand Down
2 changes: 2 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -816,6 +816,7 @@
("llava_onevision", "LlavaOnevisionForConditionalGeneration"),
("mistral3", "Mistral3ForConditionalGeneration"),
("mllama", "MllamaForConditionalGeneration"),
("ovis2", "Ovis2ForConditionalGeneration"),
("paligemma", "PaliGemmaForConditionalGeneration"),
("pix2struct", "Pix2StructForConditionalGeneration"),
("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"),
Expand Down Expand Up @@ -854,6 +855,7 @@
("llava_onevision", "LlavaOnevisionForConditionalGeneration"),
("mistral3", "Mistral3ForConditionalGeneration"),
("mllama", "MllamaForConditionalGeneration"),
("ovis2", "Ovis2ForConditionalGeneration"),
("paligemma", "PaliGemmaForConditionalGeneration"),
("pix2struct", "Pix2StructForConditionalGeneration"),
("pixtral", "LlavaForConditionalGeneration"),
Expand Down
1 change: 1 addition & 0 deletions src/transformers/models/auto/processing_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
("mllama", "MllamaProcessor"),
("moonshine", "Wav2Vec2Processor"),
("oneformer", "OneFormerProcessor"),
("ovis2", "Ovis2Processor"),
("owlv2", "Owlv2Processor"),
("owlvit", "OwlViTProcessor"),
("paligemma", "PaliGemmaProcessor"),
Expand Down
1 change: 1 addition & 0 deletions src/transformers/models/auto/tokenization_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,7 @@
"openai-gpt",
("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None),
),
("ovis2", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
("opt", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
("owlv2", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
("owlvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
Expand Down
18 changes: 18 additions & 0 deletions src/transformers/models/ovis2/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@

# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
    # Static type checkers import the real submodule contents directly so that
    # re-exported names (configs, processors, models) resolve to their definitions.
    from .configuration_ovis2 import *
    from .image_processing_ovis2 import *
    from .modeling_ovis2 import *
    from .processing_ovis2 import *
else:
    import sys

    # At runtime, replace this module object with a lazy proxy: the submodules
    # above are only imported when one of their attributes is first accessed.
    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
132 changes: 132 additions & 0 deletions src/transformers/models/ovis2/configuration_ovis2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
from ...configuration_utils import PretrainedConfig
from ..qwen2.configuration_qwen2 import Qwen2Config


class Ovis2VisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of an [`Ovis2VisionModel`]. It is used to instantiate
    the Ovis2 vision encoder according to the specified arguments, defining the model architecture.

    Args:
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2816):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input images.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the RMS normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        projection_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the projection layer.
        qkv_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the query, key and value projections.
        use_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the linear layers.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        vocab_size (`int`, *optional*, defaults to 16384):
            Size of the visual vocabulary (the visual embedding table).
        hidden_stride (`int`, *optional*, defaults to 1):
            Stride used when merging neighboring hidden states spatially.
        vision_feature_select_strategy (`str`, *optional*, defaults to `"full"`):
            Strategy used to select the vision features from the vision backbone.
        num_visual_indicator_tokens (`int`, *optional*, defaults to 5):
            Number of special visual indicator tokens.
        tokenize_function (`str`, *optional*, defaults to `"softmax"`):
            Function used to turn visual features into visual-token probabilities.

    Example:

    ```python
    >>> from transformers import Ovis2VisionConfig, Ovis2VisionModel

    >>> # Initializing an Ovis2VisionConfig with the default style configuration
    >>> configuration = Ovis2VisionConfig()

    >>> # Initializing an Ovis2VisionModel (with random weights) from the default style configuration
    >>> model = Ovis2VisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "ovis2_vision_model"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size: int = 1024,
        intermediate_size: int = 2816,
        num_hidden_layers: int = 24,
        num_attention_heads: int = 8,
        num_channels: int = 3,
        image_size: int = 224,
        patch_size: int = 14,
        rms_norm_eps: float = 1e-5,
        attention_dropout: float = 0.0,
        projection_dropout: float = 0.0,
        qkv_bias: bool = False,
        use_bias: bool = False,
        hidden_act: str = "silu",
        vocab_size: int = 16384,
        hidden_stride: int = 1,
        vision_feature_select_strategy: str = "full",
        num_visual_indicator_tokens: int = 5,
        tokenize_function: str = "softmax",
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size

        self.attention_dropout = attention_dropout
        self.hidden_act = hidden_act
        self.use_bias = use_bias
        self.qkv_bias = qkv_bias
        self.rms_norm_eps = rms_norm_eps
        self.projection_dropout = projection_dropout
        self.vocab_size = vocab_size
        self.hidden_stride = hidden_stride
        self.vision_feature_select_strategy = vision_feature_select_strategy
        self.num_visual_indicator_tokens = num_visual_indicator_tokens
        self.tokenize_function = tokenize_function


class Ovis2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of an [`Ovis2ForConditionalGeneration`]. It is used to
    instantiate an Ovis2 model according to the specified arguments, defining the vision and text sub-configurations.

    Args:
        vision_config (`Ovis2VisionConfig` or `dict`, *optional*):
            The config object or dictionary of the vision backbone. If `None`, a default [`Ovis2VisionConfig`] is
            created whose `num_visual_indicator_tokens` matches `len(visual_indicator_token_ids)`.
        text_config (`Qwen2Config` or `dict`, *optional*):
            The config object or dictionary of the text backbone. If `None`, a default [`Qwen2Config`] is created.
        image_token_id (`int`, *optional*, defaults to 151665):
            The token id used as the image placeholder in the prompt.
        visual_indicator_token_ids (`list[int]`, *optional*, defaults to `[151666, 151667, 151668, 151669, 151670]`):
            Token ids of the special visual indicator tokens.
        vocab_size (`int`, *optional*, defaults to 151643):
            Vocabulary size of the model.
        sliding_window (`int`, *optional*, defaults to 32768):
            Size of the sliding window for attention, if used by the text backbone.
        hidden_size (`int`, *optional*, defaults to 1536):
            Dimensionality of the decoder layers.
    """

    model_type = "ovis2"
    sub_configs = {"text_config": Qwen2Config, "vision_config": Ovis2VisionConfig}

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        image_token_id=151665,
        visual_indicator_token_ids=None,
        vocab_size=151643,
        sliding_window=32768,
        hidden_size=1536,
        **kwargs,
    ):
        # Avoid a shared mutable default argument: materialize the default list per instance.
        if visual_indicator_token_ids is None:
            visual_indicator_token_ids = [151666, 151667, 151668, 151669, 151670]

        if isinstance(vision_config, dict):
            self.vision_config = Ovis2VisionConfig(**vision_config)
        elif isinstance(vision_config, Ovis2VisionConfig):
            self.vision_config = vision_config
        elif vision_config is None:
            # Keep the vision tower's indicator-token count in sync with the ids configured here.
            self.vision_config = Ovis2VisionConfig(num_visual_indicator_tokens=len(visual_indicator_token_ids))

        if isinstance(text_config, dict):
            self.text_config = Qwen2Config(**text_config)
        elif isinstance(text_config, Qwen2Config):
            self.text_config = text_config
        elif text_config is None:
            self.text_config = Qwen2Config()

        self.vocab_size = vocab_size
        self.sliding_window = sliding_window
        self.hidden_size = hidden_size

        self.image_token_id = image_token_id
        self.visual_indicator_token_ids = visual_indicator_token_ids
        super().__init__(**kwargs)


__all__ = ["Ovis2VisionConfig", "Ovis2Config"]
Loading