Skip to content

Commit 355a91c

Browse files
committed
Update Ovis2 configuration and processing classes for improved documentation
1 parent 7305a22 commit 355a91c

File tree

3 files changed

+27
-22
lines changed

3 files changed

+27
-22
lines changed

src/transformers/models/ovis2/configuration_ovis2.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33

44

55
class Ovis2VisionConfig(PretrainedConfig):
6-
r"""
6+
r"""This is the configuration class to store the configuration of a [`Ovis2VisionModel`]. It is used to instantiate a
7+
Ovis2VisionModel model according to the specified arguments, defining the model architecture. Instantiating a configuration
8+
with the defaults will yield a similar configuration to that of Ovis2.
79
810
Args:
911
hidden_size (`int`, *optional*, defaults to 1024):
@@ -102,7 +104,7 @@ class Ovis2Config(PretrainedConfig):
102104
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
103105
documentation from [`PretrainedConfig`] for more information.
104106
105-
[thisisiron/Ovis2-1B-hf](https://huggingface.co/thisisiron/Ovis2-1B-hf)
107+
e.g. [thisisiron/Ovis2-1B-hf](https://huggingface.co/thisisiron/Ovis2-1B-hf)
106108
107109
Args:
108110
vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Ovis2VisionConfig`):
@@ -124,7 +126,7 @@ class Ovis2Config(PretrainedConfig):
124126
>>> # Initializing a Ovis2 style configuration
125127
>>> configuration = Ovis2Config()
126128
127-
>>> # Initializing a model from the Qwen2-VL-7B style configuration
129+
>>> # Initializing a model from the Ovis2-2B style configuration
128130
>>> model = Ovis2ForConditionalGeneration(configuration)
129131
130132
>>> # Accessing the model configuration

src/transformers/models/ovis2/convert_ovis2_weights_to_hf.py

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from transformers import (
1010
AutoModelForCausalLM,
1111
AutoModelForVision2Seq,
12+
AutoProcessor,
1213
AutoTokenizer,
1314
)
1415
from transformers.models.ovis2.configuration_ovis2 import Ovis2Config, Ovis2VisionConfig
@@ -96,17 +97,15 @@ def create_tokenizer(model_name_or_path, save_dir):
9697
Returns:
9798
The configured tokenizer
9899
"""
99-
if model_name_or_path:
100-
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, return_token_type_ids=False)
101-
tokenizer.model_max_length = CONTEXT_LENGTH
102-
tokenizer.add_special_tokens(
103-
{"additional_special_tokens": SPECIAL_TOKENS},
104-
replace_additional_special_tokens=False,
105-
)
106-
else:
107-
tokenizer = AutoTokenizer.from_pretrained("./ovisv2_hf/tokenizer_ovisv2", return_token_type_ids=False)
100+
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, return_token_type_ids=False)
101+
tokenizer.model_max_length = CONTEXT_LENGTH
102+
tokenizer.add_special_tokens(
103+
{"additional_special_tokens": SPECIAL_TOKENS},
104+
replace_additional_special_tokens=False,
105+
)
108106
tokenizer.chat_template = CHAT_TEMPLATE
109-
tokenizer.save_pretrained(save_dir)
107+
setattr(tokenizer, "image_token", "<IMG_ATOM>")
108+
setattr(tokenizer, "image_token_id", tokenizer.convert_tokens_to_ids(tokenizer.image_token))
110109
return tokenizer
111110

112111

@@ -124,9 +123,6 @@ def create_image_processor(save_dir):
124123
crop_to_patches=True,
125124
size={"height": 448, "width": 448},
126125
)
127-
128-
image_processor.save_pretrained(save_dir)
129-
print(f"Image processor saved to {save_dir}")
130126
return image_processor
131127

132128

@@ -315,14 +311,15 @@ def main():
315311
save_dir=args.save_dir,
316312
)
317313

314+
os.makedirs(args.save_dir, exist_ok=True)
315+
318316
# Convert and save the model
319317
model = convert_model(model_name_or_path=args.model_name_or_path)
318+
model.save_pretrained(args.save_dir)
320319

321-
# Save the model and processor
322-
os.makedirs(args.save_dir, exist_ok=True)
320+
# Save the processor
323321
processor = Ovis2Processor(tokenizer=tokenizer, image_processor=image_processor, chat_template=CHAT_TEMPLATE)
324322
processor.save_pretrained(args.save_dir)
325-
model.save_pretrained(args.save_dir)
326323

327324
# Push to hub if requested
328325
if args.push_to_hub:
@@ -338,6 +335,8 @@ def main():
338335
.to("cuda:0")
339336
)
340337

338+
processor = AutoProcessor.from_pretrained(args.save_dir)
339+
341340
messages = [
342341
{
343342
"role": "user",

src/transformers/models/ovis2/processing_ovis2.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,7 @@ class Ovis2Processor(ProcessorMixin):
3636
image_token (`str`, *optional*, defaults to `"<image>"`):
3737
Special token used to denote image location.
3838
image_seq_length (`int`, *optional*, defaults to 256):
39-
The maximum sequence length for image tokens.
40-
Shoudl be same as in model's config
39+
The number of image tokens to be used for each image in the input.
4140
"""
4241

4342
attributes = ["image_processor", "tokenizer"]
@@ -60,7 +59,12 @@ def __init__(
6059
):
6160
self.image_seq_length = image_seq_length
6261
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
63-
super().__init__(image_processor, tokenizer, chat_template=chat_template)
62+
self.image_token_id = (
63+
tokenizer.image_token_id
64+
if getattr(tokenizer, "image_token_id", None)
65+
else tokenizer.convert_tokens_to_ids(self.image_token)
66+
)
67+
super().__init__(image_processor, tokenizer, chat_template=chat_template, **kwargs)
6468

6569
def __call__(
6670
self,

0 commit comments

Comments (0)