@@ -409,7 +409,7 @@ def convert(
         config=None,
         hf_quantizer=None,
         missing_keys: Optional[MutableSet[str]] = None,
-        misc: Optional[MutableMapping[str, str]] = None,
+        conversion_errors: Optional[MutableMapping[str, str]] = None,
     ):
         # Collect the tensors here - we use a new dictionary to avoid keeping them in memory in the internal
         # attribute during the whole process
@@ -421,7 +421,9 @@ def convert(
         collected_tensors = {target_key: collected_tensors[self.source_patterns[0]]}

         if hf_quantizer is not None and self.quantization_operation is not None:
-            with log_to_misc(layer_name, misc, (len(collected_tensors), layer_name), self.quantization_operation):
+            with log_conversion_errors(
+                layer_name, conversion_errors, (len(collected_tensors), layer_name), self.quantization_operation
+            ):
                 collected_tensors = self.quantization_operation.convert(
                     collected_tensors,
                     source_patterns=self.source_patterns,
@@ -432,7 +434,7 @@ def convert(
                     missing_keys=missing_keys,
                 )

-        return collected_tensors, misc
+        return collected_tensors, conversion_errors


 @dataclass(slots=True)
@@ -455,14 +457,14 @@ def convert(
         config=None,
         hf_quantizer=None,
         missing_keys: Optional[MutableSet[str]] = None,
-        misc: Optional[MutableMapping[str, str]] = None,
+        conversion_errors: Optional[MutableMapping[str, str]] = None,
     ):
         # Collect the tensors here - we use a new dictionary to avoid keeping them in memory in the internal
         # attribute during the whole process
         collected_tensors = self.materialize_tensors()

         for op in self.operations:
-            with log_to_misc(layer_name, misc, (len(collected_tensors), layer_name), op):
+            with log_conversion_errors(layer_name, conversion_errors, (len(collected_tensors), layer_name), op):
                 collected_tensors = op.convert(
                     collected_tensors,
                     source_patterns=self.source_patterns,
@@ -489,7 +491,9 @@ def convert(
                 pass

         if hf_quantizer is not None and self.quantization_operation is not None:
-            with log_to_misc(layer_name, misc, (len(collected_tensors), layer_name), self.quantization_operation):
+            with log_conversion_errors(
+                layer_name, conversion_errors, (len(collected_tensors), layer_name), self.quantization_operation
+            ):
                 collected_tensors = self.quantization_operation.convert(
                     collected_tensors,
                     source_patterns=self.source_patterns,
@@ -499,7 +503,7 @@ def convert(
                     model=model,
                     missing_keys=missing_keys,
                 )
-        return collected_tensors, misc
+        return collected_tensors, conversion_errors


 # For I/O bound operations (i.e. here reading files), it is better to have fewer threads, e.g. 4 is a good default.
@@ -560,13 +564,14 @@ def dot_natural_key(s: str):


 @contextmanager
-def log_to_misc(
+def log_conversion_errors(
     first_target_key: str,
-    misc: MutableMapping[str, str],
+    conversion_errors: MutableMapping[str, str],
     extras: Any = None,
     op: Union[list[ConversionOps], ConversionOps, None] = None,
 ):
-    # A simple helper to handle errors with contextual messages.
+    """Catch all exceptions during `convert` calls, and log the errors for later. Re-raise a `SkipParameters` exception
+    that will be caught later to skip the parameters that raised the original Exception."""
     try:
         yield
     except Exception as e:
@@ -585,17 +590,19 @@ def _format_op_name(curr_op: Union[list[ConversionOps], ConversionOps, None]) ->
         if isinstance(extras, tuple) and len(extras) == 2:
             length, target_keys = extras
             descriptor = f"{op_name} " if op_name else ""
-            misc[first_target_key] = (
+            conversion_errors[first_target_key] = (
                 f"{e}\nError: {descriptor}on tensors destined for {target_keys}. Ckpt contains: {length}"
             )
         elif isinstance(extras, str):
             suffix = f" via {op_name}" if op_name else ""
-            misc[first_target_key] = f"{e}\nError{suffix} when processing parameter {extras}"
+            conversion_errors[first_target_key] = f"{e}\nError{suffix} when processing parameter {extras}"
         elif extras is None and op_name:
-            misc[first_target_key] = f"{op_name}: {e}"
+            conversion_errors[first_target_key] = f"{op_name}: {e}"
         else:
-            misc[first_target_key] = f"{extras}|Error: {e}"
-        raise SkipLayer()
+            conversion_errors[first_target_key] = f"{extras}|Error: {e}"
+
+        # Raise a specific Exception that we can catch easily
+        raise SkipParameters()


 def set_param_for_module(
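For context, a minimal, self-contained sketch of the pattern this helper implements (hypothetical simplified names, not the code above): record the real exception in a shared mapping keyed by the target parameter, then raise a lightweight sentinel so the loading loop can skip only the affected parameters and keep going.

```python
from contextlib import contextmanager


class SkipParameters(Exception):
    """Sentinel: abort only the parameters currently being converted."""


@contextmanager
def log_conversion_errors(first_target_key, conversion_errors):
    try:
        yield
    except Exception as e:
        # Keep the original error text for reporting once loading has finished
        conversion_errors[first_target_key] = str(e)
        raise SkipParameters()


conversion_errors = {}
for name, tensor in [("good.weight", 2.0), ("bad.weight", None)]:
    try:
        with log_conversion_errors(name, conversion_errors):
            doubled = tensor * 2  # raises TypeError for None
        print(name, doubled)
    except SkipParameters:
        continue  # skip only this parameter, keep converting the rest
print(conversion_errors)  # {'bad.weight': "unsupported operand type(s) ..."}
```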
@@ -604,44 +611,42 @@ def set_param_for_module(
     param_value: torch.Tensor,
     mismatch_keys: MutableSet[tuple[str, torch.Size, torch.Size]],
     missing_keys: MutableSet[str],
-    misc: MutableMapping[str, Any],
     unexpected_keys: MutableSet[str],
     distributed_operation: Optional[TensorParallelLayer],
     hf_quantizer: HfQuantizer,
 ):
-    with log_to_misc(target_name, misc, target_name):
-        module_path, _, param_name = target_name.rpartition(".")
-        module_obj = model.get_submodule(module_path) if module_path else model
+    module_path, _, param_name = target_name.rpartition(".")
+    module_obj = model.get_submodule(module_path) if module_path else model

-        ref = getattr(module_obj, param_name)
-        if ref is None:
-            unexpected_keys.add(target_name)
+    ref = getattr(module_obj, param_name)
+    if ref is None:
+        unexpected_keys.add(target_name)
+    else:
+        use_dtensor = hasattr(distributed_operation, "use_dtensor") and distributed_operation.use_dtensor
+        if not isinstance(param_value, torch.nn.Parameter):
+            if distributed_operation is not None:
+                param_value = DTensor.from_local(
+                    param_value,
+                    distributed_operation.device_mesh,
+                    getattr(distributed_operation, "shard", Replicate()),
+                    run_check=False,
+                    shape=ref.size(),
+                    stride=ref.stride(),
+                )
+                if not use_dtensor:
+                    # we convert to local
+                    param_value = param_value.to_local()
+            if param_name not in module_obj._buffers:
+                param_value = torch.nn.Parameter(param_value, requires_grad=param_value.is_floating_point())
+
+        # Remove from missing keys (it's either mismatched, or all good)
+        missing_keys.discard(target_name)
+        if ref is not None and ref.shape != param_value.shape and hf_quantizer is None:
+            mismatch_keys.add((target_name, param_value.shape, ref.shape))
         else:
-            use_dtensor = hasattr(distributed_operation, "use_dtensor") and distributed_operation.use_dtensor
-            if not isinstance(param_value, torch.nn.Parameter):
-                if distributed_operation is not None:
-                    param_value = DTensor.from_local(
-                        param_value,
-                        distributed_operation.device_mesh,
-                        getattr(distributed_operation, "shard", Replicate()),
-                        run_check=False,
-                        shape=ref.size(),
-                        stride=ref.stride(),
-                    )
-                    if not use_dtensor:
-                        # we convert to local
-                        param_value = param_value.to_local()
-                if param_name not in module_obj._buffers:
-                    param_value = torch.nn.Parameter(param_value, requires_grad=param_value.is_floating_point())
-
-            # Remove from missing keys (it's either mismatched, or all good)
-            missing_keys.discard(target_name)
-            if ref is not None and ref.shape != param_value.shape and hf_quantizer is None:
-                mismatch_keys.add((target_name, param_value.shape, ref.shape))
-            else:
-                # super important otherwise _init_weight will re-init the param
-                param_value._is_hf_initialized = True
-                setattr(module_obj, param_name, param_value)
+            # super important otherwise _init_weight will re-init the param
+            param_value._is_hf_initialized = True
+            setattr(module_obj, param_name, param_value)


 def offload_and_maybe_resave_param(
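A minimal sketch of what the non-distributed path above does (hypothetical helper name; DTensor and quantizer handling omitted): resolve the owning submodule from the dotted name, wrap plain tensors in `nn.Parameter` unless the target is a buffer, keep the missing/mismatched bookkeeping, and mark the value as already initialized so a later init pass will not overwrite it.

```python
import torch
from torch import nn


def set_param_sketch(model, target_name, value, missing_keys, mismatch_keys):
    # Hypothetical, simplified version of set_param_for_module: no DTensor, no quantizer.
    module_path, _, param_name = target_name.rpartition(".")
    module = model.get_submodule(module_path) if module_path else model
    ref = getattr(module, param_name)

    if param_name not in module._buffers:
        value = nn.Parameter(value, requires_grad=value.is_floating_point())

    missing_keys.discard(target_name)
    if ref is not None and ref.shape != value.shape:
        mismatch_keys.add((target_name, value.shape, ref.shape))
    else:
        value._is_hf_initialized = True  # keeps a later weight-init pass from re-initializing it
        setattr(module, param_name, value)


model = nn.Sequential(nn.Linear(2, 2))
missing_keys, mismatch_keys = {"0.weight", "0.bias"}, set()
set_param_sketch(model, "0.weight", torch.ones(2, 2), missing_keys, mismatch_keys)
print(missing_keys, mismatch_keys)  # {'0.bias'} set()
```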
@@ -663,8 +668,9 @@ def offload_and_maybe_resave_param(
     return disk_offload_index


-class SkipLayer(Exception):
-    """Control-flow sentinel: abort processing of the current layer only."""
+class SkipParameters(Exception):
+    """Control-flow sentinel: abort processing of the current parameters only (those that were supposed to be
+    created by a WeightConverter)."""

     pass

@@ -818,7 +824,7 @@ def convert_and_load_state_dict_in_model(
     meta_model_state_dict = model.state_dict()
     missing_keys = set(meta_model_state_dict.keys())

-    misc = {}
+    conversion_errors = {}
     mismatch_keys = set()
     unexpected_keys = set()

@@ -925,13 +931,13 @@ def convert_and_load_state_dict_in_model(
             pbar.set_postfix({"Materializing param": first_param_name})
             pbar.refresh()
             try:
-                realized_value, misc = mapping.convert(
+                realized_value, conversion_errors = mapping.convert(
                     first_param_name,
                     model=model,
                     config=model.config,
                     hf_quantizer=hf_quantizer,
                     missing_keys=missing_keys,
-                    misc=misc,
+                    conversion_errors=conversion_errors,
                 )
                 for target_name, param in realized_value.items():
                     param = param[0] if isinstance(param, list) else param
@@ -949,7 +955,6 @@ def convert_and_load_state_dict_in_model(
                         param,
                         mismatch_keys,
                         missing_keys,
-                        misc,
                         unexpected_keys,
                         mapping.distributed_operation,
                         hf_quantizer,
@@ -958,7 +963,7 @@ def convert_and_load_state_dict_in_model(
                 # Cleanup all the tensors that were gathered before next iteration
                 del realized_value

-            except SkipLayer:
+            except SkipParameters:
                 continue

     # Close the pool, independently of whether the code was interrupted or finished successfully
@@ -969,7 +974,7 @@ def convert_and_load_state_dict_in_model(

     # Keep the current weight conversion mapping for later saving (in case it was coming directly from the user)
     model._weight_conversions = weight_mapping
-    return missing_keys, unexpected_keys, mismatch_keys, disk_offload_index, misc
+    return missing_keys, unexpected_keys, mismatch_keys, disk_offload_index, conversion_errors


 def revert_weight_conversion(model: PreTrainedModel, state_dict: dict[str, torch.Tensor]):
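The loader now returns `conversion_errors` as its fifth value instead of `misc`. A hypothetical caller-side sketch of how those collected messages could be surfaced after loading (the dict contents shown are made up; the real entries are written by `log_conversion_errors` above):

```python
import warnings

# Hypothetical example of reporting collected conversion errors after loading;
# keys are target parameter names, values are the formatted messages built above.
conversion_errors = {
    "model.layers.0.mlp.gate_proj.weight": "RuntimeError: shape mismatch\nError: ... Ckpt contains: 3",
}
if conversion_errors:
    summary = "\n".join(f"- {key}: {msg.splitlines()[0]}" for key, msg in conversion_errors.items())
    warnings.warn(f"{len(conversion_errors)} parameter(s) failed conversion:\n{summary}")
```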
@@ -1016,7 +1021,7 @@ def revert_weight_conversion(model: PreTrainedModel, state_dict: dict[str, torch
     new_state_dict = {}
     for first_param_name, reversed_converter in conversion_mapping.items():
         # Apply the reverse converter
-        realized_value, misc = reversed_converter.convert(first_param_name, model=model, config=model.config)
+        realized_value, _ = reversed_converter.convert(first_param_name, model=model, config=model.config)
         for target_name, param in realized_value.items():
             param = param[0] if isinstance(param, list) else param
             new_state_dict[target_name] = param