|
18 | 18 | if is_torch_available(): |
19 | 19 | import torch |
20 | 20 | from torch import nn |
| 21 | +from typing import Optional |
| 22 | + |
| 23 | +from ..core_model_loading import ConversionOps |
| 24 | + |
21 | 25 |
|
22 | 26 | if is_accelerate_available(): |
23 | 27 | from accelerate import init_empty_weights |
24 | 28 |
|
25 | 29 | import re |
26 | 30 | from contextlib import contextmanager |
27 | 31 |
|
| 32 | +from ..quantizers.quantizers_utils import get_module_from_name |
| 33 | + |
28 | 34 |
|
29 | 35 | logger = logging.get_logger(__name__) |
30 | 36 |
|
@@ -70,6 +76,126 @@ def on_device(dev): |
70 | 76 | yield |
71 | 77 |
|
72 | 78 |
|
| 79 | +class Mxfp4Quantize(ConversionOps): |
| 80 | + def __init__(self, hf_quantizer): |
| 81 | + self.hf_quantizer = hf_quantizer |
| 82 | + |
| 83 | + def convert( |
| 84 | + self, |
| 85 | + input_dict: dict[str, torch.Tensor], |
| 86 | + model: Optional[torch.nn.Module] = None, |
| 87 | + missing_keys: Optional[set[str]] = None, |
| 88 | + full_layer_name: Optional[str] = None, |
| 89 | + **kwargs, |
| 90 | + ) -> dict[str, torch.Tensor]: |
| 91 | + _, value = tuple(input_dict.items())[0] |
| 92 | + value = value[0] if isinstance(value, list) else value |
| 93 | + |
| 94 | + module, _ = get_module_from_name(model, full_layer_name) |
| 95 | + |
| 96 | + with torch.device(value.device): |
| 97 | + if isinstance(module, Mxfp4GptOssExperts): |
| 98 | + triton_weight_tensor, weight_scale = quantize_to_mxfp4(value.transpose(-1, -2), triton_kernels_hub) |
| 99 | + PrecisionConfig, FlexCtx, InFlexData = ( |
| 100 | + triton_kernels_hub.matmul_ogs.PrecisionConfig, |
| 101 | + triton_kernels_hub.matmul_ogs.FlexCtx, |
| 102 | + triton_kernels_hub.matmul_ogs.InFlexData, |
| 103 | + ) |
| 104 | + triton_weight_tensor, weight_scale = swizzle_mxfp4( |
| 105 | + triton_weight_tensor, weight_scale, triton_kernels_hub |
| 106 | + ) |
| 107 | + |
| 108 | + proj = "gate_up_proj" if "gate_up_proj" in full_layer_name else "down_proj" |
| 109 | + |
| 110 | + if proj in module._parameters: |
| 111 | + # Remove the nn.Parameter registration so we can attach the Triton tensor |
| 112 | + del module._parameters[proj] |
| 113 | + |
| 114 | + setattr(module, proj, triton_weight_tensor) |
| 115 | + setattr( |
| 116 | + module, |
| 117 | + f"{proj}_precision_config", |
| 118 | + PrecisionConfig(weight_scale=weight_scale, flex_ctx=FlexCtx(rhs_data=InFlexData())), |
| 119 | + ) |
| 120 | + |
| 121 | + missing_keys.discard(full_layer_name) |
| 122 | + module._is_hf_initialized = True |
| 123 | + |
| 124 | + return {} |
| 125 | + |
| 126 | + |
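A rough usage sketch for the quantize op (the checkpoint key, `quantizer`, `model`, `missing_keys` and `bf16_weight` names below are illustrative assumptions, not the actual loader wiring): `convert` receives a one-entry mapping from checkpoint key to tensor plus the runtime parameter name, quantizes and swizzles the weight onto the module in place, and returns an empty dict so the loader has nothing left to assign for that key.

```python
op = Mxfp4Quantize(hf_quantizer=quantizer)  # `quantizer` assumed to be the active MXFP4 HF quantizer
out = op.convert(
    {"model.layers.0.mlp.experts.gate_up_proj": [bf16_weight]},  # hypothetical key -> bf16 tensor
    model=model,
    missing_keys=missing_keys,  # set of parameter names not yet materialized
    full_layer_name="model.layers.0.mlp.experts.gate_up_proj",
)
assert out == {}  # the Triton tensor and its precision config now live on the experts module
```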
| 127 | +class Mxfp4Dequantize(ConversionOps): |
| 128 | + def __init__(self, hf_quantizer): |
| 129 | + self.hf_quantizer = hf_quantizer |
| 130 | + |
| 131 | + def convert( |
| 132 | + self, |
| 133 | + input_dict: dict[str, torch.Tensor], |
| 134 | + model: Optional[torch.nn.Module] = None, |
| 135 | + full_layer_name: Optional[str] = None, |
| 136 | + missing_keys: Optional[set[str]] = None, |
| 137 | + **kwargs, |
| 138 | + ) -> dict[str, torch.Tensor]: |
| 139 | + param_data = {} |
| 140 | + if "_blocks" in input_dict.keys(): |
| 141 | + if isinstance(input_dict["_blocks"], list): |
| 142 | + param_data["_blocks"] = input_dict["_blocks"][0] |
| 143 | + else: |
| 144 | + param_data["_blocks"] = input_dict["_blocks"] |
| 145 | + if "_scales" in input_dict.keys(): |
| 146 | + if isinstance(input_dict["_scales"], list): |
| 147 | + param_data["_scales"] = input_dict["_scales"][0] |
| 148 | + else: |
| 149 | + param_data["_scales"] = input_dict["_scales"] |
| 150 | + |
| 151 | + # Dequantize the packed MXFP4 blocks back into a dense weight using the per-block scales |
| 152 | + dequantized = dequantize_convertops(param_data["_blocks"], param_data["_scales"], param_data["_blocks"].device) |
| 153 | + return {full_layer_name: dequantized} |
| 154 | + |
| 155 | + |
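Unlike the quantize path, the dequantize op hands a dense tensor back to the loader under the runtime parameter name. A minimal sketch, assuming packed `blocks_u8`/`scales_u8` tensors and an illustrative target name:

```python
op = Mxfp4Dequantize(hf_quantizer=quantizer)
out = op.convert(
    {"_blocks": blocks_u8, "_scales": scales_u8},  # packed uint8 payload and per-block scales
    model=model,
    full_layer_name="model.layers.0.mlp.experts.gate_up_proj",  # hypothetical target name
)
dense = out["model.layers.0.mlp.experts.gate_up_proj"]  # a plain nn.Parameter the loader assigns normally
```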
| 156 | +class Mxfp4Deserialize(ConversionOps): |
| 157 | + def __init__(self, hf_quantizer): |
| 158 | + self.hf_quantizer = hf_quantizer |
| 159 | + |
| 160 | + def convert( |
| 161 | + self, |
| 162 | + input_dict: dict[str, torch.Tensor], |
| 163 | + model: Optional[torch.nn.Module] = None, |
| 164 | + full_layer_name: Optional[str] = None, |
| 165 | + missing_keys: Optional[set[str]] = None, |
| 166 | + **kwargs, |
| 167 | + ) -> dict[str, torch.Tensor]: |
| 168 | + param_data = {} |
| 169 | + if "_blocks" in input_dict.keys(): |
| 170 | + if isinstance(input_dict["_blocks"], list): |
| 171 | + param_data["_blocks"] = input_dict["_blocks"][0] |
| 172 | + else: |
| 173 | + param_data["_blocks"] = input_dict["_blocks"] |
| 174 | + if "_scales" in input_dict.keys(): |
| 175 | + if isinstance(input_dict["_scales"], list): |
| 176 | + param_data["_scales"] = input_dict["_scales"][0] |
| 177 | + else: |
| 178 | + param_data["_scales"] = input_dict["_scales"] |
| 179 | + |
| 180 | + # Eagerly attach the swizzled tensors to the module instead of returning them to the loader |
| 181 | + module, _ = get_module_from_name(model, full_layer_name) |
| 182 | + proj = "gate_up_proj" if "gate_up_proj" in full_layer_name else "down_proj" |
| 183 | + swizzle_mxfp4_convertops( |
| 184 | + param_data["_blocks"], |
| 185 | + param_data["_scales"], |
| 186 | + module, |
| 187 | + proj, |
| 188 | + param_data["_blocks"].device, |
| 189 | + triton_kernels_hub, |
| 190 | + ) |
| 191 | + missing_keys.discard(full_layer_name) |
| 192 | + module._is_hf_initialized = True |
| 193 | + # We return an empty mapping since the module was updated in-place. This prevents |
| 194 | + # the loader from trying to materialize the original meta-parameter names again. |
| 195 | + # We don't use set_param_for_module here since it expects a torch.nn.Parameter or a safetensors pointer, not a Triton tensor |
| 196 | + return {} |
| 197 | + |
| 198 | + |
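Taken together, the three ops cover the three loading paths: `Mxfp4Quantize` quantizes a bf16 checkpoint on the fly, `Mxfp4Dequantize` expands a pre-quantized checkpoint back to dense weights, and `Mxfp4Deserialize` keeps the packed MXFP4 payload and only swizzles it for the Triton kernels. A hypothetical dispatch helper, purely for illustration (the selection flags are assumptions, not the actual quantizer API):

```python
def pick_conversion_op(hf_quantizer, checkpoint_is_quantized: bool, dequantize: bool):
    # Illustrative only: the real selection happens inside the quantizer/loader.
    if not checkpoint_is_quantized:
        return Mxfp4Quantize(hf_quantizer)    # bf16 checkpoint -> quantize while loading
    if dequantize:
        return Mxfp4Dequantize(hf_quantizer)  # packed checkpoint -> dense weights
    return Mxfp4Deserialize(hf_quantizer)     # packed checkpoint -> swizzled Triton tensors
```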
73 | 199 | # Copied from GPT_OSS repo and vllm |
74 | 200 | def quantize_to_mxfp4(w, triton_kernels_hub): |
75 | 201 | downcast_to_mxfp_torch = triton_kernels_hub.numerics_details.mxfp.downcast_to_mxfp_torch |
@@ -110,6 +236,7 @@ def convert_moe_packed_tensors( |
110 | 236 | """ |
111 | 237 | import math |
112 | 238 |
|
| 239 | + blocks = blocks.to(torch.uint8) |
113 | 240 | # Check if blocks and scales are on CPU, and move to GPU if so |
114 | 241 | if not blocks.is_cuda and torch.cuda.is_available(): |
115 | 242 | blocks = blocks.cuda() |
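The unpacking performed by `convert_moe_packed_tensors` follows the MXFP4 microscaling layout: each uint8 in `blocks` carries two 4-bit E2M1 values, and each uint8 in `scales` is a biased power-of-two (E8M0) exponent shared by a block of 32 values. A self-contained sketch of that arithmetic (the lookup table and bias come from the OCP MX spec; the nibble ordering is illustrative, not lifted from this file):

```python
import torch

# All 16 E2M1 code points: 0, 0.5, 1, 1.5, 2, 3, 4, 6 and their negatives.
FP4_LUT = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,
                        -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0])

def unpack_byte(byte: int, scale_u8: int) -> tuple[float, float]:
    lo, hi = byte & 0x0F, byte >> 4   # two FP4 codes packed into one byte
    scale = 2.0 ** (scale_u8 - 127)   # E8M0 scale: a pure power of two
    return FP4_LUT[lo].item() * scale, FP4_LUT[hi].item() * scale

# 0x42 -> low nibble 0x2 (1.0), high nibble 0x4 (2.0); scale byte 126 -> 2**-1
print(unpack_byte(0x42, 126))  # (0.5, 1.0)
```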
@@ -162,26 +289,20 @@ def __init__(self, config): |
162 | 289 | self.intermediate_size = config.intermediate_size |
163 | 290 | self.hidden_size = config.hidden_size |
164 | 291 |
|
165 | | - self.gate_up_proj_blocks = nn.Parameter( |
| 292 | + self.gate_up_proj = nn.Parameter( |
166 | 293 | torch.zeros(self.num_experts, 2 * self.intermediate_size, self.hidden_size // 32, 16, dtype=torch.uint8), |
167 | 294 | requires_grad=False, |
168 | 295 | ) |
169 | | - self.gate_up_proj_scales = nn.Parameter( |
170 | | - torch.zeros(self.num_experts, 2 * self.intermediate_size, self.hidden_size // 32, dtype=torch.uint8), |
171 | | - requires_grad=False, |
172 | | - ) |
| 296 | + |
173 | 297 | self.gate_up_proj_bias = nn.Parameter( |
174 | 298 | torch.zeros(self.num_experts, 2 * self.intermediate_size, dtype=torch.float32), requires_grad=False |
175 | 299 | ) |
176 | 300 |
|
177 | | - self.down_proj_blocks = nn.Parameter( |
| 301 | + self.down_proj = nn.Parameter( |
178 | 302 | torch.zeros((self.num_experts, self.hidden_size, self.intermediate_size // 32, 16), dtype=torch.uint8), |
179 | 303 | requires_grad=False, |
180 | 304 | ) |
181 | | - self.down_proj_scales = nn.Parameter( |
182 | | - torch.zeros(self.num_experts, self.hidden_size, self.intermediate_size // 32, dtype=torch.uint8), |
183 | | - requires_grad=False, |
184 | | - ) |
| 305 | + |
185 | 306 | self.down_proj_bias = nn.Parameter( |
186 | 307 | torch.zeros(self.num_experts, self.hidden_size, dtype=torch.float32), requires_grad=False |
187 | 308 | ) |
@@ -361,6 +482,14 @@ def dequantize(module, param_name, param_value, target_device, dq_param_name, ** |
361 | 482 | delattr(module, scales_attr) |
362 | 483 |
|
363 | 484 |
|
| 485 | +def dequantize_convertops(blocks, scales, target_device): |
| 486 | + dequantized = convert_moe_packed_tensors(blocks, scales) |
| 487 | + if target_device == "cpu" and torch.cuda.is_available(): |
| 488 | + torch.cuda.empty_cache() |
| 489 | + dequantized = torch.nn.Parameter(dequantized.to(target_device)) |
| 490 | + return dequantized |
| 491 | + |
| 492 | + |
364 | 493 | def load_and_swizzle_mxfp4(module, param_name, param_value, target_device, triton_kernels_hub, **kwargs): |
365 | 494 | """ |
366 | 495 | This transforms the weights obtained using `convert_gpt_oss.py` to load them into `Mxfp4GptOssExperts`. |
@@ -428,6 +557,53 @@ def load_and_swizzle_mxfp4(module, param_name, param_value, target_device, trito |
428 | 557 | del blocks |
429 | 558 |
|
430 | 559 |
|
| 560 | +def swizzle_mxfp4_convertops(blocks, scales, module, proj, target_device, triton_kernels_hub): |
| 561 | + """ |
| 562 | + This transforms the weights obtained using `convert_gpt_oss.py` to load them into `Mxfp4GptOssExperts`. |
| 563 | + """ |
| 564 | + PrecisionConfig, FlexCtx, InFlexData = ( |
| 565 | + triton_kernels_hub.matmul_ogs.PrecisionConfig, |
| 566 | + triton_kernels_hub.matmul_ogs.FlexCtx, |
| 567 | + triton_kernels_hub.matmul_ogs.InFlexData, |
| 568 | + ) |
| 569 | + |
| 570 | + local_experts = blocks.size(0) |
| 571 | + if getattr(target_device, "type", target_device) == "cpu": |
| 572 | + target_device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" |
| 573 | + |
| 574 | + blocks = blocks.to(target_device).contiguous() |
| 575 | + scales = scales.to(target_device).contiguous() |
| 576 | + |
| 577 | + if proj == "gate_up_proj": |
| 578 | + blocks = blocks.reshape(local_experts, module.intermediate_size * 2, -1) |
| 579 | + else: |
| 580 | + blocks = blocks.reshape(local_experts, -1, module.intermediate_size // 2) |
| 583 | + |
| 584 | + with on_device(target_device): |
| 585 | + triton_weight_tensor, weight_scale = swizzle_mxfp4( |
| 586 | + blocks.transpose(-2, -1), scales.transpose(-2, -1), triton_kernels_hub |
| 587 | + ) |
| 588 | + # need to overwrite the shapes for the kernels |
| 589 | + if proj == "gate_up_proj": |
| 590 | + triton_weight_tensor.shape = torch.Size([local_experts, module.hidden_size, module.intermediate_size * 2]) |
| 591 | + else: |
| 592 | + triton_weight_tensor.shape = torch.Size([local_experts, module.intermediate_size, module.hidden_size]) |
| 593 | + |
| 594 | + # triton_weight_tensor is what gets passed to the OAI Triton kernels: it wraps the data, the shapes and the layout metadata, much like a tensor subclass |
| 595 | + # Since the Experts module registers gate_up_proj and down_proj as nn.Parameters, we need to remove them so we can attach the Triton tensor |
| 596 | + if proj in module._parameters: |
| 597 | + # Remove the nn.Parameter registration so we can attach the Triton tensor |
| 598 | + del module._parameters[proj] |
| 599 | + setattr(module, proj, triton_weight_tensor) |
| 600 | + setattr( |
| 601 | + module, |
| 602 | + f"{proj}_precision_config", |
| 603 | + PrecisionConfig(weight_scale=weight_scale, flex_ctx=FlexCtx(rhs_data=InFlexData())), |
| 604 | + ) |
| 605 | + |
| 606 | + |
431 | 607 | def _replace_with_mxfp4_linear( |
432 | 608 | model, |
433 | 609 | modules_to_not_convert=None, |
|