Commit 0a0166a

add changed
1 parent 800510c commit 0a0166a

File tree

4 files changed (+230 / -99 lines)

src/transformers/models/glm/configuration_glm.py

Lines changed: 2 additions & 0 deletions
@@ -121,6 +121,7 @@ def __init__(
         eos_token_id=[151329, 151336, 151338],
         bos_token_id=None,
         attention_bias=True,
+        sandwich=False,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -139,6 +140,7 @@ def __init__(
         self.rope_theta = rope_theta
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
+        self.sandwich = sandwich
 
         super().__init__(
             pad_token_id=pad_token_id,
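For reference, a minimal sketch of how the new flag is consumed (the default stays False, so existing GLM configs keep today's behaviour). The meta-device instantiation mirrors the conversion script below and avoids allocating real weights; the inspection itself is illustrative and not part of this commit:

    import torch
    from transformers import GlmConfig, GlmForCausalLM

    # Request the two extra RMSNorms (post self-attention and post MLP) added by this commit
    config = GlmConfig(sandwich=True)

    # Instantiate on the meta device, as the conversion script does, just to inspect the layer layout
    with torch.device("meta"):
        model = GlmForCausalLM(config)

    # The first decoder layer should now list post_self_attn_layernorm and post_mlp_layernorm
    print(model.model.layers[0])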

src/transformers/models/glm/convert_glm_weights_to_hf.py

Lines changed: 53 additions & 34 deletions
@@ -9,32 +9,36 @@
 
 from transformers import GlmConfig, GlmForCausalLM, PreTrainedTokenizerFast
 
-
 # fmt: off
 # `None` means we drop the key
-STATE_DICT_MAPPING = {
+BASE_STATE_DICT_MAPPING = {
     # CausalLM keys
-    r"transformer.output_layer.weight": r"lm_head.weight",
+    r"transformer.output_layer.weight": r"lm_head.weight",
 
     # Model keys
-    r"transformer.embedding.word_embeddings.weight": r"model.embed_tokens.weight",
-    r"transformer.rotary_pos_emb.inv_freq": None,
-    r"transformer.encoder.final_layernorm.weight": r"model.norm.weight",
+    r"transformer.embedding.word_embeddings.weight": r"model.embed_tokens.weight",
+    r"transformer.rotary_pos_emb.inv_freq": None,
+    r"transformer.encoder.final_layernorm.weight": r"model.norm.weight",
 
     # Layers keys
-    r"transformer.encoder.layers.(\d+).input_layernorm.weight": r"model.layers.\1.input_layernorm.weight",
-    r"transformer.encoder.layers.(\d+).post_attention_layernorm.weight": r"model.layers.\1.post_attention_layernorm.weight",
+    r"transformer.encoder.layers.(\d+).input_layernorm.weight": r"model.layers.\1.input_layernorm.weight",
+    r"transformer.encoder.layers.(\d+).post_attention_layernorm.weight": r"model.layers.\1.post_attention_layernorm.weight",
 
     # Attention keys
-    r"transformer.encoder.layers.(\d+).self_attention.dense.weight": r"model.layers.\1.self_attn.o_proj.weight",
+    r"transformer.encoder.layers.(\d+).self_attention.dense.weight": r"model.layers.\1.self_attn.o_proj.weight",
     # qkv_proj will later be split in q|k|v|_proj
     r"transformer.encoder.layers.(\d+).self_attention.query_key_value.(weight|bias)": r"model.layers.\1.self_attn.qkv_proj.\2",
 
     # MLP keys
-    r"transformer.encoder.layers.(\d+).mlp.dense_h_to_4h.weight": r"model.layers.\1.mlp.gate_up_proj.weight",
-    r"transformer.encoder.layers.(\d+).mlp.dense_4h_to_h.weight": r"model.layers.\1.mlp.down_proj.weight",
+    r"transformer.encoder.layers.(\d+).mlp.dense_h_to_4h.weight": r"model.layers.\1.mlp.gate_up_proj.weight",
+    r"transformer.encoder.layers.(\d+).mlp.dense_4h_to_h.weight": r"model.layers.\1.mlp.down_proj.weight",
+}
+
+# Additional mappings for sandwich mode
+SANDWICH_STATE_DICT_MAPPING = {
+    r"transformer.encoder.layers.(\d+).post_mlp_layernorm.weight": r"model.layers.\1.post_mlp_layernorm.weight",
+    r"transformer.encoder.layers.(\d+).post_self_attn_layernorm.weight": r"model.layers.\1.post_self_attn_layernorm.weight",
 }
-# fmt: on
 
 
 def load_weights(input_dir: str):
@@ -61,8 +65,8 @@ def load_weights(input_dir: str):
         raise ValueError("No .safetensors or .bin files found in the specified directory.")
 
 
-def map_old_key_to_new(old_key):
-    for pattern, replacement in STATE_DICT_MAPPING.items():
+def map_old_key_to_new(old_key, state_dict_mapping):
+    for pattern, replacement in state_dict_mapping.items():
         if replacement is None:
             if re.fullmatch(pattern, old_key):
                 return None
@@ -75,33 +79,43 @@ def map_old_key_to_new(old_key):
     raise ValueError(f"Key: {old_key} could not be mapped (check the mapping).")
 
 
-def convert_state_dict(original_state_dict: dict, config: GlmConfig):
+def convert_state_dict(original_state_dict: dict, config: GlmConfig, use_sandwich: bool = False):
     new_dict = {}
 
     head_dim = config.hidden_size // config.num_attention_heads
     query_size = config.num_attention_heads * head_dim
     kv_size = config.num_key_value_heads * head_dim
 
+    # Combine the base mapping with sandwich mapping if sandwich mode is enabled
+    state_dict_mapping = BASE_STATE_DICT_MAPPING.copy()
+    if use_sandwich:
+        state_dict_mapping.update(SANDWICH_STATE_DICT_MAPPING)
+
     for old_key, value in original_state_dict.items():
-        new_key = map_old_key_to_new(old_key)
-        if new_key is None:
+        try:
+            new_key = map_old_key_to_new(old_key, state_dict_mapping)
+            if new_key is None:
+                continue
+
+            if "qkv_proj." in new_key:
+                q_proj, k_proj, v_proj = (
+                    value[:query_size, ...],
+                    value[query_size : query_size + kv_size, ...],
+                    value[query_size + kv_size :, ...],
+                )
+                new_dict[new_key.replace("qkv_proj.", "q_proj.")] = q_proj
+                new_dict[new_key.replace("qkv_proj.", "k_proj.")] = k_proj
+                new_dict[new_key.replace("qkv_proj.", "v_proj.")] = v_proj
+            else:
+                new_dict[new_key] = value
+        except ValueError:
+            # Skip keys that couldn't be mapped
            continue
 
-        if "qkv_proj." in new_key:
-            q_proj, k_proj, v_proj = (
-                value[:query_size, ...],
-                value[query_size : query_size + kv_size, ...],
-                value[query_size + kv_size :, ...],
-            )
-            new_dict[new_key.replace("qkv_proj.", "q_proj.")] = q_proj
-            new_dict[new_key.replace("qkv_proj.", "k_proj.")] = k_proj
-            new_dict[new_key.replace("qkv_proj.", "v_proj.")] = v_proj
-        else:
-            new_dict[new_key] = value
     return new_dict
 
 
-def convert_config(original_config: dict):
+def convert_config(original_config: dict, use_sandwich: bool = False):
     key_mapping = {
         "vocab_size": "padded_vocab_size",
         "intermediate_size": "ffn_hidden_size",
@@ -128,6 +142,7 @@ def convert_config(original_config: dict):
         else original_config["multi_query_group_num"]
     )
     new_config_kwargs["rope_theta"] = 10000.0 * getattr(original_config, "rope_ratio", 1)
+    new_config_kwargs["sandwich"] = use_sandwich
 
     new_config = GlmConfig(**new_config_kwargs)
     return new_config
@@ -153,16 +168,16 @@ def convert_glm_tokenizer(input_dir, use_post_processor=False):
     return fast_tok
 
 
-def convert_glm_model(input_dir, output_dir, use_post_processor=False):
+def convert_glm_model(input_dir, output_dir, use_post_processor=False, use_sandwich=False):
     # Load and convert config
     with open(os.path.join(input_dir, "config.json")) as f:
         original_config = json.load(f)
-    config = convert_config(original_config)
+    config = convert_config(original_config, use_sandwich)
    config.save_pretrained(output_dir)
 
     # Load and convert weights
     original_state_dict = load_weights(input_dir)
-    new_dict = convert_state_dict(original_state_dict, config)
+    new_dict = convert_state_dict(original_state_dict, config, use_sandwich)
     with torch.device("meta"):
         model = GlmForCausalLM(config)
     model.load_state_dict(new_dict, strict=True, assign=True)
@@ -190,6 +205,10 @@ def convert_glm_model(input_dir, output_dir, use_post_processor=False):
         action="store_true",
         help="Whether to apply post processor with special tokens",
     )
-
+    parser.add_argument(
+        "--sandwich",
+        action="store_true",
+        help="Whether to use two GlmRMSNorm",
+    )
     args = parser.parse_args()
-    convert_glm_model(args.input_dir, args.output_dir, args.use_post_processor)
+    convert_glm_model(args.input_dir, args.output_dir, args.use_post_processor, args.sandwich)
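A hedged usage sketch for the updated converter. The call mirrors the new convert_glm_model signature; the paths are hypothetical, and the --input_dir / --output_dir CLI flag names are assumed from args.input_dir / args.output_dir above:

    # Equivalent CLI (flag names assumed):
    #   python convert_glm_weights_to_hf.py --input_dir <original_ckpt> --output_dir <hf_ckpt> --sandwich
    from convert_glm_weights_to_hf import convert_glm_model  # assumes the script is importable from its directory

    convert_glm_model(
        input_dir="path/to/original_glm_checkpoint",   # hypothetical path
        output_dir="path/to/converted_hf_checkpoint",  # hypothetical path
        use_post_processor=False,
        use_sandwich=True,  # merges SANDWICH_STATE_DICT_MAPPING and sets config.sandwich = True
    )

With use_sandwich=True the extra post_self_attn_layernorm / post_mlp_layernorm weights are mapped; note that convert_state_dict now swallows ValueError, so any key that cannot be matched is silently skipped rather than aborting the conversion.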

src/transformers/models/glm/modeling_glm.py

Lines changed: 67 additions & 59 deletions
@@ -244,6 +244,71 @@ def forward(
         return attn_output, attn_weights
 
 
+class GlmDecoderLayer(nn.Module):
+    def __init__(self, config: GlmConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.sandwich = config.sandwich
+        self.self_attn = GlmAttention(config=config, layer_idx=layer_idx)
+
+        self.mlp = GlmMLP(config)
+        self.input_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        if self.sandwich:
+            self.post_self_attn_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+            self.post_mlp_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        if self.sandwich:
+            hidden_states = self.post_self_attn_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        if self.sandwich:
+            hidden_states = self.post_mlp_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        return outputs
+
+
+class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
+
+
 class GlmRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
@@ -325,60 +390,6 @@ def forward(self, x, position_ids):
         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
 
 
-class GlmDecoderLayer(nn.Module):
-    def __init__(self, config: GlmConfig, layer_idx: int):
-        super().__init__()
-        self.hidden_size = config.hidden_size
-
-        self.self_attn = GlmAttention(config=config, layer_idx=layer_idx)
-
-        self.mlp = GlmMLP(config)
-        self.input_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Cache] = None,
-        output_attentions: Optional[bool] = False,
-        use_cache: Optional[bool] = False,
-        cache_position: Optional[torch.LongTensor] = None,
-        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
-        **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
-        residual = hidden_states
-
-        hidden_states = self.input_layernorm(hidden_states)
-
-        # Self Attention
-        hidden_states, self_attn_weights = self.self_attn(
-            hidden_states=hidden_states,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_value=past_key_value,
-            output_attentions=output_attentions,
-            use_cache=use_cache,
-            cache_position=cache_position,
-            position_embeddings=position_embeddings,
-            **kwargs,
-        )
-        hidden_states = residual + hidden_states
-
-        # Fully Connected
-        residual = hidden_states
-        hidden_states = self.post_attention_layernorm(hidden_states)
-        hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + hidden_states
-
-        outputs = (hidden_states,)
-        if output_attentions:
-            outputs += (self_attn_weights,)
-
-        return outputs
-
-
 GLM_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
@@ -765,9 +776,6 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
         return causal_mask
 
 
-class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
-
-
 class GlmForCausalLM(GlmPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]
     _tp_plan = {"lm_head": "colwise_rep"}
@@ -839,8 +847,8 @@ def forward(
         ```python
         >>> from transformers import AutoTokenizer, GlmForCausalLM
 
-        >>> model = GlmForCausalLM.from_pretrained("meta-glm/Glm-2-7b-hf")
-        >>> tokenizer = AutoTokenizer.from_pretrained("meta-glm/Glm-2-7b-hf")
+        >>> model = GlmForCausalLM.from_pretrained("THUDM/glm-4-9b-chat-hf")
+        >>> tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat-hf")
 
         >>> prompt = "Hey, are you conscious? Can you talk to me?"
         >>> inputs = tokenizer(prompt, return_tensors="pt")