Fix context parallel pad length collision across shared split dims

cursoragent · cursoragent · commit 28b0ab095cfd · 2026-06-21T11:53:02.000Z
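Store pre-pad sequence lengths keyed by input name instead of shard
dimension, so that tensors such as hidden_states and
encoder_hidden_states_mask, which share split_dim=1, no longer overwrite
each other's entry. Inputs prepared without a name do not record a pad
length. When trimming gathered output, the gather hook looks up the pad
length under the output metadata's unpad_key attribute, which defaults to
"hidden_states" when the attribute is absent.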
diff --git a/src/diffusers/hooks/context_parallel.py b/src/diffusers/hooks/context_parallel.py
@@ -37,7 +37,7 @@
 
 logger = get_logger(__name__)  # pylint: disable=invalid-name
 
-def _get_cp_pad_lengths(parallel_config: ContextParallelConfig) -> dict[int, int]:
+def _get_cp_pad_lengths(parallel_config: ContextParallelConfig) -> dict[str, int]:
     pad_lengths = getattr(parallel_config, "_cp_pad_lengths", None)
     if pad_lengths is None:
         pad_lengths = {}
@@ -244,7 +244,8 @@ def _prepare_cp_input(self, x: torch.Tensor, cp_input: ContextParallelInput, nam
         if world_size > 1 and seq_len % world_size != 0:
             pad_value = 0 if "mask" in name.lower() else 0.0
             x = _pad_tensor_for_context_parallel(x, dim, world_size, pad_value=pad_value)
-            _get_cp_pad_lengths(self.parallel_config)[dim] = seq_len
+            if name:
+                _get_cp_pad_lengths(self.parallel_config)[name] = seq_len
 
         return EquipartitionSharder.shard(x, dim, mesh)
 
@@ -282,8 +283,9 @@ def post_forward(self, module, output):
                     output[i], cpm.gather_dim, self.parallel_config._flattened_mesh
                 )
 
-            if pad_lengths and cpm.gather_dim in pad_lengths:
-                original_len = pad_lengths.pop(cpm.gather_dim)
+            unpad_key = getattr(cpm, "unpad_key", "hidden_states")
+            if pad_lengths and unpad_key and unpad_key in pad_lengths:
+                original_len = pad_lengths.pop(unpad_key)
                 x = x.narrow(cpm.gather_dim, 0, original_len)
 
             output[i] = x
diff --git a/tests/hooks/test_hooks.py b/tests/hooks/test_hooks.py
@@ -420,7 +420,7 @@ def test_prepare_cp_input_pads_hidden_states(self):
             out = self.hook._prepare_cp_input(x, cp_input, name="hidden_states")
 
         assert out.shape[1] == 9
-        assert self.parallel_config._cp_pad_lengths[1] == 7
+        assert self.parallel_config._cp_pad_lengths["hidden_states"] == 7
 
     def test_prepare_cp_input_pads_attention_mask_with_zeros(self):
         mask = torch.ones(1, 7, dtype=torch.long)
@@ -443,7 +443,7 @@ def test_prepare_cp_input_no_pad_when_divisible(self):
         assert not hasattr(self.parallel_config, "_cp_pad_lengths")
 
     def test_gather_hook_trims_padded_output(self):
-        self.parallel_config._cp_pad_lengths = {1: 7}
+        self.parallel_config._cp_pad_lengths = {"hidden_states": 7}
         gather_hook = ContextParallelGatherHook(
             metadata=[ContextParallelOutput(gather_dim=1, expected_dims=3)],
             parallel_config=self.parallel_config,
@@ -454,5 +454,5 @@ def test_gather_hook_trims_padded_output(self):
             out = gather_hook.post_forward(self.module, x)
 
         assert out.shape[1] == 7
-        assert 1 not in getattr(self.parallel_config, "_cp_pad_lengths", {})
+        assert "hidden_states" not in getattr(self.parallel_config, "_cp_pad_lengths", {})