Skip to content

Commit 67e69cb

Browse files
gambletanclaude authored and committed
fix: compute image_seq_len from spatial dims, not channel dim in Lumina2 pipeline
Fixes #12913 `image_seq_len` was computed as `latents.shape[1]`, which gives the channel dimension (e.g. 16) since Lumina2 latents have shape `(batch, channels, height, width)` and are NOT packed/reshaped before this point. The Lumina2 transformer internally patchifies the latents with `patch_size=2`, so the correct spatial sequence length is `(H // patch_size) * (W // patch_size)`. This incorrect value was passed to `calculate_shift()`, which computes the `mu` parameter for the flow-matching scheduler. Using channel count instead of token count produces a completely wrong shift, degrading generation quality. The fix reads `patch_size` from `self.transformer.config.patch_size` and computes `image_seq_len` from the last two (spatial) dimensions of the latents tensor, matching how the transformer itself computes its input sequence length. For reference, the Flux pipeline correctly uses `latents.shape[1]` because Flux latents are pre-packed into `(batch, seq_len, channels)` before this computation. Lumina2 does not pre-pack, so the same indexing does not apply. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent e5aa719 commit 67e69cb

File tree

2 files changed

+64
-1
lines changed

2 files changed

+64
-1
lines changed

src/diffusers/pipelines/lumina2/pipeline_lumina2.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -696,7 +696,8 @@ def __call__(
696696

697697
# 5. Prepare timesteps
698698
sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
699-
image_seq_len = latents.shape[1]
699+
patch_size = self.transformer.config.patch_size
700+
image_seq_len = (latents.shape[-2] // patch_size) * (latents.shape[-1] // patch_size)
700701
mu = calculate_shift(
701702
image_seq_len,
702703
self.scheduler.config.get("base_image_seq_len", 256),

tests/pipelines/lumina2/test_pipeline_lumina2.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,68 @@ def get_dummy_components(self):
9999
}
100100
return components
101101

102+
def test_image_seq_len_uses_spatial_dimensions(self):
103+
"""Test that image_seq_len is computed from spatial dims, not channel dim.
104+
105+
Lumina2 latents have shape (batch, channels, height, width) and are NOT
106+
packed before image_seq_len is computed. The transformer patchifies
107+
internally with patch_size=2, so the correct sequence length is
108+
(H // patch_size) * (W // patch_size).
109+
110+
Previously, the code used latents.shape[1] which gives the channel
111+
count (e.g. 4) instead of the spatial sequence length (e.g. 64 for
112+
16x16 latents with patch_size=2). This caused calculate_shift() to
113+
compute a completely wrong mu value for the scheduler.
114+
"""
115+
components = self.get_dummy_components()
116+
pipe = Lumina2Pipeline(**components)
117+
pipe.to(torch.device("cpu"))
118+
119+
patch_size = pipe.transformer.config.patch_size # 2
120+
121+
# Use height=32, width=32 -> latent size 4x4 (vae downscale 8x)
122+
# With patch_size=2: seq_len = (4//2)*(4//2) = 4
123+
# Channel dim = 4, which would be wrong if used as seq_len
124+
# Use a larger size to make the distinction clearer
125+
height, width = 64, 64
126+
latent_h, latent_w = height // 8, width // 8 # 8, 8
127+
expected_seq_len = (latent_h // patch_size) * (latent_w // patch_size) # 16
128+
129+
# The channel dimension is 4 (from vae latent_channels)
130+
# If the bug were present, image_seq_len would be 4 instead of 16
131+
channels = components["vae"].config.latent_channels # 4
132+
self.assertNotEqual(channels, expected_seq_len, "Test needs channels != expected_seq_len to be meaningful")
133+
134+
# Capture the mu value passed to the scheduler
135+
captured = {}
136+
original_set_timesteps = pipe.scheduler.set_timesteps
137+
138+
def capture_mu_set_timesteps(*args, **kwargs):
139+
captured["mu"] = kwargs.get("mu")
140+
return original_set_timesteps(*args, **kwargs)
141+
142+
pipe.scheduler.set_timesteps = capture_mu_set_timesteps
143+
144+
# Run pipeline with specific dimensions
145+
generator = torch.Generator(device="cpu").manual_seed(0)
146+
pipe(
147+
prompt="test",
148+
height=height,
149+
width=width,
150+
num_inference_steps=1,
151+
generator=generator,
152+
output_type="latent",
153+
)
154+
155+
# Verify mu was computed using spatial seq_len, not channel dim
156+
from diffusers.pipelines.lumina2.pipeline_lumina2 import calculate_shift
157+
158+
correct_mu = calculate_shift(expected_seq_len)
159+
wrong_mu = calculate_shift(channels)
160+
161+
self.assertAlmostEqual(captured["mu"], correct_mu, places=5, msg="mu should use spatial sequence length")
162+
self.assertNotAlmostEqual(captured["mu"], wrong_mu, places=5, msg="mu should NOT use channel dimension")
163+
102164
def get_dummy_inputs(self, device, seed=0):
103165
if str(device).startswith("mps"):
104166
generator = torch.manual_seed(seed)

0 commit comments

Comments (0)