From ec918b9aab6a4b3fc020361c0c5de5617ff7945b Mon Sep 17 00:00:00 2001
From: Pham Hong Vinh <vinhph3@vng.com.vn>
Date: Wed, 8 Jan 2025 00:12:33 +0700
Subject: [PATCH 1/5] add framewise decode

---
 .../models/autoencoders/autoencoder_kl_ltx.py | 58 ++++++++++++++++---
 1 file changed, 51 insertions(+), 7 deletions(-)

diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py
index 9aa53f7af243..d2562915edca 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py
@@ -1010,10 +1010,12 @@ def __init__(
         # The minimal tile height and width for spatial tiling to be used
         self.tile_sample_min_height = 512
         self.tile_sample_min_width = 512
+        self.tile_sample_min_num_frames = 16
 
         # The minimal distance between two spatial tiles
         self.tile_sample_stride_height = 448
         self.tile_sample_stride_width = 448
+        self.tile_sample_stride_num_frames = 8 
 
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, (LTXVideoEncoder3d, LTXVideoDecoder3d)):
@@ -1114,6 +1116,53 @@ def encode(
         if not return_dict:
             return (posterior,)
         return AutoencoderKLOutput(latent_dist=posterior)
+        
+    def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)
+        for x in range(blend_extent):
+            b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * (
+                x / blend_extent
+            )
+        return b
+
+    def _temporal_tiled_decode(self, z: torch.Tensor, temb: Optional[torch.Tensor], return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
+        batch_size, num_channels, num_frames, height, width = z.shape
+        num_sample_frames = (num_frames - 1) * self.temporal_compression_ratio + 1
+
+        tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
+        tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
+        tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio
+        tile_latent_stride_num_frames = self.tile_sample_stride_num_frames // self.temporal_compression_ratio
+        blend_num_frames = self.tile_sample_min_num_frames - self.tile_sample_stride_num_frames
+
+        row = []
+        for i in range(0, num_frames, tile_latent_stride_num_frames):
+            tile = z[:, :, i : i + tile_latent_min_num_frames + 1, :, :]
+            if self.use_tiling and (tile.shape[-1] > tile_latent_min_width or tile.shape[-2] > tile_latent_min_height):
+                decoded = self.tiled_decode(tile, temb, return_dict=True).sample
+            else:
+                print("NOT Use tile decode")
+                print(f"input tile: {tile.size()}")
+                decoded = self.decoder(tile, temb)
+                print(f"output tile: {decoded.size()}")
+            if i > 0:
+                decoded = decoded[:, :, :-1, :, :]
+            row.append(decoded)
+
+        result_row = []
+        for i, tile in enumerate(row):
+            if i > 0:
+                tile = self.blend_t(row[i - 1], tile, blend_num_frames)
+                tile = tile[:, :, : self.tile_sample_stride_num_frames, :, :]
+                result_row.append(tile)
+            else:
+                result_row.append(tile[:, :, :self.tile_sample_stride_num_frames + 1, :, :])
+
+        dec = torch.cat(result_row, dim=2)[:, :, :num_sample_frames]
+
+        if not return_dict:
+            return (dec,)
+        return DecoderOutput(sample=dec)
 
     def _decode(
         self, z: torch.Tensor, temb: Optional[torch.Tensor] = None, return_dict: bool = True
@@ -1125,13 +1174,8 @@ def _decode(
         if self.use_tiling and (width > tile_latent_min_width or height > tile_latent_min_height):
             return self.tiled_decode(z, temb, return_dict=return_dict)
 
-        if self.use_framewise_decoding:
-            # TODO(aryan): requires investigation
-            raise NotImplementedError(
-                "Frame-wise decoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to "
-                "quality issues caused by splitting inference across frame dimension. If you believe this "
-                "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls."
-            )
+        if self.use_framewise_decoding and num_frames > tile_latent_min_num_frames:
+            dec = self._temporal_tiled_decode(z, temb, return_dict=False)[0]
         else:
             dec = self.decoder(z, temb)
 

From 64a08490acb3d94c6cc1f885f5689dd2a9df9707 Mon Sep 17 00:00:00 2001
From: Pham Hong Vinh <phamvinh257@gmail.com>
Date: Wed, 8 Jan 2025 11:09:55 +0700
Subject: [PATCH 2/5] add framewise encode, refactor tiled encode/decode

---
 .../models/autoencoders/autoencoder_kl_ltx.py | 174 +++++++++---------
 1 file changed, 92 insertions(+), 82 deletions(-)

diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py
index d2562915edca..84eeb9399733 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py
@@ -1025,8 +1025,10 @@ def enable_tiling(
         self,
         tile_sample_min_height: Optional[int] = None,
         tile_sample_min_width: Optional[int] = None,
+        tile_sample_min_num_frames: Optional[int] = None,
         tile_sample_stride_height: Optional[float] = None,
         tile_sample_stride_width: Optional[float] = None,
+        tile_sample_stride_num_frames: Optional[float] = None,
     ) -> None:
         r"""
         Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
@@ -1048,8 +1050,10 @@ def enable_tiling(
         self.use_tiling = True
         self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height
         self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width
+        self.tile_sample_min_num_frames = tile_sample_min_num_frames or self.tile_sample_min_num_frames
         self.tile_sample_stride_height = tile_sample_stride_height or self.tile_sample_stride_height
         self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width
+        self.tile_sample_stride_num_frames = tile_sample_stride_num_frames or self.tile_sample_stride_num_frames
 
     def disable_tiling(self) -> None:
         r"""
@@ -1075,18 +1079,13 @@ def disable_slicing(self) -> None:
     def _encode(self, x: torch.Tensor) -> torch.Tensor:
         batch_size, num_channels, num_frames, height, width = x.shape
 
+        if self.use_framewise_encoding and num_frames > self.tile_sample_min_num_frames:
+            return self._temporal_tiled_encode(x)  # temporal tiling for long clips
+
         if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
             return self.tiled_encode(x)
 
-        if self.use_framewise_encoding:
-            # TODO(aryan): requires investigation
-            raise NotImplementedError(
-                "Frame-wise encoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to "
-                "quality issues caused by splitting inference across frame dimension. If you believe this "
-                "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls."
-            )
-        else:
-            enc = self.encoder(x)
+        enc = self.encoder(x)
 
         return enc
 
@@ -1116,53 +1115,6 @@ def encode(
         if not return_dict:
             return (posterior,)
         return AutoencoderKLOutput(latent_dist=posterior)
-        
-    def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
-        blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)
-        for x in range(blend_extent):
-            b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * (
-                x / blend_extent
-            )
-        return b
-
-    def _temporal_tiled_decode(self, z: torch.Tensor, temb: Optional[torch.Tensor], return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
-        batch_size, num_channels, num_frames, height, width = z.shape
-        num_sample_frames = (num_frames - 1) * self.temporal_compression_ratio + 1
-
-        tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
-        tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
-        tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio
-        tile_latent_stride_num_frames = self.tile_sample_stride_num_frames // self.temporal_compression_ratio
-        blend_num_frames = self.tile_sample_min_num_frames - self.tile_sample_stride_num_frames
-
-        row = []
-        for i in range(0, num_frames, tile_latent_stride_num_frames):
-            tile = z[:, :, i : i + tile_latent_min_num_frames + 1, :, :]
-            if self.use_tiling and (tile.shape[-1] > tile_latent_min_width or tile.shape[-2] > tile_latent_min_height):
-                decoded = self.tiled_decode(tile, temb, return_dict=True).sample
-            else:
-                print("NOT Use tile decode")
-                print(f"input tile: {tile.size()}")
-                decoded = self.decoder(tile, temb)
-                print(f"output tile: {decoded.size()}")
-            if i > 0:
-                decoded = decoded[:, :, :-1, :, :]
-            row.append(decoded)
-
-        result_row = []
-        for i, tile in enumerate(row):
-            if i > 0:
-                tile = self.blend_t(row[i - 1], tile, blend_num_frames)
-                tile = tile[:, :, : self.tile_sample_stride_num_frames, :, :]
-                result_row.append(tile)
-            else:
-                result_row.append(tile[:, :, :self.tile_sample_stride_num_frames + 1, :, :])
-
-        dec = torch.cat(result_row, dim=2)[:, :, :num_sample_frames]
-
-        if not return_dict:
-            return (dec,)
-        return DecoderOutput(sample=dec)
 
     def _decode(
         self, z: torch.Tensor, temb: Optional[torch.Tensor] = None, return_dict: bool = True
@@ -1171,13 +1123,13 @@ def _decode(
         tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
         tile_latent_min_width = self.tile_sample_stride_width // self.spatial_compression_ratio
 
+        if self.use_framewise_decoding and num_frames > tile_latent_min_num_frames:
+            return self._temporal_tiled_decode(z, temb, return_dict=return_dict)
+
         if self.use_tiling and (width > tile_latent_min_width or height > tile_latent_min_height):
             return self.tiled_decode(z, temb, return_dict=return_dict)
 
-        if self.use_framewise_decoding and num_frames > tile_latent_min_num_frames:
-            dec = self._temporal_tiled_decode(z, temb, return_dict=False)[0]
-        else:
-            dec = self.decoder(z, temb)
+        dec = self.decoder(z, temb)
 
         if not return_dict:
             return (dec,)
@@ -1232,6 +1184,14 @@ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.
                 x / blend_extent
             )
         return b
+        
+    def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)
+        for x in range(blend_extent):
+            b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * (
+                x / blend_extent
+            )
+        return b
 
     def tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
         r"""Encode a batch of images using a tiled encoder.
@@ -1261,17 +1221,9 @@ def tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
         for i in range(0, height, self.tile_sample_stride_height):
             row = []
             for j in range(0, width, self.tile_sample_stride_width):
-                if self.use_framewise_encoding:
-                    # TODO(aryan): requires investigation
-                    raise NotImplementedError(
-                        "Frame-wise encoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to "
-                        "quality issues caused by splitting inference across frame dimension. If you believe this "
-                        "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls."
-                    )
-                else:
-                    time = self.encoder(
-                        x[:, :, :, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width]
-                    )
+                time = self.encoder(
+                    x[:, :, :, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width]
+                )
 
                 row.append(time)
             rows.append(row)
@@ -1327,17 +1279,9 @@ def tiled_decode(
         for i in range(0, height, tile_latent_stride_height):
             row = []
             for j in range(0, width, tile_latent_stride_width):
-                if self.use_framewise_decoding:
-                    # TODO(aryan): requires investigation
-                    raise NotImplementedError(
-                        "Frame-wise decoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to "
-                        "quality issues caused by splitting inference across frame dimension. If you believe this "
-                        "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls."
-                    )
-                else:
-                    time = self.decoder(
-                        z[:, :, :, i : i + tile_latent_min_height, j : j + tile_latent_min_width], temb
-                    )
+                time = self.decoder(
+                    z[:, :, :, i : i + tile_latent_min_height, j : j + tile_latent_min_width], temb
+                )
 
                 row.append(time)
             rows.append(row)
@@ -1362,6 +1306,72 @@ def tiled_decode(
 
         return DecoderOutput(sample=dec)
 
+    def _temporal_tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
+        batch_size, num_channels, num_frames, height, width = x.shape
+        latent_num_frames = (num_frames - 1) // self.temporal_compression_ratio + 1
+
+        tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio
+        tile_latent_stride_num_frames = self.tile_sample_stride_num_frames // self.temporal_compression_ratio
+        blend_num_frames = tile_latent_min_num_frames - tile_latent_stride_num_frames  # overlap, in latent frames
+
+        row = []  # encoded temporal tiles, in frame order
+        for i in range(0, num_frames, self.tile_sample_stride_num_frames):
+            tile = x[:, :, i : i + self.tile_sample_min_num_frames + 1, :, :]
+            if self.use_tiling and (height > self.tile_sample_min_height or width > self.tile_sample_min_width):
+                tile = self.tiled_encode(tile)
+            else:
+                tile = self.encoder(tile)
+            if i > 0:
+                tile = tile[:, :, 1:, :, :]  # every tile after the first drops its leading latent frame
+            row.append(tile)
+
+        result_row = []
+        for i, tile in enumerate(row):
+            if i > 0:
+                tile = self.blend_t(row[i - 1], tile, blend_num_frames)  # cross-fade with the previous tile
+                result_row.append(tile[:, :, :tile_latent_stride_num_frames, :, :])
+            else:
+                result_row.append(tile[:, :, : tile_latent_stride_num_frames + 1, :, :])
+
+        enc = torch.cat(result_row, dim=2)[:, :, :latent_num_frames]
+        return enc
+
+    def _temporal_tiled_decode(self, z: torch.Tensor, temb: Optional[torch.Tensor], return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
+        batch_size, num_channels, num_frames, height, width = z.shape
+        num_sample_frames = (num_frames - 1) * self.temporal_compression_ratio + 1
+
+        tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
+        tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
+        tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio
+        tile_latent_stride_num_frames = self.tile_sample_stride_num_frames // self.temporal_compression_ratio
+        blend_num_frames = self.tile_sample_min_num_frames - self.tile_sample_stride_num_frames
+
+        row = []
+        for i in range(0, num_frames, tile_latent_stride_num_frames):
+            tile = z[:, :, i : i + tile_latent_min_num_frames + 1, :, :]
+            if self.use_tiling and (tile.shape[-1] > tile_latent_min_width or tile.shape[-2] > tile_latent_min_height):
+                decoded = self.tiled_decode(tile, temb, return_dict=True).sample
+            else:
+                decoded = self.decoder(tile, temb)
+            if i > 0:
+                decoded = decoded[:, :, :-1, :, :]
+            row.append(decoded)
+
+        result_row = []
+        for i, tile in enumerate(row):
+            if i > 0:
+                tile = self.blend_t(row[i - 1], tile, blend_num_frames)
+                tile = tile[:, :, : self.tile_sample_stride_num_frames, :, :]
+                result_row.append(tile)
+            else:
+                result_row.append(tile[:, :, :self.tile_sample_stride_num_frames + 1, :, :])
+
+        dec = torch.cat(result_row, dim=2)[:, :, :num_sample_frames]
+
+        if not return_dict:
+            return (dec,)
+        return DecoderOutput(sample=dec)
+
     def forward(
         self,
         sample: torch.Tensor,

From e79162c1e57c252c17ceb241efd99abce8b8557a Mon Sep 17 00:00:00 2001
From: Pham Hong Vinh <phamvinh257@gmail.com>
Date: Wed, 8 Jan 2025 14:50:11 +0700
Subject: [PATCH 3/5] add sanity test tiling for ltx

---
 .../models/autoencoders/autoencoder_kl_ltx.py |  7 +++--
 .../test_models_autoencoder_ltx_video.py      | 31 +++++++++++++++++++
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py
index 84eeb9399733..7004f7199fad 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py
@@ -998,8 +998,8 @@ def __init__(
 
         # When decoding temporally long video latents, the memory requirement is very high. By decoding latent frames
         # at a fixed frame batch size (based on `self.num_latent_frames_batch_sizes`), the memory requirement can be lowered.
-        self.use_framewise_encoding = False
-        self.use_framewise_decoding = False
+        self.use_framewise_encoding = True
+        self.use_framewise_decoding = True
 
         # This can be configured based on the amount of GPU memory available.
         # `16` for sample frames and `2` for latent frames are sensible defaults for consumer GPUs.
@@ -1122,6 +1122,7 @@ def _decode(
         batch_size, num_channels, num_frames, height, width = z.shape
         tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
         tile_latent_min_width = self.tile_sample_stride_width // self.spatial_compression_ratio
+        tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio
 
         if self.use_framewise_decoding and num_frames > tile_latent_min_num_frames:
             return self._temporal_tiled_decode(z, temb, return_dict=return_dict)
@@ -1388,5 +1389,5 @@ def forward(
             z = posterior.mode()
         dec = self.decode(z, temb)
         if not return_dict:
-            return (dec,)
+            return (dec.sample,)
         return dec
diff --git a/tests/models/autoencoders/test_models_autoencoder_ltx_video.py b/tests/models/autoencoders/test_models_autoencoder_ltx_video.py
index 37f9837c8245..66d170b28eee 100644
--- a/tests/models/autoencoders/test_models_autoencoder_ltx_video.py
+++ b/tests/models/autoencoders/test_models_autoencoder_ltx_video.py
@@ -167,3 +167,34 @@ def test_outputs_equivalence(self):
     @unittest.skip("AutoencoderKLLTXVideo does not support `norm_num_groups` because it does not use GroupNorm.")
     def test_forward_with_norm_groups(self):
         pass
+
+    def test_enable_disable_tiling(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict).to(torch_device)
+
+        inputs_dict.update({"return_dict": False})
+
+        torch.manual_seed(0)
+        output_without_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0]
+
+        torch.manual_seed(0)
+        model.enable_tiling()
+        output_with_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0]
+
+        self.assertLess(
+            (output_without_tiling.detach().cpu() - output_with_tiling.detach().cpu()).abs().max().item(),
+            0.5,  # bound on the largest absolute element-wise deviation
+            "VAE tiling should not affect the inference results",
+        )
+
+        torch.manual_seed(0)
+        model.disable_tiling()
+        output_without_tiling_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0]
+
+        self.assertTrue(
+            # Compare element-wise; reducing each side with `.all()` would only compare two booleans.
+            torch.allclose(output_without_tiling.detach(), output_without_tiling_2.detach(), atol=1e-5),
+            "Without tiling outputs should match with the outputs when tiling is manually disabled.",
+        )

From c5e6d62f1171f1f2bdebb1df8c025d7b0d86a060 Mon Sep 17 00:00:00 2001
From: Pham Hong Vinh <phamvinh257@gmail.com>
Date: Fri, 10 Jan 2025 00:07:44 +0700
Subject: [PATCH 4/5] run make style

---
 .../models/autoencoders/autoencoder_kl_ltx.py      | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py
index 7004f7199fad..088e2832c4a5 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py
@@ -1015,7 +1015,7 @@ def __init__(
         # The minimal distance between two spatial tiles
         self.tile_sample_stride_height = 448
         self.tile_sample_stride_width = 448
-        self.tile_sample_stride_num_frames = 8 
+        self.tile_sample_stride_num_frames = 8
 
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, (LTXVideoEncoder3d, LTXVideoDecoder3d)):
@@ -1185,7 +1185,7 @@ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.
                 x / blend_extent
             )
         return b
-        
+
     def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
         blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)
         for x in range(blend_extent):
@@ -1280,9 +1280,7 @@ def tiled_decode(
         for i in range(0, height, tile_latent_stride_height):
             row = []
             for j in range(0, width, tile_latent_stride_width):
-                time = self.decoder(
-                    z[:, :, :, i : i + tile_latent_min_height, j : j + tile_latent_min_width], temb
-                )
+                time = self.decoder(z[:, :, :, i : i + tile_latent_min_height, j : j + tile_latent_min_width], temb)
 
                 row.append(time)
             rows.append(row)
@@ -1337,7 +1335,9 @@ def _temporal_tiled_encode(self, x: torch.Tensor) -> AutoencoderKLOutput:
         enc = torch.cat(result_row, dim=2)[:, :, :latent_num_frames]
         return enc
 
-    def _temporal_tiled_decode(self, z: torch.Tensor, temb: Optional[torch.Tensor], return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
+    def _temporal_tiled_decode(
+        self, z: torch.Tensor, temb: Optional[torch.Tensor], return_dict: bool = True
+    ) -> Union[DecoderOutput, torch.Tensor]:
         batch_size, num_channels, num_frames, height, width = z.shape
         num_sample_frames = (num_frames - 1) * self.temporal_compression_ratio + 1
 
@@ -1365,7 +1365,7 @@ def _temporal_tiled_decode(self, z: torch.Tensor, temb: Optional[torch.Tensor],
                 tile = tile[:, :, : self.tile_sample_stride_num_frames, :, :]
                 result_row.append(tile)
             else:
-                result_row.append(tile[:, :, :self.tile_sample_stride_num_frames + 1, :, :])
+                result_row.append(tile[:, :, : self.tile_sample_stride_num_frames + 1, :, :])
 
         dec = torch.cat(result_row, dim=2)[:, :, :num_sample_frames]
 

From 88bfc368fee8ed537950620af9e5da01549f5c93 Mon Sep 17 00:00:00 2001
From: "Vinh H. Pham" <phamvinh257@gmail.com>
Date: Sat, 11 Jan 2025 15:19:27 +0700
Subject: [PATCH 5/5] Update
 src/diffusers/models/autoencoders/autoencoder_kl_ltx.py

Co-authored-by: Aryan <contact.aryanvs@gmail.com>
---
 src/diffusers/models/autoencoders/autoencoder_kl_ltx.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py
index 088e2832c4a5..25753afd5ce6 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py
@@ -998,8 +998,8 @@ def __init__(
 
         # When decoding temporally long video latents, the memory requirement is very high. By decoding latent frames
         # at a fixed frame batch size (based on `self.num_latent_frames_batch_sizes`), the memory requirement can be lowered.
-        self.use_framewise_encoding = True
-        self.use_framewise_decoding = True
+        self.use_framewise_encoding = False
+        self.use_framewise_decoding = False
 
         # This can be configured based on the amount of GPU memory available.
         # `16` for sample frames and `2` for latent frames are sensible defaults for consumer GPUs.