From ec918b9aab6a4b3fc020361c0c5de5617ff7945b Mon Sep 17 00:00:00 2001 From: Pham Hong Vinh Date: Wed, 8 Jan 2025 00:12:33 +0700 Subject: [PATCH 1/5] add framewise decode --- .../models/autoencoders/autoencoder_kl_ltx.py | 58 ++++++++++++++++--- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py index 9aa53f7af243..d2562915edca 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py @@ -1010,10 +1010,12 @@ def __init__( # The minimal tile height and width for spatial tiling to be used self.tile_sample_min_height = 512 self.tile_sample_min_width = 512 + self.tile_sample_min_num_frames = 16 # The minimal distance between two spatial tiles self.tile_sample_stride_height = 448 self.tile_sample_stride_width = 448 + self.tile_sample_stride_num_frames = 8 def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (LTXVideoEncoder3d, LTXVideoDecoder3d)): @@ -1114,6 +1116,53 @@ def encode( if not return_dict: return (posterior,) return AutoencoderKLOutput(latent_dist=posterior) + + def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[-3], b.shape[-3], blend_extent) + for x in range(blend_extent): + b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * ( + x / blend_extent + ) + return b + + def _temporal_tiled_decode(self, z: torch.Tensor, temb: Optional[torch.Tensor], return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: + batch_size, num_channels, num_frames, height, width = z.shape + num_sample_frames = (num_frames - 1) * self.temporal_compression_ratio + 1 + + tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio + tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio + tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio + tile_latent_stride_num_frames = self.tile_sample_stride_num_frames // self.temporal_compression_ratio + blend_num_frames = self.tile_sample_min_num_frames - self.tile_sample_stride_num_frames + + row = [] + for i in range(0, num_frames, tile_latent_stride_num_frames): + tile = z[:, :, i : i + tile_latent_min_num_frames + 1, :, :] + if self.use_tiling and (tile.shape[-1] > tile_latent_min_width or tile.shape[-2] > tile_latent_min_height): + decoded = self.tiled_decode(tile, temb, return_dict=True).sample + else: + print("NOT Use tile decode") + print(f"input tile: {tile.size()}") + decoded = self.decoder(tile, temb) + print(f"output tile: {decoded.size()}") + if i > 0: + decoded = decoded[:, :, :-1, :, :] + row.append(decoded) + + result_row = [] + for i, tile in enumerate(row): + if i > 0: + tile = self.blend_t(row[i - 1], tile, blend_num_frames) + tile = tile[:, :, : self.tile_sample_stride_num_frames, :, :] + result_row.append(tile) + else: + result_row.append(tile[:, :, :self.tile_sample_stride_num_frames + 1, :, :]) + + dec = torch.cat(result_row, dim=2)[:, :, :num_sample_frames] + + if not return_dict: + return (dec,) + return DecoderOutput(sample=dec) def _decode( self, z: torch.Tensor, temb: Optional[torch.Tensor] = None, return_dict: bool = True @@ -1125,13 +1174,8 @@ def _decode( if self.use_tiling and (width > tile_latent_min_width or height > tile_latent_min_height): return self.tiled_decode(z, temb, return_dict=return_dict) - if self.use_framewise_decoding: - # TODO(aryan): requires investigation - raise NotImplementedError( - "Frame-wise decoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to " - "quality issues caused by splitting inference across frame dimension. If you believe this " - "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls." - ) + if self.use_framewise_decoding and num_frames > tile_latent_min_num_frames: + dec = self._temporal_tiled_decode(z, temb, return_dict=False)[0] else: dec = self.decoder(z, temb) From 64a08490acb3d94c6cc1f885f5689dd2a9df9707 Mon Sep 17 00:00:00 2001 From: Pham Hong Vinh Date: Wed, 8 Jan 2025 11:09:55 +0700 Subject: [PATCH 2/5] add framewise encode, refactor tiled encode/decode --- .../models/autoencoders/autoencoder_kl_ltx.py | 174 +++++++++--------- 1 file changed, 92 insertions(+), 82 deletions(-) diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py index d2562915edca..84eeb9399733 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py @@ -1025,8 +1025,10 @@ def enable_tiling( self, tile_sample_min_height: Optional[int] = None, tile_sample_min_width: Optional[int] = None, + tile_sample_min_num_frames: Optional[int] = None, tile_sample_stride_height: Optional[float] = None, tile_sample_stride_width: Optional[float] = None, + tile_sample_stride_num_frames: Optional[float] = None, ) -> None: r""" Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to @@ -1048,8 +1050,10 @@ def enable_tiling( self.use_tiling = True self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width + self.tile_sample_min_num_frames = tile_sample_min_num_frames or self.tile_sample_min_num_frames self.tile_sample_stride_height = tile_sample_stride_height or self.tile_sample_stride_height self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width + self.tile_sample_stride_num_frames = tile_sample_stride_num_frames or self.tile_sample_stride_num_frames def disable_tiling(self) -> None: r""" @@ -1075,18 +1079,13 @@ def disable_slicing(self) -> None: def _encode(self, x: torch.Tensor) -> torch.Tensor: batch_size, num_channels, num_frames, height, width = x.shape + if self.use_framewise_decoding and num_frames > self.tile_sample_min_num_frames: + return self._temporal_tiled_encode(x) + if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height): return self.tiled_encode(x) - if self.use_framewise_encoding: - # TODO(aryan): requires investigation - raise NotImplementedError( - "Frame-wise encoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to " - "quality issues caused by splitting inference across frame dimension. If you believe this " - "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls." - ) - else: - enc = self.encoder(x) + enc = self.encoder(x) return enc @@ -1116,53 +1115,6 @@ def encode( if not return_dict: return (posterior,) return AutoencoderKLOutput(latent_dist=posterior) - - def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: - blend_extent = min(a.shape[-3], b.shape[-3], blend_extent) - for x in range(blend_extent): - b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * ( - x / blend_extent - ) - return b - - def _temporal_tiled_decode(self, z: torch.Tensor, temb: Optional[torch.Tensor], return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: - batch_size, num_channels, num_frames, height, width = z.shape - num_sample_frames = (num_frames - 1) * self.temporal_compression_ratio + 1 - - tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio - tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio - tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio - tile_latent_stride_num_frames = self.tile_sample_stride_num_frames // self.temporal_compression_ratio - blend_num_frames = self.tile_sample_min_num_frames - self.tile_sample_stride_num_frames - - row = [] - for i in range(0, num_frames, tile_latent_stride_num_frames): - tile = z[:, :, i : i + tile_latent_min_num_frames + 1, :, :] - if self.use_tiling and (tile.shape[-1] > tile_latent_min_width or tile.shape[-2] > tile_latent_min_height): - decoded = self.tiled_decode(tile, temb, return_dict=True).sample - else: - print("NOT Use tile decode") - print(f"input tile: {tile.size()}") - decoded = self.decoder(tile, temb) - print(f"output tile: {decoded.size()}") - if i > 0: - decoded = decoded[:, :, :-1, :, :] - row.append(decoded) - - result_row = [] - for i, tile in enumerate(row): - if i > 0: - tile = self.blend_t(row[i - 1], tile, blend_num_frames) - tile = tile[:, :, : self.tile_sample_stride_num_frames, :, :] - result_row.append(tile) - else: - result_row.append(tile[:, :, :self.tile_sample_stride_num_frames + 1, :, :]) - - dec = torch.cat(result_row, dim=2)[:, :, :num_sample_frames] - - if not return_dict: - return (dec,) - return DecoderOutput(sample=dec) def _decode( self, z: torch.Tensor, temb: Optional[torch.Tensor] = None, return_dict: bool = True @@ -1171,13 +1123,13 @@ def _decode( tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio tile_latent_min_width = self.tile_sample_stride_width // self.spatial_compression_ratio + if self.use_framewise_decoding and num_frames > tile_latent_min_num_frames: + return self._temporal_tiled_decode(z, temb, return_dict=return_dict) + if self.use_tiling and (width > tile_latent_min_width or height > tile_latent_min_height): return self.tiled_decode(z, temb, return_dict=return_dict) - if self.use_framewise_decoding and num_frames > tile_latent_min_num_frames: - dec = self._temporal_tiled_decode(z, temb, return_dict=False)[0] - else: - dec = self.decoder(z, temb) + dec = self.decoder(z, temb) if not return_dict: return (dec,) @@ -1232,6 +1184,14 @@ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch. x / blend_extent ) return b + + def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[-3], b.shape[-3], blend_extent) + for x in range(blend_extent): + b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * ( + x / blend_extent + ) + return b def tiled_encode(self, x: torch.Tensor) -> torch.Tensor: r"""Encode a batch of images using a tiled encoder. @@ -1261,17 +1221,9 @@ def tiled_encode(self, x: torch.Tensor) -> torch.Tensor: for i in range(0, height, self.tile_sample_stride_height): row = [] for j in range(0, width, self.tile_sample_stride_width): - if self.use_framewise_encoding: - # TODO(aryan): requires investigation - raise NotImplementedError( - "Frame-wise encoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to " - "quality issues caused by splitting inference across frame dimension. If you believe this " - "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls." - ) - else: - time = self.encoder( - x[:, :, :, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width] - ) + time = self.encoder( + x[:, :, :, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width] + ) row.append(time) rows.append(row) @@ -1327,17 +1279,9 @@ def tiled_decode( for i in range(0, height, tile_latent_stride_height): row = [] for j in range(0, width, tile_latent_stride_width): - if self.use_framewise_decoding: - # TODO(aryan): requires investigation - raise NotImplementedError( - "Frame-wise decoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to " - "quality issues caused by splitting inference across frame dimension. If you believe this " - "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls." - ) - else: - time = self.decoder( - z[:, :, :, i : i + tile_latent_min_height, j : j + tile_latent_min_width], temb - ) + time = self.decoder( + z[:, :, :, i : i + tile_latent_min_height, j : j + tile_latent_min_width], temb + ) row.append(time) rows.append(row) @@ -1362,6 +1306,72 @@ def tiled_decode( return DecoderOutput(sample=dec) + def _temporal_tiled_encode(self, x: torch.Tensor) -> AutoencoderKLOutput: + batch_size, num_channels, num_frames, height, width = x.shape + latent_num_frames = (num_frames - 1) // self.temporal_compression_ratio + 1 + + tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio + tile_latent_stride_num_frames = self.tile_sample_stride_num_frames // self.temporal_compression_ratio + blend_num_frames = tile_latent_min_num_frames - tile_latent_stride_num_frames + + row = [] + for i in range(0, num_frames, self.tile_sample_stride_num_frames): + tile = x[:, :, i : i + self.tile_sample_min_num_frames + 1, :, :] + if self.use_tiling and (height > self.tile_sample_min_height or width > self.tile_sample_min_width): + tile = self.tiled_encode(tile) + else: + tile = self.encoder(tile) + if i > 0: + tile = tile[:, :, 1:, :, :] + row.append(tile) + + result_row = [] + for i, tile in enumerate(row): + if i > 0: + tile = self.blend_t(row[i - 1], tile, blend_num_frames) + result_row.append(tile[:, :, :tile_latent_stride_num_frames, :, :]) + else: + result_row.append(tile[:, :, : tile_latent_stride_num_frames + 1, :, :]) + + enc = torch.cat(result_row, dim=2)[:, :, :latent_num_frames] + return enc + + def _temporal_tiled_decode(self, z: torch.Tensor, temb: Optional[torch.Tensor], return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: + batch_size, num_channels, num_frames, height, width = z.shape + num_sample_frames = (num_frames - 1) * self.temporal_compression_ratio + 1 + + tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio + tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio + tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio + tile_latent_stride_num_frames = self.tile_sample_stride_num_frames // self.temporal_compression_ratio + blend_num_frames = self.tile_sample_min_num_frames - self.tile_sample_stride_num_frames + + row = [] + for i in range(0, num_frames, tile_latent_stride_num_frames): + tile = z[:, :, i : i + tile_latent_min_num_frames + 1, :, :] + if self.use_tiling and (tile.shape[-1] > tile_latent_min_width or tile.shape[-2] > tile_latent_min_height): + decoded = self.tiled_decode(tile, temb, return_dict=True).sample + else: + decoded = self.decoder(tile, temb) + if i > 0: + decoded = decoded[:, :, :-1, :, :] + row.append(decoded) + + result_row = [] + for i, tile in enumerate(row): + if i > 0: + tile = self.blend_t(row[i - 1], tile, blend_num_frames) + tile = tile[:, :, : self.tile_sample_stride_num_frames, :, :] + result_row.append(tile) + else: + result_row.append(tile[:, :, :self.tile_sample_stride_num_frames + 1, :, :]) + + dec = torch.cat(result_row, dim=2)[:, :, :num_sample_frames] + + if not return_dict: + return (dec,) + return DecoderOutput(sample=dec) + def forward( self, sample: torch.Tensor, From e79162c1e57c252c17ceb241efd99abce8b8557a Mon Sep 17 00:00:00 2001 From: Pham Hong Vinh Date: Wed, 8 Jan 2025 14:50:11 +0700 Subject: [PATCH 3/5] add sanity test tiling for ltx --- .../models/autoencoders/autoencoder_kl_ltx.py | 7 +++-- .../test_models_autoencoder_ltx_video.py | 31 +++++++++++++++++++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py index 84eeb9399733..7004f7199fad 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py @@ -998,8 +998,8 @@ def __init__( # When decoding temporally long video latents, the memory requirement is very high. By decoding latent frames # at a fixed frame batch size (based on `self.num_latent_frames_batch_sizes`), the memory requirement can be lowered. - self.use_framewise_encoding = False - self.use_framewise_decoding = False + self.use_framewise_encoding = True + self.use_framewise_decoding = True # This can be configured based on the amount of GPU memory available. # `16` for sample frames and `2` for latent frames are sensible defaults for consumer GPUs. @@ -1122,6 +1122,7 @@ def _decode( batch_size, num_channels, num_frames, height, width = z.shape tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio tile_latent_min_width = self.tile_sample_stride_width // self.spatial_compression_ratio + tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio if self.use_framewise_decoding and num_frames > tile_latent_min_num_frames: return self._temporal_tiled_decode(z, temb, return_dict=return_dict) @@ -1388,5 +1389,5 @@ def forward( z = posterior.mode() dec = self.decode(z, temb) if not return_dict: - return (dec,) + return (dec.sample,) return dec diff --git a/tests/models/autoencoders/test_models_autoencoder_ltx_video.py b/tests/models/autoencoders/test_models_autoencoder_ltx_video.py index 37f9837c8245..66d170b28eee 100644 --- a/tests/models/autoencoders/test_models_autoencoder_ltx_video.py +++ b/tests/models/autoencoders/test_models_autoencoder_ltx_video.py @@ -167,3 +167,34 @@ def test_outputs_equivalence(self): @unittest.skip("AutoencoderKLLTXVideo does not support `norm_num_groups` because it does not use GroupNorm.") def test_forward_with_norm_groups(self): pass + + def test_enable_disable_tiling(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + torch.manual_seed(0) + model = self.model_class(**init_dict).to(torch_device) + + inputs_dict.update({"return_dict": False}) + + torch.manual_seed(0) + output_without_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + torch.manual_seed(0) + model.enable_tiling() + output_with_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertLess( + (output_without_tiling.detach().cpu().numpy() - output_with_tiling.detach().cpu().numpy()).max(), + 0.5, + "VAE tiling should not affect the inference results", + ) + + torch.manual_seed(0) + model.disable_tiling() + output_without_tiling_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertEqual( + output_without_tiling.detach().cpu().numpy().all(), + output_without_tiling_2.detach().cpu().numpy().all(), + "Without tiling outputs should match with the outputs when tiling is manually disabled.", + ) From c5e6d62f1171f1f2bdebb1df8c025d7b0d86a060 Mon Sep 17 00:00:00 2001 From: Pham Hong Vinh Date: Fri, 10 Jan 2025 00:07:44 +0700 Subject: [PATCH 4/5] run make style --- .../models/autoencoders/autoencoder_kl_ltx.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py index 7004f7199fad..088e2832c4a5 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py @@ -1015,7 +1015,7 @@ def __init__( # The minimal distance between two spatial tiles self.tile_sample_stride_height = 448 self.tile_sample_stride_width = 448 - self.tile_sample_stride_num_frames = 8 + self.tile_sample_stride_num_frames = 8 def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (LTXVideoEncoder3d, LTXVideoDecoder3d)): @@ -1185,7 +1185,7 @@ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch. x / blend_extent ) return b - + def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: blend_extent = min(a.shape[-3], b.shape[-3], blend_extent) for x in range(blend_extent): @@ -1280,9 +1280,7 @@ def tiled_decode( for i in range(0, height, tile_latent_stride_height): row = [] for j in range(0, width, tile_latent_stride_width): - time = self.decoder( - z[:, :, :, i : i + tile_latent_min_height, j : j + tile_latent_min_width], temb - ) + time = self.decoder(z[:, :, :, i : i + tile_latent_min_height, j : j + tile_latent_min_width], temb) row.append(time) rows.append(row) @@ -1337,7 +1335,9 @@ def _temporal_tiled_encode(self, x: torch.Tensor) -> AutoencoderKLOutput: enc = torch.cat(result_row, dim=2)[:, :, :latent_num_frames] return enc - def _temporal_tiled_decode(self, z: torch.Tensor, temb: Optional[torch.Tensor], return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: + def _temporal_tiled_decode( + self, z: torch.Tensor, temb: Optional[torch.Tensor], return_dict: bool = True + ) -> Union[DecoderOutput, torch.Tensor]: batch_size, num_channels, num_frames, height, width = z.shape num_sample_frames = (num_frames - 1) * self.temporal_compression_ratio + 1 @@ -1365,7 +1365,7 @@ def _temporal_tiled_decode(self, z: torch.Tensor, temb: Optional[torch.Tensor], tile = tile[:, :, : self.tile_sample_stride_num_frames, :, :] result_row.append(tile) else: - result_row.append(tile[:, :, :self.tile_sample_stride_num_frames + 1, :, :]) + result_row.append(tile[:, :, : self.tile_sample_stride_num_frames + 1, :, :]) dec = torch.cat(result_row, dim=2)[:, :, :num_sample_frames] From 88bfc368fee8ed537950620af9e5da01549f5c93 Mon Sep 17 00:00:00 2001 From: "Vinh H. Pham" Date: Sat, 11 Jan 2025 15:19:27 +0700 Subject: [PATCH 5/5] Update src/diffusers/models/autoencoders/autoencoder_kl_ltx.py Co-authored-by: Aryan --- src/diffusers/models/autoencoders/autoencoder_kl_ltx.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py index 088e2832c4a5..25753afd5ce6 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py @@ -998,8 +998,8 @@ def __init__( # When decoding temporally long video latents, the memory requirement is very high. By decoding latent frames # at a fixed frame batch size (based on `self.num_latent_frames_batch_sizes`), the memory requirement can be lowered. - self.use_framewise_encoding = True - self.use_framewise_decoding = True + self.use_framewise_encoding = False + self.use_framewise_decoding = False # This can be configured based on the amount of GPU memory available. # `16` for sample frames and `2` for latent frames are sensible defaults for consumer GPUs.