diff --git a/tests/lora/test_lora_layers_sd.py b/tests/lora/test_lora_layers_sd.py
index e91b0689b4ce..3eefa97663e6 100644
--- a/tests/lora/test_lora_layers_sd.py
+++ b/tests/lora/test_lora_layers_sd.py
@@ -33,11 +33,12 @@
 )
 from diffusers.utils.import_utils import is_accelerate_available
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     load_image,
     nightly,
     numpy_cosine_similarity_distance,
     require_peft_backend,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -101,7 +102,7 @@ def tearDown(self):
     # Keeping this test here makes sense because it doesn't look any integration
     # (value assertions on logits).
     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
    def test_integration_move_lora_cpu(self):
        path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
        lora_id = "takuma104/lora-test-text-encoder-lora-target"
@@ -158,7 +159,7 @@ def test_integration_move_lora_cpu(self):
         self.assertTrue(m.weight.device != torch.device("cpu"))

     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_integration_move_lora_dora_cpu(self):
         from peft import LoraConfig

@@ -209,18 +210,18 @@
 @slow
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 @require_peft_backend
 class LoraIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_integration_logits_with_scale(self):
         path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
         lora_id = "takuma104/lora-test-text-encoder-lora-target"
@@ -378,7 +379,7 @@ def test_a1111_with_model_cpu_offload(self):
         generator = torch.Generator().manual_seed(0)

         pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/Counterfeit-V2.5", safety_checker=None)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         lora_model_id = "hf-internal-testing/civitai-light-shadow-lora"
         lora_filename = "light_and_shadow.safetensors"
         pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
@@ -400,7 +401,7 @@ def test_a1111_with_sequential_cpu_offload(self):
         generator = torch.Generator().manual_seed(0)

         pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/Counterfeit-V2.5", safety_checker=None)
-        pipe.enable_sequential_cpu_offload()
+        pipe.enable_sequential_cpu_offload(device=torch_device)
         lora_model_id = "hf-internal-testing/civitai-light-shadow-lora"
         lora_filename = "light_and_shadow.safetensors"
         pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
@@ -656,7 +657,7 @@ def test_sd_load_civitai_empty_network_alpha(self):
         See: https://github.com/huggingface/diffusers/issues/5606
         """
         pipeline = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
-        pipeline.enable_sequential_cpu_offload()
+        pipeline.enable_sequential_cpu_offload(device=torch_device)
         civitai_path = hf_hub_download("ybelkada/test-ahi-civitai", "ahi_lora_weights.safetensors")

         pipeline.load_lora_weights(civitai_path, adapter_name="ahri")
diff --git a/tests/lora/test_lora_layers_sd3.py b/tests/lora/test_lora_layers_sd3.py
index a04285465951..90aaa3bcfe78 100644
--- a/tests/lora/test_lora_layers_sd3.py
+++ b/tests/lora/test_lora_layers_sd3.py
@@ -30,12 +30,13 @@
 from diffusers.utils import load_image
 from diffusers.utils.import_utils import is_accelerate_available
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     is_flaky,
     nightly,
     numpy_cosine_similarity_distance,
     require_big_gpu_with_torch_cuda,
     require_peft_backend,
-    require_torch_gpu,
+    require_torch_accelerator,
     torch_device,
 )
@@ -93,7 +94,7 @@ class SD3LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
     def output_shape(self):
         return (1, 32, 32, 3)

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_sd3_lora(self):
         """
         Test loading the loras that are saved with the diffusers and peft formats.
@@ -135,7 +136,7 @@ def test_multiple_wrong_adapter_name_raises_error(self):


 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 @require_peft_backend
 @require_big_gpu_with_torch_cuda
 @pytest.mark.big_gpu_with_torch_cuda
@@ -146,12 +147,12 @@ class SD3LoraIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, seed=0):
         init_image = load_image(
diff --git a/tests/models/unets/test_models_unet_2d_condition.py b/tests/models/unets/test_models_unet_2d_condition.py
index 57f6e4ee440b..8e1187f11468 100644
--- a/tests/models/unets/test_models_unet_2d_condition.py
+++ b/tests/models/unets/test_models_unet_2d_condition.py
@@ -36,6 +36,9 @@
 from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import (
     backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
     is_peft_available,
@@ -1002,7 +1005,7 @@ def test_load_sharded_checkpoint_from_hub_subfolder(self, repo_id, variant):
         assert loaded_model
         assert new_output.sample.shape == (4, 4, 16, 16)

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_load_sharded_checkpoint_from_hub_local(self):
         _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         ckpt_path = snapshot_download("hf-internal-testing/unet2d-sharded-dummy")
@@ -1013,7 +1016,7 @@ def test_load_sharded_checkpoint_from_hub_local(self):
         assert loaded_model
         assert new_output.sample.shape == (4, 4, 16, 16)

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_load_sharded_checkpoint_from_hub_local_subfolder(self):
         _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         ckpt_path = snapshot_download("hf-internal-testing/unet2d-sharded-dummy-subfolder")
@@ -1024,7 +1027,7 @@ def test_load_sharded_checkpoint_from_hub_local_subfolder(self):
         assert loaded_model
         assert new_output.sample.shape == (4, 4, 16, 16)

-    @require_torch_gpu
+    @require_torch_accelerator
     @parameterized.expand(
         [
             ("hf-internal-testing/unet2d-sharded-dummy", None),
@@ -1039,7 +1042,7 @@ def test_load_sharded_checkpoint_device_map_from_hub(self, repo_id, variant):
         assert loaded_model
         assert new_output.sample.shape == (4, 4, 16, 16)

-    @require_torch_gpu
+    @require_torch_accelerator
     @parameterized.expand(
         [
             ("hf-internal-testing/unet2d-sharded-dummy-subfolder", None),
@@ -1054,7 +1057,7 @@ def test_load_sharded_checkpoint_device_map_from_hub_subfolder(self, repo_id, variant):
         assert loaded_model
         assert new_output.sample.shape == (4, 4, 16, 16)

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_load_sharded_checkpoint_device_map_from_hub_local(self):
         _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         ckpt_path = snapshot_download("hf-internal-testing/unet2d-sharded-dummy")
@@ -1064,7 +1067,7 @@ def test_load_sharded_checkpoint_device_map_from_hub_local(self):
         assert loaded_model
         assert new_output.sample.shape == (4, 4, 16, 16)

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_load_sharded_checkpoint_device_map_from_hub_local_subfolder(self):
         _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         ckpt_path = snapshot_download("hf-internal-testing/unet2d-sharded-dummy-subfolder")
@@ -1164,11 +1167,11 @@ def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"):

         return model

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_set_attention_slice_auto(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         unet = self.get_unet_model()
         unet.set_attention_slice("auto")
@@ -1180,15 +1183,15 @@ def test_set_attention_slice_auto(self):

         with torch.no_grad():
             _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample

-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         assert mem_bytes < 5 * 10**9

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_set_attention_slice_max(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         unet = self.get_unet_model()
         unet.set_attention_slice("max")
@@ -1200,15 +1203,15 @@ def test_set_attention_slice_max(self):

         with torch.no_grad():
             _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample

-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         assert mem_bytes < 5 * 10**9

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_set_attention_slice_int(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         unet = self.get_unet_model()
         unet.set_attention_slice(2)
@@ -1220,15 +1223,15 @@ def test_set_attention_slice_int(self):

         with torch.no_grad():
             _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample

-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         assert mem_bytes < 5 * 10**9

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_set_attention_slice_list(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         # there are 32 sliceable layers
         slice_list = 16 * [2, 3]
@@ -1242,7 +1245,7 @@ def test_set_attention_slice_list(self):

         with torch.no_grad():
             _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample

-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         assert mem_bytes < 5 * 10**9

diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py
index 157eefd3154b..bb21c9ac8dcb 100644
--- a/tests/pipelines/controlnet/test_controlnet.py
+++ b/tests/pipelines/controlnet/test_controlnet.py
@@ -79,7 +79,7 @@ def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
     pipe = StableDiffusionControlNetPipeline.from_pretrained(
         "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
     )
-    pipe.to("cuda")
+    pipe.to(torch_device)
     pipe.set_progress_bar_config(disable=None)

     pipe.unet.to(memory_format=torch.channels_last)
diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py b/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py
index 6e752804e2e0..ca05db504485 100644
--- a/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py
+++ b/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py
@@ -40,7 +40,7 @@
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
     floats_tensor,
-    require_torch_gpu,
+    require_torch_accelerator,
     torch_device,
 )

@@ -245,7 +245,7 @@ def test_xformers_attention_forwardGenerator_pass(self):
     def test_inference_batch_single_identical(self):
         self._test_inference_batch_single_identical(expected_max_diff=2e-3)

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_stable_diffusion_xl_offloads(self):
         pipes = []
         components = self.get_dummy_components()
@@ -254,12 +254,12 @@ def test_stable_diffusion_xl_offloads(self):

         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         image_slices = []
diff --git a/tests/pipelines/controlnet/test_controlnet_sdxl.py b/tests/pipelines/controlnet/test_controlnet_sdxl.py
index 1e540738b60e..503db2f574e2 100644
--- a/tests/pipelines/controlnet/test_controlnet_sdxl.py
+++ b/tests/pipelines/controlnet/test_controlnet_sdxl.py
@@ -223,12 +223,12 @@ def test_stable_diffusion_xl_offloads(self):

         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         image_slices = []
diff --git a/tests/pipelines/controlnet_flux/test_controlnet_flux.py b/tests/pipelines/controlnet_flux/test_controlnet_flux.py
index a7e2c10489f6..9a270c2bbf07 100644
--- a/tests/pipelines/controlnet_flux/test_controlnet_flux.py
+++ b/tests/pipelines/controlnet_flux/test_controlnet_flux.py
@@ -31,6 +31,7 @@
 from diffusers.models import FluxControlNetModel
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     nightly,
     numpy_cosine_similarity_distance,
@@ -217,12 +218,12 @@ class FluxControlNetPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_canny(self):
         controlnet = FluxControlNetModel.from_pretrained(
diff --git a/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py b/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py
index 04daca27c3dd..ca940dd56788 100644
--- a/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py
+++ b/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py
@@ -239,7 +239,7 @@ def test_canny(self):
         pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
             "stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)

         generator = torch.Generator(device="cpu").manual_seed(0)
diff --git a/tests/pipelines/flux/test_pipeline_flux.py b/tests/pipelines/flux/test_pipeline_flux.py
index 2df39e73476d..d5f7d7577fc7 100644
--- a/tests/pipelines/flux/test_pipeline_flux.py
+++ b/tests/pipelines/flux/test_pipeline_flux.py
@@ -9,6 +9,7 @@
 from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxPipeline, FluxTransformer2DModel

 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     nightly,
     numpy_cosine_similarity_distance,
     require_big_gpu_with_torch_cuda,
@@ -212,12 +213,12 @@ class FluxPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, seed=0):
         generator = torch.Generator(device="cpu").manual_seed(seed)
diff --git a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py
index a8180a3bc27f..401fab6c2c96 100644
--- a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py
+++ b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py
@@ -34,11 +34,12 @@
 from diffusers.image_processor import IPAdapterMaskProcessor
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     is_flaky,
     load_pt,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -54,13 +55,13 @@ def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_image_encoder(self, repo_id, subfolder):
         image_encoder = CLIPVisionModelWithProjection.from_pretrained(
@@ -165,7 +166,7 @@ def get_dummy_inputs(


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class IPAdapterSDIntegrationTests(IPAdapterNightlyTestsMixin):
     def test_text_to_image(self):
         image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder")
@@ -280,7 +281,7 @@ def test_text_to_image_model_cpu_offload(self):
         inputs = self.get_dummy_inputs()
         output_without_offload = pipeline(**inputs).images

-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         inputs = self.get_dummy_inputs()
         output_with_offload = pipeline(**inputs).images
         max_diff = np.abs(output_with_offload - output_without_offload).max()
@@ -391,7 +392,7 @@ def test_text_to_image_face_id(self):


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class IPAdapterSDXLIntegrationTests(IPAdapterNightlyTestsMixin):
     def test_text_to_image_sdxl(self):
         image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="sdxl_models/image_encoder")
@@ -403,7 +404,7 @@ def test_text_to_image_sdxl(self):
             feature_extractor=feature_extractor,
             torch_dtype=self.dtype,
         )
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")

         inputs = self.get_dummy_inputs()
@@ -461,7 +462,7 @@ def test_image_to_image_sdxl(self):
             feature_extractor=feature_extractor,
             torch_dtype=self.dtype,
         )
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")

         inputs = self.get_dummy_inputs(for_image_to_image=True)
@@ -530,7 +531,7 @@ def test_inpainting_sdxl(self):
             feature_extractor=feature_extractor,
             torch_dtype=self.dtype,
         )
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")

         inputs = self.get_dummy_inputs(for_inpainting=True)
@@ -578,7 +579,7 @@ def test_ip_adapter_mask(self):
             image_encoder=image_encoder,
             torch_dtype=self.dtype,
         )
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.load_ip_adapter(
             "h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter-plus-face_sdxl_vit-h.safetensors"
         )
@@ -606,7 +607,7 @@ def test_ip_adapter_multiple_masks(self):
             image_encoder=image_encoder,
             torch_dtype=self.dtype,
         )
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.load_ip_adapter(
             "h94/IP-Adapter", subfolder="sdxl_models", weight_name=["ip-adapter-plus-face_sdxl_vit-h.safetensors"] * 2
         )
@@ -633,7 +634,7 @@ def test_instant_style_multiple_masks(self):
         pipeline = StableDiffusionXLPipeline.from_pretrained(
             "RunDiffusion/Juggernaut-XL-v9", torch_dtype=torch.float16, image_encoder=image_encoder, variant="fp16"
         )
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.load_ip_adapter(
             ["ostris/ip-composition-adapter", "h94/IP-Adapter"],
@@ -674,7 +675,7 @@ def test_ip_adapter_multiple_masks_one_adapter(self):
             image_encoder=image_encoder,
             torch_dtype=self.dtype,
         )
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.load_ip_adapter(
             "h94/IP-Adapter", subfolder="sdxl_models", weight_name=["ip-adapter-plus-face_sdxl_vit-h.safetensors"]
         )
diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py
index 1a13ec75d082..30144e37a9d4 100644
--- a/tests/pipelines/kandinsky/test_kandinsky.py
+++ b/tests/pipelines/kandinsky/test_kandinsky.py
@@ -24,10 +24,11 @@
 from diffusers import DDIMScheduler, KandinskyPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel
 from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_numpy,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -246,7 +247,7 @@ def test_kandinsky(self):
             np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
         ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_offloads(self):
         pipes = []
         components = self.get_dummy_components()
@@ -255,12 +256,12 @@ def test_offloads(self):

         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         image_slices = []
@@ -275,19 +276,19 @@ def test_offloads(self):


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class KandinskyPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_kandinsky_text2img(self):
         expected_image = load_numpy(
@@ -306,7 +307,7 @@ def test_kandinsky_text2img(self):

         prompt = "red cat, 4k photo"

-        generator = torch.Generator(device="cuda").manual_seed(0)
+        generator = torch.Generator(device=torch_device).manual_seed(0)
         image_emb, zero_image_emb = pipe_prior(
             prompt,
             generator=generator,
@@ -314,7 +315,7 @@ def test_kandinsky_text2img(self):
             negative_prompt="",
         ).to_tuple()

-        generator = torch.Generator(device="cuda").manual_seed(0)
+        generator = torch.Generator(device=torch_device).manual_seed(0)
         output = pipeline(
             prompt,
             image_embeds=image_emb,
diff --git a/tests/pipelines/kandinsky/test_kandinsky_combined.py b/tests/pipelines/kandinsky/test_kandinsky_combined.py
index 3c8767a708d4..c5f27a9cc9a9 100644
--- a/tests/pipelines/kandinsky/test_kandinsky_combined.py
+++ b/tests/pipelines/kandinsky/test_kandinsky_combined.py
@@ -18,7 +18,7 @@
 import numpy as np

 from diffusers import KandinskyCombinedPipeline, KandinskyImg2ImgCombinedPipeline, KandinskyInpaintCombinedPipeline
-from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_accelerator, torch_device

 from ..test_pipelines_common import PipelineTesterMixin
 from .test_kandinsky import Dummies
@@ -105,7 +105,7 @@ def test_kandinsky(self):
             np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
         ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_offloads(self):
         pipes = []
         components = self.get_dummy_components()
@@ -114,12 +114,12 @@ def test_offloads(self):

         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         image_slices = []
@@ -213,7 +213,7 @@ def test_kandinsky(self):
             np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
         ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_offloads(self):
         pipes = []
         components = self.get_dummy_components()
@@ -222,12 +222,12 @@ def test_offloads(self):

         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         image_slices = []
@@ -325,7 +325,7 @@ def test_kandinsky(self):
             np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
         ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_offloads(self):
         pipes = []
         components = self.get_dummy_components()
@@ -334,12 +334,12 @@ def test_offloads(self):

         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         image_slices = []
diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py
index 23f13ffee223..26361ce18b82 100644
--- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py
+++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py
@@ -32,12 +32,13 @@
 )
 from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
     load_numpy,
     nightly,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -267,7 +268,7 @@ def test_kandinsky_img2img(self):
             np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
         ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_offloads(self):
         pipes = []
         components = self.get_dummy_components()
@@ -299,19 +300,19 @@ def test_dict_tuple_outputs_equivalent(self):


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class KandinskyImg2ImgPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_kandinsky_img2img(self):
         expected_image = load_numpy(
@@ -365,19 +366,19 @@ def test_kandinsky_img2img(self):


 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class KandinskyImg2ImgPipelineNightlyTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_kandinsky_img2img_ddpm(self):
         expected_image = load_numpy(
diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py
index ebb1a4d88739..e30c601b6011 100644
--- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py
+++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py
@@ -25,12 +25,13 @@
 from diffusers import DDIMScheduler, KandinskyInpaintPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel
 from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
     load_numpy,
     nightly,
-    require_torch_gpu,
+    require_torch_accelerator,
     torch_device,
 )
@@ -265,7 +266,7 @@ def test_kandinsky_inpaint(self):
     def test_inference_batch_single_identical(self):
         super().test_inference_batch_single_identical(expected_max_diff=3e-3)

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_offloads(self):
         pipes = []
         components = self.get_dummy_components()
@@ -274,12 +275,12 @@ def test_offloads(self):

         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         image_slices = []
@@ -297,19 +298,19 @@ def test_float16_inference(self):


 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class KandinskyInpaintPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_kandinsky_inpaint(self):
         expected_image = load_numpy(
diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky.py b/tests/pipelines/kandinsky2_2/test_kandinsky.py
index cbd9166efada..fea49d47b7bb 100644
--- a/tests/pipelines/kandinsky2_2/test_kandinsky.py
+++ b/tests/pipelines/kandinsky2_2/test_kandinsky.py
@@ -22,12 +22,14 @@
 from diffusers import DDIMScheduler, KandinskyV22Pipeline, KandinskyV22PriorPipeline, UNet2DConditionModel, VQModel
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_numpy,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
+    torch_device,
 )

 from ..test_pipelines_common import PipelineTesterMixin
@@ -221,19 +223,19 @@ def test_float16_inference(self):


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class KandinskyV22PipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_kandinsky_text2img(self):
         expected_image = load_numpy(
@@ -244,12 +246,12 @@ def test_kandinsky_text2img(self):
         pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
             "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
         )
-        pipe_prior.enable_model_cpu_offload()
+        pipe_prior.enable_model_cpu_offload(device=torch_device)

         pipeline = KandinskyV22Pipeline.from_pretrained(
             "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
         )
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)

         prompt = "red cat, 4k photo"
diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py b/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py
index bbf2f08a7b08..90f8b2034109 100644
--- a/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py
+++ b/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py
@@ -22,7 +22,7 @@
     KandinskyV22Img2ImgCombinedPipeline,
     KandinskyV22InpaintCombinedPipeline,
 )
-from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_accelerator, torch_device

 from ..test_pipelines_common import PipelineTesterMixin
 from .test_kandinsky import Dummies
@@ -110,7 +110,7 @@ def test_kandinsky(self):
             np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
         ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_offloads(self):
         pipes = []
         components = self.get_dummy_components()
@@ -119,12 +119,12 @@ def test_offloads(self):

         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         image_slices = []
@@ -234,7 +234,7 @@ def test_kandinsky(self):
             np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
         ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_offloads(self):
         pipes = []
         components = self.get_dummy_components()
@@ -243,12 +243,12 @@ def test_offloads(self):

         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         image_slices = []
@@ -357,7 +357,7 @@ def test_kandinsky(self):
             np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
         ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_offloads(self):
         pipes = []
         components = self.get_dummy_components()
@@ -366,12 +366,12 @@ def test_offloads(self):

         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)

         image_slices = []
diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_img2img.py b/tests/pipelines/kandinsky2_2/test_kandinsky_img2img.py
index 26d8b45cf900..4702f473a992 100644
--- a/tests/pipelines/kandinsky2_2/test_kandinsky_img2img.py
+++ b/tests/pipelines/kandinsky2_2/test_kandinsky_img2img.py
@@ -29,13 +29,15 @@
     VQModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
     load_numpy,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
+    torch_device,
 )

 from ..test_pipelines_common import PipelineTesterMixin
@@ -238,19 +240,19 @@ def test_float16_inference(self):


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class KandinskyV22Img2ImgPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_kandinsky_img2img(self):
         expected_image = load_numpy(
@@ -266,12 +268,12 @@ def test_kandinsky_img2img(self):
         pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
             "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
         )
-        pipe_prior.enable_model_cpu_offload()
+        pipe_prior.enable_model_cpu_offload(device=torch_device)

         pipeline = KandinskyV22Img2ImgPipeline.from_pretrained(
             "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
         )
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)

         generator = torch.Generator(device="cpu").manual_seed(0)
diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py
index 25cf4bbed456..9a7f659e533c 100644
--- a/tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py
+++ b/tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py
@@ -29,13 +29,14 @@
     VQModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     is_flaky,
     load_image,
     load_numpy,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -292,19 +293,19 @@ def callback_inputs_test(pipe, i, t, callback_kwargs):


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class KandinskyV22InpaintPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_kandinsky_inpaint(self):
         expected_image = load_numpy(
diff --git a/tests/pipelines/kandinsky3/test_kandinsky3.py b/tests/pipelines/kandinsky3/test_kandinsky3.py
index 941ef9093361..af1d45ff8975 100644
--- a/tests/pipelines/kandinsky3/test_kandinsky3.py
+++ b/tests/pipelines/kandinsky3/test_kandinsky3.py
@@ -31,10 +31,12 @@
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
+    torch_device,
 )

 from ..pipeline_params import (
@@ -167,25 +169,25 @@ def test_inference_batch_single_identical(self):


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class Kandinsky3PipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_kandinskyV3(self):
         pipe = AutoPipelineForText2Image.from_pretrained(
             "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)

         prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background."
@@ -211,7 +213,7 @@ def test_kandinskyV3_img2img(self):
         pipe = AutoPipelineForImage2Image.from_pretrained(
             "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)

         generator = torch.Generator(device="cpu").manual_seed(0)
diff --git a/tests/pipelines/kandinsky3/test_kandinsky3_img2img.py b/tests/pipelines/kandinsky3/test_kandinsky3_img2img.py
index 8c817df32e0c..e00948621a06 100644
--- a/tests/pipelines/kandinsky3/test_kandinsky3_img2img.py
+++ b/tests/pipelines/kandinsky3/test_kandinsky3_img2img.py
@@ -31,10 +31,11 @@
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -192,25 +193,25 @@ def test_inference_batch_single_identical(self):


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class Kandinsky3Img2ImgPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_kandinskyV3_img2img(self):
         pipe = AutoPipelineForImage2Image.from_pretrained(
             "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)

         generator = torch.Generator(device="cpu").manual_seed(0)
diff --git a/tests/pipelines/latent_consistency_models/test_latent_consistency_models.py b/tests/pipelines/latent_consistency_models/test_latent_consistency_models.py
index 4db79ad16a03..570fa8fadf39 100644
--- a/tests/pipelines/latent_consistency_models/test_latent_consistency_models.py
+++ b/tests/pipelines/latent_consistency_models/test_latent_consistency_models.py
@@ -13,8 +13,9 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -222,11 +223,11 @@ def test_encode_prompt_works_in_isolation(self):


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class LatentConsistencyModelPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
diff --git a/tests/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py b/tests/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py
index 1187d555bb5e..88e31a97aac5 100644
--- a/tests/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py
+++ b/tests/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py
@@ -14,10 +14,11 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -229,11 +230,11 @@ def test_encode_prompt_works_in_isolation(self):


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class LatentConsistencyModelImg2ImgPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
diff --git a/tests/pipelines/latte/test_latte.py b/tests/pipelines/latte/test_latte.py
index d6001cfed0f5..537d352162a4 100644
--- a/tests/pipelines/latte/test_latte.py
+++ b/tests/pipelines/latte/test_latte.py
@@ -30,9 +30,10 @@
 )
 from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -218,25 +219,25 @@ def test_encode_prompt_works_in_isolation(self):


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class LattePipelineIntegrationTests(unittest.TestCase):
     prompt = "A painting of a squirrel eating a burger."

     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_latte(self):
         generator = torch.Generator("cpu").manual_seed(0)

         pipe = LattePipeline.from_pretrained("maxin-cn/Latte-1", torch_dtype=torch.float16)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)

         prompt = self.prompt
         videos = pipe(
diff --git a/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion.py b/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion.py
index 4aa48a920fad..342561d4f5e9 100644
--- a/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion.py
+++ b/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion.py
@@ -29,10 +29,11 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     skip_mps,
     slow,
     torch_device,
@@ -202,17 +203,17 @@ def test_ledits_pp_warmup_steps(self):


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class LEditsPPPipelineStableDiffusionSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     @classmethod
     def setUpClass(cls):
diff --git a/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion_xl.py b/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion_xl.py
index da694175a9f1..75795a33422b 100644
--- a/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion_xl.py
+++ b/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion_xl.py
@@ -41,7 +41,7 @@
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     skip_mps,
     slow,
     torch_device,
@@ -253,7 +253,7 @@ def test_ledits_pp_warmup_steps(self):


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class LEditsPPPipelineStableDiffusionXLSlowTests(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
diff --git a/tests/pipelines/lumina/test_lumina_nextdit.py b/tests/pipelines/lumina/test_lumina_nextdit.py
index e3a364f38e0a..034a0185d338 100644
--- a/tests/pipelines/lumina/test_lumina_nextdit.py
+++ b/tests/pipelines/lumina/test_lumina_nextdit.py
@@ -7,8 +7,9 @@
 from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, LuminaNextDiT2DModel, LuminaText2ImgPipeline

 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -100,7 +101,7 @@ def test_xformers_attention_forwardGenerator_pass(self):


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class LuminaText2ImgPipelineSlowTests(unittest.TestCase):
     pipeline_class = LuminaText2ImgPipeline
     repo_id = "Alpha-VLLM/Lumina-Next-SFT-diffusers"
@@ -108,12 +109,12 @@ class LuminaText2ImgPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, seed=0):
         if str(device).startswith("mps"):
@@ -131,7 +132,7 @@ def get_inputs(self, device, seed=0):

     def test_lumina_inference(self):
         pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.bfloat16)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)

         inputs = self.get_inputs(torch_device)
diff --git a/tests/pipelines/marigold/test_marigold_depth.py b/tests/pipelines/marigold/test_marigold_depth.py
index a5700bae7bb5..13f9a421861b 100644
--- a/tests/pipelines/marigold/test_marigold_depth.py
+++ b/tests/pipelines/marigold/test_marigold_depth.py
@@ -32,12 +32,14 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     is_flaky,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
+    torch_device,
 )

 from ..test_pipelines_common import PipelineTesterMixin
@@ -288,17 +290,17 @@ def test_marigold_depth_dummy_no_processing_resolution(self):


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def _test_marigold_depth(
         self,
@@ -317,8 +319,7 @@ def _test_marigold_depth(
         from_pretrained_kwargs["torch_dtype"] = torch.float16

         pipe = MarigoldDepthPipeline.from_pretrained(model_id, **from_pretrained_kwargs)
-        if device == "cuda":
-            pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)

         generator = torch.Generator(device=device).manual_seed(generator_seed)
@@ -358,7 +359,7 @@ def test_marigold_depth_einstein_f32_cpu_G0_S1_P32_E1_B1_M1(self):
     def test_marigold_depth_einstein_f32_cuda_G0_S1_P768_E1_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=False,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.1244, 0.1265, 0.1292, 0.1240, 0.1252, 0.1266, 0.1246, 0.1226, 0.1180]),
             num_inference_steps=1,
@@ -371,7 +372,7 @@ def test_marigold_depth_einstein_f32_cuda_G0_S1_P768_E1_B1_M1(self):
     def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E1_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.1241, 0.1262, 0.1290, 0.1238, 0.1250, 0.1265, 0.1244, 0.1225, 0.1179]),
             num_inference_steps=1,
@@ -384,7 +385,7 @@ def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E1_B1_M1(self):
     def test_marigold_depth_einstein_f16_cuda_G2024_S1_P768_E1_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=2024,
             expected_slice=np.array([0.1710, 0.1725, 0.1738, 0.1700, 0.1700, 0.1696, 0.1698, 0.1663, 0.1592]),
             num_inference_steps=1,
@@ -397,7 +398,7 @@ def test_marigold_depth_einstein_f16_cuda_G2024_S1_P768_E1_B1_M1(self):
     def test_marigold_depth_einstein_f16_cuda_G0_S2_P768_E1_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.1085, 0.1098, 0.1110, 0.1081, 0.1085, 0.1082, 0.1085, 0.1057, 0.0996]),
             num_inference_steps=2,
@@ -410,7 +411,7 @@ def test_marigold_depth_einstein_f16_cuda_G0_S2_P768_E1_B1_M1(self):
     def test_marigold_depth_einstein_f16_cuda_G0_S1_P512_E1_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.2683, 0.2693, 0.2698, 0.2666, 0.2632, 0.2615, 0.2656, 0.2603, 0.2573]),
             num_inference_steps=1,
@@ -423,7 +424,7 @@ def test_marigold_depth_einstein_f16_cuda_G0_S1_P512_E1_B1_M1(self):
     def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E3_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.1200, 0.1215, 0.1237, 0.1193, 0.1197, 0.1202, 0.1196, 0.1166, 0.1109]),
             num_inference_steps=1,
@@ -437,7 +438,7 @@ def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E3_B1_M1(self):
     def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E4_B2_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.1121, 0.1135, 0.1155, 0.1111, 0.1115, 0.1118, 0.1111, 0.1079, 0.1019]),
             num_inference_steps=1,
@@ -451,7 +452,7 @@ def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E4_B2_M1(self):
     def test_marigold_depth_einstein_f16_cuda_G0_S1_P512_E1_B1_M0(self):
         self._test_marigold_depth(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.2671, 0.2690, 0.2720, 0.2659, 0.2676, 0.2739, 0.2664, 0.2686, 0.2573]),
             num_inference_steps=1,
diff --git a/tests/pipelines/marigold/test_marigold_normals.py b/tests/pipelines/marigold/test_marigold_normals.py
index bc2662196c38..1797f99b213b 100644
--- a/tests/pipelines/marigold/test_marigold_normals.py
+++ b/tests/pipelines/marigold/test_marigold_normals.py
@@ -32,11 +32,13 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
+    torch_device,
 )

 from ..test_pipelines_common import PipelineTesterMixin
@@ -285,17 +287,17 @@ def test_marigold_depth_dummy_no_processing_resolution(self):


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class MarigoldNormalsPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def _test_marigold_normals(
         self,
@@ -314,8 +316,7 @@ def _test_marigold_normals(
         from_pretrained_kwargs["torch_dtype"] = torch.float16

         pipe = MarigoldNormalsPipeline.from_pretrained(model_id, **from_pretrained_kwargs)
-        if device == "cuda":
-            pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)

         generator = torch.Generator(device=device).manual_seed(generator_seed)
@@ -342,7 +343,7 @@ def _test_marigold_normals(
     def test_marigold_normals_einstein_f32_cpu_G0_S1_P32_E1_B1_M1(self):
         self._test_marigold_normals(
             is_fp16=False,
-            device="cpu",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.8971, 0.8971, 0.8971, 0.8971, 0.8971, 0.8971, 0.8971, 0.8971, 0.8971]),
             num_inference_steps=1,
@@ -355,7 +356,7 @@ def test_marigold_normals_einstein_f32_cpu_G0_S1_P32_E1_B1_M1(self):
     def test_marigold_normals_einstein_f32_cuda_G0_S1_P768_E1_B1_M1(self):
         self._test_marigold_normals(
             is_fp16=False,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.7980, 0.7952, 0.7914, 0.7931, 0.7871, 0.7816, 0.7844, 0.7710, 0.7601]),
             num_inference_steps=1,
@@ -368,7 +369,7 @@ def test_marigold_normals_einstein_f32_cuda_G0_S1_P768_E1_B1_M1(self):
     def test_marigold_normals_einstein_f16_cuda_G0_S1_P768_E1_B1_M1(self):
         self._test_marigold_normals(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.7979, 0.7949, 0.7915, 0.7930, 0.7871, 0.7817, 0.7842, 0.7710, 0.7603]),
             num_inference_steps=1,
@@ -381,7 +382,7 @@ def test_marigold_normals_einstein_f16_cuda_G0_S1_P768_E1_B1_M1(self):
     def test_marigold_normals_einstein_f16_cuda_G2024_S1_P768_E1_B1_M1(self):
         self._test_marigold_normals(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=2024,
             expected_slice=np.array([0.8428, 0.8428, 0.8433, 0.8369, 0.8325, 0.8315, 0.8271, 0.8135, 0.8057]),
             num_inference_steps=1,
@@ -394,7 +395,7 @@ def test_marigold_normals_einstein_f16_cuda_G2024_S1_P768_E1_B1_M1(self):
     def test_marigold_normals_einstein_f16_cuda_G0_S2_P768_E1_B1_M1(self):
         self._test_marigold_normals(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.7095, 0.7095, 0.7104, 0.7070, 0.7051, 0.7061, 0.7017, 0.6938, 0.6914]),
             num_inference_steps=2,
@@ -407,7 +408,7 @@ def test_marigold_normals_einstein_f16_cuda_G0_S2_P768_E1_B1_M1(self):
     def test_marigold_normals_einstein_f16_cuda_G0_S1_P512_E1_B1_M1(self):
         self._test_marigold_normals(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.7168, 0.7163, 0.7163, 0.7080, 0.7061, 0.7046, 0.7031, 0.7007, 0.6987]),
             num_inference_steps=1,
@@ -420,7 +421,7 @@ def test_marigold_normals_einstein_f16_cuda_G0_S1_P512_E1_B1_M1(self):
     def test_marigold_normals_einstein_f16_cuda_G0_S1_P768_E3_B1_M1(self):
         self._test_marigold_normals(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.7114, 0.7124, 0.7144, 0.7085, 0.7070, 0.7080, 0.7051, 0.6958, 0.6924]),
             num_inference_steps=1,
@@ -434,7 +435,7 @@ def test_marigold_normals_einstein_f16_cuda_G0_S1_P768_E3_B1_M1(self):
     def test_marigold_normals_einstein_f16_cuda_G0_S1_P768_E4_B2_M1(self):
         self._test_marigold_normals(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.7412, 0.7441, 0.7490, 0.7383, 0.7388, 0.7437, 0.7329, 0.7271, 0.7300]),
             num_inference_steps=1,
@@ -448,7 +449,7 @@ def test_marigold_normals_einstein_f16_cuda_G0_S1_P768_E4_B2_M1(self):
     def test_marigold_normals_einstein_f16_cuda_G0_S1_P512_E1_B1_M0(self):
         self._test_marigold_normals(
             is_fp16=True,
-            device="cuda",
+            device=torch_device,
             generator_seed=0,
             expected_slice=np.array([0.7188, 0.7144, 0.7134, 0.7178, 0.7207, 0.7222, 0.7231, 0.7041, 0.6987]),
             num_inference_steps=1,
diff --git a/tests/pipelines/mochi/test_mochi.py b/tests/pipelines/mochi/test_mochi.py
index ed41e82aca9f..32d09155cdeb 100644
--- a/tests/pipelines/mochi/test_mochi.py
+++ b/tests/pipelines/mochi/test_mochi.py
@@ -23,6 +23,7 @@
 from diffusers import AutoencoderKLMochi, FlowMatchEulerDiscreteScheduler, MochiPipeline, MochiTransformer3DModel

 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     nightly,
     numpy_cosine_similarity_distance,
@@ -274,18 +275,18 @@ class MochiPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_mochi(self):
         generator = torch.Generator("cpu").manual_seed(0)

         pipe = MochiPipeline.from_pretrained("genmo/mochi-1-preview", torch_dtype=torch.float16)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)

         prompt = self.prompt
         videos = pipe(
diff --git a/tests/pipelines/pag/test_pag_sd.py b/tests/pipelines/pag/test_pag_sd.py
index 8c3818c1c125..d4cf00b034ff 100644
--- a/tests/pipelines/pag/test_pag_sd.py
+++ b/tests/pipelines/pag/test_pag_sd.py
@@ -30,8 +30,9 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -285,7 +286,7 @@ def test_encode_prompt_works_in_isolation(self):


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionPAGPipelineIntegrationTests(unittest.TestCase):
     pipeline_class = StableDiffusionPAGPipeline
     repo_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
@@ -293,12 +294,12 @@ class StableDiffusionPAGPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", seed=1, guidance_scale=7.0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -315,7 +316,7 @@ def get_inputs(self, device, generator_device="cpu", seed=1, guidance_scale=7.0):

     def test_pag_cfg(self):
         pipeline = AutoPipelineForText2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)

         inputs = self.get_inputs(torch_device)
@@ -333,7 +334,7 @@ def test_pag_cfg(self):

     def test_pag_uncond(self):
         pipeline = AutoPipelineForText2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)

         inputs = self.get_inputs(torch_device, guidance_scale=0.0)
diff --git a/tests/pipelines/pag/test_pag_sd3_img2img.py b/tests/pipelines/pag/test_pag_sd3_img2img.py
index bffcd254e2c5..592e94953ecc 100644
--- a/tests/pipelines/pag/test_pag_sd3_img2img.py
+++ b/tests/pipelines/pag/test_pag_sd3_img2img.py
@@ -16,10 +16,11 @@
     StableDiffusion3PAGImg2ImgPipeline,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
diff --git a/tests/pipelines/pag/test_pag_sd3_img2img.py b/tests/pipelines/pag/test_pag_sd3_img2img.py
index bffcd254e2c5..592e94953ecc 100644
--- a/tests/pipelines/pag/test_pag_sd3_img2img.py
+++ b/tests/pipelines/pag/test_pag_sd3_img2img.py
@@ -16,10 +16,11 @@
     StableDiffusion3PAGImg2ImgPipeline,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -193,7 +194,7 @@ def test_pag_inference(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusion3PAGImg2ImgPipelineIntegrationTests(unittest.TestCase):
     pipeline_class = StableDiffusion3PAGImg2ImgPipeline
     repo_id = "stabilityai/stable-diffusion-3-medium-diffusers"
@@ -201,12 +202,12 @@ class StableDiffusion3PAGImg2ImgPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(
         self, device, generator_device="cpu", dtype=torch.float32, seed=0, guidance_scale=7.0, pag_scale=0.7
@@ -233,7 +234,7 @@ def test_pag_cfg(self):
         pipeline = AutoPipelineForImage2Image.from_pretrained(
             self.repo_id, enable_pag=True, torch_dtype=torch.float16, pag_applied_layers=["blocks.17"]
         )
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)
 
         inputs = self.get_inputs(torch_device)
diff --git a/tests/pipelines/pag/test_pag_sd_img2img.py b/tests/pipelines/pag/test_pag_sd_img2img.py
index 8b13a76907af..d000493d6bd1 100644
--- a/tests/pipelines/pag/test_pag_sd_img2img.py
+++ b/tests/pipelines/pag/test_pag_sd_img2img.py
@@ -32,10 +32,11 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -219,7 +220,7 @@ def test_encode_prompt_works_in_isolation(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionPAGImg2ImgPipelineIntegrationTests(unittest.TestCase):
     pipeline_class = StableDiffusionPAGImg2ImgPipeline
     repo_id = "Jiali/stable-diffusion-1.5"
@@ -227,12 +228,12 @@ class StableDiffusionPAGImg2ImgPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -254,7 +255,7 @@ def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0
     def test_pag_cfg(self):
         pipeline = AutoPipelineForImage2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)
 
         inputs = self.get_inputs(torch_device)
@@ -272,7 +273,7 @@ def test_pag_cfg(self):
 
     def test_pag_uncond(self):
         pipeline = AutoPipelineForImage2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)
 
         inputs = self.get_inputs(torch_device, guidance_scale=0.0)
diff --git a/tests/pipelines/pag/test_pag_sd_inpaint.py b/tests/pipelines/pag/test_pag_sd_inpaint.py
index 93b562792c14..06682c111d37 100644
--- a/tests/pipelines/pag/test_pag_sd_inpaint.py
+++ b/tests/pipelines/pag/test_pag_sd_inpaint.py
@@ -30,10 +30,11 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -251,7 +252,7 @@ def test_encode_prompt_works_in_isolation(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionPAGPipelineIntegrationTests(unittest.TestCase):
     pipeline_class = StableDiffusionPAGInpaintPipeline
     repo_id = "runwayml/stable-diffusion-v1-5"
@@ -259,12 +260,12 @@ class StableDiffusionPAGPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0):
         img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
@@ -289,7 +290,7 @@ def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0)
     def test_pag_cfg(self):
         pipeline = AutoPipelineForInpainting.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)
 
         inputs = self.get_inputs(torch_device)
@@ -307,7 +308,7 @@ def test_pag_cfg(self):
 
     def test_pag_uncond(self):
         pipeline = AutoPipelineForInpainting.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)
 
         inputs = self.get_inputs(torch_device, guidance_scale=0.0)
diff --git a/tests/pipelines/pag/test_pag_sdxl.py b/tests/pipelines/pag/test_pag_sdxl.py
index 1d7dfb95a993..b35b2b1d2f7e 100644
--- a/tests/pipelines/pag/test_pag_sdxl.py
+++ b/tests/pipelines/pag/test_pag_sdxl.py
@@ -30,8 +30,9 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -289,7 +290,7 @@ def test_save_load_optional_components(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionXLPAGPipelineIntegrationTests(unittest.TestCase):
     pipeline_class = StableDiffusionXLPAGPipeline
     repo_id = "stabilityai/stable-diffusion-xl-base-1.0"
@@ -297,12 +298,12 @@ class StableDiffusionXLPAGPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -319,7 +320,7 @@ def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0)
     def test_pag_cfg(self):
         pipeline = AutoPipelineForText2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)
 
         inputs = self.get_inputs(torch_device)
@@ -336,7 +337,7 @@ def test_pag_cfg(self):
 
     def test_pag_uncond(self):
         pipeline = AutoPipelineForText2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)
 
         inputs = self.get_inputs(torch_device, guidance_scale=0.0)
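Passing `device=torch_device` to the offload helpers follows the same logic: in diffusers, `enable_model_cpu_offload()` and `enable_sequential_cpu_offload()` default their execution device to `"cuda"`, so on a non-CUDA accelerator the hooks would target a device that does not exist. A hedged usage sketch (the XPU runner is hypothetical; the repo id is the one used by the tests above):

```python
# On a hypothetical XPU runner, torch_device would resolve to "xpu"; the
# explicit device keeps the offload hooks off the hard-coded "cuda" default.
import torch
from diffusers import AutoPipelineForText2Image

pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", enable_pag=True, torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload(device="xpu")  # instead of relying on the default
```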
pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device, guidance_scale=0.0) diff --git a/tests/pipelines/pag/test_pag_sdxl_img2img.py b/tests/pipelines/pag/test_pag_sdxl_img2img.py index ffaeaa749ce4..c94a6836de7f 100644 --- a/tests/pipelines/pag/test_pag_sdxl_img2img.py +++ b/tests/pipelines/pag/test_pag_sdxl_img2img.py @@ -39,10 +39,11 @@ UNet2DConditionModel, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, floats_tensor, load_image, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -268,19 +269,19 @@ def test_save_load_optional_components(self): @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionXLPAGImg2ImgPipelineIntegrationTests(unittest.TestCase): repo_id = "stabilityai/stable-diffusion-xl-base-1.0" def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0): img_url = ( @@ -304,7 +305,7 @@ def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0) def test_pag_cfg(self): pipeline = AutoPipelineForImage2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16) - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device) @@ -321,7 +322,7 @@ def test_pag_cfg(self): def test_pag_uncond(self): pipeline = AutoPipelineForImage2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16) - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device, guidance_scale=0.0) diff --git a/tests/pipelines/pag/test_pag_sdxl_inpaint.py b/tests/pipelines/pag/test_pag_sdxl_inpaint.py index 191b44118ef8..cca5292288b0 100644 --- a/tests/pipelines/pag/test_pag_sdxl_inpaint.py +++ b/tests/pipelines/pag/test_pag_sdxl_inpaint.py @@ -40,10 +40,11 @@ UNet2DConditionModel, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, floats_tensor, load_image, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -273,19 +274,19 @@ def test_save_load_optional_components(self): @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionXLPAGInpaintPipelineIntegrationTests(unittest.TestCase): repo_id = "stabilityai/stable-diffusion-xl-base-1.0" def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0): img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" @@ -310,7 +311,7 @@ def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0) def test_pag_cfg(self): pipeline = AutoPipelineForInpainting.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16) - pipeline.enable_model_cpu_offload() + 
pipeline.enable_model_cpu_offload(device=torch_device) pipeline.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device) @@ -327,7 +328,7 @@ def test_pag_cfg(self): def test_pag_uncond(self): pipeline = AutoPipelineForInpainting.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16) - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device, guidance_scale=0.0) diff --git a/tests/pipelines/pixart_alpha/test_pixart.py b/tests/pipelines/pixart_alpha/test_pixart.py index 6b71f8bb8197..4b5ccd110bbe 100644 --- a/tests/pipelines/pixart_alpha/test_pixart.py +++ b/tests/pipelines/pixart_alpha/test_pixart.py @@ -28,9 +28,10 @@ PixArtTransformer2DModel, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -254,7 +255,7 @@ def test_inference_batch_single_identical(self): @slow -@require_torch_gpu +@require_torch_accelerator class PixArtAlphaPipelineIntegrationTests(unittest.TestCase): ckpt_id_1024 = "PixArt-alpha/PixArt-XL-2-1024-MS" ckpt_id_512 = "PixArt-alpha/PixArt-XL-2-512x512" @@ -263,18 +264,18 @@ class PixArtAlphaPipelineIntegrationTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_pixart_1024(self): generator = torch.Generator("cpu").manual_seed(0) pipe = PixArtAlphaPipeline.from_pretrained(self.ckpt_id_1024, torch_dtype=torch.float16) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) prompt = self.prompt image = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").images @@ -289,7 +290,7 @@ def test_pixart_512(self): generator = torch.Generator("cpu").manual_seed(0) pipe = PixArtAlphaPipeline.from_pretrained(self.ckpt_id_512, torch_dtype=torch.float16) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) prompt = self.prompt @@ -305,7 +306,7 @@ def test_pixart_1024_without_resolution_binning(self): generator = torch.manual_seed(0) pipe = PixArtAlphaPipeline.from_pretrained(self.ckpt_id_1024, torch_dtype=torch.float16) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) prompt = self.prompt height, width = 1024, 768 @@ -339,7 +340,7 @@ def test_pixart_512_without_resolution_binning(self): generator = torch.manual_seed(0) pipe = PixArtAlphaPipeline.from_pretrained(self.ckpt_id_512, torch_dtype=torch.float16) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) prompt = self.prompt height, width = 512, 768 diff --git a/tests/pipelines/pixart_sigma/test_pixart.py b/tests/pipelines/pixart_sigma/test_pixart.py index ca2d1cbb8474..db310b0333f6 100644 --- a/tests/pipelines/pixart_sigma/test_pixart.py +++ b/tests/pipelines/pixart_sigma/test_pixart.py @@ -28,9 +28,10 @@ PixArtTransformer2DModel, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -283,7 +284,7 @@ def test_fused_qkv_projections(self): @slow -@require_torch_gpu +@require_torch_accelerator class 
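The decorator swap carries the same intent: `require_torch_gpu` gated a test on CUDA specifically, while `require_torch_accelerator` only asks for some supported accelerator. A sketch of the weaker condition, under the assumption that it checks each backend's availability in turn (the shipped decorator in `diffusers.utils.testing_utils` is the source of truth):

```python
# Hypothetical reimplementation for illustration: skip unless any torch
# accelerator backend reports availability.
import unittest

import torch


def require_torch_accelerator_sketch(test_case):
    has_accelerator = (
        torch.cuda.is_available()
        or (hasattr(torch, "xpu") and torch.xpu.is_available())
        or torch.backends.mps.is_available()
    )
    return unittest.skipUnless(has_accelerator, "test requires an accelerator")(test_case)
```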
diff --git a/tests/pipelines/pixart_sigma/test_pixart.py b/tests/pipelines/pixart_sigma/test_pixart.py
index ca2d1cbb8474..db310b0333f6 100644
--- a/tests/pipelines/pixart_sigma/test_pixart.py
+++ b/tests/pipelines/pixart_sigma/test_pixart.py
@@ -28,9 +28,10 @@
     PixArtTransformer2DModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -283,7 +284,7 @@ def test_fused_qkv_projections(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class PixArtSigmaPipelineIntegrationTests(unittest.TestCase):
     ckpt_id_1024 = "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS"
     ckpt_id_512 = "PixArt-alpha/PixArt-Sigma-XL-2-512-MS"
@@ -292,18 +293,18 @@ class PixArtSigmaPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_pixart_1024(self):
         generator = torch.Generator("cpu").manual_seed(0)
 
         pipe = PixArtSigmaPipeline.from_pretrained(self.ckpt_id_1024, torch_dtype=torch.float16)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         prompt = self.prompt
 
         image = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").images
@@ -323,7 +324,7 @@ def test_pixart_512(self):
         pipe = PixArtSigmaPipeline.from_pretrained(
             self.ckpt_id_1024, transformer=transformer, torch_dtype=torch.float16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
 
         prompt = self.prompt
 
@@ -339,7 +340,7 @@ def test_pixart_1024_without_resolution_binning(self):
         generator = torch.manual_seed(0)
 
         pipe = PixArtSigmaPipeline.from_pretrained(self.ckpt_id_1024, torch_dtype=torch.float16)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
 
         prompt = self.prompt
         height, width = 1024, 768
@@ -378,7 +379,7 @@ def test_pixart_512_without_resolution_binning(self):
         pipe = PixArtSigmaPipeline.from_pretrained(
             self.ckpt_id_1024, transformer=transformer, torch_dtype=torch.float16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
 
         prompt = self.prompt
         height, width = 512, 768
diff --git a/tests/pipelines/sana/test_sana.py b/tests/pipelines/sana/test_sana.py
index 34df808d3320..aa5d5c7ce463 100644
--- a/tests/pipelines/sana/test_sana.py
+++ b/tests/pipelines/sana/test_sana.py
@@ -22,8 +22,9 @@
 from diffusers import AutoencoderDC, FlowMatchEulerDiscreteScheduler, SanaPipeline, SanaTransformer2DModel
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -305,19 +306,19 @@ def test_float16_inference(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class SanaPipelineIntegrationTests(unittest.TestCase):
     prompt = "A painting of a squirrel eating a burger."
 
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_sana_1024(self):
         generator = torch.Generator("cpu").manual_seed(0)
@@ -325,7 +326,7 @@ def test_sana_1024(self):
         pipe = SanaPipeline.from_pretrained(
             "Efficient-Large-Model/Sana_1600M_1024px_diffusers", torch_dtype=torch.float16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
 
         image = pipe(
             prompt=self.prompt,
@@ -351,7 +352,7 @@ def test_sana_512(self):
         pipe = SanaPipeline.from_pretrained(
             "Efficient-Large-Model/Sana_1600M_512px_diffusers", torch_dtype=torch.float16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
 
         image = pipe(
             prompt=self.prompt,
diff --git a/tests/pipelines/stable_cascade/test_stable_cascade_combined.py b/tests/pipelines/stable_cascade/test_stable_cascade_combined.py
index e220e441a350..1765f3a02242 100644
--- a/tests/pipelines/stable_cascade/test_stable_cascade_combined.py
+++ b/tests/pipelines/stable_cascade/test_stable_cascade_combined.py
@@ -22,7 +22,7 @@
 from diffusers import DDPMWuerstchenScheduler, StableCascadeCombinedPipeline
 from diffusers.models import StableCascadeUNet
 from diffusers.pipelines.wuerstchen import PaellaVQModel
-from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_accelerator, torch_device
 
 from ..test_pipelines_common import PipelineTesterMixin
 
@@ -205,7 +205,7 @@ def test_stable_cascade(self):
             np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
         ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_offloads(self):
         pipes = []
         components = self.get_dummy_components()
@@ -214,12 +214,12 @@ def test_offloads(self):
 
         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         image_slices = []
diff --git a/tests/pipelines/stable_cascade/test_stable_cascade_decoder.py b/tests/pipelines/stable_cascade/test_stable_cascade_decoder.py
index 87c1a76cb277..afcd8fca71ca 100644
--- a/tests/pipelines/stable_cascade/test_stable_cascade_decoder.py
+++ b/tests/pipelines/stable_cascade/test_stable_cascade_decoder.py
@@ -24,11 +24,12 @@
 from diffusers.models import StableCascadeUNet
 from diffusers.pipelines.wuerstchen import PaellaVQModel
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     load_numpy,
     load_pt,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     skip_mps,
     slow,
     torch_device,
@@ -278,25 +279,25 @@ def test_encode_prompt_works_in_isolation(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableCascadeDecoderPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_stable_cascade_decoder(self):
         pipe = StableCascadeDecoderPipeline.from_pretrained(
             "stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)
 
         prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background."
diff --git a/tests/pipelines/stable_cascade/test_stable_cascade_prior.py b/tests/pipelines/stable_cascade/test_stable_cascade_prior.py
index fb879eb5a29b..0374de9b0219 100644
--- a/tests/pipelines/stable_cascade/test_stable_cascade_prior.py
+++ b/tests/pipelines/stable_cascade/test_stable_cascade_prior.py
@@ -24,11 +24,12 @@
 from diffusers.models import StableCascadeUNet
 from diffusers.utils.import_utils import is_peft_available
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     load_numpy,
     numpy_cosine_similarity_distance,
     require_peft_backend,
-    require_torch_gpu,
+    require_torch_accelerator,
     skip_mps,
     slow,
     torch_device,
@@ -246,25 +247,25 @@ def test_encode_prompt_works_in_isolation(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableCascadePriorPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
        super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_stable_cascade_prior(self):
         pipe = StableCascadePriorPipeline.from_pretrained(
             "stabilityai/stable-cascade-prior", variant="bf16", torch_dtype=torch.bfloat16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)
 
         prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background."
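Every integration suite touched so far repeats the same `gc.collect()` plus `backend_empty_cache(torch_device)` pair in `setUp`/`tearDown`. A hypothetical mixin, not part of this diff, shows how that boilerplate could be centralized; the diff deliberately keeps the per-class form instead:

```python
# Hypothetical refactor for illustration only; CacheCleanupMixin does not
# exist in the repository.
import gc
import unittest

from diffusers.utils.testing_utils import backend_empty_cache, torch_device


class CacheCleanupMixin(unittest.TestCase):
    def setUp(self):
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)
```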
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index c4ce562c3f0f..42a18221ea6d 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -44,6 +44,10 @@
 )
 from diffusers.utils.testing_utils import (
     CaptureLogger,
+    backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     is_torch_compile,
     load_image,
@@ -52,7 +56,7 @@
     numpy_cosine_similarity_distance,
     require_accelerate_version_greater,
     require_torch_2,
-    require_torch_gpu,
+    require_torch_accelerator,
     require_torch_multi_gpu,
     run_test_in_subprocess,
     skip_mps,
@@ -781,11 +785,11 @@ def test_encode_prompt_works_in_isolation(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -887,7 +891,7 @@ def test_stable_diffusion_dpm(self):
         assert np.abs(image_slice - expected_slice).max() < 3e-3
 
     def test_stable_diffusion_attention_slicing(self):
-        torch.cuda.reset_peak_memory_stats()
+        backend_reset_peak_memory_stats(torch_device)
         pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
         pipe.unet.set_default_attn_processor()
         pipe = pipe.to(torch_device)
@@ -898,8 +902,8 @@
         inputs = self.get_inputs(torch_device, dtype=torch.float16)
         image_sliced = pipe(**inputs).images
 
-        mem_bytes = torch.cuda.max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        mem_bytes = backend_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
         # make sure that less than 3.75 GB is allocated
         assert mem_bytes < 3.75 * 10**9
 
@@ -910,13 +914,13 @@
         image = pipe(**inputs).images
 
         # make sure that more than 3.75 GB is allocated
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         assert mem_bytes > 3.75 * 10**9
         max_diff = numpy_cosine_similarity_distance(image_sliced.flatten(), image.flatten())
         assert max_diff < 1e-3
 
     def test_stable_diffusion_vae_slicing(self):
-        torch.cuda.reset_peak_memory_stats()
+        backend_reset_peak_memory_stats(torch_device)
         pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
         pipe = pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
@@ -929,8 +933,8 @@
         inputs["latents"] = torch.cat([inputs["latents"]] * 4)
         image_sliced = pipe(**inputs).images
 
-        mem_bytes = torch.cuda.max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        mem_bytes = backend_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
         # make sure that less than 4 GB is allocated
         assert mem_bytes < 4e9
 
@@ -942,14 +946,14 @@
         image = pipe(**inputs).images
 
         # make sure that more than 4 GB is allocated
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         assert mem_bytes > 4e9
         # There is a small discrepancy at the image borders vs. a fully batched version.
         max_diff = numpy_cosine_similarity_distance(image_sliced.flatten(), image.flatten())
         assert max_diff < 1e-2
 
     def test_stable_diffusion_vae_tiling(self):
-        torch.cuda.reset_peak_memory_stats()
+        backend_reset_peak_memory_stats(torch_device)
         model_id = "CompVis/stable-diffusion-v1-4"
         pipe = StableDiffusionPipeline.from_pretrained(
             model_id, variant="fp16", torch_dtype=torch.float16, safety_checker=None
@@ -963,7 +967,7 @@
         # enable vae tiling
         pipe.enable_vae_tiling()
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         generator = torch.Generator(device="cpu").manual_seed(0)
         output_chunked = pipe(
             [prompt],
@@ -976,7 +980,7 @@
         )
         image_chunked = output_chunked.images
 
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
 
         # disable vae tiling
         pipe.disable_vae_tiling()
@@ -1069,26 +1073,26 @@ def test_stable_diffusion_low_cpu_mem_usage(self):
         assert 2 * low_cpu_mem_usage_time < normal_load_time
 
     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
+        pipe.enable_sequential_cpu_offload(device=torch_device)
 
         inputs = self.get_inputs(torch_device, dtype=torch.float16)
         _ = pipe(**inputs)
 
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # make sure that less than 2.8 GB is allocated
         assert mem_bytes < 2.8 * 10**9
 
     def test_stable_diffusion_pipeline_with_model_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         inputs = self.get_inputs(torch_device, dtype=torch.float16)
 
@@ -1102,7 +1105,7 @@ def test_stable_diffusion_pipeline_with_model_offloading(self):
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
         outputs = pipe(**inputs)
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
 
         # With model offloading
 
@@ -1113,16 +1116,16 @@ def test_stable_diffusion_pipeline_with_model_offloading(self):
         )
         pipe.unet.set_default_attn_processor()
 
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)
         inputs = self.get_inputs(torch_device, dtype=torch.float16)
         outputs_offloaded = pipe(**inputs)
-        mem_bytes_offloaded = torch.cuda.max_memory_allocated()
+        mem_bytes_offloaded = backend_max_memory_allocated(torch_device)
 
         images = outputs.images
         offloaded_images = outputs_offloaded.images
@@ -1135,13 +1138,13 @@ def test_stable_diffusion_pipeline_with_model_offloading(self):
             assert module.device == torch.device("cpu")
 
         # With attention slicing
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         pipe.enable_attention_slicing()
         _ = pipe(**inputs)
-        mem_bytes_slicing = torch.cuda.max_memory_allocated()
+        mem_bytes_slicing = backend_max_memory_allocated(torch_device)
 
         assert mem_bytes_slicing < mem_bytes_offloaded
         assert mem_bytes_slicing < 3 * 10**9
@@ -1156,7 +1159,7 @@ def test_stable_diffusion_textual_inversion(self):
         )
         pipe.load_textual_inversion(a111_file)
         pipe.load_textual_inversion(a111_file_neg)
-        pipe.to("cuda")
+        pipe.to(torch_device)
 
         generator = torch.Generator(device="cpu").manual_seed(1)
 
@@ -1173,7 +1176,7 @@ def test_stable_diffusion_textual_inversion_with_model_cpu_offload(self):
         pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons")
 
         a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt")
@@ -1198,8 +1201,8 @@ def test_stable_diffusion_textual_inversion_with_sequential_cpu_offload(self):
         pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
-        pipe.enable_sequential_cpu_offload()
-        pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons")
+        pipe.enable_sequential_cpu_offload(device=torch_device)
+        pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons")
 
         a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt")
         a111_file_neg = hf_hub_download(
@@ -1257,17 +1260,17 @@ def test_stable_diffusion_lcm(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionPipelineCkptTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_download_from_hub(self):
         ckpt_paths = [
@@ -1278,7 +1281,7 @@ def test_download_from_hub(self):
         for ckpt_path in ckpt_paths:
             pipe = StableDiffusionPipeline.from_single_file(ckpt_path, torch_dtype=torch.float16)
             pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-            pipe.to("cuda")
+            pipe.to(torch_device)
 
         image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]
 
@@ -1294,7 +1297,7 @@ def test_download_local(self):
             ckpt_filename, config_files={"v1": config_filename}, torch_dtype=torch.float16
         )
         pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.to("cuda")
+        pipe.to(torch_device)
 
         image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]
 
@@ -1302,17 +1305,17 @@
 
 
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionPipelineNightlyTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -1412,7 +1415,7 @@ class StableDiffusionPipelineDeviceMapTests(unittest.TestCase):
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, generator_device="cpu", seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
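The slow suite above also leans on the memory-statistics counterparts (`backend_max_memory_allocated`, `backend_reset_max_memory_allocated`, `backend_reset_peak_memory_stats`). Assuming they follow the same dispatch pattern as the cache helper, a sketch of the idea:

```python
# Illustrative sketch (the _sketch names are hypothetical): mirror the
# torch.cuda allocator-statistics API across backend modules.
import torch


def backend_max_memory_allocated_sketch(device) -> int:
    backend = getattr(torch, torch.device(device).type, None)
    if backend is not None and hasattr(backend, "max_memory_allocated"):
        return backend.max_memory_allocated()
    return 0  # backends without allocator stats report nothing


def backend_reset_peak_memory_stats_sketch(device) -> None:
    backend = getattr(torch, torch.device(device).type, None)
    if backend is not None and hasattr(backend, "reset_peak_memory_stats"):
        backend.reset_peak_memory_stats()
```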
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
index ae40822ade80..82b01a74869a 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -35,6 +35,10 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
     is_torch_compile,
@@ -42,7 +46,7 @@
     load_numpy,
     nightly,
     require_torch_2,
-    require_torch_gpu,
+    require_torch_accelerator,
     run_test_in_subprocess,
     skip_mps,
     slow,
@@ -400,17 +404,17 @@ def test_encode_prompt_works_in_isolation(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -513,28 +517,28 @@ def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None:
         assert number_of_steps == 2
 
     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
             "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
         )
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
+        pipe.enable_sequential_cpu_offload(device=torch_device)
 
         inputs = self.get_inputs(torch_device, dtype=torch.float16)
         _ = pipe(**inputs)
 
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # make sure that less than 2.2 GB is allocated
         assert mem_bytes < 2.2 * 10**9
 
     def test_stable_diffusion_pipeline_with_model_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         inputs = self.get_inputs(torch_device, dtype=torch.float16)
 
@@ -548,7 +552,7 @@ def test_stable_diffusion_pipeline_with_model_offloading(self):
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
         pipe(**inputs)
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
 
         # With model offloading
 
@@ -559,14 +563,14 @@ def test_stable_diffusion_pipeline_with_model_offloading(self):
             torch_dtype=torch.float16,
         )
 
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)
         _ = pipe(**inputs)
-        mem_bytes_offloaded = torch.cuda.max_memory_allocated()
+        mem_bytes_offloaded = backend_max_memory_allocated(torch_device)
 
         assert mem_bytes_offloaded < mem_bytes
         for module in pipe.text_encoder, pipe.unet, pipe.vae:
@@ -663,17 +667,17 @@ def test_img2img_compile(self):
 
 
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionImg2ImgPipelineNightlyTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
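The offloading tests in this file and the previous one all follow a single measurement recipe: flush and reset the allocator statistics, run the pipeline, then read the high-water mark. Distilled into an illustrative wrapper (the helper names are the ones imported in the diff; `peak_memory_bytes` itself is hypothetical):

```python
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    backend_max_memory_allocated,
    backend_reset_peak_memory_stats,
)


def peak_memory_bytes(run, device):
    """Return the peak allocator usage, in bytes, of calling run() on device."""
    backend_empty_cache(device)
    backend_reset_peak_memory_stats(device)
    run()
    return backend_max_memory_allocated(device)

# e.g.: assert peak_memory_bytes(lambda: pipe(**inputs), torch_device) < 2.2 * 10**9
```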
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index e2a7821beb31..e21cf23b8cbf 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -37,6 +37,10 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
     is_torch_compile,
@@ -44,7 +48,7 @@
     load_numpy,
     nightly,
     require_torch_2,
-    require_torch_gpu,
+    require_torch_accelerator,
     run_test_in_subprocess,
     slow,
     torch_device,
@@ -602,7 +606,7 @@ def test_stable_diffusion_inpaint_euler(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
@@ -610,7 +614,7 @@ def setUp(self):
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -704,21 +708,21 @@ def test_stable_diffusion_inpaint_k_lms(self):
         assert np.abs(expected_slice - image_slice).max() < 6e-3
 
     def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         pipe = StableDiffusionInpaintPipeline.from_pretrained(
             "botp/stable-diffusion-v1-5-inpainting", safety_checker=None, torch_dtype=torch.float16
         )
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
+        pipe.enable_sequential_cpu_offload(device=torch_device)
 
         inputs = self.get_inputs(torch_device, dtype=torch.float16)
         _ = pipe(**inputs)
 
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # make sure that less than 2.2 GB is allocated
         assert mem_bytes < 2.2 * 10**9
 
@@ -793,7 +797,7 @@ def test_stable_diffusion_simple_inpaint_ddim(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionInpaintPipelineAsymmetricAutoencoderKLSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
@@ -801,7 +805,7 @@ def setUp(self):
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -907,9 +911,9 @@ def test_stable_diffusion_inpaint_k_lms(self):
         assert np.abs(expected_slice - image_slice).max() < 6e-3
 
     def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         vae = AsymmetricAutoencoderKL.from_pretrained(
             "cross-attention/asymmetric-autoencoder-kl-x-1-5", torch_dtype=torch.float16
@@ -920,12 +924,12 @@ def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self):
         pipe.vae = vae
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
+        pipe.enable_sequential_cpu_offload(device=torch_device)
 
         inputs = self.get_inputs(torch_device, dtype=torch.float16)
         _ = pipe(**inputs)
 
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # make sure that less than 2.45 GB is allocated
         assert mem_bytes < 2.45 * 10**9
 
@@ -1009,7 +1013,7 @@ def test_download_local(self):
         pipe = StableDiffusionInpaintPipeline.from_single_file(filename, torch_dtype=torch.float16)
         pipe.vae = vae
         pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.to("cuda")
+        pipe.to(torch_device)
 
         inputs = self.get_inputs(torch_device)
         inputs["num_inference_steps"] = 1
@@ -1019,17 +1023,17 @@
 
 
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionInpaintPipelineNightlyTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
index 5690caa257b7..9721bb02ee3e 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
@@ -33,10 +33,14 @@
 )
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -266,17 +270,17 @@ def callback_no_cfg(pipe, i, t, callback_kwargs):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionInstructPix2PixPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, seed=0):
         generator = torch.manual_seed(seed)
@@ -384,21 +388,21 @@ def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None:
         assert number_of_steps == 3
 
     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
             "timbrooks/instruct-pix2pix", safety_checker=None, torch_dtype=torch.float16
         )
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
+        pipe.enable_sequential_cpu_offload(device=torch_device)
 
         inputs = self.get_inputs()
         _ = pipe(**inputs)
 
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # make sure that less than 2.2 GB is allocated
         assert mem_bytes < 2.2 * 10**9
 
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
index 5790d4dccec7..3f9f7e965b40 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
@@ -34,12 +34,13 @@
 from diffusers.utils.testing_utils import (
     CaptureLogger,
     backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     load_numpy,
     nightly,
     numpy_cosine_similarity_distance,
     require_torch_accelerator,
-    require_torch_gpu,
     skip_mps,
     slow,
     torch_device,
@@ -330,9 +331,8 @@ def tearDown(self):
         backend_empty_cache(torch_device)
 
     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
-        _generator_device = "cpu" if not generator_device.startswith("cuda") else "cuda"
         if not str(device).startswith("mps"):
-            generator = torch.Generator(device=_generator_device).manual_seed(seed)
+            generator = torch.Generator(device=generator_device).manual_seed(seed)
         else:
             generator = torch.manual_seed(seed)
 
@@ -361,9 +361,9 @@ def test_stable_diffusion_default_ddim(self):
         expected_slice = np.array([0.49493, 0.47896, 0.40798, 0.54214, 0.53212, 0.48202, 0.47656, 0.46329, 0.48506])
         assert np.abs(image_slice - expected_slice).max() < 7e-3
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_stable_diffusion_attention_slicing(self):
-        torch.cuda.reset_peak_memory_stats()
+        backend_reset_peak_memory_stats(torch_device)
         pipe = StableDiffusionPipeline.from_pretrained(
             "stabilityai/stable-diffusion-2-base", torch_dtype=torch.float16
         )
@@ -376,8 +376,8 @@ def test_stable_diffusion_attention_slicing(self):
         inputs = self.get_inputs(torch_device, dtype=torch.float16)
         image_sliced = pipe(**inputs).images
 
-        mem_bytes = torch.cuda.max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        mem_bytes = backend_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
         # make sure that less than 3.3 GB is allocated
         assert mem_bytes < 3.3 * 10**9
 
@@ -388,7 +388,7 @@ def test_stable_diffusion_attention_slicing(self):
         image = pipe(**inputs).images
 
         # make sure that more than 3.3 GB is allocated
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         assert mem_bytes > 3.3 * 10**9
         max_diff = numpy_cosine_similarity_distance(image.flatten(), image_sliced.flatten())
         assert max_diff < 5e-3
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
index e66c270a5f91..0a0051816162 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
@@ -37,6 +37,7 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
@@ -44,7 +45,7 @@
     nightly,
     require_accelerate_version_greater,
     require_accelerator,
-    require_torch_gpu,
+    require_torch_accelerator,
     skip_mps,
     slow,
     torch_device,
@@ -378,17 +379,17 @@ def test_encode_prompt_works_in_isolation(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionDepth2ImgPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=device).manual_seed(seed)
@@ -425,17 +426,17 @@ def test_stable_diffusion_depth2img_pipeline_default(self):
 
 
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionImg2ImgPipelineNightlyTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=device).manual_seed(seed)
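The `get_inputs` simplification above (dropping the `_generator_device` override) works because these suites seed their generators on CPU by default: a CPU `torch.Generator` produces bit-identical initial latents no matter which accelerator runs the denoising loop, so per-backend image differences come from the pipeline itself rather than from RNG state. A tiny standalone illustration:

```python
# A CPU-seeded generator yields the same latents on any host, which keeps
# the expected-slice assertions meaningful across backends.
import torch

generator = torch.Generator(device="cpu").manual_seed(0)
latents = torch.randn(1, 4, 64, 64, generator=generator)
```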
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
index 567e3e2fd466..34ea56664a95 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
@@ -33,12 +33,13 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
     nightly,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     torch_device,
 )
 
@@ -299,18 +300,18 @@ def test_encode_prompt_works_in_isolation(self):
         return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)
 
 
-@require_torch_gpu
+@require_torch_accelerator
 @nightly
 class StableDiffusionDiffEditPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     @classmethod
     def setUpClass(cls):
@@ -331,7 +332,7 @@ def test_stable_diffusion_diffedit_full(self):
         pipe.scheduler.clip_sample = True
 
         pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)
 
         source_prompt = "a bowl of fruit"
@@ -377,17 +378,17 @@ def test_stable_diffusion_diffedit_full(self):
 
 
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionDiffEditPipelineNightlyTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     @classmethod
     def setUpClass(cls):
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
index e20b07640cb4..2feeaaf11c12 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
@@ -24,11 +24,14 @@
 from diffusers import AutoencoderKL, PNDMScheduler, StableDiffusionInpaintPipeline, UNet2DConditionModel
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    backend_reset_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
     load_image,
     load_numpy,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -161,19 +164,19 @@ def test_encode_prompt_works_in_isolation(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_stable_diffusion_inpaint_pipeline(self):
         init_image = load_image(
@@ -248,9 +251,9 @@ def test_stable_diffusion_inpaint_pipeline_fp16(self):
         assert np.abs(expected_image - image).max() < 5e-1
 
     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         init_image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
@@ -270,7 +273,7 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
         )
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
+        pipe.enable_sequential_cpu_offload(device=torch_device)
 
         prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
 
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
index 52458286df8b..22e588a9327b 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
@@ -31,11 +31,12 @@
 )
 from diffusers.schedulers import KarrasDiffusionSchedulers
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
     load_numpy,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -284,29 +285,29 @@ def test_encode_prompt_works_in_isolation(self):
         pass
 
 
-@require_torch_gpu
+@require_torch_accelerator
 @slow
 class StableDiffusionLatentUpscalePipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_latent_upscaler_fp16(self):
         generator = torch.manual_seed(33)
 
         pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
-        pipe.to("cuda")
+        pipe.to(torch_device)
 
         upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(
             "stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16
         )
-        upscaler.to("cuda")
+        upscaler.to(torch_device)
 
         prompt = "a photo of an astronaut high resolution, unreal engine, ultra realistic"
 
@@ -332,7 +333,7 @@ def test_latent_upscaler_fp16_image(self):
         upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(
             "stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16
         )
-        upscaler.to("cuda")
+        upscaler.to(torch_device)
 
         prompt = "the temple of fire by Ross Tran and Gerardo Dottori, oil on canvas"
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py
index 4b04169a270b..5400c21c9f87 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py
@@ -25,12 +25,16 @@
 from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, StableDiffusionUpscalePipeline, UNet2DConditionModel
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
     load_image,
     load_numpy,
     require_accelerator,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -44,13 +48,13 @@ def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     @property
     def dummy_image(self):
@@ -381,19 +385,19 @@ def test_stable_diffusion_upscale_from_save_pretrained(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_stable_diffusion_upscale_pipeline(self):
         image = load_image(
@@ -459,9 +463,9 @@ def test_stable_diffusion_upscale_pipeline_fp16(self):
         assert np.abs(expected_image - image).max() < 5e-1
 
     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
@@ -475,7 +479,7 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
         )
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
+        pipe.enable_sequential_cpu_offload(device=torch_device)
 
         prompt = "a cat sitting on a park bench"
@@ -488,6 +492,6 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
             output_type="np",
         )
 
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # make sure that less than 2.9 GB is allocated
         assert mem_bytes < 2.9 * 10**9
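The upscale file also needs device-agnostic counterparts for the three CUDA memory-statistics calls. Sketched under the same assumed type dispatch (only the CUDA branch is standard torch API; the fallback behavior for other backends is a guess):

```python
# Assumed shape of the memory-statistics helpers; not the actual
# diffusers implementation.
import torch


def backend_reset_peak_memory_stats(device: str) -> None:
    if torch.device(device).type == "cuda":
        torch.cuda.reset_peak_memory_stats(device)


def backend_reset_max_memory_allocated(device: str) -> None:
    if torch.device(device).type == "cuda":
        # in torch, reset_max_memory_allocated aliases reset_peak_memory_stats
        torch.cuda.reset_max_memory_allocated(device)


def backend_max_memory_allocated(device: str) -> int:
    if torch.device(device).type == "cuda":
        return torch.cuda.max_memory_allocated(device)
    return 0  # no tracked allocator statistics on this backend in this sketch
```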
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
index d69d1c492548..1953017c0ee8 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
@@ -31,11 +31,15 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     load_numpy,
     numpy_cosine_similarity_distance,
     require_accelerator,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -49,13 +53,13 @@ def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     @property
     def dummy_cond_unet(self):
@@ -258,19 +262,19 @@ def test_stable_diffusion_v_pred_fp16(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_stable_diffusion_v_pred_default(self):
         sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2")
@@ -357,7 +361,7 @@ def test_stable_diffusion_v_pred_dpm(self):
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
     def test_stable_diffusion_attention_slicing_v_pred(self):
-        torch.cuda.reset_peak_memory_stats()
+        backend_reset_peak_memory_stats(torch_device)
         model_id = "stabilityai/stable-diffusion-2"
         pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
         pipe.to(torch_device)
@@ -373,8 +377,8 @@ def test_stable_diffusion_attention_slicing_v_pred(self):
         )
         image_chunked = output_chunked.images
 
-        mem_bytes = torch.cuda.max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        mem_bytes = backend_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
         # make sure that less than 5.5 GB is allocated
         assert mem_bytes < 5.5 * 10**9
 
@@ -385,7 +389,7 @@ def test_stable_diffusion_attention_slicing_v_pred(self):
         image = output.images
 
         # make sure that more than 3.0 GB is allocated
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         assert mem_bytes > 3 * 10**9
 
         max_diff = numpy_cosine_similarity_distance(image.flatten(), image_chunked.flatten())
         assert max_diff < 1e-3
@@ -421,7 +425,7 @@ def test_stable_diffusion_text2img_pipeline_unflawed(self):
         pipe.scheduler = DDIMScheduler.from_config(
             pipe.scheduler.config, timestep_spacing="trailing", rescale_betas_zero_snr=True
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)
 
         prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k"
@@ -466,7 +470,7 @@ def test_download_local(self):
 
         pipe = StableDiffusionPipeline.from_single_file(filename, torch_dtype=torch.float16)
         pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
 
         image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]
@@ -530,20 +534,20 @@ def test_stable_diffusion_low_cpu_mem_usage_v_pred(self):
         assert 2 * low_cpu_mem_usage_time < normal_load_time
 
     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading_v_pred(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         pipeline_id = "stabilityai/stable-diffusion-2"
         prompt = "Andromeda galaxy in a bottle"
 
         pipeline = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16)
         pipeline.enable_attention_slicing(1)
-        pipeline.enable_sequential_cpu_offload()
+        pipeline.enable_sequential_cpu_offload(device=torch_device)
 
         generator = torch.manual_seed(0)
         _ = pipeline(prompt, generator=generator, num_inference_steps=5)
 
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # make sure that less than 2.8 GB is allocated
         assert mem_bytes < 2.8 * 10**9
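The last v-pred hunk is the canonical shape of every memory-bound test in this diff: reset the counters, offload with an explicit device, run once, then assert on the peak. Extracted as a standalone script (model id, prompt, seed, and the 2.8 GB bound are copied from the test above; the `backend_*` helpers and `torch_device` are the testing utilities this diff imports):

```python
import torch

from diffusers import StableDiffusionPipeline
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    backend_max_memory_allocated,
    backend_reset_max_memory_allocated,
    backend_reset_peak_memory_stats,
    torch_device,
)

# start from clean allocator statistics
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)

pipeline = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2", torch_dtype=torch.float16
)
pipeline.enable_attention_slicing(1)
pipeline.enable_sequential_cpu_offload(device=torch_device)

_ = pipeline(
    "Andromeda galaxy in a bottle",
    generator=torch.manual_seed(0),
    num_inference_steps=5,
)

# sequential offloading should keep the peak under the same 2.8 GB bound
mem_bytes = backend_max_memory_allocated(torch_device)
assert mem_bytes < 2.8 * 10**9
```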
diff --git a/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py b/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py
index 340176367fd6..1e2075e510aa 100644
--- a/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py
+++ b/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py
@@ -8,6 +8,7 @@
 from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, SD3Transformer2DModel, StableDiffusion3Pipeline
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     numpy_cosine_similarity_distance,
     require_big_gpu_with_torch_cuda,
     slow,
@@ -240,12 +241,12 @@ class StableDiffusion3PipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, seed=0):
         if str(device).startswith("mps"):
@@ -263,7 +264,7 @@ def get_inputs(self, device, seed=0):
 
     def test_sd3_inference(self):
         pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
 
         inputs = self.get_inputs(torch_device)
diff --git a/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_img2img.py b/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_img2img.py
index 95c9256489b4..9973c092aae2 100644
--- a/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_img2img.py
+++ b/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_img2img.py
@@ -15,6 +15,7 @@
 )
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     floats_tensor,
     numpy_cosine_similarity_distance,
     require_big_gpu_with_torch_cuda,
@@ -174,12 +175,12 @@ class StableDiffusion3Img2ImgPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, seed=0):
         init_image = load_image(
@@ -202,7 +203,7 @@ def get_inputs(self, device, seed=0):
 
     def test_sd3_img2img_inference(self):
         pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
 
         inputs = self.get_inputs(torch_device)
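Both SD3 tests make the previously implicit onload target explicit. That matters on non-CUDA accelerators, since the no-argument form of `enable_model_cpu_offload` defaults to `"cuda"`. A minimal usage sketch; note that `self.repo_id` is not shown in this diff, so the checkpoint id below is an illustrative stand-in, not the test's actual value:

```python
import torch

from diffusers import StableDiffusion3Pipeline
from diffusers.utils.testing_utils import torch_device

# "stabilityai/stable-diffusion-3-medium-diffusers" stands in for the test's
# self.repo_id, which this diff does not show.
pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
)
# Passing device= onloads components to the active accelerator instead of
# the default "cuda".
pipe.enable_model_cpu_offload(device=torch_device)
image = pipe("a photo of a cat", num_inference_steps=2).images[0]
```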
diff --git a/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py b/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py
index 3743bdd0a870..009c75df4249 100644
--- a/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py
+++ b/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py
@@ -35,12 +35,13 @@
 from diffusers.utils import logging
 from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
     load_numpy,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -604,17 +605,17 @@ def test_inference_batch_single_identical(
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionAdapterPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_stable_diffusion_adapter_depth_sd_v15(self):
         adapter_model = "TencentARC/t2iadapter_depth_sd15v2"
diff --git a/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py
index d7567afdee1f..f706e7000b28 100644
--- a/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py
+++ b/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py
@@ -30,13 +30,17 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
     load_image,
     load_numpy,
     nightly,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -164,17 +168,17 @@ def test_inference_batch_single_identical(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionImageVariationPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -258,37 +262,37 @@ def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None:
         assert number_of_steps == inputs["num_inference_steps"]
 
     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         pipe = StableDiffusionImageVariationPipeline.from_pretrained(
             "lambdalabs/sd-image-variations-diffusers", safety_checker=None, torch_dtype=torch.float16
         )
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
+        pipe.enable_sequential_cpu_offload(device=torch_device)
 
         inputs = self.get_inputs(torch_device, dtype=torch.float16)
         _ = pipe(**inputs)
 
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # make sure that less than 2.6 GB is allocated
         assert mem_bytes < 2.6 * 10**9
 
 
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionImageVariationPipelineNightlyTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
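The image-variation file shows the same four-line setUp/tearDown body twice, once per test class, and the pattern recurs in nearly every file this diff touches. The diff keeps the repetition inline; factored out, the boilerplate would amount to a small mixin (the class name here is hypothetical, not something the diff introduces):

```python
import gc
import unittest

from diffusers.utils.testing_utils import backend_empty_cache, torch_device


class AcceleratorCleanupMixin(unittest.TestCase):
    """Hypothetical mixin capturing the setUp/tearDown boilerplate above."""

    def setUp(self):
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)
```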
diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py
index e574029acffd..c68cdf67036a 100644
--- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py
+++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py
@@ -38,7 +38,7 @@
     enable_full_determinism,
     load_image,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -265,7 +265,7 @@ def test_attention_slicing_forward_pass(self):
     def test_inference_batch_single_identical(self):
         super().test_inference_batch_single_identical(expected_max_diff=3e-3)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_stable_diffusion_xl_offloads(self):
         pipes = []
         components = self.get_dummy_components()
@@ -274,12 +274,12 @@ def test_stable_diffusion_xl_offloads(self):
 
         components = self.get_dummy_components()
         sd_pipe = StableDiffusionXLPipeline(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         components = self.get_dummy_components()
         sd_pipe = StableDiffusionXLPipeline(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         image_slices = []
diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
index b0a979c49360..9a141634a364 100644
--- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
+++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
@@ -42,7 +42,7 @@
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -293,7 +293,7 @@ def test_stable_diffusion_xl_img2img_tiny_autoencoder(self):
 
         assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_stable_diffusion_xl_offloads(self):
         pipes = []
         components = self.get_dummy_components()
@@ -302,12 +302,12 @@ def test_stable_diffusion_xl_offloads(self):
 
         components = self.get_dummy_components()
         sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         components = self.get_dummy_components()
         sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         image_slices = []
@@ -596,7 +596,7 @@ def test_stable_diffusion_xl_img2img_euler(self):
 
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_stable_diffusion_xl_offloads(self):
         pipes = []
         components = self.get_dummy_components()
@@ -605,12 +605,12 @@ def test_stable_diffusion_xl_offloads(self):
 
         components = self.get_dummy_components()
         sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         components = self.get_dummy_components()
         sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         image_slices = []
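The SDXL hunks also swap the skip decorator: `require_torch_gpu` gated tests on CUDA alone, while `require_torch_accelerator` should let them run on any supported backend. A hedged sketch of what such a decorator could look like (the detection order is an assumption; the real helper lives in `testing_utils`):

```python
# Illustrative only: assumed implementation of an accelerator-gating
# decorator, not the actual diffusers helper.
import unittest

import torch


def _accelerator_available() -> bool:
    if torch.cuda.is_available():
        return True
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return True
    if torch.backends.mps.is_available():
        return True
    return False


def require_torch_accelerator(test_case):
    """Skip the decorated test when no hardware accelerator is present."""
    return unittest.skipUnless(
        _accelerator_available(), "test requires a hardware accelerator"
    )(test_case)
```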
diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py
index f5fba4ede207..66ae581a0529 100644
--- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py
+++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py
@@ -41,7 +41,13 @@
     UNet2DConditionModel,
     UniPCMultistepScheduler,
 )
-from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, require_torch_gpu, slow, torch_device
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    require_torch_accelerator,
+    slow,
+    torch_device,
+)
 
 from ..pipeline_params import (
     TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
@@ -305,7 +311,47 @@ def test_inference_batch_single_identical(self):
     def test_save_load_optional_components(self):
         pass
 
-    @require_torch_gpu
+    @require_torch_accelerator
+    def test_stable_diffusion_xl_inpaint_negative_prompt_embeds(self):
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLInpaintPipeline(**components)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        # forward without prompt embeds
+        inputs = self.get_dummy_inputs(torch_device)
+        negative_prompt = 3 * ["this is a negative prompt"]
+        inputs["negative_prompt"] = negative_prompt
+        inputs["prompt"] = 3 * [inputs["prompt"]]
+
+        output = sd_pipe(**inputs)
+        image_slice_1 = output.images[0, -3:, -3:, -1]
+
+        # forward with prompt embeds
+        inputs = self.get_dummy_inputs(torch_device)
+        negative_prompt = 3 * ["this is a negative prompt"]
+        prompt = 3 * [inputs.pop("prompt")]
+
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = sd_pipe.encode_prompt(prompt, negative_prompt=negative_prompt)
+
+        output = sd_pipe(
+            **inputs,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+        )
+        image_slice_2 = output.images[0, -3:, -3:, -1]
+
+        # make sure that it's equal
+        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
+
+    @require_torch_accelerator
     def test_stable_diffusion_xl_offloads(self):
         pipes = []
         components = self.get_dummy_components()
@@ -314,12 +360,12 @@ def test_stable_diffusion_xl_offloads(self):
 
         components = self.get_dummy_components()
         sd_pipe = StableDiffusionXLInpaintPipeline(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         components = self.get_dummy_components()
         sd_pipe = StableDiffusionXLInpaintPipeline(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         image_slices = []
diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_k_diffusion.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_k_diffusion.py
index 94ee9f0facc8..46f7d0e7b0b4 100644
--- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_k_diffusion.py
+++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_k_diffusion.py
@@ -20,14 +20,20 @@
 import torch
 
 from diffusers import StableDiffusionXLKDiffusionPipeline
-from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, slow, torch_device
+from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    enable_full_determinism,
+    require_torch_accelerator,
+    slow,
+    torch_device,
+)
 
 
 enable_full_determinism()
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionXLKPipelineIntegrationTests(unittest.TestCase):
     dtype = torch.float16
@@ -35,13 +41,13 @@ def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_stable_diffusion_xl(self):
         sd_pipe = StableDiffusionXLKDiffusionPipeline.from_pretrained(
diff --git a/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py b/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py
index 352477ecec56..f77a5b1620d2 100644
--- a/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py
+++ b/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py
@@ -22,12 +22,13 @@
 from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import (
     CaptureLogger,
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     numpy_cosine_similarity_distance,
     require_accelerate_version_greater,
     require_accelerator,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -515,19 +516,19 @@ def test_disable_cfg(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableVideoDiffusionPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_sd_video(self):
         pipe = StableVideoDiffusionPipeline.from_pretrained(
@@ -535,7 +536,7 @@ def test_sd_video(self):
             variant="fp16",
             torch_dtype=torch.float16,
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)
         image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png?download=true"
diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py
index 6ce7c5d604f4..48c89d399216 100644
--- a/tests/pipelines/test_pipelines.py
+++ b/tests/pipelines/test_pipelines.py
@@ -1383,11 +1383,11 @@ def test_pipe_false_offload_warn(self):
             feature_extractor=self.dummy_extractor,
         )
 
-        sd.enable_model_cpu_offload()
+        sd.enable_model_cpu_offload(device=torch_device)
 
         logger = logging.get_logger("diffusers.pipelines.pipeline_utils")
         with CaptureLogger(logger) as cap_logger:
-            sd.to("cuda")
+            sd.to(torch_device)
 
         assert "It is strongly recommended against doing so" in str(cap_logger)
diff --git a/tests/pipelines/text_to_video_synthesis/test_text_to_video.py b/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
index 7813a2c071b3..5d0f8299f68e 100644
--- a/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
+++ b/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
@@ -23,10 +23,11 @@
 from diffusers import AutoencoderKL, DDIMScheduler, TextToVideoSDPipeline, UNet3DConditionModel
 from diffusers.utils import is_xformers_available
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     load_numpy,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     skip_mps,
     slow,
     torch_device,
@@ -184,19 +185,19 @@ def test_encode_prompt_works_in_isolation(self):
 
 
 @slow
 @skip_mps
-@require_torch_gpu
+@require_torch_accelerator
 class TextToVideoSDPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_two_step_model(self):
         expected_video = load_numpy(
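The `test_pipe_false_offload_warn` hunk above is worth isolating: once hook-based offloading is enabled, a bare `.to()` fights the offload hooks, so the pipeline logs a warning instead of silently misbehaving. Rewritten as a reusable check (`build_pipeline` is a hypothetical stand-in for the components the test assembles inline; `CaptureLogger`, the logger name, and the warning text are taken from the hunk itself):

```python
from diffusers.utils import logging
from diffusers.utils.testing_utils import CaptureLogger, torch_device


def assert_offload_then_to_warns(build_pipeline) -> None:
    """build_pipeline is a hypothetical factory returning a DiffusionPipeline."""
    sd = build_pipeline()
    sd.enable_model_cpu_offload(device=torch_device)

    logger = logging.get_logger("diffusers.pipelines.pipeline_utils")
    with CaptureLogger(logger) as cap_logger:
        # moving an offloaded pipeline by hand should trigger the warning
        sd.to(torch_device)

    assert "It is strongly recommended against doing so" in str(cap_logger)
```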
del inputs["prompt"] del inputs["image"] @@ -531,20 +531,19 @@ def test_unidiffuser_default_joint_v1_cuda_fp16(self): expected_text_prefix = '" This This' assert text[0][: len(expected_text_prefix)] == expected_text_prefix - @require_torch_gpu - def test_unidiffuser_default_text2img_v1_cuda_fp16(self): - device = "cuda" + @require_torch_accelerator + def test_unidiffuser_default_text2img_v1_fp16(self): unidiffuser_pipe = UniDiffuserPipeline.from_pretrained( "hf-internal-testing/unidiffuser-test-v1", torch_dtype=torch.float16 ) - unidiffuser_pipe = unidiffuser_pipe.to(device) + unidiffuser_pipe = unidiffuser_pipe.to(torch_device) unidiffuser_pipe.set_progress_bar_config(disable=None) # Set mode to 'text2img' unidiffuser_pipe.set_text_to_image_mode() assert unidiffuser_pipe.mode == "text2img" - inputs = self.get_dummy_inputs_with_latents(device) + inputs = self.get_dummy_inputs_with_latents(torch_device) # Delete prompt and image for joint inference. del inputs["image"] inputs["data_type"] = 1 @@ -556,20 +555,19 @@ def test_unidiffuser_default_text2img_v1_cuda_fp16(self): expected_img_slice = np.array([0.5054, 0.5498, 0.5854, 0.3052, 0.4458, 0.6489, 0.5122, 0.4810, 0.6138]) assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3 - @require_torch_gpu - def test_unidiffuser_default_img2text_v1_cuda_fp16(self): - device = "cuda" + @require_torch_accelerator + def test_unidiffuser_default_img2text_v1_fp16(self): unidiffuser_pipe = UniDiffuserPipeline.from_pretrained( "hf-internal-testing/unidiffuser-test-v1", torch_dtype=torch.float16 ) - unidiffuser_pipe = unidiffuser_pipe.to(device) + unidiffuser_pipe = unidiffuser_pipe.to(torch_device) unidiffuser_pipe.set_progress_bar_config(disable=None) # Set mode to 'img2text' unidiffuser_pipe.set_image_to_text_mode() assert unidiffuser_pipe.mode == "img2text" - inputs = self.get_dummy_inputs_with_latents(device) + inputs = self.get_dummy_inputs_with_latents(torch_device) # Delete prompt and image for joint inference. 
del inputs["prompt"] inputs["data_type"] = 1 diff --git a/tests/pipelines/wuerstchen/test_wuerstchen_combined.py b/tests/pipelines/wuerstchen/test_wuerstchen_combined.py index a0e6e1417e67..084d62a8c613 100644 --- a/tests/pipelines/wuerstchen/test_wuerstchen_combined.py +++ b/tests/pipelines/wuerstchen/test_wuerstchen_combined.py @@ -21,7 +21,7 @@ from diffusers import DDPMWuerstchenScheduler, WuerstchenCombinedPipeline from diffusers.pipelines.wuerstchen import PaellaVQModel, WuerstchenDiffNeXt, WuerstchenPrior -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_accelerator, torch_device from ..test_pipelines_common import PipelineTesterMixin @@ -198,7 +198,7 @@ def test_wuerstchen(self): np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" - @require_torch_gpu + @require_torch_accelerator def test_offloads(self): pipes = [] components = self.get_dummy_components() @@ -207,12 +207,12 @@ def test_offloads(self): components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_sequential_cpu_offload() + sd_pipe.enable_sequential_cpu_offload(device=torch_device) pipes.append(sd_pipe) components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_model_cpu_offload() + sd_pipe.enable_model_cpu_offload(device=torch_device) pipes.append(sd_pipe) image_slices = []