From 8d0f3874b8219574997387c0c01da1d4b853ab24 Mon Sep 17 00:00:00 2001 From: "Lin, Fanli" <fanli.lin@intel.com> Date: Thu, 2 Jan 2025 21:36:50 -0800 Subject: [PATCH 01/33] initial commit --- tests/models/test_modeling_common.py | 16 +++--- tests/pipelines/allegro/test_allegro.py | 6 +-- .../pipelines/animatediff/test_animatediff.py | 16 ++++-- tests/pipelines/cogvideo/test_cogvideox.py | 6 +-- .../cogvideo/test_cogvideox_image2video.py | 16 ++++-- tests/pipelines/cogview3/test_cogview3plus.py | 6 +-- tests/pipelines/controlnet/test_controlnet.py | 53 ++++++++++++------- .../controlnet/test_controlnet_img2img.py | 6 +-- .../controlnet/test_controlnet_inpaint.py | 8 +-- .../controlnet/test_controlnet_sdxl.py | 20 ++++--- .../test_controlnet_sdxl_img2img.py | 13 +++-- .../controlnet_flux/test_controlnet_flux.py | 2 +- .../test_controlnet_hunyuandit.py | 22 +++++--- .../controlnet_xs/test_controlnetxs.py | 15 +++--- .../controlnet_xs/test_controlnetxs_sdxl.py | 25 ++++++--- tests/pipelines/ddim/test_ddim.py | 4 +- tests/pipelines/ddpm/test_ddpm.py | 4 +- .../pipelines/deepfloyd_if/test_if_img2img.py | 19 ++++--- .../test_if_img2img_superresolution.py | 28 +++++++--- .../pipelines/hunyuan_dit/test_hunyuan_dit.py | 6 +-- tests/pipelines/i2vgen_xl/test_i2vgenxl.py | 16 ++++-- tests/pipelines/test_pipelines.py | 25 ++++++--- 22 files changed, 210 insertions(+), 122 deletions(-) diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py index 4fc14804475a..2bdd5b057119 100644 --- a/tests/models/test_modeling_common.py +++ b/tests/models/test_modeling_common.py @@ -57,8 +57,8 @@ get_python_version, is_torch_compile, require_torch_2, + require_torch_accelerator, require_torch_accelerator_with_training, - require_torch_gpu, require_torch_multi_gpu, run_test_in_subprocess, torch_all_close, @@ -543,7 +543,7 @@ def test_set_xformers_attn_processor_for_determinism(self): assert torch.allclose(output, output_3, atol=self.base_precision) assert torch.allclose(output_2, output_3, atol=self.base_precision) - @require_torch_gpu + @require_torch_accelerator def test_set_attn_processor_for_determinism(self): if self.uses_custom_attn_processor: return @@ -1068,7 +1068,7 @@ def test_wrong_adapter_name_raises_error(self): self.assertTrue(f"Adapter name {wrong_name} not found in the model."
in str(err_context.exception)) - @require_torch_gpu + @require_torch_accelerator def test_cpu_offload(self): config, inputs_dict = self.prepare_init_args_and_inputs_for_common() model = self.model_class(**config).eval() @@ -1098,7 +1098,7 @@ def test_cpu_offload(self): self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5)) - @require_torch_gpu + @require_torch_accelerator def test_disk_offload_without_safetensors(self): config, inputs_dict = self.prepare_init_args_and_inputs_for_common() model = self.model_class(**config).eval() @@ -1132,7 +1132,7 @@ def test_disk_offload_without_safetensors(self): self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5)) - @require_torch_gpu + @require_torch_accelerator def test_disk_offload_with_safetensors(self): config, inputs_dict = self.prepare_init_args_and_inputs_for_common() model = self.model_class(**config).eval() @@ -1191,7 +1191,7 @@ def test_model_parallelism(self): self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5)) - @require_torch_gpu + @require_torch_accelerator def test_sharded_checkpoints(self): torch.manual_seed(0) config, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -1223,7 +1223,7 @@ def test_sharded_checkpoints(self): self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5)) - @require_torch_gpu + @require_torch_accelerator def test_sharded_checkpoints_with_variant(self): torch.manual_seed(0) config, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -1261,7 +1261,7 @@ def test_sharded_checkpoints_with_variant(self): self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5)) - @require_torch_gpu + @require_torch_accelerator def test_sharded_checkpoints_device_map(self): config, inputs_dict = self.prepare_init_args_and_inputs_for_common() model = self.model_class(**config).eval() diff --git a/tests/pipelines/allegro/test_allegro.py b/tests/pipelines/allegro/test_allegro.py index d09fc0488378..6de8327ece5c 100644 --- a/tests/pipelines/allegro/test_allegro.py +++ b/tests/pipelines/allegro/test_allegro.py @@ -24,7 +24,7 @@ from diffusers.utils.testing_utils import ( enable_full_determinism, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -299,7 +299,7 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2): @slow -@require_torch_gpu +@require_torch_accelerator class AllegroPipelineIntegrationTests(unittest.TestCase): prompt = "A painting of a squirrel eating a burger." 
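The hunks above and below show the two edits this patch repeats across every suite: @require_torch_gpu becomes @require_torch_accelerator, and offloading helpers are given the detected device explicitly instead of defaulting to CUDA. A minimal sketch of the resulting device-agnostic integration test, using only the public testing utilities (the checkpoint name, prompt, and assertion are illustrative, not taken from this series):

    import unittest

    import torch

    from diffusers import DiffusionPipeline
    from diffusers.utils.testing_utils import require_torch_accelerator, slow, torch_device


    @slow
    @require_torch_accelerator
    class ExamplePipelineIntegrationTests(unittest.TestCase):
        def test_example(self):
            # Illustrative checkpoint; torch_device resolves to "cuda", "xpu", or "cpu".
            pipe = DiffusionPipeline.from_pretrained("org/example-checkpoint", torch_dtype=torch.float16)
            # Offload to the detected accelerator rather than assuming CUDA,
            # mirroring the enable_model_cpu_offload(device=torch_device) change.
            pipe.enable_model_cpu_offload(device=torch_device)
            image = pipe("a prompt", num_inference_steps=2, output_type="np").images[0]
            self.assertEqual(image.shape[-1], 3)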
@@ -317,7 +317,7 @@ def test_allegro(self): generator = torch.Generator("cpu").manual_seed(0) pipe = AllegroPipeline.from_pretrained("rhymes-ai/Allegro", torch_dtype=torch.float16) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) prompt = self.prompt videos = pipe( diff --git a/tests/pipelines/animatediff/test_animatediff.py b/tests/pipelines/animatediff/test_animatediff.py index c382bb5b7f30..cc247f011bd9 100644 --- a/tests/pipelines/animatediff/test_animatediff.py +++ b/tests/pipelines/animatediff/test_animatediff.py @@ -22,7 +22,7 @@ from diffusers.utils.testing_utils import ( numpy_cosine_similarity_distance, require_accelerator, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -547,19 +547,25 @@ def test_vae_slicing(self): @slow -@require_torch_gpu +@require_torch_accelerator class AnimateDiffPipelineSlowTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - torch.cuda.empty_cache() + if torch_device == "cuda": + torch.cuda.empty_cache() + elif torch_device == "xpu": + torch.xpu.empty_cache() def tearDown(self): # clean up the VRAM after each test super().tearDown() gc.collect() - torch.cuda.empty_cache() + if torch_device == "cuda": + torch.cuda.empty_cache() + elif torch_device == "xpu": + torch.xpu.empty_cache() def test_animatediff(self): adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2") @@ -573,7 +579,7 @@ def test_animatediff(self): clip_sample=False, ) pipe.enable_vae_slicing() - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) prompt = "night, b&w photo of old house, post apocalypse, forest, storm weather, wind, rocks, 8k uhd, dslr, soft lighting, high quality, film grain" diff --git a/tests/pipelines/cogvideo/test_cogvideox.py b/tests/pipelines/cogvideo/test_cogvideox.py index 884ddfb2a95a..78fe9d4ef3be 100644 --- a/tests/pipelines/cogvideo/test_cogvideox.py +++ b/tests/pipelines/cogvideo/test_cogvideox.py @@ -24,7 +24,7 @@ from diffusers.utils.testing_utils import ( enable_full_determinism, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -321,7 +321,7 @@ def test_fused_qkv_projections(self): @slow -@require_torch_gpu +@require_torch_accelerator class CogVideoXPipelineIntegrationTests(unittest.TestCase): prompt = "A painting of a squirrel eating a burger." @@ -339,7 +339,7 @@ def test_cogvideox(self): generator = torch.Generator("cpu").manual_seed(0) pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) prompt = self.prompt videos = pipe( diff --git a/tests/pipelines/cogvideo/test_cogvideox_image2video.py b/tests/pipelines/cogvideo/test_cogvideox_image2video.py index f7e1fe7fd6c7..d1ce9880a6f0 100644 --- a/tests/pipelines/cogvideo/test_cogvideox_image2video.py +++ b/tests/pipelines/cogvideo/test_cogvideox_image2video.py @@ -26,7 +26,7 @@ from diffusers.utils.testing_utils import ( enable_full_determinism, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -344,25 +344,31 @@ def test_fused_qkv_projections(self): @slow -@require_torch_gpu +@require_torch_accelerator class CogVideoXImageToVideoPipelineIntegrationTests(unittest.TestCase): prompt = "A painting of a squirrel eating a burger." 
def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + if torch_device == "cuda": + torch.cuda.empty_cache() + elif torch_device == "xpu": + torch.xpu.empty_cache() def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + if torch_device == "cuda": + torch.cuda.empty_cache() + elif torch_device == "xpu": + torch.xpu.empty_cache() def test_cogvideox(self): generator = torch.Generator("cpu").manual_seed(0) pipe = CogVideoXImageToVideoPipeline.from_pretrained("THUDM/CogVideoX-5b-I2V", torch_dtype=torch.bfloat16) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) prompt = self.prompt image = load_image( diff --git a/tests/pipelines/cogview3/test_cogview3plus.py b/tests/pipelines/cogview3/test_cogview3plus.py index 8d56552ba5ee..dcb746e0a55d 100644 --- a/tests/pipelines/cogview3/test_cogview3plus.py +++ b/tests/pipelines/cogview3/test_cogview3plus.py @@ -24,7 +24,7 @@ from diffusers.utils.testing_utils import ( enable_full_determinism, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -232,7 +232,7 @@ def test_attention_slicing_forward_pass( @slow -@require_torch_gpu +@require_torch_accelerator class CogView3PlusPipelineIntegrationTests(unittest.TestCase): prompt = "A painting of a squirrel eating a burger." @@ -250,7 +250,7 @@ def test_cogview3plus(self): generator = torch.Generator("cpu").manual_seed(0) pipe = CogView3PlusPipeline.from_pretrained("THUDM/CogView3Plus-3b", torch_dtype=torch.float16) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) prompt = self.prompt images = pipe( diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index b12655d989d4..beb87729d685 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -40,6 +40,7 @@ load_image, load_numpy, require_torch_2, + require_torch_accelerator, require_torch_gpu, run_test_in_subprocess, slow, @@ -699,17 +700,23 @@ def test_save_pretrained_raise_not_implemented_exception(self): @slow -@require_torch_gpu +@require_torch_accelerator class ControlNetPipelineSlowTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + if torch_device == "cuda": + torch.cuda.empty_cache() + elif torch_device == "xpu": + torch.xpu.empty_cache() def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + if torch_device == "cuda": + torch.cuda.empty_cache() + elif torch_device == "xpu": + torch.xpu.empty_cache() def test_canny(self): controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") @@ -717,7 +724,7 @@ def test_canny(self): pipe = StableDiffusionControlNetPipeline.from_pretrained( "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -744,7 +751,7 @@ def test_depth(self): pipe = StableDiffusionControlNetPipeline.from_pretrained( "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -771,7 +778,7 @@ def test_hed(self): pipe = 
StableDiffusionControlNetPipeline.from_pretrained( "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -798,7 +805,7 @@ def test_mlsd(self): pipe = StableDiffusionControlNetPipeline.from_pretrained( "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -825,7 +832,7 @@ def test_normal(self): pipe = StableDiffusionControlNetPipeline.from_pretrained( "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -852,7 +859,7 @@ def test_openpose(self): pipe = StableDiffusionControlNetPipeline.from_pretrained( "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -879,7 +886,7 @@ def test_scribble(self): pipe = StableDiffusionControlNetPipeline.from_pretrained( "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(5) @@ -906,7 +913,7 @@ def test_seg(self): pipe = StableDiffusionControlNetPipeline.from_pretrained( "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(5) @@ -928,9 +935,14 @@ def test_seg(self): assert np.abs(expected_image - image).max() < 8e-2 def test_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() + if torch_device == "cuda": + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.reset_peak_memory_stats() + elif torch_device == "xpu": + torch.xpu.empty_cache() + torch.xpu.reset_max_memory_allocated() + torch.xpu.reset_peak_memory_stats() controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg") @@ -939,7 +951,7 @@ def test_sequential_cpu_offloading(self): ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() - pipe.enable_sequential_cpu_offload() + pipe.enable_sequential_cpu_offload(device=torch_device) prompt = "house" image = load_image( @@ -953,7 +965,10 @@ def test_sequential_cpu_offloading(self): output_type="np", ) - mem_bytes = torch.cuda.max_memory_allocated() + if torch_device == "cuda": + mem_bytes = torch.cuda.max_memory_allocated() + elif torch_device == "xpu": + mem_bytes = torch.xpu.max_memory_allocated() # make sure that less than 7 GB is allocated assert mem_bytes < 4 * 10**9 @@ -963,7 +978,7 @@ def test_canny_guess_mode(self): pipe = 
StableDiffusionControlNetPipeline.from_pretrained( "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -996,7 +1011,7 @@ def test_canny_guess_mode_euler(self): "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -1037,7 +1052,7 @@ def test_v11_shuffle_global_pool_conditions(self): pipe = StableDiffusionControlNetPipeline.from_pretrained( "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) diff --git a/tests/pipelines/controlnet/test_controlnet_img2img.py b/tests/pipelines/controlnet/test_controlnet_img2img.py index 7c4ae716b37d..a6d642501d56 100644 --- a/tests/pipelines/controlnet/test_controlnet_img2img.py +++ b/tests/pipelines/controlnet/test_controlnet_img2img.py @@ -39,7 +39,7 @@ enable_full_determinism, floats_tensor, load_numpy, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -391,7 +391,7 @@ def test_save_pretrained_raise_not_implemented_exception(self): @slow -@require_torch_gpu +@require_torch_accelerator class ControlNetImg2ImgPipelineSlowTests(unittest.TestCase): def setUp(self): super().setUp() @@ -409,7 +409,7 @@ def test_canny(self): pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained( "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint.py b/tests/pipelines/controlnet/test_controlnet_inpaint.py index e49106334c2e..127510a4f8b9 100644 --- a/tests/pipelines/controlnet/test_controlnet_inpaint.py +++ b/tests/pipelines/controlnet/test_controlnet_inpaint.py @@ -40,7 +40,7 @@ floats_tensor, load_numpy, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -443,7 +443,7 @@ def test_save_pretrained_raise_not_implemented_exception(self): @slow -@require_torch_gpu +@require_torch_accelerator class ControlNetInpaintPipelineSlowTests(unittest.TestCase): def setUp(self): super().setUp() @@ -461,7 +461,7 @@ def test_canny(self): pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained( "botp/stable-diffusion-v1-5-inpainting", safety_checker=None, controlnet=controlnet ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -507,7 +507,7 @@ def test_inpaint(self): "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) 
pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(33) diff --git a/tests/pipelines/controlnet/test_controlnet_sdxl.py b/tests/pipelines/controlnet/test_controlnet_sdxl.py index ea7fff5537a5..92a8f10eed4b 100644 --- a/tests/pipelines/controlnet/test_controlnet_sdxl.py +++ b/tests/pipelines/controlnet/test_controlnet_sdxl.py @@ -37,7 +37,7 @@ from diffusers.utils.testing_utils import ( enable_full_determinism, load_image, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -212,7 +212,7 @@ def test_inference_batch_single_identical(self): def test_save_load_optional_components(self): self._test_save_load_optional_components() - @require_torch_gpu + @require_torch_accelerator def test_stable_diffusion_xl_offloads(self): pipes = [] components = self.get_dummy_components() @@ -889,17 +889,23 @@ def test_negative_conditions(self): @slow -@require_torch_gpu +@require_torch_accelerator class ControlNetSDXLPipelineSlowTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + if torch_device == "cuda": + torch.cuda.empty_cache() + elif torch_device == "xpu": + torch.xpu.empty_cache() def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + if torch_device == "cuda": + torch.cuda.empty_cache() + elif torch_device == "xpu": + torch.xpu.empty_cache() def test_canny(self): controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-canny-sdxl-1.0") @@ -907,7 +913,7 @@ def test_canny(self): pipe = StableDiffusionXLControlNetPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet ) - pipe.enable_sequential_cpu_offload() + pipe.enable_sequential_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -930,7 +936,7 @@ def test_depth(self): pipe = StableDiffusionXLControlNetPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet ) - pipe.enable_sequential_cpu_offload() + pipe.enable_sequential_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) diff --git a/tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py b/tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py index 6a5976bd0dda..88708b5cd1ab 100644 --- a/tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py +++ b/tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py @@ -28,7 +28,12 @@ UNet2DConditionModel, ) from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, require_torch_gpu, torch_device +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + require_torch_accelerator, + torch_device, +) from ..pipeline_params import ( IMAGE_TO_IMAGE_IMAGE_PARAMS, @@ -241,7 +246,7 @@ def test_inference_batch_single_identical(self): def test_save_load_optional_components(self): pass - @require_torch_gpu + @require_torch_accelerator def test_stable_diffusion_xl_offloads(self): pipes = [] components = self.get_dummy_components() @@ -250,12 +255,12 @@ def test_stable_diffusion_xl_offloads(self): components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_model_cpu_offload() + sd_pipe.enable_model_cpu_offload(device=torch_device) pipes.append(sd_pipe) components = self.get_dummy_components() sd_pipe = 
self.pipeline_class(**components) - sd_pipe.enable_sequential_cpu_offload() + sd_pipe.enable_sequential_cpu_offload(device=torch_device) pipes.append(sd_pipe) image_slices = [] diff --git a/tests/pipelines/controlnet_flux/test_controlnet_flux.py b/tests/pipelines/controlnet_flux/test_controlnet_flux.py index 8202424e7f15..99490258468a 100644 --- a/tests/pipelines/controlnet_flux/test_controlnet_flux.py +++ b/tests/pipelines/controlnet_flux/test_controlnet_flux.py @@ -231,7 +231,7 @@ def test_canny(self): controlnet=controlnet, torch_dtype=torch.bfloat16, ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) diff --git a/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py b/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py index 5500c7bd1c81..eaab7a06a104 100644 --- a/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py +++ b/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py @@ -30,7 +30,7 @@ from diffusers.utils import load_image from diffusers.utils.testing_utils import ( enable_full_determinism, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -178,19 +178,25 @@ def test_save_load_optional_components(self): @slow -@require_torch_gpu +@require_torch_accelerator class HunyuanDiTControlNetPipelineSlowTests(unittest.TestCase): pipeline_class = HunyuanDiTControlNetPipeline def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + if torch_device == "cuda": + torch.cuda.empty_cache() + elif torch_device == "xpu": + torch.xpu.empty_cache() def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + if torch_device == "cuda": + torch.cuda.empty_cache() + elif torch_device == "xpu": + torch.xpu.empty_cache() def test_canny(self): controlnet = HunyuanDiT2DControlNetModel.from_pretrained( @@ -199,7 +205,7 @@ def test_canny(self): pipe = HunyuanDiTControlNetPipeline.from_pretrained( "Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers", controlnet=controlnet, torch_dtype=torch.float16 ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -238,7 +244,7 @@ def test_pose(self): pipe = HunyuanDiTControlNetPipeline.from_pretrained( "Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers", controlnet=controlnet, torch_dtype=torch.float16 ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -277,7 +283,7 @@ def test_depth(self): pipe = HunyuanDiTControlNetPipeline.from_pretrained( "Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers", controlnet=controlnet, torch_dtype=torch.float16 ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -318,7 +324,7 @@ def test_multi_controlnet(self): pipe = HunyuanDiTControlNetPipeline.from_pretrained( "Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers", controlnet=controlnet, torch_dtype=torch.float16 ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) diff --git 
a/tests/pipelines/controlnet_xs/test_controlnetxs.py b/tests/pipelines/controlnet_xs/test_controlnetxs.py index 508e5008a786..dcfb0e6d9935 100644 --- a/tests/pipelines/controlnet_xs/test_controlnetxs.py +++ b/tests/pipelines/controlnet_xs/test_controlnetxs.py @@ -40,7 +40,7 @@ load_numpy, require_accelerator, require_torch_2, - require_torch_gpu, + require_torch_accelerator, run_test_in_subprocess, slow, torch_device, @@ -92,7 +92,7 @@ def _test_stable_diffusion_compile(in_queue, out_queue, timeout): safety_checker=None, torch_dtype=torch.float16, ) - pipe.to("cuda") + pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe.unet.to(memory_format=torch.channels_last) @@ -334,12 +334,15 @@ def test_to_device(self): @slow -@require_torch_gpu +@require_torch_accelerator class ControlNetXSPipelineSlowTests(unittest.TestCase): def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + if torch_device == "cuda": + torch.cuda.empty_cache() + elif torch_device == "xpu": + torch.xpu.empty_cache() def test_canny(self): controlnet = ControlNetXSAdapter.from_pretrained( @@ -348,7 +351,7 @@ def test_canny(self): pipe = StableDiffusionControlNetXSPipeline.from_pretrained( "stabilityai/stable-diffusion-2-1-base", controlnet=controlnet, torch_dtype=torch.float16 ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -374,7 +377,7 @@ def test_depth(self): pipe = StableDiffusionControlNetXSPipeline.from_pretrained( "stabilityai/stable-diffusion-2-1-base", controlnet=controlnet, torch_dtype=torch.float16 ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) diff --git a/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py b/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py index 53cb070c9be4..9a41f18b17ef 100644 --- a/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py +++ b/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py @@ -31,7 +31,13 @@ UNet2DConditionModel, ) from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import enable_full_determinism, load_image, require_torch_gpu, slow, torch_device +from diffusers.utils.testing_utils import ( + enable_full_determinism, + load_image, + require_torch_accelerator, + slow, + torch_device, +) from diffusers.utils.torch_utils import randn_tensor from ...models.autoencoders.vae import ( @@ -192,7 +198,7 @@ def test_xformers_attention_forwardGenerator_pass(self): def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical(expected_max_diff=2e-3) - @require_torch_gpu + @require_torch_accelerator # Copied from test_controlnet_sdxl.py def test_stable_diffusion_xl_offloads(self): pipes = [] @@ -202,12 +208,12 @@ def test_stable_diffusion_xl_offloads(self): components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_model_cpu_offload() + sd_pipe.enable_model_cpu_offload(device=torch_device) pipes.append(sd_pipe) components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_sequential_cpu_offload() + sd_pipe.enable_sequential_cpu_offload(device=torch_device) pipes.append(sd_pipe) image_slices = [] @@ -369,12 +375,15 @@ def test_multi_vae(self): @slow -@require_torch_gpu 
+@require_torch_accelerator class StableDiffusionXLControlNetXSPipelineSlowTests(unittest.TestCase): def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + if torch_device == "cuda": + torch.cuda.empty_cache() + elif torch_device == "xpu": + torch.xpu.empty_cache() def test_canny(self): controlnet = ControlNetXSAdapter.from_pretrained( @@ -383,7 +392,7 @@ def test_canny(self): pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16 ) - pipe.enable_sequential_cpu_offload() + pipe.enable_sequential_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -407,7 +416,7 @@ def test_depth(self): pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16 ) - pipe.enable_sequential_cpu_offload() + pipe.enable_sequential_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) diff --git a/tests/pipelines/ddim/test_ddim.py b/tests/pipelines/ddim/test_ddim.py index 2078a592ceca..f7e0093c515a 100644 --- a/tests/pipelines/ddim/test_ddim.py +++ b/tests/pipelines/ddim/test_ddim.py @@ -19,7 +19,7 @@ import torch from diffusers import DDIMPipeline, DDIMScheduler, UNet2DModel -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_accelerator, slow, torch_device from ..pipeline_params import UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS, UNCONDITIONAL_IMAGE_GENERATION_PARAMS from ..test_pipelines_common import PipelineTesterMixin @@ -99,7 +99,7 @@ def test_inference_batch_single_identical(self): @slow -@require_torch_gpu +@require_torch_accelerator class DDIMPipelineIntegrationTests(unittest.TestCase): def test_inference_cifar10(self): model_id = "google/ddpm-cifar10-32" diff --git a/tests/pipelines/ddpm/test_ddpm.py b/tests/pipelines/ddpm/test_ddpm.py index f6d0821da4c2..750885db2c23 100644 --- a/tests/pipelines/ddpm/test_ddpm.py +++ b/tests/pipelines/ddpm/test_ddpm.py @@ -19,7 +19,7 @@ import torch from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_accelerator, slow, torch_device enable_full_determinism() @@ -88,7 +88,7 @@ def test_inference_predict_sample(self): @slow -@require_torch_gpu +@require_torch_accelerator class DDPMPipelineIntegrationTests(unittest.TestCase): def test_inference_cifar10(self): model_id = "google/ddpm-cifar10-32" diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img.py b/tests/pipelines/deepfloyd_if/test_if_img2img.py index 26ac42831b8b..b98c1dd61f4f 100644 --- a/tests/pipelines/deepfloyd_if/test_if_img2img.py +++ b/tests/pipelines/deepfloyd_if/test_if_img2img.py @@ -26,7 +26,7 @@ floats_tensor, load_numpy, require_accelerator, - require_torch_gpu, + require_torch_accelerator, skip_mps, slow, torch_device, @@ -102,7 +102,7 @@ def test_inference_batch_single_identical(self): @slow -@require_torch_gpu +@require_torch_accelerator class IFImg2ImgPipelineSlowTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test @@ -123,11 +123,16 @@ def test_if_img2img(self): 
torch_dtype=torch.float16, ) pipe.unet.set_attn_processor(AttnAddedKVProcessor()) - pipe.enable_model_cpu_offload() - - torch.cuda.reset_max_memory_allocated() - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() + pipe.enable_model_cpu_offload(device=torch_device) + + if torch_device == "cuda": + torch.cuda.reset_max_memory_allocated() + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + elif torch_device == "xpu": + torch.xpu.reset_max_memory_allocated() + torch.xpu.empty_cache() + torch.xpu.reset_peak_memory_stats() image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) generator = torch.Generator(device="cpu").manual_seed(0) diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py index 1d1244c96c33..061121a1792b 100644 --- a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py @@ -26,7 +26,7 @@ floats_tensor, load_numpy, require_accelerator, - require_torch_gpu, + require_torch_accelerator, skip_mps, slow, torch_device, @@ -99,13 +99,16 @@ def test_inference_batch_single_identical(self): @slow -@require_torch_gpu +@require_torch_accelerator class IFImg2ImgSuperResolutionPipelineSlowTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - torch.cuda.empty_cache() + if torch_device == "cuda": + torch.cuda.empty_cache() + elif torch_device == "xpu": + torch.xpu.empty_cache() def tearDown(self): # clean up the VRAM after each test @@ -120,11 +123,16 @@ def test_if_img2img_superresolution(self): torch_dtype=torch.float16, ) pipe.unet.set_attn_processor(AttnAddedKVProcessor()) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) - torch.cuda.reset_max_memory_allocated() - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() + if torch_device == "cuda": + torch.cuda.reset_max_memory_allocated() + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + elif torch_device == "xpu": + torch.xpu.reset_max_memory_allocated() + torch.xpu.empty_cache() + torch.xpu.reset_peak_memory_stats() generator = torch.Generator(device="cpu").manual_seed(0) @@ -144,7 +152,11 @@ def test_if_img2img_superresolution(self): assert image.shape == (256, 256, 3) - mem_bytes = torch.cuda.max_memory_allocated() + if torch_device == "cuda": + mem_bytes = torch.cuda.max_memory_allocated() + elif torch_device == "xpu": + mem_bytes = torch.xpu.max_memory_allocated() + assert mem_bytes < 12 * 10**9 expected_image = load_numpy( diff --git a/tests/pipelines/hunyuan_dit/test_hunyuan_dit.py b/tests/pipelines/hunyuan_dit/test_hunyuan_dit.py index 653cb41e4bc4..b295b280a560 100644 --- a/tests/pipelines/hunyuan_dit/test_hunyuan_dit.py +++ b/tests/pipelines/hunyuan_dit/test_hunyuan_dit.py @@ -30,7 +30,7 @@ from diffusers.utils.testing_utils import ( enable_full_determinism, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -299,7 +299,7 @@ def test_fused_qkv_projections(self): @slow -@require_torch_gpu +@require_torch_accelerator class HunyuanDiTPipelineIntegrationTests(unittest.TestCase): prompt = "一个宇航员在骑马" @@ -319,7 +319,7 @@ def test_hunyuan_dit_1024(self): pipe = HunyuanDiTPipeline.from_pretrained( "XCLiu/HunyuanDiT-0523", revision="refs/pr/2", torch_dtype=torch.float16 ) - pipe.enable_model_cpu_offload() + 
pipe.enable_model_cpu_offload(device=torch_device) prompt = self.prompt image = pipe( diff --git a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py index 592ebd35f4a9..a283e2862cb2 100644 --- a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py +++ b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py @@ -39,7 +39,7 @@ enable_full_determinism, floats_tensor, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, skip_mps, slow, torch_device, @@ -226,23 +226,29 @@ def test_num_videos_per_prompt(self): @slow -@require_torch_gpu +@require_torch_accelerator class I2VGenXLPipelineSlowTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - torch.cuda.empty_cache() + if torch_device == "cuda": + torch.cuda.empty_cache() + elif torch_device == "xpu": + torch.xpu.empty_cache() def tearDown(self): # clean up the VRAM after each test super().tearDown() gc.collect() - torch.cuda.empty_cache() + if torch_device == "cuda": + torch.cuda.empty_cache() + elif torch_device == "xpu": + torch.xpu.empty_cache() def test_i2vgen_xl(self): pipe = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16") - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png?download=true" diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 423c82e0602e..9cdca0354a5d 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -77,7 +77,7 @@ require_flax, require_onnxruntime, require_torch_2, - require_torch_gpu, + require_torch_accelerator, run_test_in_subprocess, slow, torch_device, @@ -1136,7 +1136,7 @@ def test_custom_model_and_pipeline(self): assert conf_1 == conf_2 @slow - @require_torch_gpu + @require_torch_accelerator def test_download_from_git(self): # Because adaptive_avg_pool2d_backward_cuda # does not have a deterministic implementation. 
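The decorator swap in these hunks is what lets the slow suites run on XPU as well as CUDA. The real decorators live in diffusers.utils.testing_utils and are not part of this series; the following is only an illustrative approximation of the intended semantics, assuming torch_device has been resolved as in patch 05:

    import unittest

    from diffusers.utils.testing_utils import torch_device


    def require_torch_accelerator(test_case):
        # Approximation: skip unless the resolved backend is a non-CPU device.
        # The actual decorator in testing_utils may differ in detail.
        return unittest.skipUnless(torch_device != "cpu", "test requires an accelerator")(test_case)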
@@ -1350,7 +1350,7 @@ def test_stable_diffusion_components(self): assert image_img2img.shape == (1, 32, 32, 3) assert image_text2img.shape == (1, 64, 64, 3) - @require_torch_gpu + @require_torch_accelerator def test_pipe_false_offload_warn(self): unet = self.dummy_cond_unet() scheduler = PNDMScheduler(skip_prk_steps=True) @@ -1814,19 +1814,25 @@ def test_wrong_model(self): @slow -@require_torch_gpu +@require_torch_accelerator class PipelineSlowTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - torch.cuda.empty_cache() + if torch_device == "cuda": + torch.cuda.empty_cache() + elif torch_device == "xpu": + torch.xpu.empty_cache() def tearDown(self): # clean up the VRAM after each test super().tearDown() gc.collect() - torch.cuda.empty_cache() + if torch_device == "cuda": + torch.cuda.empty_cache() + elif torch_device == "xpu": + torch.xpu.empty_cache() def test_smart_download(self): model_id = "hf-internal-testing/unet-pipeline-dummy" @@ -2045,13 +2051,16 @@ def test_weighted_prompts_compel(self): @nightly -@require_torch_gpu +@require_torch_accelerator class PipelineNightlyTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - torch.cuda.empty_cache() + if torch_device == "cuda": + torch.cuda.empty_cache() + elif torch_device == "xpu": + torch.xpu.empty_cache() def tearDown(self): # clean up the VRAM after each test From 88919c01584594b4959950754a88c7f7afd75b83 Mon Sep 17 00:00:00 2001 From: "Lin, Fanli" <fanli.lin@intel.com> Date: Thu, 2 Jan 2025 21:59:55 -0800 Subject: [PATCH 02/33] fix empty cache --- .../pipelines/animatediff/test_animatediff.py | 11 +++-------- .../cogvideo/test_cogvideox_image2video.py | 11 +++-------- tests/pipelines/controlnet/test_controlnet.py | 11 +++-------- .../controlnet/test_controlnet_sdxl.py | 11 +++-------- .../test_controlnet_hunyuandit.py | 11 +++-------- .../controlnet_xs/test_controlnetxs.py | 6 ++---- .../controlnet_xs/test_controlnetxs_sdxl.py | 6 ++---- .../test_if_img2img_superresolution.py | 6 ++---- tests/pipelines/i2vgen_xl/test_i2vgenxl.py | 11 +++-------- tests/pipelines/test_pipelines.py | 18 +++++------------- 10 files changed, 29 insertions(+), 73 deletions(-) diff --git a/tests/pipelines/animatediff/test_animatediff.py b/tests/pipelines/animatediff/test_animatediff.py index cc247f011bd9..c7411a7145c5 100644 --- a/tests/pipelines/animatediff/test_animatediff.py +++ b/tests/pipelines/animatediff/test_animatediff.py @@ -20,6 +20,7 @@ from diffusers.models.attention import FreeNoiseTransformerBlock from diffusers.utils import is_xformers_available, logging from diffusers.utils.testing_utils import ( + backend_empty_cache, numpy_cosine_similarity_distance, require_accelerator, require_torch_accelerator, @@ -553,19 +554,13 @@ def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - if torch_device == "cuda": - torch.cuda.empty_cache() - elif torch_device == "xpu": - torch.xpu.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() gc.collect() - if torch_device == "cuda": - torch.cuda.empty_cache() - elif torch_device == "xpu": - torch.xpu.empty_cache() + backend_empty_cache(torch_device) def test_animatediff(self): adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2") diff --git a/tests/pipelines/cogvideo/test_cogvideox_image2video.py b/tests/pipelines/cogvideo/test_cogvideox_image2video.py index 
d1ce9880a6f0..cac47f1a83d4 100644 --- a/tests/pipelines/cogvideo/test_cogvideox_image2video.py +++ b/tests/pipelines/cogvideo/test_cogvideox_image2video.py @@ -24,6 +24,7 @@ from diffusers import AutoencoderKLCogVideoX, CogVideoXImageToVideoPipeline, CogVideoXTransformer3DModel, DDIMScheduler from diffusers.utils import load_image from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, numpy_cosine_similarity_distance, require_torch_accelerator, @@ -351,18 +352,12 @@ class CogVideoXImageToVideoPipelineIntegrationTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - if torch_device == "cuda": - torch.cuda.empty_cache() - elif torch_device == "xpu": - torch.xpu.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - if torch_device == "cuda": - torch.cuda.empty_cache() - elif torch_device == "xpu": - torch.xpu.empty_cache() + backend_empty_cache(torch_device) def test_cogvideox(self): generator = torch.Generator("cpu").manual_seed(0) diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index beb87729d685..44b4d8cea711 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -34,6 +34,7 @@ from diffusers.pipelines.controlnet.pipeline_controlnet import MultiControlNetModel from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, get_python_version, is_torch_compile, @@ -705,18 +706,12 @@ class ControlNetPipelineSlowTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - if torch_device == "cuda": - torch.cuda.empty_cache() - elif torch_device == "xpu": - torch.xpu.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - if torch_device == "cuda": - torch.cuda.empty_cache() - elif torch_device == "xpu": - torch.xpu.empty_cache() + backend_empty_cache(torch_device) def test_canny(self): controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") diff --git a/tests/pipelines/controlnet/test_controlnet_sdxl.py b/tests/pipelines/controlnet/test_controlnet_sdxl.py index 92a8f10eed4b..85924af050b0 100644 --- a/tests/pipelines/controlnet/test_controlnet_sdxl.py +++ b/tests/pipelines/controlnet/test_controlnet_sdxl.py @@ -35,6 +35,7 @@ from diffusers.pipelines.controlnet.pipeline_controlnet import MultiControlNetModel from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, load_image, require_torch_accelerator, @@ -894,18 +895,12 @@ class ControlNetSDXLPipelineSlowTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - if torch_device == "cuda": - torch.cuda.empty_cache() - elif torch_device == "xpu": - torch.xpu.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - if torch_device == "cuda": - torch.cuda.empty_cache() - elif torch_device == "xpu": - torch.xpu.empty_cache() + backend_empty_cache(torch_device) def test_canny(self): controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-canny-sdxl-1.0") diff --git a/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py b/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py index eaab7a06a104..30dfe94e50f1 100644 --- 
a/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py +++ b/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py @@ -29,6 +29,7 @@ from diffusers.models import HunyuanDiT2DControlNetModel, HunyuanDiT2DMultiControlNetModel from diffusers.utils import load_image from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, require_torch_accelerator, slow, @@ -185,18 +186,12 @@ class HunyuanDiTControlNetPipelineSlowTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - if torch_device == "cuda": - torch.cuda.empty_cache() - elif torch_device == "xpu": - torch.xpu.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - if torch_device == "cuda": - torch.cuda.empty_cache() - elif torch_device == "xpu": - torch.xpu.empty_cache() + backend_empty_cache(torch_device) def test_canny(self): controlnet = HunyuanDiT2DControlNetModel.from_pretrained( diff --git a/tests/pipelines/controlnet_xs/test_controlnetxs.py b/tests/pipelines/controlnet_xs/test_controlnetxs.py index dcfb0e6d9935..6d53d0618959 100644 --- a/tests/pipelines/controlnet_xs/test_controlnetxs.py +++ b/tests/pipelines/controlnet_xs/test_controlnetxs.py @@ -34,6 +34,7 @@ ) from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, is_torch_compile, load_image, @@ -339,10 +340,7 @@ class ControlNetXSPipelineSlowTests(unittest.TestCase): def tearDown(self): super().tearDown() gc.collect() - if torch_device == "cuda": - torch.cuda.empty_cache() - elif torch_device == "xpu": - torch.xpu.empty_cache() + backend_empty_cache(torch_device) def test_canny(self): controlnet = ControlNetXSAdapter.from_pretrained( diff --git a/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py b/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py index 9a41f18b17ef..d7ecf92f41cd 100644 --- a/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py +++ b/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py @@ -32,6 +32,7 @@ ) from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, load_image, require_torch_accelerator, @@ -380,10 +381,7 @@ class StableDiffusionXLControlNetXSPipelineSlowTests(unittest.TestCase): def tearDown(self): super().tearDown() gc.collect() - if torch_device == "cuda": - torch.cuda.empty_cache() - elif torch_device == "xpu": - torch.xpu.empty_cache() + backend_empty_cache(torch_device) def test_canny(self): controlnet = ControlNetXSAdapter.from_pretrained( diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py index 061121a1792b..328e22d27c74 100644 --- a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py @@ -23,6 +23,7 @@ from diffusers.models.attention_processor import AttnAddedKVProcessor from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( + backend_empty_cache, floats_tensor, load_numpy, require_accelerator, @@ -105,10 +106,7 @@ def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - if torch_device == "cuda": - torch.cuda.empty_cache() - elif torch_device == "xpu": - torch.xpu.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM 
after each test diff --git a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py index a283e2862cb2..ae29a34a3320 100644 --- a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py +++ b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py @@ -36,6 +36,7 @@ from diffusers.models.unets import I2VGenXLUNet from diffusers.utils import is_xformers_available, load_image from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, floats_tensor, numpy_cosine_similarity_distance, @@ -232,19 +233,13 @@ def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - if torch_device == "cuda": - torch.cuda.empty_cache() - elif torch_device == "xpu": - torch.xpu.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() gc.collect() - if torch_device == "cuda": - torch.cuda.empty_cache() - elif torch_device == "xpu": - torch.xpu.empty_cache() + backend_empty_cache(torch_device) def test_i2vgen_xl(self): pipe = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16") diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 9cdca0354a5d..b875d17108b6 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -66,6 +66,7 @@ ) from diffusers.utils.testing_utils import ( CaptureLogger, + backend_empty_cache, enable_full_determinism, floats_tensor, get_python_version, @@ -1820,19 +1821,13 @@ def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - if torch_device == "cuda": - torch.cuda.empty_cache() - elif torch_device == "xpu": - torch.xpu.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() gc.collect() - if torch_device == "cuda": - torch.cuda.empty_cache() - elif torch_device == "xpu": - torch.xpu.empty_cache() + backend_empty_cache(torch_device) def test_smart_download(self): model_id = "hf-internal-testing/unet-pipeline-dummy" @@ -2057,16 +2052,13 @@ def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - if torch_device == "cuda": - torch.cuda.empty_cache() - elif torch_device == "xpu": - torch.xpu.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_ddpm_ddim_equality_batched(self): seed = 0 From e32a9ac7517a0e13aea54ae21935315a0562568f Mon Sep 17 00:00:00 2001 From: "Lin, Fanli" <fanli.lin@intel.com> Date: Thu, 2 Jan 2025 22:02:48 -0800 Subject: [PATCH 03/33] fix one more --- tests/pipelines/controlnet/test_controlnet.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index 44b4d8cea711..5b09f6f7decd 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -1074,17 +1074,17 @@ def test_v11_shuffle_global_pool_conditions(self): @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionMultiControlNetPipelineSlowTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_pose_and_canny(self): 
controlnet_canny = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") @@ -1095,7 +1095,7 @@ def test_pose_and_canny(self): safety_checker=None, controlnet=[controlnet_pose, controlnet_canny], ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) From cb7d9d5e8b1fd871598395ddbf34f288deb9f27a Mon Sep 17 00:00:00 2001 From: "Lin, Fanli" <fanli.lin@intel.com> Date: Thu, 2 Jan 2025 22:04:05 -0800 Subject: [PATCH 04/33] fix style --- tests/pipelines/controlnet/test_controlnet.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index 5b09f6f7decd..98545879f6a9 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -42,7 +42,6 @@ load_numpy, require_torch_2, require_torch_accelerator, - require_torch_gpu, run_test_in_subprocess, slow, torch_device, From a393860b52b10ebe4198a224f7718d0869fd1bcd Mon Sep 17 00:00:00 2001 From: "Lin, Fanli" <fanli.lin@intel.com> Date: Sun, 5 Jan 2025 18:17:25 -0800 Subject: [PATCH 05/33] update device functions --- src/diffusers/utils/testing_utils.py | 52 ++++++++++++++++++++++--- tests/pipelines/deepfloyd_if/test_if.py | 18 ++++----- 2 files changed, 54 insertions(+), 16 deletions(-) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 3ae74cddcbbf..626d20e1c239 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -1,4 +1,5 @@ import functools +import gc import importlib import importlib.metadata import inspect @@ -86,7 +87,12 @@ ) from e logger.info(f"torch_device overrode to {torch_device}") else: - torch_device = "cuda" if torch.cuda.is_available() else "cpu" + if torch.cuda.is_available(): + torch_device = "cuda" + elif torch.xpu.is_available(): + torch_device = "xpu" + else: + torch_device = "cpu" is_torch_higher_equal_than_1_12 = version.parse( version.parse(torch.__version__).base_version ) >= version.parse("1.12") @@ -1055,12 +1061,34 @@ def _is_torch_fp64_available(device): # Guard these lookups for when Torch is not used - alternative accelerator support is for PyTorch if is_torch_available(): # Behaviour flags - BACKEND_SUPPORTS_TRAINING = {"cuda": True, "cpu": True, "mps": False, "default": True} + BACKEND_SUPPORTS_TRAINING = {"cuda": True, "xpu": True, "cpu": True, "mps": False, "default": True} # Function definitions - BACKEND_EMPTY_CACHE = {"cuda": torch.cuda.empty_cache, "cpu": None, "mps": None, "default": None} - BACKEND_DEVICE_COUNT = {"cuda": torch.cuda.device_count, "cpu": lambda: 0, "mps": lambda: 0, "default": 0} - BACKEND_MANUAL_SEED = {"cuda": torch.cuda.manual_seed, "cpu": torch.manual_seed, "default": torch.manual_seed} + BACKEND_EMPTY_CACHE = { + "cuda": torch.cuda.empty_cache, + "xpu": torch.xpu.empty_cache, + "cpu": None, + "mps": None, + "default": None, + } + BACKEND_DEVICE_COUNT = { + "cuda": torch.cuda.device_count, + "xpu": torch.xpu.device_count, + "cpu": lambda: 0, + "mps": lambda: 0, + "default": 0, + } + BACKEND_MANUAL_SEED = { + "cuda": torch.cuda.manual_seed, + "xpu": torch.xpu.manual_seed, + "cpu": torch.manual_seed, + "default": torch.manual_seed, + } + BACKEND_RESET_PEAK_MEMORY_STATS = { + "cuda": torch.cuda.reset_peak_memory_stats, + "xpu": torch.xpu.reset_peak_memory_stats, + "default": None, + } # This dispatches a defined function
according to the accelerator from the function definitions. @@ -1091,6 +1119,10 @@ def backend_device_count(device: str): return _device_agnostic_dispatch(device, BACKEND_DEVICE_COUNT) +def backend_reset_peak_memory(device: str): + return _device_agnostic_dispatch(device, BACKEND_RESET_PEAK_MEMORY_STATS) + + # These are callables which return boolean behaviour flags and can be used to specify some # device agnostic alternative where the feature is unsupported. def backend_supports_training(device: str): @@ -1147,3 +1179,13 @@ def update_mapping_from_spec(device_fn_dict: Dict[str, Callable], attribute_name update_mapping_from_spec(BACKEND_EMPTY_CACHE, "EMPTY_CACHE_FN") update_mapping_from_spec(BACKEND_DEVICE_COUNT, "DEVICE_COUNT_FN") update_mapping_from_spec(BACKEND_SUPPORTS_TRAINING, "SUPPORTS_TRAINING") + update_mapping_from_spec(BACKEND_RESET_PEAK_MEMORY_STATS, "RESET_PEAK_MEM_STATS") + + +@require_torch +def flush_memory(device: str, gc_collect=False, reset_mem_stats=False): + if gc_collect: + gc.collect() + if reset_mem_stats: + backend_reset_peak_memory(device) + backend_empty_cache(device) diff --git a/tests/pipelines/deepfloyd_if/test_if.py b/tests/pipelines/deepfloyd_if/test_if.py index 13a05855f145..7336addeb2fe 100644 --- a/tests/pipelines/deepfloyd_if/test_if.py +++ b/tests/pipelines/deepfloyd_if/test_if.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import gc import unittest import torch @@ -24,9 +23,10 @@ from diffusers.models.attention_processor import AttnAddedKVProcessor from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( + flush_memory, load_numpy, require_accelerator, - require_torch_gpu, + require_torch_accelerator, skip_mps, slow, torch_device, @@ -91,28 +91,24 @@ def test_xformers_attention_forwardGenerator_pass(self): @slow -@require_torch_gpu +@require_torch_accelerator class IFPipelineSlowTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() - gc.collect() - torch.cuda.empty_cache() + flush_memory(torch_device, gc_collect=True) def tearDown(self): # clean up the VRAM after each test super().tearDown() - gc.collect() - torch.cuda.empty_cache() + flush_memory(torch_device, gc_collect=True) def test_if_text_to_image(self): pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) pipe.unet.set_attn_processor(AttnAddedKVProcessor()) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) - torch.cuda.reset_max_memory_allocated() - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() + flush_memory(torch_device, reset_mem_stats=True) generator = torch.Generator(device="cpu").manual_seed(0) output = pipe( From 2f3ad323e3dfe72649d3e099f427377faa0b26ea Mon Sep 17 00:00:00 2001 From: "Lin, Fanli" <fanli.lin@intel.com> Date: Sun, 5 Jan 2025 21:32:17 -0800 Subject: [PATCH 06/33] update --- .../pipelines/animatediff/test_animatediff.py | 9 +++---- .../cogvideo/test_cogvideox_image2video.py | 9 +++---- tests/pipelines/controlnet/test_controlnet.py | 24 +++++-------------- .../controlnet/test_controlnet_sdxl.py | 9 +++---- .../test_controlnet_hunyuandit.py | 9 +++---- .../controlnet_xs/test_controlnetxs.py | 6 ++--- .../controlnet_xs/test_controlnetxs_sdxl.py | 6 ++--- .../pipelines/deepfloyd_if/test_if_img2img.py | 10 ++------ .../test_if_img2img_superresolution.py | 18 ++++---------- 
tests/pipelines/i2vgen_xl/test_i2vgenxl.py | 9 +++---- tests/pipelines/test_pipelines.py | 14 ++++------- 11 files changed, 36 insertions(+), 87 deletions(-) diff --git a/tests/pipelines/animatediff/test_animatediff.py b/tests/pipelines/animatediff/test_animatediff.py index c7411a7145c5..f283a2b9e57c 100644 --- a/tests/pipelines/animatediff/test_animatediff.py +++ b/tests/pipelines/animatediff/test_animatediff.py @@ -1,4 +1,3 @@ -import gc import unittest import numpy as np @@ -20,7 +19,7 @@ from diffusers.models.attention import FreeNoiseTransformerBlock from diffusers.utils import is_xformers_available, logging from diffusers.utils.testing_utils import ( - backend_empty_cache, + flush_memory, numpy_cosine_similarity_distance, require_accelerator, require_torch_accelerator, @@ -553,14 +552,12 @@ class AnimateDiffPipelineSlowTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() - gc.collect() - backend_empty_cache(torch_device) + flush_memory(torch_device, gc_collect=True) def tearDown(self): # clean up the VRAM after each test super().tearDown() - gc.collect() - backend_empty_cache(torch_device) + flush_memory(torch_device, gc_collect=True) def test_animatediff(self): adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2") diff --git a/tests/pipelines/cogvideo/test_cogvideox_image2video.py b/tests/pipelines/cogvideo/test_cogvideox_image2video.py index cac47f1a83d4..b4d9511ecbf8 100644 --- a/tests/pipelines/cogvideo/test_cogvideox_image2video.py +++ b/tests/pipelines/cogvideo/test_cogvideox_image2video.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import gc import inspect import unittest @@ -24,8 +23,8 @@ from diffusers import AutoencoderKLCogVideoX, CogVideoXImageToVideoPipeline, CogVideoXTransformer3DModel, DDIMScheduler from diffusers.utils import load_image from diffusers.utils.testing_utils import ( - backend_empty_cache, enable_full_determinism, + flush_memory, numpy_cosine_similarity_distance, require_torch_accelerator, slow, @@ -351,13 +350,11 @@ class CogVideoXImageToVideoPipelineIntegrationTests(unittest.TestCase): def setUp(self): super().setUp() - gc.collect() - backend_empty_cache(torch_device) + flush_memory(torch_device, gc_collect=True) def tearDown(self): super().tearDown() - gc.collect() - backend_empty_cache(torch_device) + flush_memory(torch_device, gc_collect=True) def test_cogvideox(self): generator = torch.Generator("cpu").manual_seed(0) diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index 98545879f6a9..5e765a8ac4bd 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
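
# --- aside: a minimal sketch of the dispatch pattern the hunks above rely on.
# The BACKEND_* tables introduced in PATCH 05 map a device string to a callable
# (or to None for no-op backends), and tests call one device-agnostic wrapper
# instead of branching on "cuda"/"xpu" themselves. This is a simplified model,
# not the exact diffusers implementation:

import torch

BACKEND_EMPTY_CACHE = {"cuda": torch.cuda.empty_cache, "cpu": None, "default": None}

def _dispatch(device, table, *args, **kwargs):
    fn = table.get(device, table["default"])
    if fn is None:  # no-op entry, e.g. "cpu"
        return None
    return fn(*args, **kwargs)

def backend_empty_cache(device):
    return _dispatch(device, BACKEND_EMPTY_CACHE)

backend_empty_cache("cuda" if torch.cuda.is_available() else "cpu")
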
-import gc import tempfile import traceback import unittest @@ -34,8 +33,8 @@ from diffusers.pipelines.controlnet.pipeline_controlnet import MultiControlNetModel from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( - backend_empty_cache, enable_full_determinism, + flush_memory, get_python_version, is_torch_compile, load_image, @@ -704,13 +703,11 @@ def test_save_pretrained_raise_not_implemented_exception(self): class ControlNetPipelineSlowTests(unittest.TestCase): def setUp(self): super().setUp() - gc.collect() - backend_empty_cache(torch_device) + flush_memory(torch_device, gc_collect=True) def tearDown(self): super().tearDown() - gc.collect() - backend_empty_cache(torch_device) + flush_memory(torch_device, gc_collect=True) def test_canny(self): controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") @@ -929,14 +926,7 @@ def test_seg(self): assert np.abs(expected_image - image).max() < 8e-2 def test_sequential_cpu_offloading(self): - if torch_device == "cuda": - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - elif torch_device == "xpu": - torch.xpu.empty_cache() - torch.xpu.reset_max_memory_allocated() - torch.xpu.reset_peak_memory_stats() + flush_memory(torch_device, gc_collect=True, reset_mem_stats=True) controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg") @@ -1077,13 +1067,11 @@ def test_v11_shuffle_global_pool_conditions(self): class StableDiffusionMultiControlNetPipelineSlowTests(unittest.TestCase): def setUp(self): super().setUp() - gc.collect() - backend_empty_cache(torch_device) + flush_memory(torch_device, gc_collect=True) def tearDown(self): super().tearDown() - gc.collect() - backend_empty_cache(torch_device) + flush_memory(torch_device, gc_collect=True) def test_pose_and_canny(self): controlnet_canny = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") diff --git a/tests/pipelines/controlnet/test_controlnet_sdxl.py b/tests/pipelines/controlnet/test_controlnet_sdxl.py index 85924af050b0..d9af9931cbd6 100644 --- a/tests/pipelines/controlnet/test_controlnet_sdxl.py +++ b/tests/pipelines/controlnet/test_controlnet_sdxl.py @@ -14,7 +14,6 @@ # limitations under the License. 
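
# --- aside: why these diffs thread device=torch_device into
# enable_model_cpu_offload. The offload hooks default to CUDA, so on an XPU
# (or other non-CUDA) runner the submodules would be shuttled to the wrong
# accelerator. A hedged usage sketch; the checkpoint id is only an example:

import torch
from diffusers import DiffusionPipeline

device = "xpu" if hasattr(torch, "xpu") and torch.xpu.is_available() else "cuda"
pipe = DiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
)
# each submodule is moved to `device` just-in-time and back to CPU afterwards
pipe.enable_model_cpu_offload(device=device)
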
import copy -import gc import unittest import numpy as np @@ -35,8 +34,8 @@ from diffusers.pipelines.controlnet.pipeline_controlnet import MultiControlNetModel from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( - backend_empty_cache, enable_full_determinism, + flush_memory, load_image, require_torch_accelerator, slow, @@ -894,13 +893,11 @@ def test_negative_conditions(self): class ControlNetSDXLPipelineSlowTests(unittest.TestCase): def setUp(self): super().setUp() - gc.collect() - backend_empty_cache(torch_device) + flush_memory(torch_device, gc_collect=True) def tearDown(self): super().tearDown() - gc.collect() - backend_empty_cache(torch_device) + flush_memory(torch_device, gc_collect=True) def test_canny(self): controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-canny-sdxl-1.0") diff --git a/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py b/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py index 30dfe94e50f1..e9550b8dd79b 100644 --- a/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py +++ b/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import gc import unittest import numpy as np @@ -29,8 +28,8 @@ from diffusers.models import HunyuanDiT2DControlNetModel, HunyuanDiT2DMultiControlNetModel from diffusers.utils import load_image from diffusers.utils.testing_utils import ( - backend_empty_cache, enable_full_determinism, + flush_memory, require_torch_accelerator, slow, torch_device, @@ -185,13 +184,11 @@ class HunyuanDiTControlNetPipelineSlowTests(unittest.TestCase): def setUp(self): super().setUp() - gc.collect() - backend_empty_cache(torch_device) + flush_memory(torch_device, gc_collect=True) def tearDown(self): super().tearDown() - gc.collect() - backend_empty_cache(torch_device) + flush_memory(torch_device, gc_collect=True) def test_canny(self): controlnet = HunyuanDiT2DControlNetModel.from_pretrained( diff --git a/tests/pipelines/controlnet_xs/test_controlnetxs.py b/tests/pipelines/controlnet_xs/test_controlnetxs.py index 6d53d0618959..12df19c74ab9 100644 --- a/tests/pipelines/controlnet_xs/test_controlnetxs.py +++ b/tests/pipelines/controlnet_xs/test_controlnetxs.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import gc import traceback import unittest @@ -34,8 +33,8 @@ ) from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( - backend_empty_cache, enable_full_determinism, + flush_memory, is_torch_compile, load_image, load_numpy, @@ -339,8 +338,7 @@ def test_to_device(self): class ControlNetXSPipelineSlowTests(unittest.TestCase): def tearDown(self): super().tearDown() - gc.collect() - backend_empty_cache(torch_device) + flush_memory(torch_device, gc_collect=True) def test_canny(self): controlnet = ControlNetXSAdapter.from_pretrained( diff --git a/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py b/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py index d7ecf92f41cd..b912e4901c29 100644 --- a/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py +++ b/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import gc import unittest import numpy as np @@ -32,8 +31,8 @@ ) from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( - backend_empty_cache, enable_full_determinism, + flush_memory, load_image, require_torch_accelerator, slow, @@ -380,8 +379,7 @@ def test_multi_vae(self): class StableDiffusionXLControlNetXSPipelineSlowTests(unittest.TestCase): def tearDown(self): super().tearDown() - gc.collect() - backend_empty_cache(torch_device) + flush_memory(torch_device, gc_collect=True) def test_canny(self): controlnet = ControlNetXSAdapter.from_pretrained( diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img.py b/tests/pipelines/deepfloyd_if/test_if_img2img.py index b98c1dd61f4f..1b69119e302e 100644 --- a/tests/pipelines/deepfloyd_if/test_if_img2img.py +++ b/tests/pipelines/deepfloyd_if/test_if_img2img.py @@ -24,6 +24,7 @@ from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( floats_tensor, + flush_memory, load_numpy, require_accelerator, require_torch_accelerator, @@ -125,14 +126,7 @@ def test_if_img2img(self): pipe.unet.set_attn_processor(AttnAddedKVProcessor()) pipe.enable_model_cpu_offload(device=torch_device) - if torch_device == "cuda": - torch.cuda.reset_max_memory_allocated() - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() - elif torch_device == "xpu": - torch.xpu.reset_max_memory_allocated() - torch.xpu.empty_cache() - torch.xpu.reset_peak_memory_stats() + flush_memory(torch_device, reset_mem_stats=True) image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) generator = torch.Generator(device="cpu").manual_seed(0) diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py index 328e22d27c74..eca01509e334 100644 --- a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
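
# --- aside: a convention worth noting in the tests above: generators are
# always created on the CPU, even when the pipeline runs on an accelerator,
# so a given seed draws the same initial noise on CUDA, XPU, or MPS and the
# reference outputs stay valid across backends. In miniature:

import torch

gen = torch.Generator(device="cpu").manual_seed(0)
noise = torch.randn(1, 4, 64, 64, generator=gen)  # drawn on CPU, device-independent
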
-import gc import random import unittest @@ -23,8 +22,8 @@ from diffusers.models.attention_processor import AttnAddedKVProcessor from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( - backend_empty_cache, floats_tensor, + flush_memory, load_numpy, require_accelerator, require_torch_accelerator, @@ -105,14 +104,12 @@ class IFImg2ImgSuperResolutionPipelineSlowTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() - gc.collect() - backend_empty_cache(torch_device) + flush_memory(torch_device, gc_collect=True) def tearDown(self): # clean up the VRAM after each test super().tearDown() - gc.collect() - torch.cuda.empty_cache() + flush_memory(torch_device, gc_collect=True) def test_if_img2img_superresolution(self): pipe = IFImg2ImgSuperResolutionPipeline.from_pretrained( @@ -123,14 +120,7 @@ def test_if_img2img_superresolution(self): pipe.unet.set_attn_processor(AttnAddedKVProcessor()) pipe.enable_model_cpu_offload(device=torch_device) - if torch_device == "cuda": - torch.cuda.reset_max_memory_allocated() - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() - elif torch_device == "xpu": - torch.xpu.reset_max_memory_allocated() - torch.xpu.empty_cache() - torch.xpu.reset_peak_memory_stats() + flush_memory(torch_device, reset_mem_stats=True) generator = torch.Generator(device="cpu").manual_seed(0) diff --git a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py index ae29a34a3320..87c4a10c72ad 100644 --- a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py +++ b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import gc import random import unittest @@ -36,9 +35,9 @@ from diffusers.models.unets import I2VGenXLUNet from diffusers.utils import is_xformers_available, load_image from diffusers.utils.testing_utils import ( - backend_empty_cache, enable_full_determinism, floats_tensor, + flush_memory, numpy_cosine_similarity_distance, require_torch_accelerator, skip_mps, @@ -232,14 +231,12 @@ class I2VGenXLPipelineSlowTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() - gc.collect() - backend_empty_cache(torch_device) + flush_memory(torch_device, gc_collect=True) def tearDown(self): # clean up the VRAM after each test super().tearDown() - gc.collect() - backend_empty_cache(torch_device) + flush_memory(torch_device, gc_collect=True) def test_i2vgen_xl(self): pipe = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16") diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index b875d17108b6..4238baa437ff 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -66,9 +66,9 @@ ) from diffusers.utils.testing_utils import ( CaptureLogger, - backend_empty_cache, enable_full_determinism, floats_tensor, + flush_memory, get_python_version, get_tests_dir, is_torch_compile, @@ -1820,14 +1820,12 @@ class PipelineSlowTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() - gc.collect() - backend_empty_cache(torch_device) + flush_memory(torch_device, gc_collect=True) def tearDown(self): # clean up the VRAM after each test super().tearDown() - gc.collect() - backend_empty_cache(torch_device) + flush_memory(torch_device, gc_collect=True) def test_smart_download(self): model_id = 
"hf-internal-testing/unet-pipeline-dummy" @@ -2051,14 +2049,12 @@ class PipelineNightlyTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() - gc.collect() - backend_empty_cache(torch_device) + flush_memory(torch_device, gc_collect=True) def tearDown(self): # clean up the VRAM after each test super().tearDown() - gc.collect() - backend_empty_cache(torch_device) + flush_memory(torch_device, gc_collect=True) def test_ddpm_ddim_equality_batched(self): seed = 0 From f3a519fd803ffd59917fa706648ad1f811599049 Mon Sep 17 00:00:00 2001 From: "Lin, Fanli" <fanli.lin@intel.com> Date: Sun, 5 Jan 2025 23:04:43 -0800 Subject: [PATCH 07/33] update --- src/diffusers/utils/testing_utils.py | 4 ++-- .../deepfloyd_if/test_if_inpainting.py | 18 ++++++--------- .../test_if_inpainting_superresolution.py | 23 +++++++++---------- .../deepfloyd_if/test_if_superresolution.py | 23 +++++++++---------- 4 files changed, 31 insertions(+), 37 deletions(-) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 626d20e1c239..a238252d9c0c 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -1085,8 +1085,8 @@ def _is_torch_fp64_available(device): "default": torch.manual_seed, } BACKEND_RESET_PEAK_MEMORY_STATS = { - "cuda": torch.cuda.reset_peak_memory_stats(), - "xpu": torch.xpu.reset_peak_memory_stats(), + "cuda": torch.cuda.reset_peak_memory_stats, + "xpu": torch.xpu.reset_peak_memory_stats, "default": None, } diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting.py b/tests/pipelines/deepfloyd_if/test_if_inpainting.py index 1c4f27403332..d4f655e1b165 100644 --- a/tests/pipelines/deepfloyd_if/test_if_inpainting.py +++ b/tests/pipelines/deepfloyd_if/test_if_inpainting.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import gc import random import unittest @@ -24,9 +23,10 @@ from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( floats_tensor, + flush_memory, load_numpy, require_accelerator, - require_torch_gpu, + require_torch_accelerator, skip_mps, slow, torch_device, @@ -99,30 +99,26 @@ def test_inference_batch_single_identical(self): @slow -@require_torch_gpu +@require_torch_accelerator class IFInpaintingPipelineSlowTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() - gc.collect() - torch.cuda.empty_cache() + flush_memory(torch_device, gc_collect=True) def tearDown(self): # clean up the VRAM after each test super().tearDown() - gc.collect() - torch.cuda.empty_cache() + flush_memory(torch_device, gc_collect=True) def test_if_inpainting(self): pipe = IFInpaintingPipeline.from_pretrained( "DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16 ) pipe.unet.set_attn_processor(AttnAddedKVProcessor()) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() + flush_memory(torch_device, reset_mem_stats=True) image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) mask_image = floats_tensor((1, 3, 64, 64), rng=random.Random(1)).to(torch_device) diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py index fc1b04aacb9b..4d793202d282 100644 --- a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import gc import random import unittest @@ -24,9 +23,10 @@ from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( floats_tensor, + flush_memory, load_numpy, require_accelerator, - require_torch_gpu, + require_torch_accelerator, skip_mps, slow, torch_device, @@ -101,31 +101,27 @@ def test_inference_batch_single_identical(self): @slow -@require_torch_gpu +@require_torch_accelerator class IFInpaintingSuperResolutionPipelineSlowTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() - gc.collect() - torch.cuda.empty_cache() + flush_memory(torch_device, gc_collect=True) def tearDown(self): # clean up the VRAM after each test super().tearDown() - gc.collect() - torch.cuda.empty_cache() + flush_memory(torch_device, gc_collect=True) def test_if_inpainting_superresolution(self): pipe = IFInpaintingSuperResolutionPipeline.from_pretrained( "DeepFloyd/IF-II-L-v1.0", variant="fp16", torch_dtype=torch.float16 ) pipe.unet.set_attn_processor(AttnAddedKVProcessor()) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) # Super resolution test - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() + flush_memory(torch_device, reset_mem_stats=True) generator = torch.Generator(device="cpu").manual_seed(0) @@ -147,7 +143,10 @@ def test_if_inpainting_superresolution(self): assert image.shape == (256, 256, 3) - mem_bytes = torch.cuda.max_memory_allocated() + if torch_device == "cuda": + mem_bytes = torch.cuda.max_memory_allocated() + elif torch_device == "xpu": + mem_bytes = torch.xpu.max_memory_allocated() assert mem_bytes < 12 * 10**9 expected_image = load_numpy( diff --git a/tests/pipelines/deepfloyd_if/test_if_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_superresolution.py index bdb9f8a76d8a..94b915923b4f 100644 --- a/tests/pipelines/deepfloyd_if/test_if_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_superresolution.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
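
# --- aside: one wrinkle in the mem_bytes hunks above: when torch_device is
# neither "cuda" nor "xpu" (e.g. "mps"), neither branch binds mem_bytes and
# the assert that follows raises NameError instead of failing cleanly. A
# table-driven lookup, in the spirit of where this series is heading, avoids
# the dangling branch (a sketch, not the diffusers helper itself):

import torch

def max_memory_allocated(device: str) -> int:
    table = {
        "cuda": getattr(torch.cuda, "max_memory_allocated", None),
        "xpu": getattr(getattr(torch, "xpu", None), "max_memory_allocated", None),
    }
    fn = table.get(device)
    return fn() if fn is not None else 0  # unknown backends report 0, not NameError

assert max_memory_allocated("cpu") < 12 * 10**9
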
-import gc import random import unittest @@ -24,9 +23,10 @@ from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( floats_tensor, + flush_memory, load_numpy, require_accelerator, - require_torch_gpu, + require_torch_accelerator, skip_mps, slow, torch_device, @@ -94,31 +94,27 @@ def test_inference_batch_single_identical(self): @slow -@require_torch_gpu +@require_torch_accelerator class IFSuperResolutionPipelineSlowTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() - gc.collect() - torch.cuda.empty_cache() + flush_memory(torch_device, gc_collect=True) def tearDown(self): # clean up the VRAM after each test super().tearDown() - gc.collect() - torch.cuda.empty_cache() + flush_memory(torch_device, gc_collect=True) def test_if_superresolution(self): pipe = IFSuperResolutionPipeline.from_pretrained( "DeepFloyd/IF-II-L-v1.0", variant="fp16", torch_dtype=torch.float16 ) pipe.unet.set_attn_processor(AttnAddedKVProcessor()) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) # Super resolution test - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() + flush_memory(torch_device, reset_mem_stats=True) image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) generator = torch.Generator(device="cpu").manual_seed(0) @@ -134,7 +130,10 @@ def test_if_superresolution(self): assert image.shape == (256, 256, 3) - mem_bytes = torch.cuda.max_memory_allocated() + if torch_device == "cuda": + mem_bytes = torch.cuda.max_memory_allocated() + elif torch_device == "xpu": + mem_bytes = torch.xpu.max_memory_allocated() assert mem_bytes < 12 * 10**9 expected_image = load_numpy( From d1532d2d314f3d94900e7788aac1051ed7f2164f Mon Sep 17 00:00:00 2001 From: Fanli Lin <fanli0116@gmail.com> Date: Tue, 7 Jan 2025 11:05:59 +0800 Subject: [PATCH 08/33] Update src/diffusers/utils/testing_utils.py Co-authored-by: hlky <hlky@hlky.ac> --- src/diffusers/utils/testing_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index a238252d9c0c..97eae5c8c57d 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -1179,7 +1179,9 @@ def update_mapping_from_spec(device_fn_dict: Dict[str, Callable], attribute_name update_mapping_from_spec(BACKEND_EMPTY_CACHE, "EMPTY_CACHE_FN") update_mapping_from_spec(BACKEND_DEVICE_COUNT, "DEVICE_COUNT_FN") update_mapping_from_spec(BACKEND_SUPPORTS_TRAINING, "SUPPORTS_TRAINING") - update_mapping_from_spec(BACKEND_RESET_PEAK_MEMORY_STATS, "RESET_PEAK_MEM_STATS") + update_mapping_from_spec(BACKEND_RESET_PEAK_MEMORY_STATS, "RESET_PEAK_MEMORY_STATS_FN") + update_mapping_from_spec(BACKEND_RESET_MAX_MEMORY_ALLOCATED, "RESET_MAX_MEMORY_ALLOCATED_FN") + update_mapping_from_spec(BACKEND_MAX_MEMORY_ALLOCATED, "MAX_MEMORY_ALLOCATED_FN") @require_torch From 16cca2275e6aab149cb4984d247269fac9aad628 Mon Sep 17 00:00:00 2001 From: Fanli Lin <fanli0116@gmail.com> Date: Tue, 7 Jan 2025 11:06:14 +0800 Subject: [PATCH 09/33] Update src/diffusers/utils/testing_utils.py Co-authored-by: hlky <hlky@hlky.ac> --- src/diffusers/utils/testing_utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 97eae5c8c57d..c033a84b90c2 100644 --- a/src/diffusers/utils/testing_utils.py +++ 
b/src/diffusers/utils/testing_utils.py @@ -1089,6 +1089,15 @@ def _is_torch_fp64_available(device): "xpu": torch.xpu.reset_peak_memory_stats, "default": None, } + BACKEND_RESET_MAX_MEMORY_ALLOCATED = { + "cuda": torch.cuda.reset_max_memory_allocated, + "default": None, + } + BACKEND_MAX_MEMORY_ALLOCATED = { + "cuda": torch.cuda.max_memory_allocated, + "xpu": torch.xpu.max_memory_allocated, + "default": 0, + } # This dispatches a defined function according to the accelerator from the function definitions. From 3420e1f6f5312719878f4e81e4d77b77f68e1719 Mon Sep 17 00:00:00 2001 From: Fanli Lin <fanli0116@gmail.com> Date: Tue, 7 Jan 2025 11:06:24 +0800 Subject: [PATCH 10/33] Update src/diffusers/utils/testing_utils.py Co-authored-by: hlky <hlky@hlky.ac> --- src/diffusers/utils/testing_utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index c033a84b90c2..c8c3c7ad0715 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -1132,6 +1132,14 @@ def backend_reset_peak_memory(device: str): return _device_agnostic_dispatch(device, BACKEND_RESET_PEAK_MEMORY_STATS) +def backend_reset_max_memory_allocated(device: str): + return _device_agnostic_dispatch(device, BACKEND_RESET_MAX_MEMORY_ALLOCATED) + + +def backend_max_memory_allocated(device: str): + return _device_agnostic_dispatch(device, BACKEND_MAX_MEMORY_ALLOCATED) + + # These are callables which return boolean behaviour flags and can be used to specify some # device agnostic alternative where the feature is unsupported. def backend_supports_training(device: str): From d15618bf6e7f476585660410e458ac70bc057207 Mon Sep 17 00:00:00 2001 From: Fanli Lin <fanli0116@gmail.com> Date: Tue, 7 Jan 2025 11:06:29 +0800 Subject: [PATCH 11/33] Update tests/pipelines/controlnet/test_controlnet.py Co-authored-by: hlky <hlky@hlky.ac> --- tests/pipelines/controlnet/test_controlnet.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index 5e765a8ac4bd..9fe0b2b59c3a 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -926,7 +926,9 @@ def test_seg(self): assert np.abs(expected_image - image).max() < 8e-2 def test_sequential_cpu_offloading(self): - flush_memory(torch_device, gc_collect=True, reset_mem_stats=True) + backend_empty_cache(torch_device) + backend_reset_max_memory_allocated(torch_device) + backend_reset_peak_memory_stats(torch_device) controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg") From e814635357d28e31e900a88e221f0c472c1545be Mon Sep 17 00:00:00 2001 From: Fanli Lin <fanli0116@gmail.com> Date: Tue, 7 Jan 2025 11:07:14 +0800 Subject: [PATCH 12/33] Update src/diffusers/utils/testing_utils.py Co-authored-by: hlky <hlky@hlky.ac> --- src/diffusers/utils/testing_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index c8c3c7ad0715..171a75e92bb5 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -1128,7 +1128,7 @@ def backend_device_count(device: str): return _device_agnostic_dispatch(device, BACKEND_DEVICE_COUNT) -def backend_reset_peak_memory(device: str): +def backend_reset_peak_memory_stats(device: str): return _device_agnostic_dispatch(device, BACKEND_RESET_PEAK_MEMORY_STATS) From 
e7995166281ae8661ded16475062c6f8b2f2b269 Mon Sep 17 00:00:00 2001 From: Fanli Lin <fanli0116@gmail.com> Date: Tue, 7 Jan 2025 11:08:16 +0800 Subject: [PATCH 13/33] Update src/diffusers/utils/testing_utils.py Co-authored-by: hlky <hlky@hlky.ac> --- src/diffusers/utils/testing_utils.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 171a75e92bb5..1278086bb7d2 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -1201,10 +1201,3 @@ def update_mapping_from_spec(device_fn_dict: Dict[str, Callable], attribute_name update_mapping_from_spec(BACKEND_MAX_MEMORY_ALLOCATED, "MAX_MEMORY_ALLOCATED_FN") -@require_torch -def flush_memory(device: str, gc_collect=False, reset_mem_stats=False): - if gc_collect: - gc.collect() - if reset_mem_stats: - backend_reset_peak_memory(device) - backend_empty_cache(device) From d3e8678b3dfbb6d4bd5e0ae239ae36dffb95ed05 Mon Sep 17 00:00:00 2001 From: Fanli Lin <fanli0116@gmail.com> Date: Tue, 7 Jan 2025 11:08:29 +0800 Subject: [PATCH 14/33] Update tests/pipelines/controlnet/test_controlnet.py Co-authored-by: hlky <hlky@hlky.ac> --- tests/pipelines/controlnet/test_controlnet.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index 9fe0b2b59c3a..a7e828c1cbc4 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -951,10 +951,7 @@ def test_sequential_cpu_offloading(self): output_type="np", ) - if torch_device == "cuda": - mem_bytes = torch.cuda.max_memory_allocated() - elif torch_device == "xpu": - mem_bytes = torch.xpu.max_memory_allocated() + mem_bytes = backend_max_memory_allocated(torch_device) # make sure that less than 7 GB is allocated assert mem_bytes < 4 * 10**9 From fed282bc2aa920afd576f2a349ac8faadea5ffdc Mon Sep 17 00:00:00 2001 From: "Lin, Fanli" <fanli.lin@intel.com> Date: Mon, 6 Jan 2025 19:13:48 -0800 Subject: [PATCH 15/33] with gc.collect --- tests/pipelines/animatediff/test_animatediff.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/pipelines/animatediff/test_animatediff.py b/tests/pipelines/animatediff/test_animatediff.py index f283a2b9e57c..c7411a7145c5 100644 --- a/tests/pipelines/animatediff/test_animatediff.py +++ b/tests/pipelines/animatediff/test_animatediff.py @@ -1,3 +1,4 @@ +import gc import unittest import numpy as np @@ -19,7 +20,7 @@ from diffusers.models.attention import FreeNoiseTransformerBlock from diffusers.utils import is_xformers_available, logging from diffusers.utils.testing_utils import ( - flush_memory, + backend_empty_cache, numpy_cosine_similarity_distance, require_accelerator, require_torch_accelerator, @@ -552,12 +553,14 @@ class AnimateDiffPipelineSlowTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def test_animatediff(self): adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2") From 8577a14d49f5b107253e4bfe0de2d077e512615f Mon Sep 17 00:00:00 2001 From: "Lin, Fanli" <fanli.lin@intel.com> Date: Mon, 6 Jan 2025 19:48:39 -0800 Subject: [PATCH 16/33] 
update --- src/diffusers/utils/testing_utils.py | 1 - .../cogvideo/test_cogvideox_image2video.py | 9 +++++--- tests/pipelines/controlnet/test_controlnet.py | 15 ++++++++----- .../controlnet/test_controlnet_sdxl.py | 9 +++++--- .../test_controlnet_hunyuandit.py | 9 +++++--- .../controlnet_xs/test_controlnetxs.py | 6 ++++-- .../controlnet_xs/test_controlnetxs_sdxl.py | 6 ++++-- tests/pipelines/deepfloyd_if/test_if.py | 15 +++++++++---- .../pipelines/deepfloyd_if/test_if_img2img.py | 12 +++++++---- .../test_if_img2img_superresolution.py | 21 ++++++++++++------- .../deepfloyd_if/test_if_inpainting.py | 18 +++++++++++----- .../test_if_inpainting_superresolution.py | 21 ++++++++++++------- .../deepfloyd_if/test_if_superresolution.py | 21 ++++++++++++------- tests/pipelines/i2vgen_xl/test_i2vgenxl.py | 9 +++++--- tests/pipelines/test_pipelines.py | 16 ++++++++------ 15 files changed, 123 insertions(+), 65 deletions(-) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 1278086bb7d2..393b6c8073f4 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -1,5 +1,4 @@ import functools -import gc import importlib import importlib.metadata import inspect diff --git a/tests/pipelines/cogvideo/test_cogvideox_image2video.py b/tests/pipelines/cogvideo/test_cogvideox_image2video.py index b4d9511ecbf8..cac47f1a83d4 100644 --- a/tests/pipelines/cogvideo/test_cogvideox_image2video.py +++ b/tests/pipelines/cogvideo/test_cogvideox_image2video.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import gc import inspect import unittest @@ -23,8 +24,8 @@ from diffusers import AutoencoderKLCogVideoX, CogVideoXImageToVideoPipeline, CogVideoXTransformer3DModel, DDIMScheduler from diffusers.utils import load_image from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, - flush_memory, numpy_cosine_similarity_distance, require_torch_accelerator, slow, @@ -350,11 +351,13 @@ class CogVideoXImageToVideoPipelineIntegrationTests(unittest.TestCase): def setUp(self): super().setUp() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def test_cogvideox(self): generator = torch.Generator("cpu").manual_seed(0) diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index a7e828c1cbc4..c8ed4f768092 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import gc import tempfile import traceback import unittest @@ -33,8 +34,8 @@ from diffusers.pipelines.controlnet.pipeline_controlnet import MultiControlNetModel from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, - flush_memory, get_python_version, is_torch_compile, load_image, @@ -703,11 +704,13 @@ def test_save_pretrained_raise_not_implemented_exception(self): class ControlNetPipelineSlowTests(unittest.TestCase): def setUp(self): super().setUp() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def test_canny(self): controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") @@ -1066,11 +1069,13 @@ def test_v11_shuffle_global_pool_conditions(self): class StableDiffusionMultiControlNetPipelineSlowTests(unittest.TestCase): def setUp(self): super().setUp() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def test_pose_and_canny(self): controlnet_canny = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") diff --git a/tests/pipelines/controlnet/test_controlnet_sdxl.py b/tests/pipelines/controlnet/test_controlnet_sdxl.py index d9af9931cbd6..85924af050b0 100644 --- a/tests/pipelines/controlnet/test_controlnet_sdxl.py +++ b/tests/pipelines/controlnet/test_controlnet_sdxl.py @@ -14,6 +14,7 @@ # limitations under the License. import copy +import gc import unittest import numpy as np @@ -34,8 +35,8 @@ from diffusers.pipelines.controlnet.pipeline_controlnet import MultiControlNetModel from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, - flush_memory, load_image, require_torch_accelerator, slow, @@ -893,11 +894,13 @@ def test_negative_conditions(self): class ControlNetSDXLPipelineSlowTests(unittest.TestCase): def setUp(self): super().setUp() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def test_canny(self): controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-canny-sdxl-1.0") diff --git a/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py b/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py index e9550b8dd79b..30dfe94e50f1 100644 --- a/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py +++ b/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import gc import unittest import numpy as np @@ -28,8 +29,8 @@ from diffusers.models import HunyuanDiT2DControlNetModel, HunyuanDiT2DMultiControlNetModel from diffusers.utils import load_image from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, - flush_memory, require_torch_accelerator, slow, torch_device, @@ -184,11 +185,13 @@ class HunyuanDiTControlNetPipelineSlowTests(unittest.TestCase): def setUp(self): super().setUp() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def test_canny(self): controlnet = HunyuanDiT2DControlNetModel.from_pretrained( diff --git a/tests/pipelines/controlnet_xs/test_controlnetxs.py b/tests/pipelines/controlnet_xs/test_controlnetxs.py index 12df19c74ab9..6d53d0618959 100644 --- a/tests/pipelines/controlnet_xs/test_controlnetxs.py +++ b/tests/pipelines/controlnet_xs/test_controlnetxs.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import gc import traceback import unittest @@ -33,8 +34,8 @@ ) from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, - flush_memory, is_torch_compile, load_image, load_numpy, @@ -338,7 +339,8 @@ def test_to_device(self): class ControlNetXSPipelineSlowTests(unittest.TestCase): def tearDown(self): super().tearDown() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def test_canny(self): controlnet = ControlNetXSAdapter.from_pretrained( diff --git a/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py b/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py index b912e4901c29..d7ecf92f41cd 100644 --- a/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py +++ b/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import gc import unittest import numpy as np @@ -31,8 +32,8 @@ ) from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, - flush_memory, load_image, require_torch_accelerator, slow, @@ -379,7 +380,8 @@ def test_multi_vae(self): class StableDiffusionXLControlNetXSPipelineSlowTests(unittest.TestCase): def tearDown(self): super().tearDown() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def test_canny(self): controlnet = ControlNetXSAdapter.from_pretrained( diff --git a/tests/pipelines/deepfloyd_if/test_if.py b/tests/pipelines/deepfloyd_if/test_if.py index 7336addeb2fe..170374b8d4f2 100644 --- a/tests/pipelines/deepfloyd_if/test_if.py +++ b/tests/pipelines/deepfloyd_if/test_if.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import gc import unittest import torch @@ -23,7 +24,9 @@ from diffusers.models.attention_processor import AttnAddedKVProcessor from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( - flush_memory, + backend_empty_cache, + backend_reset_max_memory_allocated, + backend_reset_peak_memory_stats, load_numpy, require_accelerator, require_torch_accelerator, @@ -96,19 +99,23 @@ class IFPipelineSlowTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def test_if_text_to_image(self): pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) pipe.unet.set_attn_processor(AttnAddedKVProcessor()) pipe.enable_model_cpu_offload(device=torch_device) - flush_memory(torch_device, reset_mem_stats=True) + backend_reset_max_memory_allocated(torch_device) + backend_empty_cache(torch_device) + backend_reset_peak_memory_stats(torch_device) generator = torch.Generator(device="cpu").manual_seed(0) output = pipe( diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img.py b/tests/pipelines/deepfloyd_if/test_if_img2img.py index 1b69119e302e..257161c8bce1 100644 --- a/tests/pipelines/deepfloyd_if/test_if_img2img.py +++ b/tests/pipelines/deepfloyd_if/test_if_img2img.py @@ -23,8 +23,10 @@ from diffusers.models.attention_processor import AttnAddedKVProcessor from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( + backend_empty_cache, + backend_reset_max_memory_allocated, + backend_reset_peak_memory_stats, floats_tensor, - flush_memory, load_numpy, require_accelerator, require_torch_accelerator, @@ -109,13 +111,13 @@ def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_if_img2img(self): pipe = IFImg2ImgPipeline.from_pretrained( @@ -126,7 +128,9 @@ def test_if_img2img(self): pipe.unet.set_attn_processor(AttnAddedKVProcessor()) pipe.enable_model_cpu_offload(device=torch_device) - flush_memory(torch_device, reset_mem_stats=True) + backend_reset_max_memory_allocated(torch_device) + backend_empty_cache(torch_device) + backend_reset_peak_memory_stats(torch_device) image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) generator = torch.Generator(device="cpu").manual_seed(0) diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py index eca01509e334..082a06e58e57 100644 --- a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import gc import random import unittest @@ -22,8 +23,11 @@ from diffusers.models.attention_processor import AttnAddedKVProcessor from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( + backend_empty_cache, + backend_max_memory_allocated, + backend_reset_max_memory_allocated, + backend_reset_peak_memory_stats, floats_tensor, - flush_memory, load_numpy, require_accelerator, require_torch_accelerator, @@ -104,12 +108,14 @@ class IFImg2ImgSuperResolutionPipelineSlowTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def test_if_img2img_superresolution(self): pipe = IFImg2ImgSuperResolutionPipeline.from_pretrained( @@ -120,7 +126,9 @@ def test_if_img2img_superresolution(self): pipe.unet.set_attn_processor(AttnAddedKVProcessor()) pipe.enable_model_cpu_offload(device=torch_device) - flush_memory(torch_device, reset_mem_stats=True) + backend_reset_max_memory_allocated(torch_device) + backend_empty_cache(torch_device) + backend_reset_peak_memory_stats(torch_device) generator = torch.Generator(device="cpu").manual_seed(0) @@ -140,10 +148,7 @@ def test_if_img2img_superresolution(self): assert image.shape == (256, 256, 3) - if torch_device == "cuda": - mem_bytes = torch.cuda.max_memory_allocated() - elif torch_device == "xpu": - mem_bytes = torch.xpu.max_memory_allocated() + mem_bytes = backend_max_memory_allocated(torch_device) assert mem_bytes < 12 * 10**9 diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting.py b/tests/pipelines/deepfloyd_if/test_if_inpainting.py index d4f655e1b165..b3d469403332 100644 --- a/tests/pipelines/deepfloyd_if/test_if_inpainting.py +++ b/tests/pipelines/deepfloyd_if/test_if_inpainting.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
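
# --- aside: by this point the series has settled on one measurement recipe,
# repeated across the DeepFloyd IF tests: empty the cache, reset the peak
# counters, run the offloaded pipeline, then assert on the high-water mark.
# Collected into a single sketch; the backend_* names are the wrappers these
# diffs add to diffusers.utils.testing_utils, and 12 GB is the budget these
# particular tests use:

from diffusers.utils.testing_utils import (
    backend_empty_cache,
    backend_max_memory_allocated,
    backend_reset_max_memory_allocated,
    backend_reset_peak_memory_stats,
)

def assert_peak_memory_under(run_pipeline, device, budget_bytes=12 * 10**9):
    backend_empty_cache(device)
    backend_reset_max_memory_allocated(device)
    backend_reset_peak_memory_stats(device)
    run_pipeline()  # e.g. a seeded pipe(...) call with output_type="np"
    assert backend_max_memory_allocated(device) < budget_bytes
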
+import gc import random import unittest @@ -22,8 +23,11 @@ from diffusers.models.attention_processor import AttnAddedKVProcessor from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( + backend_empty_cache, + backend_max_memory_allocated, + backend_reset_max_memory_allocated, + backend_reset_peak_memory_stats, floats_tensor, - flush_memory, load_numpy, require_accelerator, require_torch_accelerator, @@ -104,12 +108,14 @@ class IFInpaintingPipelineSlowTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def test_if_inpainting(self): pipe = IFInpaintingPipeline.from_pretrained( @@ -118,7 +124,9 @@ def test_if_inpainting(self): pipe.unet.set_attn_processor(AttnAddedKVProcessor()) pipe.enable_model_cpu_offload(device=torch_device) - flush_memory(torch_device, reset_mem_stats=True) + backend_reset_max_memory_allocated(torch_device) + backend_empty_cache(torch_device) + backend_reset_peak_memory_stats(torch_device) image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) mask_image = floats_tensor((1, 3, 64, 64), rng=random.Random(1)).to(torch_device) @@ -134,7 +142,7 @@ def test_if_inpainting(self): ) image = output.images[0] - mem_bytes = torch.cuda.max_memory_allocated() + mem_bytes = backend_max_memory_allocated(torch_device) assert mem_bytes < 12 * 10**9 expected_image = load_numpy( diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py index 4d793202d282..d8372578708b 100644 --- a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import gc import random import unittest @@ -22,8 +23,11 @@ from diffusers.models.attention_processor import AttnAddedKVProcessor from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( + backend_empty_cache, + backend_max_memory_allocated, + backend_reset_max_memory_allocated, + backend_reset_peak_memory_stats, floats_tensor, - flush_memory, load_numpy, require_accelerator, require_torch_accelerator, @@ -106,12 +110,14 @@ class IFInpaintingSuperResolutionPipelineSlowTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def test_if_inpainting_superresolution(self): pipe = IFInpaintingSuperResolutionPipeline.from_pretrained( @@ -121,7 +127,9 @@ def test_if_inpainting_superresolution(self): pipe.enable_model_cpu_offload(device=torch_device) # Super resolution test - flush_memory(torch_device, reset_mem_stats=True) + backend_reset_max_memory_allocated(torch_device) + backend_empty_cache(torch_device) + backend_reset_peak_memory_stats(torch_device) generator = torch.Generator(device="cpu").manual_seed(0) @@ -143,10 +151,7 @@ def test_if_inpainting_superresolution(self): assert image.shape == (256, 256, 3) - if torch_device == "cuda": - mem_bytes = torch.cuda.max_memory_allocated() - elif torch_device == "xpu": - mem_bytes = torch.xpu.max_memory_allocated() + mem_bytes = backend_max_memory_allocated(torch_device) assert mem_bytes < 12 * 10**9 expected_image = load_numpy( diff --git a/tests/pipelines/deepfloyd_if/test_if_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_superresolution.py index 94b915923b4f..e7009ec2bbd2 100644 --- a/tests/pipelines/deepfloyd_if/test_if_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_superresolution.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import gc import random import unittest @@ -22,8 +23,11 @@ from diffusers.models.attention_processor import AttnAddedKVProcessor from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( + backend_empty_cache, + backend_max_memory_allocated, + backend_reset_max_memory_allocated, + backend_reset_peak_memory_stats, floats_tensor, - flush_memory, load_numpy, require_accelerator, require_torch_accelerator, @@ -99,12 +103,14 @@ class IFSuperResolutionPipelineSlowTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def test_if_superresolution(self): pipe = IFSuperResolutionPipeline.from_pretrained( @@ -114,7 +120,9 @@ def test_if_superresolution(self): pipe.enable_model_cpu_offload(device=torch_device) # Super resolution test - flush_memory(torch_device, reset_mem_stats=True) + backend_reset_max_memory_allocated(torch_device) + backend_empty_cache(torch_device) + backend_reset_peak_memory_stats(torch_device) image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) generator = torch.Generator(device="cpu").manual_seed(0) @@ -130,10 +138,7 @@ def test_if_superresolution(self): assert image.shape == (256, 256, 3) - if torch_device == "cuda": - mem_bytes = torch.cuda.max_memory_allocated() - elif torch_device == "xpu": - mem_bytes = torch.xpu.max_memory_allocated() + mem_bytes = backend_max_memory_allocated(torch_device) assert mem_bytes < 12 * 10**9 expected_image = load_numpy( diff --git a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py index 87c4a10c72ad..ae29a34a3320 100644 --- a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py +++ b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
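
# --- aside: the "*_FN" strings registered via update_mapping_from_spec back
# in PATCH 08 are attribute names looked up on a user-supplied device-spec
# module, which is how an out-of-tree accelerator can populate these tables
# without patching diffusers. A hypothetical spec module -- the backend
# package, its functions, and the DEVICE_NAME attribute are all assumed for
# illustration:

# my_device_spec.py
import my_accelerator_backend as mab  # hypothetical package

DEVICE_NAME = "mydev"
EMPTY_CACHE_FN = mab.empty_cache
DEVICE_COUNT_FN = mab.device_count
SUPPORTS_TRAINING = True
RESET_PEAK_MEMORY_STATS_FN = mab.reset_peak_memory_stats
MAX_MEMORY_ALLOCATED_FN = mab.max_memory_allocated
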
+import gc import random import unittest @@ -35,9 +36,9 @@ from diffusers.models.unets import I2VGenXLUNet from diffusers.utils import is_xformers_available, load_image from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, floats_tensor, - flush_memory, numpy_cosine_similarity_distance, require_torch_accelerator, skip_mps, @@ -231,12 +232,14 @@ class I2VGenXLPipelineSlowTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def test_i2vgen_xl(self): pipe = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16") diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 4238baa437ff..db924c72437c 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -66,9 +66,9 @@ ) from diffusers.utils.testing_utils import ( CaptureLogger, + backend_empty_cache, enable_full_determinism, floats_tensor, - flush_memory, get_python_version, get_tests_dir, is_torch_compile, @@ -1820,12 +1820,14 @@ class PipelineSlowTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def test_smart_download(self): model_id = "hf-internal-testing/unet-pipeline-dummy" @@ -2017,7 +2019,7 @@ def test_weighted_prompts_compel(self): pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.enable_attention_slicing() compel = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder) @@ -2049,12 +2051,14 @@ class PipelineNightlyTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() - flush_memory(torch_device, gc_collect=True) + gc.collect() + backend_empty_cache(torch_device) def test_ddpm_ddim_equality_batched(self): seed = 0 From 35d7a7a81c8dcc7840177e82c5b5896cdcfd9f37 Mon Sep 17 00:00:00 2001 From: hlky <hlky@hlky.ac> Date: Tue, 7 Jan 2025 15:15:40 +0000 Subject: [PATCH 17/33] make style --- src/diffusers/utils/testing_utils.py | 8 +++----- tests/pipelines/controlnet/test_controlnet.py | 3 +++ tests/pipelines/deepfloyd_if/test_if_inpainting.py | 2 +- .../deepfloyd_if/test_if_inpainting_superresolution.py | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 393b6c8073f4..bb0a2c174797 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -1093,9 +1093,9 @@ def _is_torch_fp64_available(device): "default": None, } BACKEND_MAX_MEMORY_ALLOCATED = { - "cuda": torch.cuda.max_memory_allocated, - "xpu": torch.xpu.max_memory_allocated, - "default": 
0, + "cuda": torch.cuda.max_memory_allocated, + "xpu": torch.xpu.max_memory_allocated, + "default": 0, } @@ -1198,5 +1198,3 @@ def update_mapping_from_spec(device_fn_dict: Dict[str, Callable], attribute_name update_mapping_from_spec(BACKEND_RESET_PEAK_MEMORY_STATS, "RESET_PEAK_MEMORY_STATS_FN") update_mapping_from_spec(BACKEND_RESET_MAX_MEMORY_ALLOCATED, "RESET_MAX_MEMORY_ALLOCATED_FN") update_mapping_from_spec(BACKEND_MAX_MEMORY_ALLOCATED, "MAX_MEMORY_ALLOCATED_FN") - - diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index c8ed4f768092..c215ecc8fdc6 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -35,6 +35,9 @@ from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( backend_empty_cache, + backend_max_memory_allocated, + backend_reset_max_memory_allocated, + backend_reset_peak_memory_stats, enable_full_determinism, get_python_version, is_torch_compile, diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting.py b/tests/pipelines/deepfloyd_if/test_if_inpainting.py index b3d469403332..2a5294a24421 100644 --- a/tests/pipelines/deepfloyd_if/test_if_inpainting.py +++ b/tests/pipelines/deepfloyd_if/test_if_inpainting.py @@ -124,8 +124,8 @@ def test_if_inpainting(self): pipe.unet.set_attn_processor(AttnAddedKVProcessor()) pipe.enable_model_cpu_offload(device=torch_device) - backend_reset_max_memory_allocated(torch_device) backend_empty_cache(torch_device) + backend_reset_max_memory_allocated(torch_device) backend_reset_peak_memory_stats(torch_device) image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py index d8372578708b..548ffd1f8df5 100644 --- a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py @@ -127,8 +127,8 @@ def test_if_inpainting_superresolution(self): pipe.enable_model_cpu_offload(device=torch_device) # Super resolution test - backend_reset_max_memory_allocated(torch_device) backend_empty_cache(torch_device) + backend_reset_max_memory_allocated(torch_device) backend_reset_peak_memory_stats(torch_device) generator = torch.Generator(device="cpu").manual_seed(0) From c8661f0dee75f788e05e78f6168c56d393889f6c Mon Sep 17 00:00:00 2001 From: hlky <hlky@hlky.ac> Date: Tue, 7 Jan 2025 15:21:30 +0000 Subject: [PATCH 18/33] check_torch_dependencies --- src/diffusers/utils/testing_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index bb0a2c174797..f9dcb46e87f0 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -1085,7 +1085,7 @@ def _is_torch_fp64_available(device): } BACKEND_RESET_PEAK_MEMORY_STATS = { "cuda": torch.cuda.reset_peak_memory_stats, - "xpu": torch.xpu.reset_peak_memory_stats, + "xpu": getattr(torch.xpu, "reset_peak_memory_stats", None), "default": None, } BACKEND_RESET_MAX_MEMORY_ALLOCATED = { @@ -1094,7 +1094,7 @@ def _is_torch_fp64_available(device): } BACKEND_MAX_MEMORY_ALLOCATED = { "cuda": torch.cuda.max_memory_allocated, - "xpu": torch.xpu.max_memory_allocated, + "xpu": getattr(torch.xpu, "max_memory_allocated", None), "default": 0, } From d820f75c16d6fa811fa15263047e9b3a5d308c05 Mon Sep 17 
From d820f75c16d6fa811fa15263047e9b3a5d308c05 Mon Sep 17 00:00:00 2001 From: Fanli Lin <fanli0116@gmail.com> Date: Wed, 8 Jan 2025 17:46:37 +0800 Subject: [PATCH 19/33] add mps empty cache --- src/diffusers/utils/testing_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index f9dcb46e87f0..fa18e1606997 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -1067,7 +1067,7 @@ def _is_torch_fp64_available(device): "cuda": torch.cuda.empty_cache, "xpu": torch.xpu.empty_cache, "cpu": None, - "mps": None, + "mps": torch.mps.empty_cache, "default": None, } BACKEND_DEVICE_COUNT = { From 6ed4523ef9fac919a1b3f76e741b42cbd84103e9 Mon Sep 17 00:00:00 2001 From: Fanli Lin <fanli.lin@intel.com> Date: Thu, 9 Jan 2025 06:44:33 +0000 Subject: [PATCH 20/33] add changes --- src/diffusers/utils/testing_utils.py | 10 +-- tests/lora/test_lora_layers_sd.py | 18 ++-- tests/lora/test_lora_layers_sd3.py | 10 +-- tests/pipelines/deepfloyd_if/test_if.py | 2 - .../pipelines/deepfloyd_if/test_if_img2img.py | 2 - .../test_if_img2img_superresolution.py | 1 - .../deepfloyd_if/test_if_inpainting.py | 2 - .../test_if_inpainting_superresolution.py | 1 - .../deepfloyd_if/test_if_superresolution.py | 2 - .../pipelines/marigold/test_marigold_depth.py | 29 ++++--- .../marigold/test_marigold_normals.py | 31 +++---- tests/pipelines/mochi/test_mochi.py | 11 +-- tests/pipelines/pag/test_pag_sd.py | 13 +-- tests/pipelines/pag/test_pag_sd3_img2img.py | 11 +-- tests/pipelines/pag/test_pag_sd_img2img.py | 15 ++-- tests/pipelines/pag/test_pag_sd_inpaint.py | 13 +-- tests/pipelines/pixart_alpha/test_pixart.py | 17 ++-- tests/pipelines/pixart_sigma/test_pixart.py | 17 ++-- tests/pipelines/sana/test_sana.py | 13 +-- .../test_stable_cascade_combined.py | 8 +- .../test_stable_cascade_decoder.py | 11 +-- .../test_stable_cascade_prior.py | 11 +-- .../stable_diffusion/test_stable_diffusion.py | 87 +++++++++---------- .../test_stable_diffusion_img2img.py | 43 ++++----- .../test_stable_diffusion_inpaint.py | 39 +++++---- ...st_stable_diffusion_instruction_pix2pix.py | 20 +++-- .../test_stable_diffusion.py | 20 ++--- .../test_stable_diffusion_depth.py | 15 ++-- .../test_stable_diffusion_diffedit.py | 17 ++-- .../test_stable_diffusion_inpaint.py | 17 ++-- .../test_stable_diffusion_latent_upscale.py | 15 ++-- .../test_stable_diffusion_v_pred.py | 36 ++++---- .../test_stable_diffusion_adapter.py | 9 +- .../test_stable_diffusion_image_variation.py | 26 +++--- tests/pipelines/test_pipelines.py | 4 +- 35 files changed, 301 insertions(+), 295 deletions(-) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 393b6c8073f4..c2b26883c286 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -1088,10 +1088,6 @@ def _is_torch_fp64_available(device): "xpu": torch.xpu.reset_peak_memory_stats, "default": None, } - BACKEND_RESET_MAX_MEMORY_ALLOCATED = { - "cuda": torch.cuda.reset_max_memory_allocated, - "default": None, - } BACKEND_MAX_MEMORY_ALLOCATED = { "cuda": torch.cuda.max_memory_allocated, "xpu": torch.xpu.max_memory_allocated, @@ -1101,6 +1097,7 @@ def _is_torch_fp64_available(device): # This dispatches a defined function according to the accelerator from the function definitions. 
def _device_agnostic_dispatch(device: str, dispatch_table: Dict[str, Callable], *args, **kwargs): + if device not in dispatch_table: return dispatch_table["default"](*args, **kwargs) @@ -1131,10 +1128,6 @@ def backend_reset_peak_memory_stats(device: str): return _device_agnostic_dispatch(device, BACKEND_RESET_PEAK_MEMORY_STATS) -def backend_reset_max_memory_allocated(device: str): - return _device_agnostic_dispatch(device, BACKEND_RESET_MAX_MEMORY_ALLOCATED) - - def backend_max_memory_allocated(device: str): return _device_agnostic_dispatch(device, BACKEND_MAX_MEMORY_ALLOCATED) @@ -1196,7 +1189,6 @@ def update_mapping_from_spec(device_fn_dict: Dict[str, Callable], attribute_name update_mapping_from_spec(BACKEND_DEVICE_COUNT, "DEVICE_COUNT_FN") update_mapping_from_spec(BACKEND_SUPPORTS_TRAINING, "SUPPORTS_TRAINING") update_mapping_from_spec(BACKEND_RESET_PEAK_MEMORY_STATS, "RESET_PEAK_MEMORY_STATS_FN") - update_mapping_from_spec(BACKEND_RESET_MAX_MEMORY_ALLOCATED, "RESET_MAX_MEMORY_ALLOCATED_FN") update_mapping_from_spec(BACKEND_MAX_MEMORY_ALLOCATED, "MAX_MEMORY_ALLOCATED_FN") diff --git a/tests/lora/test_lora_layers_sd.py b/tests/lora/test_lora_layers_sd.py index e91b0689b4ce..e71c6e3b53e3 100644 --- a/tests/lora/test_lora_layers_sd.py +++ b/tests/lora/test_lora_layers_sd.py @@ -37,7 +37,7 @@ nightly, numpy_cosine_similarity_distance, require_peft_backend, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -101,7 +101,7 @@ def tearDown(self): # Keeping this test here makes sense because it doesn't look any integration # (value assertions on logits). @slow - @require_torch_gpu + @require_torch_accelerator def test_integration_move_lora_cpu(self): path = "stable-diffusion-v1-5/stable-diffusion-v1-5" lora_id = "takuma104/lora-test-text-encoder-lora-target" @@ -158,7 +158,7 @@ def test_integration_move_lora_cpu(self): self.assertTrue(m.weight.device != torch.device("cpu")) @slow - @require_torch_gpu + @require_torch_accelerator def test_integration_move_lora_dora_cpu(self): from peft import LoraConfig @@ -209,18 +209,18 @@ def test_integration_move_lora_dora_cpu(self): @slow @nightly -@require_torch_gpu +@require_torch_accelerator @require_peft_backend class LoraIntegrationTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_integration_logits_with_scale(self): path = "stable-diffusion-v1-5/stable-diffusion-v1-5" @@ -378,7 +378,7 @@ def test_a1111_with_model_cpu_offload(self): generator = torch.Generator().manual_seed(0) pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/Counterfeit-V2.5", safety_checker=None) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) lora_model_id = "hf-internal-testing/civitai-light-shadow-lora" lora_filename = "light_and_shadow.safetensors" pipe.load_lora_weights(lora_model_id, weight_name=lora_filename) @@ -400,7 +400,7 @@ def test_a1111_with_sequential_cpu_offload(self): generator = torch.Generator().manual_seed(0) pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/Counterfeit-V2.5", safety_checker=None) - pipe.enable_sequential_cpu_offload() + pipe.enable_sequential_cpu_offload(device=torch_device) lora_model_id = "hf-internal-testing/civitai-light-shadow-lora" lora_filename = "light_and_shadow.safetensors" pipe.load_lora_weights(lora_model_id, 
weight_name=lora_filename) @@ -656,7 +656,7 @@ def test_sd_load_civitai_empty_network_alpha(self): See: https://github.com/huggingface/diffusers/issues/5606 """ pipeline = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5") - pipeline.enable_sequential_cpu_offload() + pipeline.enable_sequential_cpu_offload(device=torch_device) civitai_path = hf_hub_download("ybelkada/test-ahi-civitai", "ahi_lora_weights.safetensors") pipeline.load_lora_weights(civitai_path, adapter_name="ahri") diff --git a/tests/lora/test_lora_layers_sd3.py b/tests/lora/test_lora_layers_sd3.py index 40383e3f1ee3..a6cd4c97449d 100644 --- a/tests/lora/test_lora_layers_sd3.py +++ b/tests/lora/test_lora_layers_sd3.py @@ -32,7 +32,7 @@ nightly, numpy_cosine_similarity_distance, require_peft_backend, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -91,7 +91,7 @@ class SD3LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests): def output_shape(self): return (1, 32, 32, 3) - @require_torch_gpu + @require_torch_accelerator def test_sd3_lora(self): """ Test loading the loras that are saved with the diffusers and peft formats. @@ -130,7 +130,7 @@ def test_modify_padding_mode(self): @slow @nightly -@require_torch_gpu +@require_torch_accelerator @require_peft_backend class LoraSD3IntegrationTests(unittest.TestCase): pipeline_class = StableDiffusion3Img2ImgPipeline @@ -139,12 +139,12 @@ class LoraSD3IntegrationTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, seed=0): init_image = load_image( diff --git a/tests/pipelines/deepfloyd_if/test_if.py b/tests/pipelines/deepfloyd_if/test_if.py index 170374b8d4f2..09e48d1132d8 100644 --- a/tests/pipelines/deepfloyd_if/test_if.py +++ b/tests/pipelines/deepfloyd_if/test_if.py @@ -25,7 +25,6 @@ from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( backend_empty_cache, - backend_reset_max_memory_allocated, backend_reset_peak_memory_stats, load_numpy, require_accelerator, @@ -113,7 +112,6 @@ def test_if_text_to_image(self): pipe.unet.set_attn_processor(AttnAddedKVProcessor()) pipe.enable_model_cpu_offload(device=torch_device) - backend_reset_max_memory_allocated(torch_device) backend_empty_cache(torch_device) backend_reset_peak_memory_stats(torch_device) diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img.py b/tests/pipelines/deepfloyd_if/test_if_img2img.py index 257161c8bce1..077e43738ca8 100644 --- a/tests/pipelines/deepfloyd_if/test_if_img2img.py +++ b/tests/pipelines/deepfloyd_if/test_if_img2img.py @@ -24,7 +24,6 @@ from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( backend_empty_cache, - backend_reset_max_memory_allocated, backend_reset_peak_memory_stats, floats_tensor, load_numpy, @@ -128,7 +127,6 @@ def test_if_img2img(self): pipe.unet.set_attn_processor(AttnAddedKVProcessor()) pipe.enable_model_cpu_offload(device=torch_device) - backend_reset_max_memory_allocated(torch_device) backend_empty_cache(torch_device) backend_reset_peak_memory_stats(torch_device) diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py index 082a06e58e57..cc4e4fe89b00 100644 --- 
a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py @@ -25,7 +25,6 @@ from diffusers.utils.testing_utils import ( backend_empty_cache, backend_max_memory_allocated, - backend_reset_max_memory_allocated, backend_reset_peak_memory_stats, floats_tensor, load_numpy, diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting.py b/tests/pipelines/deepfloyd_if/test_if_inpainting.py index b3d469403332..a42a72f81b14 100644 --- a/tests/pipelines/deepfloyd_if/test_if_inpainting.py +++ b/tests/pipelines/deepfloyd_if/test_if_inpainting.py @@ -25,7 +25,6 @@ from diffusers.utils.testing_utils import ( backend_empty_cache, backend_max_memory_allocated, - backend_reset_max_memory_allocated, backend_reset_peak_memory_stats, floats_tensor, load_numpy, @@ -124,7 +123,6 @@ def test_if_inpainting(self): pipe.unet.set_attn_processor(AttnAddedKVProcessor()) pipe.enable_model_cpu_offload(device=torch_device) - backend_reset_max_memory_allocated(torch_device) backend_empty_cache(torch_device) backend_reset_peak_memory_stats(torch_device) diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py index d8372578708b..66bbadba22fc 100644 --- a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py @@ -25,7 +25,6 @@ from diffusers.utils.testing_utils import ( backend_empty_cache, backend_max_memory_allocated, - backend_reset_max_memory_allocated, backend_reset_peak_memory_stats, floats_tensor, load_numpy, diff --git a/tests/pipelines/deepfloyd_if/test_if_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_superresolution.py index e7009ec2bbd2..a53392be5c11 100644 --- a/tests/pipelines/deepfloyd_if/test_if_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_superresolution.py @@ -25,7 +25,6 @@ from diffusers.utils.testing_utils import ( backend_empty_cache, backend_max_memory_allocated, - backend_reset_max_memory_allocated, backend_reset_peak_memory_stats, floats_tensor, load_numpy, @@ -120,7 +119,6 @@ def test_if_superresolution(self): pipe.enable_model_cpu_offload(device=torch_device) # Super resolution test - backend_reset_max_memory_allocated(torch_device) backend_empty_cache(torch_device) backend_reset_peak_memory_stats(torch_device) diff --git a/tests/pipelines/marigold/test_marigold_depth.py b/tests/pipelines/marigold/test_marigold_depth.py index fcb9adca7a7b..1f777bb31b4d 100644 --- a/tests/pipelines/marigold/test_marigold_depth.py +++ b/tests/pipelines/marigold/test_marigold_depth.py @@ -32,12 +32,14 @@ UNet2DConditionModel, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, floats_tensor, is_flaky, load_image, - require_torch_gpu, + require_torch_accelerator, slow, + torch_device, ) from ..test_pipelines_common import PipelineTesterMixin @@ -288,17 +290,17 @@ def test_marigold_depth_dummy_no_processing_resolution(self): @slow -@require_torch_gpu +@require_torch_accelerator class MarigoldDepthPipelineIntegrationTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def _test_marigold_depth( self, @@ -317,8 +319,7 @@ def _test_marigold_depth( from_pretrained_kwargs["torch_dtype"] = torch.float16 pipe = 
MarigoldDepthPipeline.from_pretrained(model_id, **from_pretrained_kwargs) - if device == "cuda": - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device=device).manual_seed(generator_seed) @@ -358,7 +359,7 @@ def test_marigold_depth_einstein_f32_cpu_G0_S1_P32_E1_B1_M1(self): def test_marigold_depth_einstein_f32_cuda_G0_S1_P768_E1_B1_M1(self): self._test_marigold_depth( is_fp16=False, - device="cuda", + device=torch_device, generator_seed=0, expected_slice=np.array([0.1244, 0.1265, 0.1292, 0.1240, 0.1252, 0.1266, 0.1246, 0.1226, 0.1180]), num_inference_steps=1, @@ -371,7 +372,7 @@ def test_marigold_depth_einstein_f32_cuda_G0_S1_P768_E1_B1_M1(self): def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E1_B1_M1(self): self._test_marigold_depth( is_fp16=True, - device="cuda", + device=torch_device, generator_seed=0, expected_slice=np.array([0.1241, 0.1262, 0.1290, 0.1238, 0.1250, 0.1265, 0.1244, 0.1225, 0.1179]), num_inference_steps=1, @@ -384,7 +385,7 @@ def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E1_B1_M1(self): def test_marigold_depth_einstein_f16_cuda_G2024_S1_P768_E1_B1_M1(self): self._test_marigold_depth( is_fp16=True, - device="cuda", + device=torch_device, generator_seed=2024, expected_slice=np.array([0.1710, 0.1725, 0.1738, 0.1700, 0.1700, 0.1696, 0.1698, 0.1663, 0.1592]), num_inference_steps=1, @@ -397,7 +398,7 @@ def test_marigold_depth_einstein_f16_cuda_G2024_S1_P768_E1_B1_M1(self): def test_marigold_depth_einstein_f16_cuda_G0_S2_P768_E1_B1_M1(self): self._test_marigold_depth( is_fp16=True, - device="cuda", + device=torch_device, generator_seed=0, expected_slice=np.array([0.1085, 0.1098, 0.1110, 0.1081, 0.1085, 0.1082, 0.1085, 0.1057, 0.0996]), num_inference_steps=2, @@ -410,7 +411,7 @@ def test_marigold_depth_einstein_f16_cuda_G0_S2_P768_E1_B1_M1(self): def test_marigold_depth_einstein_f16_cuda_G0_S1_P512_E1_B1_M1(self): self._test_marigold_depth( is_fp16=True, - device="cuda", + device=torch_device, generator_seed=0, expected_slice=np.array([0.2683, 0.2693, 0.2698, 0.2666, 0.2632, 0.2615, 0.2656, 0.2603, 0.2573]), num_inference_steps=1, @@ -423,7 +424,7 @@ def test_marigold_depth_einstein_f16_cuda_G0_S1_P512_E1_B1_M1(self): def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E3_B1_M1(self): self._test_marigold_depth( is_fp16=True, - device="cuda", + device=torch_device, generator_seed=0, expected_slice=np.array([0.1200, 0.1215, 0.1237, 0.1193, 0.1197, 0.1202, 0.1196, 0.1166, 0.1109]), num_inference_steps=1, @@ -437,7 +438,7 @@ def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E3_B1_M1(self): def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E4_B2_M1(self): self._test_marigold_depth( is_fp16=True, - device="cuda", + device=torch_device, generator_seed=0, expected_slice=np.array([0.1121, 0.1135, 0.1155, 0.1111, 0.1115, 0.1118, 0.1111, 0.1079, 0.1019]), num_inference_steps=1, @@ -451,7 +452,7 @@ def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E4_B2_M1(self): def test_marigold_depth_einstein_f16_cuda_G0_S1_P512_E1_B1_M0(self): self._test_marigold_depth( is_fp16=True, - device="cuda", + device=torch_device, generator_seed=0, expected_slice=np.array([0.2671, 0.2690, 0.2720, 0.2659, 0.2676, 0.2739, 0.2664, 0.2686, 0.2573]), num_inference_steps=1,
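The Marigold depth changes above all flow through one parametrized helper, so dropping the hard-coded "cuda" strings and the `if device == "cuda"` offload branch makes the whole matrix of slow tests runnable on any supported accelerator. A condensed sketch of that shape, with illustrative names rather than the exact test code:

import torch

from diffusers.utils.testing_utils import torch_device

def _run_integration_case(pipe_cls, model_id, is_fp16, generator_seed):
    # dtype, offload target, and generator all follow the detected accelerator
    kwargs = {"variant": "fp16", "torch_dtype": torch.float16} if is_fp16 else {}
    pipe = pipe_cls.from_pretrained(model_id, **kwargs)
    pipe.enable_model_cpu_offload(device=torch_device)  # no per-device branch needed
    pipe.set_progress_bar_config(disable=None)
    generator = torch.Generator(device=torch_device).manual_seed(generator_seed)
    return pipe, generator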
diff --git a/tests/pipelines/marigold/test_marigold_normals.py b/tests/pipelines/marigold/test_marigold_normals.py index c86c600be8e5..1b3a8576dde6 100644 --- a/tests/pipelines/marigold/test_marigold_normals.py +++ b/tests/pipelines/marigold/test_marigold_normals.py @@ -32,11 +32,13 @@ UNet2DConditionModel, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, floats_tensor, load_image, - require_torch_gpu, + require_torch_accelerator, slow, + torch_device, ) from ..test_pipelines_common import PipelineTesterMixin @@ -285,17 +287,17 @@ def test_marigold_depth_dummy_no_processing_resolution(self): @slow -@require_torch_gpu +@require_torch_accelerator class MarigoldNormalsPipelineIntegrationTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def _test_marigold_normals( self, @@ -314,8 +316,7 @@ def _test_marigold_normals( from_pretrained_kwargs["torch_dtype"] = torch.float16 pipe = MarigoldNormalsPipeline.from_pretrained(model_id, **from_pretrained_kwargs) - if device == "cuda": - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device=device).manual_seed(generator_seed) @@ -355,7 +356,7 @@ def test_marigold_normals_einstein_f32_cpu_G0_S1_P32_E1_B1_M1(self): def test_marigold_normals_einstein_f32_cuda_G0_S1_P768_E1_B1_M1(self): self._test_marigold_normals( is_fp16=False, - device="cuda", + device=torch_device, generator_seed=0, expected_slice=np.array([0.7980, 0.7952, 0.7914, 0.7931, 0.7871, 0.7816, 0.7844, 0.7710, 0.7601]), num_inference_steps=1, @@ -368,7 +369,7 @@ def test_marigold_normals_einstein_f32_cuda_G0_S1_P768_E1_B1_M1(self): def test_marigold_normals_einstein_f16_cuda_G0_S1_P768_E1_B1_M1(self): self._test_marigold_normals( is_fp16=True, - device="cuda", + device=torch_device, generator_seed=0, expected_slice=np.array([0.7979, 0.7949, 0.7915, 0.7930, 0.7871, 0.7817, 0.7842, 0.7710, 0.7603]), num_inference_steps=1, @@ -381,7 +382,7 @@ def test_marigold_normals_einstein_f16_cuda_G0_S1_P768_E1_B1_M1(self): def test_marigold_normals_einstein_f16_cuda_G2024_S1_P768_E1_B1_M1(self): self._test_marigold_normals( is_fp16=True, - device="cuda", + device=torch_device, generator_seed=2024, expected_slice=np.array([0.8428, 0.8428, 0.8433, 0.8369, 0.8325, 0.8315, 0.8271, 0.8135, 0.8057]), num_inference_steps=1, @@ -394,7 +395,7 @@ def test_marigold_normals_einstein_f16_cuda_G2024_S1_P768_E1_B1_M1(self): def test_marigold_normals_einstein_f16_cuda_G0_S2_P768_E1_B1_M1(self): self._test_marigold_normals( is_fp16=True, - device="cuda", + device=torch_device, generator_seed=0, expected_slice=np.array([0.7095, 0.7095, 0.7104, 0.7070, 0.7051, 0.7061, 0.7017, 0.6938, 0.6914]), num_inference_steps=2, @@ -407,7 +408,7 @@ def test_marigold_normals_einstein_f16_cuda_G0_S2_P768_E1_B1_M1(self): def test_marigold_normals_einstein_f16_cuda_G0_S1_P512_E1_B1_M1(self): self._test_marigold_normals( is_fp16=True, - device="cuda", + device=torch_device, generator_seed=0, expected_slice=np.array([0.7168, 0.7163, 0.7163, 0.7080, 0.7061, 0.7046, 0.7031, 0.7007, 0.6987]), num_inference_steps=1, @@ -420,7 +421,7 @@ def 
test_marigold_normals_einstein_f16_cuda_G0_S1_P512_E1_B1_M1(self): def test_marigold_normals_einstein_f16_cuda_G0_S1_P768_E3_B1_M1(self): self._test_marigold_normals( is_fp16=True, - device="cuda", + device=torch_device, generator_seed=0, expected_slice=np.array([0.7114, 0.7124, 0.7144, 0.7085, 0.7070, 0.7080, 0.7051, 0.6958, 0.6924]), num_inference_steps=1, @@ -434,7 +435,7 @@ def test_marigold_normals_einstein_f16_cuda_G0_S1_P768_E3_B1_M1(self): def test_marigold_normals_einstein_f16_cuda_G0_S1_P768_E4_B2_M1(self): self._test_marigold_normals( is_fp16=True, - device="cuda", + device=torch_device, generator_seed=0, expected_slice=np.array([0.7412, 0.7441, 0.7490, 0.7383, 0.7388, 0.7437, 0.7329, 0.7271, 0.7300]), num_inference_steps=1, @@ -448,7 +449,7 @@ def test_marigold_normals_einstein_f16_cuda_G0_S1_P768_E4_B2_M1(self): def test_marigold_normals_einstein_f16_cuda_G0_S1_P512_E1_B1_M0(self): self._test_marigold_normals( is_fp16=True, - device="cuda", + device=torch_device, generator_seed=0, expected_slice=np.array([0.7188, 0.7144, 0.7134, 0.7178, 0.7207, 0.7222, 0.7231, 0.7041, 0.6987]), num_inference_steps=1, diff --git a/tests/pipelines/mochi/test_mochi.py b/tests/pipelines/mochi/test_mochi.py index bbcf6d210ce5..3517bb4ce8f1 100644 --- a/tests/pipelines/mochi/test_mochi.py +++ b/tests/pipelines/mochi/test_mochi.py @@ -22,9 +22,10 @@ from diffusers import AutoencoderKLMochi, FlowMatchEulerDiscreteScheduler, MochiPipeline, MochiTransformer3DModel from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -261,25 +262,25 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2): @slow -@require_torch_gpu +@require_torch_accelerator class MochiPipelineIntegrationTests(unittest.TestCase): prompt = "A painting of a squirrel eating a burger." 
def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_mochi(self): generator = torch.Generator("cpu").manual_seed(0) pipe = MochiPipeline.from_pretrained("genmo/mochi-1-preview", torch_dtype=torch.float16) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) prompt = self.prompt videos = pipe( diff --git a/tests/pipelines/pag/test_pag_sd.py b/tests/pipelines/pag/test_pag_sd.py index 3979bb170e0b..73aaacb58e0e 100644 --- a/tests/pipelines/pag/test_pag_sd.py +++ b/tests/pipelines/pag/test_pag_sd.py @@ -30,8 +30,9 @@ UNet2DConditionModel, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -280,7 +281,7 @@ def test_pag_inference(self): @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionPAGPipelineIntegrationTests(unittest.TestCase): pipeline_class = StableDiffusionPAGPipeline repo_id = "stable-diffusion-v1-5/stable-diffusion-v1-5" @@ -288,12 +289,12 @@ class StableDiffusionPAGPipelineIntegrationTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", seed=1, guidance_scale=7.0): generator = torch.Generator(device=generator_device).manual_seed(seed) @@ -310,7 +311,7 @@ def get_inputs(self, device, generator_device="cpu", seed=1, guidance_scale=7.0) def test_pag_cfg(self): pipeline = AutoPipelineForText2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16) - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device) @@ -328,7 +329,7 @@ def test_pag_cfg(self): def test_pag_uncond(self): pipeline = AutoPipelineForText2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16) - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device, guidance_scale=0.0) diff --git a/tests/pipelines/pag/test_pag_sd3_img2img.py b/tests/pipelines/pag/test_pag_sd3_img2img.py index bffcd254e2c5..592e94953ecc 100644 --- a/tests/pipelines/pag/test_pag_sd3_img2img.py +++ b/tests/pipelines/pag/test_pag_sd3_img2img.py @@ -16,10 +16,11 @@ StableDiffusion3PAGImg2ImgPipeline, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, floats_tensor, load_image, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -193,7 +194,7 @@ def test_pag_inference(self): @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusion3PAGImg2ImgPipelineIntegrationTests(unittest.TestCase): pipeline_class = StableDiffusion3PAGImg2ImgPipeline repo_id = "stabilityai/stable-diffusion-3-medium-diffusers" @@ -201,12 +202,12 @@ class StableDiffusion3PAGImg2ImgPipelineIntegrationTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + 
backend_empty_cache(torch_device) def get_inputs( self, device, generator_device="cpu", dtype=torch.float32, seed=0, guidance_scale=7.0, pag_scale=0.7 @@ -233,7 +234,7 @@ def test_pag_cfg(self): pipeline = AutoPipelineForImage2Image.from_pretrained( self.repo_id, enable_pag=True, torch_dtype=torch.float16, pag_applied_layers=["blocks.17"] ) - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device) diff --git a/tests/pipelines/pag/test_pag_sd_img2img.py b/tests/pipelines/pag/test_pag_sd_img2img.py index ec8cde23c31d..38c5c5c83595 100644 --- a/tests/pipelines/pag/test_pag_sd_img2img.py +++ b/tests/pipelines/pag/test_pag_sd_img2img.py @@ -32,10 +32,11 @@ UNet2DConditionModel, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, floats_tensor, load_image, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -212,7 +213,7 @@ def test_pag_inference(self): @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionPAGImg2ImgPipelineIntegrationTests(unittest.TestCase): pipeline_class = StableDiffusionPAGImg2ImgPipeline repo_id = "Jiali/stable-diffusion-1.5" @@ -220,12 +221,12 @@ class StableDiffusionPAGImg2ImgPipelineIntegrationTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) @@ -247,7 +248,7 @@ def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0 def test_pag_cfg(self): pipeline = AutoPipelineForImage2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16) - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device) @@ -265,10 +266,10 @@ def test_pag_cfg(self): def test_pag_uncond(self): pipeline = AutoPipelineForImage2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16) - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.set_progress_bar_config(disable=None) - inputs = self.get_inputs(torch_device, guidance_scale=0.0) + inputs = self.get_inputs(torch_device) image = pipeline(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() diff --git a/tests/pipelines/pag/test_pag_sd_inpaint.py b/tests/pipelines/pag/test_pag_sd_inpaint.py index cd175c600d47..64fc218d600e 100644 --- a/tests/pipelines/pag/test_pag_sd_inpaint.py +++ b/tests/pipelines/pag/test_pag_sd_inpaint.py @@ -30,10 +30,11 @@ UNet2DConditionModel, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, floats_tensor, load_image, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -246,7 +247,7 @@ def test_pag_inference(self): @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionPAGPipelineIntegrationTests(unittest.TestCase): pipeline_class = StableDiffusionPAGInpaintPipeline repo_id = "runwayml/stable-diffusion-v1-5" @@ -254,12 +255,12 @@ class StableDiffusionPAGPipelineIntegrationTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - 
torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0): img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" @@ -284,7 +285,7 @@ def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0) def test_pag_cfg(self): pipeline = AutoPipelineForInpainting.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16) - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device) @@ -302,7 +303,7 @@ def test_pag_cfg(self): def test_pag_uncond(self): pipeline = AutoPipelineForInpainting.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16) - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device, guidance_scale=0.0) diff --git a/tests/pipelines/pixart_alpha/test_pixart.py b/tests/pipelines/pixart_alpha/test_pixart.py index e7039c61a448..e56faeda5526 100644 --- a/tests/pipelines/pixart_alpha/test_pixart.py +++ b/tests/pipelines/pixart_alpha/test_pixart.py @@ -28,9 +28,10 @@ PixArtTransformer2DModel, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -326,7 +327,7 @@ def test_inference_batch_single_identical(self): @slow -@require_torch_gpu +@require_torch_accelerator class PixArtAlphaPipelineIntegrationTests(unittest.TestCase): ckpt_id_1024 = "PixArt-alpha/PixArt-XL-2-1024-MS" ckpt_id_512 = "PixArt-alpha/PixArt-XL-2-512x512" @@ -335,18 +336,18 @@ class PixArtAlphaPipelineIntegrationTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_pixart_1024(self): generator = torch.Generator("cpu").manual_seed(0) pipe = PixArtAlphaPipeline.from_pretrained(self.ckpt_id_1024, torch_dtype=torch.float16) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) prompt = self.prompt image = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").images @@ -361,7 +362,7 @@ def test_pixart_512(self): generator = torch.Generator("cpu").manual_seed(0) pipe = PixArtAlphaPipeline.from_pretrained(self.ckpt_id_512, torch_dtype=torch.float16) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) prompt = self.prompt @@ -377,7 +378,7 @@ def test_pixart_1024_without_resolution_binning(self): generator = torch.manual_seed(0) pipe = PixArtAlphaPipeline.from_pretrained(self.ckpt_id_1024, torch_dtype=torch.float16) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) prompt = self.prompt height, width = 1024, 768 @@ -411,7 +412,7 @@ def test_pixart_512_without_resolution_binning(self): generator = torch.manual_seed(0) pipe = PixArtAlphaPipeline.from_pretrained(self.ckpt_id_512, torch_dtype=torch.float16) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) prompt = 
self.prompt height, width = 512, 768 diff --git a/tests/pipelines/pixart_sigma/test_pixart.py b/tests/pipelines/pixart_sigma/test_pixart.py index a92e99366ee3..eac95f214aeb 100644 --- a/tests/pipelines/pixart_sigma/test_pixart.py +++ b/tests/pipelines/pixart_sigma/test_pixart.py @@ -28,9 +28,10 @@ PixArtTransformer2DModel, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -355,7 +356,7 @@ def test_fused_qkv_projections(self): @slow -@require_torch_gpu +@require_torch_accelerator class PixArtSigmaPipelineIntegrationTests(unittest.TestCase): ckpt_id_1024 = "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS" ckpt_id_512 = "PixArt-alpha/PixArt-Sigma-XL-2-512-MS" @@ -364,18 +365,18 @@ class PixArtSigmaPipelineIntegrationTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_pixart_1024(self): generator = torch.Generator("cpu").manual_seed(0) pipe = PixArtSigmaPipeline.from_pretrained(self.ckpt_id_1024, torch_dtype=torch.float16) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) prompt = self.prompt image = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").images @@ -395,7 +396,7 @@ def test_pixart_512(self): pipe = PixArtSigmaPipeline.from_pretrained( self.ckpt_id_1024, transformer=transformer, torch_dtype=torch.float16 ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) prompt = self.prompt @@ -411,7 +412,7 @@ def test_pixart_1024_without_resolution_binning(self): generator = torch.manual_seed(0) pipe = PixArtSigmaPipeline.from_pretrained(self.ckpt_id_1024, torch_dtype=torch.float16) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) prompt = self.prompt height, width = 1024, 768 @@ -450,7 +451,7 @@ def test_pixart_512_without_resolution_binning(self): pipe = PixArtSigmaPipeline.from_pretrained( self.ckpt_id_1024, transformer=transformer, torch_dtype=torch.float16 ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) prompt = self.prompt height, width = 512, 768 diff --git a/tests/pipelines/sana/test_sana.py b/tests/pipelines/sana/test_sana.py index 21de4e04437a..a894faedb76e 100644 --- a/tests/pipelines/sana/test_sana.py +++ b/tests/pipelines/sana/test_sana.py @@ -22,8 +22,9 @@ from diffusers import AutoencoderDC, FlowMatchEulerDiscreteScheduler, SanaPipeline, SanaTransformer2DModel from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -273,19 +274,19 @@ def test_float16_inference(self): @slow -@require_torch_gpu +@require_torch_accelerator class SanaPipelineIntegrationTests(unittest.TestCase): prompt = "A painting of a squirrel eating a burger." 
def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_sana_1024(self): generator = torch.Generator("cpu").manual_seed(0) @@ -293,7 +294,7 @@ def test_sana_1024(self): pipe = SanaPipeline.from_pretrained( "Efficient-Large-Model/Sana_1600M_1024px_diffusers", torch_dtype=torch.float16 ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) image = pipe( prompt=self.prompt, @@ -319,7 +320,7 @@ def test_sana_512(self): pipe = SanaPipeline.from_pretrained( "Efficient-Large-Model/Sana_1600M_512px_diffusers", torch_dtype=torch.float16 ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) image = pipe( prompt=self.prompt, diff --git a/tests/pipelines/stable_cascade/test_stable_cascade_combined.py b/tests/pipelines/stable_cascade/test_stable_cascade_combined.py index d256deed376c..5ec86f92c3d2 100644 --- a/tests/pipelines/stable_cascade/test_stable_cascade_combined.py +++ b/tests/pipelines/stable_cascade/test_stable_cascade_combined.py @@ -22,7 +22,7 @@ from diffusers import DDPMWuerstchenScheduler, StableCascadeCombinedPipeline from diffusers.models import StableCascadeUNet from diffusers.pipelines.wuerstchen import PaellaVQModel -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_accelerator, torch_device from ..test_pipelines_common import PipelineTesterMixin @@ -205,7 +205,7 @@ def test_stable_cascade(self): np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" - @require_torch_gpu + @require_torch_accelerator def test_offloads(self): pipes = [] components = self.get_dummy_components() @@ -214,12 +214,12 @@ def test_offloads(self): components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_sequential_cpu_offload() + sd_pipe.enable_sequential_cpu_offload(device=torch_device) pipes.append(sd_pipe) components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_model_cpu_offload() + sd_pipe.enable_model_cpu_offload(device=torch_device) pipes.append(sd_pipe) image_slices = [] diff --git a/tests/pipelines/stable_cascade/test_stable_cascade_decoder.py b/tests/pipelines/stable_cascade/test_stable_cascade_decoder.py index 07e4244e3c68..e3362aad88ce 100644 --- a/tests/pipelines/stable_cascade/test_stable_cascade_decoder.py +++ b/tests/pipelines/stable_cascade/test_stable_cascade_decoder.py @@ -24,11 +24,12 @@ from diffusers.models import StableCascadeUNet from diffusers.pipelines.wuerstchen import PaellaVQModel from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, load_numpy, load_pt, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, skip_mps, slow, torch_device, @@ -309,25 +310,25 @@ def test_stable_cascade_decoder_single_prompt_multiple_image_embeddings_with_gui @slow -@require_torch_gpu +@require_torch_accelerator class StableCascadeDecoderPipelineIntegrationTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test 
super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_stable_cascade_decoder(self): pipe = StableCascadeDecoderPipeline.from_pretrained( "stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16 ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background." diff --git a/tests/pipelines/stable_cascade/test_stable_cascade_prior.py b/tests/pipelines/stable_cascade/test_stable_cascade_prior.py index 0208224a1d80..27018907f108 100644 --- a/tests/pipelines/stable_cascade/test_stable_cascade_prior.py +++ b/tests/pipelines/stable_cascade/test_stable_cascade_prior.py @@ -24,11 +24,12 @@ from diffusers.models import StableCascadeUNet from diffusers.utils.import_utils import is_peft_available from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, load_numpy, numpy_cosine_similarity_distance, require_peft_backend, - require_torch_gpu, + require_torch_accelerator, skip_mps, slow, torch_device, @@ -277,25 +278,25 @@ def test_stable_cascade_decoder_prompt_embeds(self): @slow -@require_torch_gpu +@require_torch_accelerator class StableCascadePriorPipelineIntegrationTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_stable_cascade_prior(self): pipe = StableCascadePriorPipeline.from_pretrained( "stabilityai/stable-cascade-prior", variant="bf16", torch_dtype=torch.bfloat16 ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background." 
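The same setUp/tearDown shape repeats in every integration suite this commit touches, and the ordering is deliberate: drop leftover Python references first, then flush whatever the backend still holds in its allocator cache. A minimal sketch of the recurring pattern (the class name is a placeholder, not a real test class):

import gc
import unittest

from diffusers.utils.testing_utils import backend_empty_cache, torch_device

class ExamplePipelineIntegrationTests(unittest.TestCase):
    def setUp(self):
        # release Python-level references before freeing cached accelerator memory
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)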
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index ccd5567106d2..0f12fc0c36d6 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -44,6 +44,9 @@ ) from diffusers.utils.testing_utils import ( CaptureLogger, + backend_empty_cache, + backend_max_memory_allocated, + backend_reset_peak_memory_stats, enable_full_determinism, is_torch_compile, load_image, @@ -52,7 +55,7 @@ numpy_cosine_similarity_distance, require_accelerate_version_greater, require_torch_2, - require_torch_gpu, + require_torch_accelerator, require_torch_multi_gpu, run_test_in_subprocess, skip_mps, @@ -850,11 +853,11 @@ def test_pipeline_accept_tuple_type_unet_sample_size(self): @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionPipelineSlowTests(unittest.TestCase): def setUp(self): gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) @@ -956,7 +959,7 @@ def test_stable_diffusion_dpm(self): assert np.abs(image_slice - expected_slice).max() < 3e-3 def test_stable_diffusion_attention_slicing(self): - torch.cuda.reset_peak_memory_stats() + backend_reset_peak_memory_stats(torch_device) pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) pipe.unet.set_default_attn_processor() pipe = pipe.to(torch_device) @@ -967,8 +970,8 @@ def test_stable_diffusion_attention_slicing(self): inputs = self.get_inputs(torch_device, dtype=torch.float16) image_sliced = pipe(**inputs).images - mem_bytes = torch.cuda.max_memory_allocated() - torch.cuda.reset_peak_memory_stats() + mem_bytes = backend_max_memory_allocated(torch_device) + backend_reset_peak_memory_stats(torch_device) # make sure that less than 3.75 GB is allocated assert mem_bytes < 3.75 * 10**9 @@ -979,7 +982,7 @@ def test_stable_diffusion_attention_slicing(self): image = pipe(**inputs).images # make sure that more than 3.75 GB is allocated - mem_bytes = torch.cuda.max_memory_allocated() + mem_bytes = backend_max_memory_allocated(torch_device) assert mem_bytes > 3.75 * 10**9 max_diff = numpy_cosine_similarity_distance(image_sliced.flatten(), image.flatten()) assert max_diff < 1e-3 @@ -998,8 +1001,8 @@ def test_stable_diffusion_vae_slicing(self): inputs["latents"] = torch.cat([inputs["latents"]] * 4) image_sliced = pipe(**inputs).images - mem_bytes = torch.cuda.max_memory_allocated() - torch.cuda.reset_peak_memory_stats() + mem_bytes = backend_max_memory_allocated(torch_device) + backend_reset_peak_memory_stats(torch_device) # make sure that less than 4 GB is allocated assert mem_bytes < 4e9 @@ -1011,14 +1014,14 @@ def test_stable_diffusion_vae_slicing(self): image = pipe(**inputs).images # make sure that more than 4 GB is allocated - mem_bytes = torch.cuda.max_memory_allocated() + mem_bytes = backend_max_memory_allocated(torch_device) assert mem_bytes > 4e9 # There is a small discrepancy at the image borders vs. a fully batched version. 
max_diff = numpy_cosine_similarity_distance(image_sliced.flatten(), image.flatten()) assert max_diff < 1e-2 def test_stable_diffusion_vae_tiling(self): - torch.cuda.reset_peak_memory_stats() + backend_reset_peak_memory_stats(torch_device) model_id = "CompVis/stable-diffusion-v1-4" pipe = StableDiffusionPipeline.from_pretrained( model_id, variant="fp16", torch_dtype=torch.float16, safety_checker=None @@ -1032,7 +1035,7 @@ def test_stable_diffusion_vae_tiling(self): # enable vae tiling pipe.enable_vae_tiling() - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) generator = torch.Generator(device="cpu").manual_seed(0) output_chunked = pipe( [prompt], @@ -1045,7 +1048,7 @@ def test_stable_diffusion_vae_tiling(self): ) image_chunked = output_chunked.images - mem_bytes = torch.cuda.max_memory_allocated() + mem_bytes = backend_max_memory_allocated(torch_device) # disable vae tiling pipe.disable_vae_tiling() @@ -1138,26 +1141,24 @@ def test_stable_diffusion_low_cpu_mem_usage(self): assert 2 * low_cpu_mem_usage_time < normal_load_time def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() + backend_empty_cache(torch_device) + backend_reset_peak_memory_stats(torch_device) pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() + pipe.enable_sequential_cpu_offload(device=torch_device) inputs = self.get_inputs(torch_device, dtype=torch.float16) _ = pipe(**inputs) - mem_bytes = torch.cuda.max_memory_allocated() + mem_bytes = backend_max_memory_allocated(torch_device) # make sure that less than 2.8 GB is allocated assert mem_bytes < 2.8 * 10**9 def test_stable_diffusion_pipeline_with_model_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() + backend_empty_cache(torch_device) + backend_reset_peak_memory_stats(torch_device) inputs = self.get_inputs(torch_device, dtype=torch.float16) @@ -1171,7 +1172,7 @@ def test_stable_diffusion_pipeline_with_model_offloading(self): pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) outputs = pipe(**inputs) - mem_bytes = torch.cuda.max_memory_allocated() + mem_bytes = backend_max_memory_allocated(torch_device) # With model offloading @@ -1182,16 +1183,15 @@ def test_stable_diffusion_pipeline_with_model_offloading(self): ) pipe.unet.set_default_attn_processor() - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() + backend_empty_cache(torch_device) + backend_reset_peak_memory_stats(torch_device) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device, dtype=torch.float16) outputs_offloaded = pipe(**inputs) - mem_bytes_offloaded = torch.cuda.max_memory_allocated() + mem_bytes_offloaded = backend_max_memory_allocated(torch_device) images = outputs.images offloaded_images = outputs_offloaded.images @@ -1204,13 +1204,12 @@ def test_stable_diffusion_pipeline_with_model_offloading(self): assert module.device == torch.device("cpu") # With attention slicing - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() + backend_empty_cache(torch_device) + 
backend_reset_peak_memory_stats(torch_device) pipe.enable_attention_slicing() _ = pipe(**inputs) - mem_bytes_slicing = torch.cuda.max_memory_allocated() + mem_bytes_slicing = backend_max_memory_allocated(torch_device) assert mem_bytes_slicing < mem_bytes_offloaded assert mem_bytes_slicing < 3 * 10**9 @@ -1225,7 +1224,7 @@ def test_stable_diffusion_textual_inversion(self): ) pipe.load_textual_inversion(a111_file) pipe.load_textual_inversion(a111_file_neg) - pipe.to("cuda") + pipe.to(torch_device) generator = torch.Generator(device="cpu").manual_seed(1) @@ -1242,7 +1241,7 @@ def test_stable_diffusion_textual_inversion(self): def test_stable_diffusion_textual_inversion_with_model_cpu_offload(self): pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons") a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt") @@ -1267,8 +1266,8 @@ def test_stable_diffusion_textual_inversion_with_model_cpu_offload(self): def test_stable_diffusion_textual_inversion_with_sequential_cpu_offload(self): pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - pipe.enable_sequential_cpu_offload() + pipe.enable_sequential_cpu_offload(device=torch_device) pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons") a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt") a111_file_neg = hf_hub_download( @@ -1326,17 +1325,17 @@ def test_stable_diffusion_lcm(self): @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionPipelineCkptTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_download_from_hub(self): ckpt_paths = [ @@ -1347,7 +1346,7 @@ def test_download_from_hub(self): for ckpt_path in ckpt_paths: pipe = StableDiffusionPipeline.from_single_file(ckpt_path, torch_dtype=torch.float16) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.to("cuda") + pipe.to(torch_device) image_out = pipe("test", num_inference_steps=1, output_type="np").images[0] @@ -1363,7 +1362,7 @@ def test_download_local(self): ckpt_filename, config_files={"v1": config_filename}, torch_dtype=torch.float16 ) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.to("cuda") + pipe.to(torch_device) image_out = pipe("test", num_inference_steps=1, output_type="np").images[0] @@ -1371,17 +1370,17 @@ @nightly -@require_torch_gpu +@require_torch_accelerator class StableDiffusionPipelineNightlyTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) @@ -1481,7 +1480,7 @@ class StableDiffusionPipelineDeviceMapTests(unittest.TestCase): def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, generator_device="cpu", seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed)
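The memory-budget tests above all reduce to one reset/run/measure sequence, now phrased entirely through the backend_* helpers so the same assertion works on CUDA and XPU alike. A sketch of that sequence as a hypothetical helper (run_with_memory_budget is not part of the test suite):

from diffusers.utils.testing_utils import (
    backend_empty_cache,
    backend_max_memory_allocated,
    backend_reset_peak_memory_stats,
    torch_device,
)

def run_with_memory_budget(pipe, inputs, budget_bytes):
    # clear the cache and the peak counter, run once, then check the high-water mark
    backend_empty_cache(torch_device)
    backend_reset_peak_memory_stats(torch_device)
    _ = pipe(**inputs)
    mem_bytes = backend_max_memory_allocated(torch_device)
    assert mem_bytes < budget_bytes, f"allocated {mem_bytes} bytes, budget was {budget_bytes}"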
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 7ba0bb5a4a5d..881c2ca849bd 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -35,6 +35,9 @@ UNet2DConditionModel, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, + backend_max_memory_allocated, + backend_reset_peak_memory_stats, enable_full_determinism, floats_tensor, is_torch_compile, @@ -42,7 +45,7 @@ load_numpy, nightly, require_torch_2, - require_torch_gpu, + require_torch_accelerator, run_test_in_subprocess, skip_mps, slow, @@ -393,17 +396,17 @@ def callback_on_step_end(pipe, i, t, callback_kwargs): @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) @@ -506,28 +509,27 @@ def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: assert number_of_steps == 2 def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() + backend_empty_cache(torch_device) + backend_reset_peak_memory_stats(torch_device) pipe = StableDiffusionImg2ImgPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() + pipe.enable_sequential_cpu_offload(device=torch_device) inputs = self.get_inputs(torch_device, dtype=torch.float16) _ = pipe(**inputs) - mem_bytes = torch.cuda.max_memory_allocated() + mem_bytes = backend_max_memory_allocated(torch_device) # make sure that less than 2.2 GB is allocated assert mem_bytes < 2.2 * 10**9 def test_stable_diffusion_pipeline_with_model_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() + + backend_empty_cache(torch_device) + backend_reset_peak_memory_stats(torch_device) inputs = self.get_inputs(torch_device, dtype=torch.float16) @@ -541,7 +543,7 @@ def test_stable_diffusion_pipeline_with_model_offloading(self): pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe(**inputs) - mem_bytes = torch.cuda.max_memory_allocated() + mem_bytes = backend_max_memory_allocated(torch_device) # With model offloading @@ -552,14 +554,13 @@ def test_stable_diffusion_pipeline_with_model_offloading(self): torch_dtype=torch.float16, ) - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() + backend_empty_cache(torch_device) + backend_reset_peak_memory_stats(torch_device) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) _ = pipe(**inputs) - mem_bytes_offloaded = torch.cuda.max_memory_allocated() + mem_bytes_offloaded = 
backend_max_memory_allocated(torch_device) assert mem_bytes_offloaded < mem_bytes for module in pipe.text_encoder, pipe.unet, pipe.vae: @@ -656,17 +657,17 @@ def test_img2img_compile(self): @nightly -@require_torch_gpu +@require_torch_accelerator class StableDiffusionImg2ImgPipelineNightlyTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index ff04ea2cfc5d..81bf9231b82f 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -37,6 +37,9 @@ UNet2DConditionModel, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, + backend_max_memory_allocated, + backend_reset_peak_memory_stats, enable_full_determinism, floats_tensor, is_torch_compile, @@ -44,7 +47,7 @@ load_numpy, nightly, require_torch_2, - require_torch_gpu, + require_torch_accelerator, run_test_in_subprocess, slow, torch_device, @@ -595,7 +598,7 @@ def test_stable_diffusion_inpaint_euler(self): @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase): def setUp(self): super().setUp() @@ -603,7 +606,7 @@ def setUp(self): def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) @@ -697,21 +700,20 @@ def test_stable_diffusion_inpaint_k_lms(self): assert np.abs(expected_slice - image_slice).max() < 6e-3 def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() + backend_empty_cache(torch_device) + backend_reset_peak_memory_stats(torch_device) pipe = StableDiffusionInpaintPipeline.from_pretrained( "botp/stable-diffusion-v1-5-inpainting", safety_checker=None, torch_dtype=torch.float16 ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() + pipe.enable_sequential_cpu_offload(device=torch_device) inputs = self.get_inputs(torch_device, dtype=torch.float16) _ = pipe(**inputs) - mem_bytes = torch.cuda.max_memory_allocated() + mem_bytes = backend_max_memory_allocated(torch_device) # make sure that less than 2.2 GB is allocated assert mem_bytes < 2.2 * 10**9 @@ -786,7 +788,7 @@ def test_stable_diffusion_simple_inpaint_ddim(self): @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionInpaintPipelineAsymmetricAutoencoderKLSlowTests(unittest.TestCase): def setUp(self): super().setUp() @@ -794,7 +796,7 @@ def setUp(self): def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) @@ -900,9 +902,8 @@ def test_stable_diffusion_inpaint_k_lms(self): assert np.abs(expected_slice - image_slice).max() < 6e-3 def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() + backend_empty_cache(torch_device) + backend_reset_peak_memory_stats(torch_device) vae = AsymmetricAutoencoderKL.from_pretrained( "cross-attention/asymmetric-autoencoder-kl-x-1-5", torch_dtype=torch.float16 @@ -913,12 +914,12 @@ def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self): pipe.vae = vae pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() + pipe.enable_sequential_cpu_offload(device=torch_device) inputs = self.get_inputs(torch_device, dtype=torch.float16) _ = pipe(**inputs) - mem_bytes = torch.cuda.max_memory_allocated() + mem_bytes = backend_max_memory_allocated(torch_device) # make sure that less than 2.45 GB is allocated assert mem_bytes < 2.45 * 10**9 @@ -1002,7 +1003,7 @@ def test_download_local(self): pipe = StableDiffusionInpaintPipeline.from_single_file(filename, torch_dtype=torch.float16) pipe.vae = vae pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.to("cuda") + pipe.to(torch_device) inputs = self.get_inputs(torch_device) inputs["num_inference_steps"] = 1 @@ -1012,17 +1013,17 @@ @nightly -@require_torch_gpu +@require_torch_accelerator class StableDiffusionInpaintPipelineNightlyTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed)
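Passing device=torch_device to the offload entry points is what actually removes the CUDA assumption: both offload modes default to "cuda" internally, so the explicit argument routes the offload hooks to whichever accelerator the suite detected. A short sketch with a placeholder checkpoint id:

import torch

from diffusers import DiffusionPipeline
from diffusers.utils.testing_utils import torch_device

pipe = DiffusionPipeline.from_pretrained("some/checkpoint", torch_dtype=torch.float16)
# either offload mode accepts an explicit target; torch_device may be cuda, xpu, ...
pipe.enable_sequential_cpu_offload(device=torch_device)
# or: pipe.enable_model_cpu_offload(device=torch_device)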
         assert np.abs(expected_slice - image_slice).max() < 6e-3
 
     def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         vae = AsymmetricAutoencoderKL.from_pretrained(
             "cross-attention/asymmetric-autoencoder-kl-x-1-5", torch_dtype=torch.float16
@@ -913,12 +914,12 @@ def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self):
         pipe.vae = vae
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
+        pipe.enable_sequential_cpu_offload(device=torch_device)
 
         inputs = self.get_inputs(torch_device, dtype=torch.float16)
         _ = pipe(**inputs)
 
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # make sure that less than 2.45 GB is allocated
         assert mem_bytes < 2.45 * 10**9
 
@@ -1002,7 +1003,7 @@ def test_download_local(self):
         pipe = StableDiffusionInpaintPipeline.from_single_file(filename, torch_dtype=torch.float16)
         pipe.vae = vae
         pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.to("cuda")
+        pipe.to(torch_device)
 
         inputs = self.get_inputs(torch_device)
         inputs["num_inference_steps"] = 1
@@ -1012,17 +1013,17 @@ def test_download_local(self):
 
 
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionInpaintPipelineNightlyTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
index b9b061c060c0..05608e82b10b 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
@@ -33,10 +33,13 @@
 )
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -269,17 +272,17 @@ def callback_no_cfg(pipe, i, t, callback_kwargs):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionInstructPix2PixPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, seed=0):
         generator = torch.manual_seed(seed)
@@ -387,21 +390,20 @@ def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None:
         assert number_of_steps == 3
 
     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
             "timbrooks/instruct-pix2pix", safety_checker=None, torch_dtype=torch.float16
         )
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
+        pipe.enable_sequential_cpu_offload(device=torch_device)
 
         inputs = self.get_inputs()
         _ = pipe(**inputs)
 
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # make sure that less than 2.2 GB is allocated
         assert mem_bytes < 2.2 * 10**9
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
index e7114d19e208..7f0dde2e7f32 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
@@ -34,12 +34,13 @@
 from diffusers.utils.testing_utils import (
     CaptureLogger,
     backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     load_numpy,
     nightly,
     numpy_cosine_similarity_distance,
     require_torch_accelerator,
-    require_torch_gpu,
     skip_mps,
     slow,
     torch_device,
@@ -321,9 +322,8 @@ def tearDown(self):
         backend_empty_cache(torch_device)
 
     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
-        _generator_device = "cpu" if not generator_device.startswith("cuda") else "cuda"
         if not str(device).startswith("mps"):
-            generator = torch.Generator(device=_generator_device).manual_seed(seed)
+            generator = torch.Generator(device=generator_device).manual_seed(seed)
         else:
             generator = torch.manual_seed(seed)
 
@@ -352,9 +352,9 @@ def test_stable_diffusion_default_ddim(self):
         expected_slice = np.array([0.49493, 0.47896, 0.40798, 0.54214, 0.53212, 0.48202, 0.47656, 0.46329, 0.48506])
         assert np.abs(image_slice - expected_slice).max() < 7e-3
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_stable_diffusion_attention_slicing(self):
-        torch.cuda.reset_peak_memory_stats()
+        backend_reset_peak_memory_stats(torch_device)
         pipe = StableDiffusionPipeline.from_pretrained(
             "stabilityai/stable-diffusion-2-base", torch_dtype=torch.float16
         )
@@ -366,9 +366,9 @@ def test_stable_diffusion_attention_slicing(self):
         pipe.enable_attention_slicing()
         inputs = self.get_inputs(torch_device, dtype=torch.float16)
         image_sliced = pipe(**inputs).images
-
-        mem_bytes = torch.cuda.max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+
+        mem_bytes = backend_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
         # make sure that less than 3.3 GB is allocated
         assert mem_bytes < 3.3 * 10**9
 
@@ -377,9 +377,9 @@ def test_stable_diffusion_attention_slicing(self):
         pipe.unet.set_default_attn_processor()
         inputs = self.get_inputs(torch_device, dtype=torch.float16)
         image = pipe(**inputs).images
-
+
         # make sure that more than 3.3 GB is allocated
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         assert mem_bytes > 3.3 * 10**9
         max_diff = numpy_cosine_similarity_distance(image.flatten(), image_sliced.flatten())
         assert max_diff < 5e-3
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
index 01a0a3abe4ee..d89aeb7eff17 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
@@ -37,6 +37,7 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
@@ -44,7 +45,7 @@
     nightly,
     require_accelerate_version_greater,
     require_accelerator,
-    require_torch_gpu,
+    require_torch_accelerator,
     skip_mps,
     slow,
     torch_device,
@@ -369,17 +370,17 @@ def test_inference_batch_single_identical(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionDepth2ImgPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=device).manual_seed(seed)
@@ -416,17 +417,17 @@ def test_stable_diffusion_depth2img_pipeline_default(self):
 
 
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionImg2ImgPipelineNightlyTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=device).manual_seed(seed)
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
index 1cb03ddd96d7..8f3cb01490a7 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py
@@ -33,12 +33,13 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
     nightly,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     torch_device,
 )
@@ -292,18 +293,18 @@ def test_inversion_dpm(self):
         self.assertLessEqual(max_diff, 1e-3)
 
 
-@require_torch_gpu
+@require_torch_accelerator
 @nightly
 class StableDiffusionDiffEditPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     @classmethod
     def setUpClass(cls):
@@ -324,7 +325,7 @@ def test_stable_diffusion_diffedit_full(self):
         pipe.scheduler.clip_sample = True
 
         pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)
 
         source_prompt = "a bowl of fruit"
@@ -370,17 +371,17 @@ def test_stable_diffusion_diffedit_full(self):
 
 
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionDiffEditPipelineNightlyTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     @classmethod
     def setUpClass(cls):
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
index b99a1816456e..f1ef15f28a3c 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
@@ -24,11 +24,13 @@
 from diffusers import AutoencoderKL, PNDMScheduler, StableDiffusionInpaintPipeline, UNet2DConditionModel
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
     load_image,
     load_numpy,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -154,19 +156,19 @@ def test_inference_batch_single_identical(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_stable_diffusion_inpaint_pipeline(self):
         init_image = load_image(
@@ -241,9 +243,8 @@ def test_stable_diffusion_inpaint_pipeline_fp16(self):
         assert np.abs(expected_image - image).max() < 5e-1
 
     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         init_image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
@@ -263,7 +264,7 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
         )
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
+        pipe.enable_sequential_cpu_offload(device=torch_device)
 
         prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
index 134175bdaffe..519542181b69 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
@@ -31,11 +31,12 @@
 )
 from diffusers.schedulers import KarrasDiffusionSchedulers
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
     load_numpy,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -280,29 +281,29 @@ def test_float16_inference(self):
         super().test_float16_inference(expected_max_diff=5e-1)
 
 
-@require_torch_gpu
+@require_torch_accelerator
 @slow
 class StableDiffusionLatentUpscalePipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_latent_upscaler_fp16(self):
         generator = torch.manual_seed(33)
 
         pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
-        pipe.to("cuda")
+        pipe.to(torch_device)
 
         upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(
             "stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16
         )
-        upscaler.to("cuda")
+        upscaler.to(torch_device)
 
         prompt = "a photo of an astronaut high resolution, unreal engine, ultra realistic"
 
@@ -328,7 +329,7 @@ def test_latent_upscaler_fp16_image(self):
         upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(
             "stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16
         )
-        upscaler.to("cuda")
+        upscaler.to(torch_device)
 
         prompt = "the temple of fire by Ross Tran and Gerardo Dottori, oil on canvas"
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
index d69d1c492548..13450cb43114 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
@@ -31,11 +31,14 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     load_numpy,
     numpy_cosine_similarity_distance,
     require_accelerator,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -49,13 +52,13 @@ def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     @property
     def dummy_cond_unet(self):
@@ -258,19 +261,19 @@ def test_stable_diffusion_v_pred_fp16(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_stable_diffusion_v_pred_default(self):
         sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2")
@@ -357,7 +360,7 @@ def test_stable_diffusion_v_pred_dpm(self):
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
     def test_stable_diffusion_attention_slicing_v_pred(self):
-        torch.cuda.reset_peak_memory_stats()
+        backend_reset_peak_memory_stats(torch_device)
         model_id = "stabilityai/stable-diffusion-2"
         pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
         pipe.to(torch_device)
@@ -373,8 +376,8 @@ def test_stable_diffusion_attention_slicing_v_pred(self):
         )
         image_chunked = output_chunked.images
 
-        mem_bytes = torch.cuda.max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        mem_bytes = backend_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
         # make sure that less than 5.5 GB is allocated
         assert mem_bytes < 5.5 * 10**9
 
@@ -385,7 +388,7 @@ def test_stable_diffusion_attention_slicing_v_pred(self):
         image = output.images
 
         # make sure that more than 3.0 GB is allocated
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         assert mem_bytes > 3 * 10**9
         max_diff = numpy_cosine_similarity_distance(image.flatten(), image_chunked.flatten())
         assert max_diff < 1e-3
@@ -421,7 +424,7 @@ def test_stable_diffusion_text2img_pipeline_unflawed(self):
         pipe.scheduler = DDIMScheduler.from_config(
             pipe.scheduler.config, timestep_spacing="trailing", rescale_betas_zero_snr=True
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)
 
         prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k"
@@ -466,7 +469,7 @@ def test_download_local(self):
         pipe = StableDiffusionPipeline.from_single_file(filename, torch_dtype=torch.float16)
         pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
 
         image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]
 
@@ -530,20 +533,19 @@ def test_stable_diffusion_low_cpu_mem_usage_v_pred(self):
         assert 2 * low_cpu_mem_usage_time < normal_load_time
 
     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading_v_pred(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         pipeline_id = "stabilityai/stable-diffusion-2"
         prompt = "Andromeda galaxy in a bottle"
 
         pipeline = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16)
         pipeline.enable_attention_slicing(1)
-        pipeline.enable_sequential_cpu_offload()
+        pipeline.enable_sequential_cpu_offload(device=torch_device)
 
         generator = torch.manual_seed(0)
         _ = pipeline(prompt, generator=generator, num_inference_steps=5)
 
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # make sure that less than 2.8 GB is allocated
         assert mem_bytes < 2.8 * 10**9
diff --git a/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py b/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py
index 2a1e691e9e8f..b38935e12eba 100644
--- a/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py
+++ b/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py
@@ -35,12 +35,13 @@
 from diffusers.utils import logging
 from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
     load_numpy,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -595,17 +596,17 @@ def test_inference_batch_single_identical(
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionAdapterPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_stable_diffusion_adapter_depth_sd_v15(self):
         adapter_model = "TencentARC/t2iadapter_depth_sd15v2"
diff --git a/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py
index 7a3b0f70ccb1..6d65e3bf8e85 100644
--- a/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py
+++ b/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py
@@ -30,13 +30,16 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
     load_image,
     load_numpy,
     nightly,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -162,17 +165,17 @@ def test_inference_batch_single_identical(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionImageVariationPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -256,37 +259,36 @@ def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None:
         assert number_of_steps == inputs["num_inference_steps"]
 
     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         pipe = StableDiffusionImageVariationPipeline.from_pretrained(
             "lambdalabs/sd-image-variations-diffusers", safety_checker=None, torch_dtype=torch.float16
         )
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
+        pipe.enable_sequential_cpu_offload(device=torch_device)
 
         inputs = self.get_inputs(torch_device, dtype=torch.float16)
         _ = pipe(**inputs)
 
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # make sure that less than 2.6 GB is allocated
         assert mem_bytes < 2.6 * 10**9
 
 
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionImageVariationPipelineNightlyTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py
index db924c72437c..cca2521fcc30 100644
--- a/tests/pipelines/test_pipelines.py
+++ b/tests/pipelines/test_pipelines.py
@@ -1369,11 +1369,11 @@ def test_pipe_false_offload_warn(self):
             feature_extractor=self.dummy_extractor,
         )
 
-        sd.enable_model_cpu_offload()
+        sd.enable_model_cpu_offload(device=torch_device)
 
         logger = logging.get_logger("diffusers.pipelines.pipeline_utils")
         with CaptureLogger(logger) as cap_logger:
-            sd.to("cuda")
+            sd.to(torch_device)
 
         assert "It is strongly recommended against doing so" in str(cap_logger)
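
A note before the next patch: every helper this series swaps in (backend_empty_cache, backend_reset_peak_memory_stats, backend_max_memory_allocated, ...) routes through per-device dispatch tables in src/diffusers/utils/testing_utils.py. The sketch below shows roughly how such a table is consumed; it is a simplified illustration rather than the verbatim library code, and the non-callable guard is an assumption inferred from the plain 0/None values the tables register for CPU and MPS:

    # Minimal sketch of the dispatch pattern behind backend_empty_cache() and
    # friends. Table values are either callables (torch.cuda.empty_cache) or
    # plain fallback values (None, 0) for backends without the matching API.
    from typing import Dict

    import torch

    BACKEND_EMPTY_CACHE: Dict[str, object] = {
        "cuda": torch.cuda.empty_cache,
        "cpu": None,  # nothing to flush on CPU
        "default": None,
    }

    def _dispatch(device: str, table: Dict[str, object], *args, **kwargs):
        fn = table.get(device, table["default"])
        if not callable(fn):
            # A plain value was registered for this backend (e.g. 0 bytes).
            return fn
        return fn(*args, **kwargs)

    def backend_empty_cache(device: str):
        # No-op on backends that registered None instead of a flush function.
        return _dispatch(device, BACKEND_EMPTY_CACHE)

This is why the converted tests above can call the same helper unconditionally on CUDA, XPU, MPS, or CPU.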

From b813f166ab8f92ffe2e7dc77e8aeaeb959821acf Mon Sep 17 00:00:00 2001
From: "Lin, Fanli" <fanli.lin@intel.com>
Date: Wed, 8 Jan 2025 23:00:42 -0800
Subject: [PATCH 21/33] bug fix

---
 src/diffusers/utils/testing_utils.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py
index fa18e1606997..de0c65e2e478 100644
--- a/src/diffusers/utils/testing_utils.py
+++ b/src/diffusers/utils/testing_utils.py
@@ -1081,20 +1081,28 @@ def _is_torch_fp64_available(device):
     "cuda": torch.cuda.manual_seed,
     "xpu": torch.xpu.manual_seed,
     "cpu": torch.manual_seed,
+    "mps": torch.mps.manual_seed,
     "default": torch.manual_seed,
 }
 
 BACKEND_RESET_PEAK_MEMORY_STATS = {
     "cuda": torch.cuda.reset_peak_memory_stats,
     "xpu": getattr(torch.xpu, "reset_peak_memory_stats", None),
+    "cpu": None,
+    "mps": None,
     "default": None,
 }
 
 BACKEND_RESET_MAX_MEMORY_ALLOCATED = {
     "cuda": torch.cuda.reset_max_memory_allocated,
+    "xpu": None,
+    "cpu": None,
+    "mps": None,
     "default": None,
 }
 
 BACKEND_MAX_MEMORY_ALLOCATED = {
     "cuda": torch.cuda.max_memory_allocated,
     "xpu": getattr(torch.xpu, "max_memory_allocated", None),
+    "cpu": 0,
+    "mps": 0,
     "default": 0,
 }
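
With the tables filled in, a converted slow test has the same shape on every accelerator. An illustrative sketch of the target pattern follows; the checkpoint id and the 3 GB bound are placeholders, and only the helper and decorator names mirror this series:

    import gc
    import unittest

    import torch

    from diffusers import DiffusionPipeline
    from diffusers.utils.testing_utils import (
        backend_empty_cache,
        backend_max_memory_allocated,
        backend_reset_peak_memory_stats,
        require_torch_accelerator,
        slow,
        torch_device,
    )

    @slow
    @require_torch_accelerator
    class ExamplePipelineSlowTests(unittest.TestCase):
        def setUp(self):
            super().setUp()
            gc.collect()
            backend_empty_cache(torch_device)

        def tearDown(self):
            super().tearDown()
            gc.collect()
            backend_empty_cache(torch_device)

        def test_sequential_cpu_offload_memory(self):
            backend_empty_cache(torch_device)
            backend_reset_peak_memory_stats(torch_device)

            pipe = DiffusionPipeline.from_pretrained("some/placeholder-checkpoint", torch_dtype=torch.float16)
            pipe.enable_sequential_cpu_offload(device=torch_device)
            _ = pipe("a prompt", num_inference_steps=2, output_type="np")

            # Backends without memory stats dispatch to the plain value 0, so
            # the bound still holds there instead of raising.
            mem_bytes = backend_max_memory_allocated(torch_device)
            assert mem_bytes < 3 * 10**9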

From f6ae0566ff1985e8b7936e2840e8f0a4c96e1fca Mon Sep 17 00:00:00 2001
From: faaany <fanli.lin@intel.com>
Date: Thu, 9 Jan 2025 10:46:29 +0000
Subject: [PATCH 22/33] enable on xpu

---
 src/diffusers/utils/testing_utils.py          |  2 +
 .../unets/test_models_unet_2d_condition.py    | 55 ++++++++++---------
 .../controlnet/test_controlnet_sdxl.py        |  4 +-
 tests/pipelines/flux/test_pipeline_flux.py    |  7 ++-
 tests/pipelines/pag/test_pag_sdxl_img2img.py  | 13 +++--
 tests/pipelines/pag/test_pag_sdxl_inpaint.py  | 13 +++--
 .../stable_diffusion/test_stable_diffusion.py |  2 +-
 .../test_stable_diffusion_upscale.py          | 26 +++++----
 .../test_stable_diffusion_xl.py               |  8 +--
 .../test_stable_diffusion_xl_img2img.py       | 14 ++---
 .../test_stable_diffusion_xl_inpaint.py       |  8 +--
 .../test_stable_diffusion_xl_k_diffusion.py   |  8 +--
 .../test_stable_video_diffusion.py            | 11 ++--
 .../test_text_to_video.py                     |  9 +--
 .../pipelines/unidiffuser/test_unidiffuser.py | 28 +++++-----
 .../wuerstchen/test_wuerstchen_combined.py    |  8 +--
 16 files changed, 114 insertions(+), 102 deletions(-)

diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py
index 935824564cf6..a1a02dbed09b 100644
--- a/src/diffusers/utils/testing_utils.py
+++ b/src/diffusers/utils/testing_utils.py
@@ -1137,6 +1137,8 @@ def backend_device_count(device: str):
 def backend_reset_peak_memory_stats(device: str):
     return _device_agnostic_dispatch(device, BACKEND_RESET_PEAK_MEMORY_STATS)
 
+def backend_reset_max_memory_allocated(device: str):
+    return _device_agnostic_dispatch(device, BACKEND_RESET_MAX_MEMORY_ALLOCATED)
 
 def backend_max_memory_allocated(device: str):
     return _device_agnostic_dispatch(device, BACKEND_MAX_MEMORY_ALLOCATED)
diff --git a/tests/models/unets/test_models_unet_2d_condition.py b/tests/models/unets/test_models_unet_2d_condition.py
index 8ec5b6e9a5e4..c7fe023f0aaa 100644
--- a/tests/models/unets/test_models_unet_2d_condition.py
+++ b/tests/models/unets/test_models_unet_2d_condition.py
@@ -36,6 +36,9 @@
 from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import (
     backend_empty_cache,
+    backend_reset_max_memory_allocated,
+    backend_reset_peak_memory_stats,
+    backend_max_memory_allocated,
     enable_full_determinism,
     floats_tensor,
     is_peft_available,
@@ -1014,7 +1017,7 @@ def test_load_sharded_checkpoint_from_hub_local(self):
         assert loaded_model
         assert new_output.sample.shape == (4, 4, 16, 16)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_load_sharded_checkpoint_from_hub_local_subfolder(self):
         _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         ckpt_path = snapshot_download("hf-internal-testing/unet2d-sharded-dummy-subfolder")
@@ -1025,7 +1028,7 @@ def test_load_sharded_checkpoint_from_hub_local_subfolder(self):
         assert loaded_model
         assert new_output.sample.shape == (4, 4, 16, 16)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     @parameterized.expand(
         [
             ("hf-internal-testing/unet2d-sharded-dummy", None),
@@ -1040,7 +1043,7 @@ def test_load_sharded_checkpoint_device_map_from_hub(self, repo_id, variant):
         assert loaded_model
         assert new_output.sample.shape == (4, 4, 16, 16)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     @parameterized.expand(
         [
            ("hf-internal-testing/unet2d-sharded-dummy-subfolder", None),
@@ -1055,7 +1058,7 @@ def test_load_sharded_checkpoint_device_map_from_hub_subfolder(self, repo_id, va
         assert loaded_model
         assert new_output.sample.shape == (4, 4, 16, 16)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_load_sharded_checkpoint_device_map_from_hub_local(self):
         _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         ckpt_path = snapshot_download("hf-internal-testing/unet2d-sharded-dummy")
@@ -1065,7 +1068,7 @@ def test_load_sharded_checkpoint_device_map_from_hub_local(self):
         assert loaded_model
         assert new_output.sample.shape == (4, 4, 16, 16)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_load_sharded_checkpoint_device_map_from_hub_local_subfolder(self):
         _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         ckpt_path = snapshot_download("hf-internal-testing/unet2d-sharded-dummy-subfolder")
@@ -1165,11 +1168,11 @@ def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"):
 
         return model
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_set_attention_slice_auto(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         unet = self.get_unet_model()
         unet.set_attention_slice("auto")
@@ -1181,15 +1184,15 @@ def test_set_attention_slice_auto(self):
         with torch.no_grad():
             _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
 
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
 
         assert mem_bytes < 5 * 10**9
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_set_attention_slice_max(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         unet = self.get_unet_model()
         unet.set_attention_slice("max")
@@ -1201,15 +1204,15 @@ def test_set_attention_slice_max(self):
         with torch.no_grad():
             _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
 
-        mem_bytes = torch.cuda.max_memory_allocated()
-
+        mem_bytes = backend_max_memory_allocated(torch_device)
+        
         assert mem_bytes < 5 * 10**9
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_set_attention_slice_int(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         unet = self.get_unet_model()
         unet.set_attention_slice(2)
@@ -1221,15 +1224,15 @@ def test_set_attention_slice_int(self):
         with torch.no_grad():
             _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
 
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
 
         assert mem_bytes < 5 * 10**9
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_set_attention_slice_list(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         # there are 32 sliceable layers
         slice_list = 16 * [2, 3]
@@ -1243,7 +1246,7 @@ def test_set_attention_slice_list(self):
         with torch.no_grad():
             _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
 
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
 
         assert mem_bytes < 5 * 10**9
diff --git a/tests/pipelines/controlnet/test_controlnet_sdxl.py b/tests/pipelines/controlnet/test_controlnet_sdxl.py
index 85924af050b0..42b712842e47 100644
--- a/tests/pipelines/controlnet/test_controlnet_sdxl.py
+++ b/tests/pipelines/controlnet/test_controlnet_sdxl.py
@@ -222,12 +222,12 @@ def test_stable_diffusion_xl_offloads(self):
         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         image_slices = []
diff --git a/tests/pipelines/flux/test_pipeline_flux.py b/tests/pipelines/flux/test_pipeline_flux.py
index 7981e6c2a93b..6c66f5bfa0fe 100644
--- a/tests/pipelines/flux/test_pipeline_flux.py
+++ b/tests/pipelines/flux/test_pipeline_flux.py
@@ -9,6 +9,7 @@
 from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     numpy_cosine_similarity_distance,
     require_big_gpu_with_torch_cuda,
     slow,
@@ -219,12 +220,12 @@ class FluxPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, seed=0):
         if str(device).startswith("mps"):
@@ -254,7 +255,7 @@ def test_flux_inference(self):
         pipe = self.pipeline_class.from_pretrained(
             self.repo_id, torch_dtype=torch.bfloat16, text_encoder=None, text_encoder_2=None
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
 
         inputs = self.get_inputs(torch_device)
 
diff --git a/tests/pipelines/pag/test_pag_sdxl_img2img.py b/tests/pipelines/pag/test_pag_sdxl_img2img.py
index 7e5fc5fa28b9..88eecf05d658 100644
--- a/tests/pipelines/pag/test_pag_sdxl_img2img.py
+++ b/tests/pipelines/pag/test_pag_sdxl_img2img.py
@@ -39,10 +39,11 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -267,19 +268,19 @@ def test_pag_inference(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionXLPAGImg2ImgPipelineIntegrationTests(unittest.TestCase):
     repo_id = "stabilityai/stable-diffusion-xl-base-1.0"
 
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0):
         img_url = (
@@ -303,7 +304,7 @@ def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0)
 
     def test_pag_cfg(self):
         pipeline = AutoPipelineForImage2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)
 
         inputs = self.get_inputs(torch_device)
@@ -320,7 +321,7 @@ def test_pag_cfg(self):
 
     def test_pag_uncond(self):
         pipeline = AutoPipelineForImage2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)
 
         inputs = self.get_inputs(torch_device, guidance_scale=0.0)
diff --git a/tests/pipelines/pag/test_pag_sdxl_inpaint.py b/tests/pipelines/pag/test_pag_sdxl_inpaint.py
index efc37abd0682..2c17bb2d603e 100644
--- a/tests/pipelines/pag/test_pag_sdxl_inpaint.py
+++ b/tests/pipelines/pag/test_pag_sdxl_inpaint.py
@@ -40,10 +40,11 @@
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -272,19 +273,19 @@ def test_pag_inference(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionXLPAGInpaintPipelineIntegrationTests(unittest.TestCase):
     repo_id = "stabilityai/stable-diffusion-xl-base-1.0"
 
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0):
         img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
@@ -309,7 +310,7 @@ def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0)
 
     def test_pag_cfg(self):
         pipeline = AutoPipelineForInpainting.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)
 
         inputs = self.get_inputs(torch_device)
@@ -326,7 +327,7 @@ def test_pag_uncond(self):
         pipeline = AutoPipelineForInpainting.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16)
-        pipeline.enable_model_cpu_offload()
+        pipeline.enable_model_cpu_offload(device=torch_device)
         pipeline.set_progress_bar_config(disable=None)
 
         inputs = self.get_inputs(torch_device, guidance_scale=0.0)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 0f12fc0c36d6..4307a3faff09 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -988,7 +988,7 @@ def test_stable_diffusion_attention_slicing(self):
         assert max_diff < 1e-3
 
     def test_stable_diffusion_vae_slicing(self):
-        torch.cuda.reset_peak_memory_stats()
+        backend_reset_peak_memory_stats(torch_device)
         pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
         pipe = pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py
index 4b04169a270b..5400c21c9f87 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py
@@ -25,12 +25,16 @@
 from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, StableDiffusionUpscalePipeline, UNet2DConditionModel
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
     load_image,
     load_numpy,
     require_accelerator,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -44,13 +48,13 @@ def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     @property
     def dummy_image(self):
@@ -381,19 +385,19 @@ def test_stable_diffusion_upscale_from_save_pretrained(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_stable_diffusion_upscale_pipeline(self):
         image = load_image(
@@ -459,9 +463,9 @@ def test_stable_diffusion_upscale_pipeline_fp16(self):
         assert np.abs(expected_image - image).max() < 5e-1
 
     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
 
         image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
@@ -475,7 +479,7 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
         )
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
+        pipe.enable_sequential_cpu_offload(device=torch_device)
 
         prompt = "a cat sitting on a park bench"
 
@@ -488,6 +492,6 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
             output_type="np",
         )
 
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # make sure that less than 2.9 GB is allocated
         assert mem_bytes < 2.9 * 10**9
diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py
index 8550f258045e..ff01f5405131 100644
--- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py
+++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py
@@ -38,7 +38,7 @@
     enable_full_determinism,
     load_image,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -346,7 +346,7 @@ def test_inference_batch_single_identical(self):
     def test_save_load_optional_components(self):
         self._test_save_load_optional_components()
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_stable_diffusion_xl_offloads(self):
         pipes = []
         components = self.get_dummy_components()
@@ -355,12 +355,12 @@ def test_stable_diffusion_xl_offloads(self):
 
         components = self.get_dummy_components()
         sd_pipe = StableDiffusionXLPipeline(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         components = self.get_dummy_components()
         sd_pipe = StableDiffusionXLPipeline(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         image_slices = []
diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
index db0905a48310..2750cca429d7 100644
--- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
+++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
@@ -42,7 +42,7 @@
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -334,7 +334,7 @@ def test_stable_diffusion_xl_img2img_tiny_autoencoder(self):
 
         assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_stable_diffusion_xl_offloads(self):
         pipes = []
         components = self.get_dummy_components()
@@ -343,12 +343,12 @@ def test_stable_diffusion_xl_offloads(self):
 
         components = self.get_dummy_components()
         sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         components = self.get_dummy_components()
         sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         image_slices = []
@@ -637,7 +637,7 @@ def test_stable_diffusion_xl_img2img_euler(self):
 
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_stable_diffusion_xl_offloads(self):
         pipes = []
         components = self.get_dummy_components()
@@ -646,12 +646,12 @@ def test_stable_diffusion_xl_offloads(self):
 
         components = self.get_dummy_components()
         sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         components = self.get_dummy_components()
         sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         image_slices = []
diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py
index 964c7123dd32..a807756ca196 100644
--- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py
+++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py
@@ -41,7 +41,7 @@
     UNet2DConditionModel,
     UniPCMultistepScheduler,
 )
-from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, require_torch_gpu, slow, torch_device
+from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, require_torch_accelerator, slow, torch_device
 
 from ..pipeline_params import (
     TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
@@ -343,7 +343,7 @@ def test_stable_diffusion_xl_inpaint_negative_prompt_embeds(self):
         # make sure that it's equal
         assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_stable_diffusion_xl_offloads(self):
         pipes = []
         components = self.get_dummy_components()
@@ -352,12 +352,12 @@ def test_stable_diffusion_xl_offloads(self):
 
         components = self.get_dummy_components()
         sd_pipe = StableDiffusionXLInpaintPipeline(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         components = self.get_dummy_components()
         sd_pipe = StableDiffusionXLInpaintPipeline(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         image_slices = []
diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_k_diffusion.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_k_diffusion.py
index 94ee9f0facc8..785c8633a3d4 100644
--- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_k_diffusion.py
+++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_k_diffusion.py
@@ -20,14 +20,14 @@
 import torch
 
 from diffusers import StableDiffusionXLKDiffusionPipeline
-from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, slow, torch_device
+from diffusers.utils.testing_utils import backend_empty_cache, enable_full_determinism, require_torch_accelerator, slow, torch_device
 
 
 enable_full_determinism()
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionXLKPipelineIntegrationTests(unittest.TestCase):
     dtype = torch.float16
 
@@ -35,13 +35,13 @@ def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_stable_diffusion_xl(self):
         sd_pipe = StableDiffusionXLKDiffusionPipeline.from_pretrained(
diff --git a/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py b/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py
index ac9acb26afd3..70706f39b01c 100644
--- a/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py
+++ b/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py
@@ -22,12 +22,13 @@
 from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import (
     CaptureLogger,
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     numpy_cosine_similarity_distance,
     require_accelerate_version_greater,
     require_accelerator,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -513,19 +514,19 @@ def test_disable_cfg(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableVideoDiffusionPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_sd_video(self):
         pipe = StableVideoDiffusionPipeline.from_pretrained(
             "stabilityai/stable-video-diffusion-img2vid",
             variant="fp16",
             torch_dtype=torch.float16,
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)
         image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png?download=true"
         )
diff --git a/tests/pipelines/text_to_video_synthesis/test_text_to_video.py b/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
index bca4fdbfae64..85eaa12b64b8 100644
--- a/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
+++ b/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
@@ -23,10 +23,11 @@
 from diffusers import AutoencoderKL, DDIMScheduler, TextToVideoSDPipeline, UNet3DConditionModel
 from diffusers.utils import is_xformers_available
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     load_numpy,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     skip_mps,
     slow,
     torch_device,
@@ -176,19 +177,19 @@ def test_num_images_per_prompt(self):
 
 
 @slow
 @skip_mps
-@require_torch_gpu
+@require_torch_accelerator
 class TextToVideoSDPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_two_step_model(self):
         expected_video = load_numpy(
diff --git a/tests/pipelines/unidiffuser/test_unidiffuser.py b/tests/pipelines/unidiffuser/test_unidiffuser.py
index 2e0ba1cfb8eb..b4487a779312 100644
--- a/tests/pipelines/unidiffuser/test_unidiffuser.py
+++ b/tests/pipelines/unidiffuser/test_unidiffuser.py
@@ -27,6 +27,7 @@
     load_image,
     nightly,
     require_torch_2,
+    require_torch_accelerator,
     require_torch_gpu,
     run_test_in_subprocess,
     torch_device,
@@ -499,20 +500,19 @@ def test_unidiffuser_img2text_multiple_prompts_with_latents(self):
     def test_inference_batch_single_identical(self):
         super().test_inference_batch_single_identical(expected_max_diff=2e-4)
 
-    @require_torch_gpu
-    def test_unidiffuser_default_joint_v1_cuda_fp16(self):
-        device = "cuda"
+    @require_torch_accelerator
+    def test_unidiffuser_default_joint_v1_fp16(self):
         unidiffuser_pipe = UniDiffuserPipeline.from_pretrained(
             "hf-internal-testing/unidiffuser-test-v1", torch_dtype=torch.float16
         )
-        unidiffuser_pipe = unidiffuser_pipe.to(device)
+        unidiffuser_pipe = unidiffuser_pipe.to(torch_device)
         unidiffuser_pipe.set_progress_bar_config(disable=None)
 
         # Set mode to 'joint'
         unidiffuser_pipe.set_joint_mode()
         assert unidiffuser_pipe.mode == "joint"
 
-        inputs = self.get_dummy_inputs_with_latents(device)
+        inputs = self.get_dummy_inputs_with_latents(torch_device)
         # Delete prompt and image for joint inference.
         del inputs["prompt"]
         del inputs["image"]
@@ -529,20 +529,19 @@ def test_unidiffuser_default_joint_v1_cuda_fp16(self):
         expected_text_prefix = '" This This'
         assert text[0][: len(expected_text_prefix)] == expected_text_prefix
 
-    @require_torch_gpu
-    def test_unidiffuser_default_text2img_v1_cuda_fp16(self):
-        device = "cuda"
+    @require_torch_accelerator
+    def test_unidiffuser_default_text2img_v1_fp16(self):
         unidiffuser_pipe = UniDiffuserPipeline.from_pretrained(
             "hf-internal-testing/unidiffuser-test-v1", torch_dtype=torch.float16
         )
-        unidiffuser_pipe = unidiffuser_pipe.to(device)
+        unidiffuser_pipe = unidiffuser_pipe.to(torch_device)
         unidiffuser_pipe.set_progress_bar_config(disable=None)
 
         # Set mode to 'text2img'
         unidiffuser_pipe.set_text_to_image_mode()
         assert unidiffuser_pipe.mode == "text2img"
 
-        inputs = self.get_dummy_inputs_with_latents(device)
+        inputs = self.get_dummy_inputs_with_latents(torch_device)
         # Delete prompt and image for joint inference.
         del inputs["image"]
         inputs["data_type"] = 1
@@ -554,20 +553,19 @@ def test_unidiffuser_default_text2img_v1_cuda_fp16(self):
         expected_img_slice = np.array([0.5054, 0.5498, 0.5854, 0.3052, 0.4458, 0.6489, 0.5122, 0.4810, 0.6138])
         assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3
 
-    @require_torch_gpu
-    def test_unidiffuser_default_img2text_v1_cuda_fp16(self):
-        device = "cuda"
+    @require_torch_accelerator
+    def test_unidiffuser_default_img2text_v1_fp16(self):
         unidiffuser_pipe = UniDiffuserPipeline.from_pretrained(
             "hf-internal-testing/unidiffuser-test-v1", torch_dtype=torch.float16
         )
-        unidiffuser_pipe = unidiffuser_pipe.to(device)
+        unidiffuser_pipe = unidiffuser_pipe.to(torch_device)
         unidiffuser_pipe.set_progress_bar_config(disable=None)
 
         # Set mode to 'img2text'
         unidiffuser_pipe.set_image_to_text_mode()
         assert unidiffuser_pipe.mode == "img2text"
 
-        inputs = self.get_dummy_inputs_with_latents(device)
+        inputs = self.get_dummy_inputs_with_latents(torch_device)
         # Delete prompt and image for joint inference.
         del inputs["prompt"]
         inputs["data_type"] = 1
diff --git a/tests/pipelines/wuerstchen/test_wuerstchen_combined.py b/tests/pipelines/wuerstchen/test_wuerstchen_combined.py
index 0caed159100a..43801a3c5dbc 100644
--- a/tests/pipelines/wuerstchen/test_wuerstchen_combined.py
+++ b/tests/pipelines/wuerstchen/test_wuerstchen_combined.py
@@ -21,7 +21,7 @@
 from diffusers import DDPMWuerstchenScheduler, WuerstchenCombinedPipeline
 from diffusers.pipelines.wuerstchen import PaellaVQModel, WuerstchenDiffNeXt, WuerstchenPrior
-from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_accelerator, torch_device
 
 from ..test_pipelines_common import PipelineTesterMixin
 
@@ -198,7 +198,7 @@ def test_wuerstchen(self):
             np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
         ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_offloads(self):
         pipes = []
         components = self.get_dummy_components()
@@ -207,12 +207,12 @@ def test_offloads(self):
 
         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_sequential_cpu_offload()
+        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         components = self.get_dummy_components()
         sd_pipe = self.pipeline_class(**components)
-        sd_pipe.enable_model_cpu_offload()
+        sd_pipe.enable_model_cpu_offload(device=torch_device)
         pipes.append(sd_pipe)
 
         image_slices = []
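
One more pattern worth flagging before the cleanup patch below: table entries such as getattr(torch.xpu, "reset_peak_memory_stats", None) in [PATCH 21/33] keep testing_utils importable on PyTorch builds where a backend API is missing. The same idea in isolation, with the module and attribute names as illustrative stand-ins:

    import torch

    # Resolve the optional API once at import time; fall back to None when this
    # build of PyTorch does not expose it (e.g. older XPU builds).
    _reset_peak = getattr(getattr(torch, "xpu", None), "reset_peak_memory_stats", None)

    def reset_peak_memory_stats_if_available() -> None:
        # Silently no-op when the backend or the function is unavailable.
        if callable(_reset_peak):
            _reset_peak()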

From bb49caba6804baecb4fb5b8fbc2665e83a527edc Mon Sep 17 00:00:00 2001
From: "Lin, Fanli" <fanli.lin@intel.com>
Date: Tue, 21 Jan 2025 18:01:19 -0800
Subject: [PATCH 23/33] update more cases

---
 src/diffusers/utils/testing_utils.py          |  3 ++-
 tests/lora/test_lora_layers_sd.py             |  1 +
 tests/lora/test_lora_layers_sd3.py            |  2 +-
 .../unets/test_models_unet_2d_condition.py    |  6 ++---
 tests/pipelines/controlnet/test_controlnet.py |  2 +-
 .../test_controlnet_inpaint_sdxl.py           |  8 +++---
 .../controlnet_flux/test_controlnet_flux.py   |  5 ++--
 .../controlnet_sd3/test_controlnet_sd3.py     |  2 +-
 .../test_if_img2img_superresolution.py        |  1 +
 .../deepfloyd_if/test_if_inpainting.py        |  1 +
 .../test_if_inpainting_superresolution.py     |  1 +
 .../test_ip_adapter_stable_diffusion.py       | 27 ++++++++++---------
 tests/pipelines/kandinsky/test_kandinsky.py   | 19 ++++++-------
 .../kandinsky/test_kandinsky_combined.py      | 20 +++++++-------
 .../kandinsky/test_kandinsky_img2img.py       | 17 ++++++------
 .../kandinsky/test_kandinsky_inpaint.py       | 15 ++++++-----
 .../pipelines/kandinsky2_2/test_kandinsky.py  | 14 +++++-----
 .../kandinsky2_2/test_kandinsky_combined.py   | 20 +++++++-------
 .../kandinsky2_2/test_kandinsky_img2img.py    | 14 +++++-----
 .../kandinsky2_2/test_kandinsky_inpaint.py    |  9 ++++---
 tests/pipelines/kandinsky3/test_kandinsky3.py | 14 +++++-----
 .../kandinsky3/test_kandinsky3_img2img.py     | 11 ++++----
 .../test_latent_consistency_models.py         |  7 ++---
 .../test_latent_consistency_models_img2img.py |  7 ++---
 tests/pipelines/latte/test_latte.py           | 11 ++++----
 .../test_ledits_pp_stable_diffusion.py        |  9 ++++---
 .../test_ledits_pp_stable_diffusion_xl.py     |  4 +--
 tests/pipelines/lumina/test_lumina_nextdit.py | 11 ++++----
 tests/pipelines/mochi/test_mochi.py           |  4 +--
 tests/pipelines/pag/test_pag_sdxl.py          | 13 ++++-----
 .../test_stable_diffusion_img2img.py          |  3 +--
 .../test_stable_diffusion_inpaint.py          |  4 +--
 ...st_stable_diffusion_instruction_pix2pix.py |  2 +-
 .../test_stable_diffusion.py                  |  4 +--
 .../test_pipeline_stable_diffusion_3.py       |  7 ++---
 ...est_pipeline_stable_diffusion_3_img2img.py |  7 ++---
 .../test_stable_diffusion_xl_inpaint.py       |  8 +++++-
 .../test_stable_diffusion_xl_k_diffusion.py   |  8 +++++-
 38 files changed, 178 insertions(+), 143 deletions(-)

diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py
index da3f8dfd18c5..5e7b3b42853d 100644
--- a/src/diffusers/utils/testing_utils.py
+++ b/src/diffusers/utils/testing_utils.py
@@ -1119,7 +1119,6 @@ def _is_torch_fp64_available(device):
 # This dispatches a defined function according to the accelerator from the function definitions.
 def _device_agnostic_dispatch(device: str, dispatch_table: Dict[str, Callable], *args, **kwargs):
-
     if device not in dispatch_table:
         return dispatch_table["default"](*args, **kwargs)
 
@@ -1149,9 +1148,11 @@ def backend_device_count(device: str):
 def backend_reset_peak_memory_stats(device: str):
     return _device_agnostic_dispatch(device, BACKEND_RESET_PEAK_MEMORY_STATS)
 
+
 def backend_reset_max_memory_allocated(device: str):
     return _device_agnostic_dispatch(device, BACKEND_RESET_MAX_MEMORY_ALLOCATED)
 
+
 def backend_max_memory_allocated(device: str):
     return _device_agnostic_dispatch(device, BACKEND_MAX_MEMORY_ALLOCATED)
 
diff --git a/tests/lora/test_lora_layers_sd.py b/tests/lora/test_lora_layers_sd.py
index e71c6e3b53e3..3eefa97663e6 100644
--- a/tests/lora/test_lora_layers_sd.py
+++ b/tests/lora/test_lora_layers_sd.py
@@ -33,6 +33,7 @@
 )
 from diffusers.utils.import_utils import is_accelerate_available
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     load_image,
     nightly,
     numpy_cosine_similarity_distance,
diff --git a/tests/lora/test_lora_layers_sd3.py b/tests/lora/test_lora_layers_sd3.py
index 28e059dbdaa5..f121b5b6cd57 100644
--- a/tests/lora/test_lora_layers_sd3.py
+++ b/tests/lora/test_lora_layers_sd3.py
@@ -30,12 +30,12 @@
 from diffusers.utils import load_image
 from diffusers.utils.import_utils import is_accelerate_available
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     nightly,
     numpy_cosine_similarity_distance,
     require_big_gpu_with_torch_cuda,
     require_peft_backend,
     require_torch_accelerator,
-    slow,
     torch_device,
 )
 
diff --git a/tests/models/unets/test_models_unet_2d_condition.py b/tests/models/unets/test_models_unet_2d_condition.py
index 1a0e0d3f9c63..8e1187f11468 100644
--- a/tests/models/unets/test_models_unet_2d_condition.py
+++ b/tests/models/unets/test_models_unet_2d_condition.py
@@ -36,9 +36,9 @@ from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( backend_empty_cache, + backend_max_memory_allocated, backend_reset_max_memory_allocated, backend_reset_peak_memory_stats, - backend_max_memory_allocated, enable_full_determinism, floats_tensor, is_peft_available, @@ -1005,7 +1005,7 @@ def test_load_sharded_checkpoint_from_hub_subfolder(self, repo_id, variant): assert loaded_model assert new_output.sample.shape == (4, 4, 16, 16) - @require_torch_gpu + @require_torch_accelerator def test_load_sharded_checkpoint_from_hub_local(self): _, inputs_dict = self.prepare_init_args_and_inputs_for_common() ckpt_path = snapshot_download("hf-internal-testing/unet2d-sharded-dummy") @@ -1204,7 +1204,7 @@ def test_set_attention_slice_max(self): _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample mem_bytes = backend_max_memory_allocated(torch_device) - + assert mem_bytes < 5 * 10**9 @require_torch_accelerator diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index 43814b2b2211..5a2c151ecaa5 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -79,7 +79,7 @@ def _test_stable_diffusion_compile(in_queue, out_queue, timeout): pipe = StableDiffusionControlNetPipeline.from_pretrained( "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) - pipe.to("cuda") + pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe.unet.to(memory_format=torch.channels_last) diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py b/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py index 6e752804e2e0..ca05db504485 100644 --- a/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py +++ b/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py @@ -40,7 +40,7 @@ from diffusers.utils.testing_utils import ( enable_full_determinism, floats_tensor, - require_torch_gpu, + require_torch_accelerator, torch_device, ) @@ -245,7 +245,7 @@ def test_xformers_attention_forwardGenerator_pass(self): def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical(expected_max_diff=2e-3) - @require_torch_gpu + @require_torch_accelerator def test_stable_diffusion_xl_offloads(self): pipes = [] components = self.get_dummy_components() @@ -254,12 +254,12 @@ def test_stable_diffusion_xl_offloads(self): components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_model_cpu_offload() + sd_pipe.enable_model_cpu_offload(device=torch_device) pipes.append(sd_pipe) components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_sequential_cpu_offload() + sd_pipe.enable_sequential_cpu_offload(device=torch_device) pipes.append(sd_pipe) image_slices = [] diff --git a/tests/pipelines/controlnet_flux/test_controlnet_flux.py b/tests/pipelines/controlnet_flux/test_controlnet_flux.py index 5e856b125f32..10df7b6e02c4 100644 --- a/tests/pipelines/controlnet_flux/test_controlnet_flux.py +++ b/tests/pipelines/controlnet_flux/test_controlnet_flux.py @@ -31,6 +31,7 @@ from diffusers.models import FluxControlNetModel from diffusers.utils import load_image from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, nightly, numpy_cosine_similarity_distance, @@ -213,12 +214,12 @@ class FluxControlNetPipelineSlowTests(unittest.TestCase): 
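
The setUp/tearDown hunk below is the change this series applies to nearly every integration test class. Pulled out of diff form, the resulting pattern is just the following (hypothetical class name; the two cleanup calls are the point):

    import gc
    import unittest

    from diffusers.utils.testing_utils import backend_empty_cache, torch_device

    class ExamplePipelineSlowTests(unittest.TestCase):
        def setUp(self):
            # Free accelerator memory on whichever backend torch_device resolves
            # to (CUDA, XPU, ...) instead of hard-coding torch.cuda.empty_cache().
            super().setUp()
            gc.collect()
            backend_empty_cache(torch_device)

        def tearDown(self):
            super().tearDown()
            gc.collect()
            backend_empty_cache(torch_device)
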
def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_canny(self): controlnet = FluxControlNetModel.from_pretrained( diff --git a/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py b/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py index 7527d17af32a..9b3c9ec7c92f 100644 --- a/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py +++ b/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py @@ -237,7 +237,7 @@ def test_canny(self): pipe = StableDiffusion3ControlNetPipeline.from_pretrained( "stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16 ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py index 9c0c87df61a4..96456506c037 100644 --- a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py @@ -25,6 +25,7 @@ from diffusers.utils.testing_utils import ( backend_empty_cache, backend_max_memory_allocated, + backend_reset_max_memory_allocated, backend_reset_peak_memory_stats, floats_tensor, load_numpy, diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting.py b/tests/pipelines/deepfloyd_if/test_if_inpainting.py index 22662f42142f..412fbd3d37a9 100644 --- a/tests/pipelines/deepfloyd_if/test_if_inpainting.py +++ b/tests/pipelines/deepfloyd_if/test_if_inpainting.py @@ -25,6 +25,7 @@ from diffusers.utils.testing_utils import ( backend_empty_cache, backend_max_memory_allocated, + backend_reset_max_memory_allocated, backend_reset_peak_memory_stats, floats_tensor, load_numpy, diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py index 6a4e7199fdc8..2ecf9fba8165 100644 --- a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py @@ -25,6 +25,7 @@ from diffusers.utils.testing_utils import ( backend_empty_cache, backend_max_memory_allocated, + backend_reset_max_memory_allocated, backend_reset_peak_memory_stats, floats_tensor, load_numpy, diff --git a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py index a8180a3bc27f..401fab6c2c96 100644 --- a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py +++ b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py @@ -34,11 +34,12 @@ from diffusers.image_processor import IPAdapterMaskProcessor from diffusers.utils import load_image from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, is_flaky, load_pt, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -54,13 +55,13 @@ def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_image_encoder(self, repo_id, 
subfolder): image_encoder = CLIPVisionModelWithProjection.from_pretrained( @@ -165,7 +166,7 @@ def get_dummy_inputs( @slow -@require_torch_gpu +@require_torch_accelerator class IPAdapterSDIntegrationTests(IPAdapterNightlyTestsMixin): def test_text_to_image(self): image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") @@ -280,7 +281,7 @@ def test_text_to_image_model_cpu_offload(self): inputs = self.get_dummy_inputs() output_without_offload = pipeline(**inputs).images - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) inputs = self.get_dummy_inputs() output_with_offload = pipeline(**inputs).images max_diff = np.abs(output_with_offload - output_without_offload).max() @@ -391,7 +392,7 @@ def test_text_to_image_face_id(self): @slow -@require_torch_gpu +@require_torch_accelerator class IPAdapterSDXLIntegrationTests(IPAdapterNightlyTestsMixin): def test_text_to_image_sdxl(self): image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="sdxl_models/image_encoder") @@ -403,7 +404,7 @@ def test_text_to_image_sdxl(self): feature_extractor=feature_extractor, torch_dtype=self.dtype, ) - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin") inputs = self.get_dummy_inputs() @@ -461,7 +462,7 @@ def test_image_to_image_sdxl(self): feature_extractor=feature_extractor, torch_dtype=self.dtype, ) - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin") inputs = self.get_dummy_inputs(for_image_to_image=True) @@ -530,7 +531,7 @@ def test_inpainting_sdxl(self): feature_extractor=feature_extractor, torch_dtype=self.dtype, ) - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin") inputs = self.get_dummy_inputs(for_inpainting=True) @@ -578,7 +579,7 @@ def test_ip_adapter_mask(self): image_encoder=image_encoder, torch_dtype=self.dtype, ) - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.load_ip_adapter( "h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter-plus-face_sdxl_vit-h.safetensors" ) @@ -606,7 +607,7 @@ def test_ip_adapter_multiple_masks(self): image_encoder=image_encoder, torch_dtype=self.dtype, ) - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.load_ip_adapter( "h94/IP-Adapter", subfolder="sdxl_models", weight_name=["ip-adapter-plus-face_sdxl_vit-h.safetensors"] * 2 ) @@ -633,7 +634,7 @@ def test_instant_style_multiple_masks(self): pipeline = StableDiffusionXLPipeline.from_pretrained( "RunDiffusion/Juggernaut-XL-v9", torch_dtype=torch.float16, image_encoder=image_encoder, variant="fp16" ) - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.load_ip_adapter( ["ostris/ip-composition-adapter", "h94/IP-Adapter"], @@ -674,7 +675,7 @@ def test_ip_adapter_multiple_masks_one_adapter(self): image_encoder=image_encoder, torch_dtype=self.dtype, ) - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.load_ip_adapter( "h94/IP-Adapter", subfolder="sdxl_models", 
weight_name=["ip-adapter-plus-face_sdxl_vit-h.safetensors"] ) diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 1a13ec75d082..30144e37a9d4 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -24,10 +24,11 @@ from diffusers import DDIMScheduler, KandinskyPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, floats_tensor, load_numpy, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -246,7 +247,7 @@ def test_kandinsky(self): np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" - @require_torch_gpu + @require_torch_accelerator def test_offloads(self): pipes = [] components = self.get_dummy_components() @@ -255,12 +256,12 @@ def test_offloads(self): components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_model_cpu_offload() + sd_pipe.enable_model_cpu_offload(device=torch_device) pipes.append(sd_pipe) components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_sequential_cpu_offload() + sd_pipe.enable_sequential_cpu_offload(device=torch_device) pipes.append(sd_pipe) image_slices = [] @@ -275,19 +276,19 @@ def test_offloads(self): @slow -@require_torch_gpu +@require_torch_accelerator class KandinskyPipelineIntegrationTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_kandinsky_text2img(self): expected_image = load_numpy( @@ -306,7 +307,7 @@ def test_kandinsky_text2img(self): prompt = "red cat, 4k photo" - generator = torch.Generator(device="cuda").manual_seed(0) + generator = torch.Generator(device=torch_device).manual_seed(0) image_emb, zero_image_emb = pipe_prior( prompt, generator=generator, @@ -314,7 +315,7 @@ def test_kandinsky_text2img(self): negative_prompt="", ).to_tuple() - generator = torch.Generator(device="cuda").manual_seed(0) + generator = torch.Generator(device=torch_device).manual_seed(0) output = pipeline( prompt, image_embeds=image_emb, diff --git a/tests/pipelines/kandinsky/test_kandinsky_combined.py b/tests/pipelines/kandinsky/test_kandinsky_combined.py index 3c8767a708d4..c5f27a9cc9a9 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_combined.py +++ b/tests/pipelines/kandinsky/test_kandinsky_combined.py @@ -18,7 +18,7 @@ import numpy as np from diffusers import KandinskyCombinedPipeline, KandinskyImg2ImgCombinedPipeline, KandinskyInpaintCombinedPipeline -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_accelerator, torch_device from ..test_pipelines_common import PipelineTesterMixin from .test_kandinsky import Dummies @@ -105,7 +105,7 @@ def test_kandinsky(self): np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" - @require_torch_gpu + 
@require_torch_accelerator def test_offloads(self): pipes = [] components = self.get_dummy_components() @@ -114,12 +114,12 @@ def test_offloads(self): components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_model_cpu_offload() + sd_pipe.enable_model_cpu_offload(device=torch_device) pipes.append(sd_pipe) components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_sequential_cpu_offload() + sd_pipe.enable_sequential_cpu_offload(device=torch_device) pipes.append(sd_pipe) image_slices = [] @@ -213,7 +213,7 @@ def test_kandinsky(self): np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" - @require_torch_gpu + @require_torch_accelerator def test_offloads(self): pipes = [] components = self.get_dummy_components() @@ -222,12 +222,12 @@ def test_offloads(self): components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_model_cpu_offload() + sd_pipe.enable_model_cpu_offload(device=torch_device) pipes.append(sd_pipe) components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_sequential_cpu_offload() + sd_pipe.enable_sequential_cpu_offload(device=torch_device) pipes.append(sd_pipe) image_slices = [] @@ -325,7 +325,7 @@ def test_kandinsky(self): np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" - @require_torch_gpu + @require_torch_accelerator def test_offloads(self): pipes = [] components = self.get_dummy_components() @@ -334,12 +334,12 @@ def test_offloads(self): components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_model_cpu_offload() + sd_pipe.enable_model_cpu_offload(device=torch_device) pipes.append(sd_pipe) components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_sequential_cpu_offload() + sd_pipe.enable_sequential_cpu_offload(device=torch_device) pipes.append(sd_pipe) image_slices = [] diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index 23f13ffee223..26361ce18b82 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -32,12 +32,13 @@ ) from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, floats_tensor, load_image, load_numpy, nightly, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -267,7 +268,7 @@ def test_kandinsky_img2img(self): np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" - @require_torch_gpu + @require_torch_accelerator def test_offloads(self): pipes = [] components = self.get_dummy_components() @@ -299,19 +300,19 @@ def test_dict_tuple_outputs_equivalent(self): @slow -@require_torch_gpu +@require_torch_accelerator class KandinskyImg2ImgPipelineIntegrationTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() gc.collect() - 
torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_kandinsky_img2img(self): expected_image = load_numpy( @@ -365,19 +366,19 @@ def test_kandinsky_img2img(self): @nightly -@require_torch_gpu +@require_torch_accelerator class KandinskyImg2ImgPipelineNightlyTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_kandinsky_img2img_ddpm(self): expected_image = load_numpy( diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index ebb1a4d88739..e30c601b6011 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -25,12 +25,13 @@ from diffusers import DDIMScheduler, KandinskyInpaintPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, floats_tensor, load_image, load_numpy, nightly, - require_torch_gpu, + require_torch_accelerator, torch_device, ) @@ -265,7 +266,7 @@ def test_kandinsky_inpaint(self): def test_inference_batch_single_identical(self): super().test_inference_batch_single_identical(expected_max_diff=3e-3) - @require_torch_gpu + @require_torch_accelerator def test_offloads(self): pipes = [] components = self.get_dummy_components() @@ -274,12 +275,12 @@ def test_offloads(self): components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_model_cpu_offload() + sd_pipe.enable_model_cpu_offload(device=torch_device) pipes.append(sd_pipe) components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_sequential_cpu_offload() + sd_pipe.enable_sequential_cpu_offload(device=torch_device) pipes.append(sd_pipe) image_slices = [] @@ -297,19 +298,19 @@ def test_float16_inference(self): @nightly -@require_torch_gpu +@require_torch_accelerator class KandinskyInpaintPipelineIntegrationTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_kandinsky_inpaint(self): expected_image = load_numpy( diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky.py b/tests/pipelines/kandinsky2_2/test_kandinsky.py index cbd9166efada..fea49d47b7bb 100644 --- a/tests/pipelines/kandinsky2_2/test_kandinsky.py +++ b/tests/pipelines/kandinsky2_2/test_kandinsky.py @@ -22,12 +22,14 @@ from diffusers import DDIMScheduler, KandinskyV22Pipeline, KandinskyV22PriorPipeline, UNet2DConditionModel, VQModel from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, floats_tensor, load_numpy, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, + torch_device, ) from ..test_pipelines_common import PipelineTesterMixin @@ -221,19 +223,19 @@ def test_float16_inference(self): @slow -@require_torch_gpu +@require_torch_accelerator class KandinskyV22PipelineIntegrationTests(unittest.TestCase): def setUp(self): # clean 
up the VRAM before each test super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_kandinsky_text2img(self): expected_image = load_numpy( @@ -244,12 +246,12 @@ def test_kandinsky_text2img(self): pipe_prior = KandinskyV22PriorPipeline.from_pretrained( "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 ) - pipe_prior.enable_model_cpu_offload() + pipe_prior.enable_model_cpu_offload(device=torch_device) pipeline = KandinskyV22Pipeline.from_pretrained( "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 ) - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.set_progress_bar_config(disable=None) prompt = "red cat, 4k photo" diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py b/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py index bbf2f08a7b08..90f8b2034109 100644 --- a/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py +++ b/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py @@ -22,7 +22,7 @@ KandinskyV22Img2ImgCombinedPipeline, KandinskyV22InpaintCombinedPipeline, ) -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_accelerator, torch_device from ..test_pipelines_common import PipelineTesterMixin from .test_kandinsky import Dummies @@ -110,7 +110,7 @@ def test_kandinsky(self): np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" - @require_torch_gpu + @require_torch_accelerator def test_offloads(self): pipes = [] components = self.get_dummy_components() @@ -119,12 +119,12 @@ def test_offloads(self): components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_model_cpu_offload() + sd_pipe.enable_model_cpu_offload(device=torch_device) pipes.append(sd_pipe) components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_sequential_cpu_offload() + sd_pipe.enable_sequential_cpu_offload(device=torch_device) pipes.append(sd_pipe) image_slices = [] @@ -234,7 +234,7 @@ def test_kandinsky(self): np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" - @require_torch_gpu + @require_torch_accelerator def test_offloads(self): pipes = [] components = self.get_dummy_components() @@ -243,12 +243,12 @@ def test_offloads(self): components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_model_cpu_offload() + sd_pipe.enable_model_cpu_offload(device=torch_device) pipes.append(sd_pipe) components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_sequential_cpu_offload() + sd_pipe.enable_sequential_cpu_offload(device=torch_device) pipes.append(sd_pipe) image_slices = [] @@ -357,7 +357,7 @@ def test_kandinsky(self): np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" - @require_torch_gpu + @require_torch_accelerator def test_offloads(self): pipes = [] components = self.get_dummy_components() @@ -366,12 +366,12 @@ def 
test_offloads(self): components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_model_cpu_offload() + sd_pipe.enable_model_cpu_offload(device=torch_device) pipes.append(sd_pipe) components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_sequential_cpu_offload() + sd_pipe.enable_sequential_cpu_offload(device=torch_device) pipes.append(sd_pipe) image_slices = [] diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_img2img.py b/tests/pipelines/kandinsky2_2/test_kandinsky_img2img.py index 26d8b45cf900..4702f473a992 100644 --- a/tests/pipelines/kandinsky2_2/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky2_2/test_kandinsky_img2img.py @@ -29,13 +29,15 @@ VQModel, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, floats_tensor, load_image, load_numpy, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, + torch_device, ) from ..test_pipelines_common import PipelineTesterMixin @@ -238,19 +240,19 @@ def test_float16_inference(self): @slow -@require_torch_gpu +@require_torch_accelerator class KandinskyV22Img2ImgPipelineIntegrationTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_kandinsky_img2img(self): expected_image = load_numpy( @@ -266,12 +268,12 @@ def test_kandinsky_img2img(self): pipe_prior = KandinskyV22PriorPipeline.from_pretrained( "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 ) - pipe_prior.enable_model_cpu_offload() + pipe_prior.enable_model_cpu_offload(device=torch_device) pipeline = KandinskyV22Img2ImgPipeline.from_pretrained( "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 ) - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py index 25cf4bbed456..9a7f659e533c 100644 --- a/tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py @@ -29,13 +29,14 @@ VQModel, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, floats_tensor, is_flaky, load_image, load_numpy, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -292,19 +293,19 @@ def callback_inputs_test(pipe, i, t, callback_kwargs): @slow -@require_torch_gpu +@require_torch_accelerator class KandinskyV22InpaintPipelineIntegrationTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_kandinsky_inpaint(self): expected_image = load_numpy( diff --git a/tests/pipelines/kandinsky3/test_kandinsky3.py b/tests/pipelines/kandinsky3/test_kandinsky3.py index 941ef9093361..af1d45ff8975 100644 --- a/tests/pipelines/kandinsky3/test_kandinsky3.py +++ 
b/tests/pipelines/kandinsky3/test_kandinsky3.py @@ -31,10 +31,12 @@ from diffusers.image_processor import VaeImageProcessor from diffusers.schedulers.scheduling_ddpm import DDPMScheduler from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, load_image, - require_torch_gpu, + require_torch_accelerator, slow, + torch_device, ) from ..pipeline_params import ( @@ -167,25 +169,25 @@ def test_inference_batch_single_identical(self): @slow -@require_torch_gpu +@require_torch_accelerator class Kandinsky3PipelineIntegrationTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_kandinskyV3(self): pipe = AutoPipelineForText2Image.from_pretrained( "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16 ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background." @@ -211,7 +213,7 @@ def test_kandinskyV3_img2img(self): pipe = AutoPipelineForImage2Image.from_pretrained( "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16 ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) diff --git a/tests/pipelines/kandinsky3/test_kandinsky3_img2img.py b/tests/pipelines/kandinsky3/test_kandinsky3_img2img.py index 8c817df32e0c..e00948621a06 100644 --- a/tests/pipelines/kandinsky3/test_kandinsky3_img2img.py +++ b/tests/pipelines/kandinsky3/test_kandinsky3_img2img.py @@ -31,10 +31,11 @@ from diffusers.image_processor import VaeImageProcessor from diffusers.schedulers.scheduling_ddpm import DDPMScheduler from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, floats_tensor, load_image, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -192,25 +193,25 @@ def test_inference_batch_single_identical(self): @slow -@require_torch_gpu +@require_torch_accelerator class Kandinsky3Img2ImgPipelineIntegrationTests(unittest.TestCase): def setUp(self): # clean up the VRAM before each test super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_kandinskyV3_img2img(self): pipe = AutoPipelineForImage2Image.from_pretrained( "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16 ) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) diff --git a/tests/pipelines/latent_consistency_models/test_latent_consistency_models.py b/tests/pipelines/latent_consistency_models/test_latent_consistency_models.py index b60a4553cded..ceedd3285cbf 100644 --- a/tests/pipelines/latent_consistency_models/test_latent_consistency_models.py +++ b/tests/pipelines/latent_consistency_models/test_latent_consistency_models.py @@ -13,8 
+13,9 @@ UNet2DConditionModel, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -215,11 +216,11 @@ def callback_inputs_test(pipe, i, t, callback_kwargs): @slow -@require_torch_gpu +@require_torch_accelerator class LatentConsistencyModelPipelineSlowTests(unittest.TestCase): def setUp(self): gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) diff --git a/tests/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py b/tests/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py index 386e60c54ac6..ccf41d0d3f19 100644 --- a/tests/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py +++ b/tests/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py @@ -14,10 +14,11 @@ UNet2DConditionModel, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, floats_tensor, load_image, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -222,11 +223,11 @@ def callback_inputs_test(pipe, i, t, callback_kwargs): @slow -@require_torch_gpu +@require_torch_accelerator class LatentConsistencyModelImg2ImgPipelineSlowTests(unittest.TestCase): def setUp(self): gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) diff --git a/tests/pipelines/latte/test_latte.py b/tests/pipelines/latte/test_latte.py index 9667ebff249d..1db2c3c074e0 100644 --- a/tests/pipelines/latte/test_latte.py +++ b/tests/pipelines/latte/test_latte.py @@ -30,9 +30,10 @@ ) from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -266,25 +267,25 @@ def test_xformers_attention_forwardGenerator_pass(self): @slow -@require_torch_gpu +@require_torch_accelerator class LattePipelineIntegrationTests(unittest.TestCase): prompt = "A painting of a squirrel eating a burger." 
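
The same device routing applies to offloading: each `enable_model_cpu_offload()` and `enable_sequential_cpu_offload()` call in this series gains an explicit `device=torch_device`. A condensed sketch of the `test_offloads` bodies shown in the Kandinsky hunks above (hypothetical helper; the 1e-3 tolerance is illustrative, not the exact value used by every pipeline test):

    import numpy as np

    from diffusers.utils.testing_utils import torch_device

    def check_offload_equivalence(pipeline_class, get_dummy_components, get_dummy_inputs):
        # The same pipeline must produce near-identical images whether it is moved
        # to the accelerator directly, model-offloaded, or sequentially offloaded.
        pipes = []

        pipe = pipeline_class(**get_dummy_components()).to(torch_device)
        pipes.append(pipe)

        pipe = pipeline_class(**get_dummy_components())
        pipe.enable_model_cpu_offload(device=torch_device)  # hooks target torch_device, not "cuda"
        pipes.append(pipe)

        pipe = pipeline_class(**get_dummy_components())
        pipe.enable_sequential_cpu_offload(device=torch_device)
        pipes.append(pipe)

        image_slices = []
        for pipe in pipes:
            inputs = get_dummy_inputs(torch_device)
            image = pipe(**inputs).images
            image_slices.append(image[0, -3:, -3:, -1].flatten())

        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
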
def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_latte(self): generator = torch.Generator("cpu").manual_seed(0) pipe = LattePipeline.from_pretrained("maxin-cn/Latte-1", torch_dtype=torch.float16) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) prompt = self.prompt videos = pipe( diff --git a/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion.py b/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion.py index 4aa48a920fad..342561d4f5e9 100644 --- a/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion.py +++ b/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion.py @@ -29,10 +29,11 @@ UNet2DConditionModel, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, floats_tensor, load_image, - require_torch_gpu, + require_torch_accelerator, skip_mps, slow, torch_device, @@ -202,17 +203,17 @@ def test_ledits_pp_warmup_steps(self): @slow -@require_torch_gpu +@require_torch_accelerator class LEditsPPPipelineStableDiffusionSlowTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) @classmethod def setUpClass(cls): diff --git a/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion_xl.py b/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion_xl.py index da694175a9f1..75795a33422b 100644 --- a/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion_xl.py +++ b/tests/pipelines/ledits_pp/test_ledits_pp_stable_diffusion_xl.py @@ -41,7 +41,7 @@ enable_full_determinism, floats_tensor, load_image, - require_torch_gpu, + require_torch_accelerator, skip_mps, slow, torch_device, @@ -253,7 +253,7 @@ def test_ledits_pp_warmup_steps(self): @slow -@require_torch_gpu +@require_torch_accelerator class LEditsPPPipelineStableDiffusionXLSlowTests(unittest.TestCase): @classmethod def setUpClass(cls): diff --git a/tests/pipelines/lumina/test_lumina_nextdit.py b/tests/pipelines/lumina/test_lumina_nextdit.py index e0fd06847b77..79781335377e 100644 --- a/tests/pipelines/lumina/test_lumina_nextdit.py +++ b/tests/pipelines/lumina/test_lumina_nextdit.py @@ -7,8 +7,9 @@ from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, LuminaNextDiT2DModel, LuminaText2ImgPipeline from diffusers.utils.testing_utils import ( + backend_empty_cache, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -127,7 +128,7 @@ def test_xformers_attention_forwardGenerator_pass(self): @slow -@require_torch_gpu +@require_torch_accelerator class LuminaText2ImgPipelineSlowTests(unittest.TestCase): pipeline_class = LuminaText2ImgPipeline repo_id = "Alpha-VLLM/Lumina-Next-SFT-diffusers" @@ -135,12 +136,12 @@ class LuminaText2ImgPipelineSlowTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, seed=0): if str(device).startswith("mps"): @@ -158,7 +159,7 @@ def get_inputs(self, device, seed=0): def test_lumina_inference(self): pipe = self.pipeline_class.from_pretrained(self.repo_id, 
torch_dtype=torch.bfloat16) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) inputs = self.get_inputs(torch_device) diff --git a/tests/pipelines/mochi/test_mochi.py b/tests/pipelines/mochi/test_mochi.py index 7799faf78ea9..4c278a403267 100644 --- a/tests/pipelines/mochi/test_mochi.py +++ b/tests/pipelines/mochi/test_mochi.py @@ -17,7 +17,6 @@ import unittest import numpy as np -import pytest import torch from transformers import AutoTokenizer, T5EncoderModel @@ -25,10 +24,9 @@ from diffusers.utils.testing_utils import ( backend_empty_cache, enable_full_determinism, - nightly, numpy_cosine_similarity_distance, - require_big_gpu_with_torch_cuda, require_torch_accelerator, + slow, torch_device, ) diff --git a/tests/pipelines/pag/test_pag_sdxl.py b/tests/pipelines/pag/test_pag_sdxl.py index 589573385677..fe92796247d7 100644 --- a/tests/pipelines/pag/test_pag_sdxl.py +++ b/tests/pipelines/pag/test_pag_sdxl.py @@ -30,8 +30,9 @@ UNet2DConditionModel, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -290,7 +291,7 @@ def test_pag_inference(self): @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionXLPAGPipelineIntegrationTests(unittest.TestCase): pipeline_class = StableDiffusionXLPAGPipeline repo_id = "stabilityai/stable-diffusion-xl-base-1.0" @@ -298,12 +299,12 @@ class StableDiffusionXLPAGPipelineIntegrationTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0): generator = torch.Generator(device=generator_device).manual_seed(seed) @@ -320,7 +321,7 @@ def get_inputs(self, device, generator_device="cpu", seed=0, guidance_scale=7.0) def test_pag_cfg(self): pipeline = AutoPipelineForText2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16) - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device) @@ -337,7 +338,7 @@ def test_pag_cfg(self): def test_pag_uncond(self): pipeline = AutoPipelineForText2Image.from_pretrained(self.repo_id, enable_pag=True, torch_dtype=torch.float16) - pipeline.enable_model_cpu_offload() + pipeline.enable_model_cpu_offload(device=torch_device) pipeline.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device, guidance_scale=0.0) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 881c2ca849bd..0ea25a91adc4 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -527,7 +527,6 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): assert mem_bytes < 2.2 * 10**9 def test_stable_diffusion_pipeline_with_model_offloading(self): - backend_empty_cache(torch_device) backend_reset_peak_memory_stats(torch_device) @@ -560,7 +559,7 @@ def test_stable_diffusion_pipeline_with_model_offloading(self): pipe.enable_model_cpu_offload(device=torch_device) pipe.set_progress_bar_config(disable=None) _ = pipe(**inputs) - mem_bytes_offloaded = 
backend_max_memory_allocated(torch_device) + mem_bytes_offloaded = backend_max_memory_allocated(torch_device) assert mem_bytes_offloaded < mem_bytes for module in pipe.text_encoder, pipe.unet, pipe.vae: diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index 81bf9231b82f..4979a57f87e5 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -37,9 +37,9 @@ UNet2DConditionModel, ) from diffusers.utils.testing_utils import ( - backend_reset_peak_memory_stats, - backend_max_memory_allocated, backend_empty_cache, + backend_max_memory_allocated, + backend_reset_peak_memory_stats, enable_full_determinism, floats_tensor, is_torch_compile, diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py index 47efe5e71f92..fc5107a59cab 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py @@ -34,8 +34,8 @@ from diffusers.image_processor import VaeImageProcessor from diffusers.utils.testing_utils import ( backend_empty_cache, - backend_reset_peak_memory_stats, backend_max_memory_allocated, + backend_reset_peak_memory_stats, enable_full_determinism, floats_tensor, load_image, diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py index 7f0dde2e7f32..de5ae0255f59 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py @@ -366,7 +366,7 @@ def test_stable_diffusion_attention_slicing(self): pipe.enable_attention_slicing() inputs = self.get_inputs(torch_device, dtype=torch.float16) image_sliced = pipe(**inputs).images - + mem_bytes = backend_max_memory_allocated(torch_device) backend_reset_peak_memory_stats(torch_device) # make sure that less than 3.3 GB is allocated @@ -377,7 +377,7 @@ def test_stable_diffusion_attention_slicing(self): pipe.unet.set_default_attn_processor() inputs = self.get_inputs(torch_device, dtype=torch.float16) image = pipe(**inputs).images - + # make sure that more than 3.3 GB is allocated mem_bytes = backend_max_memory_allocated(torch_device) assert mem_bytes > 3.3 * 10**9 diff --git a/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py b/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py index a6f718ae4fbb..87b6ffb88e68 100644 --- a/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py +++ b/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py @@ -8,6 +8,7 @@ from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, SD3Transformer2DModel, StableDiffusion3Pipeline from diffusers.utils.testing_utils import ( + backend_empty_cache, numpy_cosine_similarity_distance, require_big_gpu_with_torch_cuda, slow, @@ -271,12 +272,12 @@ class StableDiffusion3PipelineSlowTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, seed=0): if str(device).startswith("mps"): @@ -294,7 +295,7 @@ def get_inputs(self, device, 
seed=0): def test_sd3_inference(self): pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) inputs = self.get_inputs(torch_device) diff --git a/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_img2img.py b/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_img2img.py index 358c8d9aee12..6a1bc2875f60 100644 --- a/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_img2img.py +++ b/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_img2img.py @@ -15,6 +15,7 @@ ) from diffusers.utils import load_image from diffusers.utils.testing_utils import ( + backend_empty_cache, floats_tensor, numpy_cosine_similarity_distance, require_big_gpu_with_torch_cuda, @@ -206,12 +207,12 @@ class StableDiffusion3Img2ImgPipelineSlowTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, seed=0): init_image = load_image( @@ -234,7 +235,7 @@ def get_inputs(self, device, seed=0): def test_sd3_img2img_inference(self): pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) inputs = self.get_inputs(torch_device) diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py index 223451bfe380..caba6c364492 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py @@ -41,7 +41,13 @@ UNet2DConditionModel, UniPCMultistepScheduler, ) -from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, require_torch_accelerator, slow, torch_device +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + require_torch_accelerator, + slow, + torch_device, +) from ..pipeline_params import ( TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_k_diffusion.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_k_diffusion.py index 785c8633a3d4..46f7d0e7b0b4 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_k_diffusion.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_k_diffusion.py @@ -20,7 +20,13 @@ import torch from diffusers import StableDiffusionXLKDiffusionPipeline -from diffusers.utils.testing_utils import backend_empty_cache, enable_full_determinism, require_torch_accelerator, slow, torch_device +from diffusers.utils.testing_utils import ( + backend_empty_cache, + enable_full_determinism, + require_torch_accelerator, + slow, + torch_device, +) enable_full_determinism() From 1d1c13d03f8b895c1fb3a3210a32fdc9a0671883 Mon Sep 17 00:00:00 2001 From: "Lin, Fanli" <fanli.lin@intel.com> Date: Tue, 21 Jan 2025 18:14:20 -0800 Subject: [PATCH 24/33] revert --- tests/pipelines/mochi/test_mochi.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/pipelines/mochi/test_mochi.py b/tests/pipelines/mochi/test_mochi.py index 4c278a403267..75b730b5ffd7 100644 --- a/tests/pipelines/mochi/test_mochi.py +++ 
b/tests/pipelines/mochi/test_mochi.py @@ -17,6 +17,7 @@ import unittest import numpy as np +import pytest import torch from transformers import AutoTokenizer, T5EncoderModel @@ -24,9 +25,10 @@ from diffusers.utils.testing_utils import ( backend_empty_cache, enable_full_determinism, + nightly, numpy_cosine_similarity_distance, - require_torch_accelerator, - slow, + require_big_gpu_with_torch_cuda, + require_torch_gpu, torch_device, ) @@ -261,8 +263,10 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2): ) -@slow -@require_torch_accelerator +@nightly +@require_torch_gpu +@require_big_gpu_with_torch_cuda +@pytest.mark.big_gpu_with_torch_cuda class MochiPipelineIntegrationTests(unittest.TestCase): prompt = "A painting of a squirrel eating a burger." From 5226094875dbc309fa2b01493bb28db50b927c49 Mon Sep 17 00:00:00 2001 From: "Lin, Fanli" <fanli.lin@intel.com> Date: Tue, 21 Jan 2025 18:28:20 -0800 Subject: [PATCH 25/33] revert back --- tests/pipelines/pag/test_pag_sd_img2img.py | 2 +- tests/pipelines/stable_diffusion/test_stable_diffusion.py | 2 ++ .../stable_diffusion/test_stable_diffusion_img2img.py | 3 +++ .../stable_diffusion/test_stable_diffusion_inpaint.py | 3 +++ .../test_stable_diffusion_instruction_pix2pix.py | 2 ++ .../stable_diffusion_2/test_stable_diffusion_inpaint.py | 2 ++ .../stable_diffusion_2/test_stable_diffusion_v_pred.py | 2 ++ .../test_stable_diffusion_image_variation.py | 2 ++ 8 files changed, 17 insertions(+), 1 deletion(-) diff --git a/tests/pipelines/pag/test_pag_sd_img2img.py b/tests/pipelines/pag/test_pag_sd_img2img.py index 2654db437796..cb7a5ca36e2f 100644 --- a/tests/pipelines/pag/test_pag_sd_img2img.py +++ b/tests/pipelines/pag/test_pag_sd_img2img.py @@ -269,7 +269,7 @@ def test_pag_uncond(self): pipeline.enable_model_cpu_offload(device=torch_device) pipeline.set_progress_bar_config(disable=None) - inputs = self.get_inputs(torch_device) + inputs = self.get_inputs(torch_device, guidance_scale=0.0) image = pipeline(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index 4307a3faff09..8c5e07995b8a 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -46,6 +46,7 @@ CaptureLogger, backend_empty_cache, backend_max_memory_allocated, + backend_reset_max_memory_allocated, backend_reset_peak_memory_stats, enable_full_determinism, is_torch_compile, @@ -1142,6 +1143,7 @@ def test_stable_diffusion_low_cpu_mem_usage(self): def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): backend_empty_cache(torch_device) + backend_reset_max_memory_allocated(torch_device) backend_reset_peak_memory_stats(torch_device) pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 0ea25a91adc4..5aa4fef4abab 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -37,6 +37,7 @@ from diffusers.utils.testing_utils import ( backend_empty_cache, backend_max_memory_allocated, + backend_reset_max_memory_allocated, backend_reset_peak_memory_stats, enable_full_determinism, floats_tensor, @@ -511,6 +512,7 @@ def callback_fn(step: int, timestep: int, latents: 
torch.Tensor) -> None: def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): backend_empty_cache(torch_device) backend_reset_peak_memory_stats(torch_device) + backend_reset_max_memory_allocated(torch_device) pipe = StableDiffusionImg2ImgPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 @@ -529,6 +531,7 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): def test_stable_diffusion_pipeline_with_model_offloading(self): backend_empty_cache(torch_device) backend_reset_peak_memory_stats(torch_device) + backend_reset_max_memory_allocated(torch_device) inputs = self.get_inputs(torch_device, dtype=torch.float16) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index 4979a57f87e5..246d8d2bfff2 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -39,6 +39,7 @@ from diffusers.utils.testing_utils import ( backend_empty_cache, backend_max_memory_allocated, + backend_reset_max_memory_allocated, backend_reset_peak_memory_stats, enable_full_determinism, floats_tensor, @@ -702,6 +703,7 @@ def test_stable_diffusion_inpaint_k_lms(self): def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self): backend_empty_cache(torch_device) backend_reset_peak_memory_stats(torch_device) + backend_reset_max_memory_allocated(torch_device) pipe = StableDiffusionInpaintPipeline.from_pretrained( "botp/stable-diffusion-v1-5-inpainting", safety_checker=None, torch_dtype=torch.float16 @@ -904,6 +906,7 @@ def test_stable_diffusion_inpaint_k_lms(self): def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self): backend_empty_cache(torch_device) backend_reset_peak_memory_stats(torch_device) + backend_reset_max_memory_allocated(torch_device) vae = AsymmetricAutoencoderKL.from_pretrained( "cross-attention/asymmetric-autoencoder-kl-x-1-5", torch_dtype=torch.float16 diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py index fc5107a59cab..1a4aba2914e4 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py @@ -35,6 +35,7 @@ from diffusers.utils.testing_utils import ( backend_empty_cache, backend_max_memory_allocated, + backend_reset_max_memory_allocated, backend_reset_peak_memory_stats, enable_full_determinism, floats_tensor, @@ -389,6 +390,7 @@ def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): backend_empty_cache(torch_device) backend_reset_peak_memory_stats(torch_device) + backend_reset_max_memory_allocated(torch_device) pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( "timbrooks/instruct-pix2pix", safety_checker=None, torch_dtype=torch.float16 diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py index f1ef15f28a3c..735912f66695 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py @@ -25,6 +25,7 @@ from diffusers import AutoencoderKL, PNDMScheduler, StableDiffusionInpaintPipeline, 
UNet2DConditionModel from diffusers.utils.testing_utils import ( backend_empty_cache, + backend_reset_max_memory_allocated, backend_reset_peak_memory_stats, enable_full_determinism, floats_tensor, @@ -245,6 +246,7 @@ def test_stable_diffusion_inpaint_pipeline_fp16(self): def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): backend_empty_cache(torch_device) backend_reset_peak_memory_stats(torch_device) + backend_reset_max_memory_allocated(torch_device) init_image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py index 13450cb43114..dc1da82cd0a4 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py @@ -33,6 +33,7 @@ from diffusers.utils.testing_utils import ( backend_empty_cache, backend_max_memory_allocated, + backend_reset_max_memory_allocated, backend_reset_peak_memory_stats, enable_full_determinism, load_numpy, @@ -535,6 +536,7 @@ def test_stable_diffusion_low_cpu_mem_usage_v_pred(self): def test_stable_diffusion_pipeline_with_sequential_cpu_offloading_v_pred(self): backend_empty_cache(torch_device) backend_reset_peak_memory_stats(torch_device) + backend_reset_max_memory_allocated(torch_device) pipeline_id = "stabilityai/stable-diffusion-2" prompt = "Andromeda galaxy in a bottle" diff --git a/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py index 1c5869ebf74e..ca25c8bc86f7 100644 --- a/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py +++ b/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py @@ -32,6 +32,7 @@ from diffusers.utils.testing_utils import ( backend_empty_cache, backend_max_memory_allocated, + backend_reset_max_memory_allocated, backend_reset_peak_memory_stats, enable_full_determinism, floats_tensor, @@ -263,6 +264,7 @@ def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): backend_empty_cache(torch_device) backend_reset_peak_memory_stats(torch_device) + backend_reset_max_memory_allocated(torch_device) pipe = StableDiffusionImageVariationPipeline.from_pretrained( "lambdalabs/sd-image-variations-diffusers", safety_checker=None, torch_dtype=torch.float16 From faa1615cebaad79975e4782faeff363468e7f988 Mon Sep 17 00:00:00 2001 From: hlky <hlky@hlky.ac> Date: Thu, 27 Feb 2025 06:46:25 +0000 Subject: [PATCH 26/33] Update test_stable_diffusion_xl.py --- .../pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py index 1c38401efd8b..c68cdf67036a 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py @@ -265,9 +265,6 @@ def test_attention_slicing_forward_pass(self): def test_inference_batch_single_identical(self): super().test_inference_batch_single_identical(expected_max_diff=3e-3) - def test_save_load_optional_components(self): - self._test_save_load_optional_components() - @require_torch_accelerator def 

From fc57898f0729a3776b35ee8923d991217b815a71 Mon Sep 17 00:00:00 2001
From: Fanli Lin <fanli0116@gmail.com>
Date: Thu, 27 Feb 2025 15:06:12 +0800
Subject: [PATCH 27/33] Update tests/pipelines/stable_diffusion/test_stable_diffusion.py

Co-authored-by: hlky <hlky@hlky.ac>
---
 tests/pipelines/stable_diffusion/test_stable_diffusion.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 070d2a827a95..2f988b2d08a2 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -1117,6 +1117,7 @@ def test_stable_diffusion_pipeline_with_model_offloading(self):
         pipe.unet.set_default_attn_processor()

         backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
         backend_reset_peak_memory_stats(torch_device)

         pipe.enable_model_cpu_offload(device=torch_device)

From 55f9658e0ac032a34571aa8326ca940cc156997a Mon Sep 17 00:00:00 2001
From: Fanli Lin <fanli0116@gmail.com>
Date: Thu, 27 Feb 2025 15:06:19 +0800
Subject: [PATCH 28/33] Update tests/pipelines/stable_diffusion/test_stable_diffusion.py

Co-authored-by: hlky <hlky@hlky.ac>
---
 tests/pipelines/stable_diffusion/test_stable_diffusion.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 2f988b2d08a2..42a18221ea6d 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -1139,6 +1139,7 @@ def test_stable_diffusion_pipeline_with_model_offloading(self):

         # With attention slicing
         backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
         backend_reset_peak_memory_stats(torch_device)

         pipe.enable_attention_slicing()

From d647900abd5627a1ebdaca291fddee87df5cd167 Mon Sep 17 00:00:00 2001
From: Fanli Lin <fanli0116@gmail.com>
Date: Thu, 27 Feb 2025 15:06:26 +0800
Subject: [PATCH 29/33] Update tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py

Co-authored-by: hlky <hlky@hlky.ac>
---
 .../pipelines/stable_diffusion/test_stable_diffusion_img2img.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
index 916ba436ab14..434e65258514 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -537,8 +537,8 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
     def test_stable_diffusion_pipeline_with_model_offloading(self):
         backend_empty_cache(torch_device)
-        backend_reset_peak_memory_stats(torch_device)
         backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         inputs = self.get_inputs(torch_device, dtype=torch.float16)
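
NOTE: patches 27-29 here (and 30-32 below) converge on a single call order at every site: backend_reset_max_memory_allocated first, then backend_reset_peak_memory_stats. These helpers are thin per-backend dispatchers so the same test body runs on CUDA, XPU, or other accelerators. A sketch of the dispatch idea follows, with illustrative table contents -- the actual tables live in diffusers.utils.testing_utils and may differ, since not every backend exposes both reset primitives.

    from typing import Callable, Dict, Optional

    import torch

    # Per-backend dispatch tables; "default" is the fallback used when a
    # backend has no matching primitive, so the reset degrades to a no-op
    # instead of raising on unsupported devices.
    BACKEND_RESET_MAX_MEMORY_ALLOCATED: Dict[str, Optional[Callable[[], None]]] = {
        "cuda": torch.cuda.reset_max_memory_allocated,
        "default": None,
    }
    BACKEND_RESET_PEAK_MEMORY_STATS: Dict[str, Optional[Callable[[], None]]] = {
        "cuda": torch.cuda.reset_peak_memory_stats,
        "default": None,
    }

    def _dispatch(device: str, table: Dict[str, Optional[Callable[[], None]]]) -> None:
        # Fall back to the "default" entry for devices without a dedicated hook.
        fn = table.get(device, table["default"])
        if fn is not None:
            fn()

    def backend_reset_max_memory_allocated(device: str) -> None:
        _dispatch(device, BACKEND_RESET_MAX_MEMORY_ALLOCATED)

    def backend_reset_peak_memory_stats(device: str) -> None:
        _dispatch(device, BACKEND_RESET_PEAK_MEMORY_STATS)

With a no-op fallback like this, a backend that lacks one of the two primitives simply skips that reset, which is why the tests can call both in a fixed order on every device.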

From cfbf6019a2c53eb344c6a8cea384c5e509f159e6 Mon Sep 17 00:00:00 2001
From: Fanli Lin <fanli0116@gmail.com>
Date: Thu, 27 Feb 2025 15:06:34 +0800
Subject: [PATCH 30/33] Update tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py

Co-authored-by: hlky <hlky@hlky.ac>
---
 .../pipelines/stable_diffusion/test_stable_diffusion_img2img.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
index 434e65258514..2c27139bb237 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -564,6 +564,7 @@ def test_stable_diffusion_pipeline_with_model_offloading(self):
         )

         backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
         backend_reset_peak_memory_stats(torch_device)

         pipe.enable_model_cpu_offload(device=torch_device)

From 88263e86a33d0ab64c35fb92c6b9ef607acdd1fc Mon Sep 17 00:00:00 2001
From: Fanli Lin <fanli0116@gmail.com>
Date: Thu, 27 Feb 2025 15:08:29 +0800
Subject: [PATCH 31/33] Update tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py

Co-authored-by: hlky <hlky@hlky.ac>
---
 .../pipelines/stable_diffusion/test_stable_diffusion_img2img.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
index 2c27139bb237..82b01a74869a 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -518,8 +518,8 @@ def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None:
     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
         backend_empty_cache(torch_device)
-        backend_reset_peak_memory_stats(torch_device)
         backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
             "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16

From 2e181a39c08e01883e1885f9088dfab9db4222f6 Mon Sep 17 00:00:00 2001
From: Fanli Lin <fanli0116@gmail.com>
Date: Thu, 27 Feb 2025 15:22:38 +0800
Subject: [PATCH 32/33] Apply suggestions from code review

Co-authored-by: hlky <hlky@hlky.ac>
---
 .../stable_diffusion/test_stable_diffusion_inpaint.py      | 4 ++--
 .../test_stable_diffusion_instruction_pix2pix.py           | 2 +-
 .../stable_diffusion_2/test_stable_diffusion_inpaint.py    | 2 +-
 .../stable_diffusion_2/test_stable_diffusion_v_pred.py     | 2 +-
 .../test_stable_diffusion_image_variation.py               | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 8e18584938a6..e21cf23b8cbf 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -709,8 +709,8 @@ def test_stable_diffusion_inpaint_k_lms(self):
     def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self):
         backend_empty_cache(torch_device)
-        backend_reset_peak_memory_stats(torch_device)
         backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         pipe = StableDiffusionInpaintPipeline.from_pretrained(
             "botp/stable-diffusion-v1-5-inpainting", safety_checker=None, torch_dtype=torch.float16
@@ -912,8 +912,8 @@ def test_stable_diffusion_inpaint_k_lms(self):
     def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self):
         backend_empty_cache(torch_device)
-        backend_reset_peak_memory_stats(torch_device)
         backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         vae = AsymmetricAutoencoderKL.from_pretrained(
             "cross-attention/asymmetric-autoencoder-kl-x-1-5", torch_dtype=torch.float16
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
index 1a4aba2914e4..9721bb02ee3e 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
@@ -389,8 +389,8 @@ def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None:
     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
         backend_empty_cache(torch_device)
-        backend_reset_peak_memory_stats(torch_device)
         backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
             "timbrooks/instruct-pix2pix", safety_checker=None, torch_dtype=torch.float16
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
index eecd589fdf5a..2feeaaf11c12 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
@@ -252,8 +252,8 @@ def test_stable_diffusion_inpaint_pipeline_fp16(self):
     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
         backend_empty_cache(torch_device)
-        backend_reset_peak_memory_stats(torch_device)
         backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         init_image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
index dc1da82cd0a4..1953017c0ee8 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
@@ -535,8 +535,8 @@ def test_stable_diffusion_low_cpu_mem_usage_v_pred(self):
     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading_v_pred(self):
         backend_empty_cache(torch_device)
-        backend_reset_peak_memory_stats(torch_device)
         backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         pipeline_id = "stabilityai/stable-diffusion-2"
         prompt = "Andromeda galaxy in a bottle"
diff --git a/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py
index ca25c8bc86f7..f706e7000b28 100644
--- a/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py
+++ b/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py
@@ -263,8 +263,8 @@ def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None:
     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
         backend_empty_cache(torch_device)
-        backend_reset_peak_memory_stats(torch_device)
         backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         pipe = StableDiffusionImageVariationPipeline.from_pretrained(
             "lambdalabs/sd-image-variations-diffusers", safety_checker=None, torch_dtype=torch.float16

From 486f7cc61ad421065c6bed8b8c599b7f1bf98407 Mon Sep 17 00:00:00 2001
From: "Lin, Fanli" <fanli.lin@intel.com>
Date: Wed, 26 Feb 2025 23:23:23 -0800
Subject: [PATCH 33/33] add test marker

---
 .../stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py
index e899e86b8103..66ae581a0529 100644
--- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py
+++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py
@@ -311,6 +311,7 @@ def test_inference_batch_single_identical(self):
     def test_save_load_optional_components(self):
         pass

+    @require_torch_accelerator
     def test_stable_diffusion_xl_inpaint_negative_prompt_embeds(self):
         components = self.get_dummy_components()
         sd_pipe = StableDiffusionXLInpaintPipeline(**components)
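
NOTE: the require_torch_accelerator marker added by this last patch skips the test on CPU-only hosts rather than assuming CUDA, matching the decorator replacements made earlier in the series. A minimal sketch of how such a marker can be written follows; the real decorator ships in diffusers.utils.testing_utils, and the torch_device != "cpu" check here is an illustrative simplification of its availability logic.

    import unittest

    from diffusers.utils.testing_utils import torch_device

    def require_torch_accelerator(test_case):
        # Skip rather than fail when the resolved torch device is plain CPU;
        # any accelerator backend (CUDA, XPU, MPS, ...) lets the test run.
        return unittest.skipUnless(
            torch_device != "cpu", "test requires a torch accelerator"
        )(test_case)

It is applied exactly as in the hunk above: stacked as @require_torch_accelerator directly above the test method.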