Examples:
```py
>>> import torch
- >>> import PIL
- >>> import requests
- >>> from io import BytesIO

>>> from diffusers import LEditsPPPipelineStableDiffusionXL
+ >>> from diffusers.utils import load_image

>>> pipe = LEditsPPPipelineStableDiffusionXL.from_pretrained(
- ...     "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+ ...     "stabilityai/stable-diffusion-xl-base-1.0", variant="fp16", torch_dtype=torch.float16
... )
+ >>> pipe.enable_vae_tiling()
>>> pipe = pipe.to("cuda")

-
- >>> def download_image(url):
- ...     response = requests.get(url)
- ...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")
-
-
>>> img_url = "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/tennis.jpg"
- >>> image = download_image(img_url)
+ >>> image = load_image(img_url).resize((1024, 1024))

>>> _ = pipe.invert(image=image, num_inversion_steps=50, skip=0.2)
@@ -197,7 +190,7 @@ def __init__(self, device):
        # The gaussian kernel is the product of the gaussian function of each dimension.
        kernel = 1
-         meshgrids = torch.meshgrid([torch.arange(size, dtype=torch.float32) for size in kernel_size])
+         meshgrids = torch.meshgrid([torch.arange(size, dtype=torch.float32) for size in kernel_size], indexing="ij")
        for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
            mean = (size - 1) / 2
            kernel *= 1 / (std * math.sqrt(2 * math.pi)) * torch.exp(-(((mgrid - mean) / (2 * std)) ** 2))
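For context on this hunk: `indexing="ij"` only makes the historical default of `torch.meshgrid` explicit, so the resulting kernel is unchanged and the deprecation warning goes away. A minimal standalone sketch (not part of the diff), assuming a recent PyTorch that warns when `indexing` is omitted:

```py
import torch

a = torch.arange(3, dtype=torch.float32)
implicit = torch.meshgrid([a, a])                  # default behavior; emits a UserWarning about indexing
explicit = torch.meshgrid([a, a], indexing="ij")   # same grids, warning-free
assert all(torch.equal(i, e) for i, e in zip(implicit, explicit))
```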
@@ -768,6 +761,35 @@ def denoising_end(self):
    def num_timesteps(self):
        return self._num_timesteps

+     def enable_vae_slicing(self):
+         r"""
+         Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+         compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+         """
+         self.vae.enable_slicing()
+
+     def disable_vae_slicing(self):
+         r"""
+         Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+         computing decoding in one step.
+         """
+         self.vae.disable_slicing()
+
+     def enable_vae_tiling(self):
+         r"""
+         Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+         compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+         processing larger images.
+         """
+         self.vae.enable_tiling()
+
+     def disable_vae_tiling(self):
+         r"""
+         Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+         computing decoding in one step.
+         """
+         self.vae.disable_tiling()
+

    # Copied from diffusers.pipelines.ledits_pp.pipeline_leditspp_stable_diffusion.LEditsPPPipelineStableDiffusion.prepare_unet
    def prepare_unet(self, attention_store, PnP: bool = False):
        attn_procs = {}
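Since these four methods are thin wrappers around the underlying VAE's own `enable_slicing`/`enable_tiling` toggles, a short usage sketch may help; the checkpoint name below is only illustrative:

```py
import torch
from diffusers import LEditsPPPipelineStableDiffusionXL

# Illustrative checkpoint; any SDXL checkpoint supported by the pipeline works the same way.
pipe = LEditsPPPipelineStableDiffusionXL.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", variant="fp16", torch_dtype=torch.float16
).to("cuda")

pipe.enable_vae_slicing()   # decode the batch slice by slice to reduce peak memory
pipe.enable_vae_tiling()    # encode/decode in tiles so large inputs fit in VRAM

# ... run inversion / editing ...

pipe.disable_vae_slicing()  # back to single-pass decoding
pipe.disable_vae_tiling()
```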
@@ -1401,6 +1423,12 @@ def encode_image(self, image, dtype=None, height=None, width=None, resize_mode="
        image = self.image_processor.preprocess(
            image=image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
        )
+         height, width = image.shape[-2:]
+         if height % 32 != 0 or width % 32 != 0:
+             raise ValueError(
+                 "Image height and width must be a factor of 32. "
+                 "Consider down-sampling the input using the `height` and `width` parameters"
+             )
        resized = self.image_processor.postprocess(image=image, output_type="pil")

        if max(image.shape[-2:]) > self.vae.config["sample_size"] * 1.5:
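The new check rejects preprocessed images whose height or width is not divisible by 32. A purely illustrative helper (not part of the pipeline) for picking compliant `height`/`width` values to pass in:

```py
def round_down_to_multiple_of_32(value: int) -> int:
    # Illustrative helper: snap a pixel dimension down to the nearest multiple of 32.
    return max(32, (value // 32) * 32)

print(round_down_to_multiple_of_32(1000))  # 992
print(round_down_to_multiple_of_32(1024))  # 1024
```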
@@ -1439,6 +1467,10 @@ def invert(
        crops_coords_top_left: Tuple[int, int] = (0, 0),
        num_zero_noise_steps: int = 3,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+         height: Optional[int] = None,
+         width: Optional[int] = None,
+         resize_mode: Optional[str] = "default",
+         crops_coords: Optional[Tuple[int, int, int, int]] = None,
    ):
        r"""
        The function to the pipeline for image inversion as described by the [LEDITS++
@@ -1486,6 +1518,8 @@ def invert(
            [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: Output will contain the resized input image(s)
            and respective VAE reconstruction(s).
        """
+         if height is not None and height % 32 != 0 or width is not None and width % 32 != 0:
+             raise ValueError("height and width must be a factor of 32.")

        # Reset attn processor, we do not want to store attn maps during inversion
        self.unet.set_attn_processor(AttnProcessor())
@@ -1510,7 +1544,14 @@ def invert(
        do_classifier_free_guidance = source_guidance_scale > 1.0

        # 1. prepare image
-         x0, resized = self.encode_image(image, dtype=self.text_encoder_2.dtype)
+         x0, resized = self.encode_image(
+             image,
+             dtype=self.text_encoder_2.dtype,
+             height=height,
+             width=width,
+             resize_mode=resize_mode,
+             crops_coords=crops_coords,
+         )
        width = x0.shape[2] * self.vae_scale_factor
        height = x0.shape[3] * self.vae_scale_factor
        self.size = (height, width)
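Putting the user-facing changes together, inversion can now be pointed at arbitrary input sizes through the new `height`/`width` arguments instead of a manual resize. A sketch of how the new arguments could be used, mirroring the checkpoint and image URL from the docstring example:

```py
import torch
from diffusers import LEditsPPPipelineStableDiffusionXL
from diffusers.utils import load_image

pipe = LEditsPPPipelineStableDiffusionXL.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", variant="fp16", torch_dtype=torch.float16
)
pipe.enable_vae_tiling()
pipe = pipe.to("cuda")

image = load_image("https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/tennis.jpg")

# height/width must be divisible by 32, otherwise invert() now raises a ValueError.
_ = pipe.invert(image=image, num_inversion_steps=50, skip=0.2, height=1024, width=1024)
```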