diff --git a/README.md b/README.md index 8454fc9f4..42535a433 100755 --- a/README.md +++ b/README.md @@ -305,7 +305,7 @@ only `pytorch1.13` is supported. **NOTE 3:** Training latent generative models (as e.g. in `configs/example_training/imagenet-f8_cond.yaml`) requires retrieving the checkpoint from [Hugging Face](https://huggingface.co/stabilityai/sdxl-vae/tree/main) and replacing -the `CKPT_PATH` placeholder in [this line](configs/example_training/imagenet-f8_cond.yaml#81). The same is to be done +the `CKPT_PATH` placeholder in [this line](configs/example_training/imagenet-f8_cond.yaml#L76). The same is to be done for the provided text-to-image configs. ### Building New Diffusion Models diff --git a/requirements/pt2.txt b/requirements/pt2.txt index 824473abd..d57f4259e 100644 --- a/requirements/pt2.txt +++ b/requirements/pt2.txt @@ -1,6 +1,6 @@ black==23.7.0 chardet==5.1.0 -clip @ git+https://github.com/openai/CLIP.git +clip @ git+https://githubfast.com/openai/CLIP.git einops>=0.6.1 fairscale>=0.4.13 fire>=0.5.0 @@ -37,6 +37,6 @@ urllib3<1.27,>=1.25.4 wandb>=0.15.6 webdataset>=0.2.33 wheel>=0.41.0 -xformers>=0.0.20 +xformers==0.0.20 gradio streamlit-keyup==0.2.0 diff --git a/requirements/zhengxj_env.txt b/requirements/zhengxj_env.txt new file mode 100644 index 000000000..eff41d860 --- /dev/null +++ b/requirements/zhengxj_env.txt @@ -0,0 +1,167 @@ +aiofiles==23.2.1 +aiohappyeyeballs==2.4.4 +aiohttp==3.11.12 +aiosignal==1.3.2 +altair==5.5.0 +annotated-types==0.7.0 +antlr4-python3-runtime==4.9.3 +anyio==4.8.0 +async-timeout==5.0.1 +attrs==25.1.0 +black==23.7.0 +blinker==1.9.0 +braceexpand==0.1.7 +cachetools==5.5.1 +certifi==2025.1.31 +chardet==5.1.0 +charset-normalizer==3.4.1 +click==8.1.8 +clip @ git+https://githubfast.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1 +cmake==3.31.4 +contourpy==1.3.1 +cycler==0.12.1 +docker-pycreds==0.4.0 +einops==0.8.0 +exceptiongroup==1.2.2 +fairscale==0.4.13 +fastapi==0.115.8 +ffmpy==0.5.0 +filelock==3.17.0 
+fire==0.7.0 +fonttools==4.55.8 +frozenlist==1.5.0 +fsspec==2025.2.0 +ftfy==6.3.1 +gitdb==4.0.12 +GitPython==3.1.44 +gradio==5.15.0 +gradio_client==1.7.0 +h11==0.14.0 +httpcore==1.0.7 +httpx==0.28.1 +huggingface-hub==0.28.1 +idna==3.10 +imageio==2.37.0 +invisible-watermark==0.2.0 +jedi==0.19.2 +Jinja2==3.1.5 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +kiwisolver==1.4.8 +kornia==0.6.9 +lazy_loader==0.4 +lightning-utilities==0.12.0 +lit==18.1.8 +llvmlite==0.44.0 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +matplotlib==3.10.0 +mdurl==0.1.2 +mpmath==1.3.0 +multidict==6.1.0 +mypy-extensions==1.0.0 +narwhals==1.25.2 +natsort==8.4.0 +networkx==3.4.2 +ninja==1.11.1.3 +numba==0.61.0 +numpy==1.24.4 +nvidia-cublas-cu11==11.10.3.66 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +nvidia-cuda-runtime-cu11==11.7.99 +nvidia-cudnn-cu11==8.5.0.96 +nvidia-cufft-cu11==10.9.0.58 +nvidia-curand-cu11==10.2.10.91 +nvidia-cusolver-cu11==11.4.0.1 +nvidia-cusparse-cu11==11.7.4.91 +nvidia-nccl-cu11==2.14.3 +nvidia-nvtx-cu11==11.7.91 +omegaconf==2.3.0 +open-clip-torch==2.24.0 +opencv-python==4.6.0.66 +opencv-python-headless==4.11.0.86 +orjson==3.10.15 +packaging==24.2 +pandas==2.2.3 +parso==0.8.4 +pathspec==0.12.1 +pillow==11.1.0 +platformdirs==4.3.6 +pooch==1.8.2 +propcache==0.2.1 +protobuf==3.20.3 +psutil==6.1.1 +pudb==2024.1.3 +pyarrow==19.0.0 +pydantic==2.10.6 +pydantic_core==2.27.2 +pydeck==0.9.1 +pydub==0.25.1 +Pygments==2.19.1 +PyMatting==1.1.13 +pyparsing==3.2.1 +pyre-extensions==0.0.29 +python-dateutil==2.9.0.post0 +python-multipart==0.0.20 +pytorch-lightning==2.0.1 +pytz==2025.1 +PyWavelets==1.8.0 +PyYAML==6.0.2 +referencing==0.36.2 +regex==2024.11.6 +rembg==2.0.62 +requests==2.32.3 +rich==13.9.4 +rpds-py==0.22.3 +ruff==0.9.5 +safehttpx==0.1.6 +safetensors==0.5.2 +scikit-image==0.25.1 +scipy==1.15.1 +semantic-version==2.10.0 +sentencepiece==0.2.0 +sentry-sdk==2.20.0 +setproctitle==1.3.4 +shellingham==1.5.4 +six==1.17.0 +smmap==5.0.2 +sniffio==1.3.1 
+starlette==0.45.3 +streamlit==1.42.0 +streamlit-keyup==0.2.0 +sympy==1.13.3 +tenacity==9.0.0 +tensorboardX==2.6 +termcolor==2.5.0 +tifffile==2025.1.10 +timm==1.0.14 +tokenizers==0.12.1 +toml==0.10.2 +tomli==2.2.1 +tomlkit==0.13.2 +torch==2.0.1 +torch-summary==1.4.5 +torchaudio==2.0.2 +torchdata==0.6.1 +torchmetrics==1.6.1 +torchvision==0.15.2 +tornado==6.4.2 +tqdm==4.67.1 +transformers==4.19.1 +triton==2.0.0 +typer==0.15.1 +typing-inspect==0.9.0 +typing_extensions==4.12.2 +tzdata==2025.1 +urllib3==1.26.20 +urwid==2.6.16 +urwid_readline==0.15.1 +uvicorn==0.34.0 +wandb==0.19.6 +watchdog==6.0.0 +wcwidth==0.2.13 +webdataset==0.2.100 +websockets==14.2 +xformers==0.0.20 +yarl==1.18.3 diff --git a/scripts/demo/streamlit_helpers.py b/scripts/demo/streamlit_helpers.py index e79fc1933..0a0ea6e6d 100644 --- a/scripts/demo/streamlit_helpers.py +++ b/scripts/demo/streamlit_helpers.py @@ -817,6 +817,7 @@ def denoiser(x, sigma, c): if filter is not None: samples = filter(samples) + grid = torch.stack([samples]) grid = rearrange(grid, "n b c h w -> (n h) (b w) c") outputs.image(grid.cpu().numpy()) if return_latents: diff --git a/sgm/modules/attention.py b/sgm/modules/attention.py index 52a50b7bd..643d11bd0 100644 --- a/sgm/modules/attention.py +++ b/sgm/modules/attention.py @@ -2,6 +2,7 @@ import math from inspect import isfunction from typing import Any, Optional +from omegaconf import ListConfig import torch import torch.nn.functional as F @@ -647,6 +648,8 @@ def __init__( f"{in_channels} channels and {n_heads} heads." ) + if exists(context_dim) and isinstance(context_dim, ListConfig): + context_dim = list(context_dim) if exists(context_dim) and not isinstance(context_dim, list): context_dim = [context_dim] if exists(context_dim) and isinstance(context_dim, list):