diff --git a/README.md b/README.md
index 8454fc9f4..42535a433 100755
--- a/README.md
+++ b/README.md
@@ -305,7 +305,7 @@ only `pytorch1.13` is supported.
 
 **NOTE 3:** Training latent generative models (as e.g. in `configs/example_training/imagenet-f8_cond.yaml`) requires
 retrieving the checkpoint from [Hugging Face](https://huggingface.co/stabilityai/sdxl-vae/tree/main) and replacing
-the `CKPT_PATH` placeholder in [this line](configs/example_training/imagenet-f8_cond.yaml#81). The same is to be done
+the `CKPT_PATH` placeholder in [this line](configs/example_training/imagenet-f8_cond.yaml#L76). The same is to be done
 for the provided text-to-image configs.
 
 ### Building New Diffusion Models
diff --git a/requirements/pt2.txt b/requirements/pt2.txt
index 824473abd..d57f4259e 100644
--- a/requirements/pt2.txt
+++ b/requirements/pt2.txt
@@ -1,6 +1,6 @@
 black==23.7.0
 chardet==5.1.0
-clip @ git+https://github.com/openai/CLIP.git
+clip @ git+https://githubfast.com/openai/CLIP.git
 einops>=0.6.1
 fairscale>=0.4.13
 fire>=0.5.0
@@ -37,6 +37,6 @@ urllib3<1.27,>=1.25.4
 wandb>=0.15.6
 webdataset>=0.2.33
 wheel>=0.41.0
-xformers>=0.0.20
+xformers==0.0.20
 gradio
 streamlit-keyup==0.2.0
diff --git a/requirements/zhengxj_env.txt b/requirements/zhengxj_env.txt
new file mode 100644
index 000000000..eff41d860
--- /dev/null
+++ b/requirements/zhengxj_env.txt
@@ -0,0 +1,167 @@
+aiofiles==23.2.1
+aiohappyeyeballs==2.4.4
+aiohttp==3.11.12
+aiosignal==1.3.2
+altair==5.5.0
+annotated-types==0.7.0
+antlr4-python3-runtime==4.9.3
+anyio==4.8.0
+async-timeout==5.0.1
+attrs==25.1.0
+black==23.7.0
+blinker==1.9.0
+braceexpand==0.1.7
+cachetools==5.5.1
+certifi==2025.1.31
+chardet==5.1.0
+charset-normalizer==3.4.1
+click==8.1.8
+clip @ git+https://githubfast.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
+cmake==3.31.4
+contourpy==1.3.1
+cycler==0.12.1
+docker-pycreds==0.4.0
+einops==0.8.0
+exceptiongroup==1.2.2
+fairscale==0.4.13
+fastapi==0.115.8
+ffmpy==0.5.0
+filelock==3.17.0
+fire==0.7.0
+fonttools==4.55.8
+frozenlist==1.5.0
+fsspec==2025.2.0
+ftfy==6.3.1
+gitdb==4.0.12
+GitPython==3.1.44
+gradio==5.15.0
+gradio_client==1.7.0
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.28.1
+huggingface-hub==0.28.1
+idna==3.10
+imageio==2.37.0
+invisible-watermark==0.2.0
+jedi==0.19.2
+Jinja2==3.1.5
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+kiwisolver==1.4.8
+kornia==0.6.9
+lazy_loader==0.4
+lightning-utilities==0.12.0
+lit==18.1.8
+llvmlite==0.44.0
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib==3.10.0
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.1.0
+mypy-extensions==1.0.0
+narwhals==1.25.2
+natsort==8.4.0
+networkx==3.4.2
+ninja==1.11.1.3
+numba==0.61.0
+numpy==1.24.4
+nvidia-cublas-cu11==11.10.3.66
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+nvidia-cuda-runtime-cu11==11.7.99
+nvidia-cudnn-cu11==8.5.0.96
+nvidia-cufft-cu11==10.9.0.58
+nvidia-curand-cu11==10.2.10.91
+nvidia-cusolver-cu11==11.4.0.1
+nvidia-cusparse-cu11==11.7.4.91
+nvidia-nccl-cu11==2.14.3
+nvidia-nvtx-cu11==11.7.91
+omegaconf==2.3.0
+open-clip-torch==2.24.0
+opencv-python==4.6.0.66
+opencv-python-headless==4.11.0.86
+orjson==3.10.15
+packaging==24.2
+pandas==2.2.3
+parso==0.8.4
+pathspec==0.12.1
+pillow==11.1.0
+platformdirs==4.3.6
+pooch==1.8.2
+propcache==0.2.1
+protobuf==3.20.3
+psutil==6.1.1
+pudb==2024.1.3
+pyarrow==19.0.0
+pydantic==2.10.6
+pydantic_core==2.27.2
+pydeck==0.9.1
+pydub==0.25.1
+Pygments==2.19.1
+PyMatting==1.1.13
+pyparsing==3.2.1
+pyre-extensions==0.0.29
+python-dateutil==2.9.0.post0
+python-multipart==0.0.20
+pytorch-lightning==2.0.1
+pytz==2025.1
+PyWavelets==1.8.0
+PyYAML==6.0.2
+referencing==0.36.2
+regex==2024.11.6
+rembg==2.0.62
+requests==2.32.3
+rich==13.9.4
+rpds-py==0.22.3
+ruff==0.9.5
+safehttpx==0.1.6
+safetensors==0.5.2
+scikit-image==0.25.1
+scipy==1.15.1
+semantic-version==2.10.0
+sentencepiece==0.2.0
+sentry-sdk==2.20.0
+setproctitle==1.3.4
+shellingham==1.5.4
+six==1.17.0
+smmap==5.0.2
+sniffio==1.3.1
+starlette==0.45.3
+streamlit==1.42.0
+streamlit-keyup==0.2.0
+sympy==1.13.3
+tenacity==9.0.0
+tensorboardX==2.6
+termcolor==2.5.0
+tifffile==2025.1.10
+timm==1.0.14
+tokenizers==0.12.1
+toml==0.10.2
+tomli==2.2.1
+tomlkit==0.13.2
+torch==2.0.1
+torch-summary==1.4.5
+torchaudio==2.0.2
+torchdata==0.6.1
+torchmetrics==1.6.1
+torchvision==0.15.2
+tornado==6.4.2
+tqdm==4.67.1
+transformers==4.19.1
+triton==2.0.0
+typer==0.15.1
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2025.1
+urllib3==1.26.20
+urwid==2.6.16
+urwid_readline==0.15.1
+uvicorn==0.34.0
+wandb==0.19.6
+watchdog==6.0.0
+wcwidth==0.2.13
+webdataset==0.2.100
+websockets==14.2
+xformers==0.0.20
+yarl==1.18.3
diff --git a/scripts/demo/streamlit_helpers.py b/scripts/demo/streamlit_helpers.py
index e79fc1933..0a0ea6e6d 100644
--- a/scripts/demo/streamlit_helpers.py
+++ b/scripts/demo/streamlit_helpers.py
@@ -817,6 +817,7 @@ def denoiser(x, sigma, c):
                 if filter is not None:
                     samples = filter(samples)
 
+                grid = torch.stack([samples])
                 grid = rearrange(grid, "n b c h w -> (n h) (b w) c")
                 outputs.image(grid.cpu().numpy())
                 if return_latents:
diff --git a/sgm/modules/attention.py b/sgm/modules/attention.py
index 52a50b7bd..643d11bd0 100644
--- a/sgm/modules/attention.py
+++ b/sgm/modules/attention.py
@@ -2,6 +2,7 @@
 import math
 from inspect import isfunction
 from typing import Any, Optional
+from omegaconf import ListConfig
 
 import torch
 import torch.nn.functional as F
@@ -647,6 +648,8 @@ def __init__(
             f"{in_channels} channels and {n_heads} heads."
         )
 
+        if exists(context_dim) and isinstance(context_dim, ListConfig):
+            context_dim = list(context_dim)
         if exists(context_dim) and not isinstance(context_dim, list):
             context_dim = [context_dim]
         if exists(context_dim) and isinstance(context_dim, list):