diff --git "a/Week15_\341\204\207\341\205\251\341\206\250\341\204\211\341\205\263\341\206\270\341\204\200\341\205\252\341\204\214\341\205\246_\341\204\200\341\205\265\341\206\267\341\204\214\341\205\265\341\204\213\341\205\263\341\206\253.ipynb" "b/Week15_\341\204\207\341\205\251\341\206\250\341\204\211\341\205\263\341\206\270\341\204\200\341\205\252\341\204\214\341\205\246_\341\204\200\341\205\265\341\206\267\341\204\214\341\205\265\341\204\213\341\205\263\341\206\253.ipynb"
new file mode 100644
index 0000000..440e0c7
--- /dev/null
+++ "b/Week15_\341\204\207\341\205\251\341\206\250\341\204\211\341\205\263\341\206\270\341\204\200\341\205\252\341\204\214\341\205\246_\341\204\200\341\205\265\341\206\267\341\204\214\341\205\265\341\204\213\341\205\263\341\206\253.ipynb"
@@ -0,0 +1,226 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "source": [
+        "### **Week 15 Review Assignment**\n",
+        "\n",
+        "https://github.com/CompVis/latent-diffusion/blob/main/ldm/modules/diffusionmodules/model.py"
+      ],
+      "metadata": {
+        "id": "fJCHLiImgVtD"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "`ResnetBlock.forward`"
+      ],
+      "metadata": {
+        "id": "sFD35zGig3aM"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def forward(self, x, temb):\n",
+        "    # x: input feature map of the current layer (the representation in latent space)\n",
+        "    # temb: embedding of the diffusion timestep t, i.e. the \"time conditioning\" described in the paper\n",
+        "\n",
+        "    h = x\n",
+        "\n",
+        "    # First normalization: stabilizes the latent features so they stay robust to distribution shifts across diffusion steps\n",
+        "    h = self.norm1(h)\n",
+        "\n",
+        "    # SiLU / Swish-style nonlinearity, the standard choice in score-based diffusion models\n",
+        "    h = nonlinearity(h)\n",
+        "\n",
+        "    # First convolution: extracts local spatial features (the basic operation of the UNet backbone)\n",
+        "    h = self.conv1(h)\n",
+        "\n",
+        "    if temb is not None:\n",
+        "        # Project the timestep embedding and add it to the feature map (broadcast over the spatial dimensions)\n",
+        "        h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]\n",
+        "\n",
+        "    # Second normalization\n",
+        "    h = self.norm2(h)\n",
+        "\n",
+        "    # Nonlinearity\n",
+        "    h = nonlinearity(h)\n",
+        "\n",
+        "    # Dropout (prevents overfitting during diffusion training)\n",
+        "    h = self.dropout(h)\n",
+        "\n",
+        "    # Second convolution: the main transformation of the residual block\n",
+        "    h = self.conv2(h)\n",
+        "\n",
+        "    # Shortcut (residual connection) path:\n",
+        "    # if the input and output channel counts differ, the dimensions must be matched\n",
+        "    if self.in_channels != self.out_channels:\n",
+        "        if self.use_conv_shortcut:\n",
+        "            # convolution shortcut\n",
+        "            x = self.conv_shortcut(x)\n",
+        "        else:\n",
+        "            # nin (1x1 convolution) shortcut\n",
+        "            x = self.nin_shortcut(x)\n",
+        "\n",
+        "    # Residual connection\n",
+        "    return x + h"
+      ],
+      "metadata": {
+        "id": "VGWSYvv3gf1R"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "`Model.forward`"
+      ],
+      "metadata": {
+        "id": "OcbCIshOg6ru"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def forward(self, x, t=None, context=None):\n",
+        "    # x: latent feature map at the current diffusion step\n",
+        "    # t: diffusion timestep (integer index)\n",
+        "    # context: conditioning input (e.g., concat conditioning, spatial conditioning)\n",
+        "\n",
+        "    # If context is given, concatenate it along the channel dimension\n",
+        "    if context is not None:\n",
+        "        x = torch.cat((x, context), dim=1)\n",
+        "\n",
+        "    # Whether timestep conditioning is used\n",
+        "    if self.use_timestep:\n",
+        "        assert t is not None\n",
+        "\n",
+        "        # Convert the diffusion timestep t into a sinusoidal embedding\n",
+        "        temb = get_timestep_embedding(t, self.ch)\n",
+        "\n",
+        "        # Project the timestep embedding into the network's internal dimension through an MLP\n",
+        "        temb = self.temb.dense[0](temb)\n",
+        "        temb = nonlinearity(temb)\n",
+        "        temb = self.temb.dense[1](temb)\n",
+        "    else:\n",
+        "        temb = None\n",
+        "\n",
+        "    # Map the input to latent features with the first convolution;\n",
+        "    # hs is a list that stores feature maps for the skip connections\n",
+        "    hs = [self.conv_in(x)]\n",
+        "\n",
+        "    for i_level in range(self.num_resolutions):\n",
+        "        for i_block in range(self.num_res_blocks):\n",
+        "            # Residual block + timestep conditioning\n",
+        "            h = self.down[i_level].block[i_block](hs[-1], temb)\n",
+        "\n",
+        "            # If attention is used at this resolution\n",
+        "            if len(self.down[i_level].attn) > 0:\n",
+        "                h = self.down[i_level].attn[i_block](h)\n",
+        "\n",
+        "            # Save for the skip connection\n",
+        "            hs.append(h)\n",
+        "\n",
+        "        # Downsample unless this is the last resolution\n",
+        "        if i_level != self.num_resolutions - 1:\n",
+        "            hs.append(self.down[i_level].downsample(hs[-1]))\n",
+        "\n",
+        "    # Deepest part of the UNet: the most global semantic representation\n",
+        "    h = hs[-1]\n",
+        "\n",
+        "    # Residual block + timestep conditioning\n",
+        "    h = self.mid.block_1(h, temb)\n",
+        "\n",
+        "    # Self-attention\n",
+        "    h = self.mid.attn_1(h)\n",
+        "\n",
+        "    # Second residual block\n",
+        "    h = self.mid.block_2(h, temb)\n",
+        "\n",
+        "    for i_level in reversed(range(self.num_resolutions)):\n",
+        "        for i_block in range(self.num_res_blocks + 1):\n",
+        "            # Concatenate with the skip connection\n",
+        "            h = self.up[i_level].block[i_block](\n",
+        "                torch.cat([h, hs.pop()], dim=1), temb)\n",
+        "\n",
+        "            # Attention at this resolution, if used\n",
+        "            if len(self.up[i_level].attn) > 0:\n",
+        "                h = self.up[i_level].attn[i_block](h)\n",
+        "\n",
+        "        # Upsample unless this is the last resolution\n",
+        "        if i_level != 0:\n",
+        "            h = self.up[i_level].upsample(h)\n",
+        "\n",
+        "    # Output head (present in the full model.py): final normalization, nonlinearity, and projection\n",
+        "    h = self.norm_out(h)\n",
+        "    h = nonlinearity(h)\n",
+        "    h = self.conv_out(h)\n",
+        "    return h"
+      ],
+      "metadata": {
+        "id": "6kLb35hzgxG-"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
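+    {
+      "cell_type": "markdown",
+      "source": [
+        "For reference, `Model.forward` above relies on `get_timestep_embedding` from the same `model.py`.\n",
+        "The cell below is a minimal, self-contained sketch of such a sinusoidal timestep embedding: an illustrative re-implementation in the usual DDPM style, not the verbatim repo code, and the name `get_timestep_embedding_sketch` is our own."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import math\n",
+        "import torch\n",
+        "\n",
+        "def get_timestep_embedding_sketch(timesteps, embedding_dim):\n",
+        "    # Illustrative sketch of a sinusoidal (Transformer-style) timestep\n",
+        "    # embedding as used by DDPM UNets; not the verbatim repo function.\n",
+        "    # timesteps: 1-D int tensor [B]; returns: float tensor [B, embedding_dim]\n",
+        "    half_dim = embedding_dim // 2\n",
+        "\n",
+        "    # Log-spaced frequencies from 1 down to ~1/10000\n",
+        "    freqs = torch.exp(\n",
+        "        -math.log(10000) * torch.arange(half_dim, dtype=torch.float32) / (half_dim - 1)\n",
+        "    ).to(timesteps.device)\n",
+        "\n",
+        "    args = timesteps.float()[:, None] * freqs[None, :]          # [B, half_dim]\n",
+        "    emb = torch.cat([torch.sin(args), torch.cos(args)], dim=1)  # [B, 2 * half_dim]\n",
+        "\n",
+        "    if embedding_dim % 2 == 1:\n",
+        "        # Zero-pad the last dimension if embedding_dim is odd\n",
+        "        emb = torch.nn.functional.pad(emb, (0, 1))\n",
+        "    return emb\n",
+        "\n",
+        "# Example: embed 4 timesteps into ch=128 dimensions; ResnetBlock then\n",
+        "# projects this and broadcasts it as [:, :, None, None] over space\n",
+        "emb = get_timestep_embedding_sketch(torch.tensor([0, 10, 100, 999]), 128)\n",
+        "print(emb.shape)  # torch.Size([4, 128])"
+      ],
+      "metadata": {},
+      "execution_count": null,
+      "outputs": []
+    }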
+  ]
+}
\ No newline at end of file
diff --git "a/Week15_\354\230\210\354\212\265\352\263\274\354\240\234_\352\271\200\354\247\200\354\235\200.md" "b/Week15_\354\230\210\354\212\265\352\263\274\354\240\234_\352\271\200\354\247\200\354\235\200.md"
new file mode 100644
index 0000000..5a949a0
--- /dev/null
+++ "b/Week15_\354\230\210\354\212\265\352\263\274\354\240\234_\352\271\200\354\247\200\354\235\200.md"
@@ -0,0 +1 @@
+https://equatorial-chard-0cb.notion.site/LDM-High-Resolution-Image-Synthesis-with-Latent-Diffusion-Model-2cac71118dac80f98e3dde7e80d66095?source=copy_link