qninhdt committed on
Commit b759b90 · 1 Parent(s): d2c8fab
cc.py CHANGED
@@ -4,18 +4,24 @@ from torchinfo import summary
 from swim.models.content_encoder import ContentEncoder
 from swim.models.decoder import Decoder
 from swim.models.style_encoder import StyleEncoder
-from swim.models.discriminator import Discriminator
-from swim.models.swim_gan import SwimGAN
+from swim.models.discriminator import FeatureDiscriminator
+import vision_aided_loss
+
+# from swim.models.swim_gan import SwimGAN
 
-model = SwimGAN().cuda()
 image = torch.randn(1, 3, 512, 512).to("cuda")
-sample = torch.randn(1, 4, 64, 64).to("cuda")
 style_emb = torch.randn(1, 256).to("cuda")
+content = torch.randn(1, 512, 64, 64).to("cuda")
+content_encoder = ContentEncoder().cuda()
+decoder = Decoder().cuda()
+style_encoder = StyleEncoder().cuda()
+discriminator = FeatureDiscriminator().cuda()
+# i_discriminator = vision_aided_loss.Discriminator(
+#     cv_type="clip", loss_type="multilevel_sigmoid_s", device="cuda"
+# ).to("cuda")
 
-# summary(content_encoder, input_data=(sample,))
-# summary(decoder, input_data=(sample, style_emb))
-# summary(style_encoder, input_data=sample)
-summary(
-    model,
-    input_data=(image),
-)
+summary(content_encoder, input_data=(image,))
+# summary(decoder, input_data=(content, style_emb))
+# summary(style_encoder, input_data=image)
+# summary(discriminator, input_data=(content, style_emb))
+# summary(i_discriminator, input_data=(image,))
 
 
configs/experiment/channel=64.yaml ADDED
@@ -0,0 +1,22 @@
+# @package _global_
+
+defaults:
+  - override /data: swim
+  - override /model: swim_gan
+  - override /callbacks: default
+  - override /trainer: gpu
+
+seed: 42
+
+trainer:
+  max_epochs: 100
+
+model:
+  channels: 64
+  z_c_channels: 256
+  n_enc_resnet_blocks: 4
+  n_dec_resnet_blocks: 6
+  n_f_d_resnet_blocks: 2
+
+data:
+  batch_size: 2
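Following the launch convention in the deleted example.yaml below (# python train.py experiment=example), these new experiment configs would presumably be selected with the experiment override; because this file name itself contains an equals sign, the override value may need quoting. Hypothetical commands, assuming the same entry point:

python train.py 'experiment=channel=64'
python train.py experiment=potato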
configs/experiment/example.yaml DELETED
@@ -1,41 +0,0 @@
-# @package _global_
-
-# to execute this experiment run:
-# python train.py experiment=example
-
-defaults:
-  - override /data: mnist
-  - override /model: mnist
-  - override /callbacks: default
-  - override /trainer: default
-
-# all parameters below will be merged with parameters from default configurations set above
-# this allows you to overwrite only specified parameters
-
-tags: ["mnist", "simple_dense_net"]
-
-seed: 12345
-
-trainer:
-  min_epochs: 10
-  max_epochs: 10
-  gradient_clip_val: 0.5
-
-model:
-  optimizer:
-    lr: 0.002
-  net:
-    lin1_size: 128
-    lin2_size: 256
-    lin3_size: 64
-  compile: false
-
-data:
-  batch_size: 64
-
-logger:
-  wandb:
-    tags: ${tags}
-    group: "mnist"
-  aim:
-    experiment: "mnist"
configs/experiment/potato.yaml ADDED
@@ -0,0 +1,21 @@
+# @package _global_
+
+defaults:
+  - override /data: swim
+  - override /model: swim_gan
+  - override /callbacks: default
+  - override /trainer: gpu
+
+seed: 42
+
+trainer:
+  max_epochs: 100
+
+model:
+  channels: 32
+  z_c_channels: 128
+  n_enc_resnet_blocks: 1
+  n_dec_resnet_blocks: 1
+
+data:
+  batch_size: 2
configs/model/swim_gan.yaml CHANGED
@@ -1,3 +1 @@
 _target_: swim.models.swim_gan.SwimGAN
-
-learning_rate: 1e-4
swim/models/blocks.py CHANGED
@@ -77,11 +77,9 @@ class DownSample(nn.Module):
 class ResnetBlock(nn.Module):
     def __init__(self, in_channels: int, out_channels: int, cond_channels: int = 0):
         super().__init__()
-        # First normalization and convolution layer
-        self.norm1 = normalization(in_channels)
         self.conv1 = nn.Conv2d(in_channels, out_channels, 3, stride=1, padding=1)
+        self.norm1 = normalization(out_channels)
 
-        # cond layer
         self.cond_channels = cond_channels
         if cond_channels > 0:
             self.cond_proj = nn.Sequential(
@@ -90,40 +88,32 @@ class ResnetBlock(nn.Module):
                 nn.Linear(cond_channels, out_channels * 2),
             )
 
-        # Second normalization and convolution layer
-        self.norm2 = normalization(out_channels)
         self.conv2 = nn.Conv2d(out_channels, out_channels, 3, stride=1, padding=1)
-        # `in_channels` to `out_channels` mapping layer for residual connection
+        self.norm2 = normalization(out_channels)
+
         if in_channels != out_channels:
-            self.nin_shortcut = nn.Conv2d(
-                in_channels, out_channels, 1, stride=1, padding=0
-            )
+            self.shortcut = nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0)
         else:
-            self.nin_shortcut = nn.Identity()
+            self.shortcut = nn.Identity()
 
     def forward(self, x: torch.Tensor, cond: torch.Tensor = None):
         h = x
 
-        # First normalization and convolution layer
+        h = self.conv1(h)
         h = self.norm1(h)
         h = F.silu(h)
-        h = self.conv1(h)
 
-        # cond layer
         if cond is not None:
             cond = self.cond_proj(cond)[..., None, None]
             cond_scale, cond_shift = torch.chunk(cond, 2, dim=1)
-            h = self.norm2(h)
             h = h * (1 + cond_scale) + cond_shift
-        else:
-            h = self.norm2(h)
 
-        # Second normalization and convolution layer
-        h = F.silu(h)
         h = self.conv2(h)
+        h = self.norm2(h)
+        h = h + self.shortcut(x)
+        h = F.silu(h)
 
-        # Map and add residual
-        return self.nin_shortcut(x) + h
+        return h
 
 
 def normalization(channels):
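Note on the conditioning path kept above: cond_proj maps the style embedding to 2 × out_channels values that are split into a per-channel scale and shift, i.e. FiLM-style modulation of the feature map. A minimal self-contained sketch of that mechanism (class and variable names here are illustrative, not from the repo):

import torch
import torch.nn as nn

class FiLM(nn.Module):
    # Projects a conditioning vector to a per-channel (scale, shift) pair.
    def __init__(self, cond_dim: int, num_channels: int):
        super().__init__()
        self.proj = nn.Linear(cond_dim, num_channels * 2)

    def forward(self, h: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
        scale, shift = self.proj(cond)[..., None, None].chunk(2, dim=1)
        return h * (1 + scale) + shift

film = FiLM(cond_dim=256, num_channels=64)
out = film(torch.randn(2, 64, 32, 32), torch.randn(2, 256))  # shape preserved: (2, 64, 32, 32)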
swim/models/content_encoder.py CHANGED
@@ -4,53 +4,56 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from .blocks import ResnetBlock, DownSample, AttentionBlock, normalization
+from .blocks import ResnetBlock, normalization
 
 
 class ContentEncoder(nn.Module):
     def __init__(
         self,
-        in_channels: int = 4,
-        z_c_channels: int = 128,
         channels: int = 128,
-        channel_multipliers: List[int] = [1, 2, 4],
-        n_resnet_blocks: int = 2,
+        z_c_channels: int = 512,
+        downsample_channel_mults: List[int] = [1, 2, 4],
+        n_resnet_blocks: int = 4,
     ):
         super().__init__()
-        n_resolutions = len(channel_multipliers)
 
-        self.conv_in = nn.Conv2d(in_channels, channels, 3, stride=1, padding=1)
+        self.conv_in = nn.Conv2d(3, channels, 7, stride=1, padding=3)
+        self.norm_in = normalization(channels)
 
-        channels_list = [m * channels for m in [1] + channel_multipliers]
+        channel_list = [channels * mult for mult in downsample_channel_mults]
 
-        self.down = nn.ModuleList()
-        for i in range(n_resolutions):
-            resnet_blocks = nn.ModuleList()
-            for _ in range(n_resnet_blocks):
-                resnet_blocks.append(ResnetBlock(channels, channels_list[i + 1]))
-                channels = channels_list[i + 1]
-            down = nn.Module()
-            down.block = resnet_blocks
+        self.downsamples = nn.ModuleList()
+        for out_channels in channel_list:
+            self.downsamples.append(
+                nn.Sequential(
+                    nn.Conv2d(channels, out_channels, 4, stride=2, padding=1),
+                    normalization(out_channels),
+                    nn.SiLU(),
+                )
+            )
 
-            if i != n_resolutions - 1:
-                down.downsample = DownSample(channels)
-            else:
-                down.downsample = nn.Identity()
-            self.down.append(down)
+            channels = out_channels
+
+        self.resnet_blocks = nn.ModuleList()
+        for _ in range(n_resnet_blocks):
+            self.resnet_blocks.append(ResnetBlock(channels, channels))
 
-        self.norm_out = normalization(channels)
         self.conv_out = nn.Conv2d(channels, z_c_channels, 3, stride=1, padding=1)
+        self.norm_out = normalization(z_c_channels)
+
+    def forward(self, x: torch.Tensor):
+        h = self.conv_in(x)
+        h = self.norm_in(h)
+        h = F.silu(h)
 
-    def forward(self, img: torch.Tensor):
-        x = self.conv_in(img)
+        for downsample in self.downsamples:
+            h = downsample(h)
 
-        for down in self.down:
-            for block in down.block:
-                x = block(x)
-            x = down.downsample(x)
+        for resnet_block in self.resnet_blocks:
+            h = resnet_block(h)
 
-        x = self.norm_out(x)
-        x = F.silu(x)
-        x = self.conv_out(x)
+        h = self.conv_out(h)
+        h = self.norm_out(h)
+        h = F.silu(h)
 
-        return x
+        return h
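A rough shape check of the new encoder, assuming the defaults shown in the diff (channels=128, z_c_channels=512, downsample_channel_mults=[1, 2, 4], i.e. three stride-2 stages); the result matches the dummy content tensor used in cc.py:

import torch
from swim.models.content_encoder import ContentEncoder

encoder = ContentEncoder()
z_c = encoder(torch.randn(1, 3, 512, 512))
print(z_c.shape)  # expected: torch.Size([1, 512, 64, 64]) after three 2x downsamples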
swim/models/decoder.py CHANGED
@@ -4,68 +4,58 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from .blocks import (
-    ResnetBlock,
-    AttentionBlock,
-    StyledSequential,
-    UpSample,
-    normalization,
-)
+from .blocks import ResnetBlock, normalization
 
 
 class Decoder(nn.Module):
+
     def __init__(
         self,
-        z_c_channels: int = 128,
-        out_channels: int = 4,
         channels: int = 128,
-        channel_multipliers: List[int] = [1, 2, 4],
-        n_resnet_blocks: int = 2,
+        z_c_channels: int = 512,
+        upsample_channel_mults: List[int] = [1, 2, 4],
+        n_resnet_blocks: int = 6,
         d_style_emb: int = 256,
     ):
         super().__init__()
-        num_resolutions = len(channel_multipliers)
 
-        channels_list = [m * channels for m in channel_multipliers]
+        channel_list = [channels * mult for mult in upsample_channel_mults]
 
-        channels = channels_list[-1]
+        self.conv_in = nn.Conv2d(z_c_channels, channel_list[-1], 3, stride=1, padding=1)
+        self.norm_in = normalization(channel_list[-1])
 
-        self.conv_in = nn.Conv2d(z_c_channels, channels, 3, stride=1, padding=1)
+        channels = channel_list[-1]
 
-        self.up = nn.ModuleList()
-        for i in reversed(range(num_resolutions)):
-            resnet_blocks = nn.ModuleList()
-            for _ in range(n_resnet_blocks + 1):
-                resnet_blocks.append(
-                    StyledSequential(
-                        ResnetBlock(channels, channels_list[i], d_style_emb)
-                    )
+        self.resnet_blocks = nn.ModuleList()
+        for _ in range(n_resnet_blocks):
+            self.resnet_blocks.append(ResnetBlock(channels, channels, d_style_emb))
+
+        self.upsamples = nn.ModuleList()
+        for out_channels in channel_list[::-1]:
+            self.upsamples.append(
+                nn.Sequential(
+                    nn.ConvTranspose2d(channels, out_channels, 4, stride=2, padding=1),
+                    normalization(out_channels),
+                    nn.SiLU(),
                 )
-                channels = channels_list[i]
+            )
 
-            up = nn.Module()
-            up.block = resnet_blocks
-            if i != 0:
-                up.upsample = UpSample(channels)
-            else:
-                up.upsample = nn.Identity()
-            self.up.insert(0, up)
+            channels = out_channels
 
-        self.norm_out = normalization(channels)
-        self.conv_out = nn.Conv2d(channels, out_channels, 3, stride=1, padding=1)
+        self.conv_out = nn.Conv2d(channels, 3, 7, stride=1, padding=3)
 
-    def forward(self, z_c: torch.Tensor, z_s: torch.Tensor):
+    def forward(self, x: torch.Tensor, style_emb: torch.Tensor):
+        h = self.conv_in(x)
+        h = self.norm_in(h)
+        h = F.silu(h)
 
-        h = self.conv_in(z_c)
+        for resnet_block in self.resnet_blocks:
+            h = resnet_block(h, style_emb)
 
-        for up in reversed(self.up):
-            for block in up.block:
-                h = block(h, z_s)
-            h = up.upsample(h)
+        for upsample in self.upsamples:
+            h = upsample(h)
 
-        h = self.norm_out(h)
-        h = F.silu(h)
-        img = self.conv_out(h)
+        h = self.conv_out(h)
+        h = torch.tanh(h)
 
-        #
-        return img
+        return h
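And the corresponding check for the decoder, again assuming the defaults from the diff: it takes a 64×64 content map plus a 256-dim style embedding and, after three transposed-convolution upsamples and the final tanh, returns an RGB image in [-1, 1]:

import torch
from swim.models.decoder import Decoder

decoder = Decoder()
x_hat = decoder(torch.randn(1, 512, 64, 64), torch.randn(1, 256))  # (content map, style embedding)
print(x_hat.shape)  # expected: torch.Size([1, 3, 512, 512]), values in [-1, 1] due to tanh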
 
swim/models/discriminator.py CHANGED
@@ -10,10 +10,8 @@ class SNResnetBlock(nn.Module):
         in_channels: int,
         out_channels: int,
         cond_channels: int = 0,
-        downsample: bool = False,
     ):
         super().__init__()
-        self.downsample = downsample
         self.d_cond = cond_channels
         self.conv1 = spectral_norm(
             nn.Conv2d(in_channels, out_channels, 3, stride=1, padding=1)
@@ -32,85 +30,84 @@ class SNResnetBlock(nn.Module):
         )
 
         if in_channels != out_channels:
-            self.nin_shortcut = spectral_norm(
+            self.shortcut = spectral_norm(
                 nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0)
             )
         else:
-            self.nin_shortcut = nn.Identity()
+            self.shortcut = nn.Identity()
 
     def forward(self, x: torch.Tensor, cond: torch.Tensor = None):
         h = x
 
-        h = F.leaky_relu(h, 0.2)
         h = self.conv1(h)
+        h = F.leaky_relu(h, 0.2)
 
         if self.d_cond > 0:
             cond = self.cond_proj(cond)[..., None, None]
             cond_scale, cond_shift = torch.chunk(cond, 2, dim=1)
             h = h * (1 + cond_scale) + cond_shift
-        else:
-            assert cond is None
 
-        h = F.leaky_relu(h, 0.2)
         h = self.conv2(h)
+        h = h + self.shortcut(x)
+        h = F.leaky_relu(h, 0.2)
 
-        h_skip = self.nin_shortcut(x)
-
-        if self.downsample:
-            h = F.avg_pool2d(h, 2)
-            h_skip = F.avg_pool2d(h_skip, 2)
-
-        return h + h_skip
+        return h
 
 
-class Discriminator(nn.Module):
+class FeatureDiscriminator(nn.Module):
 
     def __init__(
         self,
-        in_channels,
-        channels,
-        channel_multipliers,
-        d_cond: int = 0,
+        z_c_channels: int = 512,
+        channels: int = 512,
+        d_style_emb: int = 256,
+        n_resnet_blocks: int = 2,
     ):
         super().__init__()
 
-        self.input_block = spectral_norm(nn.Conv2d(in_channels, channels, 3, padding=1))
+        self.conv_in = spectral_norm(
+            nn.Conv2d(z_c_channels, channels, 3, stride=1, padding=1)
+        )
 
-        self.blocks = nn.ModuleList()
+        self.resnet_blocks = nn.ModuleList()
+        for _ in range(n_resnet_blocks):
+            self.resnet_blocks.append(SNResnetBlock(channels, channels, d_style_emb))
 
-        n_resolutions = len(channel_multipliers)
-        channels_list = [m * channels for m in channel_multipliers]
-        for i in range(n_resolutions):
-            self.blocks.append(
-                SNResnetBlock(
-                    channels, channels_list[i], cond_channels=d_cond, downsample=True
-                )
-            )
-            channels = channels_list[i]
+        self.conv_out = spectral_norm(
+            nn.Conv2d(channels, channels, 3, stride=1, padding=1)
+        )
 
-        self.out = nn.Sequential(
-            nn.LeakyReLU(0.2), spectral_norm(nn.Conv2d(channels, 1, 3, 1, 1))
+        self.mlp = nn.Sequential(
+            spectral_norm(nn.Linear(channels, 256)),
+            nn.LeakyReLU(0.2),
+            spectral_norm(nn.Linear(256, 1)),
         )
 
     def forward(
-        self, x: torch.Tensor, cond: torch.Tensor = None, for_real: bool = False
+        self, x: torch.Tensor, style_emb: torch.Tensor, for_G=False, for_real=False
    ):
-        h = self.input_block(x)
+        h = self.conv_in(x)
+        h = F.leaky_relu(h, 0.2)
+
+        for resnet_block in self.resnet_blocks:
+            h = resnet_block(h, style_emb)
+            h = F.avg_pool2d(h, 2)
+
+        h = self.conv_out(h)
+        h = F.leaky_relu(h, 0.2)
 
-        for block in self.blocks:
-            h = block(h, cond)
+        h = F.adaptive_avg_pool2d(h, 1)
+        h = torch.flatten(h, 1)
+        h = self.mlp(h)
 
-        h = self.out(h)
+        if for_G:
+            for_real = True
 
         if for_real:
             target = torch.ones_like(h)
         else:
             target = torch.zeros_like(h)
 
-        loss = F.binary_cross_entropy_with_logits(h, target)
-        # if for_real:
-        #     loss = torch.relu(1 - h).mean()
-        # else:
-        #     loss = torch.relu(1 + h).mean()
+        loss = F.binary_cross_entropy_with_logits(h, target, reduction="none")
 
         return loss
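Worth noting: with reduction="none" the discriminator now returns the per-sample BCE loss rather than a scalar, which is why every call site in swim_gan.py below appends .mean(). A tiny illustration of that pattern with made-up logits:

import torch
import torch.nn.functional as F

logits = torch.randn(4, 1)            # stand-in for the discriminator head output
target = torch.ones_like(logits)      # "real" targets
loss = F.binary_cross_entropy_with_logits(logits, target, reduction="none")
print(loss.shape, loss.mean())        # per-sample (4, 1) tensor; the caller averages it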
swim/models/style_encoder.py CHANGED
@@ -1,62 +1,24 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from torchvision.models.resnet import resnet18, ResNet18_Weights
 
-from lightning import LightningModule
 
-from .blocks import ResnetBlock, DownSample
+class StyleEncoder(nn.Module):
 
+    def __init__(self, d_style_emb=256):
+        super(StyleEncoder, self).__init__()
+        self.resnet = resnet18(weights=ResNet18_Weights.DEFAULT)
+        self.resnet = nn.Sequential(*list(self.resnet.children())[:-1])
 
-class StyleEncoder(LightningModule):
-    def __init__(
-        self,
-        in_channels: int = 4,
-        n_styles: int = 4,
-        d_style_emb: int = 256,
-        d_hidden: int = 512,
-        channels: int = 64,
-        n_layers: int = 4,
-    ):
-        super().__init__()
+        self.fc = nn.Linear(512, d_style_emb)
 
-        self.n_styles = n_styles
+    def forward(self, x):
+        # resize input to 224x224
+        # x = F.interpolate(x, size=(224, 224), mode="bilinear")
 
-        # Initial convolution
-        self.conv_in = nn.Sequential(
-            nn.Conv2d(in_channels, channels, kernel_size=3, padding=1),
-            nn.GroupNorm(32, channels),
-            nn.SiLU(),
-        )
+        x = self.resnet(x)
+        x = torch.flatten(x, 1)
 
-        # Convolutional blocks with GroupNorm and single convolution
-        self.blocks = nn.ModuleList()
-        for i in range(n_layers):
-            self.blocks.append(
-                nn.Sequential(
-                    nn.Conv2d(
-                        channels, channels * 2, kernel_size=3, stride=2, padding=1
-                    ),  # Downsample
-                    nn.GroupNorm(32, channels * 2),
-                    nn.SiLU(),
-                )
-            )
-            channels *= 2
-
-        # Output MLP
-        self.out = nn.Sequential(
-            nn.AdaptiveAvgPool2d(1),  # Global average pooling
-            nn.Flatten(),  # Flatten spatial dimensions
-            nn.Linear(channels, d_hidden),  # First dense layer
-            nn.SiLU(),
-            nn.Linear(d_hidden, d_style_emb),  # Final style embedding
-        )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        h = self.conv_in(x)  # Initial convolution
-
-        for block in self.blocks:
-            h = block(h)  # Pass through each block
-
-        h = self.out(h)  # Pool and process for style embedding
-
-        return h
+        x = self.fc(x)
+        return x
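The new style encoder swaps the hand-rolled CNN for a pretrained ResNet-18 trunk: children()[:-1] drops only the final fc layer but keeps the global average pool, so the trunk emits (B, 512, 1, 1) features that are flattened and projected to d_style_emb. A quick sketch of that same trunk construction:

import torch
import torch.nn as nn
from torchvision.models.resnet import resnet18, ResNet18_Weights

trunk = nn.Sequential(*list(resnet18(weights=ResNet18_Weights.DEFAULT).children())[:-1])
feats = trunk(torch.randn(2, 3, 512, 512))
print(feats.shape)  # torch.Size([2, 512, 1, 1]); flatten + nn.Linear(512, d_style_emb) follows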
swim/models/swim_gan.py CHANGED
@@ -12,12 +12,14 @@ from lightning import LightningModule
 from diffusers import AutoencoderKL
 from diffusers.utils import make_image_grid
 
-from swim.utils.tensor_pool import GroupTensorPool, TensorPool
+from swim.utils.tensor_pool import GroupTensorPool
 
 from .style_encoder import StyleEncoder
 from .content_encoder import ContentEncoder
 from .decoder import Decoder
-from .discriminator import Discriminator
+from .discriminator import FeatureDiscriminator
+
+import vision_aided_loss
 
 
 class SwimGAN(LightningModule):
@@ -26,16 +28,18 @@ class SwimGAN(LightningModule):
         self,
         channels: int = 128,
         z_c_channels: int = 512,
-        channel_multipliers: list = [1, 2, 2, 4],
-        n_resnet_blocks: int = 2,
+        updown_channel_mults: List[int] = [1, 2, 4],
+        n_enc_resnet_blocks: int = 4,
+        n_dec_resnet_blocks: int = 6,
+        n_f_d_resnet_blocks: int = 2,
         n_styles: int = 5,
         d_style_emb: int = 128,
         input_size: int = 512,
         learning_rate: float = 1e-5,
-        weight_decay: float = 0,
+        weight_decay: float = 1e-4,
         lambda_cls: float = 10.0,
-        lambda_rec: float = 1.0,
-        lambda_cycle: float = 1.0,
+        lambda_rec: float = 10.0,
+        lambda_cycle: float = 10.0,
         lambda_c_g: float = 1.0,
         lambda_x_g: float = 1.0,
         lambda_c_const: float = 1.0,
@@ -58,43 +62,34 @@ class SwimGAN(LightningModule):
         self.lambda_x_g = lambda_x_g
 
         self.content_encoder = ContentEncoder(
-            in_channels=4,
-            z_c_channels=z_c_channels,
             channels=channels,
-            channel_multipliers=[1, 2, 2, 4],
-            n_resnet_blocks=2,
+            z_c_channels=z_c_channels,
+            downsample_channel_mults=[1, 2, 4],
+            n_resnet_blocks=n_enc_resnet_blocks,
         )
 
-        self.style_encoder = StyleEncoder(4, n_styles, d_style_emb)
+        self.style_encoder = StyleEncoder(d_style_emb=d_style_emb)
 
         self.decoder = Decoder(
-            z_c_channels=z_c_channels,
-            out_channels=4,
             channels=channels,
-            channel_multipliers=[1, 2, 2, 4],
-            n_resnet_blocks=2,
+            z_c_channels=z_c_channels,
+            upsample_channel_mults=updown_channel_mults,
+            n_resnet_blocks=n_dec_resnet_blocks,
             d_style_emb=d_style_emb,
         )
 
-        self.vae: AutoencoderKL = AutoencoderKL.from_pretrained(
-            "stabilityai/sd-turbo", subfolder="vae"
-        )
-        self.vae.requires_grad_(False)
-        self.vae.eval()
-
         self.style_classifier = nn.Linear(d_style_emb, n_styles)
 
         # training only
-        self.x_discriminator = Discriminator(
-            in_channels=4,
-            channels=128,
-            channel_multipliers=[1, 2, 4, 8],
+        self.i_discriminator = vision_aided_loss.Discriminator(
+            cv_type="clip", loss_type="multilevel_sigmoid", device="cpu"
        )
-        self.c_discriminator = Discriminator(
-            in_channels=z_c_channels,
+
+        self.f_discriminator = FeatureDiscriminator(
+            z_c_channels=z_c_channels,
             channels=z_c_channels,
-            channel_multipliers=[1, 1, 1],
-            d_cond=d_style_emb,
+            d_style_emb=d_style_emb,
+            n_resnet_blocks=n_f_d_resnet_blocks,
         )
 
         self.style_pool = GroupTensorPool(n_styles, 256)
@@ -102,117 +97,100 @@ class SwimGAN(LightningModule):
 
         self.cls_loss = nn.CrossEntropyLoss()
 
-    def vae_encode(self, x: torch.Tensor) -> torch.Tensor:
-        return self.vae.encode(x).latent_dist.sample() * self.vae.config.scaling_factor
-
-    def vae_decode(self, z: torch.Tensor) -> torch.Tensor:
-        return self.vae.decode(z / self.vae.config.scaling_factor).sample.clamp(-1, 1)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        with torch.no_grad():
-            z = self.vae_encode(x)
-        z_c = self.content_encoder(z)
-        z_s = self.style_encoder(z)
-        z_rec = self.decoder(z_c, z_s)
-
-        return z, z_c, z_s, z_rec
+    def on_fit_start(self):
+        for model in self.i_discriminator.cv_ensemble.models:
+            model.to(self.device)
+            model.requires_grad_(False)
 
     def training_step(self, batch, batch_idx):
         x = batch["images"]
         gt_style = batch["styles"]
 
-        g_opt, cls_opt, x_d_opt, c_d_opt = self.optimizers()
+        g_opt, i_d_opt, f_d_opt = self.optimizers()
 
-        z = self.vae_encode(x)
+        # train the autoencoder
+        z_s = self.style_encoder(x)
+        z_c = self.content_encoder(x)
+        x_rec = self.decoder(z_c, z_s)
 
-        # train the cls
-        z_s = self.style_encoder(z)
         style_logits = self.style_classifier(z_s)
         cls_loss = self.cls_loss(style_logits, gt_style)
 
-        cls_opt.zero_grad()
-        self.manual_backward(cls_loss)
-        cls_opt.step()
-
-        # train the autoencoder
-        z_s = z_s.detach()
-        z_c = self.content_encoder(z)
-        z_rec = self.decoder(z_c, z_s)
-
-        rec_loss = F.l1_loss(z, z_rec)
+        rec_loss = F.l1_loss(x, x_rec)
 
         # # sample a random content and style feature
         z_c_hat, _ = self.content_pool.query(z_c, gt_style)
         z_s_hat, _ = self.style_pool.query(z_s, gt_style)
 
-        z1 = self.decoder(z_c, z_s_hat)
-        z2 = self.decoder(z_c_hat, z_s)
+        x1 = self.decoder(z_c, z_s_hat)
+        x2 = self.decoder(z_c_hat, z_s)
 
-        z_c_rec = self.content_encoder(z1)
-        z_s_hat_rec = self.style_encoder(z1)
+        z_c_rec = self.content_encoder(x1)
+        z_s_hat_rec = self.style_encoder(x1)
 
-        z_c_hat_rec = self.content_encoder(z2)
-        z_s_rec = self.style_encoder(z2)
+        z_c_hat_rec = self.content_encoder(x2)
+        z_s_rec = self.style_encoder(x2)
 
         c_const_loss = F.l1_loss(z_c, z_c_rec) + F.l1_loss(z_c_hat, z_c_hat_rec)
         s_const_loss = F.l1_loss(z_s, z_s_rec) + F.l1_loss(z_s_hat, z_s_hat_rec)
 
         # adversarial loss
-        x_g_loss = (
-            self.x_discriminator(z1, for_real=True)
-            + self.x_discriminator(z2, for_real=True)
+        i_g_loss = (
+            self.i_discriminator(x1, for_G=True).mean()
+            + self.i_discriminator(x2, for_G=True).mean()
         ) / 2
 
-        c_g_loss = self.c_discriminator(z_c, z_s, for_real=False)
+        c_g_loss = self.f_discriminator(z_c, z_s, for_real=False).mean()
 
         g_loss = (
             self.lambda_rec * rec_loss
             + self.lambda_c_const * c_const_loss
             + self.lambda_s_const * s_const_loss
-            + self.lambda_x_g * x_g_loss
+            + self.lambda_x_g * i_g_loss
             + self.lambda_c_g * c_g_loss
+            + self.lambda_cls * cls_loss
         )
 
         g_opt.zero_grad()
         self.manual_backward(g_loss)
         g_opt.step()
 
-        # train the x discriminator
-        x_d_loss = (
-            self.x_discriminator(z, for_real=True)
+        # train the image discriminator
+        i_d_loss = (
+            self.i_discriminator(x, for_real=True).mean()
             + (
-                self.x_discriminator(z1.detach(), for_real=False)
-                + self.x_discriminator(z2.detach(), for_real=False)
+                self.i_discriminator(x1.detach(), for_real=False).mean()
+                + self.i_discriminator(x2.detach(), for_real=False).mean()
             )
             / 2
         ) / 2
 
-        x_d_opt.zero_grad()
-        self.manual_backward(x_d_loss)
-        x_d_opt.step()
+        i_d_opt.zero_grad()
+        self.manual_backward(i_d_loss)
+        i_d_opt.step()
 
-        # train the content discriminator
-        c_d_loss = (
-            self.c_discriminator(z_c.detach(), z_s.detach(), for_real=True)
+        # train the feature discriminator
+        f_d_loss = (
+            self.f_discriminator(z_c.detach(), z_s.detach(), for_real=True).mean()
             + (
-                self.c_discriminator(z_c.detach(), z_s_hat, for_real=False)
-                + self.c_discriminator(z_c_hat, z_s.detach(), for_real=False)
+                self.f_discriminator(z_c.detach(), z_s_hat, for_real=False).mean()
+                + self.f_discriminator(z_c_hat, z_s.detach(), for_real=False).mean()
            )
             / 2
         ) / 2
 
-        c_d_opt.zero_grad()
-        self.manual_backward(c_d_loss)
-        c_d_opt.step()
+        f_d_opt.zero_grad()
+        self.manual_backward(f_d_loss)
+        f_d_opt.step()
 
         self.log_dict(
             {
                 "train/rec_loss": rec_loss,
                 "train/cls_loss": cls_loss,
-                "train/x_g_loss": x_g_loss,
-                "train/x_d_loss": x_d_loss,
+                "train/i_g_loss": i_g_loss,
+                "train/i_d_loss": i_d_loss,
                 "train/c_g_loss": c_g_loss,
-                "train/c_d_loss": c_d_loss,
+                "train/f_d_loss": f_d_loss,
                 "train/c_const_loss": c_const_loss,
                 "train/s_const_loss": s_const_loss,
             },
@@ -223,25 +201,20 @@ class SwimGAN(LightningModule):
 
     def validation_step(self, batch, batch_idx):
         x = batch["images"]
-        gt_style_logits = batch["styles"]  # B x n_styles
 
         x = x[torch.randperm(x.shape[0])]
-        z = self.vae_encode(x)
-        z_c = self.content_encoder(z)
-        z_s = self.style_encoder(z)
-        z_rec = self.decoder(z_c, z_s)
+        z_c = self.content_encoder(x)
+        z_s = self.style_encoder(x)
+        x_rec = self.decoder(z_c, z_s)
 
         x1, x2 = x.chunk(2, dim=0)
 
-        x_rec = self.vae_decode(z_rec)
         x1_rec, x2_rec = x_rec.chunk(2, dim=0)
 
         z1_c, z2_c = z_c.chunk(2, dim=0)
         z1_s, z2_s = z_s.chunk(2, dim=0)
-        z1_swap = self.decoder(z1_c, z2_s)
-        z2_swap = self.decoder(z2_c, z1_s)
-        x1_swap = self.vae_decode(z1_swap)
-        x2_swap = self.vae_decode(z2_swap)
+        x1_swap = self.decoder(z1_c, z2_s)
+        x2_swap = self.decoder(z2_c, z1_s)
 
         if self.trainer.is_global_zero:
             x1_img = self.postprocess_images(x1)
@@ -266,7 +239,7 @@ class SwimGAN(LightningModule):
                 wandb.Image(image, caption="orig | rec | swap") for image in images
             ]
 
-            wandb.log({"val/samples": images})
+            # wandb.log({"val/samples": images})
 
         self.log("val/lpips", -self.global_step, sync_dist=True)
 
@@ -282,28 +255,24 @@ class SwimGAN(LightningModule):
 
     def configure_optimizers(self):
         g_opt = torch.optim.AdamW(
-            list(self.content_encoder.parameters()) + list(self.decoder.parameters()),
+            list(self.content_encoder.parameters())
+            + list(self.style_encoder.parameters())
+            + list(self.style_classifier.parameters())
+            + list(self.decoder.parameters()),
             lr=self.learning_rate,
-            weight_decay=1e-4,
-        )
-
-        cls_opt = torch.optim.AdamW(
-            list(self.style_encoder.parameters())
-            + list(self.style_classifier.parameters()),
-            lr=self.learning_rate * 10,
-            weight_decay=1e-4,
+            weight_decay=self.weight_decay,
        )
 
-        x_d_opt = torch.optim.AdamW(
-            list(self.x_discriminator.parameters()),
-            lr=self.learning_rate * 10,
-            weight_decay=1e-4,
+        i_d_opt = torch.optim.AdamW(
+            list(self.i_discriminator.parameters()),
+            lr=self.learning_rate,
+            weight_decay=self.weight_decay,
        )
 
-        c_d_opt = torch.optim.AdamW(
-            list(self.c_discriminator.parameters()),
-            lr=self.learning_rate * 10,
-            weight_decay=1e-4,
+        f_d_opt = torch.optim.AdamW(
+            list(self.f_discriminator.parameters()),
+            lr=self.learning_rate,
+            weight_decay=self.weight_decay,
        )
 
-        return [g_opt, cls_opt, x_d_opt, c_d_opt], []
+        return [g_opt, i_d_opt, f_d_opt]
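Because training_step drives three optimizers itself via self.optimizers() and self.manual_backward(), the module must run under Lightning's manual optimization; presumably self.automatic_optimization = False is set in the unchanged part of __init__. A minimal sketch of that pattern (illustrative module, not the repo's code):

import torch
from lightning import LightningModule

class ManualThreeOpt(LightningModule):
    def __init__(self):
        super().__init__()
        self.automatic_optimization = False  # required for self.optimizers()/manual_backward()
        self.g = torch.nn.Linear(8, 8)
        self.d1 = torch.nn.Linear(8, 1)
        self.d2 = torch.nn.Linear(8, 1)

    def training_step(self, batch, batch_idx):
        g_opt, d1_opt, d2_opt = self.optimizers()
        g_loss = self.d1(self.g(batch)).mean()
        g_opt.zero_grad()
        self.manual_backward(g_loss)
        g_opt.step()
        # ...the discriminator losses would be stepped the same way with d1_opt / d2_opt

    def configure_optimizers(self):
        make = lambda params: torch.optim.AdamW(params, lr=1e-4)
        return [make(self.g.parameters()), make(self.d1.parameters()), make(self.d2.parameters())]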
swim/train.py CHANGED
@@ -91,9 +91,6 @@ def train(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]:
         model.compile()
 
     log.info("Starting training!")
-    from swim.models.swim_gan import SwimGAN
-
-    # model = SwimGAN.load_from_checkpoint(cfg.get("ckpt_path"), strict=False)
     trainer.fit(model=model, datamodule=datamodule, ckpt_path=cfg.get("ckpt_path"))
 
     train_metrics = trainer.callback_metrics