import torch
from .sd_vae_decoder import VAEAttentionBlock, SDVAEDecoderStateDictConverter
from .sd_unet import ResnetBlock, UpSampler
from .tiler import TileWorker


class SD3VAEDecoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # SD3 latent scaling and shift factors (different from SD 1.x).
        self.scaling_factor = 1.5305
        self.shift_factor = 0.0609
        # SD3 latents have 16 channels.
        self.conv_in = torch.nn.Conv2d(16, 512, kernel_size=3, padding=1)

        self.blocks = torch.nn.ModuleList([
            # Middle block: ResNet -> self-attention -> ResNet, all at 512 channels.
            ResnetBlock(512, 512, eps=1e-6),
            VAEAttentionBlock(1, 512, 512, 1, eps=1e-6),
            ResnetBlock(512, 512, eps=1e-6),
            # Up block 1: 512 channels, 2x upsampling.
            ResnetBlock(512, 512, eps=1e-6),
            ResnetBlock(512, 512, eps=1e-6),
            ResnetBlock(512, 512, eps=1e-6),
            UpSampler(512),
            # Up block 2: 512 channels, 2x upsampling.
            ResnetBlock(512, 512, eps=1e-6),
            ResnetBlock(512, 512, eps=1e-6),
            ResnetBlock(512, 512, eps=1e-6),
            UpSampler(512),
            # Up block 3: 512 -> 256 channels, 2x upsampling.
            ResnetBlock(512, 256, eps=1e-6),
            ResnetBlock(256, 256, eps=1e-6),
            ResnetBlock(256, 256, eps=1e-6),
            UpSampler(256),
            # Up block 4: 256 -> 128 channels, no upsampling.
            ResnetBlock(256, 128, eps=1e-6),
            ResnetBlock(128, 128, eps=1e-6),
            ResnetBlock(128, 128, eps=1e-6),
        ])

        # Output head: GroupNorm -> SiLU -> 3-channel (RGB) projection.
        self.conv_norm_out = torch.nn.GroupNorm(num_channels=128, num_groups=32, eps=1e-6)
        self.conv_act = torch.nn.SiLU()
        self.conv_out = torch.nn.Conv2d(128, 3, kernel_size=3, padding=1)

    def tiled_forward(self, sample, tile_size=64, tile_stride=32):
        # Decode the latent tile by tile to limit peak memory on large inputs.
        hidden_states = TileWorker().tiled_forward(
            lambda x: self.forward(x),
            sample,
            tile_size,
            tile_stride,
            tile_device=sample.device,
            tile_dtype=sample.dtype
        )
        return hidden_states

    def forward(self, sample, tiled=False, tile_size=64, tile_stride=32, **kwargs):
        # Tiled decoding: process the latent in overlapping tiles to save memory.
        if tiled:
            return self.tiled_forward(sample, tile_size=tile_size, tile_stride=tile_stride)

        # 1. Undo the SD3 latent scaling and shift, then project into feature space.
        hidden_states = sample / self.scaling_factor + self.shift_factor
        hidden_states = self.conv_in(hidden_states)
        time_emb = None
        text_emb = None
        res_stack = None

        # 2. Run the ResNet / attention / upsampling blocks.
        for block in self.blocks:
            hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack)

        # 3. Output head: normalization, activation, projection to RGB.
        hidden_states = self.conv_norm_out(hidden_states)
        hidden_states = self.conv_act(hidden_states)
        hidden_states = self.conv_out(hidden_states)
        return hidden_states

    @staticmethod
    def state_dict_converter():
        return SDVAEDecoderStateDictConverter()
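

# A minimal usage sketch (an illustrative addition, not part of the original module):
# it runs the decoder with randomly initialized weights purely as a shape check, and
# assumes the module is executed as part of the package (e.g. via `python -m`) so the
# relative imports above resolve. Real use requires loading pretrained SD3 VAE weights,
# which this sketch skips.
if __name__ == "__main__":
    decoder = SD3VAEDecoder().eval()
    latents = torch.randn(1, 16, 64, 64)  # a 16-channel SD3 latent at 64x64
    with torch.no_grad():
        image = decoder(latents)          # three UpSamplers give 8x spatial upscaling
    print(image.shape)                    # expected: torch.Size([1, 3, 512, 512])
    # Tiled decoding (lower peak memory) can be requested with:
    #   decoder(latents, tiled=True, tile_size=64, tile_stride=32)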