somos99 committed on
Commit cac3654 · verified · 1 Parent(s): 7b49680

Upload 9 files

__init__.py ADDED
File without changes
autoencoder_kl_3d.py ADDED
@@ -0,0 +1,793 @@
1
+ # Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ # ==============================================================================
13
+
14
+ from dataclasses import dataclass
15
+ from typing import Tuple, Optional
16
+ import math
17
+ import random
18
+ import numpy as np
19
+ from einops import rearrange
20
+ import torch
21
+ from torch import Tensor, nn
22
+ import torch.nn.functional as F
23
+
24
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
25
+ from diffusers.models.modeling_outputs import AutoencoderKLOutput
26
+ from diffusers.models.modeling_utils import ModelMixin
27
+ from diffusers.utils.torch_utils import randn_tensor
28
+ from diffusers.utils import BaseOutput
29
+
30
+
31
+ class DiagonalGaussianDistribution(object):
32
+ def __init__(self, parameters: torch.Tensor, deterministic: bool = False):
33
+ if parameters.ndim == 3:
34
+ dim = 2 # (B, L, C)
35
+ elif parameters.ndim == 5 or parameters.ndim == 4:
36
+ dim = 1 # (B, C, T, H, W) / (B, C, H, W)
37
+ else:
38
+ raise NotImplementedError
39
+ self.parameters = parameters
40
+ self.mean, self.logvar = torch.chunk(parameters, 2, dim=dim)
41
+ self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
42
+ self.deterministic = deterministic
43
+ self.std = torch.exp(0.5 * self.logvar)
44
+ self.var = torch.exp(self.logvar)
45
+ if self.deterministic:
46
+ self.var = self.std = torch.zeros_like(
47
+ self.mean, device=self.parameters.device, dtype=self.parameters.dtype
48
+ )
49
+
50
+ def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTensor:
51
+ # make sure sample is on the same device as the parameters and has same dtype
52
+ sample = randn_tensor(
53
+ self.mean.shape,
54
+ generator=generator,
55
+ device=self.parameters.device,
56
+ dtype=self.parameters.dtype,
57
+ )
58
+ x = self.mean + self.std * sample
59
+ return x
60
+
61
+ def kl(self, other: "DiagonalGaussianDistribution" = None) -> torch.Tensor:
62
+ if self.deterministic:
63
+ return torch.Tensor([0.0])
64
+ else:
65
+ reduce_dim = list(range(1, self.mean.ndim))
66
+ if other is None:
67
+ return 0.5 * torch.sum(
68
+ torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
69
+ dim=reduce_dim,
70
+ )
71
+ else:
72
+ return 0.5 * torch.sum(
73
+ torch.pow(self.mean - other.mean, 2) / other.var +
74
+ self.var / other.var -
75
+ 1.0 -
76
+ self.logvar +
77
+ other.logvar,
78
+ dim=reduce_dim,
79
+ )
80
+
81
+ def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = [1, 2, 3]) -> torch.Tensor:
82
+ if self.deterministic:
83
+ return torch.Tensor([0.0])
84
+ logtwopi = np.log(2.0 * np.pi)
85
+ return 0.5 * torch.sum(
86
+ logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
87
+ dim=dims,
88
+ )
89
+
90
+ def mode(self) -> torch.Tensor:
91
+ return self.mean
92
+
93
+
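A minimal usage sketch of the posterior class above (the shapes below are illustrative assumptions, not the model's actual latent size): for a 5D input the parameters are split along dim 1 into mean and log-variance halves.

import torch

moments = torch.randn(2, 32, 4, 16, 16)            # (B, 2 * z_channels, T, H, W), e.g. from an encoder
posterior = DiagonalGaussianDistribution(moments)
z = posterior.sample()                              # (2, 16, 4, 16, 16): mean + std * gaussian noise
kl = posterior.kl()                                 # per-sample KL against N(0, I), shape (2,)
z_det = posterior.mode()                            # deterministic latent (just the mean)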
94
+ @dataclass
95
+ class DecoderOutput(BaseOutput):
96
+ sample: torch.FloatTensor
97
+ posterior: Optional[DiagonalGaussianDistribution] = None
98
+
99
+
100
+ def swish(x: Tensor) -> Tensor:
101
+ return x * torch.sigmoid(x)
102
+
103
+
104
+ def forward_with_checkpointing(module, *inputs, use_checkpointing=False):
105
+ def create_custom_forward(module):
106
+ def custom_forward(*inputs):
107
+ return module(*inputs)
108
+ return custom_forward
109
+
110
+ if use_checkpointing:
111
+ return torch.utils.checkpoint.checkpoint(create_custom_forward(module), *inputs, use_reentrant=False)
112
+ else:
113
+ return module(*inputs)
114
+
115
+
116
+ class Conv3d(nn.Conv3d):
117
+ """
118
+ Performs Conv3d by splitting the input into temporal chunks; results differ from nn.Conv3d by less than 1e-5.
119
+ Only symmetric padding is supported.
120
+ """
121
+
122
+ def forward(self, input):
123
+ B, C, T, H, W = input.shape
124
+ memory_count = (C * T * H * W) * 2 / 1024**3
125
+ if memory_count > 2:
126
+ n_split = math.ceil(memory_count / 2)
127
+ assert n_split >= 2
128
+ chunks = torch.chunk(input, chunks=n_split, dim=-3)
129
+ padded_chunks = []
130
+ for i in range(len(chunks)):
131
+ if self.padding[0] > 0:
132
+ padded_chunk = F.pad(
133
+ chunks[i],
134
+ (0, 0, 0, 0, self.padding[0], self.padding[0]),
135
+ mode="constant" if self.padding_mode == "zeros" else self.padding_mode,
136
+ value=0,
137
+ )
138
+ if i > 0:
139
+ padded_chunk[:, :, :self.padding[0]] = chunks[i - 1][:, :, -self.padding[0]:]
140
+ if i < len(chunks) - 1:
141
+ padded_chunk[:, :, -self.padding[0]:] = chunks[i + 1][:, :, :self.padding[0]]
142
+ else:
143
+ padded_chunk = chunks[i]
144
+ padded_chunks.append(padded_chunk)
145
+ padding_bak = self.padding
146
+ self.padding = (0, self.padding[1], self.padding[2])
147
+ outputs = []
148
+ for i in range(len(padded_chunks)):
149
+ outputs.append(super().forward(padded_chunks[i]))
150
+ self.padding = padding_bak
151
+ return torch.cat(outputs, dim=-3)
152
+ else:
153
+ return super().forward(input)
154
+
155
+
156
+ class AttnBlock(nn.Module):
157
+ """ Attention with torch sdpa implementation. """
158
+ def __init__(self, in_channels: int):
159
+ super().__init__()
160
+ self.in_channels = in_channels
161
+
162
+ self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
163
+
164
+ self.q = Conv3d(in_channels, in_channels, kernel_size=1)
165
+ self.k = Conv3d(in_channels, in_channels, kernel_size=1)
166
+ self.v = Conv3d(in_channels, in_channels, kernel_size=1)
167
+ self.proj_out = Conv3d(in_channels, in_channels, kernel_size=1)
168
+
169
+ def attention(self, h_: Tensor) -> Tensor:
170
+ h_ = self.norm(h_)
171
+ q = self.q(h_)
172
+ k = self.k(h_)
173
+ v = self.v(h_)
174
+
175
+ b, c, f, h, w = q.shape
176
+ q = rearrange(q, "b c f h w -> b 1 (f h w) c").contiguous()
177
+ k = rearrange(k, "b c f h w -> b 1 (f h w) c").contiguous()
178
+ v = rearrange(v, "b c f h w -> b 1 (f h w) c").contiguous()
179
+ h_ = nn.functional.scaled_dot_product_attention(q, k, v)
180
+
181
+ return rearrange(h_, "b 1 (f h w) c -> b c f h w", f=f, h=h, w=w, c=c, b=b)
182
+
183
+ def forward(self, x: Tensor) -> Tensor:
184
+ return x + self.proj_out(self.attention(x))
185
+
186
+
187
+ class ResnetBlock(nn.Module):
188
+ def __init__(self, in_channels: int, out_channels: int):
189
+ super().__init__()
190
+ self.in_channels = in_channels
191
+ out_channels = in_channels if out_channels is None else out_channels
192
+ self.out_channels = out_channels
193
+
194
+ self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
195
+ self.conv1 = Conv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
196
+ self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
197
+ self.conv2 = Conv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
198
+ if self.in_channels != self.out_channels:
199
+ self.nin_shortcut = Conv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
200
+
201
+ def forward(self, x):
202
+ h = x
203
+ h = self.norm1(h)
204
+ h = swish(h)
205
+ h = self.conv1(h)
206
+
207
+ h = self.norm2(h)
208
+ h = swish(h)
209
+ h = self.conv2(h)
210
+
211
+ if self.in_channels != self.out_channels:
212
+ x = self.nin_shortcut(x)
213
+ return x + h
214
+
215
+
216
+ class Downsample(nn.Module):
217
+ def __init__(self, in_channels: int, add_temporal_downsample: bool = True):
218
+ super().__init__()
219
+ self.add_temporal_downsample = add_temporal_downsample
220
+ stride = (2, 2, 2) if add_temporal_downsample else (1, 2, 2) # THW
221
+ # no asymmetric padding in torch conv, must do it ourselves
222
+ self.conv = Conv3d(in_channels, in_channels, kernel_size=3, stride=stride, padding=0)
223
+
224
+ def forward(self, x: Tensor):
225
+ spatial_pad = (0, 1, 0, 1, 0, 0) # WHT
226
+ x = nn.functional.pad(x, spatial_pad, mode="constant", value=0)
227
+
228
+ temporal_pad = (0, 0, 0, 0, 0, 1) if self.add_temporal_downsample else (0, 0, 0, 0, 1, 1)
229
+ x = nn.functional.pad(x, temporal_pad, mode="replicate")
230
+
231
+ x = self.conv(x)
232
+ return x
233
+
234
+
235
+ class DownsampleDCAE(nn.Module):
236
+ def __init__(self, in_channels: int, out_channels: int, add_temporal_downsample: bool = True):
237
+ super().__init__()
238
+ factor = 2 * 2 * 2 if add_temporal_downsample else 1 * 2 * 2
239
+ assert out_channels % factor == 0
240
+ self.conv = Conv3d(in_channels, out_channels // factor, kernel_size=3, stride=1, padding=1)
241
+
242
+ self.add_temporal_downsample = add_temporal_downsample
243
+ self.group_size = factor * in_channels // out_channels
244
+
245
+ def forward(self, x: Tensor):
246
+ r1 = 2 if self.add_temporal_downsample else 1
247
+ h = self.conv(x)
248
+ h = rearrange(h, "b c (f r1) (h r2) (w r3) -> b (r1 r2 r3 c) f h w", r1=r1, r2=2, r3=2)
249
+ shortcut = rearrange(x, "b c (f r1) (h r2) (w r3) -> b (r1 r2 r3 c) f h w", r1=r1, r2=2, r3=2)
250
+
251
+ B, C, T, H, W = shortcut.shape
252
+ shortcut = shortcut.view(B, h.shape[1], self.group_size, T, H, W).mean(dim=2)
253
+ return h + shortcut
254
+
255
+
256
+ class Upsample(nn.Module):
257
+ def __init__(self, in_channels: int, add_temporal_upsample: bool = True):
258
+ super().__init__()
259
+ self.add_temporal_upsample = add_temporal_upsample
260
+ self.scale_factor = (2, 2, 2) if add_temporal_upsample else (1, 2, 2) # THW
261
+ self.conv = Conv3d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
262
+
263
+ def forward(self, x: Tensor):
264
+ x = nn.functional.interpolate(x, scale_factor=self.scale_factor, mode="nearest")
265
+ x = self.conv(x)
266
+ return x
267
+
268
+
269
+ class UpsampleDCAE(nn.Module):
270
+ def __init__(self, in_channels: int, out_channels: int, add_temporal_upsample: bool = True):
271
+ super().__init__()
272
+ factor = 2 * 2 * 2 if add_temporal_upsample else 1 * 2 * 2
273
+ self.conv = Conv3d(in_channels, out_channels * factor, kernel_size=3, stride=1, padding=1)
274
+
275
+ self.add_temporal_upsample = add_temporal_upsample
276
+ self.repeats = factor * out_channels // in_channels
277
+
278
+ def forward(self, x: Tensor):
279
+ r1 = 2 if self.add_temporal_upsample else 1
280
+ h = self.conv(x)
281
+ h = rearrange(h, "b (r1 r2 r3 c) f h w -> b c (f r1) (h r2) (w r3)", r1=r1, r2=2, r3=2)
282
+ shortcut = x.repeat_interleave(repeats=self.repeats, dim=1)
283
+ shortcut = rearrange(shortcut, "b (r1 r2 r3 c) f h w -> b c (f r1) (h r2) (w r3)", r1=r1, r2=2, r3=2)
284
+ return h + shortcut
285
+
286
+
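For orientation, a shape sketch of the two DCAE-style resampling blocks above; the channel counts and tensor sizes are illustrative assumptions, chosen so the divisibility constraints in the constructors hold.

down = DownsampleDCAE(in_channels=128, out_channels=256, add_temporal_downsample=False)
up = UpsampleDCAE(in_channels=256, out_channels=128, add_temporal_upsample=False)
x = torch.randn(1, 128, 4, 64, 64)   # (B, C, T, H, W)
y = down(x)                          # -> (1, 256, 4, 32, 32): H and W halved, channels doubled
x_rec = up(y)                        # -> (1, 128, 4, 64, 64): spatial shape restored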
287
+ class Encoder(nn.Module):
288
+ """
289
+ The encoder network of AutoencoderKLConv3D.
290
+ """
291
+ def __init__(
292
+ self,
293
+ in_channels: int,
294
+ z_channels: int,
295
+ block_out_channels: Tuple[int, ...],
296
+ num_res_blocks: int,
297
+ ffactor_spatial: int,
298
+ ffactor_temporal: int,
299
+ downsample_match_channel: bool = True,
300
+ ):
301
+ super().__init__()
302
+ assert block_out_channels[-1] % (2 * z_channels) == 0
303
+
304
+ self.z_channels = z_channels
305
+ self.block_out_channels = block_out_channels
306
+ self.num_res_blocks = num_res_blocks
307
+
308
+ # downsampling
309
+ self.conv_in = Conv3d(in_channels, block_out_channels[0], kernel_size=3, stride=1, padding=1)
310
+
311
+ self.down = nn.ModuleList()
312
+ block_in = block_out_channels[0]
313
+ for i_level, ch in enumerate(block_out_channels):
314
+ block = nn.ModuleList()
315
+ block_out = ch
316
+ for _ in range(self.num_res_blocks):
317
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
318
+ block_in = block_out
319
+ down = nn.Module()
320
+ down.block = block
321
+
322
+ add_spatial_downsample = bool(i_level < np.log2(ffactor_spatial))
323
+ add_temporal_downsample = (add_spatial_downsample and
324
+ bool(i_level >= np.log2(ffactor_spatial // ffactor_temporal)))
325
+ if add_spatial_downsample or add_temporal_downsample:
326
+ assert i_level < len(block_out_channels) - 1
327
+ block_out = block_out_channels[i_level + 1] if downsample_match_channel else block_in
328
+ down.downsample = DownsampleDCAE(block_in, block_out, add_temporal_downsample)
329
+ block_in = block_out
330
+ self.down.append(down)
331
+
332
+ # middle
333
+ self.mid = nn.Module()
334
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
335
+ self.mid.attn_1 = AttnBlock(block_in)
336
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
337
+
338
+ # end
339
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
340
+ self.conv_out = Conv3d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1)
341
+
342
+ self.gradient_checkpointing = False
343
+
344
+ def forward(self, x: Tensor) -> Tensor:
345
+ use_checkpointing = bool(self.training and self.gradient_checkpointing)
346
+
347
+ # downsampling
348
+ h = self.conv_in(x)
349
+ for i_level in range(len(self.block_out_channels)):
350
+ for i_block in range(self.num_res_blocks):
351
+ h = forward_with_checkpointing(
352
+ self.down[i_level].block[i_block], h, use_checkpointing=use_checkpointing)
353
+ if hasattr(self.down[i_level], "downsample"):
354
+ h = forward_with_checkpointing(self.down[i_level].downsample, h, use_checkpointing=use_checkpointing)
355
+
356
+ # middle
357
+ h = forward_with_checkpointing(self.mid.block_1, h, use_checkpointing=use_checkpointing)
358
+ h = forward_with_checkpointing(self.mid.attn_1, h, use_checkpointing=use_checkpointing)
359
+ h = forward_with_checkpointing(self.mid.block_2, h, use_checkpointing=use_checkpointing)
360
+
361
+ # end
362
+ group_size = self.block_out_channels[-1] // (2 * self.z_channels)
363
+ shortcut = rearrange(h, "b (c r) f h w -> b c r f h w", r=group_size).mean(dim=2)
364
+ h = self.norm_out(h)
365
+ h = swish(h)
366
+ h = self.conv_out(h)
367
+ h += shortcut
368
+ return h
369
+
370
+
371
+ class Decoder(nn.Module):
372
+ """
373
+ The decoder network of AutoencoderKLConv3D.
374
+ """
375
+ def __init__(
376
+ self,
377
+ z_channels: int,
378
+ out_channels: int,
379
+ block_out_channels: Tuple[int, ...],
380
+ num_res_blocks: int,
381
+ ffactor_spatial: int,
382
+ ffactor_temporal: int,
383
+ upsample_match_channel: bool = True,
384
+ ):
385
+ super().__init__()
386
+ assert block_out_channels[0] % z_channels == 0
387
+
388
+ self.z_channels = z_channels
389
+ self.block_out_channels = block_out_channels
390
+ self.num_res_blocks = num_res_blocks
391
+
392
+ # z to block_in
393
+ block_in = block_out_channels[0]
394
+ self.conv_in = Conv3d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
395
+
396
+ # middle
397
+ self.mid = nn.Module()
398
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
399
+ self.mid.attn_1 = AttnBlock(block_in)
400
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
401
+
402
+ # upsampling
403
+ self.up = nn.ModuleList()
404
+ for i_level, ch in enumerate(block_out_channels):
405
+ block = nn.ModuleList()
406
+ block_out = ch
407
+ for _ in range(self.num_res_blocks + 1):
408
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
409
+ block_in = block_out
410
+ up = nn.Module()
411
+ up.block = block
412
+
413
+ add_spatial_upsample = bool(i_level < np.log2(ffactor_spatial))
414
+ add_temporal_upsample = bool(i_level < np.log2(ffactor_temporal))
415
+ if add_spatial_upsample or add_temporal_upsample:
416
+ assert i_level < len(block_out_channels) - 1
417
+ block_out = block_out_channels[i_level + 1] if upsample_match_channel else block_in
418
+ up.upsample = UpsampleDCAE(block_in, block_out, add_temporal_upsample)
419
+ block_in = block_out
420
+ self.up.append(up)
421
+
422
+ # end
423
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
424
+ self.conv_out = Conv3d(block_in, out_channels, kernel_size=3, stride=1, padding=1)
425
+
426
+ self.gradient_checkpointing = False
427
+
428
+ def forward(self, z: Tensor) -> Tensor:
429
+ use_checkpointing = bool(self.training and self.gradient_checkpointing)
430
+
431
+ # z to block_in
432
+ repeats = self.block_out_channels[0] // (self.z_channels)
433
+ h = self.conv_in(z) + z.repeat_interleave(repeats=repeats, dim=1)
434
+
435
+ # middle
436
+ h = forward_with_checkpointing(self.mid.block_1, h, use_checkpointing=use_checkpointing)
437
+ h = forward_with_checkpointing(self.mid.attn_1, h, use_checkpointing=use_checkpointing)
438
+ h = forward_with_checkpointing(self.mid.block_2, h, use_checkpointing=use_checkpointing)
439
+
440
+ # upsampling
441
+ for i_level in range(len(self.block_out_channels)):
442
+ for i_block in range(self.num_res_blocks + 1):
443
+ h = forward_with_checkpointing(self.up[i_level].block[i_block], h, use_checkpointing=use_checkpointing)
444
+ if hasattr(self.up[i_level], "upsample"):
445
+ h = forward_with_checkpointing(self.up[i_level].upsample, h, use_checkpointing=use_checkpointing)
446
+
447
+ # end
448
+ h = self.norm_out(h)
449
+ h = swish(h)
450
+ h = self.conv_out(h)
451
+ return h
452
+
453
+
454
+ class AutoencoderKLConv3D(ModelMixin, ConfigMixin):
455
+ """
456
+ Autoencoder model with KL-regularized latent space based on 3D convolutions.
457
+ """
458
+ _supports_gradient_checkpointing = True
459
+
460
+ @register_to_config
461
+ def __init__(
462
+ self,
463
+ in_channels: int,
464
+ out_channels: int,
465
+ latent_channels: int,
466
+ block_out_channels: Tuple[int, ...],
467
+ layers_per_block: int,
468
+ ffactor_spatial: int,
469
+ ffactor_temporal: int,
470
+ sample_size: int,
471
+ sample_tsize: int,
472
+ scaling_factor: float = None,
473
+ shift_factor: Optional[float] = None,
474
+ downsample_match_channel: bool = True,
475
+ upsample_match_channel: bool = True,
476
+ only_encoder: bool = False, # only build encoder for saving memory
477
+ only_decoder: bool = False, # only build decoder for saving memory
478
+ ):
479
+ super().__init__()
480
+ self.ffactor_spatial = ffactor_spatial
481
+ self.ffactor_temporal = ffactor_temporal
482
+ self.scaling_factor = scaling_factor
483
+ self.shift_factor = shift_factor
484
+
485
+ # build model
486
+ if not only_decoder:
487
+ self.encoder = Encoder(
488
+ in_channels=in_channels,
489
+ z_channels=latent_channels,
490
+ block_out_channels=block_out_channels,
491
+ num_res_blocks=layers_per_block,
492
+ ffactor_spatial=ffactor_spatial,
493
+ ffactor_temporal=ffactor_temporal,
494
+ downsample_match_channel=downsample_match_channel,
495
+ )
496
+ if not only_encoder:
497
+ self.decoder = Decoder(
498
+ z_channels=latent_channels,
499
+ out_channels=out_channels,
500
+ block_out_channels=list(reversed(block_out_channels)),
501
+ num_res_blocks=layers_per_block,
502
+ ffactor_spatial=ffactor_spatial,
503
+ ffactor_temporal=ffactor_temporal,
504
+ upsample_match_channel=upsample_match_channel,
505
+ )
506
+
507
+ # slicing and tiling related
508
+ self.use_slicing = False
509
+ self.slicing_bsz = 1
510
+ self.use_spatial_tiling = False
511
+ self.use_temporal_tiling = False
512
+ self.use_tiling_during_training = False
513
+
514
+ # only relevant if vae tiling is enabled
515
+ self.tile_sample_min_size = sample_size
516
+ self.tile_latent_min_size = sample_size // ffactor_spatial
517
+ self.tile_sample_min_tsize = sample_tsize
518
+ self.tile_latent_min_tsize = sample_tsize // ffactor_temporal
519
+ self.tile_overlap_factor = 0.25
520
+
521
+ # use torch.compile for faster encode speed
522
+ self.use_compile = False
523
+
524
+ def _set_gradient_checkpointing(self, module, value=False):
525
+ if isinstance(module, (Encoder, Decoder)):
526
+ module.gradient_checkpointing = value
527
+
528
+ def enable_tiling_during_training(self, use_tiling: bool = True):
529
+ self.use_tiling_during_training = use_tiling
530
+
531
+ def disable_tiling_during_training(self):
532
+ self.enable_tiling_during_training(False)
533
+
534
+ def enable_temporal_tiling(self, use_tiling: bool = True):
535
+ self.use_temporal_tiling = use_tiling
536
+
537
+ def disable_temporal_tiling(self):
538
+ self.enable_temporal_tiling(False)
539
+
540
+ def enable_spatial_tiling(self, use_tiling: bool = True):
541
+ self.use_spatial_tiling = use_tiling
542
+
543
+ def disable_spatial_tiling(self):
544
+ self.enable_spatial_tiling(False)
545
+
546
+ def enable_tiling(self, use_tiling: bool = True):
547
+ self.enable_spatial_tiling(use_tiling)
548
+
549
+ def disable_tiling(self):
550
+ self.disable_spatial_tiling()
551
+
552
+ def enable_slicing(self):
553
+ self.use_slicing = True
554
+
555
+ def disable_slicing(self):
556
+ self.use_slicing = False
557
+
558
+ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int):
559
+ blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
560
+ for x in range(blend_extent):
561
+ b[:, :, :, :, x] = \
562
+ a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (x / blend_extent)
563
+ return b
564
+
565
+ def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int):
566
+ blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
567
+ for y in range(blend_extent):
568
+ b[:, :, :, y, :] = \
569
+ a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (y / blend_extent)
570
+ return b
571
+
572
+ def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int):
573
+ blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)
574
+ for x in range(blend_extent):
575
+ b[:, :, x, :, :] = \
576
+ a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * (x / blend_extent)
577
+ return b
578
+
579
+ def spatial_tiled_encode(self, x: torch.Tensor):
580
+ """ spatial tiling for frames """
581
+ B, C, T, H, W = x.shape
582
+ overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor)) # 256 * (1 - 0.25) = 192
583
+ blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor) # 8 * 0.25 = 2
584
+ row_limit = self.tile_latent_min_size - blend_extent # 8 - 2 = 6
585
+
586
+ rows = []
587
+ for i in range(0, H, overlap_size):
588
+ row = []
589
+ for j in range(0, W, overlap_size):
590
+ tile = x[:, :, :, i: i + self.tile_sample_min_size, j: j + self.tile_sample_min_size]
591
+ tile = self.encoder(tile)
592
+ row.append(tile)
593
+ rows.append(row)
594
+ result_rows = []
595
+ for i, row in enumerate(rows):
596
+ result_row = []
597
+ for j, tile in enumerate(row):
598
+ if i > 0:
599
+ tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
600
+ if j > 0:
601
+ tile = self.blend_h(row[j - 1], tile, blend_extent)
602
+ result_row.append(tile[:, :, :, :row_limit, :row_limit])
603
+ result_rows.append(torch.cat(result_row, dim=-1))
604
+ moments = torch.cat(result_rows, dim=-2)
605
+ return moments
606
+
607
+ def temporal_tiled_encode(self, x: torch.Tensor):
608
+ """ temporal tiling for frames """
609
+ B, C, T, H, W = x.shape
610
+ overlap_size = int(self.tile_sample_min_tsize * (1 - self.tile_overlap_factor)) # 64 * (1 - 0.25) = 48
611
+ blend_extent = int(self.tile_latent_min_tsize * self.tile_overlap_factor) # 8 * 0.25 = 2
612
+ t_limit = self.tile_latent_min_tsize - blend_extent # 8 - 2 = 6
613
+
614
+ row = []
615
+ for i in range(0, T, overlap_size):
616
+ tile = x[:, :, i: i + self.tile_sample_min_tsize, :, :]
617
+ if self.use_spatial_tiling and (
618
+ tile.shape[-1] > self.tile_sample_min_size or tile.shape[-2] > self.tile_sample_min_size):
619
+ tile = self.spatial_tiled_encode(tile)
620
+ else:
621
+ tile = self.encoder(tile)
622
+ row.append(tile)
623
+ result_row = []
624
+ for i, tile in enumerate(row):
625
+ if i > 0:
626
+ tile = self.blend_t(row[i - 1], tile, blend_extent)
627
+ result_row.append(tile[:, :, :t_limit, :, :])
628
+ moments = torch.cat(result_row, dim=-3)
629
+ return moments
630
+
631
+ def spatial_tiled_decode(self, z: torch.Tensor):
632
+ """ spatial tiling for latents """
633
+ B, C, T, H, W = z.shape
634
+ overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor)) # 8 * (1 - 0.25) = 6
635
+ blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor) # 256 * 0.25 = 64
636
+ row_limit = self.tile_sample_min_size - blend_extent # 256 - 64 = 192
637
+
638
+ rows = []
639
+ for i in range(0, H, overlap_size):
640
+ row = []
641
+ for j in range(0, W, overlap_size):
642
+ tile = z[:, :, :, i: i + self.tile_latent_min_size, j: j + self.tile_latent_min_size]
643
+ decoded = self.decoder(tile)
644
+ row.append(decoded)
645
+ rows.append(row)
646
+
647
+ result_rows = []
648
+ for i, row in enumerate(rows):
649
+ result_row = []
650
+ for j, tile in enumerate(row):
651
+ if i > 0:
652
+ tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
653
+ if j > 0:
654
+ tile = self.blend_h(row[j - 1], tile, blend_extent)
655
+ result_row.append(tile[:, :, :, :row_limit, :row_limit])
656
+ result_rows.append(torch.cat(result_row, dim=-1))
657
+ dec = torch.cat(result_rows, dim=-2)
658
+ return dec
659
+
660
+ def temporal_tiled_decode(self, z: torch.Tensor):
661
+ """ temporal tiling for latents """
662
+ B, C, T, H, W = z.shape
663
+ overlap_size = int(self.tile_latent_min_tsize * (1 - self.tile_overlap_factor)) # 8 * (1 - 0.25) = 6
664
+ blend_extent = int(self.tile_sample_min_tsize * self.tile_overlap_factor) # 64 * 0.25 = 16
665
+ t_limit = self.tile_sample_min_tsize - blend_extent # 64 - 16 = 48
666
+ assert 0 < overlap_size < self.tile_latent_min_tsize
667
+
668
+ row = []
669
+ for i in range(0, T, overlap_size):
670
+ tile = z[:, :, i: i + self.tile_latent_min_tsize, :, :]
671
+ if self.use_spatial_tiling and (
672
+ tile.shape[-1] > self.tile_latent_min_size or tile.shape[-2] > self.tile_latent_min_size):
673
+ decoded = self.spatial_tiled_decode(tile)
674
+ else:
675
+ decoded = self.decoder(tile)
676
+ row.append(decoded)
677
+
678
+ result_row = []
679
+ for i, tile in enumerate(row):
680
+ if i > 0:
681
+ tile = self.blend_t(row[i - 1], tile, blend_extent)
682
+ result_row.append(tile[:, :, :t_limit, :, :])
683
+ dec = torch.cat(result_row, dim=-3)
684
+ return dec
685
+
686
+ def encode(self, x: Tensor, return_dict: bool = True):
687
+ """
688
+ Encodes the input by passing through the encoder network.
689
+ Supports slicing and tiling for memory efficiency.
690
+ """
691
+ def _encode(x):
692
+ if self.use_temporal_tiling and x.shape[-3] > self.tile_sample_min_tsize:
693
+ return self.temporal_tiled_encode(x)
694
+ if self.use_spatial_tiling and (
695
+ x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
696
+ return self.spatial_tiled_encode(x)
697
+
698
+ if self.use_compile:
699
+ @torch.compile
700
+ def encoder(x):
701
+ return self.encoder(x)
702
+ return encoder(x)
703
+ return self.encoder(x)
704
+
705
+ if len(x.shape) != 5: # (B, C, T, H, W)
706
+ x = x[:, :, None]
707
+ assert len(x.shape) == 5 # (B, C, T, H, W)
708
+ if x.shape[2] == 1:
709
+ x = x.expand(-1, -1, self.ffactor_temporal, -1, -1)
710
+ else:
711
+ assert x.shape[2] != self.ffactor_temporal and x.shape[2] % self.ffactor_temporal == 0
712
+
713
+ if self.use_slicing and x.shape[0] > 1:
714
+ if self.slicing_bsz == 1:
715
+ encoded_slices = [_encode(x_slice) for x_slice in x.split(1)]
716
+ else:
717
+ sections = [self.slicing_bsz] * (x.shape[0] // self.slicing_bsz)
718
+ if x.shape[0] % self.slicing_bsz != 0:
719
+ sections.append(x.shape[0] % self.slicing_bsz)
720
+ encoded_slices = [_encode(x_slice) for x_slice in x.split(sections)]
721
+ h = torch.cat(encoded_slices)
722
+ else:
723
+ h = _encode(x)
724
+ posterior = DiagonalGaussianDistribution(h)
725
+
726
+ if not return_dict:
727
+ return (posterior,)
728
+
729
+ return AutoencoderKLOutput(latent_dist=posterior)
730
+
731
+ def decode(self, z: Tensor, return_dict: bool = True, generator=None):
732
+ """
733
+ Decodes the input by passing through the decoder network.
734
+ Supports slicing and tiling for memory efficiency.
735
+ """
736
+ def _decode(z):
737
+ if self.use_temporal_tiling and z.shape[-3] > self.tile_latent_min_tsize:
738
+ return self.temporal_tiled_decode(z)
739
+ if self.use_spatial_tiling and (
740
+ z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size):
741
+ return self.spatial_tiled_decode(z)
742
+ return self.decoder(z)
743
+
744
+ if self.use_slicing and z.shape[0] > 1:
745
+ decoded_slices = [_decode(z_slice) for z_slice in z.split(1)]
746
+ decoded = torch.cat(decoded_slices)
747
+ else:
748
+ decoded = _decode(z)
749
+
750
+ if z.shape[-3] == 1:
751
+ decoded = decoded[:, :, -1:]
752
+
753
+ if not return_dict:
754
+ return (decoded,)
755
+
756
+ return DecoderOutput(sample=decoded)
757
+
758
+ def forward(
759
+ self,
760
+ sample: torch.Tensor,
761
+ sample_posterior: bool = False,
762
+ return_posterior: bool = True,
763
+ return_dict: bool = True
764
+ ):
765
+ posterior = self.encode(sample).latent_dist
766
+ z = posterior.sample() if sample_posterior else posterior.mode()
767
+ dec = self.decode(z).sample
768
+ return DecoderOutput(sample=dec, posterior=posterior) if return_dict else (dec, posterior)
769
+
770
+ def random_reset_tiling(self, x: torch.Tensor):
771
+ if x.shape[-3] == 1:
772
+ self.disable_spatial_tiling()
773
+ self.disable_temporal_tiling()
774
+ return
775
+
776
+ # Use fixed shape here
777
+ min_sample_size = int(1 / self.tile_overlap_factor) * self.ffactor_spatial
778
+ min_sample_tsize = int(1 / self.tile_overlap_factor) * self.ffactor_temporal
779
+ sample_size = random.choice([None, 1 * min_sample_size, 2 * min_sample_size, 3 * min_sample_size])
780
+ if sample_size is None:
781
+ self.disable_spatial_tiling()
782
+ else:
783
+ self.tile_sample_min_size = sample_size
784
+ self.tile_latent_min_size = sample_size // self.ffactor_spatial
785
+ self.enable_spatial_tiling()
786
+
787
+ sample_tsize = random.choice([None, 1 * min_sample_tsize, 2 * min_sample_tsize, 3 * min_sample_tsize])
788
+ if sample_tsize is None:
789
+ self.disable_temporal_tiling()
790
+ else:
791
+ self.tile_sample_min_tsize = sample_tsize
792
+ self.tile_latent_min_tsize = sample_tsize // self.ffactor_temporal
793
+ self.enable_temporal_tiling()
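A hedged end-to-end sketch of driving this autoencoder. The import path and every config value below are assumptions chosen only to satisfy the divisibility asserts in Encoder/Decoder; they are not the released HunyuanImage-3.0 settings.

import torch
from autoencoder_kl_3d import AutoencoderKLConv3D   # assumed import path

vae = AutoencoderKLConv3D(
    in_channels=3,
    out_channels=3,
    latent_channels=16,
    block_out_channels=(128, 256, 512, 512, 1024),  # last entry divisible by 2 * latent_channels
    layers_per_block=2,
    ffactor_spatial=16,                              # 16x spatial compression
    ffactor_temporal=4,                              # 4x temporal compression
    sample_size=256,
    sample_tsize=16,
).eval()

x = torch.randn(1, 3, 256, 256)                      # a single image; encode() adds the T axis
with torch.no_grad():
    posterior = vae.encode(x).latent_dist
    z = posterior.mode()                             # (1, 16, 1, 16, 16)
    recon = vae.decode(z).sample                     # (1, 3, 1, 256, 256)

For large inputs, vae.enable_spatial_tiling() and vae.enable_slicing() trade speed for a smaller memory footprint, as implemented in encode()/decode() above.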
configuration_hunyuan.py ADDED
@@ -0,0 +1,285 @@
1
+ # Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ # ==============================================================================
13
+
14
+ from transformers.configuration_utils import PretrainedConfig
15
+ from transformers.utils import logging
16
+ from typing import List, Union
17
+
18
+
19
+ logger = logging.get_logger(__name__)
20
+
21
+
22
+ class HunyuanImage3Config(PretrainedConfig):
23
+ r"""
24
+ This is the configuration class to store the configuration of a [`HunyuanImage3Model`]. It is used to instantiate
25
+ a Hunyuan model according to the specified arguments, defining the model architecture. Instantiating a
26
+ configuration with the defaults will yield a similar configuration to that of the Hunyuan-7B.
27
+
28
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
29
+ documentation from [`PretrainedConfig`] for more information.
30
+
31
+
32
+ Args:
33
+ vocab_size (`int`, *optional*, defaults to 290943):
34
+ Vocabulary size of the Hunyuan Image 3 model. Defines the number of different tokens that can be
35
+ represented by the `input_ids` passed when calling [`HunyuanImage3Model`]
36
+ hidden_size (`int`, *optional*, defaults to 4096):
37
+ Dimension of the hidden representations.
38
+ intermediate_size (`int`, *optional*, defaults to 11008):
39
+ Dimension of the MLP representations or shared MLP representations.
40
+ moe_intermediate_size (`int` or `List`, *optional*, defaults to 11008):
41
+ Dimension of the MLP representations in MoE. Use a list if you want a different size per layer.
42
+ num_hidden_layers (`int`, *optional*, defaults to 32):
43
+ Number of hidden layers in the Transformer decoder.
44
+ num_attention_heads (`int`, *optional*, defaults to 32):
45
+ Number of attention heads for each attention layer in the Transformer decoder.
46
+ num_key_value_heads (`int`, *optional*):
47
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
48
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
49
+ `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
50
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
51
+ by meanpooling all the original heads within that group. For more details checkout [this
52
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
53
+ `num_attention_heads`.
54
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
55
+ The non-linear activation function (function or string) in the decoder.
56
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
57
+ The maximum sequence length that this model might ever be used with.
58
+ initializer_range (`float`, *optional*, defaults to 0.02):
59
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
60
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
61
+ The epsilon used by the rms normalization layers.
62
+ use_cache (`bool`, *optional*, defaults to `True`):
63
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
64
+ relevant if `config.is_decoder=True`.
65
+ pad_token_id (`int`, *optional*):
66
+ Padding token id.
67
+ bos_token_id (`int`, *optional*, defaults to 1):
68
+ Beginning of stream token id.
69
+ eos_token_id (`int`, *optional*, defaults to 2):
70
+ End of stream token id.
71
+ pretraining_tp (`int`, *optional*, defaults to 1):
72
+ Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
73
+ document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
74
+ necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
75
+ issue](https://github.com/pytorch/pytorch/issues/76232).
76
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
77
+ Whether to tie weight embeddings
78
+ rope_theta (`float`, *optional*, defaults to 10000.0):
79
+ The base period of the RoPE embeddings.
80
+ rope_scaling (`Dict`, *optional*):
81
+ Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
82
+ strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
83
+ `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
84
+ `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
85
+ these scaling strategies behave:
86
+ https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
87
+ experimental feature, subject to breaking API changes in future versions.
88
+ attention_bias (`bool`, *optional*, defaults to `False`):
89
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
90
+ attention_dropout (`float`, *optional*, defaults to 0.0):
91
+ The dropout ratio for the attention probabilities.
92
+ use_qk_norm (`bool`, *optional*, defaults to `False`):
93
+ Whether to apply normalization to the query and key projections in attention.
94
+ use_cla (`bool`, *optional*, defaults to `False`):
95
+ Whether to use CLA in attention
96
+ cla_share_factor (`int`, *optional*, defaults to 1):
97
+ The share factor of CLA
98
+ num_experts (`int` or `List`, *optional*, defaults to 1):
99
+ The number of experts for moe. If it is a list, it will be used as the number of experts for each layer.
100
+ num_shared_expert (`int` or `List`, *optional*, defaults to 1):
101
+ The number of shared experts for moe. If it is a list, it will be used as the number of shared experts
102
+ for each layer.
103
+ moe_topk (`int` or `List`, *optional*, defaults to 1):
104
+ The topk value for moe. If it is a list, it will be used as the topk value for each layer.
105
+ capacity_factor (`float` or `List`, *optional*, defaults to 1.0, currently not used):
106
+ The capacity factor for moe. If it is a list, it will be used as the capacity factor for each layer.
107
+ moe_layer_num_skipped (`int`, *optional*, defaults to 0):
108
+ First moe_layer_num_skipped layers do not use MoE.
109
+ """
110
+
111
+ model_type = "Hunyuan"
112
+ keys_to_ignore_at_inference = ["past_key_values"]
113
+
114
+ def __init__(
115
+ self,
116
+ vocab_size=290943,
117
+ hidden_size=4096,
118
+ intermediate_size: int = 11008,
119
+ moe_intermediate_size: Union[int, List] = None,
120
+ num_hidden_layers=32,
121
+ num_attention_heads=32,
122
+ num_key_value_heads=None,
123
+ attention_head_dim=None,
124
+ hidden_act="silu",
125
+ max_position_embeddings=2048,
126
+ initializer_range=0.02,
127
+ rms_norm_eps=1e-5,
128
+ use_cache=True,
129
+ pad_token_id=0,
130
+ bos_token_id=1,
131
+ eos_token_id=2,
132
+ eod_token_id=3,
133
+ im_start_id=4,
134
+ im_end_id=5,
135
+ text_start_id=6,
136
+ text_end_id=7,
137
+ image_token_id=8,
138
+ video_start_id=9,
139
+ video_end_id=10,
140
+ im_newline_id=11,
141
+ mask_init_id=12,
142
+ pretraining_tp=1,
143
+ tie_word_embeddings=False,
144
+ rope_theta=10000.0,
145
+ rope_scaling=None,
146
+ attention_bias=False,
147
+ mlp_bias=False,
148
+ attention_dropout=0.0,
149
+ use_qk_norm=False,
150
+ use_rotary_pos_emb=True,
151
+ use_cla=False,
152
+ cla_share_factor=1,
153
+ norm_type="hf_rms",
154
+ num_experts: Union[int, List] = 1,
155
+ use_mixed_mlp_moe=False,
156
+ num_shared_expert: Union[int, List] = 1,
157
+ moe_topk: Union[int, List] = 1,
158
+ capacity_factor: float = 1.0,
159
+ moe_drop_tokens=False,
160
+ moe_random_routing_dropped_token=False,
161
+ use_mla=False,
162
+ kv_lora_rank=512,
163
+ q_lora_rank=1536,
164
+ qk_rope_head_dim=64,
165
+ v_head_dim=128,
166
+ qk_nope_head_dim=128,
167
+ moe_layer_num_skipped=0,
168
+ norm_topk_prob=True,
169
+ routed_scaling_factor=1.0,
170
+ group_limited_greedy=False,
171
+ n_group=None,
172
+ topk_group=None,
173
+ add_classification_head=False,
174
+ class_num=0,
175
+ pool_type="last",
176
+ pad_id=-1,
177
+ # Added
178
+ moe_impl="eager",
179
+ vae_downsample_factor=(16, 16), # (h, w)
180
+ img_proj_type="unet",
181
+ patch_size=1,
182
+ patch_embed_hidden_dim=1024,
183
+ image_base_size=1024,
184
+ vae=None,
185
+ vit=None,
186
+ vit_processor=None,
187
+ vit_aligner=None,
188
+ **kwargs,
189
+ ):
190
+ self.vocab_size = vocab_size
191
+ self.max_position_embeddings = max_position_embeddings
192
+ self.hidden_size = hidden_size
193
+ self.intermediate_size = intermediate_size
194
+ self.moe_intermediate_size = moe_intermediate_size
195
+ self.num_hidden_layers = num_hidden_layers
196
+ self.num_attention_heads = num_attention_heads
197
+ self.moe_impl = moe_impl
198
+ self.num_experts = num_experts
199
+ self.use_mixed_mlp_moe = use_mixed_mlp_moe
200
+ self.num_shared_expert = num_shared_expert
201
+ self.moe_topk = moe_topk
202
+ self.capacity_factor = capacity_factor
203
+ self.moe_drop_tokens = moe_drop_tokens
204
+ self.moe_random_routing_dropped_token = moe_random_routing_dropped_token
205
+
206
+ if attention_head_dim is not None:
207
+ self.attention_head_dim = attention_head_dim
208
+ else:
209
+ self.attention_head_dim = self.hidden_size // num_attention_heads
210
+
211
+ # for backward compatibility
212
+ if num_key_value_heads is None:
213
+ num_key_value_heads = num_attention_heads
214
+
215
+ self.num_key_value_heads = num_key_value_heads
216
+ self.hidden_act = hidden_act
217
+ self.initializer_range = initializer_range
218
+ self.rms_norm_eps = rms_norm_eps
219
+ self.pretraining_tp = pretraining_tp
220
+ self.use_cache = use_cache
221
+ self.rope_theta = rope_theta
222
+ self.rope_scaling = rope_scaling
223
+ self.attention_bias = attention_bias
224
+ self.mlp_bias = mlp_bias
225
+ self.attention_dropout = attention_dropout
226
+ self.use_qk_norm = use_qk_norm
227
+ self.use_rotary_pos_emb = use_rotary_pos_emb
228
+ self.use_cla = use_cla
229
+ self.cla_share_factor = cla_share_factor
230
+ self.norm_type = norm_type
231
+ # MLA args
232
+ self.use_mla = use_mla
233
+ self.kv_lora_rank = kv_lora_rank
234
+ self.q_lora_rank = q_lora_rank
235
+ self.qk_rope_head_dim = qk_rope_head_dim
236
+ self.qk_nope_head_dim = qk_nope_head_dim
237
+ self.v_head_dim = v_head_dim
238
+
239
+ # DeepSeek related args
240
+ self.moe_layer_num_skipped = moe_layer_num_skipped
241
+ self.norm_topk_prob = norm_topk_prob
242
+ self.routed_scaling_factor = routed_scaling_factor
243
+ self.group_limited_greedy = group_limited_greedy
244
+ self.n_group = n_group
245
+ self.topk_group = topk_group
246
+ self.add_classification_head = add_classification_head
247
+ self.class_num = class_num
248
+ self.pool_type = pool_type
249
+ self.pad_id = pad_id
250
+
251
+ if self.class_num is not None:
252
+ self.dense_list = [self.hidden_size, self.class_num]
253
+
254
+ # ViT args
255
+ self.vit = vit
256
+ self.vit_processor = vit_processor
257
+ self.vit_aligner = vit_aligner
258
+
259
+ # Image Gen args
260
+ self.vae = vae
261
+ self.vae_downsample_factor = vae_downsample_factor
262
+ self.img_proj_type = img_proj_type
263
+ self.patch_size = patch_size
264
+ self.patch_embed_hidden_dim = patch_embed_hidden_dim
265
+ self.image_base_size = image_base_size
266
+
267
+ # token id
268
+ self.eod_token_id = eod_token_id
269
+ self.im_start_id = im_start_id
270
+ self.im_end_id = im_end_id
271
+ self.text_start_id = text_start_id
272
+ self.text_end_id = text_end_id
273
+ self.image_token_id = image_token_id
274
+ self.video_start_id = video_start_id
275
+ self.video_end_id = video_end_id
276
+ self.im_newline_id = im_newline_id
277
+ self.mask_init_id = mask_init_id
278
+
279
+ super().__init__(
280
+ pad_token_id=pad_token_id,
281
+ bos_token_id=bos_token_id,
282
+ eos_token_id=eos_token_id,
283
+ tie_word_embeddings=tie_word_embeddings,
284
+ **kwargs,
285
+ )
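A short illustrative sketch of constructing the config; the override values are assumptions for demonstration, not the shipped HunyuanImage-3.0 settings.

from configuration_hunyuan import HunyuanImage3Config   # assumed import path

config = HunyuanImage3Config(
    hidden_size=4096,
    num_hidden_layers=32,
    num_attention_heads=32,
    num_key_value_heads=8,     # grouped-query attention with 8 KV heads
    num_experts=8,             # MoE layers with 8 routed experts
    moe_topk=2,                # each token is routed to its top-2 experts
    use_qk_norm=True,
)
print(config.attention_head_dim)   # derived as hidden_size // num_attention_heads = 128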
hunyuan.py ADDED
The diff for this file is too large to render. See raw diff
 
hunyuan_image_3_pipeline.py ADDED
@@ -0,0 +1,879 @@
1
+ # Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ # ==============================================================================
13
+ #
14
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
15
+ #
16
+ # Licensed under the Apache License, Version 2.0 (the "License");
17
+ # you may not use this file except in compliance with the License.
18
+ # You may obtain a copy of the License at
19
+ #
20
+ # http://www.apache.org/licenses/LICENSE-2.0
21
+ #
22
+ # Unless required by applicable law or agreed to in writing, software
23
+ # distributed under the License is distributed on an "AS IS" BASIS,
24
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25
+ # See the License for the specific language governing permissions and
26
+ # limitations under the License.
27
+ # ==============================================================================================
28
+
29
+ import inspect
30
+ import math
31
+ from dataclasses import dataclass
32
+ from typing import Any, Callable, Dict, List
33
+ from typing import Optional, Tuple, Union
34
+
35
+ import numpy as np
36
+ import torch
37
+ from PIL import Image
38
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
39
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
40
+ from diffusers.image_processor import VaeImageProcessor
41
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
42
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin
43
+ from diffusers.utils import BaseOutput, logging
44
+ from diffusers.utils.torch_utils import randn_tensor
45
+
46
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
47
+
48
+
49
+ def retrieve_timesteps(
50
+ scheduler,
51
+ num_inference_steps: Optional[int] = None,
52
+ device: Optional[Union[str, torch.device]] = None,
53
+ timesteps: Optional[List[int]] = None,
54
+ sigmas: Optional[List[float]] = None,
55
+ **kwargs,
56
+ ):
57
+ """
58
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
59
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
60
+
61
+ Args:
62
+ scheduler (`SchedulerMixin`):
63
+ The scheduler to get timesteps from.
64
+ num_inference_steps (`int`):
65
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
66
+ must be `None`.
67
+ device (`str` or `torch.device`, *optional*):
68
+ The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
69
+ timesteps (`List[int]`, *optional*):
70
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
71
+ `num_inference_steps` and `sigmas` must be `None`.
72
+ sigmas (`List[float]`, *optional*):
73
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
74
+ `num_inference_steps` and `timesteps` must be `None`.
75
+
76
+ Returns:
77
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
78
+ second element is the number of inference steps.
79
+ """
80
+ if timesteps is not None and sigmas is not None:
81
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
82
+ if timesteps is not None:
83
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
84
+ if not accepts_timesteps:
85
+ raise ValueError(
86
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
87
+ f" timestep schedules. Please check whether you are using the correct scheduler."
88
+ )
89
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
90
+ timesteps = scheduler.timesteps
91
+ num_inference_steps = len(timesteps)
92
+ elif sigmas is not None:
93
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
94
+ if not accept_sigmas:
95
+ raise ValueError(
96
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
97
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
98
+ )
99
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
100
+ timesteps = scheduler.timesteps
101
+ num_inference_steps = len(timesteps)
102
+ else:
103
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
104
+ timesteps = scheduler.timesteps
105
+ return timesteps, num_inference_steps
106
+
107
+
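A small usage sketch (assumed, not taken from the repository) pairing this helper with the FlowMatchDiscreteScheduler defined further down in this file:

scheduler = FlowMatchDiscreteScheduler(num_train_timesteps=1000, shift=1.0)
timesteps, num_steps = retrieve_timesteps(scheduler, num_inference_steps=30)
# set_timesteps(30) is called internally; timesteps holds 30 values and num_steps == 30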
108
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
109
+ r"""
110
+ Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
111
+ Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
112
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf).
113
+
114
+ Args:
115
+ noise_cfg (`torch.Tensor`):
116
+ The predicted noise tensor for the guided diffusion process.
117
+ noise_pred_text (`torch.Tensor`):
118
+ The predicted noise tensor for the text-guided diffusion process.
119
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
120
+ A rescale factor applied to the noise predictions.
121
+ Returns:
122
+ noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
123
+ """
124
+ std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
125
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
126
+ # rescale the results from guidance (fixes overexposure)
127
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
128
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
129
+ noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
130
+ return noise_cfg
131
+
132
+
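A sketch of where the rescaling above typically sits in a classifier-free-guidance step; the tensors and scale values are placeholders, not the pipeline's actual shapes.

guidance_scale, guidance_rescale = 7.5, 0.7
noise_pred_uncond = torch.randn(2, 16, 64, 64)
noise_pred_text = torch.randn(2, 16, 64, 64)
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)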
133
+ @dataclass
134
+ class HunyuanImage3Text2ImagePipelineOutput(BaseOutput):
135
+ samples: Union[List[Any], np.ndarray]
136
+
137
+
138
+ @dataclass
139
+ class FlowMatchDiscreteSchedulerOutput(BaseOutput):
140
+ """
141
+ Output class for the scheduler's `step` function output.
142
+
143
+ Args:
144
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
145
+ Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
146
+ denoising loop.
147
+ """
148
+
149
+ prev_sample: torch.FloatTensor
150
+
151
+
152
+ class FlowMatchDiscreteScheduler(SchedulerMixin, ConfigMixin):
153
+ """
154
+ Euler scheduler.
155
+
156
+ This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
157
+ methods the library implements for all schedulers such as loading and saving.
158
+
159
+ Args:
160
+ num_train_timesteps (`int`, defaults to 1000):
161
+ The number of diffusion steps to train the model.
162
+ timestep_spacing (`str`, defaults to `"linspace"`):
163
+ The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
164
+ Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
165
+ shift (`float`, defaults to 1.0):
166
+ The shift value for the timestep schedule.
167
+ reverse (`bool`, defaults to `True`):
168
+ Whether to reverse the timestep schedule.
169
+ """
170
+
171
+ _compatibles = []
172
+ order = 1
173
+
174
+ @register_to_config
175
+ def __init__(
176
+ self,
177
+ num_train_timesteps: int = 1000,
178
+ shift: float = 1.0,
179
+ reverse: bool = True,
180
+ solver: str = "euler",
181
+ use_flux_shift: bool = False,
182
+ flux_base_shift: float = 0.5,
183
+ flux_max_shift: float = 1.15,
184
+ n_tokens: Optional[int] = None,
185
+ ):
186
+ sigmas = torch.linspace(1, 0, num_train_timesteps + 1)
187
+
188
+ if not reverse:
189
+ sigmas = sigmas.flip(0)
190
+
191
+ self.sigmas = sigmas
192
+ # the value fed to model
193
+ self.timesteps = (sigmas[:-1] * num_train_timesteps).to(dtype=torch.float32)
194
+ self.timesteps_full = (sigmas * num_train_timesteps).to(dtype=torch.float32)
195
+
196
+ self._step_index = None
197
+ self._begin_index = None
198
+
199
+ self.supported_solver = [
200
+ "euler",
201
+ "heun-2", "midpoint-2",
202
+ "kutta-4",
203
+ ]
204
+ if solver not in self.supported_solver:
205
+ raise ValueError(f"Solver {solver} not supported. Supported solvers: {self.supported_solver}")
206
+
207
+ # empty dt and derivative (for heun)
208
+ self.derivative_1 = None
209
+ self.derivative_2 = None
210
+ self.derivative_3 = None
211
+ self.dt = None
212
+
213
+ @property
214
+ def step_index(self):
215
+ """
216
+ The index counter for the current timestep. It increases by 1 after each scheduler step.
217
+ """
218
+ return self._step_index
219
+
220
+ @property
221
+ def begin_index(self):
222
+ """
223
+ The index for the first timestep. It should be set from the pipeline with the `set_begin_index` method.
224
+ """
225
+ return self._begin_index
226
+
227
+ # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
228
+ def set_begin_index(self, begin_index: int = 0):
229
+ """
230
+ Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
231
+
232
+ Args:
233
+ begin_index (`int`):
234
+ The begin index for the scheduler.
235
+ """
236
+ self._begin_index = begin_index
237
+
238
+ def _sigma_to_t(self, sigma):
239
+ return sigma * self.config.num_train_timesteps
240
+
241
+ @property
242
+ def state_in_first_order(self):
243
+ return self.derivative_1 is None
244
+
245
+ @property
246
+ def state_in_second_order(self):
247
+ return self.derivative_2 is None
248
+
249
+ @property
250
+ def state_in_third_order(self):
251
+ return self.derivative_3 is None
252
+
253
+ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None,
254
+ n_tokens: int = None):
255
+ """
256
+ Sets the discrete timesteps used for the diffusion chain (to be run before inference).
257
+
258
+ Args:
259
+ num_inference_steps (`int`):
260
+ The number of diffusion steps used when generating samples with a pre-trained model.
261
+ device (`str` or `torch.device`, *optional*):
262
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
263
+ n_tokens (`int`, *optional*):
264
+ Number of tokens in the input sequence.
265
+ """
266
+ self.num_inference_steps = num_inference_steps
267
+
268
+ sigmas = torch.linspace(1, 0, num_inference_steps + 1)
269
+
270
+ # Apply timestep shift
271
+ if self.config.use_flux_shift:
272
+ assert isinstance(n_tokens, int), "n_tokens should be provided for flux shift"
273
+ mu = self.get_lin_function(y1=self.config.flux_base_shift, y2=self.config.flux_max_shift)(n_tokens)
274
+ sigmas = self.flux_time_shift(mu, 1.0, sigmas)
275
+ elif self.config.shift != 1.:
276
+ sigmas = self.sd3_time_shift(sigmas)
277
+
278
+ if not self.config.reverse:
279
+ sigmas = 1 - sigmas
280
+
281
+ self.sigmas = sigmas
282
+ self.timesteps = (sigmas[:-1] * self.config.num_train_timesteps).to(dtype=torch.float32, device=device)
283
+ self.timesteps_full = (sigmas * self.config.num_train_timesteps).to(dtype=torch.float32, device=device)
284
+
285
+ # empty dt and derivative (for kutta)
286
+ self.derivative_1 = None
287
+ self.derivative_2 = None
288
+ self.derivative_3 = None
289
+ self.dt = None
290
+
291
+ # Reset step index
292
+ self._step_index = None
293
+
294
+ def index_for_timestep(self, timestep, schedule_timesteps=None):
295
+ if schedule_timesteps is None:
296
+ schedule_timesteps = self.timesteps
297
+
298
+ indices = (schedule_timesteps == timestep).nonzero()
299
+
300
+ # The sigma index that is taken for the **very** first `step`
301
+ # is always the second index (or the last index if there is only 1)
302
+ # This way we can ensure we don't accidentally skip a sigma in
303
+ # case we start in the middle of the denoising schedule (e.g. for image-to-image)
304
+ pos = 1 if len(indices) > 1 else 0
305
+
306
+ return indices[pos].item()
307
+
308
+ def _init_step_index(self, timestep):
309
+ if self.begin_index is None:
310
+ if isinstance(timestep, torch.Tensor):
311
+ timestep = timestep.to(self.timesteps.device)
312
+ self._step_index = self.index_for_timestep(timestep)
313
+ else:
314
+ self._step_index = self._begin_index
315
+
316
+ def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
317
+ return sample
318
+
319
+ @staticmethod
320
+ def get_lin_function(x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15):
321
+ m = (y2 - y1) / (x2 - x1)
322
+ b = y1 - m * x1
323
+ return lambda x: m * x + b
324
+
325
+ @staticmethod
326
+ def flux_time_shift(mu: float, sigma: float, t: torch.Tensor):
327
+ return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
328
+
329
+ def sd3_time_shift(self, t: torch.Tensor):
330
+ return (self.config.shift * t) / (1 + (self.config.shift - 1) * t)
331
+
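+ # Illustrative note: with shift=3.0, the SD3-style shift maps a mid-schedule value t=0.5 to
+ # 3*0.5 / (1 + 2*0.5) = 0.75, spending more of the schedule at high noise levels; the flux
+ # shift achieves the same effect through an exponential mu derived from the token count.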
332
+ def step(
333
+ self,
334
+ model_output: torch.FloatTensor,
335
+ timestep: Union[float, torch.FloatTensor],
336
+ sample: torch.FloatTensor,
337
+ pred_uncond: torch.FloatTensor = None,
338
+ generator: Optional[torch.Generator] = None,
339
+ n_tokens: Optional[int] = None,
340
+ return_dict: bool = True,
341
+ ) -> Union[FlowMatchDiscreteSchedulerOutput, Tuple]:
342
+ """
343
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
344
+ process from the learned model outputs (most often the predicted noise).
345
+
346
+ Args:
347
+ model_output (`torch.FloatTensor`):
348
+ The direct output from learned diffusion model.
349
+ timestep (`float`):
350
+ The current discrete timestep in the diffusion chain.
351
+ sample (`torch.FloatTensor`):
352
+ A current instance of a sample created by the diffusion process.
353
+ generator (`torch.Generator`, *optional*):
354
+ A random number generator.
355
+ n_tokens (`int`, *optional*):
356
+ Number of tokens in the input sequence.
357
+ return_dict (`bool`):
358
+ Whether or not to return a [`FlowMatchDiscreteSchedulerOutput`] or
359
+ tuple.
360
+
361
+ Returns:
362
+ [`FlowMatchDiscreteSchedulerOutput`] or `tuple`:
363
+ If return_dict is `True`, [`FlowMatchDiscreteSchedulerOutput`] is
364
+ returned, otherwise a tuple is returned where the first element is the sample tensor.
365
+ """
366
+
367
+ if (
368
+ isinstance(timestep, int)
369
+ or isinstance(timestep, torch.IntTensor)
370
+ or isinstance(timestep, torch.LongTensor)
371
+ ):
372
+ raise ValueError(
373
+ (
374
+ "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
375
+ " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
376
+ " one of the `scheduler.timesteps` as a timestep."
377
+ ),
378
+ )
379
+
380
+ if self.step_index is None:
381
+ self._init_step_index(timestep)
382
+
383
+ # Upcast to avoid precision issues when computing prev_sample
384
+ sample = sample.to(torch.float32)
385
+ model_output = model_output.to(torch.float32)
386
+ pred_uncond = pred_uncond.to(torch.float32) if pred_uncond is not None else None
387
+
388
+ # dt = self.sigmas[self.step_index + 1] - self.sigmas[self.step_index]
389
+ sigma = self.sigmas[self.step_index]
390
+ sigma_next = self.sigmas[self.step_index + 1]
391
+
392
+ last_inner_step = True
393
+ if self.config.solver == "euler":
394
+ derivative, dt, sample, last_inner_step = self.first_order_method(model_output, sigma, sigma_next, sample)
395
+ elif self.config.solver in ["heun-2", "midpoint-2"]:
396
+ derivative, dt, sample, last_inner_step = self.second_order_method(model_output, sigma, sigma_next, sample)
397
+ elif self.config.solver == "kutta-4":
398
+ derivative, dt, sample, last_inner_step = self.fourth_order_method(model_output, sigma, sigma_next, sample)
399
+ else:
400
+ raise ValueError(f"Solver {self.config.solver} not supported. Supported solvers: {self.supported_solver}")
401
+
402
+ prev_sample = sample + derivative * dt
403
+
404
+ # Cast sample back to model compatible dtype
405
+ # prev_sample = prev_sample.to(model_output.dtype)
406
+
407
+ # upon completion increase step index by one
408
+ if last_inner_step:
409
+ self._step_index += 1
410
+
411
+ if not return_dict:
412
+ return (prev_sample,)
413
+
414
+ return FlowMatchDiscreteSchedulerOutput(prev_sample=prev_sample)
415
+
416
+ def first_order_method(self, model_output, sigma, sigma_next, sample):
417
+ derivative = model_output
418
+ dt = sigma_next - sigma
419
+ return derivative, dt, sample, True
420
+
421
+ def second_order_method(self, model_output, sigma, sigma_next, sample):
422
+ if self.state_in_first_order:
423
+ # store for 2nd order step
424
+ self.derivative_1 = model_output
425
+ self.dt = sigma_next - sigma
426
+ self.sample = sample
427
+
428
+ derivative = model_output
429
+ if self.config.solver == 'heun-2':
430
+ dt = self.dt
431
+ elif self.config.solver == 'midpoint-2':
432
+ dt = self.dt / 2
433
+ else:
434
+ raise NotImplementedError(f"Solver {self.config.solver} not supported.")
435
+ last_inner_step = False
436
+
437
+ else:
438
+ if self.config.solver == 'heun-2':
439
+ derivative = 0.5 * (self.derivative_1 + model_output)
440
+ elif self.config.solver == 'midpoint-2':
441
+ derivative = model_output
442
+ else:
443
+ raise NotImplementedError(f"Solver {self.config.solver} not supported.")
444
+
445
+ # 3. take prev timestep & sample
446
+ dt = self.dt
447
+ sample = self.sample
448
+ last_inner_step = True
449
+
450
+ # free dt and derivative
451
+ # Note, this puts the scheduler in "first order mode"
452
+ self.derivative_1 = None
453
+ self.dt = None
454
+ self.sample = None
455
+
456
+ return derivative, dt, sample, last_inner_step
457
+
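+ # Note: heun-2 takes a full Euler step, queries the model again at the endpoint, and averages
+ # the two derivatives; midpoint-2 takes a half step and uses the midpoint derivative for the
+ # full update. Both need two model calls per scheduler step, which is why last_inner_step is
+ # only True on the second call.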
458
+ def fourth_order_method(self, model_output, sigma, sigma_next, sample):
459
+ if self.state_in_first_order:
460
+ self.derivative_1 = model_output
461
+ self.dt = sigma_next - sigma
462
+ self.sample = sample
463
+ derivative = model_output
464
+ dt = self.dt / 2
465
+ last_inner_step = False
466
+
467
+ elif self.state_in_second_order:
468
+ self.derivative_2 = model_output
469
+ derivative = model_output
470
+ dt = self.dt / 2
471
+ last_inner_step = False
472
+
473
+ elif self.state_in_third_order:
474
+ self.derivative_3 = model_output
475
+ derivative = model_output
476
+ dt = self.dt
477
+ last_inner_step = False
478
+
479
+ else:
480
+ derivative = (1/6 * self.derivative_1 + 1/3 * self.derivative_2 + 1/3 * self.derivative_3 +
481
+ 1/6 * model_output)
482
+
483
+ # 3. take prev timestep & sample
484
+ dt = self.dt
485
+ sample = self.sample
486
+ last_inner_step = True
487
+
488
+ # free dt and derivative
489
+ # Note, this puts the scheduler in "first order mode"
490
+ self.derivative_1 = None
491
+ self.derivative_2 = None
492
+ self.derivative_3 = None
493
+ self.dt = None
494
+ self.sample = None
495
+
496
+ return derivative, dt, sample, last_inner_step
497
+
498
+ def __len__(self):
499
+ return self.config.num_train_timesteps
500
+
501
+
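+ # A minimal usage sketch of the scheduler above (illustrative only; `model_fn` and the latent
+ # shape are placeholders, and the real wiring lives in the pipeline below):
+ #
+ #     scheduler = FlowMatchDiscreteScheduler(shift=3.0, solver="euler")
+ #     scheduler.set_timesteps(num_inference_steps=50, device="cuda")
+ #     latents = torch.randn(1, 16, 64, 64, device="cuda")
+ #     for t in scheduler.timesteps:
+ #         pred = model_fn(latents, t)  # model's flow/velocity prediction
+ #         latents = scheduler.step(pred, t, latents).prev_sample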
502
+ class ClassifierFreeGuidance:
503
+ def __init__(
504
+ self,
505
+ use_original_formulation: bool = False,
506
+ start: float = 0.0,
507
+ stop: float = 1.0,
508
+ ):
509
+ super().__init__()
510
+ self.use_original_formulation = use_original_formulation
511
+
512
+ def __call__(
513
+ self,
514
+ pred_cond: torch.Tensor,
515
+ pred_uncond: Optional[torch.Tensor],
516
+ guidance_scale: float,
517
+ step: int,
518
+ ) -> torch.Tensor:
519
+
520
+ shift = pred_cond - pred_uncond
521
+ pred = pred_cond if self.use_original_formulation else pred_uncond
522
+ pred = pred + guidance_scale * shift
523
+
524
+ return pred
525
+
526
+
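+ # The guidance above, written out for guidance scale w:
+ #     pred = pred_uncond + w * (pred_cond - pred_uncond)   # default formulation
+ #     pred = pred_cond   + w * (pred_cond - pred_uncond)   # use_original_formulation=True
+ # so w = 1 reduces to the conditional prediction in the default case.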
527
+ class HunyuanImage3Text2ImagePipeline(DiffusionPipeline):
528
+ r"""
529
+ Pipeline for text-to-image generation with HunyuanImage-3.
530
+
531
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
532
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
533
+
534
+ Args:
535
+ model ([`ModelMixin`]):
536
+ A model to denoise the diffused latents.
537
+ scheduler ([`SchedulerMixin`]):
538
+ A scheduler to be used in combination with `model` to denoise the diffused latents. Typically a
539
+ [`FlowMatchDiscreteScheduler`].
540
+ """
541
+
542
+ model_cpu_offload_seq = ""
543
+ _optional_components = []
544
+ _exclude_from_cpu_offload = []
545
+ _callback_tensor_inputs = ["latents"]
546
+
547
+ def __init__(
548
+ self,
549
+ model,
550
+ scheduler: SchedulerMixin,
551
+ vae,
552
+ progress_bar_config: Dict[str, Any] = None,
553
+ ):
554
+ super().__init__()
555
+
556
+ # ==========================================================================================
557
+ if progress_bar_config is None:
558
+ progress_bar_config = {}
559
+ if not hasattr(self, '_progress_bar_config'):
560
+ self._progress_bar_config = {}
561
+ self._progress_bar_config.update(progress_bar_config)
562
+ # ==========================================================================================
563
+
564
+ self.register_modules(
565
+ model=model,
566
+ scheduler=scheduler,
567
+ vae=vae,
568
+ )
569
+
570
+ # should be a tuple or a list corresponding to the size of latents (batch_size, channel, *size)
571
+ # if None, will be treated as a tuple of 1
572
+ self.latent_scale_factor = self.model.config.vae_downsample_factor
573
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.latent_scale_factor)
574
+
575
+ # Must start with APG_mode_
576
+ self.cfg_operator = ClassifierFreeGuidance()
577
+
578
+ @staticmethod
579
+ def denormalize(images: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
580
+ """
581
+ Denormalize an image array to [0,1].
582
+ """
583
+ return (images / 2 + 0.5).clamp(0, 1)
584
+
585
+ @staticmethod
586
+ def pt_to_numpy(images: torch.Tensor) -> np.ndarray:
587
+ """
588
+ Convert a PyTorch tensor to a NumPy image.
589
+ """
590
+ images = images.cpu().permute(0, 2, 3, 1).float().numpy()
591
+ return images
592
+
593
+ @staticmethod
594
+ def numpy_to_pil(images: np.ndarray):
595
+ """
596
+ Convert a numpy image or a batch of images to a PIL image.
597
+ """
598
+ if images.ndim == 3:
599
+ images = images[None, ...]
600
+ images = (images * 255).round().astype("uint8")
601
+ if images.shape[-1] == 1:
602
+ # special case for grayscale (single channel) images
603
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
604
+ else:
605
+ pil_images = [Image.fromarray(image) for image in images]
606
+
607
+ return pil_images
608
+
609
+ def prepare_extra_func_kwargs(self, func, kwargs):
610
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
611
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
612
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
613
+ # and should be between [0, 1]
614
+ extra_kwargs = {}
615
+
616
+ for k, v in kwargs.items():
617
+ accepts = k in set(inspect.signature(func).parameters.keys())
618
+ if accepts:
619
+ extra_kwargs[k] = v
620
+ return extra_kwargs
621
+
622
+ def prepare_latents(self, batch_size, latent_channel, image_size, dtype, device, generator, latents=None):
623
+ if self.latent_scale_factor is None:
624
+ latent_scale_factor = (1,) * len(image_size)
625
+ elif isinstance(self.latent_scale_factor, int):
626
+ latent_scale_factor = (self.latent_scale_factor,) * len(image_size)
627
+ elif isinstance(self.latent_scale_factor, tuple) or isinstance(self.latent_scale_factor, list):
628
+ assert len(self.latent_scale_factor) == len(image_size), \
629
+ "len(latent_scale_factor) shoudl be the same as len(image_size)"
630
+ latent_scale_factor = self.latent_scale_factor
631
+ else:
632
+ raise ValueError(
633
+ f"latent_scale_factor should be either None, int, tuple of int, or list of int, "
634
+ f"but got {self.latent_scale_factor}"
635
+ )
636
+
637
+ latents_shape = (
638
+ batch_size,
639
+ latent_channel,
640
+ *[int(s) // f for s, f in zip(image_size, latent_scale_factor)],
641
+ )
642
+ if isinstance(generator, list) and len(generator) != batch_size:
643
+ raise ValueError(
644
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
645
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
646
+ )
647
+
648
+ if latents is None:
649
+ latents = randn_tensor(latents_shape, generator=generator, device=device, dtype=dtype)
650
+ else:
651
+ latents = latents.to(device)
652
+
653
+ # Check existence to make it compatible with FlowMatchEulerDiscreteScheduler
654
+ if hasattr(self.scheduler, "init_noise_sigma"):
655
+ # scale the initial noise by the standard deviation required by the scheduler
656
+ latents = latents * self.scheduler.init_noise_sigma
657
+
658
+ return latents
659
+
660
+ @property
661
+ def guidance_scale(self):
662
+ return self._guidance_scale
663
+
664
+ @property
665
+ def guidance_rescale(self):
666
+ return self._guidance_rescale
667
+
668
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
669
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
670
+ # corresponds to doing no classifier free guidance.
671
+ @property
672
+ def do_classifier_free_guidance(self):
673
+ return self._guidance_scale > 1.0
674
+
675
+ @property
676
+ def num_timesteps(self):
677
+ return self._num_timesteps
678
+
679
+ def set_scheduler(self, new_scheduler):
680
+ self.register_modules(scheduler=new_scheduler)
681
+
682
+ @torch.no_grad()
683
+ def __call__(
684
+ self,
685
+ batch_size: int,
686
+ image_size: List[int],
687
+ num_inference_steps: int = 50,
688
+ timesteps: List[int] = None,
689
+ sigmas: List[float] = None,
690
+ guidance_scale: float = 7.5,
691
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
692
+ latents: Optional[torch.Tensor] = None,
693
+ output_type: Optional[str] = "pil",
694
+ return_dict: bool = True,
695
+ guidance_rescale: float = 0.0,
696
+ callback_on_step_end: Optional[
697
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
698
+ ] = None,
699
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
700
+ model_kwargs: Dict[str, Any] = None,
701
+ **kwargs,
702
+ ):
703
+ r"""
704
+ The call function to the pipeline for generation.
705
+
706
+ Args:
707
+ batch_size (`int`):
708
+ The number of samples to generate.
709
+ image_size (`Tuple[int]` or `List[int]`):
710
+ The size (height, width) of the generated image.
711
+ num_inference_steps (`int`, *optional*, defaults to 50):
712
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
713
+ expense of slower inference.
714
+ timesteps (`List[int]`, *optional*):
715
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
716
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
717
+ passed will be used. Must be in descending order.
718
+ sigmas (`List[float]`, *optional*):
719
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
720
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
721
+ will be used.
722
+ guidance_scale (`float`, *optional*, defaults to 7.5):
723
+ A higher guidance scale value encourages the model to generate samples closely linked to the
724
+ `condition` at the expense of lower sample quality. Guidance scale is enabled when `guidance_scale > 1`.
725
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
726
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
727
+ generation deterministic.
728
+ latents (`torch.Tensor`, *optional*):
729
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for sample
730
+ generation. Can be used to tweak the same generation with different conditions. If not provided,
731
+ a latents tensor is generated by sampling using the supplied random `generator`.
732
+ output_type (`str`, *optional*, defaults to `"pil"`):
733
+ The output format of the generated sample.
734
+ return_dict (`bool`, *optional*, defaults to `True`):
735
+ Whether or not to return a [`~DiffusionPipelineOutput`] instead of a
736
+ plain tuple.
737
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
738
+ Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
739
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
740
+ using zero terminal SNR.
741
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
742
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
743
+ each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
744
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
745
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
746
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
747
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
748
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
749
+ `._callback_tensor_inputs` attribute of your pipeline class.
750
+
751
+ Examples:
752
+
753
+ Returns:
754
+ [`~DiffusionPipelineOutput`] or `tuple`:
755
+ If `return_dict` is `True`, [`~DiffusionPipelineOutput`] is returned,
756
+ otherwise a `tuple` is returned where the first element is a list with the generated samples.
757
+ """
758
+
759
+ callback_steps = kwargs.pop("callback_steps", None)
760
+ pbar_steps = kwargs.pop("pbar_steps", None)
761
+
762
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
763
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
764
+
765
+ self._guidance_scale = guidance_scale
766
+ self._guidance_rescale = guidance_rescale
767
+
768
+ cfg_factor = 1 + self.do_classifier_free_guidance
769
+
770
+ # Define call parameters
771
+ device = self._execution_device
772
+
773
+ # Prepare timesteps
774
+ timesteps, num_inference_steps = retrieve_timesteps(
775
+ self.scheduler, num_inference_steps, device, timesteps, sigmas,
776
+ )
777
+
778
+ # Prepare latent variables
779
+ latents = self.prepare_latents(
780
+ batch_size=batch_size,
781
+ latent_channel=self.model.config.vae["latent_channels"],
782
+ image_size=image_size,
783
+ dtype=torch.bfloat16,
784
+ device=device,
785
+ generator=generator,
786
+ latents=latents,
787
+ )
788
+
789
+ # Prepare extra step kwargs.
790
+ _scheduler_step_extra_kwargs = self.prepare_extra_func_kwargs(
791
+ self.scheduler.step, {"generator": generator}
792
+ )
793
+
794
+ # Prepare model kwargs
795
+ input_ids = model_kwargs.pop("input_ids")
796
+ attention_mask = self.model._prepare_attention_mask_for_generation( # noqa
797
+ input_ids, self.model.generation_config, model_kwargs=model_kwargs,
798
+ )
799
+ model_kwargs["attention_mask"] = attention_mask.to(latents.device)
800
+
801
+ # Sampling loop
802
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
803
+ self._num_timesteps = len(timesteps)
804
+
805
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
806
+ for i, t in enumerate(timesteps):
807
+ # expand the latents if we are doing classifier free guidance
808
+ latent_model_input = torch.cat([latents] * cfg_factor)
809
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
810
+
811
+ t_expand = t.repeat(latent_model_input.shape[0])
812
+
813
+ model_inputs = self.model.prepare_inputs_for_generation(
814
+ input_ids,
815
+ images=latent_model_input,
816
+ timestep=t_expand,
817
+ **model_kwargs,
818
+ )
819
+
820
+ with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True):
821
+ model_output = self.model(**model_inputs, first_step=(i == 0))
822
+ pred = model_output["diffusion_prediction"]
823
+ pred = pred.to(dtype=torch.float32)
824
+
825
+ # perform guidance
826
+ if self.do_classifier_free_guidance:
827
+ pred_cond, pred_uncond = pred.chunk(2)
828
+ pred = self.cfg_operator(pred_cond, pred_uncond, self.guidance_scale, step=i)
829
+
830
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
831
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
832
+ pred = rescale_noise_cfg(pred, pred_cond, guidance_rescale=self.guidance_rescale)
833
+
834
+ # compute the previous noisy sample x_t -> x_t-1
835
+ latents = self.scheduler.step(pred, t, latents, **_scheduler_step_extra_kwargs, return_dict=False)[0]
836
+
837
+ if i != len(timesteps) - 1:
838
+ model_kwargs = self.model._update_model_kwargs_for_generation( # noqa
839
+ model_output,
840
+ model_kwargs,
841
+ )
842
+ if input_ids.shape[1] != model_kwargs["position_ids"].shape[1]:
843
+ input_ids = torch.gather(input_ids, 1, index=model_kwargs["position_ids"])
844
+
845
+ if callback_on_step_end is not None:
846
+ callback_kwargs = {}
847
+ for k in callback_on_step_end_tensor_inputs:
848
+ callback_kwargs[k] = locals()[k]
849
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
850
+
851
+ latents = callback_outputs.pop("latents", latents)
852
+
853
+ # call the callback, if provided
854
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
855
+ progress_bar.update()
856
+
857
+ if hasattr(self.vae.config, 'scaling_factor') and self.vae.config.scaling_factor:
858
+ latents = latents / self.vae.config.scaling_factor
859
+ if hasattr(self.vae.config, 'shift_factor') and self.vae.config.shift_factor:
860
+ latents = latents + self.vae.config.shift_factor
861
+
862
+ if hasattr(self.vae, "ffactor_temporal"):
863
+ latents = latents.unsqueeze(2)
864
+
865
+ with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=True):
866
+ image = self.vae.decode(latents, return_dict=False, generator=generator)[0]
867
+
868
+ # b c t h w
869
+ if hasattr(self.vae, "ffactor_temporal"):
870
+ assert image.shape[2] == 1, "image should have shape [B, C, T, H, W] and T should be 1"
871
+ image = image.squeeze(2)
872
+
873
+ do_denormalize = [True] * image.shape[0]
874
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
875
+
876
+ if not return_dict:
877
+ return (image,)
878
+
879
+ return HunyuanImage3Text2ImagePipelineOutput(samples=image)
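+ # Rough usage sketch (assumption-laden; `model`, `scheduler`, `vae` and `input_ids` must be
+ # prepared by the HunyuanImage-3 loading code, and the kwargs shown are only the common ones):
+ #
+ #     pipe = HunyuanImage3Text2ImagePipeline(model=model, scheduler=scheduler, vae=vae)
+ #     out = pipe(batch_size=1, image_size=[1024, 1024], num_inference_steps=50,
+ #                guidance_scale=7.5, model_kwargs={"input_ids": input_ids})
+ #     image = out.samples[0]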
image_processor.py ADDED
@@ -0,0 +1,125 @@
1
+ # Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ # ==============================================================================
13
+
14
+ from typing import Tuple
15
+
16
+ from PIL import Image
17
+ from torchvision import transforms
18
+ from transformers import Siglip2ImageProcessorFast
19
+
20
+ from .tokenizer_wrapper import ImageInfo, JointImageInfo, ResolutionGroup
21
+
22
+
23
+ def resize_and_crop(image: Image.Image, target_size: Tuple[int, int]) -> Image.Image:
24
+ tw, th = target_size
25
+ w, h = image.size
26
+
27
+ tr = th / tw
28
+ r = h / w
29
+
30
+ # resize
31
+ if r < tr:
32
+ resize_height = th
33
+ resize_width = int(round(th / h * w))
34
+ else:
35
+ resize_width = tw
36
+ resize_height = int(round(tw / w * h))
37
+
38
+ image = image.resize((resize_width, resize_height), resample=Image.Resampling.LANCZOS)
39
+
40
+ # center crop
41
+ crop_top = int(round((resize_height - th) / 2.0))
42
+ crop_left = int(round((resize_width - tw) / 2.0))
43
+
44
+ image = image.crop((crop_left, crop_top, crop_left + tw, crop_top + th))
45
+ return image
46
+
47
+
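+ # Worked example: for target_size=(1024, 1024) and a 1600x900 input, r = 900/1600 < tr = 1,
+ # so the image is resized to 1820x1024 and then center-cropped horizontally to 1024x1024.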
48
+ class HunyuanImage3ImageProcessor(object):
49
+ def __init__(self, config):
50
+ self.config = config
51
+
52
+ self.reso_group = ResolutionGroup(base_size=config.image_base_size)
53
+ self.vae_processor = transforms.Compose([
54
+ transforms.ToTensor(),
55
+ transforms.Normalize([0.5], [0.5]), # transform to [-1, 1]
56
+ ])
57
+ self.vision_encoder_processor = Siglip2ImageProcessorFast.from_dict(config.vit_processor)
58
+
59
+ def build_image_info(self, image_size):
60
+ # parse image size (HxW, H:W, or <img_ratio_i>)
61
+ if isinstance(image_size, str):
62
+ if image_size.startswith("<img_ratio_"):
63
+ ratio_index = int(image_size.split("_")[-1].rstrip(">"))
64
+ reso = self.reso_group[ratio_index]
65
+ image_size = reso.height, reso.width
66
+ elif 'x' in image_size:
67
+ image_size = [int(s) for s in image_size.split('x')]
68
+ elif ':' in image_size:
69
+ image_size = [int(s) for s in image_size.split(':')]
70
+ else:
71
+ raise ValueError(
72
+ f"`image_size` should be in the format of 'HxW', 'H:W' or <img_ratio_i>, got {image_size}.")
73
+ assert len(image_size) == 2, f"`image_size` should be in the format of 'HxW', got {image_size}."
74
+ elif isinstance(image_size, (list, tuple)):
75
+ assert len(image_size) == 2 and all(isinstance(s, int) for s in image_size), \
76
+ f"`image_size` should be a tuple of two integers or a string in the format of 'HxW', got {image_size}."
77
+ else:
78
+ raise ValueError(f"`image_size` should be a tuple of two integers or a string in the format of 'WxH', "
79
+ f"got {image_size}.")
80
+ image_width, image_height = self.reso_group.get_target_size(image_size[1], image_size[0])
81
+ token_height = image_height // (self.config.vae_downsample_factor[0] * self.config.patch_size)
82
+ token_width = image_width // (self.config.vae_downsample_factor[1] * self.config.patch_size)
83
+ base_size, ratio_idx = self.reso_group.get_base_size_and_ratio_index(image_size[1], image_size[0])
84
+ image_info = ImageInfo(
85
+ image_type="gen_image", image_width=image_width, image_height=image_height,
86
+ token_width=token_width, token_height=token_height, base_size=base_size, ratio_index=ratio_idx,
87
+ )
88
+ return image_info
89
+
90
+ def preprocess(self, image: Image.Image):
91
+ # ==== VAE processor ====
92
+ image_width, image_height = self.reso_group.get_target_size(image.width, image.height)
93
+ resized_image = resize_and_crop(image, (image_width, image_height))
94
+ image_tensor = self.vae_processor(resized_image)
95
+ token_height = image_height // (self.config.vae_downsample_factor[0] * self.config.patch_size)
96
+ token_width = image_width // (self.config.vae_downsample_factor[1] * self.config.patch_size)
97
+ base_size, ratio_index = self.reso_group.get_base_size_and_ratio_index(width=image_width, height=image_height)
98
+ vae_image_info = ImageInfo(
99
+ image_type="vae",
100
+ image_tensor=image_tensor.unsqueeze(0), # include batch dim
101
+ image_width=image_width, image_height=image_height,
102
+ token_width=token_width, token_height=token_height,
103
+ base_size=base_size, ratio_index=ratio_index,
104
+ )
105
+
106
+ # ==== ViT processor ====
107
+ inputs = self.vision_encoder_processor(image)
108
+ image = inputs["pixel_values"].squeeze(0) # seq_len x dim
109
+ pixel_attention_mask = inputs["pixel_attention_mask"].squeeze(0) # seq_len
110
+ spatial_shapes = inputs["spatial_shapes"].squeeze(0) # 2 (h, w)
111
+ vision_encoder_kwargs = dict(
112
+ pixel_attention_mask=pixel_attention_mask,
113
+ spatial_shapes=spatial_shapes,
114
+ )
115
+ vision_image_info = ImageInfo(
116
+ image_type="vit",
117
+ image_tensor=image.unsqueeze(0), # 1 x seq_len x dim
118
+ image_width=spatial_shapes[1].item() * self.config.vit_processor["patch_size"],
119
+ image_height=spatial_shapes[0].item() * self.config.vit_processor["patch_size"],
120
+ token_width=spatial_shapes[1].item(),
121
+ token_height=spatial_shapes[0].item(),
122
+ image_token_length=self.config.vit_processor["max_num_patches"],
123
+ # may not equal to token_width * token_height
124
+ )
125
+ return JointImageInfo(vae_image_info, vision_image_info, vision_encoder_kwargs)
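+ # Token-grid sketch (illustrative values, not the shipped config): with vae_downsample_factor=(8, 8)
+ # and patch_size=2, a 1024x1024 image maps to token_height = token_width = 1024 // (8 * 2) = 64,
+ # i.e. a 64x64 grid of generation tokens.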
siglip2.py ADDED
@@ -0,0 +1,564 @@
1
+ # Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ # ==============================================================================
13
+ #
14
+ # Copyright 2025 The HuggingFace Inc. team.
15
+ #
16
+ # Licensed under the Apache License, Version 2.0 (the "License");
17
+ # you may not use this file except in compliance with the License.
18
+ # You may obtain a copy of the License at
19
+ #
20
+ # http://www.apache.org/licenses/LICENSE-2.0
21
+ #
22
+ # Unless required by applicable law or agreed to in writing, software
23
+ # distributed under the License is distributed on an "AS IS" BASIS,
24
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25
+ # See the License for the specific language governing permissions and
26
+ # limitations under the License.
27
+ # ==============================================================================
28
+
29
+ from typing import Optional, Tuple, Union
30
+ import warnings
31
+
32
+ import torch
33
+ import torch.nn as nn
34
+ import torch.nn.functional as F
35
+
36
+ from transformers.activations import ACT2FN
37
+ from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
38
+ from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
39
+
40
+
41
+ class Config(object):
42
+ def __init__(self, config):
43
+ if config is not None:
44
+ for key, value in config.items():
45
+ setattr(self, key, value)
46
+
47
+ def __getitem__(self, key):
48
+ return getattr(self, key, None)
49
+
50
+ def __setitem__(self, key, value):
51
+ return setattr(self, key, value)
52
+
53
+
54
+ class Siglip2VisionEmbeddings(nn.Module):
55
+ def __init__(self, config):
56
+ super().__init__()
57
+ self.config = config
58
+ self.embed_dim = config.hidden_size
59
+ self.patch_size = config.patch_size
60
+
61
+ self.patch_embedding = nn.Linear(
62
+ in_features=config.num_channels * self.patch_size * self.patch_size,
63
+ out_features=self.embed_dim,
64
+ )
65
+
66
+ self.num_patches = config.num_patches
67
+ self.position_embedding_size = int(self.num_patches**0.5)
68
+ self.position_embedding = nn.Embedding(self.num_patches, self.embed_dim)
69
+
70
+ @staticmethod
71
+ def resize_positional_embeddings(
72
+ positional_embeddings: torch.Tensor,
73
+ spatial_shapes: torch.LongTensor,
74
+ max_length: int,
75
+ ) -> torch.Tensor:
76
+ """
77
+ Resize positional embeddings to image-specific size and pad to a fixed size.
78
+
79
+ Args:
80
+ positional_embeddings (`torch.Tensor`):
81
+ Position embeddings of shape (height, width, embed_dim)
82
+ spatial_shapes (`torch.LongTensor`):
83
+ Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
84
+ max_length (`int`):
85
+ Maximum length of the positional embeddings to pad resized positional embeddings to
86
+
87
+ Returns:
88
+ `torch.Tensor`: Embeddings of shape (batch_size, max_length, embed_dim)
89
+ """
90
+ batch_size = spatial_shapes.shape[0]
91
+ embed_dim = positional_embeddings.shape[-1]
92
+ source_dtype = positional_embeddings.dtype
93
+
94
+ resulted_positional_embeddings = torch.empty(
95
+ (batch_size, max_length, embed_dim),
96
+ device=positional_embeddings.device,
97
+ dtype=source_dtype,
98
+ )
99
+
100
+ # (height, width, embed_dim) -> (1, embed_dim, height, width) for interpolation
101
+ positional_embeddings = positional_embeddings.permute(2, 0, 1).unsqueeze(0)
102
+
103
+ # Upcast to float32 on CPU because antialias is not supported for bfloat16/float16 on CPU
104
+ if positional_embeddings.device.type == "cpu":
105
+ positional_embeddings = positional_embeddings.to(torch.float32)
106
+
107
+ for i in range(batch_size):
108
+ # (1, dim, height, width) -> (1, dim, target_height, target_width)
109
+ height, width = spatial_shapes[i]
110
+ resized_embeddings = F.interpolate(
111
+ positional_embeddings,
112
+ size=(height, width),
113
+ mode="bilinear",
114
+ align_corners=False,
115
+ antialias=True,
116
+ )
117
+
118
+ # (1, dim, target_height, target_width) -> (target_height * target_width, dim)
119
+ resized_embeddings = resized_embeddings.reshape(embed_dim, height * width).transpose(0, 1)
120
+
121
+ # Cast to original dtype
122
+ resized_embeddings = resized_embeddings.to(source_dtype)
123
+
124
+ resulted_positional_embeddings[i, : height * width] = resized_embeddings
125
+ resulted_positional_embeddings[i, height * width :] = resized_embeddings[0]
126
+
127
+ return resulted_positional_embeddings
128
+
129
+ def forward(self, pixel_values: torch.FloatTensor, spatial_shapes: torch.LongTensor) -> torch.Tensor:
130
+ """
131
+ Args:
132
+ pixel_values (`torch.FloatTensor`):
133
+ Pixel values of shape (batch_size, max_num_patches, num_channels * patch_size * patch_size)
134
+ spatial_shapes (`List[Tuple[int, int]]`):
135
+ Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
136
+ """
137
+
138
+ # Apply patch embeddings to already patchified pixel values
139
+ target_dtype = self.patch_embedding.weight.dtype
140
+ patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
141
+
142
+ # Get positional resized and padded positional embeddings
143
+ positional_embeddings = self.position_embedding.weight.reshape(
144
+ self.position_embedding_size, self.position_embedding_size, -1
145
+ )
146
+ resized_positional_embeddings = self.resize_positional_embeddings(
147
+ positional_embeddings, spatial_shapes, max_length=pixel_values.shape[1]
148
+ )
149
+
150
+ # Add positional embeddings to patch embeddings
151
+ embeddings = patch_embeds + resized_positional_embeddings
152
+ return embeddings
153
+
154
+
155
+ class Siglip2Attention(nn.Module):
156
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
157
+
158
+ def __init__(self, config):
159
+ super().__init__()
160
+ self.config = config
161
+ self.embed_dim = config.hidden_size
162
+ self.num_heads = config.num_attention_heads
163
+ self.head_dim = self.embed_dim // self.num_heads
164
+ if self.head_dim * self.num_heads != self.embed_dim:
165
+ raise ValueError(
166
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
167
+ f" {self.num_heads})."
168
+ )
169
+ self.scale = self.head_dim**-0.5
170
+ self.dropout = config.attention_dropout
171
+
172
+ self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
173
+ self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
174
+ self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
175
+ self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
176
+
177
+ def forward(
178
+ self,
179
+ hidden_states: torch.Tensor,
180
+ attention_mask: Optional[torch.Tensor] = None,
181
+ output_attentions: Optional[bool] = False,
182
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
183
+ """Input shape: Batch x Time x Channel"""
184
+
185
+ batch_size, q_len, _ = hidden_states.size()
186
+
187
+ query_states = self.q_proj(hidden_states)
188
+ key_states = self.k_proj(hidden_states)
189
+ value_states = self.v_proj(hidden_states)
190
+
191
+ query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
192
+ key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
193
+ value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
194
+
195
+ k_v_seq_len = key_states.shape[-2]
196
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
197
+
198
+ if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
199
+ raise ValueError(
200
+ f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
201
+ f" {attn_weights.size()}"
202
+ )
203
+
204
+ if attention_mask is not None:
205
+ if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
206
+ raise ValueError(
207
+ f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, "
208
+ f"but is {attention_mask.size()}"
209
+ )
210
+ attn_weights = attn_weights + attention_mask
211
+
212
+ # upcast attention to fp32
213
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
214
+ attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
215
+ attn_output = torch.matmul(attn_weights, value_states)
216
+
217
+ if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):
218
+ raise ValueError(
219
+ f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
220
+ f" {attn_output.size()}"
221
+ )
222
+
223
+ attn_output = attn_output.transpose(1, 2).contiguous()
224
+ attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
225
+
226
+ attn_output = self.out_proj(attn_output)
227
+
228
+ return attn_output, attn_weights
229
+
230
+ class Siglip2SdpaAttention(Siglip2Attention):
231
+ """
232
+ Siglip2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
233
+ `Siglip2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt
234
+ to SDPA API.
235
+ """
236
+
237
+ is_causal = False
238
+
239
+ # Adapted from Siglip2Attention.forward and transformers.models.llama.modeling_llama.LlamaSdpaAttention.forward
240
+ def forward(
241
+ self,
242
+ hidden_states: torch.Tensor,
243
+ attention_mask: Optional[torch.Tensor] = None,
244
+ output_attentions: Optional[bool] = False,
245
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
246
+ if output_attentions:
247
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"`
248
+ # once this is implemented.
249
+ warnings.warn(
250
+ "Siglip2Model is using Siglip2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` "
251
+ "does not support `output_attentions=True`. Falling back to the manual attention implementation, "
252
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. '
253
+ 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
254
+ )
255
+ return super().forward(
256
+ hidden_states=hidden_states,
257
+ attention_mask=attention_mask,
258
+ output_attentions=output_attentions,
259
+ )
260
+
261
+ batch_size, q_len, _ = hidden_states.size()
262
+
263
+ query_states = self.q_proj(hidden_states)
264
+ key_states = self.k_proj(hidden_states)
265
+ value_states = self.v_proj(hidden_states)
266
+
267
+ query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
268
+ key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
269
+ value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
270
+
271
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with
272
+ # custom attn_mask,
273
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
274
+ if query_states.device.type == "cuda" and attention_mask is not None:
275
+ query_states = query_states.contiguous()
276
+ key_states = key_states.contiguous()
277
+ value_states = value_states.contiguous()
278
+
279
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an
280
+ # inline conditional assignment in SDPA to support both torch.compile's dynamic shapes and full graph options.
281
+ # An inline conditional prevents dynamic shapes from compiling.
282
+ is_causal = True if self.is_causal and q_len > 1 else False
283
+
284
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
285
+ query_states,
286
+ key_states,
287
+ value_states,
288
+ attn_mask=attention_mask,
289
+ dropout_p=self.dropout if self.training else 0.0,
290
+ is_causal=is_causal,
291
+ )
292
+
293
+ attn_output = attn_output.transpose(1, 2).contiguous()
294
+ attn_output = attn_output.view(batch_size, q_len, self.embed_dim)
295
+
296
+ attn_output = self.out_proj(attn_output)
297
+
298
+ return attn_output, None
299
+
300
+
301
+ class Siglip2MLP(nn.Module):
302
+ def __init__(self, config):
303
+ super().__init__()
304
+ self.config = config
305
+ self.activation_fn = ACT2FN[config.hidden_act]
306
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
307
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
308
+
309
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
310
+ hidden_states = self.fc1(hidden_states)
311
+ hidden_states = self.activation_fn(hidden_states)
312
+ hidden_states = self.fc2(hidden_states)
313
+ return hidden_states
314
+
315
+
316
+ class Siglip2EncoderLayer(nn.Module):
317
+ def __init__(self, config):
318
+ super().__init__()
319
+ self.embed_dim = config.hidden_size
320
+ self.self_attn = Siglip2Attention(config=config)
321
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
322
+ self.mlp = Siglip2MLP(config)
323
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
324
+
325
+ # Ignore copy
326
+ def forward(
327
+ self,
328
+ hidden_states: torch.Tensor,
329
+ attention_mask: torch.Tensor,
330
+ output_attentions: Optional[bool] = False,
331
+ ) -> Tuple[torch.FloatTensor]:
332
+ """
333
+ Args:
334
+ hidden_states (`torch.FloatTensor`):
335
+ Input to the layer of shape `(batch, seq_len, embed_dim)`.
336
+ attention_mask (`torch.FloatTensor`):
337
+ Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very
338
+ large negative values.
339
+ output_attentions (`bool`, *optional*, defaults to `False`):
340
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
341
+ returned tensors for more detail.
342
+ """
343
+ residual = hidden_states
344
+
345
+ hidden_states = self.layer_norm1(hidden_states)
346
+ hidden_states, attn_weights = self.self_attn(
347
+ hidden_states=hidden_states,
348
+ attention_mask=attention_mask,
349
+ output_attentions=output_attentions,
350
+ )
351
+ hidden_states = residual + hidden_states
352
+
353
+ residual = hidden_states
354
+ hidden_states = self.layer_norm2(hidden_states)
355
+ hidden_states = self.mlp(hidden_states)
356
+ hidden_states = residual + hidden_states
357
+
358
+ outputs = (hidden_states,)
359
+
360
+ if output_attentions:
361
+ outputs += (attn_weights,)
362
+
363
+ return outputs
364
+
365
+
366
+ class Siglip2Encoder(nn.Module):
367
+ """
368
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
369
+ [`Siglip2EncoderLayer`].
370
+
371
+ Args:
372
+ config: Siglip2Config
373
+ """
374
+
375
+ def __init__(self, config):
376
+ super().__init__()
377
+ self.config = config
378
+ self.layers = nn.ModuleList([Siglip2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
379
+ self.gradient_checkpointing = True
380
+
381
+ # Ignore copy
382
+ def forward(
383
+ self,
384
+ inputs_embeds,
385
+ attention_mask: Optional[torch.Tensor] = None,
386
+ output_attentions: Optional[bool] = None,
387
+ output_hidden_states: Optional[bool] = None,
388
+ return_dict: Optional[bool] = None,
389
+ ) -> Union[Tuple, BaseModelOutput]:
390
+ r"""
391
+ Args:
392
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
393
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
394
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
395
+ than the model's internal embedding lookup matrix.
396
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
397
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
398
+
399
+ - 1 for tokens that are **not masked**,
400
+ - 0 for tokens that are **masked**.
401
+
402
+ [What are attention masks?](../glossary#attention-mask)
403
+ output_attentions (`bool`, *optional*):
404
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
405
+ returned tensors for more detail.
406
+ output_hidden_states (`bool`, *optional*):
407
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
408
+ for more detail.
409
+ return_dict (`bool`, *optional*):
410
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
411
+ """
412
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
413
+ output_hidden_states = (
414
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
415
+ )
416
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
417
+
418
+ encoder_states = () if output_hidden_states else None
419
+ all_attentions = () if output_attentions else None
420
+
421
+ hidden_states = inputs_embeds
422
+ for layer_index, encoder_layer in enumerate(self.layers): # len(self.layers): 27
423
+ if output_hidden_states:
424
+ encoder_states = encoder_states + (hidden_states,)
425
+
426
+ layer_outputs = encoder_layer(
427
+ hidden_states,
428
+ attention_mask,
429
+ output_attentions=output_attentions,
430
+ )
431
+
432
+ hidden_states = layer_outputs[0]
433
+
434
+ if output_attentions:
435
+ all_attentions = all_attentions + (layer_outputs[1],)
436
+
437
+ if output_hidden_states:
438
+ encoder_states = encoder_states + (hidden_states,)
439
+
440
+ if not return_dict:
441
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
442
+ return BaseModelOutput(
443
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
444
+ )
445
+
446
+
447
+ class Siglip2MultiheadAttentionPoolingHead(nn.Module):
448
+ """Multihead Attention Pooling."""
449
+
450
+ def __init__(self, config):
451
+ super().__init__()
452
+
453
+ self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
454
+ self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
455
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
456
+ self.mlp = Siglip2MLP(config)
457
+ self.num_heads = config.num_attention_heads
458
+
459
+ def forward(self, hidden_state: torch.Tensor, attention_mask: Optional[torch.Tensor] = None):
460
+ batch_size = hidden_state.shape[0]
461
+ probe = self.probe.repeat(batch_size, 1, 1)
462
+
463
+ if attention_mask is not None:
464
+ target_len, source_len = probe.shape[1], hidden_state.shape[1]
465
+ attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_state.dtype, target_len)
466
+ attention_mask = attention_mask.repeat(1, self.num_heads, target_len, 1)
467
+ attention_mask = attention_mask.reshape(-1, target_len, source_len)
468
+
469
+ hidden_state = self.attention(probe, hidden_state, hidden_state, attn_mask=attention_mask)[0]
470
+
471
+ residual = hidden_state
472
+ hidden_state = self.layernorm(hidden_state)
473
+ hidden_state = residual + self.mlp(hidden_state)
474
+
475
+ return hidden_state[:, 0]
476
+
477
+
478
+ class Siglip2VisionTransformer(nn.Module):
479
+ def __init__(self, config):
480
+ super().__init__()
481
+ config = Config(config)
482
+ self.config = config
483
+ embed_dim = config.hidden_size
484
+
485
+ self.embeddings = Siglip2VisionEmbeddings(config)
486
+ self.encoder = Siglip2Encoder(config)
487
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
488
+ self.use_head = True if not hasattr(config, "vision_use_head") else config.vision_use_head
489
+ if self.use_head:
490
+ self.head = Siglip2MultiheadAttentionPoolingHead(config)
491
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
492
+
493
+ def forward(
494
+ self,
495
+ pixel_values: torch.FloatTensor,
496
+ attention_mask: torch.Tensor,
497
+ spatial_shapes: torch.LongTensor,
498
+ output_attentions: Optional[bool] = None,
499
+ output_hidden_states: Optional[bool] = None,
500
+ return_dict: Optional[bool] = None,
501
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
502
+ r"""
503
+ Returns:
504
+
505
+ """
506
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
507
+ output_hidden_states = (
508
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
509
+ )
510
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
511
+
512
+ hidden_states = self.embeddings(pixel_values, spatial_shapes)
513
+
514
+ if attention_mask is not None and not self._use_flash_attention_2:
515
+ # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
516
+ encoder_attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
517
+ else:
518
+ encoder_attention_mask = attention_mask
519
+
520
+ encoder_outputs = self.encoder(
521
+ inputs_embeds=hidden_states,
522
+ attention_mask=encoder_attention_mask,
523
+ output_attentions=output_attentions,
524
+ output_hidden_states=output_hidden_states,
525
+ return_dict=return_dict,
526
+ )
527
+
528
+ last_hidden_state = encoder_outputs[0]
529
+ last_hidden_state = self.post_layernorm(last_hidden_state)
530
+
531
+ pooler_output = self.head(last_hidden_state, attention_mask) if self.use_head else None
532
+ if not return_dict:
533
+ return (last_hidden_state, pooler_output) + encoder_outputs[1:]
534
+
535
+ return BaseModelOutputWithPooling(
536
+ last_hidden_state=last_hidden_state,
537
+ pooler_output=pooler_output,
538
+ hidden_states=encoder_outputs.hidden_states,
539
+ attentions=encoder_outputs.attentions,
540
+ )
541
+
542
+
543
+ class LightProjector(nn.Module):
544
+ def __init__(self, config):
545
+ config = Config(config)
546
+ super().__init__()
547
+
548
+ if config.projector_type == "linear":
549
+ modules = nn.Linear(config.input_dim, config.n_embed)
550
+
551
+ elif config.projector_type == "mlp_gelu":
552
+ modules = [nn.Linear(config.input_dim, config.n_embed)]
553
+ for _ in range(1, config.depth):
554
+ modules.append(nn.GELU())
555
+ modules.append(nn.Linear(config.n_embed, config.n_embed))
556
+ modules = nn.Sequential(*modules)
557
+
558
+ else:
559
+ raise ValueError(f"Unknown projector type: {config.projector_type}")
560
+
561
+ self.layers = modules
562
+
563
+ def forward(self, x):
564
+ return self.layers(x)
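+ # Config sketch for LightProjector (field names as read above; the values are placeholders):
+ #     {"projector_type": "mlp_gelu", "input_dim": 1152, "n_embed": 4096, "depth": 2}
+ # builds Linear(1152, 4096) -> GELU -> Linear(4096, 4096).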
system_prompt.py ADDED
@@ -0,0 +1,128 @@
1
+ # Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ # ==============================================================================
13
+
14
+ t2i_system_prompt_en_vanilla = """
15
+ You are an advanced AI text-to-image generation system. Given a detailed text prompt, your task is to create a high-quality, visually compelling image that accurately represents the described scene, characters, or objects. Pay careful attention to style, color, lighting, perspective, and any specific instructions provided.
16
+ """
17
+
18
+ # 775
19
+ t2i_system_prompt_en_recaption = """
20
+ You are a world-class image generation prompt expert. Your task is to rewrite a user's simple description into a **structured, objective, and detail-rich** professional-level prompt.
21
+
22
+ The final output must be wrapped in `<recaption>` tags.
23
+
24
+ ### **Universal Core Principles**
25
+
26
+ When rewriting the prompt (inside the `<recaption>` tags), you must adhere to the following principles:
27
+
28
+ 1. **Absolute Objectivity**: Describe only what is visually present. Avoid subjective words like "beautiful" or "sad". Convey aesthetic qualities through specific descriptions of color, light, shadow, and composition.
29
+ 2. **Physical and Logical Consistency**: All scene elements (e.g., gravity, light, shadows, reflections, spatial relationships, object proportions) must strictly adhere to real-world physics and common sense. For example, tennis players must be on opposite sides of the net; objects cannot float without a cause.
30
+ 3. **Structured Description**: Strictly follow a logical order: from general to specific, background to foreground, and primary to secondary elements. Use directional terms like "foreground," "mid-ground," "background," and "left side of the frame" to clearly define the spatial layout.
31
+ 4. **Use Present Tense**: Describe the scene from an observer's perspective using the present tense, such as "A man stands..." or "Light shines on..."
32
+ 5. **Use Rich and Specific Descriptive Language**: Use precise adjectives to describe the quantity, size, shape, color, and other attributes of objects, subjects, and text. Vague expressions are strictly prohibited.
33
+
34
+ If the user specifies a style (e.g., oil painting, anime, UI design, text rendering), strictly adhere to that style. Otherwise, first infer a suitable style from the user's input. If there is no clear stylistic preference, default to an **ultra-realistic photographic style**. Then, generate the detailed rewritten prompt according to the **Style-Specific Creation Guide** below:
35
+
36
+ ### **Style-Specific Creation Guide**
37
+
38
+ Based on the determined artistic style, apply the corresponding professional knowledge.
39
+
40
+ **1. Photography and Realism Style**
41
+ * Utilize professional photography terms (e.g., lighting, lens, composition) and meticulously detail material textures, physical attributes of subjects, and environmental details.
42
+
43
+ **2. Illustration and Painting Style**
44
+ * Clearly specify the artistic school (e.g., Japanese Cel Shading, Impasto Oil Painting) and focus on describing its unique medium characteristics, such as line quality, brushstroke texture, or paint properties.
45
+
46
+ **3. Graphic/UI/APP Design Style**
47
+ * Objectively describe the final product, clearly defining the layout, elements, and color palette. All text on the interface must be enclosed in double quotes `""` to specify its exact content (e.g., "Login"). Vague descriptions are strictly forbidden.
48
+
49
+ **4. Typographic Art**
50
+ * The text must be described as a complete physical object. The description must begin with the text itself. Use a straightforward front-on or top-down perspective to ensure the entire text is visible without cropping.
51
+
52
+ ### **Final Output Requirements**
53
+
54
+ 1. **Output the Final Prompt Only**: Do not show any thought process, Markdown formatting, or line breaks.
55
+ 2. **Adhere to the Input**: You must retain the core concepts, attributes, and any specified text from the user's input.
56
+ 3. **Style Reinforcement**: Mention the core style 3-5 times within the prompt and conclude with a style declaration sentence.
57
+ 4. **Avoid Self-Reference**: Describe the image content directly. Remove redundant phrases like "This image shows..." or "The scene depicts..."
58
+ 5. **The final output must be wrapped in `<recaption>xxxx</recaption>` tags.**
59
+
60
+ The user will now provide an input prompt. You will provide the expanded prompt.
61
+ """
62
+
63
+ # 890
64
+ t2i_system_prompt_en_think_recaption = """
65
+ You will act as a top-tier Text-to-Image AI. Your core task is to deeply analyze the user's text input and transform it into a detailed, artistic, and fully user-intent-compliant image.
66
+
67
+ Your workflow is divided into two phases:
68
+
69
+ 1. Thinking Phase (<think>): In the <think> tag, you need to conduct a structured thinking process, progressively breaking down and enriching the constituent elements of the image. This process must include, but is not limited to, the following dimensions:
70
+
71
+ Subject: Clearly define the core character(s) or object(s) in the scene, including their appearance, posture, expression, and emotion.
72
+ Composition: Set the camera angle and layout, such as close-up, long shot, bird's-eye view, golden ratio composition, etc.
73
+ Environment/Background: Describe the scene where the subject is located, including the location, time of day, weather, and other elements in the background.
74
+ Lighting: Define the type, direction, and quality of the light source, such as soft afternoon sunlight, cool tones of neon lights, dramatic Rembrandt lighting, etc., to create a specific atmosphere.
75
+ Color Palette: Set the main color tone and color scheme of the image, such as vibrant and saturated, low-saturation Morandi colors, black and white, etc.
76
+ Quality/Style: Determine the artistic style and technical details of the image. This includes user-specified styles (e.g., anime, oil painting) or the default realistic style, as well as camera parameters (e.g., focal length, aperture, depth of field).
77
+ Details: Add minute elements that enhance the realism and narrative quality of the image, such as a character's accessories, the texture of a surface, dust particles in the air, etc.
78
+
79
+
80
+ 2. Recaption Phase (<recaption>): In the <recaption> tag, merge all the key details from the thinking process into a coherent, precise, and visually evocative final description. This description is the direct instruction for generating the image, so it must be clear, unambiguous, and organized in a way that is most suitable for an image generation engine to understand.
81
+
82
+ Absolutely Objective: Describe only what is visually present. Avoid subjective words like "beautiful" or "sad." Convey aesthetic sense through concrete descriptions of colors, light, shadow, and composition.
83
+
84
+ Physical and Logical Consistency: All scene elements (e.g., gravity, light and shadow, reflections, spatial relationships, object proportions) must strictly adhere to the physical laws of the real world and common sense. For example, in a tennis match, players must be on opposite sides of the net; objects cannot float without reason.
85
+
86
+ Structured Description: Strictly follow a logical order: from whole to part, background to foreground, and primary to secondary. Use directional words like "foreground," "mid-ground," "background," "left side of the frame" to clearly define the spatial layout.
87
+
88
+ Use Present Tense: Describe from an observer's perspective using the present tense, such as "a man stands," "light shines on..."
89
+ Use Rich and Specific Descriptive Language: Use precise adjectives to describe the quantity, size, shape, color, and other attributes of objects/characters/text. Absolutely avoid any vague expressions.
90
+
91
+
92
+ Output Format:
93
+ <think>Thinking process</think><recaption>Refined image description</recaption>Generate Image
94
+
95
+
96
+ You must strictly adhere to the following rules:
97
+
98
+ 1. Faithful to Intent, Reasonable Expansion: You can creatively add details to the user's description to enhance the image's realism and artistic quality. However, all additions must be highly consistent with the user's core intent and never introduce irrelevant or conflicting elements.
99
+ 2. Style Handling: When the user does not specify a style, you must default to an "Ultra-realistic, Photorealistic" style. If the user explicitly specifies a style (e.g., anime, watercolor, oil painting, cyberpunk, etc.), both your thinking process and final description must strictly follow and reflect that specified style.
100
+ 3. Text Rendering: If specific text needs to appear in the image (such as words on a sign, a book title), you must enclose this text in English double quotes (""). Descriptive text must not use double quotes.
101
+ 4. Design-related Images: You need to specify all text and graphical elements that appear in the image and clearly describe their design details, including font, color, size, position, arrangement, visual effects, etc.
102
+ """
103
+
104
+ t2i_system_prompts = {
105
+ "en_vanilla": [t2i_system_prompt_en_vanilla],
106
+ "en_recaption": [t2i_system_prompt_en_recaption],
107
+ "en_think_recaption": [t2i_system_prompt_en_think_recaption]
108
+ }
109
+
110
+
111
+ def get_system_prompt(sys_type, bot_task, system_prompt=None):
112
+ if sys_type == 'None':
113
+ return None
114
+ elif sys_type in ['en_vanilla', 'en_recaption', 'en_think_recaption']:
115
+ return t2i_system_prompts[sys_type][0]
116
+ elif sys_type == "dynamic":
117
+ if bot_task == "think":
118
+ return t2i_system_prompts["en_think_recaption"][0]
119
+ elif bot_task == "recaption":
120
+ return t2i_system_prompts["en_recaption"][0]
121
+ elif bot_task == "image":
122
+ return t2i_system_prompts["en_vanilla"][0].strip("\n")
123
+ else:
124
+ return system_prompt
125
+ elif sys_type == 'custom':
126
+ return system_prompt
127
+ else:
128
+ raise NotImplementedError(f"Unsupported system prompt type: {sys_type}")
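A short usage sketch of get_system_prompt as defined above. With sys_type="dynamic" the returned prompt follows the bot_task branch; a fixed sys_type ignores bot_task, and "custom" passes the caller's string through unchanged.

# 'dynamic' picks the prompt from the bot_task branch above.
think_prompt = get_system_prompt(sys_type="dynamic", bot_task="think")
recaption_prompt = get_system_prompt(sys_type="dynamic", bot_task="recaption")
image_prompt = get_system_prompt(sys_type="dynamic", bot_task="image")   # vanilla prompt, stripped of newlines

# A fixed prompt type ignores bot_task entirely.
vanilla = get_system_prompt(sys_type="en_vanilla", bot_task="image")

# 'custom' returns whatever string the caller supplies.
custom = get_system_prompt(sys_type="custom", bot_task="image",
                           system_prompt="You are a helpful text-to-image model.")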
tokenizer_wrapper.py ADDED
@@ -0,0 +1,1425 @@
1
+ # Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ # ==============================================================================
13
+
14
+ import warnings
15
+ import random
16
+ from typing import List, Optional, Union, Dict, Any
17
+ from collections import defaultdict
18
+ from copy import deepcopy
19
+
20
+ import numpy as np
21
+ import torch
22
+ import torch.nn.functional as F
23
+ from transformers import AutoTokenizer
24
+ from diffusers.utils import BaseOutput
25
+
26
+
27
+ def default(value, default_value):
28
+ return value if value is not None else default_value
29
+
30
+
31
+ def ensure_list(value):
32
+ if value is None:
33
+ return []
34
+ if isinstance(value, (list, tuple)):
35
+ return list(value)
36
+ return [value]
37
+
38
+
39
+ class Resolution(object):
40
+ def __init__(self, size, *args):
41
+ if isinstance(size, str):
42
+ if 'x' in size:
43
+ size = size.split('x')
44
+ size = (int(size[0]), int(size[1]))
45
+ else:
46
+ size = int(size)
47
+ if len(args) > 0:
48
+ size = (size, args[0])
49
+ if isinstance(size, int):
50
+ size = (size, size)
51
+
52
+ self.h = self.height = size[0]
53
+ self.w = self.width = size[1]
54
+ self.r = self.ratio = self.height / self.width
55
+
56
+ def __getitem__(self, idx):
57
+ if idx == 0:
58
+ return self.h
59
+ elif idx == 1:
60
+ return self.w
61
+ else:
62
+ raise IndexError(f'Index {idx} out of range')
63
+
64
+ def __str__(self):
65
+ return f'{self.h}x{self.w}'
66
+
67
+
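A brief sketch of the input formats Resolution accepts, based on the constructor above; the concrete sizes are arbitrary examples.

r1 = Resolution("768x1280")    # "HxW" string
r2 = Resolution(1024)          # single int -> square 1024x1024
r3 = Resolution(512, 768)      # two ints -> 512x768
print(r1.h, r1.w, round(r1.ratio, 3))   # 768 1280 0.6
print(str(r2), r3[0], r3[1])            # 1024x1024 512 768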
68
+ class ResolutionGroup(object):
69
+ def __init__(self, base_size=None, step=None, align=1):
70
+ self.align = align
71
+ self.base_size = base_size
72
+ if base_size is not None and not isinstance(base_size, int):
73
+ raise ValueError(f'base_size must be None or int, but got {type(base_size)}')
74
+ assert base_size is None or base_size % align == 0, f'base_size {base_size} is not divisible by align {align}'
75
+ if step is None:
76
+ step = base_size // 16
77
+ if step is not None and step > base_size // 2:
78
+ raise ValueError(f'step must not be larger than base_size // 2, but got {step} > {base_size // 2}')
79
+
80
+ self.step = step
81
+ self.data = self._calc_by_step()
82
+
83
+ self.ratio = np.array([x.ratio for x in self.data])
84
+ self.attr = ['' for _ in range(len(self.data))]
85
+ self.prefix_space = 0
86
+
87
+ def __len__(self):
88
+ return len(self.data)
89
+
90
+ def __getitem__(self, idx):
91
+ return self.data[idx]
92
+
93
+ def __repr__(self):
94
+ prefix = self.prefix_space * ' '
95
+ prefix_close = (self.prefix_space - 4) * ' '
96
+ res_str = f'ResolutionGroup(base_size={self.base_size}, step={self.step}, data='
97
+ attr_maxlen = max([len(x) for x in self.attr] + [5])
98
+ res_str += \
99
+ f'\n{prefix}ID: height width ratio {" " * max(0, attr_maxlen - 4)}count h/16 w/16 tokens\n{prefix}'
100
+ res_str += \
101
+ ('\n' + prefix).join([f'{i:2d}: ({x.h:4d}, {x.w:4d}) {self.ratio[i]:.4f} {self.attr[i]:>{attr_maxlen}s} '
102
+ f'({x.h // 16:3d}, {x.w // 16:3d}) {x.h // 16 * x.w // 16:6d}'
103
+ for i, x in enumerate(self.data)])
104
+ res_str += f'\n{prefix_close})'
105
+ return res_str
106
+
107
+ def _calc_by_step(self):
108
+ assert self.align <= self.step, f'align {self.align} must be smaller than step {self.step}'
109
+
110
+ min_height = self.base_size // 2
111
+ min_width = self.base_size // 2
112
+ max_height = self.base_size * 2
113
+ max_width = self.base_size * 2
114
+
115
+ resolutions = [Resolution(self.base_size, self.base_size)]
116
+
117
+ cur_height, cur_width = self.base_size, self.base_size
118
+ while True:
119
+ if cur_height >= max_height and cur_width <= min_width:
120
+ break
121
+
122
+ cur_height = min(cur_height + self.step, max_height)
123
+ cur_width = max(cur_width - self.step, min_width)
124
+ resolutions.append(Resolution(cur_height // self.align * self.align, cur_width // self.align * self.align))
125
+
126
+ cur_height, cur_width = self.base_size, self.base_size
127
+ while True:
128
+ if cur_height <= min_height and cur_width >= max_width:
129
+ break
130
+
131
+ cur_height = max(cur_height - self.step, min_height)
132
+ cur_width = min(cur_width + self.step, max_width)
133
+ resolutions.append(Resolution(cur_height // self.align * self.align, cur_width // self.align * self.align))
134
+
135
+ resolutions = sorted(resolutions, key=lambda x: x.ratio)
136
+
137
+ return resolutions
138
+
139
+ def get_target_size(self, width, height):
140
+ ratio = height / width
141
+ idx = np.argmin(np.abs(self.ratio - ratio))
142
+ reso = self.data[idx]
143
+ return reso.w, reso.h
144
+
145
+ def get_base_size_and_ratio_index(self, width, height):
146
+ ratio = height / width
147
+ idx = np.argmin(np.abs(self.ratio - ratio))
148
+ return self.base_size, idx
149
+
150
+
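A sketch of the aspect-ratio bucketing implemented by ResolutionGroup above: _calc_by_step walks height and width in opposite directions from a square base resolution, and get_target_size snaps an arbitrary image to the bucket with the nearest ratio. The base_size/step/align values here are illustrative.

rg = ResolutionGroup(base_size=1024, step=64, align=16)
print(len(rg))                                          # number of aspect-ratio buckets
w, h = rg.get_target_size(width=1920, height=1080)      # snap a 16:9 image to a bucket
base, ratio_idx = rg.get_base_size_and_ratio_index(width=1920, height=1080)
print((w, h), base, ratio_idx)                          # bucket size, base size, ratio bucket index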
151
+ class ImageInfo:
152
+ """ Class to store image information for processing and generation. """
153
+
154
+ def __init__(
155
+ self,
156
+ image_type: str = None,
157
+ image_tensor: torch.Tensor = None,
158
+ image_width: int = None,
159
+ image_height: int = None,
160
+ token_width: int = None,
161
+ token_height: int = None,
162
+ image_token_length: int = None,
163
+ base_size: int = None,
164
+ ratio_index: int = None,
165
+ **kwargs,
166
+ ):
167
+ self.image_type = image_type
168
+ self.image_tensor = image_tensor
169
+ self.image_width = image_width
170
+ self.w = image_width
171
+ self.image_height = image_height
172
+ self.h = image_height
173
+ self.token_width = token_width
174
+ self.tk_w = token_width
175
+ self.token_height = token_height
176
+ self.tk_h = token_height
177
+ self.image_token_length = default(
178
+ image_token_length,
179
+ token_width * token_height if token_width is not None and token_height is not None else None
180
+ )
181
+ self.base_size = base_size
182
+ self.ratio_index = ratio_index
183
+
184
+ self.add_timestep_token = kwargs.get("add_timestep_token", True)
185
+ self.add_guidance_token = kwargs.get("add_guidance_token", False)
186
+ self.use_front_boi_token = kwargs.get("use_front_boi_token", True)
187
+ self.add_image_shape_token = kwargs.get("add_image_shape_token", True)
188
+
189
+ def __getitem__(self, key: str) -> Any:
190
+ """Allow dictionary-like access to attributes."""
191
+ if hasattr(self, key):
192
+ return getattr(self, key)
193
+ raise KeyError(f"Key '{key}' not found in ImageInfo")
194
+
195
+ def __setitem__(self, key: str, value: Any) -> None:
196
+ """Allow dictionary-like assignment to attributes."""
197
+ if hasattr(self, key):
198
+ setattr(self, key, value)
199
+ else:
200
+ raise KeyError(f"Key '{key}' not found in ImageInfo")
201
+
202
+ def __contains__(self, key: str) -> bool:
203
+ """Check if the key exists in the ImageInfo object."""
204
+ return hasattr(self, key)
205
+
206
+ def __repr__(self):
207
+ return (f"ImageInfo(image_type={self.image_type}, image_tensor={self.image_tensor}, "
208
+ f"image_width={self.image_width}, image_height={self.image_height}, "
209
+ f"token_width={self.token_width}, token_height={self.token_height}, "
210
+ f"image_token_length={self.image_token_length}, "
211
+ f"base_size={self.base_size}, ratio_index={self.ratio_index}")
212
+
213
+ @property
214
+ def meta_info(self):
215
+ # Used for image sections of tkwrapper.encode_general()
216
+ if self.image_type in ["vae", "gen_image"]:
217
+ return dict(
218
+ token_length=self.image_token_length,
219
+ add_timestep_token=self.add_timestep_token,
220
+ add_guidance_token=self.add_guidance_token,
221
+ use_front_boi_token=self.use_front_boi_token,
222
+ add_image_shape_token=self.add_image_shape_token,
223
+ base_size=self.base_size,
224
+ ratio_idx=self.ratio_index,
225
+ # for rope 2d
226
+ token_height=self.token_height,
227
+ token_width=self.token_width,
228
+ # for bc
229
+ image_height=self.image_height,
230
+ image_width=self.image_width,
231
+ )
232
+ elif self.image_type in ["vit"]:
233
+ return dict(
234
+ token_length=self.image_token_length,
235
+ use_front_boi_token=self.use_front_boi_token,
236
+ add_image_shape_token=self.add_image_shape_token,
237
+ # for rope 2d
238
+ token_height=self.token_height,
239
+ token_width=self.token_width,
240
+ # for bc
241
+ image_height=self.image_height,
242
+ image_width=self.image_width,
243
+ )
244
+ else:
245
+ raise ValueError(f"Unknown image type '{self.image_type}'")
246
+
247
+ @property
248
+ def num_special_tokens(self):
249
+ if self.args is None:
250
+ raise ValueError("meta_info requires `args` attribute to be set.")
251
+ if self.image_type in ["vae", "src_image", "gen_image"]:
252
+ count = (
253
+ 2 + # <boi> + <eoi> or <src_boi> + <src_eoi>
254
+ (1 if self.add_timestep_token else 0) +
255
+ (1 if self.add_guidance_token else 0) +
256
+ (2 if self.add_image_shape_token else 0)
257
+ )
258
+ else:
259
+ raise ValueError(f"Unknown image_type: {self.image_type}")
260
+ return count
261
+
262
+ def copy(self, copy_image_tensor=True):
263
+ if copy_image_tensor and self.image_tensor is None:
264
+ raise ValueError("image_tensor is None, cannot copy")
265
+ return ImageInfo(
266
+ image_type=self.image_type,
267
+ image_tensor=self.image_tensor.clone() if copy_image_tensor else None,
268
+ image_width=self.image_width,
269
+ image_height=self.image_height,
270
+ token_width=self.token_width,
271
+ token_height=self.token_height,
272
+ image_token_length=self.image_token_length,
273
+ base_size=self.base_size,
274
+ ratio_index=self.ratio_index,
275
+ )
276
+
277
+ def zeros_(self):
278
+ self.image_tensor = torch.zeros_like(self.image_tensor)
279
+
280
+
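A small sketch tying ImageInfo to the ResolutionGroup above. It assumes a 16x16-pixel token grid (the same h//16 * w//16 accounting printed by ResolutionGroup.__repr__); the patch size actually used elsewhere in the pipeline is an assumption of this example, not something defined in this file.

rg = ResolutionGroup(base_size=1024, step=64, align=16)
img_w, img_h = rg.get_target_size(width=1920, height=1080)
base, ratio_idx = rg.get_base_size_and_ratio_index(width=1920, height=1080)

info = ImageInfo(
    image_type="gen_image",
    image_width=img_w, image_height=img_h,
    token_width=img_w // 16, token_height=img_h // 16,   # assumed 16-pixel tokens
    base_size=base, ratio_index=ratio_idx,
)
print(info.image_token_length)            # token_width * token_height
print(info.meta_info["token_length"])     # same value, as consumed by encode_general()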
281
+ class ImageTensor(torch.Tensor):
282
+ # This class is just for type hinting purposes. Attribute `i` should be defined
283
+ # as an instance attribute of the torch.Tensor instance, like: tensor.i = ImageInfo(...)
284
+ i: ImageInfo
285
+ vision_encoder_kwargs: dict
286
+
287
+
288
+ class JointImageInfo(object):
289
+ def __init__(self, vae_image_info: ImageInfo, vision_image_info: ImageInfo, vision_encoder_kwargs: dict = None):
290
+ self.vae_image_info = vae_image_info
291
+ self.vision_image_info = vision_image_info
292
+ self.vision_encoder_kwargs = vision_encoder_kwargs
293
+
294
+ # Define key attributes to align with ImageInfo for uniformity
295
+ self.image_type = "joint_image"
296
+ self.image_token_length = vae_image_info.image_token_length + vision_image_info.image_token_length
297
+
298
+ self.add_timestep_token = vae_image_info.add_timestep_token
299
+ self.use_front_boi_token = vae_image_info.use_front_boi_token
300
+ self.add_image_shape_token = vae_image_info.add_image_shape_token
301
+
302
+ def __repr__(self):
303
+ return f"JointImageInfo(vae_image={self.vae_image_info}, vision_image={self.vision_image_info})"
304
+
305
+ @property
306
+ def meta_info(self):
307
+ # Used for image sections of tkwrapper.encode_general()
308
+ return dict(
309
+ token_length=[self.vae_image_info.image_token_length, self.vision_image_info.image_token_length],
310
+ add_timestep_token=self.add_timestep_token,
311
+ use_front_boi_token=self.use_front_boi_token,
312
+ add_image_shape_token=self.add_image_shape_token,
313
+ base_size=self.vae_image_info.base_size,
314
+ ratio_idx=self.vae_image_info.ratio_index,
315
+ # for rope 2d
316
+ token_height=[self.vae_image_info.token_height, self.vision_image_info.token_height],
317
+ token_width=[self.vae_image_info.token_width, self.vision_image_info.token_width],
318
+ # for bc
319
+ image_height=[self.vae_image_info.image_height, self.vision_image_info.image_height],
320
+ image_width=[self.vae_image_info.image_width, self.vision_image_info.image_width],
321
+ )
322
+
323
+ @property
324
+ def num_special_tokens(self):
325
+ return (
326
+ 2 + # <boi> + <eoi>
327
+ (1 if self.add_timestep_token else 0) +
328
+ (2 if self.add_image_shape_token else 0) +
329
+ 1 # <joint_image_sep>
330
+ )
331
+
332
+ def copy(self, copy_image_tensor=True):
333
+ if copy_image_tensor and (
334
+ self.vae_image_info.image_tensor is None or self.vision_image_info.image_tensor is None):
335
+ raise ValueError("image_tensor is None, cannot copy")
336
+ return JointImageInfo(
337
+ self.vae_image_info.copy(copy_image_tensor),
338
+ self.vision_image_info.copy(copy_image_tensor),
339
+ self.vision_encoder_kwargs,
340
+ )
341
+
342
+ def zeros_(self):
343
+ self.vae_image_info.zeros_()
344
+ self.vision_image_info.zeros_()
345
+
346
+
347
+ class JointImage(object):
348
+ def __init__(self, vae_image: ImageTensor, vision_image: ImageTensor):
349
+ self.vae_image = vae_image
350
+ self.vision_image = vision_image
351
+ self.i = JointImageInfo(vae_image.i, vision_image.i)
352
+
353
+
354
+ class TokenizerEncodeOutput(BaseOutput):
355
+ tokens: torch.Tensor = None
356
+ timestep_scatter_index: Optional[torch.Tensor] = None
357
+ guidance_scatter_index: Optional[torch.Tensor] = None
358
+ text_slices: Optional[List[slice]] = None
359
+ gen_image_slices: Optional[List[slice]] = None
360
+ joint_image_slices: Optional[List[slice]] = None
361
+ cond_vae_image_slices: Optional[List[slice]] = None
362
+ cond_vit_image_slices: Optional[List[slice]] = None
363
+ text_mask: Optional[torch.Tensor] = None
364
+ gen_image_mask: Optional[torch.Tensor] = None
365
+ cond_vae_image_mask: Optional[torch.Tensor] = None
366
+ cond_vit_image_mask: Optional[torch.Tensor] = None
367
+ real_pos: Optional[torch.Tensor] = None
368
+ all_image_slices: Optional[List[slice]] = None
369
+ cond_timestep_scatter_index: Optional[torch.Tensor] = None
370
+ gen_timestep_scatter_index: Optional[torch.Tensor] = None
371
+
372
+
373
+ class Conversation:
374
+ roles: List[str] = ["User", "Assistant"]
375
+ sep: str = "\n\n"
376
+
377
+
378
+ class TokenizerWrapper(object):
379
+ def __init__(self, tokenizer):
380
+ if isinstance(tokenizer, str):
381
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
382
+ else:
383
+ self.tokenizer = tokenizer
384
+
385
+ # Define short names
386
+ self.bos_token_id = self.tokenizer.bos_token_id
387
+ self.eos_token_id = self.tokenizer.eos_token_id
388
+ self.pad_token_id = self.tokenizer.pad_token_id
389
+ self.boi_token_id = self.tokenizer.convert_tokens_to_ids("<boi>")
390
+ self.eoi_token_id = self.tokenizer.convert_tokens_to_ids("<eoi>")
391
+ self.img_token_id = self.tokenizer.convert_tokens_to_ids("<img>")
392
+ self.cfg_token_id = self.tokenizer.convert_tokens_to_ids("<cfg>")
393
+ self.end_answer_token_id = self.tokenizer.convert_tokens_to_ids("</answer>")
394
+ self.end_recaption_token_id = self.tokenizer.convert_tokens_to_ids("</recaption>")
395
+ self.ratio_token_offset = self.tokenizer.convert_tokens_to_ids("<img_ratio_0>")
396
+ self.special_token_map = self.tokenizer.added_tokens_encoder
397
+
398
+ def pad(self, tensor_list, dim=0, pad_val=None):
399
+ if pad_val is None:
400
+ pad_val = self.pad_token_id
401
+ max_len = max([t.shape[dim] for t in tensor_list])
402
+ padded_tensor_list = []
403
+ for t in tensor_list:
404
+ if t.shape[dim] < max_len:
405
+ assert pad_val is not False, "Padding is not allowed for this tensor."
406
+ t = F.pad(t, (0, max_len - t.shape[dim]), value=pad_val)
407
+ padded_tensor_list.append(t)
408
+ return padded_tensor_list
409
+
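A tiny sketch of the pad helper above: 1-D token tensors are right-padded with the tokenizer's <pad> id (or a caller-supplied value) to the longest length in the list. The tokenizer path is a placeholder; the wrapper requires a tokenizer that already defines the special tokens looked up in __init__ (<boi>, <eoi>, <img>, <cfg>, ...).

tk = TokenizerWrapper("path/to/tokenizer")   # placeholder checkpoint

a = torch.tensor([1, 2, 3])
b = torch.tensor([4, 5])
padded = tk.pad([a, b])                      # b becomes [4, 5, <pad_id>]
padded_zero = tk.pad([a, b], pad_val=0.0)    # explicit pad value, e.g. for masks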
410
+ def encode(self, *args, **kwargs):
411
+ return self.tokenizer.encode(*args, **kwargs)
412
+
413
+ def decode(self, *args, **kwargs):
414
+ return self.tokenizer.decode(*args, **kwargs)
415
+
416
+ def encode_text(
417
+ self,
418
+ *texts,
419
+ uncond_enabled: Optional[Union[bool, List[bool]]] = None,
420
+ uncond_p: Optional[float] = None,
421
+ max_length: Optional[int] = None,
422
+ pad: Optional[str] = None,
423
+ return_lengths: bool = False,
424
+ ):
425
+ """
426
+ Encode text for AR-like model training of the text-to-image/instruction tuning tasks.
427
+ Supports encoding multiple texts at once. Each text can be separately conditioned or unconditioned
428
+ based on the uncond_enabled flags and a uniform uncond_p.
429
+ **<bos> token is always prepended to the text tokens.**
430
+
431
+ Parameters
432
+ ----------
433
+ texts: str or List[str]
434
+ List of texts to be encoded.
435
+ uncond_enabled: bool or List[bool]
436
+ List of flags to indicate whether the text should be unconditioned.
437
+ If False, the text will never be unconditioned.
438
+ If True, the text will be unconditioned with uncond_p.
439
+ uncond_p: float
440
+ Probability of replacing the text with unconditional (<cfg>) tokens. Only works when uncond_enabled is True.
441
+ max_length: int
442
+ Maximum length of the encoded text.
443
+ pad: Optional[str]
444
+ Padding method. Can be 'left' or 'right'.
445
+ return_lengths: bool
446
+ Whether to return the length of each encoded text.
447
+ """
448
+ if pad is not None:
449
+ assert max_length is not None, "max_length should be provided when pad is not None."
450
+
451
+ if uncond_enabled is None:
452
+ uncond_enabled = [True] * len(texts)
453
+ elif isinstance(uncond_enabled, bool):
454
+ uncond_enabled = [uncond_enabled] * len(texts)
455
+ if len(uncond_enabled) != len(texts):
456
+ print(uncond_enabled, texts)
457
+ assert len(uncond_enabled) == len(texts), (
458
+ f"Length of uncond_flags should be equal to the number of texts, "
459
+ f"but got {len(uncond_enabled)} and {len(texts)}."
460
+ )
461
+
462
+ # Prepare text/uncond tokens
463
+ # TODO: If len(texts) > 1, such as instruction + prompt in inpainting, we need to determine how to do uncond.
464
+ # Now all texts will be cond or uncond at the same time.
465
+ do_uncond_drop = (uncond_p is not None) and (random.random() < uncond_p)
466
+ text_tokens, lengths = [], []
467
+ cum_length = 0
468
+ for text, uncond_flag in zip(texts, uncond_enabled):
469
+ # If max_length is reached and there are still unencoded texts, give a warning message and break the loop.
470
+ if max_length is not None and cum_length >= max_length:
471
+ warnings.warn(
472
+ f"Text length exceeds the max_length({max_length}). The remaining texts will be ignored: "
473
+ f"{text[:80]}..."
474
+ )
475
+ break
476
+ # Set add_special_tokens=False to avoid adding <bos> token in some LLMs.
477
+ if isinstance(text, str):
478
+ text_token = self.tokenizer.encode(text, add_special_tokens=False)
479
+ else:
480
+ text_token = text
481
+ if uncond_flag and do_uncond_drop:
482
+ text_token = [self.cfg_token_id] * len(text_token)
483
+ # Cutoff the text by max_length if necessary
484
+ if max_length is not None and (cum_length + len(text_token)) > max_length:
485
+ text_token = text_token[:max_length - cum_length]
486
+ text_tokens.extend(text_token)
487
+ lengths.append(len(text_token))
488
+ cum_length += len(text_token)
489
+
490
+ # Prepend/Append <pad> tokens if applicable
491
+ if pad is not None and (pad_length := max_length - len(text_tokens)) > 0:
492
+ if pad == 'left':
493
+ text_tokens = [self.pad_token_id] * pad_length + text_tokens
494
+ elif pad == 'right':
495
+ text_tokens = text_tokens + [self.pad_token_id] * pad_length
496
+ else:
497
+ raise ValueError(f"Unsupported padding method: {pad}.")
498
+
499
+ if return_lengths:
500
+ return text_tokens, lengths
501
+ return text_tokens
502
+
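Continuing with the same placeholder tk wrapper, a hypothetical usage of encode_text above. The section lengths and prompts are illustrative.

tokens, lengths = tk.encode_text(
    "a cat sitting on a windowsill",
    "photorealistic, 50mm lens",
    uncond_enabled=[True, False],   # only the first section may be dropped for CFG
    uncond_p=0.1,                   # one coin flip decides whether enabled sections become <cfg> tokens
    max_length=256,
    pad="right",                    # right-pad with <pad> up to max_length
    return_lengths=True,
)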
503
+ @staticmethod
504
+ def _check_key_number_matched(keys, data):
505
+ # Assert keys and token_source are matched
506
+ assert set(keys) == set(data.keys()), (
507
+ f"Keys in the template and token source should be matched, but got {set(keys)} and {list(data.keys())}."
508
+ )
509
+ key_counts = {k: 0 for k in keys}
510
+ for key in keys:
511
+ key_counts[key] += 1
512
+ for key, count in key_counts.items():
513
+ assert len(data[key]) == count, (
514
+ f"Number of `{key}` in the token source should be matched with the template, but got "
515
+ f"{data[key]}({len(data[key])}) and {count}."
516
+ )
517
+
518
+ def _add_image_meta_info_token(self, token_seq, token_count, extra_token_pos, add_timestep_token=False,
519
+ add_image_shape_token=False, base_size=None, ratio_idx=None, image_type=None,
520
+ add_guidance_token=False):
521
+ if add_image_shape_token:
522
+ token_seq.extend([
523
+ self.special_token_map[f"<img_size_{base_size}>"],
524
+ self.special_token_map[f"<img_ratio_{ratio_idx}>"]
525
+ ])
526
+ token_count += 2
527
+ if add_timestep_token:
528
+ token_seq.extend([self.special_token_map["<timestep>"]])
529
+ extra_token_pos['timestep'].append(token_count)
530
+ if image_type is not None:
531
+ if image_type == "gen_image":
532
+ extra_token_pos['gen_timestep'].append(token_count)
533
+ elif image_type in ["joint_image"]:
534
+ extra_token_pos['cond_timestep'].append(token_count)
535
+ else:
536
+ raise ValueError(f"Unsupported image type: {image_type}.")
537
+ token_count += 1
538
+ if add_guidance_token:
539
+ token_seq.extend([self.special_token_map["<guidance>"]])
540
+ extra_token_pos['guidance'].append(token_count)
541
+ token_count += 1
542
+ return token_count
543
+
544
+ @staticmethod
545
+ def _shorten_text(text):
546
+ import re
547
+ text = re.sub(r"(<img>)+", lambda m: f"[<img>]{{{len(m.group(0)) // 5}}}", text)
548
+ text = re.sub(r"(<pad>)+", lambda m: f"[<pad>]{{{len(m.group(0)) // 5}}}", text)
549
+ return text
550
+
551
+ def encode_sequence(
552
+ self,
553
+ template: str,
554
+ token_source: Dict[str, List],
555
+ total_length=None,
556
+ add_timestep_token=False,
557
+ add_guidance_token=False,
558
+ last_key_only_prefix=False,
559
+ add_eos=True,
560
+ use_front_boi_token=True,
561
+ add_pad=True,
562
+ add_bos=True,
563
+ drop_last: Union[str, bool] = 'auto',
564
+ add_image_shape_token=False,
565
+ ):
566
+ """
567
+ Encode a sequence based on the template (e.g., `text-image` for t2i, `text-image-image` for instruction tuning)
568
+ and token source.
569
+
570
+ Parameters
571
+ ----------
572
+ template: str
573
+ Template of the sequence. E.g., "text-gen_image" means the sequence is composed of text and an image.
574
+ "text-text-gen_image" means the sequence is composed of two sections of text and an image.
575
+ token_source: Dict[str, List]
576
+ Token source for each key in the template, in order.
577
+ - text: List[Dict].
578
+ - gen_image: List[Dict].
579
+ - joint_image: List[Dict].
580
+ total_length: int
581
+ Total length of the encoded sequence, include padding tokens.
582
+ add_timestep_token: bool
583
+ Whether to add timestep token before the image tokens.
584
+ (Right after the <img_ratio_*><img_size_*> tokens)
585
+ add_guidance_token: bool
586
+ Whether to add guidance token before the image tokens.
587
+ last_key_only_prefix: bool
588
+ Whether to only use the modal prefix in the last key.
589
+ add_eos: bool or 'auto'
590
+ Whether to add eos token at the end of the sequence. If True, always add eos token. If 'auto',
591
+ add eos token only when the total_length is not reached and the last token is not <eos>.
592
+ use_front_boi_token: bool
593
+ Whether to put the <boi> token in front of the image size, ratio and timestep tokens.
594
+ add_pad: bool or 'auto'
595
+ Whether to add padding tokens to the sequence. If True and total_length is not reached, add padding tokens.
596
+ add_bos: bool
597
+ Whether to add bos token at the beginning of the sequence.
598
+ drop_last: bool or 'auto'
599
+ - If auto, drop last tokens exceeding the total_length if the total_length is provided. If cut point is
600
+ in the middle of the image tokens, an error will be raised.
601
+ - If True, drop last tokens exceeding the total_length. If cut point is in the middle of the image tokens,
602
+ all the successive image tokens will be dropped.
603
+ - If False, keep the last tokens exceeding the total_length, even if the total_length is reached.
604
+ add_image_shape_token: bool
605
+ Whether to add image shape token before the image tokens. (Right before the <timestep> token)
606
+
607
+ Returns
608
+ -------
609
+ token_seq: list
610
+ Encoded token sequence.
611
+ extra_token_pos: dict
612
+ Positions of extra tokens.
613
+ """
614
+ if last_key_only_prefix:
615
+ assert add_eos is not True, "add_eos should not be True when last_key_only_prefix is True."
616
+ if drop_last is True and total_length is None:
617
+ raise ValueError("total_length should be provided when drop_last is True.")
618
+
619
+ keys = template.split('-')
620
+ modal_length = len(keys)
621
+ index_indicator = {k: 0 for k in token_source}
622
+ for k, v in token_source.items():
623
+ assert isinstance(v, (list, tuple)), (
624
+ f"Value of `{k}` in the token source should be a list or tuple, but got {type(v)}."
625
+ )
626
+ self._check_key_number_matched(keys, token_source)
627
+
628
+ token_seq = []
629
+ token_count = 0
630
+ extra_token_pos = defaultdict(list)
631
+ if add_bos:
632
+ token_seq.append(self.bos_token_id)
633
+ token_count += 1
634
+ # If drop_last is True, we check the token_count on the fly and exit the loop if the total_length is reached.
635
+ # This check is only applied to the block tokens. Block tokens mean the tokens that are unsplittable, like
636
+ # image tokens. Text tokens are splittable, so we don't need to check the token_count for text.
637
+ # If the loop is broken by drop_last, we don't add the eos token at the end because the sequence is not
638
+ # complete.
639
+ drop_last_break = False
640
+ for i, key in enumerate(keys):
641
+ source = token_source[key][index_indicator[key]]
642
+ if key == "text":
643
+ token_seq.extend(source) # text token sequence
644
+ extra_token_pos["<text>_start"].append(token_count)
645
+ token_count += len(source)
646
+ extra_token_pos["<text>_end"].append(token_count - 1)
647
+
648
+ elif key == "gen_image":
649
+ if isinstance(source, int):
650
+ source = {'length': source}
651
+ extra_count = 2 + (
652
+ 1 if source.get('timestep', add_timestep_token) else 0) + (
653
+ 1 if source.get('guidance', add_guidance_token) else 0) + (
654
+ 2 if source.get('image_shape', add_image_shape_token) else 0
655
+ )
656
+ if drop_last is True and token_count + extra_count + source['length'] > total_length:
657
+ drop_last_break = True
658
+ break
659
+ if source.get('front_boi', use_front_boi_token):
660
+ token_seq.append(self.boi_token_id)
661
+ extra_token_pos["boi"].append(token_count)
662
+ token_count += 1
663
+ token_count = self._add_image_meta_info_token(
664
+ token_seq=token_seq,
665
+ token_count=token_count,
666
+ extra_token_pos=extra_token_pos,
667
+ add_timestep_token=source.get('timestep', add_timestep_token),
668
+ add_guidance_token=source.get('guidance', add_guidance_token),
669
+ add_image_shape_token=source.get('image_shape', add_image_shape_token),
670
+ base_size=source.get('base_size'),
671
+ ratio_idx=source.get('ratio_idx'),
672
+ image_type=key,
673
+ )
674
+ if not source.get('front_boi', use_front_boi_token):
675
+ token_seq.append(self.boi_token_id)
676
+ extra_token_pos["boi"].append(token_count)
677
+ token_count += 1
678
+ if last_key_only_prefix and i == modal_length - 1:
679
+ pass # for AR inference
680
+ else:
681
+ token_seq.extend(
682
+ [self.img_token_id] * source['length'] + # token number
683
+ [self.eoi_token_id]
684
+ )
685
+ extra_token_pos["<img>_start"].append(token_count)
686
+ extra_token_pos["<all_img>_start"].append(token_count)
687
+ token_count += source['length']
688
+ extra_token_pos["<img>_end"].append(token_count - 1)
689
+ extra_token_pos["<all_img>_end"].append(token_count - 1)
690
+ extra_token_pos["eoi"].append(token_count)
691
+ token_count += 1 # <eoi>
692
+
693
+ elif key == "joint_image":
694
+ assert isinstance(source['length'], list) and len(
695
+ source['length']) == 2, "joint_image length should be a list of two integers"
696
+ extra_count = 2 + 1 + ( # boi, eoi, joint_img_sep
697
+ 1 if source.get('timestep', add_timestep_token) else 0) + (
698
+ 2 if source.get('image_shape', add_image_shape_token) else 0
699
+ )
700
+ if drop_last is True and token_count + extra_count + sum(source['length']) > total_length:
701
+ drop_last_break = True
702
+ break
703
+ if source.get('front_boi', use_front_boi_token):
704
+ token_seq.append(self.boi_token_id) # Use patched boi for Janus, otherwise use default <boi>
705
+ extra_token_pos["boi"].append(token_count)
706
+ token_count += 1
707
+ token_count = self._add_image_meta_info_token(
708
+ token_seq=token_seq,
709
+ token_count=token_count,
710
+ extra_token_pos=extra_token_pos,
711
+ add_timestep_token=source.get('timestep', add_timestep_token),
712
+ add_image_shape_token=source.get('image_shape', add_image_shape_token),
713
+ base_size=source.get('base_size'),
714
+ ratio_idx=source.get('ratio_idx'),
715
+ image_type=key,
716
+ )
717
+ if not source.get('front_boi', use_front_boi_token):
718
+ token_seq.append(self.boi_token_id)
719
+ extra_token_pos["boi"].append(token_count)
720
+ token_count += 1
721
+ if last_key_only_prefix and i == modal_length - 1:
722
+ pass # for AR inference
723
+ else:
724
+ token_seq.extend(
725
+ [self.img_token_id] * source['length'][0]
726
+ )
727
+ extra_token_pos["<vae_img>_start"].append(token_count)
728
+ extra_token_pos["<joint_img>_start"].append(token_count)
729
+ extra_token_pos["<all_img>_start"].append(token_count)
730
+ token_count += source['length'][0]
731
+ extra_token_pos["<vae_img>_end"].append(token_count - 1)
732
+ extra_token_pos["<all_img>_end"].append(token_count - 1)
733
+
734
+ token_seq.extend(
735
+ [self.special_token_map["<joint_img_sep>"]]
736
+ )
737
+ extra_token_pos["joint_img_sep"].append(token_count)
738
+ token_count += 1
739
+
740
+ token_seq.extend(
741
+ [self.img_token_id] * source['length'][1]
742
+ )
743
+ extra_token_pos["<vit_img>_start"].append(token_count)
744
+ extra_token_pos["<all_img>_start"].append(token_count)
745
+ token_count += source['length'][1]
746
+ extra_token_pos["<vit_img>_end"].append(token_count - 1)
747
+ extra_token_pos["<joint_img>_end"].append(token_count - 1)
748
+ extra_token_pos["<all_img>_end"].append(token_count - 1)
749
+
750
+ token_seq.extend(
751
+ [self.eoi_token_id]
752
+ )
753
+ extra_token_pos["eoi"].append(token_count)
754
+ token_count += 1 # <eoi>
755
+
756
+ else:
757
+ raise ValueError(f"Not supported key: {key}")
758
+ index_indicator[key] += 1
759
+
760
+ if add_eos is True and not drop_last_break:
761
+ # Typically used for t2i task.
762
+ token_seq.append(self.eos_token_id)
763
+ extra_token_pos["eos"].append(token_count)
764
+ token_count += 1
765
+ elif add_eos == 'auto' and not drop_last_break:
766
+ # Typically used for lm and mmu task.
767
+ if token_seq[-1] != self.eos_token_id and (total_length is None or token_count < total_length):
768
+ token_seq.append(self.eos_token_id)
769
+ extra_token_pos["eos"].append(token_count)
770
+ token_count += 1
771
+
772
+ if total_length:
773
+ # Check token count and clip sequence if necessary
774
+ if token_count > total_length and drop_last:
775
+ # Assert clip position is not in the middle of the block-wise tokens (gen_image, joint_image)
776
+ for start_key, end_key in [
777
+ ("<img>_start", "<img>_end"), ("<joint_img>_start", "<joint_img>_end"),
778
+ ("<vae_img>_start", "<vae_img>_end"), ("<vit_img>_start", "<vit_img>_end"),
779
+ ]:
780
+ if start_key in extra_token_pos and end_key in extra_token_pos:
781
+ assert all(
782
+ (start > total_length or end + 1 < total_length)
783
+ for start, end in zip(extra_token_pos[start_key], extra_token_pos[end_key])
784
+ ), ("Clip position should not be in the middle of the image tokens.\n"
785
+ f"Below is the text:\n{self._shorten_text(self.tokenizer.decode(token_seq))}")
786
+ token_seq = token_seq[:total_length]
787
+
788
+ # Pad the sequence if necessary
789
+ pad_num = max(0, total_length - len(token_seq))
790
+ if add_pad and pad_num:
791
+ token_seq.extend([self.pad_token_id] * pad_num)
792
+ extra_token_pos["first_pad"].append(token_count)
793
+
794
+ return token_seq, extra_token_pos
795
+
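Continuing the sketch above (same placeholder tk wrapper), this shows how encode_sequence assembles a text-to-image sequence from a "text-gen_image" template; the image token length and total length are illustrative.

text_tokens = tk.encode_text("a red bicycle leaning against a brick wall")
seq, pos = tk.encode_sequence(
    template="text-gen_image",
    token_source={
        "text": [text_tokens],
        "gen_image": [dict(length=4096, timestep=True, image_shape=True,
                           base_size=1024, ratio_idx=0)],
    },
    total_length=8192,
)
# pos["<img>_start"][0] / pos["<img>_end"][0] delimit the <img> placeholder run,
# and pos["timestep"] holds the position where the <timestep> embedding is scattered.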
796
+ def batch_gen_infer(
797
+ self,
798
+ infer_fn,
799
+ prompt_list: list,
800
+ negative_prompt_list: list = None,
801
+ infer_fn_kwargs_list: List[Dict[str, int]] = None,
802
+ do_classifier_free_guidance=False,
803
+ condition_repeat_times: int = 1,
804
+ uncondition_repeat_times: int = 1,
805
+ ):
806
+ """
807
+ Batch inference for the AR-like model training of the text-to-image/instruction tuning tasks.
808
+
809
+ Parameters
810
+ ----------
811
+ infer_fn: callable
812
+ Inference function to encode the prompt.
813
+ prompt_list: list
814
+ List of prompts. Each element can be a single prompt or a list of prompts passed to the infer_fn.
815
+ negative_prompt_list: list
816
+ List of negative prompts. Only used when do_classifier_free_guidance is True. If None, will use <cfg>
817
+ token sequence as negative prompt.
818
+ infer_fn_kwargs_list: List[Dict[str, int]]
819
+ List of keyword arguments for the infer_fn.
820
+ do_classifier_free_guidance: bool
821
+ Whether to do classifier-free guidance.
822
+ condition_repeat_times: int
823
+ Support multi-condition.
824
+ uncondition_repeat_times: int
825
+ Support multi-uncondition.
826
+ """
827
+ if infer_fn_kwargs_list is None:
828
+ infer_fn_kwargs_list = [{} for _ in prompt_list]
829
+
830
+ # [n_output, bsz]
831
+ cond_results_list = None
832
+ uncond_results_list = None
833
+ output_type_list = []
834
+
835
+ for prompt_idx, (prompt, infer_fn_kwargs) in enumerate(zip(prompt_list, infer_fn_kwargs_list)):
836
+ if not isinstance(prompt, (list, tuple)):
837
+ prompt = [prompt]
838
+ cond_kwargs = {"uncond_p": 0.0} if do_classifier_free_guidance else {}
839
+ results = infer_fn(
840
+ *prompt,
841
+ **infer_fn_kwargs,
842
+ **cond_kwargs,
843
+ )
844
+ output_type_list.append((type(results), len(results) if isinstance(results, (list, tuple)) else 1))
845
+ if isinstance(results, dict):
846
+ raise ValueError("Make batch on dict is not supported. Please return list or tuple for infer_fn.")
847
+ if not isinstance(results, (list, tuple)):
848
+ results = (results,)
849
+ if cond_results_list is None:
850
+ cond_results_list = [[] for _ in results]
851
+ uncond_results_list = [[] for _ in results]
852
+ for i, result in enumerate(results):
853
+ cond_results_list[i].append(result)
854
+
855
+ if do_classifier_free_guidance:
856
+ if negative_prompt_list is None:
857
+ uncond_kwargs = {"uncond_p": 1.0}
858
+ uncond_results = infer_fn(
859
+ *prompt,
860
+ **infer_fn_kwargs,
861
+ **uncond_kwargs,
862
+ )
863
+ else:
864
+ negative_prompt = negative_prompt_list[prompt_idx]
865
+ if not isinstance(negative_prompt, (list, tuple)):
866
+ negative_prompt = [negative_prompt]
867
+ uncond_results = infer_fn(
868
+ *negative_prompt,
869
+ **infer_fn_kwargs,
870
+ )
871
+ if isinstance(uncond_results, TokenizerEncodeOutput):
872
+ uncond_results_list[0].append(uncond_results)
873
+ else:
874
+ for i, result in enumerate(uncond_results):
875
+ uncond_results_list[i].append(result)
876
+
877
+ assert all(output_type_list[0] == n for n in output_type_list), \
878
+ f"Number of outputs should be equal for all samples, but got {output_type_list}."
879
+ output_type, output_num = output_type_list[0]
880
+
881
+ def make_batch(batch_cond_item, batch_uncond_item):
882
+ # Process each output item to make batch
883
+ first = batch_cond_item[0] # The first element in the batch
884
+ if isinstance(first, torch.Tensor):
885
+ stacked_item = torch.stack(self.pad(
886
+ batch_cond_item * condition_repeat_times + batch_uncond_item * uncondition_repeat_times,
887
+ ))
888
+
889
+ elif first is None:
890
+ assert all(item is None for item in batch_cond_item + batch_uncond_item), \
891
+ (f"The first cond item is None, but some items are not None:\n\n"
892
+ f"condition: {batch_cond_item}\n\n"
893
+ f"uncondition: {batch_uncond_item}")
894
+ stacked_item = None
895
+
896
+ elif isinstance(first, (list, tuple)):
897
+ # If the output item is a list or tuple, we treat it as a whole, and won't make nested batch any more.
898
+ stacked_item = batch_cond_item * condition_repeat_times + batch_uncond_item * uncondition_repeat_times
899
+
900
+ elif isinstance(first, TokenizerEncodeOutput):
901
+ stacked_item = {}
902
+ # Traverse not-None attributes
903
+ for key in list(first.keys()):
904
+ merged_list = [cond_item[key] for cond_item in batch_cond_item] * condition_repeat_times + \
905
+ [uncond_item[key] for uncond_item in batch_uncond_item] * uncondition_repeat_times
906
+ if isinstance(first[key], torch.Tensor):
907
+ if 'mask' in key:
908
+ pad_val = 0.0
909
+ elif key == 'tokens':
910
+ pad_val = self.special_token_map["<pad>"]
911
+ else:
912
+ pad_val = False # Should not pad for other tensors
913
+ stacked_item[key] = torch.stack(self.pad(merged_list, pad_val=pad_val), dim=0)
914
+ elif isinstance(first[key], list):
915
+ stacked_item[key] = merged_list
916
+ elif first[key] is None:
917
+ pass
918
+ else:
919
+ raise ValueError(f"Unsupported type of {key}: {type(first[key])}.")
920
+ stacked_item = TokenizerEncodeOutput(stacked_item)
921
+
922
+ else:
923
+ raise TypeError(f"Making batch on type {type(first)} is not supported.")
924
+
925
+ return stacked_item
926
+
927
+ stacked_outputs = []
928
+ for cond_results, uncond_results in zip(cond_results_list, uncond_results_list):
929
+ stacked_outputs.append(make_batch(cond_results, uncond_results))
930
+
931
+ if output_type == list:
932
+ return stacked_outputs
933
+ elif output_type == tuple:
934
+ return tuple(stacked_outputs)
935
+ elif output_num == 1:
936
+ return stacked_outputs[0]
937
+ else:
938
+ raise ValueError(f"Unsupported output type: {output_type}.")
939
+
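A sketch of batch_gen_infer for classifier-free guidance. my_encode_fn is a hypothetical callable (not defined in this file) that encodes a single prompt into a TokenizerEncodeOutput and accepts an uncond_p keyword, as batch_gen_infer expects.

batched = tk.batch_gen_infer(
    infer_fn=my_encode_fn,                 # hypothetical per-prompt encode function
    prompt_list=["a red bicycle", "a snowy street at night"],
    do_classifier_free_guidance=True,      # appends an unconditional encoding per prompt
)
# batched.tokens stacks [cond_0, cond_1, uncond_0, uncond_1], padded to a common length.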
940
+ @staticmethod
941
+ def parse_extra_token_pos(extra_token_pos, prefix, tokens, rng=None):
942
+ if rng is None:
943
+ rng = slice(None)
944
+ image_slices = [
945
+ slice(start, end + 1)
946
+ for start, end in zip(extra_token_pos[f'<{prefix}>_start'][rng], extra_token_pos[f'<{prefix}>_end'][rng])
947
+ ] if f'<{prefix}>_start' in extra_token_pos and f'<{prefix}>_end' in extra_token_pos else []
948
+ if image_slices:
949
+ image_mask = torch.zeros_like(tokens, dtype=torch.bool)
950
+ for image_slice in image_slices:
951
+ image_mask[image_slice] = True
952
+ else:
953
+ image_mask = None
954
+ return image_slices, image_mask
955
+
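Continuing the encode_sequence sketch above, parse_extra_token_pos turns the recorded '<img>_start'/'<img>_end' positions into slices plus a boolean mask over the token sequence.

img_slices, img_mask = TokenizerWrapper.parse_extra_token_pos(pos, "img", torch.tensor(seq))
# img_mask is True exactly on the <img> placeholder tokens, or None if no image section exists.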
956
+ def encode_general(
957
+ self,
958
+ sections: Optional[List[Dict[str, Any]]] = None,
959
+ max_token_length: Optional[int] = None,
960
+ add_eos='auto',
961
+ use_text_mask=True,
962
+ add_pad='auto',
963
+ add_bos=True,
964
+ drop_last='auto',
965
+ ):
966
+ """
967
+ General encode function to encode a sequence with multiple sections of text and images.
968
+ Each section is a dict with a `type` key and other keys depending on the type.
969
+ Supported section types:
970
+ - text: dict with keys:
971
+ - text: str or List[int], text to be encoded. Either `text` or `tokens` should be provided.
972
+ - tokens: List[int], pre-encoded text tokens. Either `text` or `tokens` should be provided.
973
+ - uncond_enabled: bool, whether to enable uncondition for this text section.
974
+ - uncond_p: float, probability to drop the text section for uncondition.
975
+ - max_length: int, maximum length of the text section.
976
+ - ignore: bool, whether to ignore this text section in the text mask.
977
+ - start_offset: int, start offset of the text mask.
978
+ - end_offset: int, end offset of the text mask.
979
+ - gen_image: dict with keys:
980
+ - token_length: int, number of image tokens.
981
+ - add_timestep_token: bool, whether to add timestep token before the image tokens.
982
+ - add_guidance_token: bool, whether to add guidance token before the image tokens.
983
+ - use_front_boi_token: bool, whether to put the <boi> token at the front of size, ratio and timestep tokens.
984
+ - add_image_shape_token: bool, whether to add image shape token before the image tokens.
985
+ - base_size: int, base size of the image.
986
+ - ratio_idx: int, ratio index of the image.
987
+ - joint_image: dict with keys:
988
+ - token_length: List[int], number of image tokens for the two images.
989
+ - add_timestep_token: bool, whether to add timestep token before the image tokens.
990
+ - use_front_boi_token: bool, whether to put the <boi> token at the front of size, ratio and timestep tokens.
991
+ - add_image_shape_token: bool, whether to add image shape token before the image tokens.
992
+ - base_size: int, base size of the image.
993
+ - ratio_idx: int, ratio index of the image.
994
+
995
+ Parameters
996
+ ----------
997
+ sections: List[Dict[str, Any]]
998
+ List of sections to be encoded.
999
+ max_token_length: int
1000
+ Maximum length of the encoded token sequence.
1001
+ add_eos: bool or 'auto'
1002
+ Whether to add eos token at the end of the sequence. If True, always add eos
1003
+ token. If 'auto', add eos token only when the total_length is not reached and the last token is not <eos>.
1004
+ use_text_mask: bool
1005
+ Whether to generate text mask.
1006
+ add_pad: bool or 'auto'
1007
+ Whether to add padding tokens to the sequence. If True and total_length is not reached,
1008
+ add padding tokens.
1009
+ add_bos: bool
1010
+ Whether to add bos token at the beginning of the sequence.
1011
+ drop_last: bool or 'auto'
1012
+ - If auto, drop last tokens exceeding the total_length if the total_length is provided.
1013
+ If cut point is in the middle of the image tokens, an error will be raised.
1014
+ - If True, drop last tokens exceeding the total_length. If cut point is in the
1015
+ middle of the image tokens, all the successive image tokens will be dropped.
1016
+ - If False, keep the last tokens exceeding the total_length, even if the total_length
1017
+ is reached.
1018
+
1019
+ Returns
1020
+ -------
1021
+ TokenizerEncodeOutput
1022
+ Encoded token sequence and extra information.
1023
+ """
1024
+ if sections is None:
1025
+ raise ValueError("sections must be provided.")
1026
+ template = '-'.join([section['type'] for section in sections])
1027
+
1028
+ sections = deepcopy(sections)
1029
+ token_source = defaultdict(list)
1030
+ text_mask_specs = []
1031
+ for section in sections:
1032
+ if section['type'] == 'text':
1033
+ text = self.encode_text(
1034
+ section['text'] if 'text' in section else section['tokens'],
1035
+ uncond_enabled=section.get('uncond_enabled'),
1036
+ uncond_p=section.get('uncond_p'),
1037
+ max_length=section.get('max_length'),
1038
+ )
1039
+ token_source['text'].append(text)
1040
+ text_mask_specs.append(dict(
1041
+ ignore=section.get('ignore', False),
1042
+ start_offset=section.get('start_offset', 0),
1043
+ end_offset=section.get('end_offset', 0),
1044
+ ))
1045
+ elif section['type'] == 'gen_image':
1046
+ token_source['gen_image'].append(dict(
1047
+ length=section['token_length'],
1048
+ timestep=section.get('add_timestep_token', False),
1049
+ guidance=section.get('add_guidance_token', False),
1050
+ front_boi=section.get('use_front_boi_token', False),
1051
+ image_shape=section.get('add_image_shape_token', False),
1052
+ base_size=section.get('base_size'),
1053
+ ratio_idx=section.get('ratio_idx'),
1054
+ ))
1055
+ elif section['type'] == 'joint_image':
1056
+ token_source['joint_image'].append(dict(
1057
+ length=section['token_length'],
1058
+ timestep=section.get('add_timestep_token', False),
1059
+ front_boi=section.get('use_front_boi_token', False),
1060
+ image_shape=section.get('add_image_shape_token', False),
1061
+ base_size=section.get('base_size'),
1062
+ ratio_idx=section.get('ratio_idx'),
1063
+ ))
1064
+ else:
1065
+ raise ValueError(f"Invalid section type: {section['type']}")
1066
+
1067
+ # Combine text and image tokens
1068
+ full_token_seq, extra_token_pos = self.encode_sequence(
1069
+ template=template,
1070
+ token_source=dict(token_source),
1071
+ total_length=max_token_length,
1072
+ add_eos=add_eos,
1073
+ add_pad=add_pad,
1074
+ add_bos=add_bos,
1075
+ drop_last=drop_last,
1076
+ )
1077
+ full_seq_token_tensor = torch.tensor(full_token_seq, dtype=torch.long)
1078
+
1079
+ timestep_scatter_index = torch.tensor(extra_token_pos['timestep'], dtype=torch.long) \
1080
+ if 'timestep' in extra_token_pos else None
1081
+ guidance_scatter_index = torch.tensor(extra_token_pos['guidance'], dtype=torch.long) \
1082
+ if 'guidance' in extra_token_pos else None
1083
+ cond_timestep_scatter_index = torch.tensor(extra_token_pos['cond_timestep'], dtype=torch.long) \
1084
+ if 'cond_timestep' in extra_token_pos else None
1085
+ gen_timestep_scatter_index = torch.tensor(extra_token_pos['gen_timestep'], dtype=torch.long) \
1086
+ if 'gen_timestep' in extra_token_pos else None
1087
+
1088
+ # Gen image mask
1089
+ gen_image_slices, gen_image_mask = self.parse_extra_token_pos(extra_token_pos, 'img', full_seq_token_tensor)
1090
+ # Joint image
1091
+ joint_image_slices, _ = self.parse_extra_token_pos(extra_token_pos, 'joint_img', full_seq_token_tensor)
1092
+ # Conditional vae image
1093
+ cond_vae_image_slices, cond_vae_image_mask = self.parse_extra_token_pos(
1094
+ extra_token_pos, 'vae_img', full_seq_token_tensor)
1095
+ # Conditional vit image
1096
+ cond_vit_image_slices, cond_vit_image_mask = self.parse_extra_token_pos(
1097
+ extra_token_pos, 'vit_img', full_seq_token_tensor)
1098
+ # All image slices (gen_image, joint_image)
1099
+ all_image_slices = [
1100
+ slice(start, end + 1)
1101
+ for start, end in zip(extra_token_pos['<all_img>_start'], extra_token_pos['<all_img>_end'])
1102
+ ] if '<all_img>_start' in extra_token_pos and '<all_img>_end' in extra_token_pos else []
1103
+
1104
+ # Text mask
1105
+ text_slices = [
1106
+ slice(start, end + 1)
1107
+ for start, end in zip(extra_token_pos['<text>_start'], extra_token_pos['<text>_end'])
1108
+ ] if '<text>_start' in extra_token_pos and '<text>_end' in extra_token_pos else []
1109
+ assert len(text_slices) <= len(text_mask_specs), \
1110
+ (f"Number of text slices ({len(text_slices)}) should be less than or equal to "
1111
+ f"number of text mask specs ({len(text_mask_specs)})")
1112
+ if use_text_mask:
1113
+ text_mask = torch.zeros_like(full_seq_token_tensor, dtype=torch.float32)
1114
+ for text_slice, mask_spec in zip(text_slices, text_mask_specs):
1115
+ if not mask_spec['ignore']:
1116
+ real_slice = slice(
1117
+ text_slice.start + mask_spec['start_offset'],
1118
+ text_slice.stop + mask_spec['end_offset']
1119
+ )
1120
+ text_mask[real_slice] = 1.0
1121
+ else:
1122
+ text_mask = None
1123
+
1124
+ # real_pos is the first position of the <pad> token
1125
+ real_pos = torch.tensor(extra_token_pos.get('first_pad', [full_seq_token_tensor.shape[0]]), dtype=torch.long)
1126
+
1127
+ return TokenizerEncodeOutput(
1128
+ tokens=full_seq_token_tensor,
1129
+ timestep_scatter_index=timestep_scatter_index,
1130
+ guidance_scatter_index=guidance_scatter_index,
1131
+ text_slices=text_slices,
1132
+ gen_image_slices=gen_image_slices,
1133
+ joint_image_slices=joint_image_slices,
1134
+ cond_vae_image_slices=cond_vae_image_slices,
1135
+ cond_vit_image_slices=cond_vit_image_slices,
1136
+ text_mask=text_mask,
1137
+ gen_image_mask=gen_image_mask,
1138
+ cond_vae_image_mask=cond_vae_image_mask,
1139
+ cond_vit_image_mask=cond_vit_image_mask,
1140
+ real_pos=real_pos,
1141
+ all_image_slices=all_image_slices,
1142
+ cond_timestep_scatter_index=cond_timestep_scatter_index,
1143
+ gen_timestep_scatter_index=gen_timestep_scatter_index,
1144
+ )
1145
+
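[Editor's sketch, not part of the diff] A minimal usage example of encode_general, assuming `tokenizer` is an instance of this tokenizer class; the prompt text and the numeric values (token_length, max_token_length, base_size) are placeholders. It builds a text-to-image sequence from one text section and one gen_image section, using the section keys and flags documented in the docstring above.

    # Hypothetical usage; `tokenizer` is assumed to be an instance of this class
    # and the numeric values are placeholders for illustration.
    sections = [
        dict(type="text", text="a cat sitting on a windowsill", max_length=77),
        dict(
            type="gen_image",
            token_length=1024,           # number of image tokens
            add_timestep_token=True,     # reserve a timestep placeholder before the image tokens
            add_guidance_token=False,
            use_front_boi_token=True,    # put <boi> in front of the size/ratio/timestep tokens
            add_image_shape_token=True,
            base_size=1024,
            ratio_idx=0,
        ),
    ]
    out = tokenizer.encode_general(
        sections=sections,
        max_token_length=2048,
        add_eos="auto",    # append <eos> only if there is room and it is not already the last token
        add_pad=True,      # pad the sequence up to max_token_length
        add_bos=True,
        drop_last="auto",  # raise if truncation would cut through the image tokens
        use_text_mask=True,
    )
    # out.tokens is a 1-D LongTensor; out.gen_image_slices / out.gen_image_mask mark where the
    # image tokens sit, and the *_scatter_index fields locate any timestep/guidance placeholders.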
1146
+ def get_cot_sections(self, cot_text, uncond_kwargs, cot_max_length=None, drop_think=False):
1147
+ if not cot_text: # None or empty
1148
+ return []
1149
+ if '<think>' in cot_text and '</think>' in cot_text:
1150
+ before_think_sec = cot_text.split('<think>')[0]
1151
+ after_think_sec = cot_text.split('</think>')[1]
1152
+ think_sec = cot_text.split('<think>')[1].split('</think>')[0]
1153
+ return self.get_cot_sections(before_think_sec, uncond_kwargs, drop_think=drop_think) + \
1154
+ ([
1155
+ dict(type="text", text="<think>"),
1156
+ dict(type="text", text=think_sec, max_length=cot_max_length, **uncond_kwargs),
1157
+ dict(type="text", text="</think>")
1158
+ ] if not drop_think else []) + \
1159
+ self.get_cot_sections(after_think_sec, uncond_kwargs, drop_think=drop_think)
1160
+
1161
+ if '<recaption>' in cot_text and '</recaption>' in cot_text:
1162
+ before_recaption_sec = cot_text.split('<recaption>')[0]
1163
+ after_recaption_sec = cot_text.split('</recaption>')[1]
1164
+ recaption_sec = cot_text.split('<recaption>')[1].split('</recaption>')[0]
1165
+ return self.get_cot_sections(before_recaption_sec, uncond_kwargs, drop_think=drop_think) + \
1166
+ [
1167
+ dict(type="text", text="<recaption>"),
1168
+ dict(type="text", text=recaption_sec, max_length=cot_max_length, **uncond_kwargs),
1169
+ dict(type="text", text="</recaption>")
1170
+ ] + \
1171
+ self.get_cot_sections(after_recaption_sec, uncond_kwargs, drop_think=drop_think)
1172
+
1173
+ return [
1174
+ dict(type="text", text=cot_text, **uncond_kwargs),
1175
+ ]
1176
+
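[Editor's sketch, not part of the diff] To make the recursion above concrete: the tag wrappers become their own text sections, the inner text carries the uncond kwargs plus the optional max_length, and plain text around the tags is kept as ordinary conditioned text. The string below is a placeholder.

    # Hypothetical example; `tokenizer` is assumed to be an instance of this class.
    cot = "Let me refine the prompt. <recaption>a red bicycle leaning on a fence at sunset</recaption>"
    uncond_kwargs = dict(uncond_enabled=False, uncond_p=0.0)
    secs = tokenizer.get_cot_sections(cot, uncond_kwargs, cot_max_length=256)
    # secs is roughly:
    # [{'type': 'text', 'text': 'Let me refine the prompt. ', 'uncond_enabled': False, 'uncond_p': 0.0},
    #  {'type': 'text', 'text': '<recaption>'},
    #  {'type': 'text', 'text': 'a red bicycle leaning on a fence at sunset',
    #   'max_length': 256, 'uncond_enabled': False, 'uncond_p': 0.0},
    #  {'type': 'text', 'text': '</recaption>'}]
    # <think>...</think> spans are split the same way, except that drop_think=True omits them entirely.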
1177
+ def apply_general_template(
1178
+ self,
1179
+ message_list,
1180
+ max_length=None,
1181
+ add_assistant_prefix=False,
1182
+ answer="auto",
1183
+ bot_task="auto",
1184
+ sequence_template="instruct",
1185
+ uncond_p=0.0,
1186
+ cfg_factor=1,
1187
+ batchify=False,
1188
+ image_base_size=1024,
1189
+ drop_think=False,
1190
+ ):
1191
+ # If cfg_factor > 1, we need to repeat the unconditioned part
1192
+ if batchify:
1193
+ assert isinstance(message_list[0], list), \
1194
+ f"When batchify is True, message_list should be a list of list, but got [{type(message_list[0])}, ...]."
1195
+ return self.batch_gen_infer(
1196
+ infer_fn=self.apply_general_template,
1197
+ prompt_list=[[]],
1198
+ infer_fn_kwargs_list=[dict(
1199
+ message_list=message_list_i,
1200
+ max_length=max_length,
1201
+ add_assistant_prefix=add_assistant_prefix,
1202
+ answer=answer,
1203
+ bot_task=bot_task,
1204
+ sequence_template=sequence_template,
1205
+ image_base_size=image_base_size,
1206
+ drop_think=drop_think,
1207
+ ) for message_list_i in message_list],
1208
+ do_classifier_free_guidance=cfg_factor > 1,
1209
+ condition_repeat_times=1,
1210
+ uncondition_repeat_times=cfg_factor - 1,
1211
+ )
1212
+
1213
+ conv = Conversation()
1214
+ uncond_kwargs = dict(uncond_enabled=uncond_p == 1.0, uncond_p=uncond_p)
1215
+
1216
+ def process_successive_message(_message_list, _cur_message_idx, role, prefix, suffix,
1217
+ answer_prefix="", answer_suffix=""):
1218
+ _sub_sections = []
1219
+ while _cur_message_idx < len(message_list) and _message_list[_cur_message_idx]['role'] == role:
1220
+ message = _message_list[_cur_message_idx]
1221
+ if message['type'] == 'text':
1222
+ text = message['content']
1223
+ if role == "system":
1224
+ _sub_sections.append(dict(type="text", text=text))
1225
+ elif role == "assistant":
1226
+ if ("<recaption>" in text and "</recaption>" in text) or (
1227
+ "<think>" in text and "</think>" in text):
1228
+ _sub_sections.extend(self.get_cot_sections(text, uncond_kwargs, drop_think=drop_think))
1229
+ else:
1230
+ _sub_sections.append(dict(type="text", text=text, **uncond_kwargs))
1231
+ else:
1232
+ _sub_sections.append(dict(
1233
+ type="text", text=f"{answer_prefix}{text}{answer_suffix}", **uncond_kwargs))
1234
+ elif message['type'] == 'gen_image':
1235
+ info = message['content']
1236
+ assert isinstance(info, ImageInfo), f"Expected ImageInfo, but got {type(info)}"
1237
+ if role == "assistant":
1238
+ _sub_sections.append(dict(type="text", text=answer_prefix))
1239
+ _sub_sections.append(dict(type=message['type'], **info.meta_info))
1240
+ if role == "assistant":
1241
+ _sub_sections.append(dict(type="text", text=answer_suffix))
1242
+ elif message['type'] == 'joint_image':
1243
+ info = message['content']
1244
+ assert isinstance(info, JointImageInfo), f"Expected JointImageInfo, but got {type(info)}"
1245
+ _sub_sections.append(dict(type=message['type'], **info.meta_info))
1246
+ else:
1247
+ raise ValueError(f"Unknown message type: {message['type']}")
1248
+ _cur_message_idx += 1
1249
+ if len(_sub_sections) > 0:
1250
+ # Add role prefix and suffix
1251
+ _sub_sections.insert(0, dict(type='text', text=prefix))
1252
+ _sub_sections.append(dict(type='text', text=suffix))
1253
+ return _sub_sections, _cur_message_idx
1254
+
1255
+ # Define assistant prefix and suffix
1256
+ if (answer == "auto" and sequence_template == "instruct") or answer is True:
1257
+ answer_prefix, answer_suffix = "<answer>", "</answer>"
1258
+ else:
1259
+ answer_prefix, answer_suffix = "", ""
1260
+ if sequence_template == "pretrain":
1261
+ system_suffix = ""
1262
+ user_prefix = ""
1263
+ user_suffix = ""
1264
+ bot_prefix = ""
1265
+ bot_suffix = ""
1266
+ else:
1267
+ system_suffix = f"{conv.sep}"
1268
+ user_prefix = f"{conv.roles[0]}: "
1269
+ user_suffix = f"{conv.sep}"
1270
+ bot_prefix = f"{conv.roles[1]}: "
1271
+ bot_suffix = f"{conv.sep}"
1272
+
1273
+ # Process successive user and assistant messages
1274
+ sections = []
1275
+ cur_message_idx = 0
1276
+ final_role = None
1277
+ while cur_message_idx < len(message_list):
1278
+ # Process successive system messages
1279
+ sub_sections, cur_message_idx = process_successive_message(
1280
+ message_list, cur_message_idx, role="system", prefix="", suffix=system_suffix)
1281
+ # Add to the template and sections
1282
+ sections.extend(sub_sections)
1283
+ if len(sub_sections) > 0:
1284
+ final_role = "system"
1285
+
1286
+ # Process successive user messages
1287
+ sub_sections, cur_message_idx = process_successive_message(
1288
+ message_list, cur_message_idx, role="user", prefix=user_prefix, suffix=user_suffix)
1289
+ # Add to the template and sections
1290
+ sections.extend(sub_sections)
1291
+ if len(sub_sections) > 0:
1292
+ final_role = "user"
1293
+
1294
+ # Process successive assistant messages
1295
+ sub_sections, cur_message_idx = process_successive_message(
1296
+ message_list, cur_message_idx, role="assistant", prefix=bot_prefix, suffix=bot_suffix,
1297
+ answer_prefix=answer_prefix, answer_suffix=answer_suffix,
1298
+ )
1299
+ # Add to the template and sections
1300
+ sections.extend(sub_sections)
1301
+ if len(sub_sections) > 0:
1302
+ final_role = "assistant"
1303
+
1304
+ if add_assistant_prefix:
1305
+ if final_role == "assistant":
1306
+ # Avoid adding prefix twice
1307
+ _bot_prefix = ""
1308
+ # Remove the final bot_suffix
1309
+ if len(sections) > 0 and sections[-1]['type'] == 'text' and sections[-1]['text'] == bot_suffix:
1310
+ sections = sections[:-1]
1311
+ else:
1312
+ _bot_prefix = bot_prefix
1313
+ # We can add special tokens for the bot's latest message depending on the bot_task
1314
+ bot_response_prefix = dict(
1315
+ auto=_bot_prefix,
1316
+ think=f"{_bot_prefix}<think>",
1317
+ recaption=f"{_bot_prefix}<recaption>",
1318
+ img_ratio=f"{_bot_prefix}{answer_prefix}<boi><img_size_{image_base_size}>",
1319
+ )[bot_task]
1320
+ sections.append(dict(type='text', text=bot_response_prefix))
1321
+
1322
+ output = self.encode_general(
1323
+ sections=sections,
1324
+ use_text_mask=False,
1325
+ add_eos=False,
1326
+ add_pad=False,
1327
+ )
1328
+
1329
+ if max_length is not None:
1330
+ if output.tokens.shape[-1] > max_length:
1331
+ raise ValueError(
1332
+ f"Encoded token length {output.tokens.shape[-1]} exceeds max_length {max_length}.\n"
1333
+ f"Please set a larger max_length or check the input messages:\n{message_list}"
1334
+ )
1335
+
1336
+ return output, sections
1337
+
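[Editor's sketch, not part of the diff] A usage example for the non-batched path, assuming `tokenizer` is an instance of this class; the prompt text and max_length are placeholders. A system and a user message are wrapped with the instruct-style role prefixes, and add_assistant_prefix with bot_task="img_ratio" closes the prompt so generation can start from the image-ratio prediction.

    # Hypothetical usage; `tokenizer` is assumed to be an instance of this class.
    messages = [
        dict(role="system", type="text", content="You are a helpful text-to-image assistant."),
        dict(role="user", type="text", content="A watercolor painting of a lighthouse in a storm."),
    ]
    output, sections = tokenizer.apply_general_template(
        message_list=messages,        # flat list of messages (batchify defaults to False)
        max_length=1024,
        add_assistant_prefix=True,    # close with the assistant prefix so generation continues from it
        bot_task="img_ratio",         # appends "<answer><boi><img_size_1024>" after the assistant prefix
        sequence_template="instruct",
    )
    # `output` is the TokenizerEncodeOutput built by encode_general (no <eos>, no padding);
    # `sections` is the flat section list that was fed to encode_general.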
1338
+ def apply_chat_template(
1339
+ self,
1340
+ batch_prompt: Optional[List[str]] = None,
1341
+ batch_message_list: Optional[List[List[Dict[str, Any]]]] = None,
1342
+ mode: str = "gen_text",
1343
+ batch_gen_image_info: Optional[List[ImageInfo]] = None,
1344
+ batch_cond_image_info: Optional[Union[List[JointImageInfo], List[List[JointImageInfo]]]] = None,
1345
+ batch_system_prompt: Optional[List[str]] = None,
1346
+ batch_cot_text: Optional[List[str]] = None,
1347
+ max_length: Optional[int] = None,
1348
+ bot_task: str = "auto", # auto/think/recaption/img_ratio
1349
+ image_base_size: int = 1024,
1350
+ sequence_template: str = "pretrain",
1351
+ cfg_factor: int = 1,
1352
+ add_assistant_prefix: Optional[bool] = None,
1353
+ drop_think: bool = False,
1354
+ ) -> Dict[str, Any]:
1355
+ assert bot_task in ["auto", "think", "recaption", "img_ratio"], \
1356
+ f"bot_task should be one of ['auto', 'think', 'recaption', 'img_ratio'], but got {bot_task}."
1357
+
1358
+ if batch_message_list is None:
1359
+ # Simple text-to-image or text-cot-to-image task
1360
+ batch_size = len(batch_prompt)
1361
+
1362
+ # Batchify inputs
1363
+ if not isinstance(batch_system_prompt, list):
1364
+ batch_system_prompt = [batch_system_prompt] * batch_size
1365
+ if not isinstance(batch_gen_image_info, list):
1366
+ batch_gen_image_info = [batch_gen_image_info] * batch_size
1367
+ if batch_cot_text is not None:
1368
+ assert len(batch_cot_text) == batch_size, \
1369
+ (f"batch_cot_text should have the same length as batch_size ({batch_size}), "
1370
+ f"but got {len(batch_cot_text)}.")
1371
+ else:
1372
+ batch_cot_text = [None] * batch_size
1373
+ if batch_cond_image_info is not None:
1374
+ assert len(batch_cond_image_info) == batch_size, \
1375
+ (f"batch_cond_image_info should have the same length as batch_size ({batch_size}), "
1376
+ f"but got {len(batch_cond_image_info)}.")
1377
+ batch_cond_image_info = [
1378
+ cond_image_info if isinstance(cond_image_info, list) else [cond_image_info]
1379
+ for cond_image_info in batch_cond_image_info
1380
+ ]
1381
+ else:
1382
+ batch_cond_image_info = [[] for _ in range(batch_size)]
1383
+
1384
+ # Convert single round materials into standard message list
1385
+ batch_message_list = []
1386
+ for prompt, system_prompt, cot_text, gen_image_info, cond_image_info_list in zip(
1387
+ batch_prompt, batch_system_prompt, batch_cot_text, batch_gen_image_info,
1388
+ batch_cond_image_info,
1389
+ ):
1390
+ message_list = []
1391
+ # 1. system prompt section
1392
+ if system_prompt:
1393
+ message_list.append(dict(
1394
+ role="system", type="text", content=system_prompt, context_type="str"))
1395
+ # 2. user inputs sections
1396
+ # 2.1 image inputs
1397
+ if len(cond_image_info_list) > 0:
1398
+ message_list.extend([
1399
+ dict(role="user", type="joint_image", content=cond_image_info, context_type="image_info")
1400
+ for cond_image_info in cond_image_info_list
1401
+ ])
1402
+ # 2.2 text inputs
1403
+ message_list.append(dict(
1404
+ role="user", type="text", content=prompt, context_type="str"))
1405
+ # 3. assistant answer sections
1406
+ if cot_text is not None:
1407
+ message_list.append(dict(role="assistant", type="text", content=cot_text, context_type="str"))
1408
+ if mode == "gen_image":
1409
+ message_list.append(dict(
1410
+ role="assistant", type="gen_image", content=gen_image_info, context_type="image_info"))
1411
+ # ---
1412
+ batch_message_list.append(message_list)
1413
+
1414
+ output, sections = self.apply_general_template(
1415
+ message_list=batch_message_list,
1416
+ max_length=max_length,
1417
+ add_assistant_prefix=default(add_assistant_prefix, mode != "gen_image"),
1418
+ bot_task=bot_task,
1419
+ sequence_template=sequence_template,
1420
+ cfg_factor=cfg_factor,
1421
+ batchify=True,
1422
+ image_base_size=image_base_size,
1423
+ drop_think=drop_think,
1424
+ )
1425
+ return dict(output=output, sections=sections)
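[Editor's sketch, not part of the diff] Finally, a usage example of the single-call entry point, again assuming `tokenizer` is an instance of this class; the prompt and lengths are placeholders. This only prepares the prompt tokens for the text stage of a text -> recaption -> image pipeline; sampling happens elsewhere.

    # Hypothetical usage; `tokenizer` is assumed to be an instance of this class.
    batch = tokenizer.apply_chat_template(
        batch_prompt=["A corgi wearing a tiny astronaut helmet."],
        mode="gen_text",            # generate the recaption text first, not the image
        bot_task="recaption",       # close the prompt with "<recaption>" so the model continues it
        sequence_template="pretrain",
        cfg_factor=1,
        max_length=2048,
    )
    output, sections = batch["output"], batch["sections"]
    # For mode="gen_image", additionally pass batch_gen_image_info (ImageInfo objects describing the
    # target image token layout) and typically use cfg_factor > 1 so an unconditional copy is batched
    # in for classifier-free guidance.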