denk committed
Commit df86cdf · 1 Parent(s): 9092514
Files changed (3)
  1. README.md +122 -0
  2. config.json +25 -0
  3. diffusion_pytorch_model.safetensors +3 -0
README.md ADDED
@@ -0,0 +1,122 @@
+ ---
+ license: apache-2.0
+ language:
+ - en
+ tags:
+ - video
+ - video-generation
+ - video-to-video
+ - controlnet
+ - diffusers
+ - wan2.2
+ ---
+ # Controlnet for Wan2.2 (tile)
+
+ This repo contains the tile ControlNet module for Wan2.2. See the <a href="https://github.com/TheDenk/wan2.2-controlnet">GitHub code</a>.
+ It uses the same approach as the ControlNet for [Wan2.1](https://github.com/TheDenk/wan2.1-dilated-controlnet).
+
+ ### For ComfyUI
+ Use the cool [ComfyUI-WanVideoWrapper](https://github.com/kijai/ComfyUI-WanVideoWrapper).
+
+ ### Inference examples
+ #### Simple inference with CLI
+ ```bash
+ python -m inference.cli_demo \
+     --video_path "resources/bubble.mp4" \
+     --prompt "Close-up shot with soft lighting, focusing sharply on the lower half of a young woman's face. Her lips are slightly parted as she blows an enormous bubblegum bubble. The bubble is semi-transparent, shimmering gently under the light, and surprisingly contains a miniature aquarium inside, where two orange-and-white goldfish slowly swim, their fins delicately fluttering as if in an aquatic universe. The background is a pure light blue color." \
+     --controlnet_type "tile" \
+     --base_model_path Wan-AI/Wan2.2-TI2V-5B-Diffusers \
+     --controlnet_model_path TheDenk/wan2.2-ti2v-5b-controlnet-tile-v1
+ ```
+ #### Minimal code example
+ ```python
+ import os
+ os.environ['CUDA_VISIBLE_DEVICES'] = "0"
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ import cv2
+ import numpy as np
+ from PIL import Image
+ import torch
+ from diffusers.utils import load_video, export_to_video
+ from diffusers import AutoencoderKLWan, UniPCMultistepScheduler
+
+ from wan_controlnet import WanControlnet
+ from wan_transformer import CustomWanTransformer3DModel
+ from wan_t2v_controlnet_pipeline import WanTextToVideoControlnetPipeline
+
+ base_model_path = "Wan-AI/Wan2.2-TI2V-5B-Diffusers"
+ controlnet_model_path = "TheDenk/wan2.2-ti2v-5b-controlnet-tile-v1"
+ vae = AutoencoderKLWan.from_pretrained(base_model_path, subfolder="vae", torch_dtype=torch.float32)
+ transformer = CustomWanTransformer3DModel.from_pretrained(base_model_path, subfolder="transformer", torch_dtype=torch.bfloat16)
+ controlnet = WanControlnet.from_pretrained(controlnet_model_path, torch_dtype=torch.bfloat16)
+ pipe = WanTextToVideoControlnetPipeline.from_pretrained(
+     pretrained_model_name_or_path=base_model_path,
+     controlnet=controlnet,
+     transformer=transformer,
+     vae=vae,
+     torch_dtype=torch.bfloat16
+ )
+ pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=5.0)
+ pipe.enable_model_cpu_offload()
+
+ img_h = 704  # or 480
+ img_w = 1280  # or 832
+ num_frames = 121  # or 81, 49
+
+ def apply_gaussian_blur(image, ksize=5, sigmaX=1.0):
+     image_np = np.array(image)
+     if ksize % 2 == 0:  # cv2.GaussianBlur requires an odd kernel size
+         ksize += 1
+     blurred_image = cv2.GaussianBlur(image_np, (ksize, ksize), sigmaX=sigmaX)
+     return Image.fromarray(blurred_image)
+
+ # Tile conditioning: downscale, blur, then upscale back, so the controlnet
+ # receives a low-detail version of the source video.
+ video_path = 'bubble.mp4'
+ video_frames = load_video(video_path)[:num_frames]
+ ksize = 5
+ downscale_coef = 4
+ controlnet_frames = [x.resize((img_w // downscale_coef, img_h // downscale_coef)) for x in video_frames]
+ controlnet_frames = [apply_gaussian_blur(x, ksize=ksize, sigmaX=ksize // 2) for x in controlnet_frames]
+ controlnet_frames = [x.resize((img_w, img_h)) for x in controlnet_frames]
+
+ prompt = "Close-up shot with soft lighting, focusing sharply on the lower half of a young woman's face. Her lips are slightly parted as she blows an enormous bubblegum bubble. The bubble is semi-transparent, shimmering gently under the light, and surprisingly contains a miniature aquarium inside, where two orange-and-white goldfish slowly swim, their fins delicately fluttering as if in an aquatic universe. The background is a pure light blue color."
+ negative_prompt = "bad quality, worst quality"
+
+ output = pipe(
+     prompt=prompt,
+     negative_prompt=negative_prompt,
+     height=img_h,
+     width=img_w,
+     num_frames=num_frames,
+     guidance_scale=5,
+     generator=torch.Generator(device="cuda").manual_seed(42),
+     output_type="pil",
+     # Controlnet conditioning
+     controlnet_frames=controlnet_frames,
+     controlnet_guidance_start=0.0,
+     controlnet_guidance_end=0.8,
+     controlnet_weight=0.8,
+     # TeaCache acceleration threshold (argument name as defined in the repo)
+     teacache_treshold=0.6,
+ ).frames[0]
+
+ export_to_video(output, "output.mp4", fps=16)
+ ```
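+
+ The tile hint can also be prepared without OpenCV. Below is a minimal PIL-only sketch of the same downscale-blur-upscale step; the helper name `make_tile_hint` and the blur radius are illustrative assumptions, not part of this repo:
+ ```python
+ from PIL import ImageFilter
+
+ def make_tile_hint(frame, width, height, coef=4, radius=2.0):
+     # Hypothetical helper mirroring the cv2-based preparation above:
+     # downscale to lose detail, blur, then upscale back to target size.
+     small = frame.resize((width // coef, height // coef))
+     blurred = small.filter(ImageFilter.GaussianBlur(radius=radius))
+     return blurred.resize((width, height))
+
+ # controlnet_frames = [make_tile_hint(f, img_w, img_h) for f in video_frames]
+ ```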
+ ## Acknowledgements
+ Original code and models: [Wan2.2](https://github.com/Wan-Video/Wan2.2).
+
+ ## Citations
+ ```
+ @misc{TheDenk,
+   title={Wan2.2 Controlnet},
+   author={Karachev Denis},
+   url={https://github.com/TheDenk/wan2.2-controlnet},
+   publisher={GitHub},
+   year={2025}
+ }
+ ```
+
+ ## Contacts
+ <p>Issues should be raised directly in the repository. For professional support and recommendations, please contact <a>[email protected]</a>.</p>
config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "_class_name": "WanControlnet",
+   "_diffusers_version": "0.35.0.dev0",
+   "added_kv_proj_dim": null,
+   "attention_head_dim": 128,
+   "cross_attn_norm": true,
+   "downscale_coef": 16,
+   "eps": 1e-06,
+   "ffn_dim": 8960,
+   "freq_dim": 256,
+   "image_dim": null,
+   "in_channels": 3,
+   "num_attention_heads": 12,
+   "num_layers": 6,
+   "out_proj_dim": 3072,
+   "patch_size": [
+     1,
+     2,
+     2
+   ],
+   "qk_norm": "rms_norm_across_heads",
+   "rope_max_seq_len": 1024,
+   "text_dim": 4096,
+   "vae_channels": 48
+ }
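
The config describes a compact controlnet: 6 blocks (`num_layers`) conditioned on raw RGB frames (`in_channels: 3`), with outputs projected to 3072 dims (`out_proj_dim`), which presumably matches the hidden size of the Wan2.2 TI2V-5B transformer. A minimal sketch for inspecting the config programmatically, assuming only a standard `huggingface_hub` install:

```python
import json
from huggingface_hub import hf_hub_download

# Fetch just the config file from the Hub (no weights download).
config_path = hf_hub_download(
    repo_id="TheDenk/wan2.2-ti2v-5b-controlnet-tile-v1",
    filename="config.json",
)
with open(config_path) as f:
    config = json.load(f)

print(config["num_layers"], config["out_proj_dim"])  # 6 3072
```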
diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b50df6e054fb569ee8be77120525f39c3f6d02214a296be09831b83ece195c7e
+ size 691979056
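
The weights file is stored via Git LFS; the pointer's `oid` is the SHA-256 of the ~692 MB safetensors file. A minimal sketch for verifying a downloaded copy against that checksum, assuming the file sits in the current directory:

```python
import hashlib

EXPECTED = "b50df6e054fb569ee8be77120525f39c3f6d02214a296be09831b83ece195c7e"

# Hash the file in 1 MiB chunks to avoid loading 692 MB into memory.
h = hashlib.sha256()
with open("diffusion_pytorch_model.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

assert h.hexdigest() == EXPECTED, "checksum mismatch"
```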