denk committed
Commit 4b7ac99 · 1 Parent(s): 781c16c
Files changed (3)
  1. README.md +112 -0
  2. config.json +25 -0
  3. diffusion_pytorch_model.safetensors +3 -0
README.md ADDED
@@ -0,0 +1,112 @@
---
license: apache-2.0
language:
- en
tags:
- video
- video-generation
- video-to-video
- controlnet
- diffusers
- wan2.2
---
# Controlnet for Wan2.2 A14B (depth)

This repo contains the controlnet module for Wan2.2; the code is available on <a href="https://github.com/TheDenk/wan2.2-controlnet">GitHub</a>.
It uses the same approach as the controlnet for [Wan2.1](https://github.com/TheDenk/wan2.1-dilated-controlnet).

<video controls autoplay src=""></video>

### For ComfyUI
Use the cool [ComfyUI-WanVideoWrapper](https://github.com/kijai/ComfyUI-WanVideoWrapper).

### Inference examples
#### Simple inference with the CLI
```bash
python -m inference.cli_demo \
    --video_path "resources/bubble.mp4" \
    --prompt "Close-up shot with soft lighting, focusing sharply on the lower half of a young woman's face. Her lips are slightly parted as she blows an enormous bubblegum bubble. The bubble is semi-transparent, shimmering gently under the light, and surprisingly contains a miniature aquarium inside, where two orange-and-white goldfish slowly swim, their fins delicately fluttering as if in an aquatic universe. The background is a pure light blue color." \
    --controlnet_type "depth" \
    --base_model_path Wan-AI/Wan2.2-T2V-A14B \
    --controlnet_model_path TheDenk/wan2.2-t2v-a14b-controlnet-depth-v1
```
#### Minimal code example
```python
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
from diffusers.utils import load_video, export_to_video
from diffusers import AutoencoderKLWan, UniPCMultistepScheduler
from controlnet_aux import MidasDetector

from wan_controlnet import WanControlnet
from wan_transformer import CustomWanTransformer3DModel
from wan_t2v_controlnet_pipeline import WanTextToVideoControlnetPipeline

base_model_path = "Wan-AI/Wan2.2-T2V-A14B"
controlnet_model_path = "TheDenk/wan2.2-t2v-a14b-controlnet-depth-v1"
vae = AutoencoderKLWan.from_pretrained(base_model_path, subfolder="vae", torch_dtype=torch.float32)
transformer = CustomWanTransformer3DModel.from_pretrained(base_model_path, subfolder="transformer", torch_dtype=torch.bfloat16)
controlnet = WanControlnet.from_pretrained(controlnet_model_path, torch_dtype=torch.bfloat16)
pipe = WanTextToVideoControlnetPipeline.from_pretrained(
    pretrained_model_name_or_path=base_model_path,
    controlnet=controlnet,
    transformer=transformer,
    vae=vae,
    torch_dtype=torch.bfloat16
)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=12.0)
pipe.enable_model_cpu_offload()

controlnet_processor = MidasDetector.from_pretrained('lllyasviel/Annotators')
img_h = 704  # 704 or 480
img_w = 1280  # 1280 or 832
num_frames = 121  # 121, 81 or 49

video_path = 'bubble.mp4'
video_frames = load_video(video_path)[:num_frames]
video_frames = [x.resize((img_w, img_h)) for x in video_frames]
controlnet_frames = [controlnet_processor(x) for x in video_frames]

prompt = "Close-up shot with soft lighting, focusing sharply on the lower half of a young woman's face. Her lips are slightly parted as she blows an enormous bubblegum bubble. The bubble is semi-transparent, shimmering gently under the light, and surprisingly contains a miniature aquarium inside, where two orange-and-white goldfish slowly swim, their fins delicately fluttering as if in an aquatic universe. The background is a pure light blue color."
negative_prompt = "bad quality, worst quality"

output = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=img_h,
    width=img_w,
    num_frames=num_frames,
    guidance_scale=5,
    generator=torch.Generator(device="cuda").manual_seed(42),
    output_type="pil",

    controlnet_frames=controlnet_frames,
    controlnet_guidance_start=0.0,
    controlnet_guidance_end=0.8,
    controlnet_weight=0.8,

    teacache_treshold=0.6,
).frames[0]

export_to_video(output, "output.mp4", fps=16)
```
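Before launching a long generation, it can help to look at the conditioning signal itself. The small optional snippet below is a continuation of the script above (it reuses `controlnet_frames` and `export_to_video` from that example; the output filename is arbitrary):

```python
# Optional: dump the depth maps produced by MidasDetector so they can be
# inspected visually before running the full pipeline (continuation of the
# example above).
export_to_video(controlnet_frames, "depth_control.mp4", fps=16)
```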
## Acknowledgements
Original code and models: [Wan2.2](https://github.com/Wan-Video/Wan2.2).

## Citations
```
@misc{TheDenk,
    title={Wan2.2 Controlnet},
    author={Karachev Denis},
    url={https://github.com/TheDenk/wan2.2-controlnet},
    publisher={Github},
    year={2025}
}
```

## Contacts
<p>Issues should be raised directly in the repository. For professional support and recommendations, please contact <a>[email protected]</a>.</p>
config.json ADDED
@@ -0,0 +1,25 @@
{
  "_class_name": "WanControlnet",
  "_diffusers_version": "0.35.0.dev0",
  "added_kv_proj_dim": null,
  "attention_head_dim": 128,
  "cross_attn_norm": true,
  "downscale_coef": 8,
  "eps": 1e-06,
  "ffn_dim": 8960,
  "freq_dim": 256,
  "image_dim": null,
  "in_channels": 3,
  "num_attention_heads": 12,
  "num_layers": 6,
  "out_proj_dim": 5120,
  "patch_size": [
    1,
    2,
    2
  ],
  "qk_norm": "rms_norm_across_heads",
  "rope_max_seq_len": 1024,
  "text_dim": 4096,
  "vae_channels": 16
}
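The config describes a 6-layer controlnet with 12 attention heads of 128 dimensions each. As a minimal sketch, assuming `WanControlnet` (from the wan2.2-controlnet GitHub repo) follows the diffusers `ConfigMixin`/`ModelMixin` convention implied by the `from_pretrained` call in the README above, the config can be loaded directly to inspect the architecture or to build a randomly initialised module:

```python
# A minimal sketch: assumes WanControlnet exposes the standard diffusers
# from_config() classmethod; this is not confirmed by the README above.
import json

from wan_controlnet import WanControlnet

with open("config.json") as f:
    config = json.load(f)

# 12 heads x 128 dims per head = 1536-dim controlnet blocks, projected to the
# 5120-dim hidden states of the Wan2.2 A14B transformer (out_proj_dim).
print(config["num_attention_heads"] * config["attention_head_dim"], config["out_proj_dim"])

# Instantiate a module with this architecture (no weights) and count parameters.
controlnet = WanControlnet.from_config(config)
print(sum(p.numel() for p in controlnet.parameters()) / 1e6, "M params")
```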
diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:878a730283dc0471cf27c68497e3db4edd791a60763d2fd7e13a67c6ab3825ec
size 705206016
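The weights are stored via Git LFS, so the file above is only a pointer recording the blob's SHA-256 and size. A minimal, standard-library-only sketch for checking that a downloaded copy matches this pointer (the local path is an assumption):

```python
import hashlib
import os

# Values copied from the LFS pointer above.
EXPECTED_SHA256 = "878a730283dc0471cf27c68497e3db4edd791a60763d2fd7e13a67c6ab3825ec"
EXPECTED_SIZE = 705206016  # bytes

def matches_pointer(path: str) -> bool:
    """Return True if the file at `path` has the size and SHA-256 from the pointer."""
    if os.path.getsize(path) != EXPECTED_SIZE:
        return False
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == EXPECTED_SHA256

print(matches_pointer("diffusion_pytorch_model.safetensors"))  # hypothetical local path
```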