import os

import numpy as np
import torch
import torch.nn as nn
from diffusers.models.attention_processor import Attention


class VisualTokenSelfAttn(nn.Module):
    """Projects concatenated visual features to the prompt dimension and refines
    them with a single pre-norm self-attention + MLP block."""

    def __init__(self, in_dim=2792, out_dim=768, num_heads=8):
        super().__init__()
        self.meta_token_trans = nn.Sequential(
            nn.Linear(in_dim, out_dim * 4),
            nn.LayerNorm(out_dim * 4),
            nn.GELU(),
            nn.Linear(out_dim * 4, out_dim),
            nn.LayerNorm(out_dim),
        )
        self.norm1 = nn.LayerNorm(out_dim, eps=1e-6)  # important to avoid attention collapse
        self.attn = Attention(query_dim=out_dim, heads=num_heads)
        self.norm2 = nn.LayerNorm(out_dim, eps=1e-6)
        self.mlp = nn.Sequential(
            nn.Linear(out_dim, out_dim * 4),
            nn.GELU(),
            nn.Linear(out_dim * 4, out_dim),
        )

    def forward(self, x):
        x = self.meta_token_trans(x)
        x = x + self.attn(self.norm1(x))
        x = x + self.mlp(self.norm2(x))
        return x


class EmotionEmbedding(nn.Module):
    """Learnable per-emotion prompt tokens, initialised from KMeans centres of
    precomputed visual features (CLIP / VGG / DINOv2)."""

    def __init__(self, emotions, prompts_dir, feature_names, output_dim, prompt_len=16):
        super().__init__()
        input_dim = self.get_input_dim(feature_names=feature_names)
        self.self_attn = VisualTokenSelfAttn(in_dim=input_dim, out_dim=output_dim)
        self.emotions = emotions
        self.emotion2idx = {emotion: idx for idx, emotion in enumerate(emotions)}
        self.emotion_params = nn.ParameterList()
        self.emotion_init_features = self.get_features(emotions, prompts_dir, feature_names, prompt_len)
        for emotion in self.emotions:
            init_params = self.emotion_init_features[emotion]
            self.emotion_params.append(nn.Parameter(init_params))

    def get_features(self, emotions, prompts_dir, feature_names, prompt_len):
        """Load the precomputed features of each emotion and cluster them into
        `prompt_len` centres, which become the initial prompt tokens."""
        from sklearn.cluster import KMeans  # lazy import: only needed at init time

        emotion_init_features = {}
        for emotion in emotions:
            emotion_features = []
            for feature_name in feature_names:
                features = np.load(os.path.join(prompts_dir, f'{emotion}_{feature_name}.npy'),
                                   allow_pickle=True)
                emotion_features.append(features)
            emotion_features = np.concatenate(emotion_features, axis=1)
            kmeans = KMeans(n_clusters=prompt_len, random_state=42)
            kmeans.fit(emotion_features)
            # cast to float32 so the resulting nn.Parameter matches the model dtype
            token = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32).unsqueeze(0)
            emotion_init_features[emotion] = token
        return emotion_init_features

    def get_input_dim(self, feature_names):
        # the input is the concatenation of the selected backbone features
        dims = {"clip": 768, "vgg": 1000, "dinov2": 1024}
        if not feature_names or any(name not in dims for name in feature_names):
            raise ValueError("Invalid feature names")
        return sum(dims[name] for name in feature_names)

    def params_to_prompts(self):
        self.emotion_prompts = {}
        for emotion in self.emotions:
            prompt = self.self_attn(self.emotion_params[self.emotion2idx[emotion]])
            self.emotion_prompts[emotion] = prompt.squeeze(0)

    def forward(self, emotion):
        if isinstance(emotion, str):
            emotions = [emotion]
        else:
            emotions = emotion
        self.params_to_prompts()
        selected_prompts = [self.emotion_prompts[e] for e in emotions]
        prompts = torch.stack(selected_prompts, dim=0)
        del self.emotion_prompts  # drop the per-call cache so it is rebuilt next forward
        return prompts


class EmotionEmbedding2(nn.Module):
    """Simpler variant: one learnable embedding vector per emotion, refined into
    a single prompt token by the self-attention block."""

    def __init__(self, emotions, input_dim, output_dim):
        super().__init__()
        self.self_attn = VisualTokenSelfAttn(in_dim=input_dim, out_dim=output_dim)
        self.emotions = emotions
        self.emotion2idx = {emotion: idx for idx, emotion in enumerate(emotions)}
        self.emotion_params = nn.Embedding(len(emotions), input_dim)

    def forward(self, emotion):
        if isinstance(emotion, str):
            emotions = [emotion]
        else:
            emotions = emotion
        indices = torch.tensor([self.emotion2idx[e] for e in emotions],
                               device=self.emotion_params.weight.device)
        prompts = self.emotion_params(indices).unsqueeze(1)  # (B, 1, input_dim)
        prompts = self.self_attn(prompts)                     # (B, 1, output_dim)
        return prompts


if __name__ == "__main__":
    # Test for the KMeans-initialised variant (needs the precomputed .npy features):
    # emotions = ["amusement", "anger", "awe", "contentment",
    #             "disgust", "excitement", "fear", "sadness"]
    # feature_names = ["clip", "vgg", "dinov2"]
    # prompts_dir = "features/origin"
    # model = EmotionEmbedding(emotions, prompts_dir, feature_names, output_dim=2048, prompt_len=16).to("cuda")
    # optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    # output = model('awe')
    # target = torch.ones_like(output)
    # loss = ((output - target) ** 2).mean()
    # print(output)

    emotions = ["amusement", "anger", "awe", "contentment",
                "disgust", "excitement", "fear", "sadness"]
    model = EmotionEmbedding2(emotions, input_dim=2048, output_dim=2048).to("cuda")
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    output = model('awe')
    target = torch.ones_like(output)
    loss = ((output - target) ** 2).mean()
    print(output)

    # Backward pass
    loss.backward()

    # Check that every parameter received a gradient
    for name, param in model.named_parameters():
        if param.grad is not None:
            print(f"{name} has gradient ✅, grad mean: {param.grad.mean().item()}")
            if name == "emotion_params.weight":
                print(param.grad)
        else:
            print(f"{name} has NO gradient ❌")

    # Apply one parameter update
    optimizer.step()
    print(output)
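

# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original module): one possible way to produce
# the `{emotion}_{feature_name}.npy` files that EmotionEmbedding.get_features
# expects, shown here for the "clip" features only. The image layout
# (`<image_root>/<emotion>/*.jpg`) and the "openai/clip-vit-large-patch14"
# checkpoint (768-dim image features, matching get_input_dim) are assumptions,
# not taken from the original source.
# ---------------------------------------------------------------------------
def extract_clip_features(image_root, prompts_dir, emotions, device="cuda"):
    import glob

    from PIL import Image
    from transformers import CLIPModel, CLIPProcessor  # lazy import: only needed offline

    model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device).eval()
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
    for emotion in emotions:
        feats = []
        # assumed layout: one folder of images per emotion
        for path in sorted(glob.glob(os.path.join(image_root, emotion, "*.jpg"))):
            image = Image.open(path).convert("RGB")
            inputs = processor(images=image, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                feats.append(model.get_image_features(**inputs).cpu().numpy())  # (1, 768)
        np.save(os.path.join(prompts_dir, f"{emotion}_clip.npy"),
                np.concatenate(feats, axis=0))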