from transformers import PreTrainedModel, PretrainedConfig

import torch
import torch.nn as nn
import torch.nn.functional as F


class SASOKConfig(PretrainedConfig):
    model_type = "sasok"

    def __init__(self, vocab_size=50000, hidden_size=512, num_heads=8, num_layers=4, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.num_layers = num_layers


class SASOKModel(PreTrainedModel):
    config_class = SASOKConfig

    def __init__(self, config):
        super().__init__(config)
        self.embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        # BatchNorm over the embedding channels (applied channels-first in forward).
        self.percept_bn = nn.BatchNorm1d(config.hidden_size)

        # Pre-norm self-attention block.
        self.emotion_ln = nn.LayerNorm(config.hidden_size)
        self.attn = nn.MultiheadAttention(config.hidden_size, config.num_heads, batch_first=True)
        self.attn_ln = nn.LayerNorm(config.hidden_size)

        # Stack of pre-norm Transformer encoder layers; batch_first keeps the
        # (batch, seq, hidden) layout used everywhere else in the model.
        self.meta_stack = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=config.hidden_size,
                nhead=config.num_heads,
                norm_first=True,
                batch_first=True,
            )
            for _ in range(config.num_layers)
        ])

        self.final_ln = nn.LayerNorm(config.hidden_size)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)

        self.init_weights()

    def forward(self, input_ids, attention_mask=None, labels=None):
        x = self.embedding(input_ids)
        # BatchNorm1d expects (batch, channels, seq_len), so transpose around it.
        x = x.transpose(1, 2)
        x = self.percept_bn(x).transpose(1, 2)

        x = self.emotion_ln(x)

        # Convert the HF-style attention_mask (1 = token, 0 = padding) into a
        # key padding mask (True = ignore) so padded positions are not attended to.
        key_padding_mask = (attention_mask == 0) if attention_mask is not None else None

        # Pre-norm self-attention with a residual connection.
        x_ln = self.attn_ln(x)
        attn_out, _ = self.attn(x_ln, x_ln, x_ln, key_padding_mask=key_padding_mask)
        x = x + attn_out

        for layer in self.meta_stack:
            x = layer(x, src_key_padding_mask=key_padding_mask)

        x = self.final_ln(x)
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            # Labels are expected to be aligned token-for-token with input_ids;
            # no shifting is performed here.
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}