File size: 2,572 Bytes
24a2e59 be915aa 24a2e59 be915aa 24a2e59 be915aa 24a2e59 be915aa 24a2e59 be915aa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
from transformers import pipeline
import re
class ContextAwareLyricCleaner:
    """Soften profanity in lyrics, but only on lines a classifier flags.

    Each line is scored by a zero-shot classifier against the labels in
    ``explicit_labels`` plus ``"clean"``.  Lines judged explicit have their
    profanity swapped for milder words using the regex table in
    ``replacements``; every other line is returned untouched.
    """

    # Regex source -> milder substitute.  Patterns are compiled
    # case-insensitively; substitutes are inserted exactly as written, so the
    # original capitalisation of a matched word is not preserved.
    DEFAULT_REPLACEMENTS = {
        r'\bfuck\b': 'frick',
        r'\bshit\b': 'shoot',
        r'\bfucking\b': 'flipping',
        r'\bfucked\b': 'flipped',
        r'\bshitty\b': 'soggy',
        r'\bass\b': 'butt',
        r'\basses\b': 'butts',
        r'\basshole\b': 'jerkface',
        r'\bbitch\b': 'witch',
        r'\bbitches\b': 'witches',
        r'\bdamn\b': 'darn',
        r'\bcunt\b': 'punk',
        r'\bcrap\b': 'junk',
        # Was 'prick', which is itself on this list: the old multi-pass loop
        # went on to rewrite it to 'jerk'.  Map straight to the final word so
        # the single-pass cleaner produces the same output.
        r'\bdick\b': 'jerk',
        r'\bfag\b': 'nerd',
        r'\bfaggot\b': 'loser',
        r'\bmothafucka\b': 'motherlover',
        r'\bmotherfucker\b': 'motherlover',
        r'\bhell\b': 'heck',
        r'\bprick\b': 'jerk',
        r'\bpiss\b': 'pee',
        r'\bpissed\b': 'mad',
        r'\bshithead\b': 'knucklehead',
        r'\bslut\b': 'scout',
        r'\bwhore\b': 'score',
        r'\bwtf\b': 'what the flip',
        r'\bson of a bitch\b': 'son of a glitch',
        r'\bbastard\b': 'rascal',
        r'\bgod\b': 'gosh',
        r'\blord\b': 'love',
        # Add more...
    }

    def __init__(self, classifier=None, threshold: float = 0.7):
        """Set up the classifier, the replacement table and the threshold.

        Args:
            classifier: Optional callable with the zero-shot pipeline calling
                convention ``classifier(text, candidate_labels=...,
                multi_label=...)`` returning a mapping with ``"labels"`` and
                ``"scores"``.  When omitted, the ``facebook/bart-large-mnli``
                zero-shot pipeline is loaded.
            threshold: Score above which a line is treated as explicit.
        """
        if classifier is None:
            classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
        self.classifier = classifier
        # Per-instance copy so callers can extend the table without touching
        # the class-level defaults.
        self.replacements = dict(self.DEFAULT_REPLACEMENTS)
        self.patterns = {re.compile(k, re.IGNORECASE): v for k, v in self.replacements.items()}
        self.explicit_labels = ["explicit", "offensive", "inappropriate"]
        self.threshold = threshold  # confidence threshold to consider line explicit

    def is_explicit(self, text: str) -> bool:
        """Return True when the classifier considers *text* explicit.

        Blank or whitespace-only text is reported as not explicit without
        calling the classifier: there is nothing to classify, and the
        transformers zero-shot pipeline is expected to reject an empty
        sequence.
        """
        if not text or not text.strip():
            return False
        result = self.classifier(text, candidate_labels=self.explicit_labels + ["clean"], multi_label=False)
        scores = dict(zip(result['labels'], result['scores']))
        # With multi_label=False the scores are normalised across all
        # candidate labels, so the near-synonymous explicit labels share one
        # pool of probability.  Compare their combined mass to the threshold
        # rather than requiring a single label to clear it alone.
        explicit_mass = sum(scores.get(label, 0.0) for label in self.explicit_labels)
        return explicit_mass > self.threshold

    def clean_line(self, line: str) -> str:
        """Return *line* with every pattern in ``self.patterns`` substituted.

        All patterns are matched against the original text in one pass, so a
        substitute is never re-scanned by another pattern.  When matches
        overlap, the leftmost wins and, at the same start, the longest wins;
        this lets multi-word entries such as "son of a bitch" take precedence
        over the single words they contain.
        """
        hits = []
        for pattern, replacement in self.patterns.items():
            for match in pattern.finditer(line):
                # expand() applies the same template rules as Pattern.sub().
                hits.append((match.start(), match.end(), match.expand(replacement)))
        if not hits:
            return line

        hits.sort(key=lambda hit: (hit[0], -hit[1]))
        pieces = []
        cursor = 0
        for start, end, substitute in hits:
            if start < cursor:
                # Overlaps a match that was already accepted.
                continue
            pieces.append(line[cursor:start])
            pieces.append(substitute)
            cursor = end
        pieces.append(line[cursor:])
        return "".join(pieces)

    def clean_lyrics(self, lyrics: str) -> str:
        """Clean multi-line *lyrics*, rewriting only the explicit lines.

        The text is split on ``'\\n'`` and re-joined the same way, so the line
        structure (including blank lines) is preserved.
        """
        cleaned_lines = []
        for line in lyrics.split('\n'):
            candidate = self.clean_line(line)
            # The classifier is the expensive step; it only matters for lines
            # the wordlist would actually change.
            if candidate != line and self.is_explicit(line):
                cleaned_lines.append(candidate)
            else:
                cleaned_lines.append(line)
        return '\n'.join(cleaned_lines)
|