| from transformers import BertTokenizerFast, BertConfig | |
| from typing import Dict, List, Union, Tuple | |
def num_unique_labels(dataset: Dict[str, Union[str, List[str]]]) -> Tuple[int, int]:
    """
    Count the distinct NER tags and the distinct intent labels in a dataset.

    Args:
        dataset (dict): Mapping that provides at least the keys 'entities'
            and 'intent'. 'entities' is iterated as a collection of tag
            sequences (one sequence per sample); 'intent' is iterated as a
            flat collection of hashable labels.

    Returns:
        Tuple[int, int]: (number of unique NER tags, number of unique
        intent labels).
    """
    # Flatten the per-sample tag sequences straight into a set of tags.
    distinct_tags = {tag for tags in dataset['entities'] for tag in tags}
    distinct_intents = set(dataset['intent'])
    return len(distinct_tags), len(distinct_intents)
def ner_labels_to_ids() -> Dict[str, int]:
    """
    Build the fixed NER label -> numeric ID lookup table.

    Returns:
        Dict[str, int]: Maps each NER label ('O' plus the B-/I- variants of
        DATE, TIME, TASK and DUR) to its integer ID, numbered from 0.
    """
    # Position in this tuple IS the label's ID, so the order must not change.
    ordered_labels = (
        'O',
        'B-DATE',
        'I-DATE',
        'B-TIME',
        'I-TIME',
        'B-TASK',
        'I-TASK',
        'B-DUR',
        'I-DUR',
    )
    return {label: index for index, label in enumerate(ordered_labels)}
def ner_ids_to_labels(ner_labels_to_ids) -> Dict[int, str]:
    """
    Invert a NER label -> ID mapping into an ID -> label mapping.

    Args:
        ner_labels_to_ids: Mapping whose keys are NER labels and whose
            values are their numeric IDs.

    Returns:
        Dict[int, str]: Maps each numeric ID back to its NER label. If
        several labels share an ID, the one iterated last is kept.
    """
    id_to_label = {}
    for label, label_id in ner_labels_to_ids.items():
        id_to_label[label_id] = label
    return id_to_label
def intent_labels_to_ids() -> Dict[str, int]:
    """
    Build the fixed intent label -> numeric ID lookup table.

    Returns:
        Dict[str, int]: Maps each intent label to its integer ID, numbered
        from 0.
    """
    # NOTE: every key carries literal single-quote characters as part of the
    # string itself; presumably this mirrors how intents appear in the raw
    # dataset -- confirm before normalising.
    # Position in this tuple IS the label's ID, so the order must not change.
    ordered_intents = (
        "'Schedule Appointment'",
        "'Schedule Meeting'",
        "'Set Alarm'",
        "'Set Reminder'",
        "'Set Timer'",
    )
    return {intent: index for index, intent in enumerate(ordered_intents)}
def intent_ids_to_labels(intent_labels_to_ids) -> Dict[int, str]:
    """
    Invert an intent label -> ID mapping into an ID -> label mapping.

    Args:
        intent_labels_to_ids: Mapping whose keys are intent labels and whose
            values are their numeric IDs.

    Returns:
        Dict[int, str]: Maps each numeric ID back to its intent label. If
        several labels share an ID, the one iterated last is kept.
    """
    # Swap each (label, id) pair and let dict() collect the results.
    swapped_pairs = (
        (intent_id, intent)
        for intent, intent_id in intent_labels_to_ids.items()
    )
    return dict(swapped_pairs)
def tokenizer() -> BertTokenizerFast:
    """
    Create the tokenizer used by this module.

    Returns:
        BertTokenizerFast: The result of
        BertTokenizerFast.from_pretrained for the 'bert-base-uncased'
        checkpoint. A new object is requested on every call.
    """
    return BertTokenizerFast.from_pretrained('bert-base-uncased')
def bert_config() -> BertConfig:
    """
    Create the BERT configuration used by this module.

    Returns:
        BertConfig: The result of BertConfig.from_pretrained for the
        'bert-base-uncased' checkpoint. A new object is requested on every
        call.
    """
    return BertConfig.from_pretrained('bert-base-uncased')
def structure_data(dataset):
    """
    Regroup per-sample records into parallel, column-wise lists.

    Args:
        dataset: Iterable of samples, each indexable by the keys 'text',
            'entities' and 'intent'. The 'entities' value must support
            .split(); it is split on whitespace into a list of tags.

    Returns:
        dict: {'text': [...], 'entities': [...], 'intent': [...]} where the
        i-th element of every list comes from the i-th sample. 'text' and
        'intent' values are stored unchanged.
    """
    texts, tag_sequences, intents = [], [], []
    # Single pass on purpose: `dataset` may be a one-shot iterable.
    for record in dataset:
        texts.append(record['text'])
        tag_sequences.append(record['entities'].split())
        intents.append(record['intent'])
    return {'text': texts, 'entities': tag_sequences, 'intent': intents}