# Fine-tune a distilled Audio Spectrogram Transformer (AST) to classify
# audio clips as "speech" vs. "noise".
import numpy as np

import evaluate
from datasets import load_dataset
from transformers import (
    ASTFeatureExtractor,
    ASTForAudioClassification,
    Trainer,
    TrainingArguments,
)

# Load the audio files from the local "data" folder and hold out 10% for
# evaluation, so the model is not evaluated on its own training data.
dataset = load_dataset("audiofolder", data_dir="data")
dataset = dataset["train"].train_test_split(seed=42, shuffle=True, test_size=0.1)

# Hyperparameters and run configuration.
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10
labels = ["noise", "speech"]
num_labels = 2
max_duration = 5  # maximum clip length in seconds
model_id = "bookbot/distil-ast-audioset"
model_name = "speechVSnoise"

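# Map class names to string ids and back; the model config stores both.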
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

# Load the pretrained checkpoint with a fresh 2-way classification head;
# ignore_mismatched_sizes lets us replace the original AudioSet head.
model = ASTForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)
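# The matching feature extractor converts raw waveforms into the
# normalized spectrogram inputs the AST model expects.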
feature_extractor = ASTFeatureExtractor.from_pretrained(
    model_id, do_normalize=True, return_attention_mask=False
)


def preprocess_function(examples):
    """Truncate each clip to max_duration seconds and extract AST features."""
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=int(feature_extractor.sampling_rate * max_duration),
        truncation=True,
    )
    return inputs

# Precompute the spectrogram features for the whole dataset up front.
dataset_encoded = dataset.map(
    preprocess_function,
    batched=True,
    batch_size=1674,
    num_proc=1,
)

# Report accuracy over the argmax of the logits.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

training_args = TrainingArguments(
    model_name,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # push_to_hub=True,
)

trainer = Trainer(
    model,
    training_args,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["test"],  # evaluate on the held-out split
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)
trainer.train()
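
# A minimal inference sketch, not part of the original script: after training,
# trainer.model holds the best checkpoint (load_best_model_at_end=True), so it
# can be wrapped in a pipeline directly. "sample.wav" is a hypothetical path.
from transformers import pipeline

classifier = pipeline(
    "audio-classification",
    model=trainer.model,
    feature_extractor=feature_extractor,
)
print(classifier("sample.wav"))  # e.g. [{"label": "speech", "score": ...}, ...]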