mgaido91 committed · verified
Commit a09f6f5
1 Parent(s): 18f5454

Upload FAMA small model

config.json ADDED
@@ -0,0 +1,59 @@
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "activation_function": "relu",
4
+ "architectures": [
5
+ "ConformerEncoderDecoderForConditionalGeneration"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "auto_map": {
9
+ "AutoConfig": "configuration_conformer.Speech2TextConformerConfig",
10
+ "AutoModelForSpeechSeq2Seq": "conformer_model.ConformerEncoderDecoderForConditionalGeneration"
11
+ },
12
+ "batch_unsafe_relative_shift": false,
13
+ "bos_token_id": 0,
14
+ "conformer_attention_dropout": 0.1,
15
+ "conformer_conv_dropout": 0.1,
16
+ "conformer_conv_kernel_size": 31,
17
+ "conformer_feedforward_dropout": 0.1,
18
+ "conformer_half_step_residual": true,
19
+ "conv_channels": 1024,
20
+ "conv_expansion_factor": 2,
21
+ "conv_kernel_sizes": [
22
+ 5,
23
+ 5
24
+ ],
25
+ "ctc_compress_fixed_ratio": 4,
26
+ "ctc_compress_max_out_size": -1,
27
+ "ctc_compress_strategy": "none",
28
+ "d_model": 1024,
29
+ "decoder_attention_heads": 16,
30
+ "decoder_ffn_dim": 4096,
31
+ "decoder_layerdrop": 0.0,
32
+ "decoder_layers": 6,
33
+ "decoder_start_token_id": 2,
34
+ "dropout": 0.1,
35
+ "early_stopping": null,
36
+ "encoder_attention_heads": 16,
37
+ "encoder_layers": 12,
38
+ "eos_token_id": 2,
39
+ "feed_forward_expansion_factor": 4,
40
+ "init_std": 0.02,
41
+ "input_channels": 1,
42
+ "input_feat_per_channel": 80,
43
+ "is_encoder_decoder": true,
44
+ "max_length": null,
45
+ "max_source_positions": 6000,
46
+ "max_target_positions": 1024,
47
+ "model_type": "conformer_encoder_decoder",
48
+ "no_syncbatchnorm": false,
49
+ "num_beams": null,
50
+ "num_conv_layers": 2,
51
+ "num_hidden_layers": 12,
52
+ "pad_token_id": 1,
53
+ "scale_embedding": true,
54
+ "tie_word_embeddings": false,
55
+ "torch_dtype": "float32",
56
+ "transformers_version": "4.48.1",
57
+ "use_cache": true,
58
+ "vocab_size": 16004
59
+ }
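Because `auto_map` above points to custom code (`configuration_conformer.Speech2TextConformerConfig` and `conformer_model.ConformerEncoderDecoderForConditionalGeneration`), loading this checkpoint through the `Auto*` classes requires `trust_remote_code=True`. A minimal loading sketch, assuming the repository id is `FBK-MT/fama-small` (the commit message only says "FAMA small model") and that the model weights are part of the same upload:

```python
from transformers import AutoConfig, AutoModelForSpeechSeq2Seq

repo_id = "FBK-MT/fama-small"  # assumed repository id for this "FAMA small" upload

# trust_remote_code=True lets transformers import the configuration_conformer.py
# and conformer_model.py files added in this commit.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForSpeechSeq2Seq.from_pretrained(repo_id, trust_remote_code=True)

print(type(config).__name__)  # Speech2TextConformerConfig
print(model.config.d_model)   # 1024, as set in config.json above
```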
configuration_conformer.py ADDED
@@ -0,0 +1,246 @@
1
+ # Copyright 2024 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+ """Conformer model configuration"""
15
+
16
+ from transformers.configuration_utils import PretrainedConfig
17
+ from transformers.utils import logging
18
+
19
+
20
+ logger = logging.get_logger(__name__)
21
+
22
+
23
+ class Speech2TextConformerConfig(PretrainedConfig):
24
+ r"""
25
+ This is the configuration class to store the configuration of a [`ConformerEncoderDecoderModel`]. It is used to
26
+ instantiate a Conformer model according to the specified arguments, defining the model architecture. Instantiating a
27
+ configuration with the defaults will yield a similar configuration to that of the conformer base architecture
28
+ in https://github.com/hlt-mt/FBK-fairseq/.
29
+
30
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
31
+ documentation from [`PretrainedConfig`] for more information.
32
+
33
+
34
+ Args:
35
+ vocab_size (`int`, *optional*, defaults to 10000):
36
+ Vocabulary size of the Conformer model. Defines the number of different tokens that can be represented by
37
+ the `inputs_ids` passed when calling [`ConformerEncoderDecoderModel`]
38
+ encoder_layers (`int`, *optional*, defaults to 12):
39
+ Number of encoder layers.
40
+ feed_forward_expansion_factor (`int`, *optional*, defaults to 4):
41
+ Expansion factor that controls the size of the "intermediate" (often named feed-forward) layer in the encoder.
42
+ conv_expansion_factor (`int`, *optional*, defaults to 2):
43
+ Expansion factor that controls the size of the intermediate convolution layers in the encoder.
44
+ conformer_feedforward_dropout (`float`, *optional*, defaults to 0.1):
45
+ Dropout probability of the Conformer FeedForward module.
46
+ conformer_attention_dropout (`float`, *optional*, defaults to 0.1):
47
+ Dropout probability of the Conformer Attention module.
48
+ conformer_conv_dropout (`float`, *optional*, defaults to 0.1):
49
+ Dropout probability of the Conformer Convolution module.
50
+ conformer_conv_kernel_size (`int`, *optional*, defaults to 31):
51
+ Kernel size of the Conformer Convolution module.
52
+ conformer_half_step_residual (`bool`, *optional*, defaults to `True`):
53
+ Whether to use half step residual connections.
54
+ no_syncbatchnorm (`bool`, *optional*, defaults to False):
55
+ If `True`, SyncBatchNorm is replaced by BatchNorm1D in the Conformer Convolution module.
56
+ batch_unsafe_relative_shift (`bool`, *optional*, defaults to False):
57
+ If `True`, the relative_shift implementation disregards padding (returning different results
58
+ with different amounts of padding for the same input) but is faster. This may lead to inconsistencies
59
+ with different batch sizes.
60
+ ctc_compress_strategy (`str`, *optional*, defaults to 'none'):
61
+ Strategy to use when compressing CTC output. Valid strategies are 'none', 'avg', 'weighted', 'softmax',
62
+ and 'fixed'.
63
+ ctc_compress_fixed_ratio (`int`, *optional*, defaults to 4):
64
+ If ctc_compress_strategy is set to 'fixed', the fixed ratio controls how many consecutive steps to merge.
65
+ ctc_compress_max_out_size (`int`, *optional*, defaults to -1):
66
+ If CTC compression is enabled (ctc_compress_strategy != 'none') and this argument is set to a positive
67
+ number, every input is forced to be at most as long as the value set for this parameter, even though the
68
+ CTC would not compress it enough. Intuitively, this parameter should be set to 1/4 of the max input length
69
+ to ensure that the maximum sequence length of the self-attention input is the same as in the case of models
70
+ having 2 initial convolutions with stride 2.
71
+ encoder_attention_heads (`int`, *optional*, defaults to 8):
72
+ Number of attention heads for each attention layer in the Transformer encoder.
73
+ decoder_layers (`int`, *optional*, defaults to 6):
74
+ Number of decoder layers.
75
+ decoder_ffn_dim (`int`, *optional*, defaults to 2048):
76
+ Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
77
+ decoder_attention_heads (`int`, *optional*, defaults to 8):
78
+ Number of attention heads for each attention layer in the Transformer decoder.
79
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0):
80
+ The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for
81
+ more details.
82
+ use_cache (`bool`, *optional*, defaults to `True`):
83
+ Whether the model should return the last key/values attentions (not used by all models).
84
+ is_encoder_decoder (`bool`, *optional*, defaults to `True`):
85
+ Whether the model is set up as an encoder-decoder architecture for sequence-to-sequence tasks.
86
+ activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
87
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
88
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
89
+ d_model (`int`, *optional*, defaults to 512):
90
+ Dimensionality of the layers and the pooler layer.
91
+ dropout (`float`, *optional*, defaults to 0.1):
92
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
93
+ attention_dropout (`float`, *optional*, defaults to 0.1):
94
+ The dropout ratio for the attention probabilities.
95
+ activation_dropout (`float`, *optional*, defaults to 0.1):
96
+ The dropout ratio for activations inside the fully connected layer.
97
+ init_std (`float`, *optional*, defaults to 0.02):
98
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
99
+ decoder_start_token_id (`int`, *optional*, defaults to 2):
100
+ The initial token ID of the decoder when decoding sequences.
101
+ scale_embedding (`bool`, *optional*, defaults to `True`):
102
+ Whether the embeddings are scaled by the square root of `d_model`.
103
+ pad_token_id (`int`, *optional*, defaults to 1):
104
+ Padding token id.
105
+ bos_token_id (`int`, *optional*, defaults to 0):
106
+ The id of the beginning-of-sequence token.
107
+ eos_token_id (`int`, *optional*, defaults to 2):
108
+ The id of the end-of-sequence token.
109
+ max_source_positions (`int`, *optional*, defaults to 6000):
110
+ The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
111
+ max_target_positions (`int`, *optional*, defaults to 1024):
112
+ The maximum sequence length that this model might ever be used with. Typically, set this to something large
113
+ just in case (e.g., 512 or 1024 or 2048).
114
+ num_conv_layers (`int`, *optional*, defaults to 2):
115
+ Number of 1D convolutional layers in the conv module.
116
+ conv_kernel_sizes (`Tuple[int]`, *optional*, defaults to `(5, 5)`):
117
+ A tuple of integers defining the kernel size of each 1D convolutional layer in the conv module. The length
118
+ of `conv_kernel_sizes` has to match `num_conv_layers`.
119
+ conv_channels (`int`, *optional*, defaults to 1024):
120
+ An integer defining the number of output channels of each convolutional layer except the final one in the
121
+ conv module.
122
+ input_feat_per_channel (`int`, *optional*, defaults to 80):
123
+ An integer specifying the size of the feature vector. This is also the dimension of the log-mel filter-bank
124
+ features.
125
+ input_channels (`int`, *optional*, defaults to 1):
126
+ An integer specifying the number of input channels of the input feature vector.
127
+
128
+ Example:
129
+
130
+ ```python
131
+ >>> from transformers import Speech2TextConformerConfig, ConformerEncoderDecoderModel
132
+
133
+ >>> # Initializing a configuration with default params
134
+ >>> configuration = Speech2TextConformerConfig()
135
+
136
+ >>> # Initializing a model (with random weights) from the default configuration
137
+ >>> model = ConformerEncoderDecoderModel(configuration)
138
+
139
+ >>> # Accessing the model configuration
140
+ >>> configuration = model.config
141
+ ```"""
142
+
143
+ model_type = "conformer_encoder_decoder"
144
+ keys_to_ignore_at_inference = ["past_key_values"]
145
+ attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
146
+
147
+ def __init__(
148
+ self,
149
+ vocab_size=10000,
150
+ encoder_layers=12,
151
+ feed_forward_expansion_factor=4,
152
+ conv_expansion_factor=2,
153
+ conformer_feedforward_dropout=0.1,
154
+ conformer_attention_dropout=0.1,
155
+ conformer_conv_dropout=0.1,
156
+ conformer_conv_kernel_size=31,
157
+ conformer_half_step_residual=True,
158
+ no_syncbatchnorm=False,
159
+ batch_unsafe_relative_shift=False,
160
+ ctc_compress_strategy="none",
161
+ ctc_compress_fixed_ratio=4,
162
+ ctc_compress_max_out_size=-1,
163
+ encoder_attention_heads=8,
164
+ decoder_layers=6,
165
+ decoder_ffn_dim=2048,
166
+ decoder_attention_heads=8,
167
+ decoder_layerdrop=0.0,
168
+ use_cache=True,
169
+ is_encoder_decoder=True,
170
+ activation_function="relu",
171
+ d_model=512,
172
+ dropout=0.1,
173
+ attention_dropout=0.1,
174
+ activation_dropout=0.1,
175
+ init_std=0.02,
176
+ decoder_start_token_id=2,
177
+ scale_embedding=True,
178
+ pad_token_id=1,
179
+ bos_token_id=0,
180
+ eos_token_id=2,
181
+ max_source_positions=6000,
182
+ max_target_positions=1024,
183
+ num_conv_layers=2,
184
+ conv_kernel_sizes=(5, 5),
185
+ conv_channels=1024,
186
+ input_feat_per_channel=80,
187
+ input_channels=1,
188
+ **kwargs,
189
+ ):
190
+ self.vocab_size = vocab_size
191
+ self.d_model = d_model
192
+ self.feed_forward_expansion_factor = feed_forward_expansion_factor
193
+ self.conv_expansion_factor = conv_expansion_factor
194
+ self.conformer_feedforward_dropout = conformer_feedforward_dropout
195
+ self.conformer_attention_dropout = conformer_attention_dropout
196
+ self.conformer_conv_dropout = conformer_conv_dropout
197
+ self.conformer_conv_kernel_size = conformer_conv_kernel_size
198
+ self.conformer_half_step_residual = conformer_half_step_residual
199
+ self.no_syncbatchnorm = no_syncbatchnorm
200
+ self.batch_unsafe_relative_shift = batch_unsafe_relative_shift
201
+ self.ctc_compress_strategy = ctc_compress_strategy
202
+ self.ctc_compress_fixed_ratio = ctc_compress_fixed_ratio
203
+ self.ctc_compress_max_out_size = ctc_compress_max_out_size
204
+ self.encoder_layers = encoder_layers
205
+ self.encoder_attention_heads = encoder_attention_heads
206
+ self.decoder_ffn_dim = decoder_ffn_dim
207
+ self.decoder_layers = decoder_layers
208
+ self.decoder_attention_heads = decoder_attention_heads
209
+ self.dropout = dropout
210
+ self.attention_dropout = attention_dropout
211
+ self.activation_dropout = activation_dropout
212
+ self.activation_function = activation_function
213
+ self.init_std = init_std
214
+ self.decoder_layerdrop = decoder_layerdrop
215
+ self.use_cache = use_cache
216
+ self.num_hidden_layers = encoder_layers
217
+ self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
218
+ self.max_source_positions = max_source_positions
219
+ self.max_target_positions = max_target_positions
220
+ self.num_conv_layers = num_conv_layers
221
+ self.conv_kernel_sizes = list(conv_kernel_sizes)
222
+ self.conv_channels = conv_channels
223
+ self.input_feat_per_channel = input_feat_per_channel
224
+ self.input_channels = input_channels
225
+
226
+ if self.ctc_compress_strategy not in ['none', 'avg', 'weighted', 'softmax', 'fixed']:
227
+ raise ValueError(
228
+ f"Configuration value for ctc_compress_strategy is invalid. `{self.ctc_compress_strategy}` is set, "
229
+ f"but the allowed values are: `none`, `avg`, `weighted`, `softmax`, `fixed`.")
230
+
231
+ if len(self.conv_kernel_sizes) != self.num_conv_layers:
232
+ raise ValueError(
233
+ "Configuration for convolutional module is incorrect. "
234
+ "It is required that `len(config.conv_kernel_sizes)` == `config.num_conv_layers` "
235
+ f"but is `len(config.conv_kernel_sizes) = {len(self.conv_kernel_sizes)}`, "
236
+ f"`config.num_conv_layers = {self.num_conv_layers}`."
237
+ )
238
+
239
+ super().__init__(
240
+ pad_token_id=pad_token_id,
241
+ bos_token_id=bos_token_id,
242
+ eos_token_id=eos_token_id,
243
+ is_encoder_decoder=is_encoder_decoder,
244
+ decoder_start_token_id=decoder_start_token_id,
245
+ **kwargs,
246
+ )
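A brief usage sketch of the configuration class above, assuming `configuration_conformer.py` is importable from the working directory; it overrides the FBK-fairseq base defaults with the values used in `config.json` and shows the `ctc_compress_strategy` validation performed in `__init__`:

```python
from configuration_conformer import Speech2TextConformerConfig

# Override the base defaults with the values used in config.json above.
cfg = Speech2TextConformerConfig(
    vocab_size=16004, d_model=1024, encoder_attention_heads=16, decoder_ffn_dim=4096
)
print(cfg.num_hidden_layers)  # 12, mirrored from encoder_layers
print(cfg.conv_kernel_sizes)  # [5, 5]

# Invalid CTC compression strategies are rejected at construction time.
try:
    Speech2TextConformerConfig(ctc_compress_strategy="mean")
except ValueError as err:
    print(err)
```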
conformer_model.py ADDED
@@ -0,0 +1,1106 @@
1
+ # Copyright 2024 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+ # the code below contains parts copied from the Conformer implementation in
15
+ # https://github.com/hlt-mt/FBK-fairseq/blob/master/examples/speech_to_text/models/conformer.py
16
+ import math
17
+ from itertools import groupby
18
+ from typing import Union, Tuple, Optional
19
+
20
+ import torch
21
+ import transformers
22
+ from torch import nn, Tensor
23
+ from torch.nn import CrossEntropyLoss, functional as F
24
+
25
+ from transformers import Speech2TextPreTrainedModel, add_start_docstrings, GenerationMixin, Speech2TextProcessor, \
26
+ Speech2TextTokenizer, Speech2TextFeatureExtractor
27
+ from transformers.modeling_outputs import Seq2SeqModelOutput, BaseModelOutput, Seq2SeqLMOutput
28
+ from transformers.models.speech_to_text.modeling_speech_to_text import Speech2TextDecoder, \
29
+ SPEECH_TO_TEXT_INPUTS_DOCSTRING, shift_tokens_right
30
+ from transformers.utils import replace_return_docstrings, add_start_docstrings_to_model_forward, logging
31
+
32
+ from .configuration_conformer import Speech2TextConformerConfig
33
+
34
+
35
+ logger = logging.get_logger(__name__)
36
+
37
+ _CONFIG_FOR_DOC = "Speech2TextConformerConfig"
38
+
39
+ CONFORMER_START_DOCSTRING = r"""
40
+ This model is an implementation of an attention-based autoregressive encoder-decoder model, in which the encoder
41
+ is a Conformer Encoder and the decoder is a Transformer Decoder. The encoder expects 80-feature spectrograms as input,
42
+ like [`Speech2TextModel`], and its implementation follows that of the paper:
43
+
44
+ `"When Good and Reproducible Results are a Giant with Feet of Clay: The Importance of Software Quality in NLP"
45
+ (Papi, et al, ACL 2024) <https://aclanthology.org/2024.acl-long.200/>`_.
46
+
47
+ This ensures consistency of results regardless of the presence of padding.
48
+
49
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
50
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
51
+ etc.)
52
+
53
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
54
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
55
+ and behavior.
56
+
57
+ Parameters:
58
+ config ([`Speech2TextConformerConfig`]):
59
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
60
+ load the weights associated with the model, only the configuration. Check out the
61
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
62
+ """
63
+
64
+
65
+ class Conv1dSubsampler(nn.Module):
66
+ """Convolutional subsampler: a stack of 1D convolution (along temporal
67
+ dimension) followed by non-linear activation via gated linear units
68
+ (https://arxiv.org/abs/1911.08460)
69
+ """
70
+
71
+ def __init__(self, config: Speech2TextConformerConfig):
72
+ super(Conv1dSubsampler, self).__init__()
73
+ self.n_layers = len(config.conv_kernel_sizes)
74
+ in_channels = config.input_feat_per_channel * config.input_channels
75
+ mid_channels = config.conv_channels
76
+ out_channels = config.d_model
77
+ self.conv_layers = nn.ModuleList(
78
+ nn.Conv1d(
79
+ in_channels if i == 0 else mid_channels // 2,
80
+ mid_channels if i < self.n_layers - 1 else out_channels * 2,
81
+ k,
82
+ stride=2,
83
+ padding=k // 2,
84
+ )
85
+ for i, k in enumerate(config.conv_kernel_sizes)
86
+ )
87
+
88
+ @staticmethod
89
+ def subsampled_sequence_len(seq_lens, kernel_size=5, padding=1, stride=2):
90
+ compressed_seq_lens = seq_lens.clone()
91
+ return ((compressed_seq_lens.float() - kernel_size + 2 * padding) / stride + 1).floor().long()
92
+
93
+ @staticmethod
94
+ def lengths_to_padding_mask(lens: torch.LongTensor) -> torch.BoolTensor:
95
+ bsz, max_lens = lens.size(0), torch.max(lens).item()
96
+ mask = torch.arange(max_lens).to(lens.device).view(1, max_lens)
97
+ mask = mask.expand(bsz, -1) >= lens.view(bsz, 1).expand(-1, max_lens)
98
+ return mask
99
+
100
+ def forward(self, src_tokens: torch.FloatTensor, padding_mask: torch.IntTensor) -> torch.Tensor:
101
+ x = src_tokens.transpose(1, 2).contiguous() # B x T x (C x D) -> B x (C x D) x T
102
+ actual_src_lengths = padding_mask.sum(dim=1)
103
+ for conv in self.conv_layers:
104
+ x = conv(x)
105
+ x = nn.functional.glu(x, dim=1)
106
+ actual_src_lengths = self.subsampled_sequence_len(
107
+ actual_src_lengths,
108
+ kernel_size=conv.kernel_size[0],
109
+ padding=conv.padding[0],
110
+ stride=conv.stride[0])
111
+ x = x.masked_fill(
112
+ self.lengths_to_padding_mask(actual_src_lengths).unsqueeze(1), 0)
113
+ x = x.transpose(1, 2).transpose(0, 1).contiguous() # -> T x B x (C x D)
114
+ return x
115
+
116
+
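The two stride-2 convolutions above roughly quarter the time axis. A minimal, self-contained sketch of the length computation, assuming kernel size 5 (hence padding 2) as set by `conv_kernel_sizes` in `config.json`:

```python
import torch

def subsampled_len(seq_lens, kernel_size=5, padding=2, stride=2):
    # mirrors Conv1dSubsampler.subsampled_sequence_len above
    return ((seq_lens.float() - kernel_size + 2 * padding) / stride + 1).floor().long()

lens = torch.tensor([3000])                 # e.g. 30 s of 10 ms filter-bank frames
after_first = subsampled_len(lens)          # tensor([1500])
after_second = subsampled_len(after_first)  # tensor([750]), ~4x shorter overall
print(after_first, after_second)
```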
117
+ class PositionalEncoding(nn.Module):
118
+ """
119
+ Positional Encoding proposed in "Attention Is All You Need".
120
+ "Attention Is All You Need" use sine and cosine functions of different frequencies:
121
+ PE_(pos, 2i) = sin(pos / power(10000, 2i / d_model))
122
+ PE_(pos, 2i+1) = cos(pos / power(10000, 2i / d_model))
123
+ The version implemented in fairseq differs slightly from the paper; this implementation is faithful to the
124
+ original one. Please see
125
+ :func:`~fairseq.modules.sinusoidal_positional_embedding.SinusoidalPositionalEmbedding.get_embedding` for more
126
+ details.
127
+ """
128
+
129
+ def __init__(self, d_model: int = 512, max_len: int = 10000) -> None:
130
+ super(PositionalEncoding, self).__init__()
131
+ pe = torch.zeros(max_len, d_model, requires_grad=False)
132
+ position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
133
+ div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
134
+ pe[:, 0::2] = torch.sin(position * div_term)
135
+ pe[:, 1::2] = torch.cos(position * div_term)
136
+ pe = pe.unsqueeze(0)
137
+ self.register_buffer('pe', pe)
138
+
139
+ def forward(self, length: int) -> Tensor:
140
+ return self.pe[:, :length]
141
+
142
+
143
+ class RelativeMultiHeadAttention(nn.Module):
144
+ """
145
+ Multi-head attention with relative positional encoding.
146
+ This concept was proposed in the `"Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
147
+ <https://arxiv.org/pdf/1901.02860.pdf>`_.
148
+
149
+ Args:
150
+ d_model (int): The dimension of model
151
+ num_heads (int): The number of attention heads.
152
+ dropout_p (float): probability of dropout
153
+
154
+ Inputs: query, key, value, pos_embedding, mask
155
+ query (batch, time, dim): Tensor containing query vector
156
+ key (batch, time, dim): Tensor containing key vector
157
+ value (batch, time, dim): Tensor containing value vector
158
+ pos_embedding (batch, time, dim): Positional embedding tensor
159
+ mask (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked
160
+
161
+ Returns:
162
+ **outputs**: Tensor produced by the relative multi-head attention module.
163
+ """
164
+
165
+ def __init__(
166
+ self,
167
+ d_model: int = 512,
168
+ num_heads: int = 16,
169
+ dropout_p: float = 0.1,
170
+ batch_unsafe_relative_shift: bool = False
171
+ ):
172
+ super(RelativeMultiHeadAttention, self).__init__()
173
+ assert d_model % num_heads == 0, "d_model % num_heads should be zero."
174
+ self.d_model = d_model
175
+ self.d_head = int(d_model / num_heads)
176
+ self.num_heads = num_heads
177
+ self.sqrt_dim = math.sqrt(d_model)
178
+
179
+ self.query_proj = nn.Linear(d_model, d_model)
180
+ nn.init.xavier_uniform_(self.query_proj.weight)
181
+ nn.init.zeros_(self.query_proj.bias)
182
+ self.key_proj = nn.Linear(d_model, d_model)
183
+ nn.init.xavier_uniform_(self.key_proj.weight)
184
+ nn.init.zeros_(self.key_proj.bias)
185
+ self.value_proj = nn.Linear(d_model, d_model)
186
+ nn.init.xavier_uniform_(self.value_proj.weight)
187
+ nn.init.zeros_(self.value_proj.bias)
188
+ self.pos_proj = nn.Linear(d_model, d_model, bias=False)
189
+ nn.init.xavier_uniform_(self.pos_proj.weight)
190
+
191
+ self.dropout = nn.Dropout(p=dropout_p)
192
+ # u and v are the trainable parameters of the Transformer-XL attention computation
193
+ self.u_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head))
194
+ self.v_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head))
195
+ nn.init.xavier_uniform_(self.u_bias)
196
+ nn.init.xavier_uniform_(self.v_bias)
197
+
198
+ self.out_proj = nn.Linear(d_model, d_model)
199
+ nn.init.xavier_uniform_(self.out_proj.weight)
200
+ nn.init.zeros_(self.out_proj.bias)
201
+ self.relative_shift_func = self._relative_shift_unsafe if batch_unsafe_relative_shift else self._relative_shift
202
+
203
+ def forward(
204
+ self,
205
+ query: Tensor,
206
+ key: Tensor,
207
+ value: Tensor,
208
+ pos_embedding: Tensor,
209
+ mask: Optional[Tensor] = None,
210
+ ) -> Tuple[Tensor, Tensor]:
211
+ batch_size = value.size(0)
212
+
213
+ query = self.query_proj(query).view(batch_size, -1, self.num_heads, self.d_head)
214
+ key = self.key_proj(key).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3)
215
+ value = self.value_proj(value).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3)
216
+ pos_embedding = self.pos_proj(pos_embedding).view(batch_size, -1, self.num_heads, self.d_head)
217
+
218
+ # Attention weights computation using Q + u as in Transformer-XL
219
+ content_score = torch.matmul((query + self.u_bias).transpose(1, 2), key.transpose(2, 3))
220
+ # Relative positional weights computation using Q + v as in Transformer-XL
221
+ pos_score = torch.matmul((query + self.v_bias).transpose(1, 2), pos_embedding.permute(0, 2, 3, 1))
222
+ # Right shifting mechanism described in Transformer-XL
223
+ pos_score = self.relative_shift_func(pos_score, mask)
224
+ # Final attention weights obtained summing the attention with its relative positional embeddings
225
+ score = (content_score + pos_score) / self.sqrt_dim
226
+
227
+ if mask is not None:
228
+ mask = mask.unsqueeze(1)
229
+ score.masked_fill_(mask, -1e9 if mask.dtype == torch.float32 else -1e4)
230
+
231
+ attn = F.softmax(score, dim=-1)
232
+ # set to 0.0 all attention weights of padding elements
233
+ if mask is not None:
234
+ attn = attn.masked_fill(mask, 0.0)
235
+ attn = self.dropout(attn)
236
+
237
+ # Attention computation
238
+ context = torch.matmul(attn, value).transpose(1, 2)
239
+ context = context.contiguous().view(batch_size, -1, self.d_model)
240
+
241
+ return self.out_proj(context), attn
242
+
243
+ def _relative_shift(self, pos_score: Tensor, padding_mask: Tensor) -> Tensor:
244
+ """
245
+ This method performs the relative shift operation row-wise.
246
+ Although inefficient, it ensures that each row is shifted accounting for its padding,
247
+ so that the result does not change depending on whether a given row
248
+ is padded or not.
249
+ """
250
+ batch_size, num_heads, seq_length1, seq_length2 = pos_score.size()
251
+ assert seq_length1 == seq_length2, "Currently we support only self-attention"
252
+ zeros = pos_score.new_zeros(batch_size, num_heads, seq_length1, 1)
253
+ padded_pos_score = torch.cat([zeros, pos_score], dim=-1)
254
+
255
+ seq_lengths = (seq_length1 - (padding_mask[:, :, 0]).sum(-1)).tolist()
256
+ for b_i in range(batch_size):
257
+ padded_batch_pos_scores = padded_pos_score[b_i, :, :seq_lengths[b_i], :seq_lengths[b_i] + 1]
258
+ padded_batch_pos_scores = padded_batch_pos_scores.reshape(num_heads, seq_lengths[b_i] + 1, seq_lengths[b_i])
259
+ pos_score[b_i, :, :seq_lengths[b_i], :seq_lengths[b_i]] = padded_batch_pos_scores[:, 1:, :]
260
+ pos_score.masked_fill_(padding_mask.unsqueeze(1), 0.0)
261
+ return pos_score
262
+
263
+ def _relative_shift_unsafe(self, pos_score: Tensor, padding_mask: Tensor) -> Tensor:
264
+ """
265
+ This implementation reflects other open source ones (e.g. fairseq), which
266
+ shift the values from the row above in the batch. Although efficient,
267
+ this leads to inconsistencies in the results, as the same row has different
268
+ values according to whether it is padded (and how much it is) or not.
269
+ """
270
+ batch_size, num_heads, seq_length1, seq_length2 = pos_score.size()
271
+ zeros = pos_score.new_zeros(batch_size, num_heads, seq_length1, 1)
272
+ padded_pos_score = torch.cat([zeros, pos_score], dim=-1)
273
+
274
+ padded_pos_score = padded_pos_score.view(batch_size, num_heads, seq_length2 + 1, seq_length1)
275
+ pos_score = padded_pos_score[:, :, 1:].view_as(pos_score)
276
+
277
+ return pos_score
278
+
279
+
280
+ class MultiHeadedSelfAttentionModule(nn.Module):
281
+ """
282
+ Conformer employs multi-headed self-attention (MHSA) while integrating an important technique from Transformer-XL,
283
+ the relative sinusoidal positional encoding scheme. The relative positional encoding allows the self-attention
284
+ module to generalize better to different input lengths, and the resulting encoder is more robust to the variance of
285
+ the utterance length. Conformer uses pre-norm residual units with dropout, which helps training
286
+ and regularizing deeper models.
287
+
288
+ Args:
289
+ d_model (int): The dimension of model
290
+ num_heads (int): The number of attention heads.
291
+ dropout_p (float): probability of dropout
292
+
293
+ Inputs: inputs, mask
294
+ x (batch, time, dim): Tensor containing input vector
295
+ mask (batch, time1, time2): Tensor containing indices to be masked
296
+
297
+ Returns:
298
+ **outputs** (batch, time, dim): Tensor produced by the relative multi-headed self-attention module.
299
+ """
300
+ def __init__(self, d_model: int, num_heads: int, dropout_p: float = 0.1, batch_unsafe_relative_shift: bool = False):
301
+ super(MultiHeadedSelfAttentionModule, self).__init__()
302
+ self.positional_encoding = PositionalEncoding(d_model)
303
+ self.layer_norm = nn.LayerNorm(d_model)
304
+ self.attention = RelativeMultiHeadAttention(d_model, num_heads, dropout_p, batch_unsafe_relative_shift)
305
+ self.dropout = nn.Dropout(p=dropout_p)
306
+
307
+ def forward(
308
+ self, x: Tensor, encoder_padding_mask: Optional[Tensor] = None, output_attention: bool = False
309
+ ) -> Tuple[Tensor, Tensor]:
310
+ batch_size, seq_length, _ = x.size()
311
+ pos_embedding = self.positional_encoding(seq_length)
312
+ pos_embedding = pos_embedding.repeat(batch_size, 1, 1)
313
+ # we need attention padding mask (attn_mask) to be applied during the attention calculation,
314
+ # we obtain it from the encoder_padding_mask (B x T) by repeating it T times (x.shape[1]) and
315
+ # taking the logical or to correctly mask both T x T dimensions
316
+ att_mask = encoder_padding_mask.unsqueeze(1).repeat([1, x.shape[1], 1])
317
+ att_mask = att_mask.logical_or(att_mask.transpose(1, 2)) # B x T x T
318
+
319
+ x = self.layer_norm(x)
320
+ outputs, attn = self.attention(x, x, x, pos_embedding=pos_embedding, mask=att_mask)
321
+
322
+ return self.dropout(outputs), attn if output_attention else None
323
+
324
+
325
+ class FeedForwardModule(nn.Module):
326
+ """
327
+ The Conformer Feed Forward module follows pre-norm residual units and applies layer normalization within the residual unit
328
+ and on the input before the first linear layer. This module also applies the Swish activation and dropout, which help
329
+ regularize the network.
330
+
331
+ Args:
332
+ encoder_dim (int): Dimension of conformer encoder
333
+ expansion_factor (int): Expansion factor of feed forward module.
334
+ dropout_p (float): Ratio of dropout
335
+
336
+ Inputs: inputs
337
+ x (batch, time, dim): Tensor contains input sequences
338
+
339
+ Outputs: outputs
340
+ **outputs** (batch, time, dim): Tensor produced by the feed forward module.
341
+ """
342
+
343
+ def __init__(
344
+ self,
345
+ encoder_dim: int = 512,
346
+ expansion_factor: int = 4,
347
+ dropout_p: float = 0.1,
348
+ ) -> None:
349
+ super(FeedForwardModule, self).__init__()
350
+ self.layernorm = nn.LayerNorm(encoder_dim)
351
+ self.dropout_module = nn.Dropout(p=dropout_p)
352
+ self.first_linear = nn.Linear(encoder_dim, encoder_dim * expansion_factor, bias=True)
353
+ nn.init.xavier_uniform_(self.first_linear.weight)
354
+ nn.init.zeros_(self.first_linear.bias)
355
+ self.second_linear = nn.Linear(encoder_dim * expansion_factor, encoder_dim, bias=True)
356
+ nn.init.xavier_uniform_(self.second_linear.weight)
357
+ nn.init.zeros_(self.second_linear.bias)
358
+
359
+ def forward(self, x: Tensor) -> Tensor:
360
+ x = self.layernorm(x)
361
+ x = self.first_linear(x)
362
+ x = F.silu(x)
363
+ x = self.dropout_module(x)
364
+ x = self.second_linear(x)
365
+ x = self.dropout_module(x)
366
+ return x
367
+
368
+
369
+ class ConformerConvModule(nn.Module):
370
+ """
371
+ The Conformer convolution module starts with the first pointwise convolution and a gated linear unit (GLU).
372
+ This is followed by a single 1-D depthwise convolution layer. Batchnorm is deployed just after the convolution
373
+ to aid the training of deep models. Then, the Swish (or SiLU) activation function is applied, followed by the second
374
+ pointwise convolution. The Dropout module is applied at the end.
375
+
376
+ Args:
377
+ in_channels (int): Number of channels in the input
378
+ kernel_size (int or tuple, optional): Size of the convolving kernel Default: 31
379
+ dropout_p (float, optional): probability of dropout
380
+
381
+ Inputs: inputs
382
+ x (batch, time, dim): Tensor contains input sequences
383
+
384
+ Outputs: outputs
385
+ **outputs** (batch, time, dim): Tensor produced by the conformer convolution module.
386
+ """
387
+ def __init__(
388
+ self,
389
+ in_channels: int,
390
+ kernel_size: int = 31,
391
+ expansion_factor: int = 2,
392
+ dropout_p: float = 0.1,
393
+ no_syncbatchnorm: bool = False,
394
+ ) -> None:
395
+ super(ConformerConvModule, self).__init__()
396
+ assert (kernel_size - 1) % 2 == 0, "kernel_size should be an odd number for 'SAME' padding"
397
+ assert expansion_factor == 2, "Currently, only supports expansion_factor 2"
398
+ self.layernorm = nn.LayerNorm(in_channels)
399
+ self.batchnorm = nn.SyncBatchNorm(in_channels) if not no_syncbatchnorm else nn.BatchNorm1d(in_channels)
400
+ self.first_pointwise_conv1d = nn.Conv1d(
401
+ in_channels=in_channels,
402
+ out_channels=in_channels * expansion_factor,
403
+ kernel_size=(1, ),
404
+ stride=(1, ),
405
+ padding=0,
406
+ bias=True,
407
+ )
408
+ self.second_pointwise_conv1d = nn.Conv1d(
409
+ in_channels=in_channels,
410
+ out_channels=in_channels,
411
+ kernel_size=(1, ),
412
+ stride=(1, ),
413
+ padding=0,
414
+ bias=True,
415
+ )
416
+ self.depthwise_conv1d = nn.Conv1d(
417
+ in_channels=in_channels,
418
+ out_channels=in_channels,
419
+ kernel_size=(kernel_size, ),
420
+ stride=(1, ),
421
+ groups=in_channels,
422
+ padding=(kernel_size - 1) // 2,
423
+ bias=False,
424
+ )
425
+ self.dropout_module = nn.Dropout(p=dropout_p)
426
+
427
+ def forward(self, x: Tensor, encoder_padding_mask: Tensor) -> Tensor:
428
+ x = self.layernorm(x).transpose(1, 2)
429
+ x = self.first_pointwise_conv1d(x)
430
+ x = F.glu(x, dim=1)
431
+ bool_padding_mask = None
432
+ if encoder_padding_mask is not None:
433
+ bool_padding_mask = encoder_padding_mask.unsqueeze(1).bool()
434
+ if bool_padding_mask is not None:
435
+ x = x.float().masked_fill(bool_padding_mask, 0.0)
436
+ x = self.depthwise_conv1d(x)
437
+ if bool_padding_mask is not None:
438
+ x = x.float().masked_fill(bool_padding_mask, 0.0)
439
+ x = self.batchnorm(x)
440
+ if bool_padding_mask is not None:
441
+ x = x.float().masked_fill(bool_padding_mask, 0.0)
442
+ x = F.silu(x)
443
+ x = self.second_pointwise_conv1d(x)
444
+ if bool_padding_mask is not None:
445
+ x = x.float().masked_fill(bool_padding_mask, 0.0)
446
+ x = self.dropout_module(x)
447
+ return x.transpose(1, 2)
448
+
449
+
450
+ class ConformerEncoderLayer(nn.Module):
451
+ """
452
+ The Conformer block contains two Feed Forward modules sandwiching the Multi-Headed Self-Attention module
453
+ and the Convolution module. This sandwich structure is inspired by Macaron-Net, which proposes replacing
454
+ the original feed-forward layer in the Transformer block with two half-step feed-forward layers,
455
+ one before the attention layer and one after.
456
+
457
+ Args:
458
+ encoder_dim (int, optional): Dimension of conformer encoder
459
+ num_attention_heads (int, optional): Number of attention heads
460
+ feed_forward_expansion_factor (int, optional): Expansion factor of feed forward module
461
+ conv_expansion_factor (int, optional): Expansion factor of conformer convolution module
462
+ feed_forward_dropout_p (float, optional): Probability of feed forward module dropout
463
+ attention_dropout_p (float, optional): Probability of attention module dropout
464
+ conv_dropout_p (float, optional): Probability of conformer convolution module dropout
465
+ conv_kernel_size (int or tuple, optional): Size of the convolving kernel
466
+ half_step_residual (bool): Flag indicating whether to use the half-step residual or not
467
+
468
+ Inputs: inputs
469
+ x (time, batch, dim): Tensor containing input vector
470
+
471
+ Returns: outputs
472
+ **outputs** (batch, time, dim): Tensor produced by the conformer block.
473
+ """
474
+
475
+ def __init__(self, config: Speech2TextConformerConfig):
476
+ super().__init__()
477
+ self.encoder_dim = config.d_model
478
+ self.num_attention_heads = config.encoder_attention_heads
479
+ self.feed_forward_expansion_factor = config.feed_forward_expansion_factor
480
+ self.conv_expansion_factor = config.conv_expansion_factor
481
+ self.feed_forward_dropout_p = config.conformer_feedforward_dropout
482
+ self.attention_dropout_p = config.conformer_attention_dropout
483
+ self.conv_dropout_p = config.conformer_conv_dropout
484
+ self.conv_kernel_size = config.conformer_conv_kernel_size
485
+ self.half_step_residual = config.conformer_half_step_residual
486
+ self.no_syncbatchnorm = config.no_syncbatchnorm
487
+ self.batch_unsafe_relative_shift = getattr(config, 'batch_unsafe_relative_shift', False)
488
+
489
+ if self.half_step_residual:
490
+ self.feed_forward_residual_factor = 0.5
491
+ else:
492
+ self.feed_forward_residual_factor = 1
493
+
494
+ self.first_feed_forward = FeedForwardModule(
495
+ encoder_dim=self.encoder_dim,
496
+ expansion_factor=self.feed_forward_expansion_factor,
497
+ dropout_p=self.feed_forward_dropout_p,
498
+ )
499
+
500
+ self.attention = MultiHeadedSelfAttentionModule(
501
+ d_model=self.encoder_dim,
502
+ num_heads=self.num_attention_heads,
503
+ dropout_p=self.attention_dropout_p,
504
+ batch_unsafe_relative_shift=self.batch_unsafe_relative_shift,
505
+ )
506
+
507
+ self.conv_module = ConformerConvModule(
508
+ in_channels=self.encoder_dim,
509
+ kernel_size=self.conv_kernel_size,
510
+ expansion_factor=self.conv_expansion_factor,
511
+ dropout_p=self.conv_dropout_p,
512
+ no_syncbatchnorm=self.no_syncbatchnorm,
513
+ )
514
+
515
+ self.second_feed_forward = FeedForwardModule(
516
+ encoder_dim=self.encoder_dim,
517
+ expansion_factor=self.feed_forward_expansion_factor,
518
+ dropout_p=self.feed_forward_dropout_p,
519
+ )
520
+
521
+ self.layernorm = nn.LayerNorm(self.encoder_dim)
522
+
523
+ def forward(
524
+ self, x: Tensor, encoder_padding_mask: Tensor, output_attentions: bool = False
525
+ ) -> Tuple[Tensor, Optional[Tensor]]:
526
+ x = x.transpose(0, 1) # B x T x C
527
+ new_x = self.first_feed_forward(x)
528
+ x = new_x * self.feed_forward_residual_factor + x
529
+ new_x, attn = self.attention(x, encoder_padding_mask, output_attentions)
530
+ x = new_x + x
531
+ new_x = self.conv_module(x, encoder_padding_mask)
532
+ x = new_x + x
533
+ new_x = self.second_feed_forward(x)
534
+ x = new_x * self.feed_forward_residual_factor + x
535
+ x = self.layernorm(x).transpose(1, 0)
536
+ return x, attn
537
+
538
+
539
+ class CTCCompressStrategy:
540
+ FIXED_RATIO = 4
541
+ @staticmethod
542
+ def new_lengths(batch_predicted):
543
+ return [len(p) for p in batch_predicted]
544
+
545
+ @staticmethod
546
+ def avg(prob_ctc, predicted, dtype, device):
547
+ new_lengths = CTCCompressStrategy.new_lengths(predicted)
548
+ new_maxlen = max(new_lengths)
549
+ weights_matrix = torch.zeros((prob_ctc.shape[0], prob_ctc.shape[1], new_maxlen), dtype=dtype)
550
+ for b_idx, pred in enumerate(predicted):
551
+ processed_inputs_cnt = 0
552
+ for t_idx, same in enumerate(pred):
553
+ new_processed_inputs_cnt = processed_inputs_cnt + same[1]
554
+ weights_matrix[b_idx, processed_inputs_cnt:new_processed_inputs_cnt, t_idx] = 1.0 / same[1]
555
+ processed_inputs_cnt = new_processed_inputs_cnt
556
+ return weights_matrix.to(device), new_lengths
557
+
558
+ @staticmethod
559
+ def weighted(prob_ctc, predicted, dtype, device):
560
+ new_lengths = CTCCompressStrategy.new_lengths(predicted)
561
+ new_maxlen = max(new_lengths)
562
+ weights_matrix = torch.zeros((prob_ctc.shape[0], prob_ctc.shape[1], new_maxlen), dtype=dtype, device=device)
563
+ for b_idx, pred in enumerate(predicted):
564
+ processed_inputs_cnt = 0
565
+ for t_idx, same in enumerate(pred):
566
+ new_processed_inputs_cnt = processed_inputs_cnt + same[1]
567
+ # Get the probabilities of the prediction for the different time steps as weight
568
+ weights = prob_ctc[b_idx, processed_inputs_cnt:new_processed_inputs_cnt, same[0]]
569
+ weights_matrix[b_idx, processed_inputs_cnt:new_processed_inputs_cnt, t_idx] = \
570
+ weights / weights.sum()
571
+ processed_inputs_cnt = new_processed_inputs_cnt
572
+ return weights_matrix, new_lengths
573
+
574
+ @staticmethod
575
+ def softmax(prob_ctc, predicted, dtype, device):
576
+ new_lengths = CTCCompressStrategy.new_lengths(predicted)
577
+ new_maxlen = max(new_lengths)
578
+ weights_matrix = torch.zeros((prob_ctc.shape[0], prob_ctc.shape[1], new_maxlen), dtype=dtype, device=device)
579
+ for b_idx, pred in enumerate(predicted):
580
+ processed_inputs_cnt = 0
581
+ for t_idx, same in enumerate(pred):
582
+ new_processed_inputs_cnt = processed_inputs_cnt + same[1]
583
+ # Get the probabilities of the prediction for the different time steps as weight
584
+ weights = F.softmax(prob_ctc[b_idx, processed_inputs_cnt:new_processed_inputs_cnt, same[0]])
585
+ weights_matrix[b_idx, processed_inputs_cnt:new_processed_inputs_cnt, t_idx] = \
586
+ weights / weights.sum()
587
+ processed_inputs_cnt = new_processed_inputs_cnt
588
+ return weights_matrix, new_lengths
589
+
590
+ @staticmethod
591
+ def fixed(prob_ctc, predicted, dtype, device):
592
+ new_maxlen = math.ceil(prob_ctc.shape[1] / CTCCompressStrategy.FIXED_RATIO)
593
+ weights_matrix = torch.zeros((prob_ctc.shape[0], prob_ctc.shape[1], new_maxlen), dtype=dtype)
594
+ new_lengths = []
595
+ for b_idx, pred in enumerate(predicted):
596
+ original_len = sum(x[1] for x in pred)
597
+ new_len = 0
598
+ for new_t_idx in range(new_maxlen):
599
+ processed_inputs_cnt = new_t_idx * CTCCompressStrategy.FIXED_RATIO
600
+ processed_inputs_cnt_end = processed_inputs_cnt + CTCCompressStrategy.FIXED_RATIO
601
+ if processed_inputs_cnt_end > original_len:
602
+ processed_inputs_cnt_end = original_len
603
+ weights_matrix[b_idx, processed_inputs_cnt:processed_inputs_cnt_end, new_t_idx] = \
604
+ 1.0 / (processed_inputs_cnt_end - processed_inputs_cnt)
605
+ new_len += 1
606
+ if processed_inputs_cnt_end == original_len:
607
+ break
608
+ new_lengths.append(new_len)
609
+ return weights_matrix.to(device), new_lengths
610
+
611
+
612
+ class ConformerEncoderDecoderPreTrainedModel(Speech2TextPreTrainedModel):
613
+ config_class = Speech2TextConformerConfig
614
+
615
+
616
+ class ConformerEncoder(ConformerEncoderDecoderPreTrainedModel):
617
+ """
618
+ Conformer encoder consisting of *config.encoder_layers* layers. Each layer is a
619
+ [`ConformerEncoderLayer`].
620
+
621
+ Args:
622
+ config: Speech2TextConformerConfig
623
+ """
624
+
625
+ def __init__(self, config: Speech2TextConformerConfig):
626
+ super().__init__(config)
627
+
628
+ self.dropout = config.dropout
629
+
630
+ embed_dim = config.d_model
631
+ self.padding_idx = config.pad_token_id
632
+ self.max_source_positions = config.max_source_positions
633
+ self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
634
+
635
+ self.subsample = Conv1dSubsampler(config)
636
+
637
+ self.layers = nn.ModuleList([ConformerEncoderLayer(config) for _ in range(config.encoder_layers)])
638
+
639
+ self.ctc_flag = False
640
+ if config.ctc_compress_strategy != "none":
641
+ self.ctc_flag = True
642
+ self.ctc_fc = nn.Linear(config.encoder_embed_dim, config.src_vocab_size)
643
+ self.ctc_layer = config.ctc_encoder_layer
644
+ self.ctc_compress_method = getattr(CTCCompressStrategy, config.ctc_compress_strategy)
645
+ self.ctc_compress_max_out_size = config.ctc_compress_max_out_size
646
+ CTCCompressStrategy.FIXED_RATIO = config.ctc_compress_fixed_ratio
647
+
648
+ self.gradient_checkpointing = False
649
+ # Initialize weights and apply final processing
650
+ self.post_init()
651
+
652
+ def ensure_max_ctc_out_len(self, batch_predicted):
653
+ """
654
+ Ensures that the output of the CTC compression is not longer than the ctc_compress_max_out_size.
655
+ If there are samples violating this constraint, consecutive predictions are merged so as to shorten the sequence.
656
+ E.g., if the ctc_compress_max_out_size is set to 3 and the output of the CTC compression would be
657
+ 5 elements long, the first and second predictions are merged, as well as the third and the fourth. So, the
658
+ corresponding vectors will be merged according to the CTC compression strategy.
659
+ """
660
+ if self.ctc_compress_max_out_size > 0:
661
+
662
+ def merge_sublist(elements):
663
+ """
664
+ Takes a list of Tuples (predicted_element, num_corresponding_vectors) and returns
665
+ a single tuple with the predicted_element having the highest number of corresponding_vectors
666
+ (in case of a tie, the first is returned) and the total sum of the num_corresponding_vectors
667
+ E.g. if the input is [(a, 3), (b, 5), (c, 6), (a, 4)], the output will be (a, 18).
668
+ """
669
+ sum_num_vectors = 0
670
+ max_element = None
671
+ max_element_cnt = 0
672
+ temp_dict = {}
673
+ for predicted_element, num_corresponding_vectors in elements:
674
+ if predicted_element in temp_dict:
675
+ temp_dict[predicted_element] += num_corresponding_vectors
676
+ else:
677
+ temp_dict[predicted_element] = num_corresponding_vectors
678
+ if temp_dict[predicted_element] > max_element_cnt:
679
+ max_element_cnt = temp_dict[predicted_element]
680
+ max_element = predicted_element
681
+ sum_num_vectors += num_corresponding_vectors
682
+ return max_element, sum_num_vectors
683
+
684
+ for b_idx, p in enumerate(batch_predicted):
685
+ pred_len = len(p)
686
+ if pred_len > self.ctc_compress_max_out_size:
687
+ reduction_factor = math.ceil(pred_len / self.ctc_compress_max_out_size)
688
+ i = 0
689
+ new_p = []
690
+ while i < pred_len:
691
+ new_p.append(merge_sublist(p[i:i + reduction_factor]))
692
+ i += reduction_factor
693
+ batch_predicted[b_idx] = new_p
694
+
695
+ return batch_predicted
696
+
697
+ def average_same_ctc_features(self, x_ctc, x, input_lengths):
698
+ with torch.no_grad():
699
+ batch_predicted = []
700
+ prob_ctc = F.softmax(x_ctc, dim=-1).transpose(0, 1) # from T x B x D to B x T x D
701
+ for b in range(prob_ctc.shape[0]):
702
+ predicted = prob_ctc[b][: input_lengths[b]].argmax(-1).tolist()
703
+ batch_predicted.append([(p[0], len(list(p[1]))) for p in groupby(predicted)])
704
+ batch_predicted = self.ensure_max_ctc_out_len(batch_predicted)
705
+ weights_matrix, new_lengths = self.ctc_compress_method(
706
+ prob_ctc, batch_predicted, x.dtype, x.device)
707
+ # x is T x B x C -> B x C x T; weights_matrix is B x T x T'
708
+ compressed_output = x.permute(1, 2, 0).bmm(weights_matrix) # B x C x T'
709
+ return compressed_output.permute(2, 0, 1), input_lengths.new(new_lengths)
710
+
711
+ @staticmethod
712
+ def lengths_to_padding_mask(lens: torch.LongTensor) -> Tensor:
713
+ bsz, max_lens = lens.size(0), torch.max(lens).item()
714
+ mask = torch.arange(max_lens).to(lens.device).view(1, max_lens)
715
+ mask = mask.expand(bsz, -1) >= lens.view(bsz, 1).expand(-1, max_lens)
716
+ return mask
717
+
718
+ def apply_ctc(self, x, input_lengths):
719
+ x_ctc = self.ctc_fc(x)
720
+ x, input_lengths = self.average_same_ctc_features(x_ctc, x, input_lengths)
721
+ padding_mask = ConformerEncoder.lengths_to_padding_mask(input_lengths)
722
+ return x, x_ctc, padding_mask
723
+
724
+ def forward(
725
+ self,
726
+ input_features,
727
+ attention_mask=None,
728
+ head_mask=None,
729
+ output_attentions=None,
730
+ output_hidden_states=None,
731
+ return_dict=None,
732
+ ):
733
+ r"""
734
+ Args:
735
+ input_features (`torch.LongTensor` of shape `(batch_size, sequence_length, feature_size)`):
736
+ Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
737
+ obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
738
+ `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
739
+ `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the fbank features,
740
+ padding and conversion into a tensor of type `torch.FloatTensor`. See
741
+ [`~Speech2TextFeatureExtractor.__call__`]
742
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
743
+ Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
744
+ `[0, 1]`:
745
+
746
+ - 1 for tokens that are **not masked**,
747
+ - 0 for tokens that are **masked**.
748
+
749
+ [What are attention masks?](../glossary#attention-mask)
750
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
751
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
752
+
753
+ - 1 indicates the head is **not masked**,
754
+ - 0 indicates the head is **masked**.
755
+
756
+ output_attentions (`bool`, *optional*):
757
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
758
+ returned tensors for more detail.
759
+ output_hidden_states (`bool`, *optional*):
760
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
761
+ for more detail.
762
+ return_dict (`bool`, *optional*):
763
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
764
+ """
765
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
766
+ output_hidden_states = (
767
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
768
+ )
769
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
770
+ inputs_embeds = self.subsample(input_features, attention_mask)
771
+ inputs_embeds = self.embed_scale * inputs_embeds
772
+
773
+ # subsample attention mask if necessary
774
+ if attention_mask is not None:
775
+ attention_mask = self._get_feature_vector_attention_mask(inputs_embeds.shape[0], attention_mask)
776
+
777
+ hidden_states = nn.functional.dropout(inputs_embeds, p=self.dropout, training=self.training)
778
+
779
+ # expand attention_mask
780
+ if attention_mask is not None:
781
+ padding_mask = attention_mask.ne(1).long()
782
+ else:
783
+ padding_mask = torch.zeros(inputs_embeds.shape[:2], dtype=torch.long, device=inputs_embeds.device)
784
+
785
+ encoder_states = () if output_hidden_states else None
786
+ all_attentions = () if output_attentions else None
787
+
788
+ # TODO: implement head mask
789
+ assert head_mask is None, "Head masking is not yet implemented for Conformer model"
790
+
791
+ for idx, encoder_layer in enumerate(self.layers):
792
+ if output_hidden_states:
793
+ encoder_states = encoder_states + (hidden_states.transpose(0, 1),)
794
+ if self.gradient_checkpointing and self.training:
795
+ layer_outputs = self._gradient_checkpointing_func(
796
+ encoder_layer.__call__,
797
+ hidden_states,
798
+ padding_mask,
799
+ output_attentions,
800
+ )
801
+ else:
802
+ layer_outputs = encoder_layer(
803
+ hidden_states,
804
+ padding_mask,
805
+ output_attentions=output_attentions,
806
+ )
807
+
808
+ hidden_states = layer_outputs[0]
809
+
810
+ if output_attentions:
811
+ all_attentions = all_attentions + (layer_outputs[1],)
812
+
813
+ if self.ctc_flag and self.ctc_layer == idx + 1:
814
+ hidden_states, ctc_output, padding_mask = self.apply_ctc(hidden_states, attention_mask.sum(dim=1))
815
+ attention_mask = padding_mask.ne(1).long()
816
+
817
+ hidden_states = hidden_states.transpose(0, 1) # T x B x C -> B x T x C
818
+ if output_hidden_states:
819
+ encoder_states = encoder_states + (hidden_states,)
820
+
821
+ if not return_dict:
822
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
823
+ return BaseModelOutput(
824
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
825
+ )
826
+
827
+
828
+ @add_start_docstrings(
829
+ "The bare Conformer Model outputting raw hidden-states without any specific head on top.",
830
+ CONFORMER_START_DOCSTRING,
831
+ )
832
+ class ConformerEncoderDecoderModel(ConformerEncoderDecoderPreTrainedModel):
833
+ def __init__(self, config: Speech2TextConformerConfig):
834
+ super().__init__(config)
835
+
836
+ self.encoder = ConformerEncoder(config)
837
+ self.decoder = Speech2TextDecoder(config)
838
+
839
+ # Initialize weights and apply final processing
840
+ self.post_init()
841
+
842
+ def get_input_embeddings(self):
843
+ return self.decoder.embed_tokens
844
+
845
+ def set_input_embeddings(self, value):
846
+ self.decoder.embed_tokens = value
847
+
848
+ def get_encoder(self):
849
+ return self.encoder
850
+
851
+ def get_decoder(self):
852
+ return self.decoder
853
+
854
+ @add_start_docstrings_to_model_forward(SPEECH_TO_TEXT_INPUTS_DOCSTRING)
855
+ @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
856
+ def forward(
857
+ self,
858
+ input_features: Optional[torch.LongTensor] = None,
859
+ attention_mask: Optional[torch.Tensor] = None,
860
+ decoder_input_ids: Optional[torch.LongTensor] = None,
861
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
862
+ head_mask: Optional[torch.Tensor] = None,
863
+ decoder_head_mask: Optional[torch.Tensor] = None,
864
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
865
+ encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
866
+ past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
867
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
868
+ use_cache: Optional[bool] = None,
869
+ output_attentions: Optional[bool] = None,
870
+ output_hidden_states: Optional[bool] = None,
871
+ return_dict: Optional[bool] = None,
872
+ ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
873
+ r"""
874
+ Returns:
875
+
876
+ Example:
877
+
878
+ ```python
879
+ >>> import torch
880
+ >>> from transformers import AutoFeatureExtractor, AutoModel
881
+ >>> from datasets import load_dataset
882
+
883
+ >>> model = AutoModel.from_pretrained("FBK-MT/balbetto-asr-small-test")
884
+ >>> feature_extractor = AutoFeatureExtractor.from_pretrained("FBK-MT/balbetto-asr-small-test")
885
+ >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
886
+ >>> inputs = feature_extractor(
887
+ ... ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt"
888
+ ... )
889
+ >>> input_features = inputs.input_features
890
+ >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
891
+ >>> last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
892
+ >>> list(last_hidden_state.shape)
893
+ [1, 2, 256]
894
+ ```"""
895
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
896
+ output_hidden_states = (
897
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
898
+ )
899
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
900
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
901
+
902
+ if encoder_outputs is None:
903
+ encoder_outputs = self.encoder(
904
+ input_features,
905
+ attention_mask=attention_mask,
906
+ head_mask=head_mask,
907
+ output_attentions=output_attentions,
908
+ output_hidden_states=output_hidden_states,
909
+ return_dict=return_dict,
910
+ )
911
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
912
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
913
+ encoder_outputs = BaseModelOutput(
914
+ last_hidden_state=encoder_outputs[0],
915
+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
916
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
917
+ )
918
+
919
+ # downsample encoder attention mask
920
+ if attention_mask is not None:
921
+ encoder_attention_mask = self._get_feature_vector_attention_mask(
922
+ encoder_outputs[0].shape[1], attention_mask
923
+ )
924
+ else:
925
+ encoder_attention_mask = None
926
+
927
+ # decoder outputs consist of (dec_features, past_key_value, dec_hidden, dec_attn)
928
+ decoder_outputs = self.decoder(
929
+ input_ids=decoder_input_ids,
930
+ attention_mask=decoder_attention_mask,
931
+ encoder_hidden_states=encoder_outputs[0],
932
+ encoder_attention_mask=encoder_attention_mask,
933
+ head_mask=decoder_head_mask,
934
+ cross_attn_head_mask=cross_attn_head_mask,
935
+ past_key_values=past_key_values,
936
+ inputs_embeds=decoder_inputs_embeds,
937
+ use_cache=use_cache,
938
+ output_attentions=output_attentions,
939
+ output_hidden_states=output_hidden_states,
940
+ return_dict=return_dict,
941
+ )
942
+
943
+ if not return_dict:
944
+ return decoder_outputs + encoder_outputs
945
+
946
+ return Seq2SeqModelOutput(
947
+ last_hidden_state=decoder_outputs.last_hidden_state,
948
+ past_key_values=decoder_outputs.past_key_values,
949
+ decoder_hidden_states=decoder_outputs.hidden_states,
950
+ decoder_attentions=decoder_outputs.attentions,
951
+ cross_attentions=decoder_outputs.cross_attentions,
952
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
953
+ encoder_hidden_states=encoder_outputs.hidden_states,
954
+ encoder_attentions=encoder_outputs.attentions,
955
+ )
956
+
957
+
958
+ @add_start_docstrings(
959
+ "The Conformer Model with a language modeling head.",
960
+ CONFORMER_START_DOCSTRING,
961
+ )
962
+ class ConformerEncoderDecoderForConditionalGeneration(ConformerEncoderDecoderPreTrainedModel, GenerationMixin):
963
+ base_model_prefix = "model"
964
+ _tied_weights_keys = ["lm_head.weight"]
965
+
966
+ def __init__(self, config: Speech2TextConformerConfig):
967
+ super().__init__(config)
968
+ self.model = ConformerEncoderDecoderModel(config)
969
+ self.lm_head = nn.Linear(config.d_model, self.config.vocab_size, bias=False)
970
+
971
+ # Initialize weights and apply final processing
972
+ self.post_init()
973
+
974
+ def get_encoder(self):
975
+ return self.model.get_encoder()
976
+
977
+ def get_decoder(self):
978
+ return self.model.get_decoder()
979
+
980
+ def get_output_embeddings(self):
981
+ return self.lm_head
982
+
983
+ def set_output_embeddings(self, new_embeddings):
984
+ self.lm_head = new_embeddings
985
+
986
+ @add_start_docstrings_to_model_forward(SPEECH_TO_TEXT_INPUTS_DOCSTRING)
987
+ @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
988
+ def forward(
989
+ self,
990
+ input_features: Optional[torch.LongTensor] = None,
991
+ attention_mask: Optional[torch.Tensor] = None,
992
+ decoder_input_ids: Optional[torch.LongTensor] = None,
993
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
994
+ head_mask: Optional[torch.Tensor] = None,
995
+ decoder_head_mask: Optional[torch.Tensor] = None,
996
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
997
+ encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
998
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
999
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
1000
+ labels: Optional[torch.LongTensor] = None,
1001
+ use_cache: Optional[bool] = None,
1002
+ output_attentions: Optional[bool] = None,
1003
+ output_hidden_states: Optional[bool] = None,
1004
+ return_dict: Optional[bool] = None,
1005
+ ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
1006
+ r"""
1007
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1008
+ Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
1009
+ or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
1010
+ only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1011
+
1012
+ Returns:
1013
+
1014
+ Example:
1015
+
1016
+ ```python
1017
+ >>> import torch
1018
+ >>> import transformers
1019
+ >>> from datasets import load_dataset
1020
+
1021
+ >>> pipe = transformers.pipeline(
1022
+ ... "automatic-speech-recognition",
1023
+ ... model='FBK-MT/balbetto-asr-small-test',
1024
+ ... feature_extractor='FBK-MT/balbetto-asr-small-test',
1025
+ ... trust_remote_code=True)
1026
+
1027
+
1028
+ >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
1029
+
1030
+ >>> result = pipe(ds[0]["audio"])
1031
+
1032
+ >>> transcription = result["text"]
1033
+ >>> transcription
1034
+ 'mister quilter is the apostle of the middle classes and we are glad to welcome his gospel'
1035
+ ```"""
1036
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1037
+
1038
+ if labels is not None:
1039
+ if decoder_input_ids is None and decoder_inputs_embeds is None:
1040
+ decoder_input_ids = shift_tokens_right(
1041
+ labels, self.config.pad_token_id, self.config.decoder_start_token_id
1042
+ )
1043
+
1044
+ outputs = self.model(
1045
+ input_features,
1046
+ attention_mask=attention_mask,
1047
+ decoder_input_ids=decoder_input_ids,
1048
+ encoder_outputs=encoder_outputs,
1049
+ decoder_attention_mask=decoder_attention_mask,
1050
+ head_mask=head_mask,
1051
+ decoder_head_mask=decoder_head_mask,
1052
+ cross_attn_head_mask=cross_attn_head_mask,
1053
+ past_key_values=past_key_values,
1054
+ decoder_inputs_embeds=decoder_inputs_embeds,
1055
+ use_cache=use_cache,
1056
+ output_attentions=output_attentions,
1057
+ output_hidden_states=output_hidden_states,
1058
+ return_dict=return_dict,
1059
+ )
1060
+ lm_logits = self.lm_head(outputs[0])
1061
+
1062
+ loss = None
1063
+ if labels is not None:
1064
+ loss_fct = CrossEntropyLoss()
1065
+ loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
1066
+
1067
+ if not return_dict:
1068
+ output = (lm_logits,) + outputs[1:]
1069
+ return ((loss,) + output) if loss is not None else output
1070
+
1071
+ return Seq2SeqLMOutput(
1072
+ loss=loss,
1073
+ logits=lm_logits,
1074
+ past_key_values=outputs.past_key_values,
1075
+ decoder_hidden_states=outputs.decoder_hidden_states,
1076
+ decoder_attentions=outputs.decoder_attentions,
1077
+ cross_attentions=outputs.cross_attentions,
1078
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
1079
+ encoder_hidden_states=outputs.encoder_hidden_states,
1080
+ encoder_attentions=outputs.encoder_attentions,
1081
+ )
1082
+
1083
+ @staticmethod
1084
+ def _reorder_cache(past_key_values, beam_idx):
1085
+ reordered_past = ()
1086
+ for layer_past in past_key_values:
1087
+ reordered_past += (
1088
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1089
+ )
1090
+ return reordered_past
1091
+
1092
+
1093
+ Speech2TextConformerConfig.register_for_auto_class()
1094
+ ConformerEncoderDecoderForConditionalGeneration.register_for_auto_class("AutoModel")
1095
+ ConformerEncoderDecoderForConditionalGeneration.register_for_auto_class("AutoModelForSpeechSeq2Seq")
1096
+
1097
+ transformers.AutoConfig.register("conformer_encoder_decoder", Speech2TextConformerConfig)
1098
+ transformers.AutoModel.register(
1099
+ Speech2TextConformerConfig, ConformerEncoderDecoderForConditionalGeneration)
1100
+ transformers.AutoModelForSpeechSeq2Seq.register(
1101
+ Speech2TextConformerConfig, ConformerEncoderDecoderForConditionalGeneration)
1102
+ transformers.AutoProcessor.register(Speech2TextConformerConfig, Speech2TextProcessor)
1103
+ transformers.models.auto.modeling_auto.MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES['conformer_encoder_decoder'] = \
1104
+ "ConformerEncoderDecoderForConditionalGeneration"
1105
+ transformers.TOKENIZER_MAPPING.register(Speech2TextConformerConfig, (Speech2TextTokenizer, None))
1106
+ transformers.FEATURE_EXTRACTOR_MAPPING.register(Speech2TextConformerConfig, Speech2TextFeatureExtractor)
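
The registration block above is what makes this custom architecture reachable through the standard Auto classes once the repository is loaded with `trust_remote_code=True`. Below is a minimal sketch of that path; the repository id is taken from the docstring examples above, and the call assumes, as the pipeline example does, that `input_features` is the model's main input.

```python
# Minimal sketch: load the custom architecture through the Auto classes registered above.
import torch
from datasets import load_dataset
from transformers import AutoFeatureExtractor, AutoModelForSpeechSeq2Seq, AutoTokenizer

model_id = "FBK-MT/balbetto-asr-small-test"  # assumed id, taken from the docstrings above
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, trust_remote_code=True)

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
inputs = feature_extractor(
    ds[0]["audio"]["array"],
    sampling_rate=ds[0]["audio"]["sampling_rate"],
    return_tensors="pt",
)

with torch.no_grad():
    generated_ids = model.generate(inputs.input_features, attention_mask=inputs.attention_mask)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
```

Decoding in this sketch follows the defaults shipped in generation_config.json below (beam search with 5 beams).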
generation_config.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "early_stopping": true,
6
+ "eos_token_id": 2,
7
+ "max_length": 200,
8
+ "num_beams": 5,
9
+ "pad_token_id": 1,
10
+ "transformers_version": "4.48.1"
11
+ }
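
These defaults are read into `model.generation_config` at load time, so `generate()` uses beam search with 5 beams, a 200-token budget, and early stopping unless the caller overrides them. A small sketch, again assuming the repository id used in the docstring examples:

```python
# Sketch: inspect and override the decoding defaults shipped with the checkpoint.
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained("FBK-MT/balbetto-asr-small-test")  # assumed id
print(gen_config.num_beams, gen_config.max_length, gen_config.early_stopping)  # 5 200 True

# Per-call overrides take precedence, e.g. greedy decoding with a shorter budget:
# model.generate(input_features, num_beams=1, max_new_tokens=64)
```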
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d14abb88a1a139b63757428950ef9c2d3079923bf2cb05bec203eec1fc6b8cad
3
+ size 2263765176
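
The checkpoint is a single safetensors file of roughly 2.3 GB, consistent with the float32 `torch_dtype` declared in config.json. If memory is a concern, the weights can be cast at load time; a sketch, assuming half-precision inference is acceptable for this model:

```python
# Sketch: load the float32 checkpoint in half precision to roughly halve memory use.
import torch
from transformers import AutoModelForSpeechSeq2Seq

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "FBK-MT/balbetto-asr-small-test",  # assumed id, taken from the docstring examples
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
```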
preprocessor_config.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "do_ceptral_normalize": true,
3
+ "feature_extractor_type": "Speech2TextFeatureExtractor",
4
+ "feature_size": 80,
5
+ "normalize_means": true,
6
+ "normalize_vars": true,
7
+ "num_mel_bins": 80,
8
+ "padding_side": "right",
9
+ "padding_value": 0.0,
10
+ "processor_class": "Speech2TextProcessor",
11
+ "return_attention_mask": true,
12
+ "sampling_rate": 16000
13
+ }
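
This configuration corresponds to a standard `Speech2TextFeatureExtractor`: 80-bin log-mel filterbank features at 16 kHz with per-utterance mean and variance normalization (the `do_ceptral_normalize` spelling is the library's own attribute name) and an attention mask returned alongside the features. A minimal sketch with a synthetic waveform:

```python
# Sketch: the feature extractor turns a 16 kHz waveform into (num_frames, 80) log-mel
# features with utterance-level mean/variance normalization. The waveform is synthetic.
import numpy as np
from transformers import Speech2TextFeatureExtractor

feature_extractor = Speech2TextFeatureExtractor(feature_size=80, num_mel_bins=80, sampling_rate=16000)
waveform = (0.01 * np.random.randn(16000)).astype(np.float32)  # 1 s of noise at 16 kHz
features = feature_extractor(waveform, sampling_rate=16000, return_tensors="np")
print({k: v.shape for k, v in features.items()})  # e.g. input_features: (1, num_frames, 80)
```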
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bdd22451ef0da74acc077f40a074170163aa2601c1a57bad29e93dbbe0fc903
3
+ size 524584
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
6
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "additional_special_tokens": [],
37
+ "bos_token": "<s>",
38
+ "clean_up_tokenization_spaces": false,
39
+ "do_lower_case": false,
40
+ "do_upper_case": false,
41
+ "eos_token": "</s>",
42
+ "extra_special_tokens": {},
43
+ "lang_codes": null,
44
+ "langs": [
45
+ "it",
46
+ "en"
47
+ ],
48
+ "model_max_length": 1000000000000000019884624838656,
49
+ "pad_token": "<pad>",
50
+ "processor_class": "Speech2TextProcessor",
51
+ "sp_model_kwargs": {},
52
+ "tgt_lang": null,
53
+ "tokenizer_class": "Speech2TextTokenizer",
54
+ "unk_token": "<unk>"
55
+ }
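
The tokenizer is a `Speech2TextTokenizer` backed by the SentencePiece BPE model uploaded above, with `it` and `en` as supported languages and special-token ids matching config.json (`<s>`=0, `<pad>`=1, `</s>`=2, `<unk>`=3). A short round-trip sketch, again assuming the repository id used in the docstrings:

```python
# Sketch: load the tokenizer and check the special-token ids against config.json.
from transformers import Speech2TextTokenizer

tokenizer = Speech2TextTokenizer.from_pretrained("FBK-MT/balbetto-asr-small-test")  # assumed id
print(tokenizer.bos_token_id, tokenizer.pad_token_id, tokenizer.eos_token_id, tokenizer.unk_token_id)
# expected: 0 1 2 3

ids = tokenizer("una frase di prova").input_ids
print(tokenizer.decode(ids, skip_special_tokens=True))
```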
vocab.json ADDED
The diff for this file is too large to render. See raw diff