huggingface · SaulLu · Apr 19, 2022 · Apr 19, 2022 · Apr 19, 2022
diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
@@ -139,8 +139,8 @@
  Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained
  by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.*
  via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
- [`Speech2TextTokenizer`] should be used for extracting the fbank features, padding and conversion into a
- tensor of type `torch.FloatTensor`. See [`~Speech2TextTokenizer.__call__`]
+ [`Speech2TextFeatureExtractor`] should be used for extracting the fbank features, padding and conversion
+ into a tensor of type `torch.FloatTensor`. See [`~Speech2TextFeatureExtractor.__call__`].
  return_dict (`bool`, *optional*):
  If set to `True`, the model will return a [`~utils.Seq2SeqLMOutput`] instead of a plain tuple.
  kwargs: (*optional*) Remaining dictionary of keyword arguments. Keyword arguments come in two flavors:

diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py
@@ -600,8 +600,8 @@ def _get_feature_vector_attention_mask(self, feature_vector_length, attention_ma
  Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained
  by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.*
  via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
- [`Speech2TextTokenizer`] should be used for extracting the fbank features, padding and conversion into a
- tensor of type `torch.FloatTensor`. See [`~Speech2TextTokenizer.__call__`]
+ [`Speech2TextFeatureExtractor`] should be used for extracting the fbank features, padding and conversion
+ into a tensor of type `torch.FloatTensor`. See [`~Speech2TextFeatureExtractor.__call__`].
  attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
  Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
  1]`:
@@ -733,9 +733,9 @@ def forward(
  Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
  obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
  `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
- `input_features`, the [`Speech2TextTokenizer`] should be used for extracting the fbank features,
+ `input_features`, the [`Speech2TextFeatureExtractor`] should be used for extracting the fbank features,
  padding and conversion into a tensor of type `torch.FloatTensor`. See
- [`~Speech2TextTokenizer.__call__`]
+ [`~Speech2TextFeatureExtractor.__call__`].
  attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
  Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
  `[0, 1]`:

diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
@@ -650,8 +650,8 @@ def serving(self, inputs):
  Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained
  by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.*
  via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
- [`Speech2TextTokenizer`] should be used for extracting the fbank features, padding and conversion into a
- tensor of floats. See [`~Speech2TextTokenizer.__call__`]
+ [`Speech2TextFeatureExtractor`] should be used for extracting the fbank features, padding and conversion
+ into a tensor of floats. See [`~Speech2TextFeatureExtractor.__call__`].
  attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
  Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
 
@@ -798,8 +798,8 @@ def call(
  Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
  obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
  `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
- `input_features`, the [`Speech2TextTokenizer`] should be used for extracting the fbank features,
- padding and conversion into a tensor of floats. See [`~Speech2TextTokenizer.__call__`]
+ `input_features`, the [`Speech2TextFeatureExtractor`] should be used for extracting the fbank features,
+ padding and conversion into a tensor of floats. See [`~Speech2TextFeatureExtractor.__call__`].
  attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
  Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: