From 8c104642d4be90558337c9697415b9201979e1f4 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 2 Oct 2022 16:25:06 +0200 Subject: [PATCH 01/37] First draft --- README.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.mdx | 1 + .../audio-spectogram-transformer.mdx | 54 ++ docs/source/en/serialization.mdx | 1 + src/transformers/__init__.py | 47 +- src/transformers/models/__init__.py | 1 + .../audio_spectogram_transformer/__init__.py | 80 ++ ...figuration_audio_spectogram_transformer.py | 144 +++ ..._spectogram_transformer_timm_to_pytorch.py | 250 ++++++ ...extraction_audio_spectogram_transformer.py | 149 +++ .../modeling_audio_spectogram_transformer.py | 845 ++++++++++++++++++ .../models/auto/configuration_auto.py | 3 + .../models/auto/feature_extraction_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 3 + src/transformers/utils/dummy_pt_objects.py | 31 + .../utils/dummy_vision_objects.py | 2 +- .../audio_spectogram_transformer/__init__.py | 0 ...extraction_audio_spectogram_transformer.py | 191 ++++ ...t_modeling_audio_spectogram_transformer.py | 302 +++++++ 22 files changed, 2095 insertions(+), 14 deletions(-) create mode 100644 docs/source/en/model_doc/audio-spectogram-transformer.mdx create mode 100644 src/transformers/models/audio_spectogram_transformer/__init__.py create mode 100644 src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py create mode 100644 src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_timm_to_pytorch.py create mode 100644 src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py create mode 100644 src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py create mode 100644 tests/models/audio_spectogram_transformer/__init__.py create mode 100644 tests/models/audio_spectogram_transformer/test_feature_extraction_audio_spectogram_transformer.py create mode 100644 tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py diff --git a/README.md b/README.md index 12b1b3314edfe6..ae4d24726d1806 100644 --- a/README.md +++ b/README.md @@ -262,6 +262,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h ๐Ÿค— Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each them): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. +1. **[Audio Spectogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectogram-transformer)** (from ) released with the paper []() by . 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. 
**[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from ร‰cole polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. diff --git a/README_ko.md b/README_ko.md index 5e660e1a2e7615..2e2e071773d1f3 100644 --- a/README_ko.md +++ b/README_ko.md @@ -212,6 +212,7 @@ Flax, PyTorch, TensorFlow ์„ค์น˜ ํŽ˜์ด์ง€์—์„œ ์ด๋“ค์„ conda๋กœ ์„ค์น˜ํ•˜๋Š” ๐Ÿค— Transformers๋Š” ๋‹ค์Œ ๋ชจ๋ธ๋“ค์„ ์ œ๊ณตํ•ฉ๋‹ˆ๋‹ค (๊ฐ ๋ชจ๋ธ์˜ ์š”์•ฝ์€ [์—ฌ๊ธฐ](https://huggingface.co/docs/transformers/model_summary)์„œ ํ™•์ธํ•˜์„ธ์š”): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. +1. **[Audio Spectogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectogram-transformer)** (from ) released with the paper []() by . 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from ร‰cole polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. diff --git a/README_zh-hans.md b/README_zh-hans.md index 78f19c374e32be..150e881f66d5e1 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -236,6 +236,7 @@ conda install -c huggingface transformers ๐Ÿค— Transformers ็›ฎๅ‰ๆ”ฏๆŒๅฆ‚ไธ‹็š„ๆžถๆž„๏ผˆๆจกๅž‹ๆฆ‚่ฟฐ่ฏท้˜…[่ฟ™้‡Œ](https://huggingface.co/docs/transformers/model_summary)๏ผ‰๏ผš 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (ๆฅ่‡ช Google Research and the Toyota Technological Institute at Chicago) ไผด้š่ฎบๆ–‡ [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), ็”ฑ Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut ๅ‘ๅธƒใ€‚ +1. **[Audio Spectogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectogram-transformer)** (from ) released with the paper []() by . 1. 
**[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (ๆฅ่‡ช Facebook) ไผด้š่ฎบๆ–‡ [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) ็”ฑ Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer ๅ‘ๅธƒใ€‚ 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (ๆฅ่‡ช ร‰cole polytechnique) ไผด้š่ฎบๆ–‡ [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) ็”ฑ Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis ๅ‘ๅธƒใ€‚ 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (ๆฅ่‡ช VinAI Research) ไผด้š่ฎบๆ–‡ [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) ็”ฑ Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen ๅ‘ๅธƒใ€‚ diff --git a/README_zh-hant.md b/README_zh-hant.md index 9bc7c5a86c7f29..0fba6914e9ec51 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -248,6 +248,7 @@ conda install -c huggingface transformers ๐Ÿค— Transformers ็›ฎๅ‰ๆ”ฏๆดไปฅไธ‹็š„ๆžถๆง‹๏ผˆๆจกๅž‹ๆฆ‚่ฆฝ่ซ‹ๅƒ้–ฑ[้€™่ฃก](https://huggingface.co/docs/transformers/model_summary)๏ผ‰๏ผš 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. +1. **[Audio Spectogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectogram-transformer)** (from ) released with the paper []() by . 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from ร‰cole polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index d357b8eb627689..530011ee2cc97d 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -50,6 +50,7 @@ The documentation is organized into five sections: 1. **[ALBERT](model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. +1. **[Audio Spectogram Transformer](model_doc/audio-spectogram-transformer)** (from ) released with the paper []() by . 1. 
**[BART](model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](model_doc/barthez)** (from ร‰cole polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. **[BARTpho](model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. diff --git a/docs/source/en/model_doc/audio-spectogram-transformer.mdx b/docs/source/en/model_doc/audio-spectogram-transformer.mdx new file mode 100644 index 00000000000000..0556e4b0215154 --- /dev/null +++ b/docs/source/en/model_doc/audio-spectogram-transformer.mdx @@ -0,0 +1,54 @@ + + +# Audio Spectogram Transformer + +## Overview + +The Audio Spectogram Transformer model was proposed in []() by . + + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +The original code can be found [here](). + + +## AudioSpectogramTransformerConfig + +[[autodoc]] AudioSpectogramTransformerConfig + +## AudioSpectogramTransformerFeatureExtractor + +[[autodoc]] AudioSpectogramTransformerFeatureExtractor + - __call__ + +## AudioSpectogramTransformerModel + +[[autodoc]] AudioSpectogramTransformerModel + - forward + +## AudioSpectogramTransformerForMaskedImageModeling + +[[autodoc]] AudioSpectogramTransformerForMaskedImageModeling + - forward + +## AudioSpectogramTransformerForImageClassification + +[[autodoc]] AudioSpectogramTransformerForImageClassification + - forward diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index b03f2291da5f8c..ca654bcb1c1aea 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -46,6 +46,7 @@ Ready-made configurations include the following architectures: - ALBERT +- Audio Spectogram Transformer - BART - BEiT - BERT diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 8afaa1e2338661..3d3d964a06fd48 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -403,6 +403,7 @@ "models.vision_text_dual_encoder": ["VisionTextDualEncoderConfig", "VisionTextDualEncoderProcessor"], "models.visual_bert": ["VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "VisualBertConfig"], "models.vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig"], + "models.audio_spectogram_transformer": ["AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "AudioSpectogramTransformerConfig"], "models.vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"], "models.vit_msn": ["VIT_MSN_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMSNConfig"], "models.wav2vec2": [ @@ -735,13 +736,15 @@ _import_structure["models.mobilenet_v2"].extend(["MobileNetV2FeatureExtractor", "MobileNetV2ImageProcessor"]) _import_structure["models.mobilevit"].extend(["MobileViTFeatureExtractor", "MobileViTImageProcessor"]) _import_structure["models.owlvit"].append("OwlViTFeatureExtractor") - _import_structure["models.perceiver"].extend(["PerceiverFeatureExtractor", "PerceiverImageProcessor"]) - 
_import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"]) - _import_structure["models.segformer"].extend(["SegformerFeatureExtractor", "SegformerImageProcessor"]) - _import_structure["models.videomae"].extend(["VideoMAEFeatureExtractor", "VideoMAEImageProcessor"]) - _import_structure["models.vilt"].extend(["ViltFeatureExtractor", "ViltImageProcessor", "ViltProcessor"]) - _import_structure["models.vit"].extend(["ViTFeatureExtractor", "ViTImageProcessor"]) - _import_structure["models.yolos"].extend(["YolosFeatureExtractor"]) + _import_structure["models.perceiver"].append("PerceiverFeatureExtractor") + _import_structure["models.poolformer"].append("PoolFormerFeatureExtractor") + _import_structure["models.segformer"].append("SegformerFeatureExtractor") + _import_structure["models.videomae"].append("VideoMAEFeatureExtractor") + _import_structure["models.vilt"].append("ViltFeatureExtractor") + _import_structure["models.vilt"].append("ViltProcessor") + _import_structure["models.vit"].append("ViTFeatureExtractor") + _import_structure["models.audio_spectogram_transformer"].append("AudioSpectogramTransformerFeatureExtractor") + _import_structure["models.yolos"].append("YolosFeatureExtractor") # Timm-backed objects try: @@ -2159,6 +2162,15 @@ "ViTPreTrainedModel", ] ) + _import_structure["models.audio_spectogram_transformer"].extend( + [ + "AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "AudioSpectogramTransformerForImageClassification", + "AudioSpectogramTransformerForMaskedImageModeling", + "AudioSpectogramTransformerModel", + "AudioSpectogramTransformerPreTrainedModel", + ] + ) _import_structure["models.vit_mae"].extend( [ "VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3553,6 +3565,7 @@ from .models.vision_text_dual_encoder import VisionTextDualEncoderConfig, VisionTextDualEncoderProcessor from .models.visual_bert import VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, VisualBertConfig from .models.vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig + from .models.audio_spectogram_transformer import AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, AudioSpectogramTransformerConfig from .models.vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig from .models.vit_msn import VIT_MSN_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMSNConfig from .models.wav2vec2 import ( @@ -3846,12 +3859,13 @@ from .models.mobilenet_v2 import MobileNetV2FeatureExtractor, MobileNetV2ImageProcessor from .models.mobilevit import MobileViTFeatureExtractor, MobileViTImageProcessor from .models.owlvit import OwlViTFeatureExtractor - from .models.perceiver import PerceiverFeatureExtractor, PerceiverImageProcessor - from .models.poolformer import PoolFormerFeatureExtractor, PoolFormerImageProcessor - from .models.segformer import SegformerFeatureExtractor, SegformerImageProcessor - from .models.videomae import VideoMAEFeatureExtractor, VideoMAEImageProcessor - from .models.vilt import ViltFeatureExtractor, ViltImageProcessor, ViltProcessor - from .models.vit import ViTFeatureExtractor, ViTImageProcessor + from .models.perceiver import PerceiverFeatureExtractor + from .models.poolformer import PoolFormerFeatureExtractor + from .models.segformer import SegformerFeatureExtractor + from .models.videomae import VideoMAEFeatureExtractor + from .models.vilt import ViltFeatureExtractor, ViltProcessor + from .models.vit import ViTFeatureExtractor + from .models.audio_spectogram_transformer import AudioSpectogramTransformerFeatureExtractor from .models.yolos import 
YolosFeatureExtractor # Modeling @@ -5005,6 +5019,13 @@ ViTModel, ViTPreTrainedModel, ) + from .models.audio_spectogram_transformer import ( + AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + AudioSpectogramTransformerForImageClassification, + AudioSpectogramTransformerForMaskedImageModeling, + AudioSpectogramTransformerModel, + AudioSpectogramTransformerPreTrainedModel, + ) from .models.vit_mae import ( VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST, ViTMAEForPreTraining, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index ded3a4745b27db..73658cf92de810 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -162,6 +162,7 @@ vision_text_dual_encoder, visual_bert, vit, + audio_spectogram_transformer, vit_mae, vit_msn, wav2vec2, diff --git a/src/transformers/models/audio_spectogram_transformer/__init__.py b/src/transformers/models/audio_spectogram_transformer/__init__.py new file mode 100644 index 00000000000000..8c6d5e303fe786 --- /dev/null +++ b/src/transformers/models/audio_spectogram_transformer/__init__.py @@ -0,0 +1,80 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, + is_vision_available, +) + + +_import_structure = {"configuration_audio_spectogram_transformer": ["AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "AudioSpectogramTransformerConfig", "AudioSpectogramTransformerOnnxConfig"]} + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["feature_extraction_audio_spectogram_transformer"] = ["AudioSpectogramTransformerFeatureExtractor"] + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_audio_spectogram_transformer"] = [ + "AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "AudioSpectogramTransformerForImageClassification", + "AudioSpectogramTransformerForMaskedImageModeling", + "AudioSpectogramTransformerModel", + "AudioSpectogramTransformerPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_audio_spectogram_transformer import AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, AudioSpectogramTransformerConfig, AudioSpectogramTransformerOnnxConfig + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .feature_extraction_audio_spectogram_transformer import AudioSpectogramTransformerFeatureExtractor + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_audio_spectogram_transformer import ( + AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + AudioSpectogramTransformerForImageClassification, + AudioSpectogramTransformerForMaskedImageModeling, + AudioSpectogramTransformerModel, + AudioSpectogramTransformerPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py new file mode 100644 index 00000000000000..6bbc59f6724c2d --- /dev/null +++ b/src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py @@ -0,0 +1,144 @@ +# coding=utf-8 +# Copyright 2022 Google AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" AudioSpectogramTransformer model configuration""" + +from collections import OrderedDict +from typing import Mapping + +from packaging import version + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "MIT/ast-10-10": "https://huggingface.co/MIT/ast-10-10/resolve/main/config.json", +} + + + +class AudioSpectogramTransformerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`AudioSpectogramTransformerModel`]. It is used to instantiate an AudioSpectogramTransformer + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the AudioSpectogramTransformer + [google/audio_spectogram_transformer-base-patch16-224](https://huggingface.co/google/audio_spectogram_transformer-base-patch16-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to `224`): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to `16`): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to `3`): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + encoder_stride (`int`, `optional`, defaults to 16): + Factor to increase the spatial resolution by in the decoder head for masked image modeling. 
+ + Example: + + ```python + >>> from transformers import AudioSpectogramTransformerModel, AudioSpectogramTransformerConfig + + >>> # Initializing a AudioSpectogramTransformer audio_spectogram_transformer-base-patch16-224 style configuration + >>> configuration = AudioSpectogramTransformerConfig() + + >>> # Initializing a model from the audio_spectogram_transformer-base-patch16-224 style configuration + >>> model = AudioSpectogramTransformerModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "audio-spectogram-transformer" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + is_encoder_decoder=False, + image_size=224, + patch_size=16, + num_channels=3, + qkv_bias=True, + encoder_stride=16, + **kwargs + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.encoder_stride = encoder_stride + + +class AudioSpectogramTransformerOnnxConfig(OnnxConfig): + + torch_onnx_minimum_version = version.parse("1.11") + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-4 diff --git a/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_timm_to_pytorch.py b/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_timm_to_pytorch.py new file mode 100644 index 00000000000000..68612060ba9603 --- /dev/null +++ b/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_timm_to_pytorch.py @@ -0,0 +1,250 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Convert AudioSpectogramTransformer and non-distilled DeiT checkpoints from the timm library.""" + + +import argparse +import json +from pathlib import Path + +import torch +from PIL import Image + +import requests +import timm +from huggingface_hub import hf_hub_download +from transformers import DeiTFeatureExtractor, AudioSpectogramTransformerConfig, AudioSpectogramTransformerFeatureExtractor, AudioSpectogramTransformerForImageClassification, AudioSpectogramTransformerModel +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +# here we list all keys to be renamed (original name on the left, our name on the right) +def create_rename_keys(config, base_model=False): + rename_keys = [] + for i in range(config.num_hidden_layers): + # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms + rename_keys.append((f"blocks.{i}.norm1.weight", f"audio_spectogram_transformer.encoder.layer.{i}.layernorm_before.weight")) + rename_keys.append((f"blocks.{i}.norm1.bias", f"audio_spectogram_transformer.encoder.layer.{i}.layernorm_before.bias")) + rename_keys.append((f"blocks.{i}.attn.proj.weight", f"audio_spectogram_transformer.encoder.layer.{i}.attention.output.dense.weight")) + rename_keys.append((f"blocks.{i}.attn.proj.bias", f"audio_spectogram_transformer.encoder.layer.{i}.attention.output.dense.bias")) + rename_keys.append((f"blocks.{i}.norm2.weight", f"audio_spectogram_transformer.encoder.layer.{i}.layernorm_after.weight")) + rename_keys.append((f"blocks.{i}.norm2.bias", f"audio_spectogram_transformer.encoder.layer.{i}.layernorm_after.bias")) + rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"audio_spectogram_transformer.encoder.layer.{i}.intermediate.dense.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"audio_spectogram_transformer.encoder.layer.{i}.intermediate.dense.bias")) + rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"audio_spectogram_transformer.encoder.layer.{i}.output.dense.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"audio_spectogram_transformer.encoder.layer.{i}.output.dense.bias")) + + # projection layer + position embeddings + rename_keys.extend( + [ + ("cls_token", "audio_spectogram_transformer.embeddings.cls_token"), + ("patch_embed.proj.weight", "audio_spectogram_transformer.embeddings.patch_embeddings.projection.weight"), + ("patch_embed.proj.bias", "audio_spectogram_transformer.embeddings.patch_embeddings.projection.bias"), + ("pos_embed", "audio_spectogram_transformer.embeddings.position_embeddings"), + ] + ) + + if base_model: + # layernorm + pooler + rename_keys.extend( + [ + ("norm.weight", "layernorm.weight"), + ("norm.bias", "layernorm.bias"), + ("pre_logits.fc.weight", "pooler.dense.weight"), + ("pre_logits.fc.bias", "pooler.dense.bias"), + ] + ) + + # if just the base model, we should remove "audio_spectogram_transformer" from all keys that start with "audio_spectogram_transformer" + rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("audio_spectogram_transformer") else pair for pair in rename_keys] + else: + # layernorm + classification head + rename_keys.extend( + [ + ("norm.weight", "audio_spectogram_transformer.layernorm.weight"), + ("norm.bias", "audio_spectogram_transformer.layernorm.bias"), + ("head.weight", "classifier.weight"), + ("head.bias", "classifier.bias"), + ] + ) + + return rename_keys + + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config, 
base_model=False): + for i in range(config.num_hidden_layers): + if base_model: + prefix = "" + else: + prefix = "audio_spectogram_transformer." + # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ + : config.hidden_size, : + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ + config.hidden_size : config.hidden_size * 2, : + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ + config.hidden_size : config.hidden_size * 2 + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ + -config.hidden_size :, : + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] + + +def remove_classification_head_(state_dict): + ignore_keys = ["head.weight", "head.bias"] + for k in ignore_keys: + state_dict.pop(k, None) + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@torch.no_grad() +def convert_audio_spectogram_transformer_checkpoint(audio_spectogram_transformer_name, pytorch_dump_folder_path): + """ + Copy/paste/tweak model's weights to our AudioSpectogramTransformer structure. 
+ """ + + # define default AudioSpectogramTransformer configuration + config = AudioSpectogramTransformerConfig() + base_model = False + # dataset (ImageNet-21k only or also fine-tuned on ImageNet 2012), patch_size and image_size + if audio_spectogram_transformer_name[-5:] == "in21k": + base_model = True + config.patch_size = int(audio_spectogram_transformer_name[-12:-10]) + config.image_size = int(audio_spectogram_transformer_name[-9:-6]) + else: + config.num_labels = 1000 + repo_id = "huggingface/label-files" + filename = "imagenet-1k-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + config.patch_size = int(audio_spectogram_transformer_name[-6:-4]) + config.image_size = int(audio_spectogram_transformer_name[-3:]) + # size of the architecture + if "deit" in audio_spectogram_transformer_name: + if audio_spectogram_transformer_name[9:].startswith("tiny"): + config.hidden_size = 192 + config.intermediate_size = 768 + config.num_hidden_layers = 12 + config.num_attention_heads = 3 + elif audio_spectogram_transformer_name[9:].startswith("small"): + config.hidden_size = 384 + config.intermediate_size = 1536 + config.num_hidden_layers = 12 + config.num_attention_heads = 6 + else: + pass + else: + if audio_spectogram_transformer_name[4:].startswith("small"): + config.hidden_size = 768 + config.intermediate_size = 2304 + config.num_hidden_layers = 8 + config.num_attention_heads = 8 + elif audio_spectogram_transformer_name[4:].startswith("base"): + pass + elif audio_spectogram_transformer_name[4:].startswith("large"): + config.hidden_size = 1024 + config.intermediate_size = 4096 + config.num_hidden_layers = 24 + config.num_attention_heads = 16 + elif audio_spectogram_transformer_name[4:].startswith("huge"): + config.hidden_size = 1280 + config.intermediate_size = 5120 + config.num_hidden_layers = 32 + config.num_attention_heads = 16 + + # load original model from timm + timm_model = timm.create_model(audio_spectogram_transformer_name, pretrained=True) + timm_model.eval() + + # load state_dict of original model, remove and rename some keys + state_dict = timm_model.state_dict() + if base_model: + remove_classification_head_(state_dict) + rename_keys = create_rename_keys(config, base_model) + for src, dest in rename_keys: + rename_key(state_dict, src, dest) + read_in_q_k_v(state_dict, config, base_model) + + # load HuggingFace model + if audio_spectogram_transformer_name[-5:] == "in21k": + model = AudioSpectogramTransformerModel(config).eval() + else: + model = AudioSpectogramTransformerForImageClassification(config).eval() + model.load_state_dict(state_dict) + + # Check outputs on an image, prepared by AudioSpectogramTransformerFeatureExtractor/DeiTFeatureExtractor + if "deit" in audio_spectogram_transformer_name: + feature_extractor = DeiTFeatureExtractor(size=config.image_size) + else: + feature_extractor = AudioSpectogramTransformerFeatureExtractor(size=config.image_size) + encoding = feature_extractor(images=prepare_img(), return_tensors="pt") + pixel_values = encoding["pixel_values"] + outputs = model(pixel_values) + + if base_model: + timm_pooled_output = timm_model.forward_features(pixel_values) + assert timm_pooled_output.shape == outputs.pooler_output.shape + assert torch.allclose(timm_pooled_output, outputs.pooler_output, atol=1e-3) + else: + timm_logits = timm_model(pixel_values) + assert 
timm_logits.shape == outputs.logits.shape + assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) + + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model {audio_spectogram_transformer_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving feature extractor to {pytorch_dump_folder_path}") + feature_extractor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--audio_spectogram_transformer_name", + default="audio_spectogram_transformer_base_patch16_224", + type=str, + help="Name of the AudioSpectogramTransformer timm model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + + args = parser.parse_args() + convert_audio_spectogram_transformer_checkpoint(args.audio_spectogram_transformer_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py new file mode 100644 index 00000000000000..47220288757adb --- /dev/null +++ b/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py @@ -0,0 +1,149 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for AudioSpectogramTransformer.""" + +from typing import Optional, Union + +import numpy as np +from PIL import Image + +from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ImageFeatureExtractionMixin, + ImageInput, + is_torch_tensor, +) +from ...utils import TensorType, logging + + +logger = logging.get_logger(__name__) + + +class AudioSpectogramTransformerFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): + r""" + Constructs a AudioSpectogramTransformer feature extractor. + + This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users + should refer to this superclass for more information regarding those methods. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the input to a certain `size`. + size (`int` or `Tuple(int)`, *optional*, defaults to 224): + Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an + integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize` is + set to `True`. + resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`): + An optional resampling filter. 
This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`, + `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`. Only has an effect + if `do_resize` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the input with mean and standard deviation. + image_mean (`List[int]`, defaults to `[0.5, 0.5, 0.5]`): + The sequence of means for each channel, to be used when normalizing images. + image_std (`List[int]`, defaults to `[0.5, 0.5, 0.5]`): + The sequence of standard deviations for each channel, to be used when normalizing images. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize=True, + size=224, + resample=Image.BILINEAR, + do_normalize=True, + image_mean=None, + image_std=None, + **kwargs + ): + super().__init__(**kwargs) + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + + def __call__( + self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs + ) -> BatchFeature: + """ + Main method to prepare for the model one or several image(s). + + + + NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass + PIL images. + + + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height, + width). + """ + # Input type checking for clearer error + valid_images = False + + # Check that images has a valid type + if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images): + valid_images = True + elif isinstance(images, (list, tuple)): + if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]): + valid_images = True + + if not valid_images: + raise ValueError( + "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), " + "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)." 
+ ) + + is_batched = bool( + isinstance(images, (list, tuple)) + and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0])) + ) + + if not is_batched: + images = [images] + + # transformations (resizing + normalization) + if self.do_resize and self.size is not None: + images = [self.resize(image=image, size=self.size, resample=self.resample) for image in images] + if self.do_normalize: + images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images] + + # return as BatchFeature + data = {"pixel_values": images} + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + return encoded_inputs diff --git a/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py new file mode 100644 index 00000000000000..9884989ebeb5a0 --- /dev/null +++ b/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py @@ -0,0 +1,845 @@ +# coding=utf-8 +# Copyright 2022 Google AI, Ross Wightman, The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch AudioSpectogramTransformer model.""" + + +import collections.abc +import math +from typing import Dict, List, Optional, Set, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput, MaskedLMOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_audio_spectogram_transformer import AudioSpectogramTransformerConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "AudioSpectogramTransformerConfig" +_FEAT_EXTRACTOR_FOR_DOC = "AudioSpectogramTransformerFeatureExtractor" + +# Base docstring +_CHECKPOINT_FOR_DOC = "MIT/ast-10-10" +_EXPECTED_OUTPUT_SHAPE = [1, 197, 768] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "google/audio_spectogram_transformer-base-patch16-224" +_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat" + + +AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "MIT/ast-10-10", + # See all Audio Spectogram Transformer models at https://huggingface.co/models?filter=audio-spectogram-transformer +] + + + +# Copied from transformers.models.vit.modeling_vit.ViTEmbeddings with ViT->AudioSpectogramTransformer +class AudioSpectogramTransformerEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. Optionally, also the mask token. 
+ """ + + def __init__(self, config: AudioSpectogramTransformerConfig, use_mask_token: bool = False) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None + self.patch_embeddings = AudioSpectogramTransformerPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.config = config + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. + + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + if num_patches == num_positions and height == width: + return self.position_embeddings + class_pos_embed = self.position_embeddings[:, 0] + patch_pos_embed = self.position_embeddings[:, 1:] + dim = embeddings.shape[-1] + h0 = height // self.config.patch_size + w0 = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + h0, w0 = h0 + 0.1, w0 + 0.1 + patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + scale_factor=(h0 / math.sqrt(num_positions), w0 / math.sqrt(num_positions)), + mode="bicubic", + align_corners=False, + ) + assert int(h0) == patch_pos_embed.shape[-2] and int(w0) == patch_pos_embed.shape[-1] + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None, + interpolate_pos_encoding: bool = False, + ) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + + if bool_masked_pos is not None: + seq_length = embeddings.shape[1] + mask_tokens = self.mask_token.expand(batch_size, seq_length, -1) + # replace the masked visual tokens by mask_tokens + mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1.0 - mask) + mask_tokens * mask + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings) + + return embeddings + + +# Copied from transformers.models.vit.modeling_vit.ViTPatchEmbeddings with ViT->AudioSpectogramTransformer +class AudioSpectogramTransformerPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + 
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + if not interpolate_pos_encoding: + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" + f" ({self.image_size[0]}*{self.image_size[1]})." + ) + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->AudioSpectogramTransformer +class AudioSpectogramTransformerSelfAttention(nn.Module): + def __init__(self, config: AudioSpectogramTransformerConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size,} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + # Normalize the attention scores to probabilities. 
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->AudioSpectogramTransformer +class AudioSpectogramTransformerSelfOutput(nn.Module): + """ + The residual connection is defined in AudioSpectogramTransformerLayer instead of here (as is the case with other + models), due to the layernorm applied before each block. + """ + + def __init__(self, config: AudioSpectogramTransformerConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->AudioSpectogramTransformer +class AudioSpectogramTransformerAttention(nn.Module): + def __init__(self, config: AudioSpectogramTransformerConfig) -> None: + super().__init__() + self.attention = AudioSpectogramTransformerSelfAttention(config) + self.output = AudioSpectogramTransformerSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: Set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTIntermediate with ViT->AudioSpectogramTransformer +class AudioSpectogramTransformerIntermediate(nn.Module): + def __init__(self, config: AudioSpectogramTransformerConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, 
config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTOutput with ViT->AudioSpectogramTransformer +class AudioSpectogramTransformerOutput(nn.Module): + def __init__(self, config: AudioSpectogramTransformerConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + hidden_states = hidden_states + input_tensor + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->AudioSpectogramTransformer +class AudioSpectogramTransformerLayer(nn.Module): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: AudioSpectogramTransformerConfig) -> None: + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = AudioSpectogramTransformerAttention(config) + self.intermediate = AudioSpectogramTransformerIntermediate(config) + self.output = AudioSpectogramTransformerOutput(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_attention_outputs = self.attention( + self.layernorm_before( + hidden_states + ), # in AudioSpectogramTransformer, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = attention_output + hidden_states + + # in AudioSpectogramTransformer, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, hidden_states) + + outputs = (layer_output,) + outputs + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->AudioSpectogramTransformer +class AudioSpectogramTransformerEncoder(nn.Module): + def __init__(self, config: AudioSpectogramTransformerConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([AudioSpectogramTransformerLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + 
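+        # run the embeddings through each Transformer block in turn; with gradient checkpointing enabled during
+        # training, the block's activations are recomputed in the backward pass to save memory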
+ for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + layer_head_mask, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +# Copied from transformers.models.vit.modeling_vit.ViTPreTrainedModel with ViT->AudioSpectogramTransformer,vit->audio_spectogram_transformer +class AudioSpectogramTransformerPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = AudioSpectogramTransformerConfig + base_model_prefix = "audio_spectogram_transformer" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module: AudioSpectogramTransformerEncoder, value: bool = False) -> None: + if isinstance(module, AudioSpectogramTransformerEncoder): + module.gradient_checkpointing = value + + +AUDIO_SPECTOGRAM_TRANSFORMER_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`AudioSpectogramTransformerConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +AUDIO_SPECTOGRAM_TRANSFORMER_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AudioSpectogramTransformerFeatureExtractor`]. See + [`AudioSpectogramTransformerFeatureExtractor.__call__`] for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. 
Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare AudioSpectogramTransformer Model transformer outputting raw hidden-states without any specific head on top.", + AUDIO_SPECTOGRAM_TRANSFORMER_START_DOCSTRING, +) +# Copied from transformers.models.vit.modeling_vit.ViTModel with VIT->AUDIO_SPECTOGRAM_TRANSFORMER,ViT->AudioSpectogramTransformer +class AudioSpectogramTransformerModel(AudioSpectogramTransformerPreTrainedModel): + def __init__( + self, config: AudioSpectogramTransformerConfig, add_pooling_layer: bool = True, use_mask_token: bool = False + ): + super().__init__(config) + self.config = config + + self.embeddings = AudioSpectogramTransformerEmbeddings(config, use_mask_token=use_mask_token) + self.encoder = AudioSpectogramTransformerEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.pooler = AudioSpectogramTransformerPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> AudioSpectogramTransformerPatchEmbeddings: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None: + """ + Prunes heads of the model. 
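For example, passing `{0: [0, 1]}` prunes heads 0 and 1 of the first encoder layer.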
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(AUDIO_SPECTOGRAM_TRANSFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding + ) + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +# Copied from transformers.models.vit.modeling_vit.ViTPooler with ViT->AudioSpectogramTransformer +class AudioSpectogramTransformerPooler(nn.Module): + def __init__(self, config: AudioSpectogramTransformerConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +@add_start_docstrings( + """AudioSpectogramTransformer Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886). 
+ + + + Note that we provide a script to pre-train this model on custom data in our [examples + directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining). + + + """, + AUDIO_SPECTOGRAM_TRANSFORMER_START_DOCSTRING, +) +# Copied from transformers.models.vit.modeling_vit.ViTForMaskedImageModeling with VIT->AUDIO_SPECTOGRAM_TRANSFORMER,ViT->AudioSpectogramTransformer,vit->audio_spectogram_transformer,google/vit-base-patch16-224-in21k->MIT/ast-10-10 +class AudioSpectogramTransformerForMaskedImageModeling(AudioSpectogramTransformerPreTrainedModel): + def __init__(self, config: AudioSpectogramTransformerConfig) -> None: + super().__init__(config) + + self.audio_spectogram_transformer = AudioSpectogramTransformerModel( + config, add_pooling_layer=False, use_mask_token=True + ) + + self.decoder = nn.Sequential( + nn.Conv2d( + in_channels=config.hidden_size, + out_channels=config.encoder_stride**2 * config.num_channels, + kernel_size=1, + ), + nn.PixelShuffle(config.encoder_stride), + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(AUDIO_SPECTOGRAM_TRANSFORMER_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, MaskedLMOutput]: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). + + Returns: + + Examples: + ```python + >>> from transformers import ( + ... AudioSpectogramTransformerFeatureExtractor, + ... AudioSpectogramTransformerForMaskedImageModeling, + ... ) + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = AudioSpectogramTransformerFeatureExtractor.from_pretrained( + ... "google/audio_spectogram_transformer-base-patch16-224-in21k" + ... ) + >>> model = AudioSpectogramTransformerForMaskedImageModeling.from_pretrained( + ... "google/audio_spectogram_transformer-base-patch16-224-in21k" + ... 
) + + >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2 + >>> pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values + >>> # create random boolean mask of shape (batch_size, num_patches) + >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool() + + >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos) + >>> loss, reconstructed_pixel_values = outputs.loss, outputs.logits + >>> list(reconstructed_pixel_values.shape) + [1, 3, 224, 224] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.audio_spectogram_transformer( + pixel_values, + bool_masked_pos=bool_masked_pos, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + # Reshape to (batch_size, num_channels, height, width) + sequence_output = sequence_output[:, 1:] + batch_size, sequence_length, num_channels = sequence_output.shape + height = width = math.floor(sequence_length**0.5) + sequence_output = sequence_output.permute(0, 2, 1).reshape(batch_size, num_channels, height, width) + + # Reconstruct pixel values + reconstructed_pixel_values = self.decoder(sequence_output) + + masked_im_loss = None + if bool_masked_pos is not None: + size = self.config.image_size // self.config.patch_size + bool_masked_pos = bool_masked_pos.reshape(-1, size, size) + mask = ( + bool_masked_pos.repeat_interleave(self.config.patch_size, 1) + .repeat_interleave(self.config.patch_size, 2) + .unsqueeze(1) + .contiguous() + ) + reconstruction_loss = nn.functional.l1_loss(pixel_values, reconstructed_pixel_values, reduction="none") + masked_im_loss = (reconstruction_loss * mask).sum() / (mask.sum() + 1e-5) / self.config.num_channels + + if not return_dict: + output = (reconstructed_pixel_values,) + outputs[1:] + return ((masked_im_loss,) + output) if masked_im_loss is not None else output + + return MaskedLMOutput( + loss=masked_im_loss, + logits=reconstructed_pixel_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + AudioSpectogramTransformer Model transformer with an image classification head on top (a linear layer on top of the final hidden state of + the [CLS] token) e.g. for ImageNet. + + + + Note that it's possible to fine-tune AudioSpectogramTransformer on higher resolution images than the ones it has been trained on, by + setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained + position embeddings to the higher resolution. 
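+    For example, a checkpoint pre-trained on 224x224 inputs can be evaluated on 384x384 inputs this way, without
+    training new position embeddings from scratch.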
+ + + """, + AUDIO_SPECTOGRAM_TRANSFORMER_START_DOCSTRING, +) +# Copied from transformers.models.vit.modeling_vit.ViTForImageClassification with VIT->AUDIO_SPECTOGRAM_TRANSFORMER,ViT->AudioSpectogramTransformer,vit->audio_spectogram_transformer +class AudioSpectogramTransformerForImageClassification(AudioSpectogramTransformerPreTrainedModel): + def __init__(self, config: AudioSpectogramTransformerConfig) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.audio_spectogram_transformer = AudioSpectogramTransformerModel(config, add_pooling_layer=False) + + # Classifier head + self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(AUDIO_SPECTOGRAM_TRANSFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=ImageClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.audio_spectogram_transformer( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.classifier(sequence_output[:, 0, :]) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index ac542bd14ba675..f1f6f8f0e243a4 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -157,6 +157,7 @@ ("vision-text-dual-encoder", "VisionTextDualEncoderConfig"), ("visual_bert", "VisualBertConfig"), ("vit", "ViTConfig"), + ("audio-spectogram-transformer", "AudioSpectogramTransformerConfig"), ("vit_mae", "ViTMAEConfig"), ("vit_msn", "ViTMSNConfig"), ("wav2vec2", "Wav2Vec2Config"), @@ -293,6 +294,7 @@ ("vilt", "VILT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("visual_bert", "VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("vit", "VIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("audio-spectogram-transformer", "AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("vit_mae", "VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("vit_msn", "VIT_MSN_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("wav2vec2", "WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -460,6 +462,7 @@ ("vision-text-dual-encoder", "VisionTextDualEncoder"), ("visual_bert", "VisualBERT"), ("vit", "ViT"), + ("audio-spectogram-transformer", "Audio Spectogram Transformer"), ("vit_mae", "ViTMAE"), ("vit_msn", "ViTMSN"), ("wav2vec2", "Wav2Vec2"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 40429942282404..cbe23d3830a645 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -79,6 +79,7 @@ ("videomae", "VideoMAEFeatureExtractor"), ("vilt", "ViltFeatureExtractor"), ("vit", "ViTFeatureExtractor"), + ("audio-spectogram-transformer", "AudioSpectogramTransformerFeatureExtractor"), ("vit_mae", "ViTFeatureExtractor"), ("vit_msn", "ViTFeatureExtractor"), ("wav2vec2", "Wav2Vec2FeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py 
b/src/transformers/models/auto/modeling_auto.py index 433e4f63384663..f9964cb510205e 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -151,6 +151,7 @@ ("vision-text-dual-encoder", "VisionTextDualEncoderModel"), ("visual_bert", "VisualBertModel"), ("vit", "ViTModel"), + ("audio-spectogram-transformer", "AudioSpectogramTransformerModel"), ("vit_mae", "ViTMAEModel"), ("vit_msn", "ViTMSNModel"), ("wav2vec2", "Wav2Vec2Model"), @@ -351,6 +352,7 @@ ("swin", "SwinForMaskedImageModeling"), ("swinv2", "Swinv2ForMaskedImageModeling"), ("vit", "ViTForMaskedImageModeling"), + ("audio-spectogram-transformer", "AudioSpectogramTransformerForMaskedImageModeling"), ] ) @@ -393,6 +395,7 @@ ("swinv2", "Swinv2ForImageClassification"), ("van", "VanForImageClassification"), ("vit", "ViTForImageClassification"), + ("audio-spectogram-transformer", "AudioSpectogramTransformerForImageClassification"), ("vit_msn", "ViTMSNForImageClassification"), ] ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 8d9d1c93381c3b..7319358dc12bec 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -5698,6 +5698,37 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class AudioSpectogramTransformerForImageClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class AudioSpectogramTransformerForMaskedImageModeling(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class AudioSpectogramTransformerModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class AudioSpectogramTransformerPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 7ce1f1867057f3..e5438b4c159d7f 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -358,7 +358,7 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class ViTImageProcessor(metaclass=DummyObject): +class AudioSpectogramTransformerFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] def __init__(self, *args, **kwargs): diff --git a/tests/models/audio_spectogram_transformer/__init__.py b/tests/models/audio_spectogram_transformer/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/audio_spectogram_transformer/test_feature_extraction_audio_spectogram_transformer.py b/tests/models/audio_spectogram_transformer/test_feature_extraction_audio_spectogram_transformer.py new file mode 100644 index 00000000000000..05db4ec24511c3 --- /dev/null +++ b/tests/models/audio_spectogram_transformer/test_feature_extraction_audio_spectogram_transformer.py @@ -0,0 +1,191 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +import numpy as np + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import AudioSpectogramTransformerFeatureExtractor + + +class AudioSpectogramTransformerFeatureExtractionTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=18, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_feat_extract_dict(self): + return { + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_normalize": self.do_normalize, + "do_resize": self.do_resize, + "size": self.size, + } + + +@require_torch +@require_vision +class AudioSpectogramTransformerFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): + + feature_extraction_class = AudioSpectogramTransformerFeatureExtractor if is_vision_available() else None + + def setUp(self): + self.feature_extract_tester = AudioSpectogramTransformerFeatureExtractionTester(self) + + @property + def feat_extract_dict(self): + return self.feature_extract_tester.prepare_feat_extract_dict() + + def test_feat_extract_properties(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + self.assertTrue(hasattr(feature_extractor, "image_mean")) + self.assertTrue(hasattr(feature_extractor, "image_std")) + self.assertTrue(hasattr(feature_extractor, "do_normalize")) + self.assertTrue(hasattr(feature_extractor, "do_resize")) + self.assertTrue(hasattr(feature_extractor, "size")) + + def test_batch_feature(self): + pass + + def test_call_pil(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PIL images + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + 
self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + def test_call_numpy(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random numpy tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + def test_call_pytorch(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) diff --git a/tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py b/tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py new file mode 100644 index 00000000000000..01cfcf26320b77 --- /dev/null +++ b/tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py @@ -0,0 +1,302 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch AudioSpectogramTransformer model. 
""" + + +import inspect +import unittest + +from transformers import AudioSpectogramTransformerConfig +from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import AudioSpectogramTransformerForImageClassification, AudioSpectogramTransformerForMaskedImageModeling, AudioSpectogramTransformerModel + from transformers.models.audio_spectogram_transformer.modeling_audio_spectogram_transformer import AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import AudioSpectogramTransformerFeatureExtractor + + +class AudioSpectogramTransformerModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_sequence_label_size=10, + initializer_range=0.02, + scope=None, + encoder_stride=2, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.scope = scope + self.encoder_stride = encoder_stride + + # in AudioSpectogramTransformer, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return AudioSpectogramTransformerConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + initializer_range=self.initializer_range, + encoder_stride=self.encoder_stride, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = AudioSpectogramTransformerModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def 
create_and_check_for_masked_image_modeling(self, config, pixel_values, labels): + model = AudioSpectogramTransformerForMaskedImageModeling(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.num_channels, self.image_size, self.image_size) + ) + + # test greyscale images + config.num_channels = 1 + model = AudioSpectogramTransformerForMaskedImageModeling(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values) + self.parent.assertEqual(result.logits.shape, (self.batch_size, 1, self.image_size, self.image_size)) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = AudioSpectogramTransformerForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + # test greyscale images + config.num_channels = 1 + model = AudioSpectogramTransformerForImageClassification(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + pixel_values, + labels, + ) = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class AudioSpectogramTransformerModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as AudioSpectogramTransformer does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = ( + ( + AudioSpectogramTransformerModel, + AudioSpectogramTransformerForImageClassification, + AudioSpectogramTransformerForMaskedImageModeling, + ) + if is_torch_available() + else () + ) + fx_compatible = False + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = AudioSpectogramTransformerModelTester(self) + self.config_tester = ConfigTester(self, config_class=AudioSpectogramTransformerConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="AudioSpectogramTransformer does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_image_modeling(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = AudioSpectogramTransformerModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class AudioSpectogramTransformerModelIntegrationTest(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return AudioSpectogramTransformerFeatureExtractor.from_pretrained("google/audio_spectogram_transformer-base-patch16-224") if is_vision_available() else None + + @slow + def test_inference_image_classification_head(self): + model = AudioSpectogramTransformerForImageClassification.from_pretrained("google/audio_spectogram_transformer-base-patch16-224").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([-0.2744, 0.8215, -0.0836]).to(torch_device) + + self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, 
atol=1e-4)) + + @slow + def test_inference_interpolate_pos_encoding(self): + # AudioSpectogramTransformer models have an `interpolate_pos_encoding` argument in their forward method, + # allowing to interpolate the pre-trained position embeddings in order to use + # the model on higher resolutions. The DINO model by Facebook AI leverages this + # to visualize self-attention on higher resolution images. + model = AudioSpectogramTransformerModel.from_pretrained("facebook/dino-audio_spectogram_transformers8").to(torch_device) + + feature_extractor = AudioSpectogramTransformerFeatureExtractor.from_pretrained("facebook/dino-audio_spectogram_transformers8", size=480) + image = prepare_img() + inputs = feature_extractor(images=image, return_tensors="pt") + pixel_values = inputs.pixel_values.to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(pixel_values, interpolate_pos_encoding=True) + + # verify the logits + expected_shape = torch.Size((1, 3601, 384)) + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor( + [[4.2340, 4.3906, -6.6692], [4.5463, 1.8928, -6.7257], [4.4429, 0.8496, -5.8585]] + ).to(torch_device) + + self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) From 353ead3b622d2d32ff691a3c00c977019d6d4c40 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 3 Oct 2022 11:23:46 +0200 Subject: [PATCH 02/37] Make conversion script work --- src/transformers/__init__.py | 2 + .../audio_spectogram_transformer/__init__.py | 2 + ...figuration_audio_spectogram_transformer.py | 38 +- ...ctogram_transformer_original_to_pytorch.py | 227 ++++++++++++ .../modeling_audio_spectogram_transformer.py | 350 ++++-------------- .../audio_spectogram_transformer/test.py | 14 + src/transformers/models/auto/modeling_auto.py | 1 + 7 files changed, 331 insertions(+), 303 deletions(-) create mode 100644 src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py create mode 100644 src/transformers/models/audio_spectogram_transformer/test.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 3d3d964a06fd48..62911f60f2faf6 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2169,6 +2169,7 @@ "AudioSpectogramTransformerForMaskedImageModeling", "AudioSpectogramTransformerModel", "AudioSpectogramTransformerPreTrainedModel", + "AudioSpectogramTransformerForSequenceClassification", ] ) _import_structure["models.vit_mae"].extend( @@ -5025,6 +5026,7 @@ AudioSpectogramTransformerForMaskedImageModeling, AudioSpectogramTransformerModel, AudioSpectogramTransformerPreTrainedModel, + AudioSpectogramTransformerForSequenceClassification, ) from .models.vit_mae import ( VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/audio_spectogram_transformer/__init__.py b/src/transformers/models/audio_spectogram_transformer/__init__.py index 8c6d5e303fe786..93762ed569cc71 100644 --- a/src/transformers/models/audio_spectogram_transformer/__init__.py +++ b/src/transformers/models/audio_spectogram_transformer/__init__.py @@ -47,6 +47,7 @@ "AudioSpectogramTransformerForMaskedImageModeling", "AudioSpectogramTransformerModel", "AudioSpectogramTransformerPreTrainedModel", + "AudioSpectogramTransformerForSequenceClassification", ] if TYPE_CHECKING: @@ -72,6 +73,7 @@ AudioSpectogramTransformerForMaskedImageModeling, AudioSpectogramTransformerModel, AudioSpectogramTransformerPreTrainedModel, + 
AudioSpectogramTransformerForSequenceClassification, ) else: diff --git a/src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py index 6bbc59f6724c2d..3e09881c3d393f 100644 --- a/src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py +++ b/src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py @@ -20,7 +20,6 @@ from packaging import version from ...configuration_utils import PretrainedConfig -from ...onnx import OnnxConfig from ...utils import logging @@ -71,8 +70,14 @@ class AudioSpectogramTransformerConfig(PretrainedConfig): The number of input channels. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. - encoder_stride (`int`, `optional`, defaults to 16): - Factor to increase the spatial resolution by in the decoder head for masked image modeling. + fstride (`int`, *optional*, defaults to 10): + ... + tstride (`int`, *optional*, defaults to 10): + ... + input_fdim (`int`, *optional*, defaults to 128): + ... + input_tdim (`int`, *optional*, defaults to 1024): + ... Example: @@ -106,7 +111,10 @@ def __init__( patch_size=16, num_channels=3, qkv_bias=True, - encoder_stride=16, + fstride=10, + tstride=10, + input_fdim=128, + input_tdim=1024, **kwargs ): super().__init__(**kwargs) @@ -124,21 +132,7 @@ def __init__( self.patch_size = patch_size self.num_channels = num_channels self.qkv_bias = qkv_bias - self.encoder_stride = encoder_stride - - -class AudioSpectogramTransformerOnnxConfig(OnnxConfig): - - torch_onnx_minimum_version = version.parse("1.11") - - @property - def inputs(self) -> Mapping[str, Mapping[int, str]]: - return OrderedDict( - [ - ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), - ] - ) - - @property - def atol_for_validation(self) -> float: - return 1e-4 + self.fstride = fstride + self.tstride = tstride + self.input_fdim = input_fdim + self.input_tdim = input_tdim diff --git a/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py new file mode 100644 index 00000000000000..e5817a768b4e93 --- /dev/null +++ b/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py @@ -0,0 +1,227 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert Audio Spectogram Transformer checkpoints from the original repository. 
URL: https://github.com/YuanGongND/ast""" + + +import argparse +import json +from pathlib import Path + +import torch +from PIL import Image + +import requests +from huggingface_hub import hf_hub_download +from transformers import AudioSpectogramTransformerConfig, AudioSpectogramTransformerForSequenceClassification +from transformers.utils import logging + +import torchaudio + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def get_audio_spectogram_transformer_config(model_name): + config = AudioSpectogramTransformerConfig() + + config.num_labels = 527 + + #TODO add id2label mappings + # repo_id = "huggingface/label-files" + # filename = "coco-detection-id2label.json" + # id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) + # id2label = {int(k): v for k, v in id2label.items()} + # config.id2label = id2label + # config.label2id = {v: k for k, v in id2label.items()} + + return config + + +def rename_key(name): + if "module.v" in name: + name = name.replace("module.v", "audio_spectogram_transformer") + if "cls_token" in name: + name = name.replace("cls_token", "embeddings.cls_token") + if "dist_token" in name: + name = name.replace("dist_token", "embeddings.distillation_token") + if "pos_embed" in name: + name = name.replace("pos_embed", "embeddings.position_embeddings") + if "patch_embed.proj" in name: + name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection") + # transformer blocks + if "blocks" in name: + name = name.replace("blocks", "encoder.layer") + if "attn.proj" in name: + name = name.replace("attn.proj", "attention.output.dense") + if "attn" in name: + name = name.replace("attn", "attention.self") + if "norm1" in name: + name = name.replace("norm1", "layernorm_before") + if "norm2" in name: + name = name.replace("norm2", "layernorm_after") + if "mlp.fc1" in name: + name = name.replace("mlp.fc1", "intermediate.dense") + if "mlp.fc2" in name: + name = name.replace("mlp.fc2", "output.dense") + # final layernorm + if "audio_spectogram_transformer.norm" in name: + name = name.replace("audio_spectogram_transformer.norm", "audio_spectogram_transformer.layernorm") + # classifier head + if "module.mlp_head.0" in name: + name = name.replace("module.mlp_head.0", "layernorm") + if "module.mlp_head.1" in name: + name = name.replace("module.mlp_head.1", "classifier") + + return name + + +def convert_state_dict(orig_state_dict, config): + for key in orig_state_dict.copy().keys(): + val = orig_state_dict.pop(key) + + if "qkv" in key: + key_split = key.split(".") + layer_num = int(key_split[3]) + dim = config.hidden_size + if "weight" in key: + orig_state_dict[f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.query.weight"] = val[:dim, :] + orig_state_dict[f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.key.weight"] = val[ + dim : dim * 2, : + ] + orig_state_dict[f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.value.weight"] = val[-dim:, :] + else: + orig_state_dict[f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.query.bias"] = val[:dim] + orig_state_dict[f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.key.bias"] = val[dim : dim * 2] + orig_state_dict[f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.value.bias"] = val[-dim:] + else: + orig_state_dict[rename_key(key)] = val + + return orig_state_dict + + +def remove_keys(state_dict): + 
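# the classification heads of the original vision backbone are unused and have no counterpart in the HF model, so drop them
+    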
ignore_keys = [
+        "module.v.head.weight",
+        "module.v.head.bias",
+        "module.v.head_dist.weight",
+        "module.v.head_dist.bias",
+    ]
+    for k in ignore_keys:
+        state_dict.pop(k, None)
+
+
+def make_features(wav_name, mel_bins, target_length=1024):
+    waveform, sr = torchaudio.load(wav_name)
+
+    fbank = torchaudio.compliance.kaldi.fbank(
+        waveform, htk_compat=True, sample_frequency=sr, use_energy=False,
+        window_type='hanning', num_mel_bins=mel_bins, dither=0.0,
+        frame_shift=10)
+
+    n_frames = fbank.shape[0]
+
+    p = target_length - n_frames
+    if p > 0:
+        m = torch.nn.ZeroPad2d((0, 0, 0, p))
+        fbank = m(fbank)
+    elif p < 0:
+        fbank = fbank[0:target_length, :]
+
+    fbank = (fbank - (-4.2677393)) / (4.5689974 * 2)
+    return fbank
+
+
+@torch.no_grad()
+def convert_audio_spectogram_transformer_checkpoint(model_name, checkpoint_url, pytorch_dump_folder_path, push_to_hub=False):
+    """
+    Copy/paste/tweak model's weights to our Audio Spectogram Transformer structure.
+    """
+    config = get_audio_spectogram_transformer_config(model_name)
+
+    # load original state_dict
+    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")
+    # remove some keys
+    remove_keys(state_dict)
+    # rename some keys
+    new_state_dict = convert_state_dict(state_dict, config)
+
+    # load ๐Ÿค— model
+    model = AudioSpectogramTransformerForSequenceClassification(config)
+    model.eval()
+
+    model.load_state_dict(new_state_dict)
+
+    # verify outputs on dummy input
+    filepath = hf_hub_download(repo_id="nielsr/audio-spectogram-transformer-checkpoint",
+                               filename="sample_audio.flac",
+                               repo_type="dataset")
+    features = make_features(filepath, mel_bins=128)  # shape(1024, 128)
+    input_values = features.expand(1, 1024, 128)  # (batch_size, time, freq)
+
+    # forward pass
+    outputs = model(input_values)
+    logits = outputs.logits
+
+    print("Shape of logits:", logits.shape)
+    print("Predicted class:", logits.argmax(-1))
+
+    expected_slice = torch.tensor([-0.8760, -7.0042, -8.6602])
+    if not torch.allclose(logits[0, :3], expected_slice, atol=1e-4):
+        raise ValueError("Logits don't match")
+
+    if pytorch_dump_folder_path is not None:
+        Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
+        print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
+        model.save_pretrained(pytorch_dump_folder_path)
+
+    if push_to_hub:
+        model_mapping = {
+            "audio_spectogram_transformer_ti": "audio_spectogram_transformer-tiny",
+            "audio_spectogram_transformer_s_200_pre": "audio_spectogram_transformer-small",
+            "audio_spectogram_transformer_s_300_pre": "audio_spectogram_transformer-small-300",
+            "audio_spectogram_transformer_s_dWr": "audio_spectogram_transformer-small-dwr",
+            "audio_spectogram_transformer_base": "audio_spectogram_transformer-base",
+        }
+
+        print("Pushing to the hub...")
+        model_name = model_mapping[model_name]
+        model.push_to_hub(model_name, organization="hustvl")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument(
+        "--model_name",
+        default="audio_spectogram_transformer_s_200_pre",
+        type=str,
+        help=(
+            "Name of the Audio Spectogram Transformer model you'd like to convert. Should be one of 'audio_spectogram_transformer_ti', 'audio_spectogram_transformer_s_200_pre',"
+            " 'audio_spectogram_transformer_s_300_pre', 'audio_spectogram_transformer_s_dWr', 'audio_spectogram_transformer_base'."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoint_url", default="https://www.dropbox.com/s/ca0b1v2nlxzyeb4/audioset_10_10_0.4593.pth?dl=1", type=str, help="URL of the original state dict (.pth file)."
+ ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the ๐Ÿค— hub." + ) + + args = parser.parse_args() + convert_audio_spectogram_transformer_checkpoint(args.model_name, args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub) \ No newline at end of file diff --git a/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py index 9884989ebeb5a0..2c6f06ca066091 100644 --- a/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py +++ b/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py @@ -25,7 +25,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN -from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput, MaskedLMOutput +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, SequenceClassifierOutput, MaskedLMOutput from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( @@ -60,123 +60,75 @@ -# Copied from transformers.models.vit.modeling_vit.ViTEmbeddings with ViT->AudioSpectogramTransformer class AudioSpectogramTransformerEmbeddings(nn.Module): """ - Construct the CLS token, position and patch embeddings. Optionally, also the mask token. + Construct the CLS token, position and patch embeddings. """ - def __init__(self, config: AudioSpectogramTransformerConfig, use_mask_token: bool = False) -> None: + def __init__(self, config: AudioSpectogramTransformerConfig) -> None: super().__init__() self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) - self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None + self.distillation_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) self.patch_embeddings = AudioSpectogramTransformerPatchEmbeddings(config) - num_patches = self.patch_embeddings.num_patches - self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) + + frequency_dimension, time_dimension = self.get_shape(config) + num_patches = frequency_dimension * time_dimension + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 2, config.hidden_size)) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.config = config - def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: - """ - This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher - resolution images. 
- - Source: - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 - """ - - num_patches = embeddings.shape[1] - 1 - num_positions = self.position_embeddings.shape[1] - 1 - if num_patches == num_positions and height == width: - return self.position_embeddings - class_pos_embed = self.position_embeddings[:, 0] - patch_pos_embed = self.position_embeddings[:, 1:] - dim = embeddings.shape[-1] - h0 = height // self.config.patch_size - w0 = width // self.config.patch_size - # we add a small number to avoid floating point error in the interpolation - # see discussion at https://github.com/facebookresearch/dino/issues/8 - h0, w0 = h0 + 0.1, w0 + 0.1 - patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) - patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) - patch_pos_embed = nn.functional.interpolate( - patch_pos_embed, - scale_factor=(h0 / math.sqrt(num_positions), w0 / math.sqrt(num_positions)), - mode="bicubic", - align_corners=False, - ) - assert int(h0) == patch_pos_embed.shape[-2] and int(w0) == patch_pos_embed.shape[-1] - patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) - return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + def get_shape(self, config): + fstride = config.fstride + tstride = config.tstride + input_fdim = config.input_fdim + input_tdim = config.input_tdim + test_input = torch.randn(1, 1, input_fdim, input_tdim) + test_proj = nn.Conv2d(1, config.hidden_size, kernel_size=(16, 16), stride=(fstride, tstride)) + test_out = test_proj(test_input) + f_dim = test_out.shape[2] + t_dim = test_out.shape[3] + return f_dim, t_dim + + def forward(self, input_values: torch.Tensor) -> torch.Tensor: + batch_size = input_values.shape[0] + embeddings = self.patch_embeddings(input_values) - def forward( - self, - pixel_values: torch.Tensor, - bool_masked_pos: Optional[torch.BoolTensor] = None, - interpolate_pos_encoding: bool = False, - ) -> torch.Tensor: - batch_size, num_channels, height, width = pixel_values.shape - embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) - - if bool_masked_pos is not None: - seq_length = embeddings.shape[1] - mask_tokens = self.mask_token.expand(batch_size, seq_length, -1) - # replace the masked visual tokens by mask_tokens - mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) - embeddings = embeddings * (1.0 - mask) + mask_tokens * mask - - # add the [CLS] token to the embedded patch tokens cls_tokens = self.cls_token.expand(batch_size, -1, -1) - embeddings = torch.cat((cls_tokens, embeddings), dim=1) - - # add positional encoding to each token - if interpolate_pos_encoding: - embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) - else: - embeddings = embeddings + self.position_embeddings - + distillation_tokens = self.distillation_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, distillation_tokens, embeddings), dim=1) + embeddings = embeddings + self.position_embeddings embeddings = self.dropout(embeddings) return embeddings -# Copied from transformers.models.vit.modeling_vit.ViTPatchEmbeddings with ViT->AudioSpectogramTransformer class AudioSpectogramTransformerPatchEmbeddings(nn.Module): """ - This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + This class turns `input_values` of shape `(batch_size, num_channels, height, width)` into the 
initial `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a Transformer. """ def __init__(self, config): super().__init__() - image_size, patch_size = config.image_size, config.patch_size - num_channels, hidden_size = config.num_channels, config.hidden_size - - image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) - patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.num_patches = num_patches - - self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) - - def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor: - batch_size, num_channels, height, width = pixel_values.shape - if num_channels != self.num_channels: - raise ValueError( - "Make sure that the channel dimension of the pixel values match with the one set in the configuration." - ) - if not interpolate_pos_encoding: - if height != self.image_size[0] or width != self.image_size[1]: - raise ValueError( - f"Input image size ({height}*{width}) doesn't match model" - f" ({self.image_size[0]}*{self.image_size[1]})." - ) - embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + # image_size, patch_size = config.image_size, config.patch_size + # num_channels, hidden_size = config.num_channels, config.hidden_size + + # image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + # patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + # num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + # self.image_size = image_size + # self.patch_size = patch_size + # self.num_channels = num_channels + # self.num_patches = num_patches + + self.projection = nn.Conv2d(1, config.hidden_size, kernel_size=(16, 16), stride=(10, 10)) + + def forward(self, input_values: torch.Tensor) -> torch.Tensor: + input_values = input_values.unsqueeze(1) + input_values = input_values.transpose(2, 3) + embeddings = self.projection(input_values).flatten(2).transpose(1, 2) return embeddings @@ -447,7 +399,7 @@ class AudioSpectogramTransformerPreTrainedModel(PreTrainedModel): config_class = AudioSpectogramTransformerConfig base_model_prefix = "audio_spectogram_transformer" - main_input_name = "pixel_values" + main_input_name = "input_values" supports_gradient_checkpointing = True def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: @@ -480,7 +432,7 @@ def _set_gradient_checkpointing(self, module: AudioSpectogramTransformerEncoder, AUDIO_SPECTOGRAM_TRANSFORMER_INPUTS_DOCSTRING = r""" Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + input_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Pixel values can be obtained using [`AudioSpectogramTransformerFeatureExtractor`]. See [`AudioSpectogramTransformerFeatureExtractor.__call__`] for details. @@ -496,8 +448,6 @@ def _set_gradient_checkpointing(self, module: AudioSpectogramTransformerEncoder, output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. 
See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*): - Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ @@ -507,19 +457,15 @@ def _set_gradient_checkpointing(self, module: AudioSpectogramTransformerEncoder, "The bare AudioSpectogramTransformer Model transformer outputting raw hidden-states without any specific head on top.", AUDIO_SPECTOGRAM_TRANSFORMER_START_DOCSTRING, ) -# Copied from transformers.models.vit.modeling_vit.ViTModel with VIT->AUDIO_SPECTOGRAM_TRANSFORMER,ViT->AudioSpectogramTransformer class AudioSpectogramTransformerModel(AudioSpectogramTransformerPreTrainedModel): - def __init__( - self, config: AudioSpectogramTransformerConfig, add_pooling_layer: bool = True, use_mask_token: bool = False - ): + def __init__(self, config: AudioSpectogramTransformerConfig): super().__init__(config) self.config = config - self.embeddings = AudioSpectogramTransformerEmbeddings(config, use_mask_token=use_mask_token) + self.embeddings = AudioSpectogramTransformerEmbeddings(config) self.encoder = AudioSpectogramTransformerEncoder(config) self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.pooler = AudioSpectogramTransformerPooler(config) if add_pooling_layer else None # Initialize weights and apply final processing self.post_init() @@ -541,17 +487,15 @@ class PreTrainedModel checkpoint=_CHECKPOINT_FOR_DOC, output_type=BaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC, - modality="vision", + modality="audio", expected_output=_EXPECTED_OUTPUT_SHAPE, ) def forward( self, - pixel_values: Optional[torch.Tensor] = None, - bool_masked_pos: Optional[torch.BoolTensor] = None, + input_values: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - interpolate_pos_encoding: Optional[bool] = None, return_dict: Optional[bool] = None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -560,8 +504,8 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if pixel_values is None: - raise ValueError("You have to specify pixel_values") + if input_values is None: + raise ValueError("You have to specify input_values") # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head @@ -570,9 +514,7 @@ def forward( # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - embedding_output = self.embeddings( - pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding - ) + embedding_output = self.embeddings(input_values) encoder_outputs = self.encoder( embedding_output, @@ -583,11 +525,11 @@ def forward( ) sequence_output = encoder_outputs[0] sequence_output = self.layernorm(sequence_output) - pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + pooled_output = (sequence_output[:, 0] + sequence_output[:, 1]) / 2 if not return_dict: - head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,) - return head_outputs + encoder_outputs[1:] + return (sequence_output, pooled_output) + encoder_outputs[1:] return BaseModelOutputWithPooling( 
last_hidden_state=sequence_output, @@ -596,176 +538,23 @@ def forward( attentions=encoder_outputs.attentions, ) - -# Copied from transformers.models.vit.modeling_vit.ViTPooler with ViT->AudioSpectogramTransformer -class AudioSpectogramTransformerPooler(nn.Module): - def __init__(self, config: AudioSpectogramTransformerConfig): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. - first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -@add_start_docstrings( - """AudioSpectogramTransformer Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886). - - - - Note that we provide a script to pre-train this model on custom data in our [examples - directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining). - - - """, - AUDIO_SPECTOGRAM_TRANSFORMER_START_DOCSTRING, -) -# Copied from transformers.models.vit.modeling_vit.ViTForMaskedImageModeling with VIT->AUDIO_SPECTOGRAM_TRANSFORMER,ViT->AudioSpectogramTransformer,vit->audio_spectogram_transformer,google/vit-base-patch16-224-in21k->MIT/ast-10-10 -class AudioSpectogramTransformerForMaskedImageModeling(AudioSpectogramTransformerPreTrainedModel): - def __init__(self, config: AudioSpectogramTransformerConfig) -> None: - super().__init__(config) - - self.audio_spectogram_transformer = AudioSpectogramTransformerModel( - config, add_pooling_layer=False, use_mask_token=True - ) - - self.decoder = nn.Sequential( - nn.Conv2d( - in_channels=config.hidden_size, - out_channels=config.encoder_stride**2 * config.num_channels, - kernel_size=1, - ), - nn.PixelShuffle(config.encoder_stride), - ) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward(AUDIO_SPECTOGRAM_TRANSFORMER_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - pixel_values: Optional[torch.Tensor] = None, - bool_masked_pos: Optional[torch.BoolTensor] = None, - head_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - interpolate_pos_encoding: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[tuple, MaskedLMOutput]: - r""" - bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`): - Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). - - Returns: - - Examples: - ```python - >>> from transformers import ( - ... AudioSpectogramTransformerFeatureExtractor, - ... AudioSpectogramTransformerForMaskedImageModeling, - ... ) - >>> import torch - >>> from PIL import Image - >>> import requests - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> feature_extractor = AudioSpectogramTransformerFeatureExtractor.from_pretrained( - ... "google/audio_spectogram_transformer-base-patch16-224-in21k" - ... ) - >>> model = AudioSpectogramTransformerForMaskedImageModeling.from_pretrained( - ... "google/audio_spectogram_transformer-base-patch16-224-in21k" - ... 
) - - >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2 - >>> pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values - >>> # create random boolean mask of shape (batch_size, num_patches) - >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool() - - >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos) - >>> loss, reconstructed_pixel_values = outputs.loss, outputs.logits - >>> list(reconstructed_pixel_values.shape) - [1, 3, 224, 224] - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.audio_spectogram_transformer( - pixel_values, - bool_masked_pos=bool_masked_pos, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - # Reshape to (batch_size, num_channels, height, width) - sequence_output = sequence_output[:, 1:] - batch_size, sequence_length, num_channels = sequence_output.shape - height = width = math.floor(sequence_length**0.5) - sequence_output = sequence_output.permute(0, 2, 1).reshape(batch_size, num_channels, height, width) - - # Reconstruct pixel values - reconstructed_pixel_values = self.decoder(sequence_output) - - masked_im_loss = None - if bool_masked_pos is not None: - size = self.config.image_size // self.config.patch_size - bool_masked_pos = bool_masked_pos.reshape(-1, size, size) - mask = ( - bool_masked_pos.repeat_interleave(self.config.patch_size, 1) - .repeat_interleave(self.config.patch_size, 2) - .unsqueeze(1) - .contiguous() - ) - reconstruction_loss = nn.functional.l1_loss(pixel_values, reconstructed_pixel_values, reduction="none") - masked_im_loss = (reconstruction_loss * mask).sum() / (mask.sum() + 1e-5) / self.config.num_channels - - if not return_dict: - output = (reconstructed_pixel_values,) + outputs[1:] - return ((masked_im_loss,) + output) if masked_im_loss is not None else output - - return MaskedLMOutput( - loss=masked_im_loss, - logits=reconstructed_pixel_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - + @add_start_docstrings( """ - AudioSpectogramTransformer Model transformer with an image classification head on top (a linear layer on top of the final hidden state of - the [CLS] token) e.g. for ImageNet. - - - - Note that it's possible to fine-tune AudioSpectogramTransformer on higher resolution images than the ones it has been trained on, by - setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained - position embeddings to the higher resolution. - - + Audio Spectogram Transformer model with an audio classification head on top (a linear layer on top of the final hidden state of + the [CLS] token) e.g. for AudioSet. 
""", AUDIO_SPECTOGRAM_TRANSFORMER_START_DOCSTRING, ) -# Copied from transformers.models.vit.modeling_vit.ViTForImageClassification with VIT->AUDIO_SPECTOGRAM_TRANSFORMER,ViT->AudioSpectogramTransformer,vit->audio_spectogram_transformer -class AudioSpectogramTransformerForImageClassification(AudioSpectogramTransformerPreTrainedModel): +class AudioSpectogramTransformerForSequenceClassification(AudioSpectogramTransformerPreTrainedModel): def __init__(self, config: AudioSpectogramTransformerConfig) -> None: super().__init__(config) self.num_labels = config.num_labels - self.audio_spectogram_transformer = AudioSpectogramTransformerModel(config, add_pooling_layer=False) + self.audio_spectogram_transformer = AudioSpectogramTransformerModel(config) # Classifier head + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() # Initialize weights and apply final processing @@ -775,7 +564,7 @@ def __init__(self, config: AudioSpectogramTransformerConfig) -> None: @add_code_sample_docstrings( processor_class=_FEAT_EXTRACTOR_FOR_DOC, checkpoint=_IMAGE_CLASS_CHECKPOINT, - output_type=ImageClassifierOutput, + output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, ) @@ -786,12 +575,11 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - interpolate_pos_encoding: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[tuple, ImageClassifierOutput]: + ) -> Union[tuple, SequenceClassifierOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + Labels for computing the audio classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" @@ -802,13 +590,13 @@ def forward( head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - interpolate_pos_encoding=interpolate_pos_encoding, return_dict=return_dict, ) - sequence_output = outputs[0] + pooled_output = outputs[1] - logits = self.classifier(sequence_output[:, 0, :]) + pooled_output = self.layernorm(pooled_output) + logits = self.classifier(pooled_output) loss = None if labels is not None: @@ -837,7 +625,7 @@ def forward( output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output - return ImageClassifierOutput( + return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, diff --git a/src/transformers/models/audio_spectogram_transformer/test.py b/src/transformers/models/audio_spectogram_transformer/test.py new file mode 100644 index 00000000000000..5093380c742f86 --- /dev/null +++ b/src/transformers/models/audio_spectogram_transformer/test.py @@ -0,0 +1,14 @@ +from transformers import AudioSpectogramTransformerConfig, AudioSpectogramTransformerForSequenceClassification +import torch + +config = AudioSpectogramTransformerConfig(num_labels=527) +model = AudioSpectogramTransformerForSequenceClassification(config) + +dummy_inputs = torch.randn(1, 1024, 128) + +outputs = model(dummy_inputs) + +print("Shape of logits:", outputs.logits.shape) + +for name, param in model.named_parameters(): + print(name, param.shape) \ No newline at end of file diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index f9964cb510205e..11bb3ba89d0860 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -787,6 +787,7 @@ MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Audio Classification mapping + ("audio-spectogram-transformer", "AudioSpectogramTransformerForSequenceClassification"), ("data2vec-audio", "Data2VecAudioForSequenceClassification"), ("hubert", "HubertForSequenceClassification"), ("sew", "SEWForSequenceClassification"), From f996abe0e26825c84ecaebfccc7d211411b1a783 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 3 Oct 2022 11:55:33 +0200 Subject: [PATCH 03/37] Add id2label mapping, run code quality --- src/transformers/__init__.py | 32 ++- src/transformers/models/__init__.py | 2 +- .../audio_spectogram_transformer/__init__.py | 27 +- ...figuration_audio_spectogram_transformer.py | 15 +- ...ctogram_transformer_original_to_pytorch.py | 104 ++++---- ..._spectogram_transformer_timm_to_pytorch.py | 250 ------------------ .../modeling_audio_spectogram_transformer.py | 49 ++-- .../audio_spectogram_transformer/test.py | 6 +- .../models/auto/configuration_auto.py | 6 +- .../models/auto/feature_extraction_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 6 +- ...t_modeling_audio_spectogram_transformer.py | 34 ++- 12 files changed, 160 insertions(+), 373 deletions(-) delete mode 100644 src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_timm_to_pytorch.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 62911f60f2faf6..d49f8ea5fa6fb3 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -122,6 +122,10 @@ "models": [], # Models "models.albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig"], + "models.audio_spectogram_transformer": [ + "AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", + 
"AudioSpectogramTransformerConfig", + ], "models.auto": [ "ALL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CONFIG_MAPPING", @@ -403,7 +407,6 @@ "models.vision_text_dual_encoder": ["VisionTextDualEncoderConfig", "VisionTextDualEncoderProcessor"], "models.visual_bert": ["VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "VisualBertConfig"], "models.vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig"], - "models.audio_spectogram_transformer": ["AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "AudioSpectogramTransformerConfig"], "models.vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"], "models.vit_msn": ["VIT_MSN_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMSNConfig"], "models.wav2vec2": [ @@ -3314,6 +3317,10 @@ load_tf2_weights_in_pytorch_model, ) from .models.albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig + from .models.audio_spectogram_transformer import ( + AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, + AudioSpectogramTransformerConfig, + ) from .models.auto import ( ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, @@ -3566,7 +3573,6 @@ from .models.vision_text_dual_encoder import VisionTextDualEncoderConfig, VisionTextDualEncoderProcessor from .models.visual_bert import VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, VisualBertConfig from .models.vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig - from .models.audio_spectogram_transformer import AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, AudioSpectogramTransformerConfig from .models.vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig from .models.vit_msn import VIT_MSN_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMSNConfig from .models.wav2vec2 import ( @@ -3840,8 +3846,9 @@ from .image_processing_utils import ImageProcessingMixin from .image_transforms import rescale, resize, to_pil_image from .image_utils import ImageFeatureExtractionMixin - from .models.beit import BeitFeatureExtractor, BeitImageProcessor - from .models.clip import CLIPFeatureExtractor, CLIPImageProcessor + from .models.audio_spectogram_transformer import AudioSpectogramTransformerFeatureExtractor + from .models.beit import BeitFeatureExtractor + from .models.clip import CLIPFeatureExtractor from .models.conditional_detr import ConditionalDetrFeatureExtractor from .models.convnext import ConvNextFeatureExtractor, ConvNextImageProcessor from .models.deformable_detr import DeformableDetrFeatureExtractor @@ -3866,7 +3873,6 @@ from .models.videomae import VideoMAEFeatureExtractor from .models.vilt import ViltFeatureExtractor, ViltProcessor from .models.vit import ViTFeatureExtractor - from .models.audio_spectogram_transformer import AudioSpectogramTransformerFeatureExtractor from .models.yolos import YolosFeatureExtractor # Modeling @@ -3969,6 +3975,14 @@ AlbertPreTrainedModel, load_tf_weights_in_albert, ) + from .models.audio_spectogram_transformer import ( + AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + AudioSpectogramTransformerForImageClassification, + AudioSpectogramTransformerForMaskedImageModeling, + AudioSpectogramTransformerForSequenceClassification, + AudioSpectogramTransformerModel, + AudioSpectogramTransformerPreTrainedModel, + ) from .models.auto import ( MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING, MODEL_FOR_AUDIO_XVECTOR_MAPPING, @@ -5020,14 +5034,6 @@ ViTModel, ViTPreTrainedModel, ) - from .models.audio_spectogram_transformer import ( - AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, - AudioSpectogramTransformerForImageClassification, - 
AudioSpectogramTransformerForMaskedImageModeling, - AudioSpectogramTransformerModel, - AudioSpectogramTransformerPreTrainedModel, - AudioSpectogramTransformerForSequenceClassification, - ) from .models.vit_mae import ( VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST, ViTMAEForPreTraining, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 73658cf92de810..0035b8d6eb0862 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -18,6 +18,7 @@ from . import ( albert, + audio_spectogram_transformer, auto, bart, barthez, @@ -162,7 +163,6 @@ vision_text_dual_encoder, visual_bert, vit, - audio_spectogram_transformer, vit_mae, vit_msn, wav2vec2, diff --git a/src/transformers/models/audio_spectogram_transformer/__init__.py b/src/transformers/models/audio_spectogram_transformer/__init__.py index 93762ed569cc71..7d4b223390b0d9 100644 --- a/src/transformers/models/audio_spectogram_transformer/__init__.py +++ b/src/transformers/models/audio_spectogram_transformer/__init__.py @@ -17,15 +17,16 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_torch_available, - is_vision_available, -) +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available -_import_structure = {"configuration_audio_spectogram_transformer": ["AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "AudioSpectogramTransformerConfig", "AudioSpectogramTransformerOnnxConfig"]} +_import_structure = { + "configuration_audio_spectogram_transformer": [ + "AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", + "AudioSpectogramTransformerConfig", + "AudioSpectogramTransformerOnnxConfig", + ] +} try: if not is_vision_available(): @@ -33,7 +34,9 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["feature_extraction_audio_spectogram_transformer"] = ["AudioSpectogramTransformerFeatureExtractor"] + _import_structure["feature_extraction_audio_spectogram_transformer"] = [ + "AudioSpectogramTransformerFeatureExtractor" + ] try: if not is_torch_available(): @@ -51,7 +54,11 @@ ] if TYPE_CHECKING: - from .configuration_audio_spectogram_transformer import AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, AudioSpectogramTransformerConfig, AudioSpectogramTransformerOnnxConfig + from .configuration_audio_spectogram_transformer import ( + AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, + AudioSpectogramTransformerConfig, + AudioSpectogramTransformerOnnxConfig, + ) try: if not is_vision_available(): @@ -71,9 +78,9 @@ AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, AudioSpectogramTransformerForImageClassification, AudioSpectogramTransformerForMaskedImageModeling, + AudioSpectogramTransformerForSequenceClassification, AudioSpectogramTransformerModel, AudioSpectogramTransformerPreTrainedModel, - AudioSpectogramTransformerForSequenceClassification, ) else: diff --git a/src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py index 3e09881c3d393f..68023f42ddba2b 100644 --- a/src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py +++ b/src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py @@ -14,10 +14,6 @@ # limitations under the License. 
""" AudioSpectogramTransformer model configuration""" -from collections import OrderedDict -from typing import Mapping - -from packaging import version from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -30,13 +26,14 @@ } - class AudioSpectogramTransformerConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`AudioSpectogramTransformerModel`]. It is used to instantiate an AudioSpectogramTransformer - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the AudioSpectogramTransformer - [google/audio_spectogram_transformer-base-patch16-224](https://huggingface.co/google/audio_spectogram_transformer-base-patch16-224) architecture. + This is the configuration class to store the configuration of a [`AudioSpectogramTransformerModel`]. It is used to + instantiate an AudioSpectogramTransformer model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the + AudioSpectogramTransformer + [google/audio_spectogram_transformer-base-patch16-224](https://huggingface.co/google/audio_spectogram_transformer-base-patch16-224) + architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. diff --git a/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py index e5817a768b4e93..e3d10dc83a57be 100644 --- a/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py +++ b/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py @@ -20,15 +20,12 @@ from pathlib import Path import torch -from PIL import Image +import torchaudio -import requests from huggingface_hub import hf_hub_download from transformers import AudioSpectogramTransformerConfig, AudioSpectogramTransformerForSequenceClassification from transformers.utils import logging -import torchaudio - logging.set_verbosity_info() logger = logging.get_logger(__name__) @@ -38,14 +35,12 @@ def get_audio_spectogram_transformer_config(model_name): config = AudioSpectogramTransformerConfig() config.num_labels = 527 - - #TODO add id2label mappings - # repo_id = "huggingface/label-files" - # filename = "coco-detection-id2label.json" - # id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - # id2label = {int(k): v for k, v in id2label.items()} - # config.id2label = id2label - # config.label2id = {v: k for k, v in id2label.items()} + repo_id = "huggingface/label-files" + filename = "audioset-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} return config @@ -97,15 +92,25 @@ def convert_state_dict(orig_state_dict, config): layer_num = int(key_split[3]) dim = config.hidden_size if "weight" in key: - orig_state_dict[f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.query.weight"] = val[:dim, :] - 
orig_state_dict[f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.key.weight"] = val[ - dim : dim * 2, : - ] - orig_state_dict[f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.value.weight"] = val[-dim:, :] + orig_state_dict[ + f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.query.weight" + ] = val[:dim, :] + orig_state_dict[ + f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.key.weight" + ] = val[dim : dim * 2, :] + orig_state_dict[ + f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.value.weight" + ] = val[-dim:, :] else: - orig_state_dict[f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.query.bias"] = val[:dim] - orig_state_dict[f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.key.bias"] = val[dim : dim * 2] - orig_state_dict[f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.value.bias"] = val[-dim:] + orig_state_dict[ + f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.query.bias" + ] = val[:dim] + orig_state_dict[ + f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.key.bias" + ] = val[dim : dim * 2] + orig_state_dict[ + f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.value.bias" + ] = val[-dim:] else: orig_state_dict[rename_key(key)] = val @@ -127,9 +132,15 @@ def make_features(wav_name, mel_bins, target_length=1024): waveform, sr = torchaudio.load(wav_name) fbank = torchaudio.compliance.kaldi.fbank( - waveform, htk_compat=True, sample_frequency=sr, use_energy=False, - window_type='hanning', num_mel_bins=mel_bins, dither=0.0, - frame_shift=10) + waveform, + htk_compat=True, + sample_frequency=sr, + use_energy=False, + window_type="hanning", + num_mel_bins=mel_bins, + dither=0.0, + frame_shift=10, + ) n_frames = fbank.shape[0] @@ -145,7 +156,9 @@ def make_features(wav_name, mel_bins, target_length=1024): @torch.no_grad() -def convert_audio_spectogram_transformer_checkpoint(model_name, checkpoint_url, pytorch_dump_folder_path, push_to_hub=False): +def convert_audio_spectogram_transformer_checkpoint( + model_name, checkpoint_url, pytorch_dump_folder_path, push_to_hub=False +): """ Copy/paste/tweak model's weights to our YOLOS structure. 
""" @@ -161,22 +174,22 @@ def convert_audio_spectogram_transformer_checkpoint(model_name, checkpoint_url, # load ๐Ÿค— model model = AudioSpectogramTransformerForSequenceClassification(config) model.eval() - + model.load_state_dict(new_state_dict) # verify outputs on dummy input - filepath = hf_hub_download(repo_id="nielsr/audio-spectogram-transformer-checkpoint", - filename="sample_audio.flac", - repo_type="dataset") - features = make_features(filepath, mel_bins=128) # shape(1024, 128) - input_values = features.expand(1, 1024, 128) # (batch_size, time, freq) - + filepath = hf_hub_download( + repo_id="nielsr/audio-spectogram-transformer-checkpoint", filename="sample_audio.flac", repo_type="dataset" + ) + features = make_features(filepath, mel_bins=128) # shape(1024, 128) + input_values = features.expand(1, 1024, 128) # (batch_size, time, freq) + # forward pass outputs = model(input_values) logits = outputs.logits print("Shape of logits:", logits.shape) - print("Predicted class:", logits.argmax(-1)) + print("Predicted class:", model.config.id2label[logits.argmax(-1).item()]) expected_slice = torch.tensor([-0.8760, -7.0042, -8.6602]) if not torch.allclose(logits[0, :3], expected_slice, atol=1e-4): @@ -188,17 +201,8 @@ def convert_audio_spectogram_transformer_checkpoint(model_name, checkpoint_url, model.save_pretrained(pytorch_dump_folder_path) if push_to_hub: - model_mapping = { - "audio_spectogram_transformer_ti": "audio_spectogram_transformer-tiny", - "audio_spectogram_transformer_s_200_pre": "audio_spectogram_transformer-small", - "audio_spectogram_transformer_s_300_pre": "audio_spectogram_transformer-small-300", - "audio_spectogram_transformer_s_dWr": "audio_spectogram_transformer-small-dwr", - "audio_spectogram_transformer_base": "audio_spectogram_transformer-base", - } - print("Pushing to the hub...") - model_name = model_mapping[model_name] - model.push_to_hub(model_name, organization="hustvl") + model.push_to_hub(model_name, organization="nielsr") if __name__ == "__main__": @@ -206,15 +210,15 @@ def convert_audio_spectogram_transformer_checkpoint(model_name, checkpoint_url, # Required parameters parser.add_argument( "--model_name", - default="audio_spectogram_transformer_s_200_pre", + default="audio-spectogram-transformer-finetuned-audioset-10-10-0.4593", type=str, - help=( - "Name of the YOLOS model you'd like to convert. Should be one of 'audio_spectogram_transformer_ti', 'audio_spectogram_transformer_s_200_pre'," - " 'audio_spectogram_transformer_s_300_pre', 'audio_spectogram_transformer_s_dWr', 'audio_spectogram_transformer_base'." - ), + help="Name of the Audio Spectogram Transformer model you'd like to convert.", ) parser.add_argument( - "--checkpoint_url", default="https://www.dropbox.com/s/ca0b1v2nlxzyeb4/audioset_10_10_0.4593.pth?dl=1", type=str, help="URL of the original state dict (.pth file)." + "--checkpoint_url", + default="https://www.dropbox.com/s/ca0b1v2nlxzyeb4/audioset_10_10_0.4593.pth?dl=1", + type=str, + help="URL of the original state dict (.pth file).", ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." 
@@ -224,4 +228,6 @@ def convert_audio_spectogram_transformer_checkpoint(model_name, checkpoint_url, ) args = parser.parse_args() - convert_audio_spectogram_transformer_checkpoint(args.model_name, args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub) \ No newline at end of file + convert_audio_spectogram_transformer_checkpoint( + args.model_name, args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub + ) diff --git a/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_timm_to_pytorch.py b/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_timm_to_pytorch.py deleted file mode 100644 index 68612060ba9603..00000000000000 --- a/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_timm_to_pytorch.py +++ /dev/null @@ -1,250 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert AudioSpectogramTransformer and non-distilled DeiT checkpoints from the timm library.""" - - -import argparse -import json -from pathlib import Path - -import torch -from PIL import Image - -import requests -import timm -from huggingface_hub import hf_hub_download -from transformers import DeiTFeatureExtractor, AudioSpectogramTransformerConfig, AudioSpectogramTransformerFeatureExtractor, AudioSpectogramTransformerForImageClassification, AudioSpectogramTransformerModel -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, base_model=False): - rename_keys = [] - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"audio_spectogram_transformer.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"audio_spectogram_transformer.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"audio_spectogram_transformer.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", f"audio_spectogram_transformer.encoder.layer.{i}.attention.output.dense.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"audio_spectogram_transformer.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"audio_spectogram_transformer.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"audio_spectogram_transformer.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"audio_spectogram_transformer.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"audio_spectogram_transformer.encoder.layer.{i}.output.dense.weight")) - 
rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"audio_spectogram_transformer.encoder.layer.{i}.output.dense.bias")) - - # projection layer + position embeddings - rename_keys.extend( - [ - ("cls_token", "audio_spectogram_transformer.embeddings.cls_token"), - ("patch_embed.proj.weight", "audio_spectogram_transformer.embeddings.patch_embeddings.projection.weight"), - ("patch_embed.proj.bias", "audio_spectogram_transformer.embeddings.patch_embeddings.projection.bias"), - ("pos_embed", "audio_spectogram_transformer.embeddings.position_embeddings"), - ] - ) - - if base_model: - # layernorm + pooler - rename_keys.extend( - [ - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ("pre_logits.fc.weight", "pooler.dense.weight"), - ("pre_logits.fc.bias", "pooler.dense.bias"), - ] - ) - - # if just the base model, we should remove "audio_spectogram_transformer" from all keys that start with "audio_spectogram_transformer" - rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("audio_spectogram_transformer") else pair for pair in rename_keys] - else: - # layernorm + classification head - rename_keys.extend( - [ - ("norm.weight", "audio_spectogram_transformer.layernorm.weight"), - ("norm.bias", "audio_spectogram_transformer.layernorm.bias"), - ("head.weight", "classifier.weight"), - ("head.bias", "classifier.bias"), - ] - ) - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, base_model=False): - for i in range(config.num_hidden_layers): - if base_model: - prefix = "" - else: - prefix = "audio_spectogram_transformer." - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -def remove_classification_head_(state_dict): - ignore_keys = ["head.weight", "head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_audio_spectogram_transformer_checkpoint(audio_spectogram_transformer_name, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our AudioSpectogramTransformer structure. 
- """ - - # define default AudioSpectogramTransformer configuration - config = AudioSpectogramTransformerConfig() - base_model = False - # dataset (ImageNet-21k only or also fine-tuned on ImageNet 2012), patch_size and image_size - if audio_spectogram_transformer_name[-5:] == "in21k": - base_model = True - config.patch_size = int(audio_spectogram_transformer_name[-12:-10]) - config.image_size = int(audio_spectogram_transformer_name[-9:-6]) - else: - config.num_labels = 1000 - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - config.patch_size = int(audio_spectogram_transformer_name[-6:-4]) - config.image_size = int(audio_spectogram_transformer_name[-3:]) - # size of the architecture - if "deit" in audio_spectogram_transformer_name: - if audio_spectogram_transformer_name[9:].startswith("tiny"): - config.hidden_size = 192 - config.intermediate_size = 768 - config.num_hidden_layers = 12 - config.num_attention_heads = 3 - elif audio_spectogram_transformer_name[9:].startswith("small"): - config.hidden_size = 384 - config.intermediate_size = 1536 - config.num_hidden_layers = 12 - config.num_attention_heads = 6 - else: - pass - else: - if audio_spectogram_transformer_name[4:].startswith("small"): - config.hidden_size = 768 - config.intermediate_size = 2304 - config.num_hidden_layers = 8 - config.num_attention_heads = 8 - elif audio_spectogram_transformer_name[4:].startswith("base"): - pass - elif audio_spectogram_transformer_name[4:].startswith("large"): - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - elif audio_spectogram_transformer_name[4:].startswith("huge"): - config.hidden_size = 1280 - config.intermediate_size = 5120 - config.num_hidden_layers = 32 - config.num_attention_heads = 16 - - # load original model from timm - timm_model = timm.create_model(audio_spectogram_transformer_name, pretrained=True) - timm_model.eval() - - # load state_dict of original model, remove and rename some keys - state_dict = timm_model.state_dict() - if base_model: - remove_classification_head_(state_dict) - rename_keys = create_rename_keys(config, base_model) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, base_model) - - # load HuggingFace model - if audio_spectogram_transformer_name[-5:] == "in21k": - model = AudioSpectogramTransformerModel(config).eval() - else: - model = AudioSpectogramTransformerForImageClassification(config).eval() - model.load_state_dict(state_dict) - - # Check outputs on an image, prepared by AudioSpectogramTransformerFeatureExtractor/DeiTFeatureExtractor - if "deit" in audio_spectogram_transformer_name: - feature_extractor = DeiTFeatureExtractor(size=config.image_size) - else: - feature_extractor = AudioSpectogramTransformerFeatureExtractor(size=config.image_size) - encoding = feature_extractor(images=prepare_img(), return_tensors="pt") - pixel_values = encoding["pixel_values"] - outputs = model(pixel_values) - - if base_model: - timm_pooled_output = timm_model.forward_features(pixel_values) - assert timm_pooled_output.shape == outputs.pooler_output.shape - assert torch.allclose(timm_pooled_output, outputs.pooler_output, atol=1e-3) - else: - timm_logits = timm_model(pixel_values) - assert 
timm_logits.shape == outputs.logits.shape - assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {audio_spectogram_transformer_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving feature extractor to {pytorch_dump_folder_path}") - feature_extractor.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--audio_spectogram_transformer_name", - default="audio_spectogram_transformer_base_patch16_224", - type=str, - help="Name of the AudioSpectogramTransformer timm model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - - args = parser.parse_args() - convert_audio_spectogram_transformer_checkpoint(args.audio_spectogram_transformer_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py index 2c6f06ca066091..3f3574c21e8d0f 100644 --- a/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py +++ b/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 Google AI, Ross Wightman, The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 MIT and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,10 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch AudioSpectogramTransformer model.""" +""" PyTorch Audio Spectogram Transformer model.""" - -import collections.abc import math from typing import Dict, List, Optional, Set, Tuple, Union @@ -25,16 +23,10 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN -from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, SequenceClassifierOutput, MaskedLMOutput +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, SequenceClassifierOutput from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import ( - add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - logging, - replace_return_docstrings, -) +from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_audio_spectogram_transformer import AudioSpectogramTransformerConfig @@ -48,18 +40,18 @@ _CHECKPOINT_FOR_DOC = "MIT/ast-10-10" _EXPECTED_OUTPUT_SHAPE = [1, 197, 768] -# Image classification docstring -_IMAGE_CLASS_CHECKPOINT = "google/audio_spectogram_transformer-base-patch16-224" -_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat" +# Audio classification docstring +_SEQ_CLASS_CHECKPOINT = "MIT/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593" +_SEQ_CLASS_EXPECTED_OUTPUT = "" +_SEQ_CLASS_EXPECTED_LOSS = 0.0 AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "MIT/ast-10-10", + "MIT/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593", # See all Audio Spectogram Transformer models at https://huggingface.co/models?filter=audio-spectogram-transformer ] - class AudioSpectogramTransformerEmbeddings(nn.Module): """ Construct the CLS token, position and patch embeddings. @@ -89,7 +81,7 @@ def get_shape(self, config): f_dim = test_out.shape[2] t_dim = test_out.shape[3] return f_dim, t_dim - + def forward(self, input_values: torch.Tensor) -> torch.Tensor: batch_size = input_values.shape[0] embeddings = self.patch_embeddings(input_values) @@ -123,7 +115,11 @@ def __init__(self, config): # self.num_channels = num_channels # self.num_patches = num_patches - self.projection = nn.Conv2d(1, config.hidden_size, kernel_size=(16, 16), stride=(10, 10)) + patch_size = config.patch_size + fstride = config.fstride + tstride = config.tstride + + self.projection = nn.Conv2d(1, config.hidden_size, kernel_size=(patch_size, patch_size), stride=(fstride, tstride)) def forward(self, input_values: torch.Tensor) -> torch.Tensor: input_values = input_values.unsqueeze(1) @@ -454,7 +450,8 @@ def _set_gradient_checkpointing(self, module: AudioSpectogramTransformerEncoder, @add_start_docstrings( - "The bare AudioSpectogramTransformer Model transformer outputting raw hidden-states without any specific head on top.", + "The bare AudioSpectogramTransformer Model transformer outputting raw hidden-states without any specific head on" + " top.", AUDIO_SPECTOGRAM_TRANSFORMER_START_DOCSTRING, ) class AudioSpectogramTransformerModel(AudioSpectogramTransformerPreTrainedModel): @@ -538,11 +535,11 @@ def forward( attentions=encoder_outputs.attentions, ) - + @add_start_docstrings( """ - Audio Spectogram Transformer model with an audio classification head on top (a linear layer on top of the final hidden state of - the [CLS] token) e.g. for AudioSet. 
+ Audio Spectogram Transformer model with an audio classification head on top (a linear layer on top of the final + hidden state of the [CLS] token) e.g. for AudioSet. """, AUDIO_SPECTOGRAM_TRANSFORMER_START_DOCSTRING, ) @@ -563,10 +560,12 @@ def __init__(self, config: AudioSpectogramTransformerConfig) -> None: @add_start_docstrings_to_model_forward(AUDIO_SPECTOGRAM_TRANSFORMER_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_FEAT_EXTRACTOR_FOR_DOC, - checkpoint=_IMAGE_CLASS_CHECKPOINT, + checkpoint=_SEQ_CLASS_CHECKPOINT, output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, - expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + modality="audio", + expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, + expected_loss=_SEQ_CLASS_EXPECTED_LOSS, ) def forward( self, diff --git a/src/transformers/models/audio_spectogram_transformer/test.py b/src/transformers/models/audio_spectogram_transformer/test.py index 5093380c742f86..0880f06ee135ea 100644 --- a/src/transformers/models/audio_spectogram_transformer/test.py +++ b/src/transformers/models/audio_spectogram_transformer/test.py @@ -1,6 +1,8 @@ -from transformers import AudioSpectogramTransformerConfig, AudioSpectogramTransformerForSequenceClassification import torch +from transformers import AudioSpectogramTransformerConfig, AudioSpectogramTransformerForSequenceClassification + + config = AudioSpectogramTransformerConfig(num_labels=527) model = AudioSpectogramTransformerForSequenceClassification(config) @@ -11,4 +13,4 @@ print("Shape of logits:", outputs.logits.shape) for name, param in model.named_parameters(): - print(name, param.shape) \ No newline at end of file + print(name, param.shape) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index f1f6f8f0e243a4..cb0c588b673275 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -30,6 +30,7 @@ [ # Add configs here ("albert", "AlbertConfig"), + ("audio-spectogram-transformer", "AudioSpectogramTransformerConfig"), ("bart", "BartConfig"), ("beit", "BeitConfig"), ("bert", "BertConfig"), @@ -157,7 +158,6 @@ ("vision-text-dual-encoder", "VisionTextDualEncoderConfig"), ("visual_bert", "VisualBertConfig"), ("vit", "ViTConfig"), - ("audio-spectogram-transformer", "AudioSpectogramTransformerConfig"), ("vit_mae", "ViTMAEConfig"), ("vit_msn", "ViTMSNConfig"), ("wav2vec2", "Wav2Vec2Config"), @@ -180,6 +180,7 @@ [ # Add archive maps here) ("albert", "ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("audio-spectogram-transformer", "AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("bart", "BART_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("beit", "BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("bert", "BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -294,7 +295,6 @@ ("vilt", "VILT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("visual_bert", "VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("vit", "VIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("audio-spectogram-transformer", "AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("vit_mae", "VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("vit_msn", "VIT_MSN_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("wav2vec2", "WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -315,6 +315,7 @@ [ # Add full (and cased) model names here ("albert", "ALBERT"), + ("audio-spectogram-transformer", "Audio Spectogram Transformer"), ("bart", "BART"), ("barthez", "BARThez"), ("bartpho", "BARTpho"), @@ -462,7 +463,6 @@ ("vision-text-dual-encoder", "VisionTextDualEncoder"), 
("visual_bert", "VisualBERT"), ("vit", "ViT"), - ("audio-spectogram-transformer", "Audio Spectogram Transformer"), ("vit_mae", "ViTMAE"), ("vit_msn", "ViTMSN"), ("wav2vec2", "Wav2Vec2"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index cbe23d3830a645..7baed7e625630d 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -37,6 +37,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict( [ + ("audio-spectogram-transformer", "AudioSpectogramTransformerFeatureExtractor"), ("beit", "BeitFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), ("clipseg", "ViTFeatureExtractor"), @@ -79,7 +80,6 @@ ("videomae", "VideoMAEFeatureExtractor"), ("vilt", "ViltFeatureExtractor"), ("vit", "ViTFeatureExtractor"), - ("audio-spectogram-transformer", "AudioSpectogramTransformerFeatureExtractor"), ("vit_mae", "ViTFeatureExtractor"), ("vit_msn", "ViTFeatureExtractor"), ("wav2vec2", "Wav2Vec2FeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 11bb3ba89d0860..96e7f37715008c 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -29,6 +29,7 @@ [ # Base model mapping ("albert", "AlbertModel"), + ("audio-spectogram-transformer", "AudioSpectogramTransformerModel"), ("bart", "BartModel"), ("beit", "BeitModel"), ("bert", "BertModel"), @@ -151,7 +152,6 @@ ("vision-text-dual-encoder", "VisionTextDualEncoderModel"), ("visual_bert", "VisualBertModel"), ("vit", "ViTModel"), - ("audio-spectogram-transformer", "AudioSpectogramTransformerModel"), ("vit_mae", "ViTMAEModel"), ("vit_msn", "ViTMSNModel"), ("wav2vec2", "Wav2Vec2Model"), @@ -348,11 +348,11 @@ MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES = OrderedDict( [ + ("audio-spectogram-transformer", "AudioSpectogramTransformerForMaskedImageModeling"), ("deit", "DeiTForMaskedImageModeling"), ("swin", "SwinForMaskedImageModeling"), ("swinv2", "Swinv2ForMaskedImageModeling"), ("vit", "ViTForMaskedImageModeling"), - ("audio-spectogram-transformer", "AudioSpectogramTransformerForMaskedImageModeling"), ] ) @@ -367,6 +367,7 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Image Classification mapping + ("audio-spectogram-transformer", "AudioSpectogramTransformerForImageClassification"), ("beit", "BeitForImageClassification"), ("convnext", "ConvNextForImageClassification"), ("cvt", "CvtForImageClassification"), @@ -395,7 +396,6 @@ ("swinv2", "Swinv2ForImageClassification"), ("van", "VanForImageClassification"), ("vit", "ViTForImageClassification"), - ("audio-spectogram-transformer", "AudioSpectogramTransformerForImageClassification"), ("vit_msn", "ViTMSNForImageClassification"), ] ) diff --git a/tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py b/tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py index 01cfcf26320b77..bb7c4248fb2e84 100644 --- a/tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py +++ b/tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py @@ -30,8 +30,14 @@ import torch from torch import nn - from transformers import AudioSpectogramTransformerForImageClassification, AudioSpectogramTransformerForMaskedImageModeling, AudioSpectogramTransformerModel - from 
transformers.models.audio_spectogram_transformer.modeling_audio_spectogram_transformer import AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers import ( + AudioSpectogramTransformerForImageClassification, + AudioSpectogramTransformerForMaskedImageModeling, + AudioSpectogramTransformerModel, + ) + from transformers.models.audio_spectogram_transformer.modeling_audio_spectogram_transformer import ( + AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + ) if is_vision_available(): @@ -192,7 +198,9 @@ class AudioSpectogramTransformerModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = AudioSpectogramTransformerModelTester(self) - self.config_tester = ConfigTester(self, config_class=AudioSpectogramTransformerConfig, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester( + self, config_class=AudioSpectogramTransformerConfig, has_text_modality=False, hidden_size=37 + ) def test_config(self): self.config_tester.run_common_tests() @@ -252,11 +260,19 @@ def prepare_img(): class AudioSpectogramTransformerModelIntegrationTest(unittest.TestCase): @cached_property def default_feature_extractor(self): - return AudioSpectogramTransformerFeatureExtractor.from_pretrained("google/audio_spectogram_transformer-base-patch16-224") if is_vision_available() else None + return ( + AudioSpectogramTransformerFeatureExtractor.from_pretrained( + "google/audio_spectogram_transformer-base-patch16-224" + ) + if is_vision_available() + else None + ) @slow def test_inference_image_classification_head(self): - model = AudioSpectogramTransformerForImageClassification.from_pretrained("google/audio_spectogram_transformer-base-patch16-224").to(torch_device) + model = AudioSpectogramTransformerForImageClassification.from_pretrained( + "google/audio_spectogram_transformer-base-patch16-224" + ).to(torch_device) feature_extractor = self.default_feature_extractor image = prepare_img() @@ -280,9 +296,13 @@ def test_inference_interpolate_pos_encoding(self): # allowing to interpolate the pre-trained position embeddings in order to use # the model on higher resolutions. The DINO model by Facebook AI leverages this # to visualize self-attention on higher resolution images. 
- model = AudioSpectogramTransformerModel.from_pretrained("facebook/dino-audio_spectogram_transformers8").to(torch_device) + model = AudioSpectogramTransformerModel.from_pretrained("facebook/dino-audio_spectogram_transformers8").to( + torch_device + ) - feature_extractor = AudioSpectogramTransformerFeatureExtractor.from_pretrained("facebook/dino-audio_spectogram_transformers8", size=480) + feature_extractor = AudioSpectogramTransformerFeatureExtractor.from_pretrained( + "facebook/dino-audio_spectogram_transformers8", size=480 + ) image = prepare_img() inputs = feature_extractor(images=image, return_tensors="pt") pixel_values = inputs.pixel_values.to(torch_device) From 7006ed886f8a31e27ebe6c672b4230d545ee45d5 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 30 Oct 2022 17:20:41 +0100 Subject: [PATCH 04/37] Fix copies --- README_es.md | 1 + docs/source/en/serialization.mdx | 1 - .../modeling_audio_spectogram_transformer.py | 9 ++- src/transformers/utils/dummy_pt_objects.py | 69 ++++++++++--------- .../utils/dummy_vision_objects.py | 14 ++-- 5 files changed, 50 insertions(+), 44 deletions(-) diff --git a/README_es.md b/README_es.md index af0f556d739caa..c0cdf2eeb26e90 100644 --- a/README_es.md +++ b/README_es.md @@ -262,6 +262,7 @@ Nรบmero actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt ๐Ÿค— Transformers actualmente proporciona las siguientes arquitecturas (ver [aquรญ](https://huggingface.co/docs/transformers/model_summary) para un resumen de alto nivel de cada uno de ellas.): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. +1. **[Audio Spectogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectogram-transformer)** (from ) released with the paper []() by . 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from ร‰cole polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. 
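The sequence-classification head added above follows the problem-type dispatch used throughout the library, which is consistent with the `BCEWithLogitsLoss`, `CrossEntropyLoss` and `MSELoss` imports kept in the modeling file: integer labels with `num_labels > 1` are treated as single-label classification, while float-valued multi-hot targets (the AudioSet case, 527 classes) are scored with `BCEWithLogitsLoss`. Below is a minimal sketch of that multi-label path against a randomly initialized model, using the class names as defined in this patch series; the explicit `problem_type` setting and the dummy shapes are illustrative assumptions, not part of the diff.

```python
import torch

from transformers import (
    AudioSpectogramTransformerConfig,
    AudioSpectogramTransformerForSequenceClassification,
)

# Randomly initialized model with an AudioSet-sized head (527 classes), mirroring test.py above.
config = AudioSpectogramTransformerConfig(num_labels=527, problem_type="multi_label_classification")
model = AudioSpectogramTransformerForSequenceClassification(config)
model.eval()

# Dummy log-mel spectrogram batch: (batch_size, time frames, mel bins) = (1, 1024, 128).
input_values = torch.randn(1, 1024, 128)

# Multi-hot float targets route the loss computation to BCEWithLogitsLoss.
labels = torch.zeros(1, 527)
labels[0, [3, 27]] = 1.0

with torch.no_grad():
    outputs = model(input_values, labels=labels)

print(outputs.logits.shape)  # torch.Size([1, 527])
print(outputs.loss)          # scalar BCE-with-logits loss
```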
diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index ca654bcb1c1aea..b03f2291da5f8c 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -46,7 +46,6 @@ Ready-made configurations include the following architectures: - ALBERT -- Audio Spectogram Transformer - BART - BEiT - BERT diff --git a/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py index 3f3574c21e8d0f..bedb62616e8f43 100644 --- a/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py +++ b/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py @@ -386,7 +386,6 @@ def custom_forward(*inputs): ) -# Copied from transformers.models.vit.modeling_vit.ViTPreTrainedModel with ViT->AudioSpectogramTransformer,vit->audio_spectogram_transformer class AudioSpectogramTransformerPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -398,20 +397,20 @@ class AudioSpectogramTransformerPreTrainedModel(PreTrainedModel): main_input_name = "input_values" supports_gradient_checkpointing = True + # Copied from transformers.models.vit.modeling_vit.ViTPreTrainedModel._init_weights def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: """Initialize the weights""" if isinstance(module, (nn.Linear, nn.Conv2d)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + module.weight.data = nn.init.trunc_normal_(module.weight.data, mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) + # Copied from transformers.models.vit.modeling_vit.ViTPreTrainedModel._set_gradient_checkpointing def _set_gradient_checkpointing(self, module: AudioSpectogramTransformerEncoder, value: bool = False) -> None: - if isinstance(module, AudioSpectogramTransformerEncoder): + if isinstance(module, ViTEncoder): module.gradient_checkpointing = value diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 7319358dc12bec..59ad51131e0ac2 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -350,6 +350,44 @@ def load_tf_weights_in_albert(*args, **kwargs): requires_backends(load_tf_weights_in_albert, ["torch"]) +AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class AudioSpectogramTransformerForImageClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class AudioSpectogramTransformerForMaskedImageModeling(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class AudioSpectogramTransformerForSequenceClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class AudioSpectogramTransformerModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, 
["torch"]) + + +class AudioSpectogramTransformerPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING = None @@ -5698,37 +5736,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None - - -class AudioSpectogramTransformerForImageClassification(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - -class AudioSpectogramTransformerForMaskedImageModeling(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - -class AudioSpectogramTransformerModel(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - -class AudioSpectogramTransformerPreTrainedModel(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index e5438b4c159d7f..e7ce624f0ee34b 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -29,6 +29,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class AudioSpectogramTransformerFeatureExtractor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class BeitFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] @@ -358,13 +365,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class AudioSpectogramTransformerFeatureExtractor(metaclass=DummyObject): - _backends = ["vision"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["vision"]) - - class YolosFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] From 488e7e5456acb83358657befc9bbc86c4f9e7e1f Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 30 Oct 2022 18:01:05 +0100 Subject: [PATCH 05/37] Add first draft of feature extractor --- .../audio-spectogram-transformer.mdx | 11 +- src/transformers/__init__.py | 8 +- .../audio_spectogram_transformer/__init__.py | 10 +- ...extraction_audio_spectogram_transformer.py | 245 +++++++++++------- .../modeling_audio_spectogram_transformer.py | 8 +- .../audio_spectogram_transformer/test.py | 38 ++- src/transformers/utils/dummy_pt_objects.py | 14 - .../utils/dummy_speech_objects.py | 7 + .../utils/dummy_vision_objects.py | 7 - ...t_modeling_audio_spectogram_transformer.py | 81 +----- 10 files changed, 207 insertions(+), 222 deletions(-) diff --git a/docs/source/en/model_doc/audio-spectogram-transformer.mdx b/docs/source/en/model_doc/audio-spectogram-transformer.mdx index 0556e4b0215154..ff555af61403fe 100644 --- a/docs/source/en/model_doc/audio-spectogram-transformer.mdx +++ b/docs/source/en/model_doc/audio-spectogram-transformer.mdx @@ -43,12 +43,7 @@ The original code can be found [here](). 
[[autodoc]] AudioSpectogramTransformerModel - forward -## AudioSpectogramTransformerForMaskedImageModeling +## AudioSpectogramTransformerForSequenceClassification -[[autodoc]] AudioSpectogramTransformerForMaskedImageModeling - - forward - -## AudioSpectogramTransformerForImageClassification - -[[autodoc]] AudioSpectogramTransformerForImageClassification - - forward +[[autodoc]] AudioSpectogramTransformerForSequenceClassification + - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index d49f8ea5fa6fb3..fdc7093ec3f937 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -677,6 +677,7 @@ name for name in dir(dummy_speech_objects) if not name.startswith("_") ] else: + _import_structure["models.audio_spectogram_transformer"].append("AudioSpectogramTransformerFeatureExtractor") _import_structure["models.mctct"].append("MCTCTFeatureExtractor") _import_structure["models.speech_to_text"].append("Speech2TextFeatureExtractor") @@ -746,7 +747,6 @@ _import_structure["models.vilt"].append("ViltFeatureExtractor") _import_structure["models.vilt"].append("ViltProcessor") _import_structure["models.vit"].append("ViTFeatureExtractor") - _import_structure["models.audio_spectogram_transformer"].append("AudioSpectogramTransformerFeatureExtractor") _import_structure["models.yolos"].append("YolosFeatureExtractor") # Timm-backed objects @@ -2168,8 +2168,6 @@ _import_structure["models.audio_spectogram_transformer"].extend( [ "AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", - "AudioSpectogramTransformerForImageClassification", - "AudioSpectogramTransformerForMaskedImageModeling", "AudioSpectogramTransformerModel", "AudioSpectogramTransformerPreTrainedModel", "AudioSpectogramTransformerForSequenceClassification", @@ -3818,6 +3816,7 @@ except OptionalDependencyNotAvailable: from .utils.dummy_speech_objects import * else: + from .models.audio_spectogram_transformer import AudioSpectogramTransformerFeatureExtractor from .models.mctct import MCTCTFeatureExtractor from .models.speech_to_text import Speech2TextFeatureExtractor @@ -3846,7 +3845,6 @@ from .image_processing_utils import ImageProcessingMixin from .image_transforms import rescale, resize, to_pil_image from .image_utils import ImageFeatureExtractionMixin - from .models.audio_spectogram_transformer import AudioSpectogramTransformerFeatureExtractor from .models.beit import BeitFeatureExtractor from .models.clip import CLIPFeatureExtractor from .models.conditional_detr import ConditionalDetrFeatureExtractor @@ -3977,8 +3975,6 @@ ) from .models.audio_spectogram_transformer import ( AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, - AudioSpectogramTransformerForImageClassification, - AudioSpectogramTransformerForMaskedImageModeling, AudioSpectogramTransformerForSequenceClassification, AudioSpectogramTransformerModel, AudioSpectogramTransformerPreTrainedModel, diff --git a/src/transformers/models/audio_spectogram_transformer/__init__.py b/src/transformers/models/audio_spectogram_transformer/__init__.py index 7d4b223390b0d9..8bae67516da2ab 100644 --- a/src/transformers/models/audio_spectogram_transformer/__init__.py +++ b/src/transformers/models/audio_spectogram_transformer/__init__.py @@ -17,7 +17,7 @@ # limitations under the License. 
from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_speech_available, is_torch_available _import_structure = { @@ -29,7 +29,7 @@ } try: - if not is_vision_available(): + if not is_speech_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: pass @@ -46,8 +46,6 @@ else: _import_structure["modeling_audio_spectogram_transformer"] = [ "AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", - "AudioSpectogramTransformerForImageClassification", - "AudioSpectogramTransformerForMaskedImageModeling", "AudioSpectogramTransformerModel", "AudioSpectogramTransformerPreTrainedModel", "AudioSpectogramTransformerForSequenceClassification", @@ -61,7 +59,7 @@ ) try: - if not is_vision_available(): + if not is_speech_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: pass @@ -76,8 +74,6 @@ else: from .modeling_audio_spectogram_transformer import ( AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, - AudioSpectogramTransformerForImageClassification, - AudioSpectogramTransformerForMaskedImageModeling, AudioSpectogramTransformerForSequenceClassification, AudioSpectogramTransformerModel, AudioSpectogramTransformerPreTrainedModel, diff --git a/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py index 47220288757adb..70957269499e0b 100644 --- a/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py +++ b/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# Copyright 2021 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,138 +12,199 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Feature extractor class for AudioSpectogramTransformer.""" +""" +Feature extractor class for Audio Spectogram Transformer. +""" -from typing import Optional, Union +from typing import List, Optional, Union import numpy as np -from PIL import Image +import torch +import torchaudio.compliance.kaldi as ta_kaldi -from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin -from ...image_utils import ( - IMAGENET_STANDARD_MEAN, - IMAGENET_STANDARD_STD, - ImageFeatureExtractionMixin, - ImageInput, - is_torch_tensor, -) -from ...utils import TensorType, logging +from ...feature_extraction_sequence_utils import SequenceFeatureExtractor +from ...feature_extraction_utils import BatchFeature +from ...utils import PaddingStrategy, TensorType, logging logger = logging.get_logger(__name__) -class AudioSpectogramTransformerFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): +class AudioSpectogramTransformerFeatureExtractor(SequenceFeatureExtractor): r""" - Constructs a AudioSpectogramTransformer feature extractor. + Constructs a Audio Spectogram Transformer feature extractor. 
- This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users - should refer to this superclass for more information regarding those methods. + This feature extractor inherits from [`AudioSpectogramTransformerFeatureExtractor`] which contains most of the main + methods. Users should refer to this superclass for more information regarding those methods. + + This class extracts mel-filter bank features from raw speech using TorchAudio and applies utterance-level cepstral + mean and variance normalization to the extracted features. Args: - do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the input to a certain `size`. - size (`int` or `Tuple(int)`, *optional*, defaults to 224): - Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an - integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize` is - set to `True`. - resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`): - An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`, - `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`. Only has an effect - if `do_resize` is set to `True`. + sampling_rate (`int`, defaults to 16000): + The sampling rate at which the audio files should be digitalized expressed in Hertz per second (Hz). + num_mel_bins (`int`, defaults to 128): + Number of Mel-frequency bins. + padding_value (`float`, defaults to 0.0): + The value that is used to fill the padding vectors. do_normalize (`bool`, *optional*, defaults to `True`): - Whether or not to normalize the input with mean and standard deviation. - image_mean (`List[int]`, defaults to `[0.5, 0.5, 0.5]`): - The sequence of means for each channel, to be used when normalizing images. - image_std (`List[int]`, defaults to `[0.5, 0.5, 0.5]`): - The sequence of standard deviations for each channel, to be used when normalizing images. + Whether or not to apply utterance-level cepstral mean and variance normalization to extracted features. + mean (`int`, *optional*, defaults to -4.2677393): + Whether or not to zero-mean normalize the extracted features. + std (`int`, *optional*, defaults to `True`): + Whether or not to unit-variance normalize the extracted features. """ - model_input_names = ["pixel_values"] + model_input_names = ["input_features", "attention_mask"] def __init__( self, - do_resize=True, - size=224, - resample=Image.BILINEAR, + feature_size=80, + sampling_rate=16000, + num_mel_bins=128, + padding_value=0.0, do_normalize=True, - image_mean=None, - image_std=None, + mean=-4.2677393, + std=4.5689974, **kwargs ): - super().__init__(**kwargs) - self.do_resize = do_resize - self.size = size - self.resample = resample + super().__init__(feature_size=80, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) + self.num_mel_bins = num_mel_bins self.do_normalize = do_normalize - self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN - self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + self.mean = mean + self.std = std - def __call__( - self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs - ) -> BatchFeature: + def _extract_fbank_features( + self, + waveform: np.ndarray, + ) -> np.ndarray: + """ + Get mel-filter bank features using TorchAudio. 
Note that TorchAudio requires 16-bit signed integers as inputs + and hence the waveform should not be normalized before feature extraction. """ - Main method to prepare for the model one or several image(s). + # waveform = waveform * (2**15) # Kaldi compliance: 16-bit signed integers + waveform = torch.from_numpy(waveform).unsqueeze(0) + features = ta_kaldi.fbank( + waveform, + htk_compat=True, + sample_frequency=self.sampling_rate, + use_energy=False, + window_type="hanning", + num_mel_bins=self.num_mel_bins, + dither=0.0, + frame_shift=10, + ) - + return features.numpy() - NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass - PIL images. + def normalize(self, input_features: List[np.ndarray]) -> List[np.ndarray]: + return (input_features - (self.mean)) / (self.std * 2) - + def __call__( + self, + raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]], + padding: Union[bool, str, PaddingStrategy] = False, + max_length: int = 1024, + truncation: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + sampling_rate: Optional[int] = None, + **kwargs + ) -> BatchFeature: + """ + Main method to featurize and prepare for the model one or several sequence(s). Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a - number of channels, H and W are image height and width. - - return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`): - If set, will return tensors of a particular framework. Acceptable values are: + raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`): + The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float + values, a list of numpy arrays or a list of list of float values. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + max_length (`int`, *optional*, defaults to 1024): + Maximum length of the returned list and optionally padding length (see above). + truncation (`bool`): + Activates truncation to cut input sequences longer than *max_length* to *max_length*. + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. + + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: - `'tf'`: Return TensorFlow `tf.constant` objects. 
- `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. - - Returns: - [`BatchFeature`]: A [`BatchFeature`] with the following fields: - - - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height, - width). + - `'np'`: Return Numpy `np.ndarray` objects. + sampling_rate (`int`, *optional*): + The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass + `sampling_rate` at the forward call to prevent silent errors. + padding_value (`float`, defaults to 0.0): + The value that is used to fill the padding values / vectors. """ - # Input type checking for clearer error - valid_images = False - - # Check that images has a valid type - if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images): - valid_images = True - elif isinstance(images, (list, tuple)): - if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]): - valid_images = True - - if not valid_images: - raise ValueError( - "Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), " - "`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)." + + if sampling_rate is not None: + if sampling_rate != self.sampling_rate: + raise ValueError( + f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of" + f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with" + f" {self.sampling_rate} and not {sampling_rate}." + ) + else: + logger.warning( + "It is strongly recommended to pass the `sampling_rate` argument to this function. " + "Failing to do so can result in silent errors that might be hard to debug." 
) is_batched = bool( - isinstance(images, (list, tuple)) - and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0])) + isinstance(raw_speech, (list, tuple)) + and (isinstance(raw_speech[0], np.ndarray) or isinstance(raw_speech[0], (tuple, list))) ) + if is_batched: + raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech] + elif not is_batched and not isinstance(raw_speech, np.ndarray): + raw_speech = np.asarray(raw_speech, dtype=np.float32) + elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64): + raw_speech = raw_speech.astype(np.float32) + + # always return batch if not is_batched: - images = [images] + raw_speech = [raw_speech] + + # extract fbank features + features = [self._extract_fbank_features(waveform) for waveform in raw_speech] + + # convert into correct format for padding + encoded_inputs = BatchFeature({"input_features": features}) + + padded_inputs = self.pad( + encoded_inputs, + padding=padding, + max_length=max_length, + truncation=truncation, + pad_to_multiple_of=pad_to_multiple_of, + **kwargs, + ) + + # make sure list is in array format + input_features = padded_inputs.get("input_features") + if isinstance(input_features[0], list): + padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features] - # transformations (resizing + normalization) - if self.do_resize and self.size is not None: - images = [self.resize(image=image, size=self.size, resample=self.resample) for image in images] + # normalization if self.do_normalize: - images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images] + padded_inputs["input_features"] = self.normalize(padded_inputs["input_features"]) - # return as BatchFeature - data = {"pixel_values": images} - encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + if return_tensors is not None: + padded_inputs = padded_inputs.convert_to_tensors(return_tensors) - return encoded_inputs + return padded_inputs diff --git a/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py index bedb62616e8f43..35e3f113e5f98d 100644 --- a/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py +++ b/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py @@ -119,7 +119,9 @@ def __init__(self, config): fstride = config.fstride tstride = config.tstride - self.projection = nn.Conv2d(1, config.hidden_size, kernel_size=(patch_size, patch_size), stride=(fstride, tstride)) + self.projection = nn.Conv2d( + 1, config.hidden_size, kernel_size=(patch_size, patch_size), stride=(fstride, tstride) + ) def forward(self, input_values: torch.Tensor) -> torch.Tensor: input_values = input_values.unsqueeze(1) @@ -408,9 +410,9 @@ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> No module.bias.data.zero_() module.weight.data.fill_(1.0) - # Copied from transformers.models.vit.modeling_vit.ViTPreTrainedModel._set_gradient_checkpointing + # Copied from transformers.models.vit.modeling_vit.ViTPreTrainedModel._set_gradient_checkpointing with ViT->AudioSpectogramTransformer def _set_gradient_checkpointing(self, module: AudioSpectogramTransformerEncoder, value: bool = False) -> None: - if isinstance(module, ViTEncoder): + if isinstance(module, AudioSpectogramTransformerEncoder): 
module.gradient_checkpointing = value diff --git a/src/transformers/models/audio_spectogram_transformer/test.py b/src/transformers/models/audio_spectogram_transformer/test.py index 0880f06ee135ea..afaef6b32a62cf 100644 --- a/src/transformers/models/audio_spectogram_transformer/test.py +++ b/src/transformers/models/audio_spectogram_transformer/test.py @@ -1,16 +1,38 @@ import torch +import torchaudio +from huggingface_hub import hf_hub_download -from transformers import AudioSpectogramTransformerConfig, AudioSpectogramTransformerForSequenceClassification +from transformers import ( + AudioSpectogramTransformerConfig, + AudioSpectogramTransformerFeatureExtractor, + AudioSpectogramTransformerForSequenceClassification, +) +# define feature extractor and model +feature_extractor = AudioSpectogramTransformerFeatureExtractor() +# config = AudioSpectogramTransformerConfig(num_labels=527) +# model = AudioSpectogramTransformerForSequenceClassification(config) -config = AudioSpectogramTransformerConfig(num_labels=527) -model = AudioSpectogramTransformerForSequenceClassification(config) +# read audio +filepath = hf_hub_download(repo_id="nielsr/audio-spectogram-transformer-checkpoint", + filename="sample_audio.flac", + repo_type="dataset") -dummy_inputs = torch.randn(1, 1024, 128) +raw_speech, _ = torchaudio.load(filepath) -outputs = model(dummy_inputs) +raw_speech = raw_speech.squeeze().numpy() -print("Shape of logits:", outputs.logits.shape) +# prepare audio for the model +inputs = feature_extractor(raw_speech, padding="max_length", return_tensors="pt") -for name, param in model.named_parameters(): - print(name, param.shape) +for k,v in inputs.items(): + print(k,v.shape) + +# dummy_inputs = torch.randn(1, 1024, 128) + +# outputs = model(dummy_inputs) + +# print("Shape of logits:", outputs.logits.shape) + +# for name, param in model.named_parameters(): +# print(name, param.shape) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 59ad51131e0ac2..edfcd3839c75b1 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -353,20 +353,6 @@ def load_tf_weights_in_albert(*args, **kwargs): AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None -class AudioSpectogramTransformerForImageClassification(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - -class AudioSpectogramTransformerForMaskedImageModeling(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class AudioSpectogramTransformerForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/transformers/utils/dummy_speech_objects.py b/src/transformers/utils/dummy_speech_objects.py index ae5589292a4cf9..66c2d85fdacdac 100644 --- a/src/transformers/utils/dummy_speech_objects.py +++ b/src/transformers/utils/dummy_speech_objects.py @@ -3,6 +3,13 @@ from ..utils import DummyObject, requires_backends +class AudioSpectogramTransformerFeatureExtractor(metaclass=DummyObject): + _backends = ["speech"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["speech"]) + + class MCTCTFeatureExtractor(metaclass=DummyObject): _backends = ["speech"] diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index e7ce624f0ee34b..4b57bfe2f1895b 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ 
b/src/transformers/utils/dummy_vision_objects.py @@ -29,13 +29,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class AudioSpectogramTransformerFeatureExtractor(metaclass=DummyObject): - _backends = ["vision"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["vision"]) - - class BeitFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py b/tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py index bb7c4248fb2e84..32bc547e164d3e 100644 --- a/tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py +++ b/tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py @@ -30,11 +30,7 @@ import torch from torch import nn - from transformers import ( - AudioSpectogramTransformerForImageClassification, - AudioSpectogramTransformerForMaskedImageModeling, - AudioSpectogramTransformerModel, - ) + from transformers import AudioSpectogramTransformerForSequenceClassification, AudioSpectogramTransformerModel from transformers.models.audio_spectogram_transformer.modeling_audio_spectogram_transformer import ( AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, ) @@ -126,43 +122,6 @@ def create_and_check_model(self, config, pixel_values, labels): result = model(pixel_values) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - def create_and_check_for_masked_image_modeling(self, config, pixel_values, labels): - model = AudioSpectogramTransformerForMaskedImageModeling(config=config) - model.to(torch_device) - model.eval() - result = model(pixel_values) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.num_channels, self.image_size, self.image_size) - ) - - # test greyscale images - config.num_channels = 1 - model = AudioSpectogramTransformerForMaskedImageModeling(config) - model.to(torch_device) - model.eval() - - pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) - result = model(pixel_values) - self.parent.assertEqual(result.logits.shape, (self.batch_size, 1, self.image_size, self.image_size)) - - def create_and_check_for_image_classification(self, config, pixel_values, labels): - config.num_labels = self.type_sequence_label_size - model = AudioSpectogramTransformerForImageClassification(config) - model.to(torch_device) - model.eval() - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - # test greyscale images - config.num_channels = 1 - model = AudioSpectogramTransformerForImageClassification(config) - model.to(torch_device) - model.eval() - - pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) - result = model(pixel_values) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() ( @@ -184,8 +143,7 @@ class AudioSpectogramTransformerModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( ( AudioSpectogramTransformerModel, - AudioSpectogramTransformerForImageClassification, - AudioSpectogramTransformerForMaskedImageModeling, + AudioSpectogramTransformerForSequenceClassification, ) if is_torch_available() else () @@ -269,8 +227,8 @@ def 
default_feature_extractor(self): ) @slow - def test_inference_image_classification_head(self): - model = AudioSpectogramTransformerForImageClassification.from_pretrained( + def test_inference_audio_classification(self): + model = AudioSpectogramTransformerForSequenceClassification.from_pretrained( "google/audio_spectogram_transformer-base-patch16-224" ).to(torch_device) @@ -289,34 +247,3 @@ def test_inference_image_classification_head(self): expected_slice = torch.tensor([-0.2744, 0.8215, -0.0836]).to(torch_device) self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) - - @slow - def test_inference_interpolate_pos_encoding(self): - # AudioSpectogramTransformer models have an `interpolate_pos_encoding` argument in their forward method, - # allowing to interpolate the pre-trained position embeddings in order to use - # the model on higher resolutions. The DINO model by Facebook AI leverages this - # to visualize self-attention on higher resolution images. - model = AudioSpectogramTransformerModel.from_pretrained("facebook/dino-audio_spectogram_transformers8").to( - torch_device - ) - - feature_extractor = AudioSpectogramTransformerFeatureExtractor.from_pretrained( - "facebook/dino-audio_spectogram_transformers8", size=480 - ) - image = prepare_img() - inputs = feature_extractor(images=image, return_tensors="pt") - pixel_values = inputs.pixel_values.to(torch_device) - - # forward pass - with torch.no_grad(): - outputs = model(pixel_values, interpolate_pos_encoding=True) - - # verify the logits - expected_shape = torch.Size((1, 3601, 384)) - self.assertEqual(outputs.last_hidden_state.shape, expected_shape) - - expected_slice = torch.tensor( - [[4.2340, 4.3906, -6.6692], [4.5463, 1.8928, -6.7257], [4.4429, 0.8496, -5.8585]] - ).to(torch_device) - - self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) From 6a27c2eec5c4ead18f45db707a69e8665e9774e2 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 30 Oct 2022 18:23:23 +0100 Subject: [PATCH 06/37] Update conversion script to use feature extractor --- ...ctogram_transformer_original_to_pytorch.py | 42 ++++++------------- ...extraction_audio_spectogram_transformer.py | 8 +++- .../audio_spectogram_transformer/test.py | 23 +++++----- 3 files changed, 30 insertions(+), 43 deletions(-) diff --git a/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py index e3d10dc83a57be..812e2e407a3470 100644 --- a/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py +++ b/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py @@ -17,13 +17,18 @@ import argparse import json +import wave from pathlib import Path import torch import torchaudio from huggingface_hub import hf_hub_download -from transformers import AudioSpectogramTransformerConfig, AudioSpectogramTransformerForSequenceClassification +from transformers import ( + AudioSpectogramTransformerConfig, + AudioSpectogramTransformerFeatureExtractor, + AudioSpectogramTransformerForSequenceClassification, +) from transformers.utils import logging @@ -128,33 +133,6 @@ def remove_keys(state_dict): state_dict.pop(k, None) -def make_features(wav_name, mel_bins, target_length=1024): - waveform, sr = torchaudio.load(wav_name) - - fbank = 
torchaudio.compliance.kaldi.fbank( - waveform, - htk_compat=True, - sample_frequency=sr, - use_energy=False, - window_type="hanning", - num_mel_bins=mel_bins, - dither=0.0, - frame_shift=10, - ) - - n_frames = fbank.shape[0] - - p = target_length - n_frames - if p > 0: - m = torch.nn.ZeroPad2d((0, 0, 0, p)) - fbank = m(fbank) - elif p < 0: - fbank = fbank[0:target_length, :] - - fbank = (fbank - (-4.2677393)) / (4.5689974 * 2) - return fbank - - @torch.no_grad() def convert_audio_spectogram_transformer_checkpoint( model_name, checkpoint_url, pytorch_dump_folder_path, push_to_hub=False @@ -181,8 +159,12 @@ def convert_audio_spectogram_transformer_checkpoint( filepath = hf_hub_download( repo_id="nielsr/audio-spectogram-transformer-checkpoint", filename="sample_audio.flac", repo_type="dataset" ) - features = make_features(filepath, mel_bins=128) # shape(1024, 128) - input_values = features.expand(1, 1024, 128) # (batch_size, time, freq) + feature_extractor = AudioSpectogramTransformerFeatureExtractor() + waveform, _ = torchaudio.load(filepath) + waveform = waveform.squeeze().numpy() + + inputs = feature_extractor(waveform, sampling_rate=16000, padding="max_length", return_tensors="pt") + input_values = inputs.input_features # forward pass outputs = model(input_values) diff --git a/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py index 70957269499e0b..d71bd298baad0c 100644 --- a/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py +++ b/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py @@ -41,6 +41,8 @@ class AudioSpectogramTransformerFeatureExtractor(SequenceFeatureExtractor): mean and variance normalization to the extracted features. Args: + feature_size (:obj:`int`, defaults to 128): + The feature dimension of the extracted features. sampling_rate (`int`, defaults to 16000): The sampling rate at which the audio files should be digitalized expressed in Hertz per second (Hz). num_mel_bins (`int`, defaults to 128): @@ -53,19 +55,22 @@ class AudioSpectogramTransformerFeatureExtractor(SequenceFeatureExtractor): Whether or not to zero-mean normalize the extracted features. std (`int`, *optional*, defaults to `True`): Whether or not to unit-variance normalize the extracted features. + return_attention_mask (`bool`, *optional*, defaults to `False`): + Whether or not [`~AudioSpectogramTransformerFeatureExtractor.__call__`] should return `attention_mask`. 
""" model_input_names = ["input_features", "attention_mask"] def __init__( self, - feature_size=80, + feature_size=128, sampling_rate=16000, num_mel_bins=128, padding_value=0.0, do_normalize=True, mean=-4.2677393, std=4.5689974, + return_attention_mask=False, **kwargs ): super().__init__(feature_size=80, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) @@ -73,6 +78,7 @@ def __init__( self.do_normalize = do_normalize self.mean = mean self.std = std + self.return_attention_mask = return_attention_mask def _extract_fbank_features( self, diff --git a/src/transformers/models/audio_spectogram_transformer/test.py b/src/transformers/models/audio_spectogram_transformer/test.py index afaef6b32a62cf..90e10d986a399f 100644 --- a/src/transformers/models/audio_spectogram_transformer/test.py +++ b/src/transformers/models/audio_spectogram_transformer/test.py @@ -1,22 +1,23 @@ import torch import torchaudio -from huggingface_hub import hf_hub_download +from huggingface_hub import hf_hub_download from transformers import ( AudioSpectogramTransformerConfig, AudioSpectogramTransformerFeatureExtractor, AudioSpectogramTransformerForSequenceClassification, ) + # define feature extractor and model feature_extractor = AudioSpectogramTransformerFeatureExtractor() -# config = AudioSpectogramTransformerConfig(num_labels=527) -# model = AudioSpectogramTransformerForSequenceClassification(config) +config = AudioSpectogramTransformerConfig(num_labels=527) +model = AudioSpectogramTransformerForSequenceClassification(config) # read audio -filepath = hf_hub_download(repo_id="nielsr/audio-spectogram-transformer-checkpoint", - filename="sample_audio.flac", - repo_type="dataset") +filepath = hf_hub_download( + repo_id="nielsr/audio-spectogram-transformer-checkpoint", filename="sample_audio.flac", repo_type="dataset" +) raw_speech, _ = torchaudio.load(filepath) @@ -25,14 +26,12 @@ # prepare audio for the model inputs = feature_extractor(raw_speech, padding="max_length", return_tensors="pt") -for k,v in inputs.items(): - print(k,v.shape) - -# dummy_inputs = torch.randn(1, 1024, 128) +for k, v in inputs.items(): + print(k, v.shape) -# outputs = model(dummy_inputs) +outputs = model(inputs.input_features) -# print("Shape of logits:", outputs.logits.shape) +print("Shape of logits:", outputs.logits.shape) # for name, param in model.named_parameters(): # print(name, param.shape) From e8feefc96e43e8bf05afb5674a7fc0f1e01b335d Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 30 Oct 2022 20:52:39 +0100 Subject: [PATCH 07/37] Make more tests pass --- ...figuration_audio_spectogram_transformer.py | 29 ++-- ...ctogram_transformer_original_to_pytorch.py | 10 +- ...extraction_audio_spectogram_transformer.py | 4 +- .../modeling_audio_spectogram_transformer.py | 50 ++++--- .../audio_spectogram_transformer/test.py | 1 - src/transformers/models/auto/modeling_auto.py | 2 - ...t_modeling_audio_spectogram_transformer.py | 128 +++++++++++------- tests/test_modeling_common.py | 2 + 8 files changed, 123 insertions(+), 103 deletions(-) diff --git a/src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py index 68023f42ddba2b..3dfa57342fd31a 100644 --- a/src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py +++ b/src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py @@ -59,17 +59,13 @@ class 
AudioSpectogramTransformerConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - image_size (`int`, *optional*, defaults to `224`): - The size (resolution) of each image. patch_size (`int`, *optional*, defaults to `16`): The size (resolution) of each patch. - num_channels (`int`, *optional*, defaults to `3`): - The number of input channels. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. - fstride (`int`, *optional*, defaults to 10): + frequency_stride (`int`, *optional*, defaults to 10): ... - tstride (`int`, *optional*, defaults to 10): + time_stride (`int`, *optional*, defaults to 10): ... input_fdim (`int`, *optional*, defaults to 128): ... @@ -103,15 +99,12 @@ def __init__( attention_probs_dropout_prob=0.0, initializer_range=0.02, layer_norm_eps=1e-12, - is_encoder_decoder=False, - image_size=224, patch_size=16, - num_channels=3, qkv_bias=True, - fstride=10, - tstride=10, - input_fdim=128, - input_tdim=1024, + frequency_stride=10, + time_stride=10, + time_dimension=1024, + frequency_dimension=128, **kwargs ): super().__init__(**kwargs) @@ -125,11 +118,9 @@ def __init__( self.attention_probs_dropout_prob = attention_probs_dropout_prob self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps - self.image_size = image_size self.patch_size = patch_size - self.num_channels = num_channels self.qkv_bias = qkv_bias - self.fstride = fstride - self.tstride = tstride - self.input_fdim = input_fdim - self.input_tdim = input_tdim + self.frequency_stride = frequency_stride + self.time_stride = time_stride + self.time_dimension = time_dimension + self.frequency_dimension = frequency_dimension diff --git a/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py index 812e2e407a3470..b7ed79d5895c44 100644 --- a/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py +++ b/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py @@ -17,7 +17,6 @@ import argparse import json -import wave from pathlib import Path import torch @@ -170,21 +169,22 @@ def convert_audio_spectogram_transformer_checkpoint( outputs = model(input_values) logits = outputs.logits - print("Shape of logits:", logits.shape) - print("Predicted class:", model.config.id2label[logits.argmax(-1).item()]) - expected_slice = torch.tensor([-0.8760, -7.0042, -8.6602]) if not torch.allclose(logits[0, :3], expected_slice, atol=1e-4): raise ValueError("Logits don't match") + print("Looks ok!") if pytorch_dump_folder_path is not None: Path(pytorch_dump_folder_path).mkdir(exist_ok=True) print(f"Saving model {model_name} to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving feature extractor to {pytorch_dump_folder_path}") + feature_extractor.save_pretrained(pytorch_dump_folder_path) if push_to_hub: - print("Pushing to the hub...") + print("Pushing model and feature extractor to the hub...") model.push_to_hub(model_name, organization="nielsr") + feature_extractor.push_to_hub(model_name, organization="nielsr") if __name__ == "__main__": diff --git 
a/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py index d71bd298baad0c..09150aee5f1632 100644 --- a/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py +++ b/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py @@ -41,7 +41,7 @@ class AudioSpectogramTransformerFeatureExtractor(SequenceFeatureExtractor): mean and variance normalization to the extracted features. Args: - feature_size (:obj:`int`, defaults to 128): + feature_size (`int`, defaults to 1): The feature dimension of the extracted features. sampling_rate (`int`, defaults to 16000): The sampling rate at which the audio files should be digitalized expressed in Hertz per second (Hz). @@ -63,7 +63,7 @@ class AudioSpectogramTransformerFeatureExtractor(SequenceFeatureExtractor): def __init__( self, - feature_size=128, + feature_size=1, sampling_rate=16000, num_mel_bins=128, padding_value=0.0, diff --git a/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py index 35e3f113e5f98d..c7a17dce3081bf 100644 --- a/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py +++ b/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py @@ -47,7 +47,8 @@ AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "MIT/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593", + # TODO update to appropriate organization + "nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593", # See all Audio Spectogram Transformer models at https://huggingface.co/models?filter=audio-spectogram-transformer ] @@ -71,16 +72,21 @@ def __init__(self, config: AudioSpectogramTransformerConfig) -> None: self.config = config def get_shape(self, config): - fstride = config.fstride - tstride = config.tstride - input_fdim = config.input_fdim - input_tdim = config.input_tdim - test_input = torch.randn(1, 1, input_fdim, input_tdim) - test_proj = nn.Conv2d(1, config.hidden_size, kernel_size=(16, 16), stride=(fstride, tstride)) - test_out = test_proj(test_input) - f_dim = test_out.shape[2] - t_dim = test_out.shape[3] - return f_dim, t_dim + frequency_stride = config.frequency_stride + time_stride = config.time_stride + frequency_dimension = config.frequency_dimension + time_dimension = config.time_dimension + test_input = torch.randn(1, 1, frequency_dimension, time_dimension) + test_projection = nn.Conv2d( + 1, + config.hidden_size, + kernel_size=(config.patch_size, config.patch_size), + stride=(frequency_stride, time_stride), + ) + test_out = test_projection(test_input) + frequency_dimension = test_out.shape[2] + time_dimension = test_out.shape[3] + return frequency_dimension, time_dimension def forward(self, input_values: torch.Tensor) -> torch.Tensor: batch_size = input_values.shape[0] @@ -104,23 +110,13 @@ class AudioSpectogramTransformerPatchEmbeddings(nn.Module): def __init__(self, config): super().__init__() - # image_size, patch_size = config.image_size, config.patch_size - # num_channels, hidden_size = config.num_channels, config.hidden_size - - # image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) - # patch_size = patch_size if 
isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) - # num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - # self.image_size = image_size - # self.patch_size = patch_size - # self.num_channels = num_channels - # self.num_patches = num_patches patch_size = config.patch_size - fstride = config.fstride - tstride = config.tstride + frequency_stride = config.frequency_stride + time_stride = config.time_stride self.projection = nn.Conv2d( - 1, config.hidden_size, kernel_size=(patch_size, patch_size), stride=(fstride, tstride) + 1, config.hidden_size, kernel_size=(patch_size, patch_size), stride=(frequency_stride, time_stride) ) def forward(self, input_values: torch.Tensor) -> torch.Tensor: @@ -570,7 +566,7 @@ def __init__(self, config: AudioSpectogramTransformerConfig) -> None: ) def forward( self, - pixel_values: Optional[torch.Tensor] = None, + input_values: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, @@ -586,7 +582,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.audio_spectogram_transformer( - pixel_values, + input_values, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -622,7 +618,7 @@ def forward( loss = loss_fct(logits, labels) if not return_dict: - output = (logits,) + outputs[1:] + output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( diff --git a/src/transformers/models/audio_spectogram_transformer/test.py b/src/transformers/models/audio_spectogram_transformer/test.py index 90e10d986a399f..1f2cf270ee32ff 100644 --- a/src/transformers/models/audio_spectogram_transformer/test.py +++ b/src/transformers/models/audio_spectogram_transformer/test.py @@ -1,4 +1,3 @@ -import torch import torchaudio from huggingface_hub import hf_hub_download diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 96e7f37715008c..d6b10908a74c76 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -348,7 +348,6 @@ MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES = OrderedDict( [ - ("audio-spectogram-transformer", "AudioSpectogramTransformerForMaskedImageModeling"), ("deit", "DeiTForMaskedImageModeling"), ("swin", "SwinForMaskedImageModeling"), ("swinv2", "Swinv2ForMaskedImageModeling"), @@ -367,7 +366,6 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Image Classification mapping - ("audio-spectogram-transformer", "AudioSpectogramTransformerForImageClassification"), ("beit", "BeitForImageClassification"), ("convnext", "ConvNextForImageClassification"), ("cvt", "CvtForImageClassification"), diff --git a/tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py b/tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py index 32bc547e164d3e..ff8fca9b1dc218 100644 --- a/tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py +++ b/tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py @@ -14,13 +14,14 @@ # limitations under the License. """ Testing suite for the PyTorch AudioSpectogramTransformer model. 
""" - import inspect +import tempfile import unittest +from huggingface_hub import hf_hub_download from transformers import AudioSpectogramTransformerConfig -from transformers.testing_utils import require_torch, require_vision, slow, torch_device -from transformers.utils import cached_property, is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_torchaudio, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_torchaudio_available from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor @@ -36,8 +37,8 @@ ) -if is_vision_available(): - from PIL import Image +if is_torchaudio_available(): + import torchaudio from transformers import AudioSpectogramTransformerFeatureExtractor @@ -47,9 +48,9 @@ def __init__( self, parent, batch_size=13, - image_size=30, patch_size=2, - num_channels=3, + time_dimension=24, + frequency_dimension=16, is_training=True, use_labels=True, hidden_size=32, @@ -62,13 +63,14 @@ def __init__( type_sequence_label_size=10, initializer_range=0.02, scope=None, - encoder_stride=2, + frequency_stride=2, + time_stride=2, ): self.parent = parent self.batch_size = batch_size - self.image_size = image_size self.patch_size = patch_size - self.num_channels = num_channels + self.time_dimension = time_dimension + self.frequency_dimension = frequency_dimension self.is_training = is_training self.use_labels = use_labels self.hidden_size = hidden_size @@ -81,14 +83,25 @@ def __init__( self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range self.scope = scope - self.encoder_stride = encoder_stride - - # in AudioSpectogramTransformer, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 + self.frequency_stride = frequency_stride + self.time_stride = time_stride + + # in AudioSpectogramTransformer, the seq length equals the number of patches + 2 (we add 2 for the [CLS] and distillation tokens) + test_input = torch.randn(1, 1, self.frequency_dimension, self.time_dimension) + test_projection = nn.Conv2d( + 1, + self.hidden_size, + kernel_size=(self.patch_size, self.patch_size), + stride=(self.frequency_stride, self.time_stride), + ) + test_out = test_projection(test_input) + frequency_dimension = test_out.shape[2] + time_dimension = test_out.shape[3] + num_patches = frequency_dimension * time_dimension + self.seq_length = num_patches + 2 def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + input_values = floats_tensor([self.batch_size, self.time_dimension, self.frequency_dimension]) labels = None if self.use_labels: @@ -96,13 +109,13 @@ def prepare_config_and_inputs(self): config = self.get_config() - return config, pixel_values, labels + return config, input_values, labels def get_config(self): return AudioSpectogramTransformerConfig( - image_size=self.image_size, patch_size=self.patch_size, - num_channels=self.num_channels, + time_dimension=self.time_dimension, + frequency_dimension=self.frequency_dimension, hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, @@ -112,24 +125,25 @@ def get_config(self): attention_probs_dropout_prob=self.attention_probs_dropout_prob, is_decoder=False, initializer_range=self.initializer_range, - 
encoder_stride=self.encoder_stride, + frequency_stride=self.frequency_stride, + time_stride=self.time_stride, ) - def create_and_check_model(self, config, pixel_values, labels): + def create_and_check_model(self, config, input_values, labels): model = AudioSpectogramTransformerModel(config=config) model.to(torch_device) model.eval() - result = model(pixel_values) + result = model(input_values) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() ( config, - pixel_values, + input_values, labels, ) = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} + inputs_dict = {"input_values": input_values} return config, inputs_dict @@ -149,7 +163,6 @@ class AudioSpectogramTransformerModelTest(ModelTesterMixin, unittest.TestCase): else () ) fx_compatible = False - test_pruning = False test_resize_embeddings = False test_head_masking = False @@ -185,65 +198,86 @@ def test_forward_signature(self): # signature.parameters is an OrderedDict => so arg_names order is deterministic arg_names = [*signature.parameters.keys()] - expected_arg_names = ["pixel_values"] + expected_arg_names = ["input_values"] self.assertListEqual(arg_names[:1], expected_arg_names) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_for_masked_image_modeling(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs) - - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - @slow def test_model_from_pretrained(self): for model_name in AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = AudioSpectogramTransformerModel.from_pretrained(model_name) self.assertIsNotNone(model) + def test_correct_missing_keys(self): + if not self.test_missing_keys: + return + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + print("Model class:", model_class) + model = model_class(config) + base_model_prefix = model.base_model_prefix + + if hasattr(model, base_model_prefix): + with tempfile.TemporaryDirectory() as temp_dir_name: + model.base_model.save_pretrained(temp_dir_name) + model, loading_info = model_class.from_pretrained(temp_dir_name, output_loading_info=True) + with self.subTest(msg=f"Missing keys for {model.__class__.__name__}"): + self.assertGreater(len(loading_info["missing_keys"]), 0) + + +# We will verify our results on some audio from AudioSet +def prepare_audio(): + filepath = hf_hub_download( + repo_id="nielsr/audio-spectogram-transformer-checkpoint", filename="sample_audio.flac", repo_type="dataset" + ) + + audio, sampling_rate = torchaudio.load(filepath) -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image + return audio, sampling_rate @require_torch -@require_vision +@require_torchaudio class AudioSpectogramTransformerModelIntegrationTest(unittest.TestCase): @cached_property def default_feature_extractor(self): + # TODO rename nielsr to appropriate organization return ( AudioSpectogramTransformerFeatureExtractor.from_pretrained( - 
"google/audio_spectogram_transformer-base-patch16-224" + "nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593" ) - if is_vision_available() + if is_torchaudio_available() else None ) @slow def test_inference_audio_classification(self): + + feature_extractor = self.default_feature_extractor + # TODO rename nielsr to appropriate organization model = AudioSpectogramTransformerForSequenceClassification.from_pretrained( - "google/audio_spectogram_transformer-base-patch16-224" + "nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593" ).to(torch_device) feature_extractor = self.default_feature_extractor - image = prepare_img() - inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device) + audio, sampling_rate = prepare_audio() + audio = audio.squeeze().numpy() + inputs = feature_extractor(audio, sampling_rate=sampling_rate, padding="max_length", return_tensors="pt").to( + torch_device + ) # forward pass with torch.no_grad(): - outputs = model(**inputs) + outputs = model(inputs.input_features) # verify the logits - expected_shape = torch.Size((1, 1000)) + expected_shape = torch.Size((1, 527)) self.assertEqual(outputs.logits.shape, expected_shape) - expected_slice = torch.tensor([-0.2744, 0.8215, -0.0836]).to(torch_device) + expected_slice = torch.tensor([-0.8760, -7.0042, -8.6602]).to(torch_device) self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index ac1772d853734c..a5746836bb4d8a 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -44,6 +44,7 @@ logging, ) from transformers.models.auto import get_values +from transformers.models.auto.modeling_auto import MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING from transformers.testing_utils import ( TOKEN, USER, @@ -223,6 +224,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING), *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING), *get_values(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING), + *get_values(MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING), ]: inputs_dict["labels"] = torch.zeros( self.model_tester.batch_size, dtype=torch.long, device=torch_device From c0ec2685a09a4e2a7e276ff67aac05dab7f6f30c Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 30 Oct 2022 21:15:22 +0100 Subject: [PATCH 08/37] Add docs --- .../en/model_doc/audio-spectogram-transformer.mdx | 13 +++++++------ .../modeling_audio_spectogram_transformer.py | 4 ++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/source/en/model_doc/audio-spectogram-transformer.mdx b/docs/source/en/model_doc/audio-spectogram-transformer.mdx index ff555af61403fe..b850da10440377 100644 --- a/docs/source/en/model_doc/audio-spectogram-transformer.mdx +++ b/docs/source/en/model_doc/audio-spectogram-transformer.mdx @@ -14,19 +14,20 @@ specific language governing permissions and limitations under the License. ## Overview -The Audio Spectogram Transformer model was proposed in []() by . - +The Audio Spectogram Transformer model was proposed in [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. +The Audio Spectogram Transformer applies a [Vision Transformer](vit) to audio, by turning audio into an image (spectogram). The model obtains state-of-the-art results +for audio classification. 
The abstract from the paper is the following: -** +*In the past decade, convolutional neural networks (CNNs) have been widely adopted as the main building block for end-to-end audio classification models, which aim to learn a direct mapping from audio spectrograms to corresponding labels. To better capture long-range global context, a recent trend is to add a self-attention mechanism on top of the CNN, forming a CNN-attention hybrid model. However, it is unclear whether the reliance on a CNN is necessary, and if neural networks purely based on attention are sufficient to obtain good performance in audio classification. In this paper, we answer the question by introducing the Audio Spectrogram Transformer (AST), the first convolution-free, purely attention-based model for audio classification. We evaluate AST on various audio classification benchmarks, where it achieves new state-of-the-art results of 0.485 mAP on AudioSet, 95.6% accuracy on ESC-50, and 98.1% accuracy on Speech Commands V2.* Tips: - +- Todo -This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). -The original code can be found [here](). +This model was contributed by [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/YuanGongND/ast). ## AudioSpectogramTransformerConfig diff --git a/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py index c7a17dce3081bf..f93a32cae32c5e 100644 --- a/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py +++ b/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py @@ -37,11 +37,11 @@ _FEAT_EXTRACTOR_FOR_DOC = "AudioSpectogramTransformerFeatureExtractor" # Base docstring -_CHECKPOINT_FOR_DOC = "MIT/ast-10-10" +_CHECKPOINT_FOR_DOC = "nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593" _EXPECTED_OUTPUT_SHAPE = [1, 197, 768] # Audio classification docstring -_SEQ_CLASS_CHECKPOINT = "MIT/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593" +_SEQ_CLASS_CHECKPOINT = "nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593" _SEQ_CLASS_EXPECTED_OUTPUT = "" _SEQ_CLASS_EXPECTED_LOSS = 0.0 From d952e5f6987a8f871a6fd36eaf91c4701909642d Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 31 Oct 2022 10:21:43 +0100 Subject: [PATCH 09/37] update input_features to input_values + pad by default to max length --- ...extraction_audio_spectogram_transformer.py | 24 +++++++++---------- .../modeling_audio_spectogram_transformer.py | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py index 09150aee5f1632..911c0390902bbb 100644 --- a/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py +++ b/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py @@ -59,7 +59,7 @@ class AudioSpectogramTransformerFeatureExtractor(SequenceFeatureExtractor): Whether or not [`~AudioSpectogramTransformerFeatureExtractor.__call__`] should return `attention_mask`. 
""" - model_input_names = ["input_features", "attention_mask"] + model_input_names = ["input_values", "attention_mask"] def __init__( self, @@ -103,15 +103,15 @@ def _extract_fbank_features( return features.numpy() - def normalize(self, input_features: List[np.ndarray]) -> List[np.ndarray]: - return (input_features - (self.mean)) / (self.std * 2) + def normalize(self, input_values: List[np.ndarray]) -> List[np.ndarray]: + return (input_values - (self.mean)) / (self.std * 2) def __call__( self, raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]], - padding: Union[bool, str, PaddingStrategy] = False, + padding: Union[bool, str, PaddingStrategy] = "max_length", max_length: int = 1024, - truncation: bool = False, + truncation: bool = True, pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, sampling_rate: Optional[int] = None, @@ -124,7 +124,7 @@ def __call__( raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`): The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float values, a list of numpy arrays or a list of list of float values. - padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `max_length`): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: @@ -136,7 +136,7 @@ def __call__( lengths). max_length (`int`, *optional*, defaults to 1024): Maximum length of the returned list and optionally padding length (see above). - truncation (`bool`): + truncation (`bool`, *optional*, defaults to `True`): Activates truncation to cut input sequences longer than *max_length* to *max_length*. pad_to_multiple_of (`int`, *optional*): If set will pad the sequence to a multiple of the provided value. 
@@ -190,7 +190,7 @@ def __call__( features = [self._extract_fbank_features(waveform) for waveform in raw_speech] # convert into correct format for padding - encoded_inputs = BatchFeature({"input_features": features}) + encoded_inputs = BatchFeature({"input_values": features}) padded_inputs = self.pad( encoded_inputs, @@ -202,13 +202,13 @@ def __call__( ) # make sure list is in array format - input_features = padded_inputs.get("input_features") - if isinstance(input_features[0], list): - padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features] + input_values = padded_inputs.get("input_values") + if isinstance(input_values[0], list): + padded_inputs["input_values"] = [np.asarray(feature, dtype=np.float32) for feature in input_values] # normalization if self.do_normalize: - padded_inputs["input_features"] = self.normalize(padded_inputs["input_features"]) + padded_inputs["input_values"] = self.normalize(padded_inputs["input_values"]) if return_tensors is not None: padded_inputs = padded_inputs.convert_to_tensors(return_tensors) diff --git a/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py index f93a32cae32c5e..f0cc0d0aa46ea6 100644 --- a/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py +++ b/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py @@ -38,7 +38,7 @@ # Base docstring _CHECKPOINT_FOR_DOC = "nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593" -_EXPECTED_OUTPUT_SHAPE = [1, 197, 768] +_EXPECTED_OUTPUT_SHAPE = [1, 1214, 768] # Audio classification docstring _SEQ_CLASS_CHECKPOINT = "nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593" From 3547d7c0b124c9467cd1f7de6feeadd625c67049 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 31 Oct 2022 10:27:01 +0100 Subject: [PATCH 10/37] Fix doc tests --- .../modeling_audio_spectogram_transformer.py | 6 ++++-- utils/documentation_tests.txt | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py index f0cc0d0aa46ea6..9fad0ce82c63db 100644 --- a/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py +++ b/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py @@ -37,13 +37,15 @@ _FEAT_EXTRACTOR_FOR_DOC = "AudioSpectogramTransformerFeatureExtractor" # Base docstring +# TODO update to appropriate organization _CHECKPOINT_FOR_DOC = "nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593" _EXPECTED_OUTPUT_SHAPE = [1, 1214, 768] # Audio classification docstring +# TODO update to appropriate organization _SEQ_CLASS_CHECKPOINT = "nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593" -_SEQ_CLASS_EXPECTED_OUTPUT = "" -_SEQ_CLASS_EXPECTED_LOSS = 0.0 +_SEQ_CLASS_EXPECTED_OUTPUT = "'Speech'" +_SEQ_CLASS_EXPECTED_LOSS = 0.17 AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index 761684192922f2..f245fad7f8b349 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -18,6 +18,7 @@ src/transformers/generation/utils.py 
src/transformers/models/albert/configuration_albert.py src/transformers/models/albert/modeling_albert.py src/transformers/models/albert/modeling_tf_albert.py +src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py src/transformers/models/bart/configuration_bart.py src/transformers/models/bart/modeling_bart.py src/transformers/models/beit/configuration_beit.py From 00471227155f5ab50d7741646fcc3584d6c755d5 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 31 Oct 2022 10:41:05 +0100 Subject: [PATCH 11/37] Add feature extractor tests --- ...extraction_audio_spectogram_transformer.py | 2 +- ...extraction_audio_spectogram_transformer.py | 224 +++++++----------- ...t_modeling_audio_spectogram_transformer.py | 24 +- 3 files changed, 89 insertions(+), 161 deletions(-) diff --git a/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py index 911c0390902bbb..568ad9bef1de16 100644 --- a/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py +++ b/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py @@ -73,7 +73,7 @@ def __init__( return_attention_mask=False, **kwargs ): - super().__init__(feature_size=80, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) + super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) self.num_mel_bins = num_mel_bins self.do_normalize = do_normalize self.mean = mean diff --git a/tests/models/audio_spectogram_transformer/test_feature_extraction_audio_spectogram_transformer.py b/tests/models/audio_spectogram_transformer/test_feature_extraction_audio_spectogram_transformer.py index 05db4ec24511c3..7d064d38135917 100644 --- a/tests/models/audio_spectogram_transformer/test_feature_extraction_audio_spectogram_transformer.py +++ b/tests/models/audio_spectogram_transformer/test_feature_extraction_audio_spectogram_transformer.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 HuggingFace Inc. +# Copyright 2021 HuggingFace Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,23 +14,37 @@ # limitations under the License. 
+import itertools +import random import unittest import numpy as np -from transformers.testing_utils import require_torch, require_vision -from transformers.utils import is_torch_available, is_vision_available +from transformers import ( + AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + AudioSpectogramTransformerConfig, + AudioSpectogramTransformerFeatureExtractor, +) +from transformers.testing_utils import require_torch, slow -from ...test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs +from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin -if is_torch_available(): - import torch +global_rng = random.Random() -if is_vision_available(): - from PIL import Image - from transformers import AudioSpectogramTransformerFeatureExtractor +def floats_list(shape, scale=1.0, rng=None, name=None): + """Creates a random float32 tensor""" + if rng is None: + rng = global_rng + + values = [] + for batch_idx in range(shape[0]): + values.append([]) + for _ in range(shape[1]): + values[-1].append(rng.random() * scale) + + return values class AudioSpectogramTransformerFeatureExtractionTester(unittest.TestCase): @@ -38,154 +52,88 @@ def __init__( self, parent, batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=18, + min_seq_length=400, + max_seq_length=2000, + feature_size=1, + padding_value=0.0, + sampling_rate=16000, + return_attention_mask=True, do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], ): self.parent = parent self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size + self.min_seq_length = min_seq_length + self.max_seq_length = max_seq_length + self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1) + self.feature_size = feature_size + self.padding_value = padding_value + self.sampling_rate = sampling_rate + self.return_attention_mask = return_attention_mask self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std def prepare_feat_extract_dict(self): return { - "image_mean": self.image_mean, - "image_std": self.image_std, + "feature_size": self.feature_size, + "padding_value": self.padding_value, + "sampling_rate": self.sampling_rate, + "return_attention_mask": self.return_attention_mask, "do_normalize": self.do_normalize, - "do_resize": self.do_resize, - "size": self.size, } + def prepare_inputs_for_common(self, equal_length=False, numpify=False): + def _flatten(list_of_lists): + return list(itertools.chain(*list_of_lists)) -@require_torch -@require_vision -class AudioSpectogramTransformerFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): + if equal_length: + speech_inputs = floats_list((self.batch_size, self.max_seq_length)) + else: + # make sure that inputs increase in size + speech_inputs = [ + _flatten(floats_list((x, self.feature_size))) + for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff) + ] - feature_extraction_class = AudioSpectogramTransformerFeatureExtractor if is_vision_available() else None + if numpify: + speech_inputs = [np.asarray(x) for x in speech_inputs] - def setUp(self): - self.feature_extract_tester = AudioSpectogramTransformerFeatureExtractionTester(self) - - @property - def feat_extract_dict(self): - return 
self.feature_extract_tester.prepare_feat_extract_dict() - - def test_feat_extract_properties(self): - feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) - self.assertTrue(hasattr(feature_extractor, "image_mean")) - self.assertTrue(hasattr(feature_extractor, "image_std")) - self.assertTrue(hasattr(feature_extractor, "do_normalize")) - self.assertTrue(hasattr(feature_extractor, "do_resize")) - self.assertTrue(hasattr(feature_extractor, "size")) - - def test_batch_feature(self): - pass - - def test_call_pil(self): - # Initialize feature_extractor - feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) - # create random PIL images - image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False) - for image in image_inputs: - self.assertIsInstance(image, Image.Image) + return speech_inputs - # Test not batched input - encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values - self.assertEqual( - encoded_images.shape, - ( - 1, - self.feature_extract_tester.num_channels, - self.feature_extract_tester.size, - self.feature_extract_tester.size, - ), - ) - # Test batched - encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values - self.assertEqual( - encoded_images.shape, - ( - self.feature_extract_tester.batch_size, - self.feature_extract_tester.num_channels, - self.feature_extract_tester.size, - self.feature_extract_tester.size, - ), - ) - - def test_call_numpy(self): - # Initialize feature_extractor - feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) - # create random numpy tensors - image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True) - for image in image_inputs: - self.assertIsInstance(image, np.ndarray) +class AudioSpectogramTransformerFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - # Test not batched input - encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values - self.assertEqual( - encoded_images.shape, - ( - 1, - self.feature_extract_tester.num_channels, - self.feature_extract_tester.size, - self.feature_extract_tester.size, - ), - ) + feature_extraction_class = AudioSpectogramTransformerFeatureExtractor - # Test batched - encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values - self.assertEqual( - encoded_images.shape, - ( - self.feature_extract_tester.batch_size, - self.feature_extract_tester.num_channels, - self.feature_extract_tester.size, - self.feature_extract_tester.size, - ), - ) - - def test_call_pytorch(self): - # Initialize feature_extractor - feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) - # create random PyTorch tensors - image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) - for image in image_inputs: - self.assertIsInstance(image, torch.Tensor) + def setUp(self): + self.feat_extract_tester = AudioSpectogramTransformerFeatureExtractionTester(self) + + def test_call(self): + # Tests that all call wrap to encode_plus and batch_encode_plus + feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + # create three inputs of length 800, 1000, and 1200 + speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] + np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] # Test not batched input - encoded_images = 
feature_extractor(image_inputs[0], return_tensors="pt").pixel_values - self.assertEqual( - encoded_images.shape, - ( - 1, - self.feature_extract_tester.num_channels, - self.feature_extract_tester.size, - self.feature_extract_tester.size, - ), - ) + encoded_sequences_1 = feat_extract(speech_inputs[0], return_tensors="np").input_values + encoded_sequences_2 = feat_extract(np_speech_inputs[0], return_tensors="np").input_values + self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) # Test batched - encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values - self.assertEqual( - encoded_images.shape, - ( - self.feature_extract_tester.batch_size, - self.feature_extract_tester.num_channels, - self.feature_extract_tester.size, - self.feature_extract_tester.size, - ), - ) + encoded_sequences_1 = feat_extract(speech_inputs, padding=True, return_tensors="np").input_values + encoded_sequences_2 = feat_extract(np_speech_inputs, padding=True, return_tensors="np").input_values + for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): + self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) + + @require_torch + def test_double_precision_pad(self): + import torch + + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + np_speech_inputs = np.random.rand(100).astype(np.float64) + py_speech_inputs = np_speech_inputs.tolist() + + for inputs in [py_speech_inputs, np_speech_inputs]: + np_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="np") + self.assertTrue(np_processed.input_values.dtype == np.float32) + pt_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="pt") + self.assertTrue(pt_processed.input_values.dtype == torch.float32) diff --git a/tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py b/tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py index ff8fca9b1dc218..81fee1afc3aeb3 100644 --- a/tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py +++ b/tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py @@ -15,7 +15,6 @@ """ Testing suite for the PyTorch AudioSpectogramTransformer model. 
""" import inspect -import tempfile import unittest from huggingface_hub import hf_hub_download @@ -211,23 +210,6 @@ def test_model_from_pretrained(self): model = AudioSpectogramTransformerModel.from_pretrained(model_name) self.assertIsNotNone(model) - def test_correct_missing_keys(self): - if not self.test_missing_keys: - return - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - print("Model class:", model_class) - model = model_class(config) - base_model_prefix = model.base_model_prefix - - if hasattr(model, base_model_prefix): - with tempfile.TemporaryDirectory() as temp_dir_name: - model.base_model.save_pretrained(temp_dir_name) - model, loading_info = model_class.from_pretrained(temp_dir_name, output_loading_info=True) - with self.subTest(msg=f"Missing keys for {model.__class__.__name__}"): - self.assertGreater(len(loading_info["missing_keys"]), 0) - # We will verify our results on some audio from AudioSet def prepare_audio(): @@ -266,13 +248,11 @@ def test_inference_audio_classification(self): feature_extractor = self.default_feature_extractor audio, sampling_rate = prepare_audio() audio = audio.squeeze().numpy() - inputs = feature_extractor(audio, sampling_rate=sampling_rate, padding="max_length", return_tensors="pt").to( - torch_device - ) + inputs = feature_extractor(audio, sampling_rate=sampling_rate, return_tensors="pt").to(torch_device) # forward pass with torch.no_grad(): - outputs = model(inputs.input_features) + outputs = model(**inputs) # verify the logits expected_shape = torch.Size((1, 527)) From 56aafc533ae8801a5ae55d721f802269a1cd2ea1 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 31 Oct 2022 11:18:48 +0100 Subject: [PATCH 12/37] Add proper padding/truncation to feature extractor --- ...ctogram_transformer_original_to_pytorch.py | 3 +- ...extraction_audio_spectogram_transformer.py | 75 +++++++------------ ...extraction_audio_spectogram_transformer.py | 8 +- 3 files changed, 30 insertions(+), 56 deletions(-) diff --git a/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py index b7ed79d5895c44..3c2e46b53bd071 100644 --- a/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py +++ b/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py @@ -163,10 +163,9 @@ def convert_audio_spectogram_transformer_checkpoint( waveform = waveform.squeeze().numpy() inputs = feature_extractor(waveform, sampling_rate=16000, padding="max_length", return_tensors="pt") - input_values = inputs.input_features # forward pass - outputs = model(input_values) + outputs = model(**inputs) logits = outputs.logits expected_slice = torch.tensor([-0.8760, -7.0042, -8.6602]) diff --git a/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py index 568ad9bef1de16..5a7134ca449c9d 100644 --- a/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py +++ b/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. 
+# Copyright 2022 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -24,7 +24,7 @@ from ...feature_extraction_sequence_utils import SequenceFeatureExtractor from ...feature_extraction_utils import BatchFeature -from ...utils import PaddingStrategy, TensorType, logging +from ...utils import TensorType, logging logger = logging.get_logger(__name__) @@ -47,8 +47,6 @@ class AudioSpectogramTransformerFeatureExtractor(SequenceFeatureExtractor): The sampling rate at which the audio files should be digitalized expressed in Hertz per second (Hz). num_mel_bins (`int`, defaults to 128): Number of Mel-frequency bins. - padding_value (`float`, defaults to 0.0): - The value that is used to fill the padding vectors. do_normalize (`bool`, *optional*, defaults to `True`): Whether or not to apply utterance-level cepstral mean and variance normalization to extracted features. mean (`int`, *optional*, defaults to -4.2677393): @@ -83,6 +81,7 @@ def __init__( def _extract_fbank_features( self, waveform: np.ndarray, + max_length: int, ) -> np.ndarray: """ Get mel-filter bank features using TorchAudio. Note that TorchAudio requires 16-bit signed integers as inputs @@ -90,7 +89,7 @@ def _extract_fbank_features( """ # waveform = waveform * (2**15) # Kaldi compliance: 16-bit signed integers waveform = torch.from_numpy(waveform).unsqueeze(0) - features = ta_kaldi.fbank( + fbank = ta_kaldi.fbank( waveform, htk_compat=True, sample_frequency=self.sampling_rate, @@ -101,20 +100,29 @@ def _extract_fbank_features( frame_shift=10, ) - return features.numpy() + n_frames = fbank.shape[0] + difference = max_length - n_frames - def normalize(self, input_values: List[np.ndarray]) -> List[np.ndarray]: + # pad or truncate, depending on difference + if difference > 0: + m = torch.nn.ZeroPad2d((0, 0, 0, difference)) + fbank = m(fbank) + elif difference < 0: + fbank = fbank[0:max_length, :] + + fbank = fbank.numpy() + + return fbank + + def normalize(self, input_values: np.ndarray) -> np.ndarray: return (input_values - (self.mean)) / (self.std * 2) def __call__( self, raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]], - padding: Union[bool, str, PaddingStrategy] = "max_length", max_length: int = 1024, - truncation: bool = True, - pad_to_multiple_of: Optional[int] = None, - return_tensors: Optional[Union[str, TensorType]] = None, sampling_rate: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, **kwargs ) -> BatchFeature: """ @@ -124,37 +132,17 @@ def __call__( raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`): The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float values, a list of numpy arrays or a list of list of float values. - padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `max_length`): - Select a strategy to pad the returned sequences (according to the model's padding side and padding - index) among: - - - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single - sequence if provided). - - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum - acceptable input length for the model if that argument is not provided. - - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different - lengths). 
max_length (`int`, *optional*, defaults to 1024): Maximum length of the returned list and optionally padding length (see above). - truncation (`bool`, *optional*, defaults to `True`): - Activates truncation to cut input sequences longer than *max_length* to *max_length*. - pad_to_multiple_of (`int`, *optional*): - If set will pad the sequence to a multiple of the provided value. - - This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability - >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. - + sampling_rate (`int`, *optional*): + The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass + `sampling_rate` at the forward call to prevent silent errors. return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors instead of list of python integers. Acceptable values are: - `'tf'`: Return TensorFlow `tf.constant` objects. - `'pt'`: Return PyTorch `torch.Tensor` objects. - `'np'`: Return Numpy `np.ndarray` objects. - sampling_rate (`int`, *optional*): - The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass - `sampling_rate` at the forward call to prevent silent errors. - padding_value (`float`, defaults to 0.0): - The value that is used to fill the padding values / vectors. """ if sampling_rate is not None: @@ -186,20 +174,11 @@ def __call__( if not is_batched: raw_speech = [raw_speech] - # extract fbank features - features = [self._extract_fbank_features(waveform) for waveform in raw_speech] - - # convert into correct format for padding - encoded_inputs = BatchFeature({"input_values": features}) + # extract fbank features (padded/truncated to max_length) + features = [self._extract_fbank_features(waveform, max_length=max_length) for waveform in raw_speech] - padded_inputs = self.pad( - encoded_inputs, - padding=padding, - max_length=max_length, - truncation=truncation, - pad_to_multiple_of=pad_to_multiple_of, - **kwargs, - ) + # convert into BatchFeature + padded_inputs = BatchFeature({"input_values": features}) # make sure list is in array format input_values = padded_inputs.get("input_values") @@ -208,7 +187,7 @@ def __call__( # normalization if self.do_normalize: - padded_inputs["input_values"] = self.normalize(padded_inputs["input_values"]) + padded_inputs["input_values"] = [self.normalize(feature) for feature in input_values] if return_tensors is not None: padded_inputs = padded_inputs.convert_to_tensors(return_tensors) diff --git a/tests/models/audio_spectogram_transformer/test_feature_extraction_audio_spectogram_transformer.py b/tests/models/audio_spectogram_transformer/test_feature_extraction_audio_spectogram_transformer.py index 7d064d38135917..0216c1d8914fce 100644 --- a/tests/models/audio_spectogram_transformer/test_feature_extraction_audio_spectogram_transformer.py +++ b/tests/models/audio_spectogram_transformer/test_feature_extraction_audio_spectogram_transformer.py @@ -20,12 +20,8 @@ import numpy as np -from transformers import ( - AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, - AudioSpectogramTransformerConfig, - AudioSpectogramTransformerFeatureExtractor, -) -from transformers.testing_utils import require_torch, slow +from transformers import AudioSpectogramTransformerFeatureExtractor +from transformers.testing_utils import require_torch from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin From 
f194e9d8723b9e056ebd04f3dc80c5194a8cd8bf Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 31 Oct 2022 11:57:51 +0100 Subject: [PATCH 13/37] Add support for conversion of all audioset checkpoints --- ...ctogram_transformer_original_to_pytorch.py | 88 +++++++++++++++---- ...extraction_audio_spectogram_transformer.py | 8 +- 2 files changed, 77 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py index 3c2e46b53bd071..cbc51c1c9a8903 100644 --- a/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py +++ b/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py @@ -38,9 +38,28 @@ def get_audio_spectogram_transformer_config(model_name): config = AudioSpectogramTransformerConfig() - config.num_labels = 527 + if "10-10" in model_name or "speech-commands" in model_name: + pass + elif "12-12" in model_name: + config.time_stride = 12 + config.frequency_stride = 12 + elif "14-14" in model_name: + config.time_stride = 14 + config.frequency_stride = 14 + elif "16-16" in model_name: + config.time_stride = 16 + config.frequency_stride = 16 + else: + raise ValueError("Model not supported") + repo_id = "huggingface/label-files" - filename = "audioset-id2label.json" + if "speech-commands" in model_name: + config.num_labels = 35 + filename = "speech-commands-v2-id2label.json" + else: + config.num_labels = 527 + filename = "audioset-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) id2label = {int(k): v for k, v in id2label.items()} config.id2label = id2label @@ -133,15 +152,41 @@ def remove_keys(state_dict): @torch.no_grad() -def convert_audio_spectogram_transformer_checkpoint( - model_name, checkpoint_url, pytorch_dump_folder_path, push_to_hub=False -): +def convert_audio_spectogram_transformer_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): """ - Copy/paste/tweak model's weights to our YOLOS structure. + Copy/paste/tweak model's weights to our Audio Spectogram Transformer structure. 
""" config = get_audio_spectogram_transformer_config(model_name) + model_name_to_url = { + "audio-spectogram-transformer-finetuned-audioset-10-10-0.4593": ( + "https://www.dropbox.com/s/ca0b1v2nlxzyeb4/audioset_10_10_0.4593.pth?dl=1" + ), + "audio-spectogram-transformer-finetuned-audioset-10-10-0.450": ( + "https://www.dropbox.com/s/1tv0hovue1bxupk/audioset_10_10_0.4495.pth?dl=1" + ), + "audio-spectogram-transformer-finetuned-audioset-10-10-0.448": ( + "https://www.dropbox.com/s/6u5sikl4b9wo4u5/audioset_10_10_0.4483.pth?dl=1" + ), + "audio-spectogram-transformer-finetuned-audioset-10-10-0.448-v2": ( + "https://www.dropbox.com/s/kt6i0v9fvfm1mbq/audioset_10_10_0.4475.pth?dl=1" + ), + "audio-spectogram-transformer-finetuned-audioset-12-12-0.447": ( + "https://www.dropbox.com/s/snfhx3tizr4nuc8/audioset_12_12_0.4467.pth?dl=1" + ), + "audio-spectogram-transformer-finetuned-audioset-14-14-0.443": ( + "https://www.dropbox.com/s/z18s6pemtnxm4k7/audioset_14_14_0.4431.pth?dl=1" + ), + "audio-spectogram-transformer-finetuned-audioset-16-16-0.442": ( + "https://www.dropbox.com/s/mdsa4t1xmcimia6/audioset_16_16_0.4422.pth?dl=1" + ), + "audio-spectogram-transformer-finetuned-speech-commands-v2": ( + "https://www.dropbox.com/s/q0tbqpwv44pquwy/speechcommands_10_10_0.9812.pth?dl=1" + ), + } + # load original state_dict + checkpoint_url = model_name_to_url[model_name] state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") # remove some keys remove_keys(state_dict) @@ -168,7 +213,26 @@ def convert_audio_spectogram_transformer_checkpoint( outputs = model(**inputs) logits = outputs.logits - expected_slice = torch.tensor([-0.8760, -7.0042, -8.6602]) + predicted_class_idx = logits.argmax(-1).item() + + if model_name == "audio-spectogram-transformer-finetuned-audioset-10-10-0.4593": + expected_slice = torch.tensor([-0.8760, -7.0042, -8.6602]) + elif model_name == "audio-spectogram-transformer-finetuned-audioset-10-10-0.450": + expected_slice = torch.tensor([-1.1986, -7.0903, -8.2718]) + elif model_name == "audio-spectogram-transformer-finetuned-audioset-10-10-0.448": + expected_slice = torch.tensor([-2.6128, -8.0080, -9.4344]) + elif model_name == "audio-spectogram-transformer-finetuned-audioset-10-10-0.448-v2": + expected_slice = torch.tensor([-1.5080, -7.4534, -8.8917]) + elif model_name == "audio-spectogram-transformer-finetuned-audioset-12-12-0.447": + expected_slice = torch.tensor([-0.5050, -6.5833, -8.0843]) + elif model_name == "audio-spectogram-transformer-finetuned-audioset-14-14-0.443": + expected_slice = torch.tensor([-0.3826, -7.0336, -8.2413]) + elif model_name == "audio-spectogram-transformer-finetuned-audioset-16-16-0.442": + expected_slice = torch.tensor([-1.2113, -6.9101, -8.3470]) + elif model_name == "audio-spectogram-transformer-finetuned-speech-commands-v2": + expected_slice = torch.tensor([1, 2, 3]) + else: + raise ValueError("Unknown model name") if not torch.allclose(logits[0, :3], expected_slice, atol=1e-4): raise ValueError("Logits don't match") print("Looks ok!") @@ -195,12 +259,6 @@ def convert_audio_spectogram_transformer_checkpoint( type=str, help="Name of the Audio Spectogram Transformer model you'd like to convert.", ) - parser.add_argument( - "--checkpoint_url", - default="https://www.dropbox.com/s/ca0b1v2nlxzyeb4/audioset_10_10_0.4593.pth?dl=1", - type=str, - help="URL of the original state dict (.pth file).", - ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." 
) @@ -209,6 +267,4 @@ def convert_audio_spectogram_transformer_checkpoint( ) args = parser.parse_args() - convert_audio_spectogram_transformer_checkpoint( - args.model_name, args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub - ) + convert_audio_spectogram_transformer_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py index 5a7134ca449c9d..7488b5f23ab731 100644 --- a/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py +++ b/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py @@ -50,9 +50,11 @@ class AudioSpectogramTransformerFeatureExtractor(SequenceFeatureExtractor): do_normalize (`bool`, *optional*, defaults to `True`): Whether or not to apply utterance-level cepstral mean and variance normalization to extracted features. mean (`int`, *optional*, defaults to -4.2677393): - Whether or not to zero-mean normalize the extracted features. - std (`int`, *optional*, defaults to `True`): - Whether or not to unit-variance normalize the extracted features. + Whether or not to normalize the extracted features to a mean of 0. Uses the AudioSet mean by default, + obtained by the authors. + std (`int`, *optional*, defaults to 4.5689974): + Whether or not to normalize the extracted features to a std of 05. Uses the AudioSet std by default, + obtained by the authors. return_attention_mask (`bool`, *optional*, defaults to `False`): Whether or not [`~AudioSpectogramTransformerFeatureExtractor.__call__`] should return `attention_mask`. """ From df8575c316c7ec6336c347ba422cd3de1abe43f3 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 31 Oct 2022 14:12:07 +0100 Subject: [PATCH 14/37] Improve docs and extend conversion script --- docs/source/en/_toctree.yml | 2 ++ .../audio-spectogram-transformer.mdx | 13 +++++++- ...ctogram_transformer_original_to_pytorch.py | 33 ++++++++++++------- 3 files changed, 36 insertions(+), 12 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 6d371c8dc2d1cd..8341682ac3eb5f 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -447,6 +447,8 @@ title: Vision models - isExpanded: false sections: + - local: model_doc/audio_spectogram_transformer + title: Audio Spectogram Transformer - local: model_doc/hubert title: Hubert - local: model_doc/mctct diff --git a/docs/source/en/model_doc/audio-spectogram-transformer.mdx b/docs/source/en/model_doc/audio-spectogram-transformer.mdx index b850da10440377..af82bf2bfdd366 100644 --- a/docs/source/en/model_doc/audio-spectogram-transformer.mdx +++ b/docs/source/en/model_doc/audio-spectogram-transformer.mdx @@ -24,7 +24,18 @@ The abstract from the paper is the following: Tips: -- Todo +- When fine-tuning the Audio Spectogram Transformer (AST) on your own dataset, it's recommended to take care of the input normalization. +The authors normalize the input audio to have a 0 mean and 0.5 std. To use the pretrained model, you should roughly normalize the input to +this range. 
You can check [`ast/src/get_norm_stats.py`](https://github.com/YuanGongND/ast/blob/master/src/get_norm_stats.py) to see how +the authors compute the stats, or you can try using the default mean and std of [`AudioSpectogramTransformerFeatureExtractor`] (which uses the +AudioSet statistics). +- Note that the AST needs a smaller learning rate (the authors use a 10 times smaller learning rate compared to their CNN model proposed in the +[PSLA paper](https://arxiv.org/abs/2102.01243)) and converges faster, so please search the learning rate and learning rate scheduler for your task. + + + + Audio Spectogram Transformer architecture. Taken from the original paper. This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/YuanGongND/ast). diff --git a/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py index cbc51c1c9a8903..cdd3a0e5edae50 100644 --- a/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py +++ b/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py @@ -21,6 +21,7 @@ import torch import torchaudio +from datasets import load_dataset from huggingface_hub import hf_hub_download from transformers import ( @@ -38,8 +39,10 @@ def get_audio_spectogram_transformer_config(model_name): config = AudioSpectogramTransformerConfig() - if "10-10" in model_name or "speech-commands" in model_name: + if "10-10" in model_name: pass + elif "speech-commands" in model_name: + config.time_dimension = 128 elif "12-12" in model_name: config.time_stride = 12 config.frequency_stride = 12 @@ -200,21 +203,29 @@ def convert_audio_spectogram_transformer_checkpoint(model_name, pytorch_dump_fol model.load_state_dict(new_state_dict) # verify outputs on dummy input - filepath = hf_hub_download( - repo_id="nielsr/audio-spectogram-transformer-checkpoint", filename="sample_audio.flac", repo_type="dataset" - ) - feature_extractor = AudioSpectogramTransformerFeatureExtractor() - waveform, _ = torchaudio.load(filepath) - waveform = waveform.squeeze().numpy() + # source: https://github.com/YuanGongND/ast/blob/79e873b8a54d0a3b330dd522584ff2b9926cd581/src/run.py#L62 + mean = -4.2677393 if "speech-commands" not in model_name else -6.845978 + std = 4.5689974 if "speech-commands" not in model_name else 5.5654526 + feature_extractor = AudioSpectogramTransformerFeatureExtractor(mean=mean, std=std) - inputs = feature_extractor(waveform, sampling_rate=16000, padding="max_length", return_tensors="pt") + if "speech-commands" in model_name: + dataset = load_dataset("speech_commands", "v0.02", split="validation") + waveform = dataset[0]["audio"]["array"] + else: + filepath = hf_hub_download( + repo_id="nielsr/audio-spectogram-transformer-checkpoint", filename="sample_audio.flac", repo_type="dataset" + ) + + waveform, _ = torchaudio.load(filepath) + waveform = waveform.squeeze().numpy() + + max_length = 1024 if "speech-commands" not in model_name else 128 + inputs = feature_extractor(waveform, sampling_rate=16000, max_length=max_length, return_tensors="pt") # forward pass outputs = model(**inputs) logits = outputs.logits - predicted_class_idx = logits.argmax(-1).item() - if model_name == "audio-spectogram-transformer-finetuned-audioset-10-10-0.4593": expected_slice = 
torch.tensor([-0.8760, -7.0042, -8.6602]) elif model_name == "audio-spectogram-transformer-finetuned-audioset-10-10-0.450": @@ -230,7 +241,7 @@ def convert_audio_spectogram_transformer_checkpoint(model_name, pytorch_dump_fol elif model_name == "audio-spectogram-transformer-finetuned-audioset-16-16-0.442": expected_slice = torch.tensor([-1.2113, -6.9101, -8.3470]) elif model_name == "audio-spectogram-transformer-finetuned-speech-commands-v2": - expected_slice = torch.tensor([1, 2, 3]) + expected_slice = torch.tensor([ 6.1589, -8.0566, -8.7984]) else: raise ValueError("Unknown model name") if not torch.allclose(logits[0, :3], expected_slice, atol=1e-4): From a08338f09bfeb94d3daec4ff89f841d93a1e4eb3 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 31 Oct 2022 14:23:03 +0100 Subject: [PATCH 15/37] Fix README --- README.md | 2 +- README_es.md | 2 +- README_ko.md | 2 +- README_zh-hans.md | 2 +- README_zh-hant.md | 2 +- docs/source/en/index.mdx | 2 +- ...nfiguration_audio_spectogram_transformer.py | 18 +++++++++--------- ...ectogram_transformer_original_to_pytorch.py | 4 ++-- ..._extraction_audio_spectogram_transformer.py | 2 +- tests/test_modeling_common.py | 2 +- 10 files changed, 19 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index ae4d24726d1806..e7de20c48cc0a2 100644 --- a/README.md +++ b/README.md @@ -262,7 +262,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h ๐Ÿค— Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each them): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[Audio Spectogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectogram-transformer)** (from ) released with the paper []() by . +1. **[Audio Spectogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from ร‰cole polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. 
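The conversion script extended in PATCH 14 above pairs each checkpoint with the normalization statistics it was trained with (AudioSet mean/std by default, Speech Commands values otherwise) before verifying the logits. A minimal, hypothetical sketch of the same inference flow from a user's perspective is given below; it assumes the pre-rename class names used in these patches, the provisional `nielsr/...` hub id that the code still marks as "TODO update to appropriate organization", and a local 16 kHz recording in place of the sample file the script downloads.

```python
import torch
import torchaudio

from transformers import (
    AudioSpectogramTransformerFeatureExtractor,
    AudioSpectogramTransformerForSequenceClassification,
)

# Provisional checkpoint id from this patch series (final organization still TODO).
checkpoint = "nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593"

# Defaults are the AudioSet statistics; pass dataset-specific mean/std when fine-tuning on other data.
feature_extractor = AudioSpectogramTransformerFeatureExtractor(mean=-4.2677393, std=4.5689974)
model = AudioSpectogramTransformerForSequenceClassification.from_pretrained(checkpoint)
model.eval()

# Load a local 16 kHz mono recording (placeholder path) and extract padded mel filter-bank features.
waveform, _ = torchaudio.load("sample_audio.flac")
inputs = feature_extractor(waveform.squeeze().numpy(), sampling_rate=16000, max_length=1024, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

print(model.config.id2label[logits.argmax(-1).item()])
```

For fine-tuning on a new dataset, the same `mean`/`std` constructor arguments are the place to plug in dataset-level statistics, as the model doc's Tips section recommends.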
diff --git a/README_es.md b/README_es.md index c0cdf2eeb26e90..8fc32c2ddc19e8 100644 --- a/README_es.md +++ b/README_es.md @@ -262,7 +262,7 @@ Nรบmero actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt ๐Ÿค— Transformers actualmente proporciona las siguientes arquitecturas (ver [aquรญ](https://huggingface.co/docs/transformers/model_summary) para un resumen de alto nivel de cada uno de ellas.): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[Audio Spectogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectogram-transformer)** (from ) released with the paper []() by . +1. **[Audio Spectogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from ร‰cole polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. diff --git a/README_ko.md b/README_ko.md index 2e2e071773d1f3..3281fc07ee41f8 100644 --- a/README_ko.md +++ b/README_ko.md @@ -212,7 +212,7 @@ Flax, PyTorch, TensorFlow ์„ค์น˜ ํŽ˜์ด์ง€์—์„œ ์ด๋“ค์„ conda๋กœ ์„ค์น˜ํ•˜๋Š” ๐Ÿค— Transformers๋Š” ๋‹ค์Œ ๋ชจ๋ธ๋“ค์„ ์ œ๊ณตํ•ฉ๋‹ˆ๋‹ค (๊ฐ ๋ชจ๋ธ์˜ ์š”์•ฝ์€ [์—ฌ๊ธฐ](https://huggingface.co/docs/transformers/model_summary)์„œ ํ™•์ธํ•˜์„ธ์š”): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[Audio Spectogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectogram-transformer)** (from ) released with the paper []() by . +1. **[Audio Spectogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. 1. 
**[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from ร‰cole polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. diff --git a/README_zh-hans.md b/README_zh-hans.md index 150e881f66d5e1..188a5c16f24ac0 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -236,7 +236,7 @@ conda install -c huggingface transformers ๐Ÿค— Transformers ็›ฎๅ‰ๆ”ฏๆŒๅฆ‚ไธ‹็š„ๆžถๆž„๏ผˆๆจกๅž‹ๆฆ‚่ฟฐ่ฏท้˜…[่ฟ™้‡Œ](https://huggingface.co/docs/transformers/model_summary)๏ผ‰๏ผš 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (ๆฅ่‡ช Google Research and the Toyota Technological Institute at Chicago) ไผด้š่ฎบๆ–‡ [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), ็”ฑ Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut ๅ‘ๅธƒใ€‚ -1. **[Audio Spectogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectogram-transformer)** (from ) released with the paper []() by . +1. **[Audio Spectogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectogram-transformer)** (ๆฅ่‡ช MIT) ไผด้š่ฎบๆ–‡ [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) ็”ฑ Yuan Gong, Yu-An Chung, James Glass ๅ‘ๅธƒใ€‚ 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (ๆฅ่‡ช Facebook) ไผด้š่ฎบๆ–‡ [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) ็”ฑ Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer ๅ‘ๅธƒใ€‚ 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (ๆฅ่‡ช ร‰cole polytechnique) ไผด้š่ฎบๆ–‡ [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) ็”ฑ Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis ๅ‘ๅธƒใ€‚ 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (ๆฅ่‡ช VinAI Research) ไผด้š่ฎบๆ–‡ [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) ็”ฑ Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen ๅ‘ๅธƒใ€‚ diff --git a/README_zh-hant.md b/README_zh-hant.md index 0fba6914e9ec51..259be0df430573 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -248,7 +248,7 @@ conda install -c huggingface transformers ๐Ÿค— Transformers ็›ฎๅ‰ๆ”ฏๆดไปฅไธ‹็š„ๆžถๆง‹๏ผˆๆจกๅž‹ๆฆ‚่ฆฝ่ซ‹ๅƒ้–ฑ[้€™่ฃก](https://huggingface.co/docs/transformers/model_summary)๏ผ‰๏ผš 1. 
**[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[Audio Spectogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectogram-transformer)** (from ) released with the paper []() by . +1. **[Audio Spectogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from ร‰cole polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 530011ee2cc97d..867c39247f41ec 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -50,7 +50,7 @@ The documentation is organized into five sections: 1. **[ALBERT](model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[Audio Spectogram Transformer](model_doc/audio-spectogram-transformer)** (from ) released with the paper []() by . +1. **[Audio Spectogram Transformer](model_doc/audio-spectogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. 1. **[BART](model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](model_doc/barthez)** (from ร‰cole polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. **[BARTpho](model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. 
diff --git a/src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py b/src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py index 3dfa57342fd31a..6082e1a4cb71f6 100644 --- a/src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py +++ b/src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py @@ -22,7 +22,8 @@ logger = logging.get_logger(__name__) AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "MIT/ast-10-10": "https://huggingface.co/MIT/ast-10-10/resolve/main/config.json", + # TODO update to appropriate organization + "nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593": "https://huggingface.co/nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593/resolve/main/config.json", } @@ -32,13 +33,12 @@ class AudioSpectogramTransformerConfig(PretrainedConfig): instantiate an AudioSpectogramTransformer model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the AudioSpectogramTransformer - [google/audio_spectogram_transformer-base-patch16-224](https://huggingface.co/google/audio_spectogram_transformer-base-patch16-224) + [nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593](https://huggingface.co/nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. - Args: hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. @@ -64,13 +64,13 @@ class AudioSpectogramTransformerConfig(PretrainedConfig): qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. frequency_stride (`int`, *optional*, defaults to 10): - ... + Frequency stride to use when patchifying the spectograms. time_stride (`int`, *optional*, defaults to 10): - ... - input_fdim (`int`, *optional*, defaults to 128): - ... - input_tdim (`int`, *optional*, defaults to 1024): - ... + Temporal stride to use when patchifying the spectograms. + time_dimension (`int`, *optional*, defaults to 1024): + Temporal dimension of the spectrograms. + frequency_dimension (`int`, *optional*, defaults to 128): + Frequency dimension of the spectrograms. 
Example: diff --git a/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py index cdd3a0e5edae50..e79cbd0727b891 100644 --- a/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py +++ b/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py @@ -215,7 +215,7 @@ def convert_audio_spectogram_transformer_checkpoint(model_name, pytorch_dump_fol filepath = hf_hub_download( repo_id="nielsr/audio-spectogram-transformer-checkpoint", filename="sample_audio.flac", repo_type="dataset" ) - + waveform, _ = torchaudio.load(filepath) waveform = waveform.squeeze().numpy() @@ -241,7 +241,7 @@ def convert_audio_spectogram_transformer_checkpoint(model_name, pytorch_dump_fol elif model_name == "audio-spectogram-transformer-finetuned-audioset-16-16-0.442": expected_slice = torch.tensor([-1.2113, -6.9101, -8.3470]) elif model_name == "audio-spectogram-transformer-finetuned-speech-commands-v2": - expected_slice = torch.tensor([ 6.1589, -8.0566, -8.7984]) + expected_slice = torch.tensor([6.1589, -8.0566, -8.7984]) else: raise ValueError("Unknown model name") if not torch.allclose(logits[0, :3], expected_slice, atol=1e-4): diff --git a/tests/models/audio_spectogram_transformer/test_feature_extraction_audio_spectogram_transformer.py b/tests/models/audio_spectogram_transformer/test_feature_extraction_audio_spectogram_transformer.py index 0216c1d8914fce..1220b5299e5667 100644 --- a/tests/models/audio_spectogram_transformer/test_feature_extraction_audio_spectogram_transformer.py +++ b/tests/models/audio_spectogram_transformer/test_feature_extraction_audio_spectogram_transformer.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2021 HuggingFace Inc. +# Copyright 2022 HuggingFace Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
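The `frequency_stride`, `time_stride`, `time_dimension` and `frequency_dimension` parameters documented in the configuration above determine how many patches the spectrogram is cut into, and therefore the encoder's sequence length. A rough sanity check with the default values, assuming the 16x16 ViT-style patch-embedding kernel (the kernel size itself lives in the modeling file, not in this docstring), reproduces the sequence length of 1214 in the `[1, 1214, 768]` expected output shape that appears later in the modeling code:

```python
# Back-of-the-envelope check of the sequence length implied by the default configuration.
# Assumes a 16x16 patch-embedding kernel; the strides (10, 10) make the patches overlap.
patch_size = 16
frequency_stride, time_stride = 10, 10
frequency_dimension, time_dimension = 128, 1024

frequency_patches = (frequency_dimension - patch_size) // frequency_stride + 1  # 12
time_patches = (time_dimension - patch_size) // time_stride + 1  # 101

num_patches = frequency_patches * time_patches  # 1212 overlapping patches
sequence_length = num_patches + 2  # plus the CLS and distillation tokens -> 1214
print(sequence_length)
```

With `frequency_stride = time_stride = 12`, as set for the "12-12" checkpoints, the same arithmetic yields fewer (and less overlapping) patches, which is why the conversion script adjusts the configuration per model name.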
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index a5746836bb4d8a..2caba1058836a7 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -44,7 +44,6 @@ logging, ) from transformers.models.auto import get_values -from transformers.models.auto.modeling_auto import MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING from transformers.testing_utils import ( TOKEN, USER, @@ -92,6 +91,7 @@ from test_module.custom_modeling import CustomModel, NoSuperInitModel from transformers import ( BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING, MODEL_FOR_AUDIO_XVECTOR_MAPPING, MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING, From 162853626c94bdc65ba6b77d76d5535ac72f2a21 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Nov 2022 16:45:26 +0100 Subject: [PATCH 16/37] Rename spectogram to spectrogram --- docs/source/en/_toctree.yml | 4 +- ....mdx => audio-spectrogram-transformer.mdx} | 28 ++-- src/transformers/__init__.py | 36 ++--- src/transformers/models/__init__.py | 2 +- .../__init__.py | 42 ++--- ...guration_audio_spectrogram_transformer.py} | 32 ++-- ...rogram_transformer_original_to_pytorch.py} | 82 +++++----- ...traction_audio_spectrogram_transformer.py} | 12 +- ...modeling_audio_spectrogram_transformer.py} | 143 +++++++++--------- .../test.py | 0 .../models/auto/configuration_auto.py | 6 +- .../models/auto/feature_extraction_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 4 +- .../__init__.py | 0 ...traction_audio_spectrogram_transformer.py} | 10 +- ...modeling_audio_spectrogram_transformer.py} | 48 +++--- 16 files changed, 227 insertions(+), 224 deletions(-) rename docs/source/en/model_doc/{audio-spectogram-transformer.mdx => audio-spectrogram-transformer.mdx} (72%) rename src/transformers/models/{audio_spectogram_transformer => audio_spectrogram_transformer}/__init__.py (57%) rename src/transformers/models/{audio_spectogram_transformer/configuration_audio_spectogram_transformer.py => audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py} (77%) rename src/transformers/models/{audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py => audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py} (71%) rename src/transformers/models/{audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py => audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py} (93%) rename src/transformers/models/{audio_spectogram_transformer/modeling_audio_spectogram_transformer.py => audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py} (81%) rename src/transformers/models/{audio_spectogram_transformer => audio_spectrogram_transformer}/test.py (100%) rename tests/models/{audio_spectogram_transformer => audio_spectrogram_transformer}/__init__.py (100%) rename tests/models/{audio_spectogram_transformer/test_feature_extraction_audio_spectogram_transformer.py => audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py} (91%) rename tests/models/{audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py => audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py} (80%) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 8341682ac3eb5f..79bb634e8debc9 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -447,8 +447,8 @@ title: Vision models - isExpanded: 
false
      sections:
-      - local: model_doc/audio_spectogram_transformer
-        title: Audio Spectogram Transformer
+      - local: model_doc/audio-spectrogram-transformer
+        title: Audio Spectrogram Transformer
      - local: model_doc/hubert
        title: Hubert
      - local: model_doc/mctct
diff --git a/docs/source/en/model_doc/audio-spectogram-transformer.mdx b/docs/source/en/model_doc/audio-spectrogram-transformer.mdx
similarity index 72%
rename from docs/source/en/model_doc/audio-spectogram-transformer.mdx
rename to docs/source/en/model_doc/audio-spectrogram-transformer.mdx
index af82bf2bfdd366..5d8bee23231895 100644
--- a/docs/source/en/model_doc/audio-spectogram-transformer.mdx
+++ b/docs/source/en/model_doc/audio-spectrogram-transformer.mdx
@@ -10,12 +10,12 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Audio Spectogram Transformer
+# Audio Spectrogram Transformer

 ## Overview

-The Audio Spectogram Transformer model was proposed in [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
-The Audio Spectogram Transformer applies a [Vision Transformer](vit) to audio, by turning audio into an image (spectogram). The model obtains state-of-the-art results
+The Audio Spectrogram Transformer model was proposed in [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
+The Audio Spectrogram Transformer applies a [Vision Transformer](vit) to audio, by turning audio into an image (spectrogram). The model obtains state-of-the-art results
 for audio classification.

 The abstract from the paper is the following:
@@ -24,10 +24,10 @@ The abstract from the paper is the following:

 Tips:

-- When fine-tuning the Audio Spectogram Transformer (AST) on your own dataset, it's recommended to take care of the input normalization.
+- When fine-tuning the Audio Spectrogram Transformer (AST) on your own dataset, it's recommended to take care of the input normalization.
 The authors normalize the input audio to have a 0 mean and 0.5 std. To use the pretrained model, you should roughly normalize the input to
 this range. You can check [`ast/src/get_norm_stats.py`](https://github.com/YuanGongND/ast/blob/master/src/get_norm_stats.py) to see how
-the authors compute the stats, or you can try using the default mean and std of [`AudioSpectogramTransformerFeatureExtractor`] (which uses the
+the authors compute the stats, or you can try using the default mean and std of [`AudioSpectrogramTransformerFeatureExtractor`] (which uses the
 AudioSet statistics).
 - Note that the AST needs a smaller learning rate (the authors use a 10 times smaller learning rate compared to their CNN model proposed in the
 [PSLA paper](https://arxiv.org/abs/2102.01243)) and converges faster, so please search the learning rate and learning rate scheduler for your task.
@@ -35,27 +35,27 @@
 drawing
- Audio Spectogram Transformer architecture. Taken from the original paper.
+ Audio Spectrogram Transformer architecture. Taken from the original paper.

 This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/YuanGongND/ast).
-## AudioSpectogramTransformerConfig +## AudioSpectrogramTransformerConfig -[[autodoc]] AudioSpectogramTransformerConfig +[[autodoc]] AudioSpectrogramTransformerConfig -## AudioSpectogramTransformerFeatureExtractor +## AudioSpectrogramTransformerFeatureExtractor -[[autodoc]] AudioSpectogramTransformerFeatureExtractor +[[autodoc]] AudioSpectrogramTransformerFeatureExtractor - __call__ -## AudioSpectogramTransformerModel +## AudioSpectrogramTransformerModel -[[autodoc]] AudioSpectogramTransformerModel +[[autodoc]] AudioSpectrogramTransformerModel - forward -## AudioSpectogramTransformerForSequenceClassification +## AudioSpectrogramTransformerForSequenceClassification -[[autodoc]] AudioSpectogramTransformerForSequenceClassification +[[autodoc]] AudioSpectrogramTransformerForSequenceClassification - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index fdc7093ec3f937..1e4cbebcbfbe9a 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -122,9 +122,9 @@ "models": [], # Models "models.albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig"], - "models.audio_spectogram_transformer": [ - "AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", - "AudioSpectogramTransformerConfig", + "models.audio_spectrogram_transformer": [ + "AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", + "AudioSpectrogramTransformerConfig", ], "models.auto": [ "ALL_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -677,7 +677,7 @@ name for name in dir(dummy_speech_objects) if not name.startswith("_") ] else: - _import_structure["models.audio_spectogram_transformer"].append("AudioSpectogramTransformerFeatureExtractor") + _import_structure["models.audio_spectrogram_transformer"].append("AudioSpectrogramTransformerFeatureExtractor") _import_structure["models.mctct"].append("MCTCTFeatureExtractor") _import_structure["models.speech_to_text"].append("Speech2TextFeatureExtractor") @@ -2165,12 +2165,12 @@ "ViTPreTrainedModel", ] ) - _import_structure["models.audio_spectogram_transformer"].extend( + _import_structure["models.audio_spectrogram_transformer"].extend( [ - "AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", - "AudioSpectogramTransformerModel", - "AudioSpectogramTransformerPreTrainedModel", - "AudioSpectogramTransformerForSequenceClassification", + "AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "AudioSpectrogramTransformerModel", + "AudioSpectrogramTransformerPreTrainedModel", + "AudioSpectrogramTransformerForSequenceClassification", ] ) _import_structure["models.vit_mae"].extend( @@ -3315,9 +3315,9 @@ load_tf2_weights_in_pytorch_model, ) from .models.albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig - from .models.audio_spectogram_transformer import ( - AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, - AudioSpectogramTransformerConfig, + from .models.audio_spectrogram_transformer import ( + AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, + AudioSpectrogramTransformerConfig, ) from .models.auto import ( ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -3816,7 +3816,7 @@ except OptionalDependencyNotAvailable: from .utils.dummy_speech_objects import * else: - from .models.audio_spectogram_transformer import AudioSpectogramTransformerFeatureExtractor + from .models.audio_spectrogram_transformer import AudioSpectrogramTransformerFeatureExtractor from .models.mctct import MCTCTFeatureExtractor from .models.speech_to_text import Speech2TextFeatureExtractor @@ -3973,11 
+3973,11 @@ AlbertPreTrainedModel, load_tf_weights_in_albert, ) - from .models.audio_spectogram_transformer import ( - AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, - AudioSpectogramTransformerForSequenceClassification, - AudioSpectogramTransformerModel, - AudioSpectogramTransformerPreTrainedModel, + from .models.audio_spectrogram_transformer import ( + AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + AudioSpectrogramTransformerForSequenceClassification, + AudioSpectrogramTransformerModel, + AudioSpectrogramTransformerPreTrainedModel, ) from .models.auto import ( MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 0035b8d6eb0862..d49d6699e070b5 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -18,7 +18,7 @@ from . import ( albert, - audio_spectogram_transformer, + audio_spectrogram_transformer, auto, bart, barthez, diff --git a/src/transformers/models/audio_spectogram_transformer/__init__.py b/src/transformers/models/audio_spectrogram_transformer/__init__.py similarity index 57% rename from src/transformers/models/audio_spectogram_transformer/__init__.py rename to src/transformers/models/audio_spectrogram_transformer/__init__.py index 8bae67516da2ab..6fe7b0ff2f26fd 100644 --- a/src/transformers/models/audio_spectogram_transformer/__init__.py +++ b/src/transformers/models/audio_spectrogram_transformer/__init__.py @@ -21,10 +21,10 @@ _import_structure = { - "configuration_audio_spectogram_transformer": [ - "AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", - "AudioSpectogramTransformerConfig", - "AudioSpectogramTransformerOnnxConfig", + "configuration_audio_spectrogram_transformer": [ + "AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", + "AudioSpectrogramTransformerConfig", + "AudioSpectrogramTransformerOnnxConfig", ] } @@ -34,8 +34,8 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["feature_extraction_audio_spectogram_transformer"] = [ - "AudioSpectogramTransformerFeatureExtractor" + _import_structure["feature_extraction_audio_spectrogram_transformer"] = [ + "AudioSpectrogramTransformerFeatureExtractor" ] try: @@ -44,18 +44,18 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["modeling_audio_spectogram_transformer"] = [ - "AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", - "AudioSpectogramTransformerModel", - "AudioSpectogramTransformerPreTrainedModel", - "AudioSpectogramTransformerForSequenceClassification", + _import_structure["modeling_audio_spectrogram_transformer"] = [ + "AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "AudioSpectrogramTransformerModel", + "AudioSpectrogramTransformerPreTrainedModel", + "AudioSpectrogramTransformerForSequenceClassification", ] if TYPE_CHECKING: - from .configuration_audio_spectogram_transformer import ( - AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, - AudioSpectogramTransformerConfig, - AudioSpectogramTransformerOnnxConfig, + from .configuration_audio_spectrogram_transformer import ( + AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, + AudioSpectrogramTransformerConfig, + AudioSpectrogramTransformerOnnxConfig, ) try: @@ -64,7 +64,7 @@ except OptionalDependencyNotAvailable: pass else: - from .feature_extraction_audio_spectogram_transformer import AudioSpectogramTransformerFeatureExtractor + from .feature_extraction_audio_spectrogram_transformer import 
AudioSpectrogramTransformerFeatureExtractor try: if not is_torch_available(): @@ -72,11 +72,11 @@ except OptionalDependencyNotAvailable: pass else: - from .modeling_audio_spectogram_transformer import ( - AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, - AudioSpectogramTransformerForSequenceClassification, - AudioSpectogramTransformerModel, - AudioSpectogramTransformerPreTrainedModel, + from .modeling_audio_spectrogram_transformer import ( + AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + AudioSpectrogramTransformerForSequenceClassification, + AudioSpectrogramTransformerModel, + AudioSpectrogramTransformerPreTrainedModel, ) else: diff --git a/src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py similarity index 77% rename from src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py rename to src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py index 6082e1a4cb71f6..d676c3d7304a97 100644 --- a/src/transformers/models/audio_spectogram_transformer/configuration_audio_spectogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" AudioSpectogramTransformer model configuration""" +""" AudioSpectrogramTransformer model configuration""" from ...configuration_utils import PretrainedConfig @@ -21,19 +21,19 @@ logger = logging.get_logger(__name__) -AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { +AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { # TODO update to appropriate organization - "nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593": "https://huggingface.co/nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593/resolve/main/config.json", + "nielsr/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593": "https://huggingface.co/nielsr/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593/resolve/main/config.json", } -class AudioSpectogramTransformerConfig(PretrainedConfig): +class AudioSpectrogramTransformerConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`AudioSpectogramTransformerModel`]. It is used to - instantiate an AudioSpectogramTransformer model according to the specified arguments, defining the model + This is the configuration class to store the configuration of a [`AudioSpectrogramTransformerModel`]. It is used to + instantiate an AudioSpectrogramTransformer model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the - AudioSpectogramTransformer - [nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593](https://huggingface.co/nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593) + AudioSpectrogramTransformer + [nielsr/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593](https://huggingface.co/nielsr/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the @@ -64,9 +64,9 @@ class AudioSpectogramTransformerConfig(PretrainedConfig): qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. frequency_stride (`int`, *optional*, defaults to 10): - Frequency stride to use when patchifying the spectograms. + Frequency stride to use when patchifying the spectrograms. time_stride (`int`, *optional*, defaults to 10): - Temporal stride to use when patchifying the spectograms. + Temporal stride to use when patchifying the spectrograms. time_dimension (`int`, *optional*, defaults to 1024): Temporal dimension of the spectrograms. frequency_dimension (`int`, *optional*, defaults to 128): @@ -75,18 +75,18 @@ class AudioSpectogramTransformerConfig(PretrainedConfig): Example: ```python - >>> from transformers import AudioSpectogramTransformerModel, AudioSpectogramTransformerConfig + >>> from transformers import AudioSpectrogramTransformerModel, AudioSpectrogramTransformerConfig - >>> # Initializing a AudioSpectogramTransformer audio_spectogram_transformer-base-patch16-224 style configuration - >>> configuration = AudioSpectogramTransformerConfig() + >>> # Initializing a AudioSpectrogramTransformer audio_spectrogram_transformer-base-patch16-224 style configuration + >>> configuration = AudioSpectrogramTransformerConfig() - >>> # Initializing a model from the audio_spectogram_transformer-base-patch16-224 style configuration - >>> model = AudioSpectogramTransformerModel(configuration) + >>> # Initializing a model from the audio_spectrogram_transformer-base-patch16-224 style configuration + >>> model = AudioSpectrogramTransformerModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config ```""" - model_type = "audio-spectogram-transformer" + model_type = "audio-spectrogram-transformer" def __init__( self, diff --git a/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py similarity index 71% rename from src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py rename to src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py index e79cbd0727b891..3d025eee56f2dc 100644 --- a/src/transformers/models/audio_spectogram_transformer/convert_audio_spectogram_transformer_original_to_pytorch.py +++ b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Convert Audio Spectogram Transformer checkpoints from the original repository. URL: https://github.com/YuanGongND/ast""" +"""Convert Audio Spectrogram Transformer checkpoints from the original repository. 
URL: https://github.com/YuanGongND/ast""" import argparse @@ -25,9 +25,9 @@ from huggingface_hub import hf_hub_download from transformers import ( - AudioSpectogramTransformerConfig, - AudioSpectogramTransformerFeatureExtractor, - AudioSpectogramTransformerForSequenceClassification, + AudioSpectrogramTransformerConfig, + AudioSpectrogramTransformerFeatureExtractor, + AudioSpectrogramTransformerForSequenceClassification, ) from transformers.utils import logging @@ -36,8 +36,8 @@ logger = logging.get_logger(__name__) -def get_audio_spectogram_transformer_config(model_name): - config = AudioSpectogramTransformerConfig() +def get_audio_spectrogram_transformer_config(model_name): + config = AudioSpectrogramTransformerConfig() if "10-10" in model_name: pass @@ -73,7 +73,7 @@ def get_audio_spectogram_transformer_config(model_name): def rename_key(name): if "module.v" in name: - name = name.replace("module.v", "audio_spectogram_transformer") + name = name.replace("module.v", "audio_spectrogram_transformer") if "cls_token" in name: name = name.replace("cls_token", "embeddings.cls_token") if "dist_token" in name: @@ -98,8 +98,8 @@ def rename_key(name): if "mlp.fc2" in name: name = name.replace("mlp.fc2", "output.dense") # final layernorm - if "audio_spectogram_transformer.norm" in name: - name = name.replace("audio_spectogram_transformer.norm", "audio_spectogram_transformer.layernorm") + if "audio_spectrogram_transformer.norm" in name: + name = name.replace("audio_spectrogram_transformer.norm", "audio_spectrogram_transformer.layernorm") # classifier head if "module.mlp_head.0" in name: name = name.replace("module.mlp_head.0", "layernorm") @@ -119,23 +119,23 @@ def convert_state_dict(orig_state_dict, config): dim = config.hidden_size if "weight" in key: orig_state_dict[ - f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.query.weight" + f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.query.weight" ] = val[:dim, :] orig_state_dict[ - f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.key.weight" + f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.key.weight" ] = val[dim : dim * 2, :] orig_state_dict[ - f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.value.weight" + f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.value.weight" ] = val[-dim:, :] else: orig_state_dict[ - f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.query.bias" + f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.query.bias" ] = val[:dim] orig_state_dict[ - f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.key.bias" + f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.key.bias" ] = val[dim : dim * 2] orig_state_dict[ - f"audio_spectogram_transformer.encoder.layer.{layer_num}.attention.attention.value.bias" + f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.value.bias" ] = val[-dim:] else: orig_state_dict[rename_key(key)] = val @@ -155,35 +155,35 @@ def remove_keys(state_dict): @torch.no_grad() -def convert_audio_spectogram_transformer_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): +def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): """ - Copy/paste/tweak model's weights to our Audio Spectogram Transformer structure. 
+ Copy/paste/tweak model's weights to our Audio Spectrogram Transformer structure. """ - config = get_audio_spectogram_transformer_config(model_name) + config = get_audio_spectrogram_transformer_config(model_name) model_name_to_url = { - "audio-spectogram-transformer-finetuned-audioset-10-10-0.4593": ( + "audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593": ( "https://www.dropbox.com/s/ca0b1v2nlxzyeb4/audioset_10_10_0.4593.pth?dl=1" ), - "audio-spectogram-transformer-finetuned-audioset-10-10-0.450": ( + "audio-spectrogram-transformer-finetuned-audioset-10-10-0.450": ( "https://www.dropbox.com/s/1tv0hovue1bxupk/audioset_10_10_0.4495.pth?dl=1" ), - "audio-spectogram-transformer-finetuned-audioset-10-10-0.448": ( + "audio-spectrogram-transformer-finetuned-audioset-10-10-0.448": ( "https://www.dropbox.com/s/6u5sikl4b9wo4u5/audioset_10_10_0.4483.pth?dl=1" ), - "audio-spectogram-transformer-finetuned-audioset-10-10-0.448-v2": ( + "audio-spectrogram-transformer-finetuned-audioset-10-10-0.448-v2": ( "https://www.dropbox.com/s/kt6i0v9fvfm1mbq/audioset_10_10_0.4475.pth?dl=1" ), - "audio-spectogram-transformer-finetuned-audioset-12-12-0.447": ( + "audio-spectrogram-transformer-finetuned-audioset-12-12-0.447": ( "https://www.dropbox.com/s/snfhx3tizr4nuc8/audioset_12_12_0.4467.pth?dl=1" ), - "audio-spectogram-transformer-finetuned-audioset-14-14-0.443": ( + "audio-spectrogram-transformer-finetuned-audioset-14-14-0.443": ( "https://www.dropbox.com/s/z18s6pemtnxm4k7/audioset_14_14_0.4431.pth?dl=1" ), - "audio-spectogram-transformer-finetuned-audioset-16-16-0.442": ( + "audio-spectrogram-transformer-finetuned-audioset-16-16-0.442": ( "https://www.dropbox.com/s/mdsa4t1xmcimia6/audioset_16_16_0.4422.pth?dl=1" ), - "audio-spectogram-transformer-finetuned-speech-commands-v2": ( + "audio-spectrogram-transformer-finetuned-speech-commands-v2": ( "https://www.dropbox.com/s/q0tbqpwv44pquwy/speechcommands_10_10_0.9812.pth?dl=1" ), } @@ -197,7 +197,7 @@ def convert_audio_spectogram_transformer_checkpoint(model_name, pytorch_dump_fol new_state_dict = convert_state_dict(state_dict, config) # load ๐Ÿค— model - model = AudioSpectogramTransformerForSequenceClassification(config) + model = AudioSpectrogramTransformerForSequenceClassification(config) model.eval() model.load_state_dict(new_state_dict) @@ -206,14 +206,16 @@ def convert_audio_spectogram_transformer_checkpoint(model_name, pytorch_dump_fol # source: https://github.com/YuanGongND/ast/blob/79e873b8a54d0a3b330dd522584ff2b9926cd581/src/run.py#L62 mean = -4.2677393 if "speech-commands" not in model_name else -6.845978 std = 4.5689974 if "speech-commands" not in model_name else 5.5654526 - feature_extractor = AudioSpectogramTransformerFeatureExtractor(mean=mean, std=std) + feature_extractor = AudioSpectrogramTransformerFeatureExtractor(mean=mean, std=std) if "speech-commands" in model_name: dataset = load_dataset("speech_commands", "v0.02", split="validation") waveform = dataset[0]["audio"]["array"] else: filepath = hf_hub_download( - repo_id="nielsr/audio-spectogram-transformer-checkpoint", filename="sample_audio.flac", repo_type="dataset" + repo_id="nielsr/audio-spectogram-transformer-checkpoint", + filename="sample_audio.flac", + repo_type="dataset", ) waveform, _ = torchaudio.load(filepath) @@ -226,21 +228,21 @@ def convert_audio_spectogram_transformer_checkpoint(model_name, pytorch_dump_fol outputs = model(**inputs) logits = outputs.logits - if model_name == "audio-spectogram-transformer-finetuned-audioset-10-10-0.4593": + if model_name == 
"audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593": expected_slice = torch.tensor([-0.8760, -7.0042, -8.6602]) - elif model_name == "audio-spectogram-transformer-finetuned-audioset-10-10-0.450": + elif model_name == "audio-spectrogram-transformer-finetuned-audioset-10-10-0.450": expected_slice = torch.tensor([-1.1986, -7.0903, -8.2718]) - elif model_name == "audio-spectogram-transformer-finetuned-audioset-10-10-0.448": + elif model_name == "audio-spectrogram-transformer-finetuned-audioset-10-10-0.448": expected_slice = torch.tensor([-2.6128, -8.0080, -9.4344]) - elif model_name == "audio-spectogram-transformer-finetuned-audioset-10-10-0.448-v2": + elif model_name == "audio-spectrogram-transformer-finetuned-audioset-10-10-0.448-v2": expected_slice = torch.tensor([-1.5080, -7.4534, -8.8917]) - elif model_name == "audio-spectogram-transformer-finetuned-audioset-12-12-0.447": + elif model_name == "audio-spectrogram-transformer-finetuned-audioset-12-12-0.447": expected_slice = torch.tensor([-0.5050, -6.5833, -8.0843]) - elif model_name == "audio-spectogram-transformer-finetuned-audioset-14-14-0.443": + elif model_name == "audio-spectrogram-transformer-finetuned-audioset-14-14-0.443": expected_slice = torch.tensor([-0.3826, -7.0336, -8.2413]) - elif model_name == "audio-spectogram-transformer-finetuned-audioset-16-16-0.442": + elif model_name == "audio-spectrogram-transformer-finetuned-audioset-16-16-0.442": expected_slice = torch.tensor([-1.2113, -6.9101, -8.3470]) - elif model_name == "audio-spectogram-transformer-finetuned-speech-commands-v2": + elif model_name == "audio-spectrogram-transformer-finetuned-speech-commands-v2": expected_slice = torch.tensor([6.1589, -8.0566, -8.7984]) else: raise ValueError("Unknown model name") @@ -266,9 +268,9 @@ def convert_audio_spectogram_transformer_checkpoint(model_name, pytorch_dump_fol # Required parameters parser.add_argument( "--model_name", - default="audio-spectogram-transformer-finetuned-audioset-10-10-0.4593", + default="audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593", type=str, - help="Name of the Audio Spectogram Transformer model you'd like to convert.", + help="Name of the Audio Spectrogram Transformer model you'd like to convert.", ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." 
@@ -278,4 +280,4 @@ def convert_audio_spectogram_transformer_checkpoint(
     )
     args = parser.parse_args()

-    convert_audio_spectogram_transformer_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
+    convert_audio_spectrogram_transformer_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
diff --git a/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
similarity index 93%
rename from src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py
rename to src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
index 7488b5f23ab731..0307fd1b41a180 100644
--- a/src/transformers/models/audio_spectogram_transformer/feature_extraction_audio_spectogram_transformer.py
+++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Feature extractor class for Audio Spectogram Transformer.
+Feature extractor class for Audio Spectrogram Transformer.
 """

 from typing import List, Optional, Union
@@ -30,12 +30,12 @@
 logger = logging.get_logger(__name__)

-class AudioSpectogramTransformerFeatureExtractor(SequenceFeatureExtractor):
+class AudioSpectrogramTransformerFeatureExtractor(SequenceFeatureExtractor):
     r"""
-    Constructs a Audio Spectogram Transformer feature extractor.
+    Constructs an Audio Spectrogram Transformer feature extractor.

-    This feature extractor inherits from [`AudioSpectogramTransformerFeatureExtractor`] which contains most of the main
-    methods. Users should refer to this superclass for more information regarding those methods.
+    This feature extractor inherits from [`SequenceFeatureExtractor`] which contains most of the
+    main methods. Users should refer to this superclass for more information regarding those methods.

     This class extracts mel-filter bank features from raw speech using TorchAudio and applies utterance-level cepstral
     mean and variance normalization to the extracted features.
@@ -56,7 +56,7 @@ class AudioSpectogramTransformerFeatureExtractor(SequenceFeatureExtractor):
         Whether or not to normalize the extracted features to a std of 0.5. Uses the AudioSet std by default,
         obtained by the authors.
     return_attention_mask (`bool`, *optional*, defaults to `False`):
-        Whether or not [`~AudioSpectogramTransformerFeatureExtractor.__call__`] should return `attention_mask`.
+        Whether or not [`~AudioSpectrogramTransformerFeatureExtractor.__call__`] should return `attention_mask`.
""" model_input_names = ["input_values", "attention_mask"] diff --git a/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py similarity index 81% rename from src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py rename to src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index 9fad0ce82c63db..9cf750d41d1985 100644 --- a/src/transformers/models/audio_spectogram_transformer/modeling_audio_spectogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Audio Spectogram Transformer model.""" +""" PyTorch Audio Spectrogram Transformer model.""" import math from typing import Dict, List, Optional, Set, Tuple, Union @@ -27,45 +27,45 @@ from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging -from .configuration_audio_spectogram_transformer import AudioSpectogramTransformerConfig +from .configuration_audio_spectrogram_transformer import AudioSpectrogramTransformerConfig logger = logging.get_logger(__name__) # General docstring -_CONFIG_FOR_DOC = "AudioSpectogramTransformerConfig" -_FEAT_EXTRACTOR_FOR_DOC = "AudioSpectogramTransformerFeatureExtractor" +_CONFIG_FOR_DOC = "AudioSpectrogramTransformerConfig" +_FEAT_EXTRACTOR_FOR_DOC = "AudioSpectrogramTransformerFeatureExtractor" # Base docstring # TODO update to appropriate organization -_CHECKPOINT_FOR_DOC = "nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593" +_CHECKPOINT_FOR_DOC = "nielsr/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593" _EXPECTED_OUTPUT_SHAPE = [1, 1214, 768] # Audio classification docstring # TODO update to appropriate organization -_SEQ_CLASS_CHECKPOINT = "nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593" +_SEQ_CLASS_CHECKPOINT = "nielsr/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593" _SEQ_CLASS_EXPECTED_OUTPUT = "'Speech'" _SEQ_CLASS_EXPECTED_LOSS = 0.17 -AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ +AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ # TODO update to appropriate organization - "nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593", - # See all Audio Spectogram Transformer models at https://huggingface.co/models?filter=audio-spectogram-transformer + "nielsr/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593", + # See all Audio Spectrogram Transformer models at https://huggingface.co/models?filter=audio-spectrogram-transformer ] -class AudioSpectogramTransformerEmbeddings(nn.Module): +class AudioSpectrogramTransformerEmbeddings(nn.Module): """ Construct the CLS token, position and patch embeddings. 
""" - def __init__(self, config: AudioSpectogramTransformerConfig) -> None: + def __init__(self, config: AudioSpectrogramTransformerConfig) -> None: super().__init__() self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) self.distillation_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) - self.patch_embeddings = AudioSpectogramTransformerPatchEmbeddings(config) + self.patch_embeddings = AudioSpectrogramTransformerPatchEmbeddings(config) frequency_dimension, time_dimension = self.get_shape(config) num_patches = frequency_dimension * time_dimension @@ -103,7 +103,7 @@ def forward(self, input_values: torch.Tensor) -> torch.Tensor: return embeddings -class AudioSpectogramTransformerPatchEmbeddings(nn.Module): +class AudioSpectrogramTransformerPatchEmbeddings(nn.Module): """ This class turns `input_values` of shape `(batch_size, num_channels, height, width)` into the initial `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a @@ -128,9 +128,9 @@ def forward(self, input_values: torch.Tensor) -> torch.Tensor: return embeddings -# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->AudioSpectogramTransformer -class AudioSpectogramTransformerSelfAttention(nn.Module): - def __init__(self, config: AudioSpectogramTransformerConfig) -> None: +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->AudioSpectrogramTransformer +class AudioSpectrogramTransformerSelfAttention(nn.Module): + def __init__(self, config: AudioSpectrogramTransformerConfig) -> None: super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -189,14 +189,14 @@ def forward( return outputs -# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->AudioSpectogramTransformer -class AudioSpectogramTransformerSelfOutput(nn.Module): +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->AudioSpectrogramTransformer +class AudioSpectrogramTransformerSelfOutput(nn.Module): """ - The residual connection is defined in AudioSpectogramTransformerLayer instead of here (as is the case with other + The residual connection is defined in AudioSpectrogramTransformerLayer instead of here (as is the case with other models), due to the layernorm applied before each block. 
""" - def __init__(self, config: AudioSpectogramTransformerConfig) -> None: + def __init__(self, config: AudioSpectrogramTransformerConfig) -> None: super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.dropout = nn.Dropout(config.hidden_dropout_prob) @@ -209,12 +209,12 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->AudioSpectogramTransformer -class AudioSpectogramTransformerAttention(nn.Module): - def __init__(self, config: AudioSpectogramTransformerConfig) -> None: +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->AudioSpectrogramTransformer +class AudioSpectrogramTransformerAttention(nn.Module): + def __init__(self, config: AudioSpectrogramTransformerConfig) -> None: super().__init__() - self.attention = AudioSpectogramTransformerSelfAttention(config) - self.output = AudioSpectogramTransformerSelfOutput(config) + self.attention = AudioSpectrogramTransformerSelfAttention(config) + self.output = AudioSpectrogramTransformerSelfOutput(config) self.pruned_heads = set() def prune_heads(self, heads: Set[int]) -> None: @@ -249,9 +249,9 @@ def forward( return outputs -# Copied from transformers.models.vit.modeling_vit.ViTIntermediate with ViT->AudioSpectogramTransformer -class AudioSpectogramTransformerIntermediate(nn.Module): - def __init__(self, config: AudioSpectogramTransformerConfig) -> None: +# Copied from transformers.models.vit.modeling_vit.ViTIntermediate with ViT->AudioSpectrogramTransformer +class AudioSpectrogramTransformerIntermediate(nn.Module): + def __init__(self, config: AudioSpectrogramTransformerConfig) -> None: super().__init__() self.dense = nn.Linear(config.hidden_size, config.intermediate_size) if isinstance(config.hidden_act, str): @@ -267,9 +267,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -# Copied from transformers.models.vit.modeling_vit.ViTOutput with ViT->AudioSpectogramTransformer -class AudioSpectogramTransformerOutput(nn.Module): - def __init__(self, config: AudioSpectogramTransformerConfig) -> None: +# Copied from transformers.models.vit.modeling_vit.ViTOutput with ViT->AudioSpectrogramTransformer +class AudioSpectrogramTransformerOutput(nn.Module): + def __init__(self, config: AudioSpectrogramTransformerConfig) -> None: super().__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) self.dropout = nn.Dropout(config.hidden_dropout_prob) @@ -283,17 +283,17 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->AudioSpectogramTransformer -class AudioSpectogramTransformerLayer(nn.Module): +# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->AudioSpectrogramTransformer +class AudioSpectrogramTransformerLayer(nn.Module): """This corresponds to the Block class in the timm implementation.""" - def __init__(self, config: AudioSpectogramTransformerConfig) -> None: + def __init__(self, config: AudioSpectrogramTransformerConfig) -> None: super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 - self.attention = AudioSpectogramTransformerAttention(config) - self.intermediate = AudioSpectogramTransformerIntermediate(config) - self.output = AudioSpectogramTransformerOutput(config) + self.attention = 
AudioSpectrogramTransformerAttention(config) + self.intermediate = AudioSpectrogramTransformerIntermediate(config) + self.output = AudioSpectrogramTransformerOutput(config) self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -306,7 +306,7 @@ def forward( self_attention_outputs = self.attention( self.layernorm_before( hidden_states - ), # in AudioSpectogramTransformer, layernorm is applied before self-attention + ), # in AudioSpectrogramTransformer, layernorm is applied before self-attention head_mask, output_attentions=output_attentions, ) @@ -316,7 +316,7 @@ def forward( # first residual connection hidden_states = attention_output + hidden_states - # in AudioSpectogramTransformer, layernorm is also applied after self-attention + # in AudioSpectrogramTransformer, layernorm is also applied after self-attention layer_output = self.layernorm_after(hidden_states) layer_output = self.intermediate(layer_output) @@ -328,12 +328,12 @@ def forward( return outputs -# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->AudioSpectogramTransformer -class AudioSpectogramTransformerEncoder(nn.Module): - def __init__(self, config: AudioSpectogramTransformerConfig) -> None: +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->AudioSpectrogramTransformer +class AudioSpectrogramTransformerEncoder(nn.Module): + def __init__(self, config: AudioSpectrogramTransformerConfig) -> None: super().__init__() self.config = config - self.layer = nn.ModuleList([AudioSpectogramTransformerLayer(config) for _ in range(config.num_hidden_layers)]) + self.layer = nn.ModuleList([AudioSpectrogramTransformerLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( @@ -386,14 +386,14 @@ def custom_forward(*inputs): ) -class AudioSpectogramTransformerPreTrainedModel(PreTrainedModel): +class AudioSpectrogramTransformerPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ - config_class = AudioSpectogramTransformerConfig - base_model_prefix = "audio_spectogram_transformer" + config_class = AudioSpectrogramTransformerConfig + base_model_prefix = "audio_spectrogram_transformer" main_input_name = "input_values" supports_gradient_checkpointing = True @@ -408,28 +408,29 @@ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> No module.bias.data.zero_() module.weight.data.fill_(1.0) - # Copied from transformers.models.vit.modeling_vit.ViTPreTrainedModel._set_gradient_checkpointing with ViT->AudioSpectogramTransformer - def _set_gradient_checkpointing(self, module: AudioSpectogramTransformerEncoder, value: bool = False) -> None: - if isinstance(module, AudioSpectogramTransformerEncoder): + # Copied from transformers.models.vit.modeling_vit.ViTPreTrainedModel._set_gradient_checkpointing with ViT->AudioSpectrogramTransformer + def _set_gradient_checkpointing(self, module: AudioSpectrogramTransformerEncoder, value: bool = False) -> None: + if isinstance(module, AudioSpectrogramTransformerEncoder): module.gradient_checkpointing = value -AUDIO_SPECTOGRAM_TRANSFORMER_START_DOCSTRING = r""" +AUDIO_SPECTROGRAM_TRANSFORMER_START_DOCSTRING = r""" This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: - config ([`AudioSpectogramTransformerConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + config ([`AudioSpectrogramTransformerConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ -AUDIO_SPECTOGRAM_TRANSFORMER_INPUTS_DOCSTRING = r""" +AUDIO_SPECTROGRAM_TRANSFORMER_INPUTS_DOCSTRING = r""" Args: input_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`AudioSpectogramTransformerFeatureExtractor`]. See - [`AudioSpectogramTransformerFeatureExtractor.__call__`] for details. + Pixel values. Pixel values can be obtained using [`AudioSpectrogramTransformerFeatureExtractor`]. See + [`AudioSpectrogramTransformerFeatureExtractor.__call__`] for details. head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: @@ -449,24 +450,24 @@ def _set_gradient_checkpointing(self, module: AudioSpectogramTransformerEncoder, @add_start_docstrings( - "The bare AudioSpectogramTransformer Model transformer outputting raw hidden-states without any specific head on" + "The bare AudioSpectrogramTransformer Model transformer outputting raw hidden-states without any specific head on" " top.", - AUDIO_SPECTOGRAM_TRANSFORMER_START_DOCSTRING, + AUDIO_SPECTROGRAM_TRANSFORMER_START_DOCSTRING, ) -class AudioSpectogramTransformerModel(AudioSpectogramTransformerPreTrainedModel): - def __init__(self, config: AudioSpectogramTransformerConfig): +class AudioSpectrogramTransformerModel(AudioSpectrogramTransformerPreTrainedModel): + def __init__(self, config: AudioSpectrogramTransformerConfig): super().__init__(config) self.config = config - self.embeddings = AudioSpectogramTransformerEmbeddings(config) - self.encoder = AudioSpectogramTransformerEncoder(config) + self.embeddings = AudioSpectrogramTransformerEmbeddings(config) + self.encoder = AudioSpectrogramTransformerEncoder(config) self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self) -> AudioSpectogramTransformerPatchEmbeddings: + def get_input_embeddings(self) -> AudioSpectrogramTransformerPatchEmbeddings: return self.embeddings.patch_embeddings def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None: @@ -477,7 +478,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - @add_start_docstrings_to_model_forward(AUDIO_SPECTOGRAM_TRANSFORMER_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(AUDIO_SPECTROGRAM_TRANSFORMER_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_FEAT_EXTRACTOR_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, @@ -537,17 +538,17 @@ def forward( @add_start_docstrings( """ - Audio Spectogram Transformer model with an audio classification head on top (a 
linear layer on top of the final + Audio Spectrogram Transformer model with an audio classification head on top (a linear layer on top of the final hidden state of the [CLS] token) e.g. for AudioSet. """, - AUDIO_SPECTOGRAM_TRANSFORMER_START_DOCSTRING, + AUDIO_SPECTROGRAM_TRANSFORMER_START_DOCSTRING, ) -class AudioSpectogramTransformerForSequenceClassification(AudioSpectogramTransformerPreTrainedModel): - def __init__(self, config: AudioSpectogramTransformerConfig) -> None: +class AudioSpectrogramTransformerForSequenceClassification(AudioSpectrogramTransformerPreTrainedModel): + def __init__(self, config: AudioSpectrogramTransformerConfig) -> None: super().__init__(config) self.num_labels = config.num_labels - self.audio_spectogram_transformer = AudioSpectogramTransformerModel(config) + self.audio_spectrogram_transformer = AudioSpectrogramTransformerModel(config) # Classifier head self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -556,7 +557,7 @@ def __init__(self, config: AudioSpectogramTransformerConfig) -> None: # Initialize weights and apply final processing self.post_init() - @add_start_docstrings_to_model_forward(AUDIO_SPECTOGRAM_TRANSFORMER_INPUTS_DOCSTRING) + @add_start_docstrings_to_model_forward(AUDIO_SPECTROGRAM_TRANSFORMER_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_FEAT_EXTRACTOR_FOR_DOC, checkpoint=_SEQ_CLASS_CHECKPOINT, @@ -583,7 +584,7 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.audio_spectogram_transformer( + outputs = self.audio_spectrogram_transformer( input_values, head_mask=head_mask, output_attentions=output_attentions, diff --git a/src/transformers/models/audio_spectogram_transformer/test.py b/src/transformers/models/audio_spectrogram_transformer/test.py similarity index 100% rename from src/transformers/models/audio_spectogram_transformer/test.py rename to src/transformers/models/audio_spectrogram_transformer/test.py diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index cb0c588b673275..5e772a95b627f9 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -30,7 +30,7 @@ [ # Add configs here ("albert", "AlbertConfig"), - ("audio-spectogram-transformer", "AudioSpectogramTransformerConfig"), + ("audio-spectrogram-transformer", "AudioSpectrogramTransformerConfig"), ("bart", "BartConfig"), ("beit", "BeitConfig"), ("bert", "BertConfig"), @@ -180,7 +180,7 @@ [ # Add archive maps here) ("albert", "ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("audio-spectogram-transformer", "AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("audio-spectrogram-transformer", "AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("bart", "BART_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("beit", "BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("bert", "BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -315,7 +315,7 @@ [ # Add full (and cased) model names here ("albert", "ALBERT"), - ("audio-spectogram-transformer", "Audio Spectogram Transformer"), + ("audio-spectrogram-transformer", "Audio Spectrogram Transformer"), ("bart", "BART"), ("barthez", "BARThez"), ("bartpho", "BARTpho"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 7baed7e625630d..80af08d4df94fd 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ 
b/src/transformers/models/auto/feature_extraction_auto.py @@ -37,7 +37,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict( [ - ("audio-spectogram-transformer", "AudioSpectogramTransformerFeatureExtractor"), + ("audio-spectrogram-transformer", "AudioSpectrogramTransformerFeatureExtractor"), ("beit", "BeitFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), ("clipseg", "ViTFeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index d6b10908a74c76..7c18d45680787a 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -29,7 +29,7 @@ [ # Base model mapping ("albert", "AlbertModel"), - ("audio-spectogram-transformer", "AudioSpectogramTransformerModel"), + ("audio-spectrogram-transformer", "AudioSpectrogramTransformerModel"), ("bart", "BartModel"), ("beit", "BeitModel"), ("bert", "BertModel"), @@ -785,7 +785,7 @@ MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Audio Classification mapping - ("audio-spectogram-transformer", "AudioSpectogramTransformerForSequenceClassification"), + ("audio-spectrogram-transformer", "AudioSpectrogramTransformerForSequenceClassification"), ("data2vec-audio", "Data2VecAudioForSequenceClassification"), ("hubert", "HubertForSequenceClassification"), ("sew", "SEWForSequenceClassification"), diff --git a/tests/models/audio_spectogram_transformer/__init__.py b/tests/models/audio_spectrogram_transformer/__init__.py similarity index 100% rename from tests/models/audio_spectogram_transformer/__init__.py rename to tests/models/audio_spectrogram_transformer/__init__.py diff --git a/tests/models/audio_spectogram_transformer/test_feature_extraction_audio_spectogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py similarity index 91% rename from tests/models/audio_spectogram_transformer/test_feature_extraction_audio_spectogram_transformer.py rename to tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py index 1220b5299e5667..75277b6848defb 100644 --- a/tests/models/audio_spectogram_transformer/test_feature_extraction_audio_spectogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py @@ -20,7 +20,7 @@ import numpy as np -from transformers import AudioSpectogramTransformerFeatureExtractor +from transformers import AudioSpectrogramTransformerFeatureExtractor from transformers.testing_utils import require_torch from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin @@ -43,7 +43,7 @@ def floats_list(shape, scale=1.0, rng=None, name=None): return values -class AudioSpectogramTransformerFeatureExtractionTester(unittest.TestCase): +class AudioSpectrogramTransformerFeatureExtractionTester(unittest.TestCase): def __init__( self, parent, @@ -95,12 +95,12 @@ def _flatten(list_of_lists): return speech_inputs -class AudioSpectogramTransformerFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): +class AudioSpectrogramTransformerFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = AudioSpectogramTransformerFeatureExtractor + feature_extraction_class = AudioSpectrogramTransformerFeatureExtractor def setUp(self): - self.feat_extract_tester = AudioSpectogramTransformerFeatureExtractionTester(self) + self.feat_extract_tester = 
AudioSpectrogramTransformerFeatureExtractionTester(self) def test_call(self): # Tests that all call wrap to encode_plus and batch_encode_plus diff --git a/tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py similarity index 80% rename from tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py rename to tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py index 81fee1afc3aeb3..e68b975c51f5c6 100644 --- a/tests/models/audio_spectogram_transformer/test_modeling_audio_spectogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py @@ -12,13 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch AudioSpectogramTransformer model. """ +""" Testing suite for the PyTorch AudioSpectrogramTransformer model. """ import inspect import unittest from huggingface_hub import hf_hub_download -from transformers import AudioSpectogramTransformerConfig +from transformers import AudioSpectrogramTransformerConfig from transformers.testing_utils import require_torch, require_torchaudio, slow, torch_device from transformers.utils import cached_property, is_torch_available, is_torchaudio_available @@ -30,19 +30,19 @@ import torch from torch import nn - from transformers import AudioSpectogramTransformerForSequenceClassification, AudioSpectogramTransformerModel - from transformers.models.audio_spectogram_transformer.modeling_audio_spectogram_transformer import ( - AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + from transformers import AudioSpectrogramTransformerForSequenceClassification, AudioSpectrogramTransformerModel + from transformers.models.audio_spectrogram_transformer.modeling_audio_spectrogram_transformer import ( + AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, ) if is_torchaudio_available(): import torchaudio - from transformers import AudioSpectogramTransformerFeatureExtractor + from transformers import AudioSpectrogramTransformerFeatureExtractor -class AudioSpectogramTransformerModelTester: +class AudioSpectrogramTransformerModelTester: def __init__( self, parent, @@ -85,7 +85,7 @@ def __init__( self.frequency_stride = frequency_stride self.time_stride = time_stride - # in AudioSpectogramTransformer, the seq length equals the number of patches + 2 (we add 2 for the [CLS] and distillation tokens) + # in AudioSpectrogramTransformer, the seq length equals the number of patches + 2 (we add 2 for the [CLS] and distillation tokens) test_input = torch.randn(1, 1, self.frequency_dimension, self.time_dimension) test_projection = nn.Conv2d( 1, @@ -111,7 +111,7 @@ def prepare_config_and_inputs(self): return config, input_values, labels def get_config(self): - return AudioSpectogramTransformerConfig( + return AudioSpectrogramTransformerConfig( patch_size=self.patch_size, time_dimension=self.time_dimension, frequency_dimension=self.frequency_dimension, @@ -129,7 +129,7 @@ def get_config(self): ) def create_and_check_model(self, config, input_values, labels): - model = AudioSpectogramTransformerModel(config=config) + model = AudioSpectrogramTransformerModel(config=config) model.to(torch_device) model.eval() result = model(input_values) @@ -147,16 +147,16 @@ def 
prepare_config_and_inputs_for_common(self): @require_torch -class AudioSpectogramTransformerModelTest(ModelTesterMixin, unittest.TestCase): +class AudioSpectrogramTransformerModelTest(ModelTesterMixin, unittest.TestCase): """ - Here we also overwrite some of the tests of test_modeling_common.py, as AudioSpectogramTransformer does not use input_ids, inputs_embeds, + Here we also overwrite some of the tests of test_modeling_common.py, as AudioSpectrogramTransformer does not use input_ids, inputs_embeds, attention_mask and seq_length. """ all_model_classes = ( ( - AudioSpectogramTransformerModel, - AudioSpectogramTransformerForSequenceClassification, + AudioSpectrogramTransformerModel, + AudioSpectrogramTransformerForSequenceClassification, ) if is_torch_available() else () @@ -167,15 +167,15 @@ class AudioSpectogramTransformerModelTest(ModelTesterMixin, unittest.TestCase): test_head_masking = False def setUp(self): - self.model_tester = AudioSpectogramTransformerModelTester(self) + self.model_tester = AudioSpectrogramTransformerModelTester(self) self.config_tester = ConfigTester( - self, config_class=AudioSpectogramTransformerConfig, has_text_modality=False, hidden_size=37 + self, config_class=AudioSpectrogramTransformerConfig, has_text_modality=False, hidden_size=37 ) def test_config(self): self.config_tester.run_common_tests() - @unittest.skip(reason="AudioSpectogramTransformer does not use inputs_embeds") + @unittest.skip(reason="AudioSpectrogramTransformer does not use inputs_embeds") def test_inputs_embeds(self): pass @@ -206,8 +206,8 @@ def test_model(self): @slow def test_model_from_pretrained(self): - for model_name in AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = AudioSpectogramTransformerModel.from_pretrained(model_name) + for model_name in AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = AudioSpectrogramTransformerModel.from_pretrained(model_name) self.assertIsNotNone(model) @@ -224,13 +224,13 @@ def prepare_audio(): @require_torch @require_torchaudio -class AudioSpectogramTransformerModelIntegrationTest(unittest.TestCase): +class AudioSpectrogramTransformerModelIntegrationTest(unittest.TestCase): @cached_property def default_feature_extractor(self): # TODO rename nielsr to appropriate organization return ( - AudioSpectogramTransformerFeatureExtractor.from_pretrained( - "nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593" + AudioSpectrogramTransformerFeatureExtractor.from_pretrained( + "nielsr/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593" ) if is_torchaudio_available() else None @@ -241,8 +241,8 @@ def test_inference_audio_classification(self): feature_extractor = self.default_feature_extractor # TODO rename nielsr to appropriate organization - model = AudioSpectogramTransformerForSequenceClassification.from_pretrained( - "nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593" + model = AudioSpectrogramTransformerForSequenceClassification.from_pretrained( + "nielsr/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593" ).to(torch_device) feature_extractor = self.default_feature_extractor From 9dede0c364c553e2b50542d7eeba4d98b66e766b Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Nov 2022 16:55:14 +0100 Subject: [PATCH 17/37] Fix copies --- README.md | 2 +- README_es.md | 2 +- README_ko.md | 2 +- README_zh-hans.md | 2 +- README_zh-hant.md | 2 +- docs/source/en/index.mdx | 2 +- .../audio-spectrogram-transformer.mdx | 13 ++++--- 
...xtraction_audio_spectrogram_transformer.py | 17 +++++---- .../audio_spectrogram_transformer/test.py | 36 ------------------- src/transformers/utils/dummy_pt_objects.py | 8 ++--- .../utils/dummy_speech_objects.py | 2 +- 11 files changed, 25 insertions(+), 63 deletions(-) delete mode 100644 src/transformers/models/audio_spectrogram_transformer/test.py diff --git a/README.md b/README.md index e7de20c48cc0a2..a2823ba6a4dd7a 100644 --- a/README.md +++ b/README.md @@ -262,7 +262,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h ๐Ÿค— Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each them): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[Audio Spectogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. +1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from ร‰cole polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. diff --git a/README_es.md b/README_es.md index 8fc32c2ddc19e8..c768acaf4f24ae 100644 --- a/README_es.md +++ b/README_es.md @@ -262,7 +262,7 @@ Nรบmero actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt ๐Ÿค— Transformers actualmente proporciona las siguientes arquitecturas (ver [aquรญ](https://huggingface.co/docs/transformers/model_summary) para un resumen de alto nivel de cada uno de ellas.): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. 
**[Audio Spectogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. +1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from ร‰cole polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. diff --git a/README_ko.md b/README_ko.md index 3281fc07ee41f8..49bf203a4a3d1d 100644 --- a/README_ko.md +++ b/README_ko.md @@ -212,7 +212,7 @@ Flax, PyTorch, TensorFlow ์„ค์น˜ ํŽ˜์ด์ง€์—์„œ ์ด๋“ค์„ conda๋กœ ์„ค์น˜ํ•˜๋Š” ๐Ÿค— Transformers๋Š” ๋‹ค์Œ ๋ชจ๋ธ๋“ค์„ ์ œ๊ณตํ•ฉ๋‹ˆ๋‹ค (๊ฐ ๋ชจ๋ธ์˜ ์š”์•ฝ์€ [์—ฌ๊ธฐ](https://huggingface.co/docs/transformers/model_summary)์„œ ํ™•์ธํ•˜์„ธ์š”): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[Audio Spectogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. +1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from ร‰cole polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. 
**[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. diff --git a/README_zh-hans.md b/README_zh-hans.md index 188a5c16f24ac0..88bb1f45c80eb8 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -236,7 +236,7 @@ conda install -c huggingface transformers ๐Ÿค— Transformers ็›ฎๅ‰ๆ”ฏๆŒๅฆ‚ไธ‹็š„ๆžถๆž„๏ผˆๆจกๅž‹ๆฆ‚่ฟฐ่ฏท้˜…[่ฟ™้‡Œ](https://huggingface.co/docs/transformers/model_summary)๏ผ‰๏ผš 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (ๆฅ่‡ช Google Research and the Toyota Technological Institute at Chicago) ไผด้š่ฎบๆ–‡ [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), ็”ฑ Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut ๅ‘ๅธƒใ€‚ -1. **[Audio Spectogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectogram-transformer)** (ๆฅ่‡ช MIT) ไผด้š่ฎบๆ–‡ [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) ็”ฑ Yuan Gong, Yu-An Chung, James Glass ๅ‘ๅธƒใ€‚ +1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectrogram-transformer)** (ๆฅ่‡ช MIT) ไผด้š่ฎบๆ–‡ [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) ็”ฑ Yuan Gong, Yu-An Chung, James Glass ๅ‘ๅธƒใ€‚ 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (ๆฅ่‡ช Facebook) ไผด้š่ฎบๆ–‡ [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) ็”ฑ Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer ๅ‘ๅธƒใ€‚ 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (ๆฅ่‡ช ร‰cole polytechnique) ไผด้š่ฎบๆ–‡ [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) ็”ฑ Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis ๅ‘ๅธƒใ€‚ 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (ๆฅ่‡ช VinAI Research) ไผด้š่ฎบๆ–‡ [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) ็”ฑ Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen ๅ‘ๅธƒใ€‚ diff --git a/README_zh-hant.md b/README_zh-hant.md index 259be0df430573..ba55109e11e5dc 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -248,7 +248,7 @@ conda install -c huggingface transformers ๐Ÿค— Transformers ็›ฎๅ‰ๆ”ฏๆดไปฅไธ‹็š„ๆžถๆง‹๏ผˆๆจกๅž‹ๆฆ‚่ฆฝ่ซ‹ๅƒ้–ฑ[้€™่ฃก](https://huggingface.co/docs/transformers/model_summary)๏ผ‰๏ผš 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[Audio Spectogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. +1. 
**[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from ร‰cole polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 867c39247f41ec..3dcea7e85888b0 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -50,7 +50,7 @@ The documentation is organized into five sections: 1. **[ALBERT](model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[Audio Spectogram Transformer](model_doc/audio-spectogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. +1. **[Audio Spectrogram Transformer](model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. 1. **[BART](model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](model_doc/barthez)** (from ร‰cole polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. **[BARTpho](model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. 
diff --git a/docs/source/en/model_doc/audio-spectrogram-transformer.mdx b/docs/source/en/model_doc/audio-spectrogram-transformer.mdx index 5d8bee23231895..e43fe23b145880 100644 --- a/docs/source/en/model_doc/audio-spectrogram-transformer.mdx +++ b/docs/source/en/model_doc/audio-spectrogram-transformer.mdx @@ -24,13 +24,12 @@ The abstract from the paper is the following: Tips: -- When fine-tuning the Audio Spectrogram Transformer (AST) on your own dataset, it's recommended to take care of the input normalization. -The authors normalize the input audio to have a 0 mean and 0.5 std. To use the pretrained model, you should roughly normalize the input to -this range. You can check [`ast/src/get_norm_stats.py`](https://github.com/YuanGongND/ast/blob/master/src/get_norm_stats.py) to see how -the authors compute the stats, or you can try using the default mean and std of [`AudioSpectrogramTransformerFeatureExtractor`] (which uses the -AudioSet statistics). -- Note that the AST needs a smaller learning rate (the authors use a 10 times smaller learning rate compared to their CNN model proposed in the -[PSLA paper](https://arxiv.org/abs/2102.01243)) and converges faster, so please search the learning rate and learning rate scheduler for your task. +- When fine-tuning the Audio Spectrogram Transformer (AST) on your own dataset, it's recommended to take care of the input normalization (to make +sure the input has mean of 0 and std of 0.5). [`AudioSpectrogramTransformerFeatureExtractor`] takes care of this. Note that it uses the AudioSet +mean and std by default. You can check [`ast/src/get_norm_stats.py`](https://github.com/YuanGongND/ast/blob/master/src/get_norm_stats.py) to see how +the authors compute the stats for a downstream dataset. +- Note that the AST needs a low learning rate (the authors use a 10 times smaller learning rate compared to their CNN model proposed in the +[PSLA paper](https://arxiv.org/abs/2102.01243)) and converges quickly, so please search for a suitable learning rate and learning rate scheduler for your task. drawing diff --git a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py index 0307fd1b41a180..e3db344b8fe5dc 100644 --- a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py @@ -48,13 +48,12 @@ class AudioSpectrogramTransformerFeatureExtractor(SequenceFeatureExtractor): num_mel_bins (`int`, defaults to 128): Number of Mel-frequency bins. do_normalize (`bool`, *optional*, defaults to `True`): - Whether or not to apply utterance-level cepstral mean and variance normalization to extracted features. - mean (`int`, *optional*, defaults to -4.2677393): - Whether or not to normalize the extracted features to a mean of 0. Uses the AudioSet mean by default, - obtained by the authors. - std (`int`, *optional*, defaults to 4.5689974): - Whether or not to normalize the extracted features to a std of 05. Uses the AudioSet std by default, - obtained by the authors. + Whether or not to normalize the log-Mel features using `mean` and `std`. + mean (`float`, *optional*, defaults to -4.2677393): + The mean value used to normalize the log-Mel features. Uses the AudioSet mean by default. 
+ std (`float`, *optional*, defaults to 4.5689974): + The standard deviation value used to normalize the log-Mel features. Uses the AudioSet standard deviation + by default. return_attention_mask (`bool`, *optional*, defaults to `False`): Whether or not [`~AudioSpectrogramTransformerFeatureExtractor.__call__`] should return `attention_mask`. """ @@ -107,8 +106,8 @@ def _extract_fbank_features( # pad or truncate, depending on difference if difference > 0: - m = torch.nn.ZeroPad2d((0, 0, 0, difference)) - fbank = m(fbank) + pad_module = torch.nn.ZeroPad2d((0, 0, 0, difference)) + fbank = pad_module(fbank) elif difference < 0: fbank = fbank[0:max_length, :] diff --git a/src/transformers/models/audio_spectrogram_transformer/test.py b/src/transformers/models/audio_spectrogram_transformer/test.py deleted file mode 100644 index 1f2cf270ee32ff..00000000000000 --- a/src/transformers/models/audio_spectrogram_transformer/test.py +++ /dev/null @@ -1,36 +0,0 @@ -import torchaudio - -from huggingface_hub import hf_hub_download -from transformers import ( - AudioSpectogramTransformerConfig, - AudioSpectogramTransformerFeatureExtractor, - AudioSpectogramTransformerForSequenceClassification, -) - - -# define feature extractor and model -feature_extractor = AudioSpectogramTransformerFeatureExtractor() -config = AudioSpectogramTransformerConfig(num_labels=527) -model = AudioSpectogramTransformerForSequenceClassification(config) - -# read audio -filepath = hf_hub_download( - repo_id="nielsr/audio-spectogram-transformer-checkpoint", filename="sample_audio.flac", repo_type="dataset" -) - -raw_speech, _ = torchaudio.load(filepath) - -raw_speech = raw_speech.squeeze().numpy() - -# prepare audio for the model -inputs = feature_extractor(raw_speech, padding="max_length", return_tensors="pt") - -for k, v in inputs.items(): - print(k, v.shape) - -outputs = model(inputs.input_features) - -print("Shape of logits:", outputs.logits.shape) - -# for name, param in model.named_parameters(): -# print(name, param.shape) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index edfcd3839c75b1..cff49eb0302b29 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -350,24 +350,24 @@ def load_tf_weights_in_albert(*args, **kwargs): requires_backends(load_tf_weights_in_albert, ["torch"]) -AUDIO_SPECTOGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None +AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None -class AudioSpectogramTransformerForSequenceClassification(metaclass=DummyObject): +class AudioSpectrogramTransformerForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class AudioSpectogramTransformerModel(metaclass=DummyObject): +class AudioSpectrogramTransformerModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class AudioSpectogramTransformerPreTrainedModel(metaclass=DummyObject): +class AudioSpectrogramTransformerPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): diff --git a/src/transformers/utils/dummy_speech_objects.py b/src/transformers/utils/dummy_speech_objects.py index 66c2d85fdacdac..cbe700be1bc8b9 100644 --- a/src/transformers/utils/dummy_speech_objects.py +++ b/src/transformers/utils/dummy_speech_objects.py @@ -3,7 +3,7 @@ from ..utils import DummyObject, requires_backends -class 
AudioSpectogramTransformerFeatureExtractor(metaclass=DummyObject): +class AudioSpectrogramTransformerFeatureExtractor(metaclass=DummyObject): _backends = ["speech"] def __init__(self, *args, **kwargs): From fdead748186ba712f55599e421cfbe4e538a57e3 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Nov 2022 17:14:21 +0100 Subject: [PATCH 18/37] Add integration test --- .../modeling_audio_spectrogram_transformer.py | 4 +-- ...xtraction_audio_spectrogram_transformer.py | 29 +++++++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index 9cf750d41d1985..fc826788edaecc 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -538,8 +538,8 @@ def forward( @add_start_docstrings( """ - Audio Spectrogram Transformer model with an audio classification head on top (a linear layer on top of the final - hidden state of the [CLS] token) e.g. for AudioSet. + Audio Spectrogram Transformer model with an audio classification head on top (a linear layer on top of the pooled + output) e.g. for AudioSet, Speech Commands v2. """, AUDIO_SPECTROGRAM_TRANSFORMER_START_DOCSTRING, ) diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py index 75277b6848defb..9543e3632fa120 100644 --- a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py @@ -22,12 +22,16 @@ from transformers import AudioSpectrogramTransformerFeatureExtractor from transformers.testing_utils import require_torch +from transformers.utils.import_utils import is_torch_available from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin global_rng = random.Random() +if is_torch_available(): + import torch + def floats_list(shape, scale=1.0, rng=None, name=None): """Creates a random float32 tensor""" @@ -133,3 +137,28 @@ def test_double_precision_pad(self): self.assertTrue(np_processed.input_values.dtype == np.float32) pt_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="pt") self.assertTrue(pt_processed.input_values.dtype == torch.float32) + + def _load_datasamples(self, num_samples): + from datasets import load_dataset + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + + return [x["array"] for x in speech_samples] + + @require_torch + def test_integration(self): + # fmt: off + EXPECTED_INPUT_VALUES = torch.tensor( + [-0.9894, -1.2776, -0.9066, -1.2776, -0.9349, -1.2609, -1.0386, -1.2776, + -1.1561, -1.2776, -1.2052, -1.2723, -1.2190, -1.2132, -1.2776, -1.1133, + -1.1953, -1.1343, -1.1584, -1.2203, -1.1770, -1.2474, -1.2381, -1.1936, + -0.9270, -0.8317, -0.8049, -0.7706, -0.7565, -0.7869] + ) + # fmt: on + + input_speech = self._load_datasamples(1) + feaure_extractor = AudioSpectrogramTransformerFeatureExtractor() + input_values = 
feaure_extractor(input_speech, return_tensors="pt").input_values + self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4)) From 70c948a5f99848c34e29e1681aab0f21d106767e Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Nov 2022 17:23:30 +0100 Subject: [PATCH 19/37] Remove dummy conv --- .../modeling_audio_spectrogram_transformer.py | 25 ++++++------------- ..._modeling_audio_spectrogram_transformer.py | 14 +++-------- 2 files changed, 11 insertions(+), 28 deletions(-) diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index fc826788edaecc..f2549e2cf1b305 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -67,28 +67,19 @@ def __init__(self, config: AudioSpectrogramTransformerConfig) -> None: self.distillation_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) self.patch_embeddings = AudioSpectrogramTransformerPatchEmbeddings(config) - frequency_dimension, time_dimension = self.get_shape(config) - num_patches = frequency_dimension * time_dimension + frequency_out_dimension, time_out_dimension = self.get_shape(config) + num_patches = frequency_out_dimension * time_out_dimension self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 2, config.hidden_size)) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.config = config def get_shape(self, config): - frequency_stride = config.frequency_stride - time_stride = config.time_stride - frequency_dimension = config.frequency_dimension - time_dimension = config.time_dimension - test_input = torch.randn(1, 1, frequency_dimension, time_dimension) - test_projection = nn.Conv2d( - 1, - config.hidden_size, - kernel_size=(config.patch_size, config.patch_size), - stride=(frequency_stride, time_stride), - ) - test_out = test_projection(test_input) - frequency_dimension = test_out.shape[2] - time_dimension = test_out.shape[3] - return frequency_dimension, time_dimension + # see Karpathy's cs231n blog on how to calculate the output dimensions + # https://cs231n.github.io/convolutional-networks/#conv + frequency_out_dimension = (config.frequency_dimension - config.patch_size) // config.frequency_stride + 1 + time_out_dimension = (config.time_dimension - config.patch_size) // config.time_stride + 1 + + return frequency_out_dimension, time_out_dimension def forward(self, input_values: torch.Tensor) -> torch.Tensor: batch_size = input_values.shape[0] diff --git a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py index e68b975c51f5c6..4a101f677010d5 100644 --- a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py @@ -86,17 +86,9 @@ def __init__( self.time_stride = time_stride # in AudioSpectrogramTransformer, the seq length equals the number of patches + 2 (we add 2 for the [CLS] and distillation tokens) - test_input = torch.randn(1, 1, self.frequency_dimension, self.time_dimension) - test_projection = nn.Conv2d( - 1, - self.hidden_size, - kernel_size=(self.patch_size, self.patch_size), - stride=(self.frequency_stride, 
self.time_stride), - ) - test_out = test_projection(test_input) - frequency_dimension = test_out.shape[2] - time_dimension = test_out.shape[3] - num_patches = frequency_dimension * time_dimension + frequency_out_dimension = (self.frequency_dimension - self.patch_size) // self.frequency_stride + 1 + time_out_dimension = (self.time_dimension - self.patch_size) // self.time_stride + 1 + num_patches = frequency_out_dimension * time_out_dimension self.seq_length = num_patches + 2 def prepare_config_and_inputs(self): From fd10b76d79a0d5d2daed80e92c6b3f721f770ae1 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 14 Nov 2022 17:10:47 +0100 Subject: [PATCH 20/37] Update to ast --- README_ja.md | 1 + ...iguration_audio_spectrogram_transformer.py | 13 ++--- ...trogram_transformer_original_to_pytorch.py | 34 ++++++------- ...xtraction_audio_spectrogram_transformer.py | 10 ++-- .../modeling_audio_spectrogram_transformer.py | 11 ++--- .../utils/dummy_vision_objects.py | 49 ------------------- 6 files changed, 34 insertions(+), 84 deletions(-) diff --git a/README_ja.md b/README_ja.md index ddef823e583a84..3b47573f48856c 100644 --- a/README_ja.md +++ b/README_ja.md @@ -297,6 +297,7 @@ Flaxใ€PyTorchใ€TensorFlowใ‚’condaใงใ‚คใƒณใ‚นใƒˆใƒผใƒซใ™ใ‚‹ๆ–นๆณ•ใฏใ€ใใ‚Œ ๐Ÿค—Transformersใฏ็พๅœจใ€ไปฅไธ‹ใฎใ‚ขใƒผใ‚ญใƒ†ใ‚ฏใƒใƒฃใ‚’ๆไพ›ใ—ใฆใ„ใพใ™๏ผˆใใ‚Œใžใ‚Œใฎใƒใ‚คใƒฌใƒ™ใƒซใช่ฆ็ด„ใฏ[ใ“ใกใ‚‰](https://huggingface.co/docs/transformers/model_summary)ใ‚’ๅ‚็…งใ—ใฆใใ ใ•ใ„๏ผ‰: 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. +1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/main/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from ร‰cole polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. 
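[Editor's note — a quick sanity check of the closed-form patch-grid computation introduced by the "Remove dummy conv" commit (PATCH 19/37) above, which replaces the dummy `nn.Conv2d` forward pass with the standard convolution output-size formula. This is a minimal sketch, not part of the patch. The concrete spectrogram settings (128 mel bins, 1024 time frames, 16×16 patches, stride 10 in both directions) are assumptions; they are not spelled out in this section, but they reproduce the sequence length of 1214 used in `_EXPECTED_OUTPUT_SHAPE = [1, 1214, 768]` earlier in the modeling file.]

```python
# Sketch of AudioSpectrogramTransformerEmbeddings.get_shape after the "Remove dummy conv" change:
# output size of a convolution with no padding/dilation is (input - kernel) // stride + 1.

def get_grid_shape(frequency_dimension, time_dimension, patch_size, frequency_stride, time_stride):
    frequency_out = (frequency_dimension - patch_size) // frequency_stride + 1
    time_out = (time_dimension - patch_size) // time_stride + 1
    return frequency_out, time_out


# Assumed AudioSet-checkpoint settings (hypothetical here, chosen to match the documented output shape).
freq_out, time_out = get_grid_shape(
    frequency_dimension=128,  # assumed number of mel bins
    time_dimension=1024,      # assumed number of time frames
    patch_size=16,
    frequency_stride=10,
    time_stride=10,
)
seq_length = freq_out * time_out + 2  # + [CLS] and distillation tokens
print(freq_out, time_out, seq_length)  # 12 101 1214 -> matches _EXPECTED_OUTPUT_SHAPE[1]
```

If these assumptions hold, the closed form gives the same 12 × 101 patch grid the removed dummy convolution produced, which is why the test-side computation in PATCH 19/37 can drop the `nn.Conv2d` as well.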
diff --git a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py index d676c3d7304a97..97b99a963db716 100644 --- a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py @@ -22,8 +22,9 @@ logger = logging.get_logger(__name__) AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { - # TODO update to appropriate organization - "nielsr/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593": "https://huggingface.co/nielsr/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593/resolve/main/config.json", + "MIT/ast-finetuned-audioset-10-10-0.4593": ( + "https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593/resolve/main/config.json" + ), } @@ -33,7 +34,7 @@ class AudioSpectrogramTransformerConfig(PretrainedConfig): instantiate an AudioSpectrogramTransformer model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the AudioSpectrogramTransformer - [nielsr/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593](https://huggingface.co/nielsr/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593) + [MIT/ast-finetuned-audioset-10-10-0.4593](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the @@ -75,12 +76,12 @@ class AudioSpectrogramTransformerConfig(PretrainedConfig): Example: ```python - >>> from transformers import AudioSpectrogramTransformerModel, AudioSpectrogramTransformerConfig + >>> from transformers import AudioSpectrogramTransformerConfig, AudioSpectrogramTransformerModel - >>> # Initializing a AudioSpectrogramTransformer audio_spectrogram_transformer-base-patch16-224 style configuration + >>> # Initializing a AudioSpectrogramTransformer MIT/ast-finetuned-audioset-10-10-0.4593 style configuration >>> configuration = AudioSpectrogramTransformerConfig() - >>> # Initializing a model from the audio_spectrogram_transformer-base-patch16-224 style configuration + >>> # Initializing a model (with random weights) from the MIT/ast-finetuned-audioset-10-10-0.4593 style configuration >>> model = AudioSpectrogramTransformerModel(configuration) >>> # Accessing the model configuration diff --git a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py index 3d025eee56f2dc..8a4b6bda088baa 100644 --- a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py +++ b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py @@ -162,28 +162,28 @@ def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_fo config = get_audio_spectrogram_transformer_config(model_name) model_name_to_url = { - "audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593": ( + "ast-finetuned-audioset-10-10-0.4593": ( "https://www.dropbox.com/s/ca0b1v2nlxzyeb4/audioset_10_10_0.4593.pth?dl=1" ), - 
"audio-spectrogram-transformer-finetuned-audioset-10-10-0.450": ( + "ast-finetuned-audioset-10-10-0.450": ( "https://www.dropbox.com/s/1tv0hovue1bxupk/audioset_10_10_0.4495.pth?dl=1" ), - "audio-spectrogram-transformer-finetuned-audioset-10-10-0.448": ( + "ast-finetuned-audioset-10-10-0.448": ( "https://www.dropbox.com/s/6u5sikl4b9wo4u5/audioset_10_10_0.4483.pth?dl=1" ), - "audio-spectrogram-transformer-finetuned-audioset-10-10-0.448-v2": ( + "ast-finetuned-audioset-10-10-0.448-v2": ( "https://www.dropbox.com/s/kt6i0v9fvfm1mbq/audioset_10_10_0.4475.pth?dl=1" ), - "audio-spectrogram-transformer-finetuned-audioset-12-12-0.447": ( + "ast-finetuned-audioset-12-12-0.447": ( "https://www.dropbox.com/s/snfhx3tizr4nuc8/audioset_12_12_0.4467.pth?dl=1" ), - "audio-spectrogram-transformer-finetuned-audioset-14-14-0.443": ( + "ast-finetuned-audioset-14-14-0.443": ( "https://www.dropbox.com/s/z18s6pemtnxm4k7/audioset_14_14_0.4431.pth?dl=1" ), - "audio-spectrogram-transformer-finetuned-audioset-16-16-0.442": ( + "ast-finetuned-audioset-16-16-0.442": ( "https://www.dropbox.com/s/mdsa4t1xmcimia6/audioset_16_16_0.4422.pth?dl=1" ), - "audio-spectrogram-transformer-finetuned-speech-commands-v2": ( + "ast-finetuned-speech-commands-v2": ( "https://www.dropbox.com/s/q0tbqpwv44pquwy/speechcommands_10_10_0.9812.pth?dl=1" ), } @@ -228,21 +228,21 @@ def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_fo outputs = model(**inputs) logits = outputs.logits - if model_name == "audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593": + if model_name == "ast-finetuned-audioset-10-10-0.4593": expected_slice = torch.tensor([-0.8760, -7.0042, -8.6602]) - elif model_name == "audio-spectrogram-transformer-finetuned-audioset-10-10-0.450": + elif model_name == "ast-finetuned-audioset-10-10-0.450": expected_slice = torch.tensor([-1.1986, -7.0903, -8.2718]) - elif model_name == "audio-spectrogram-transformer-finetuned-audioset-10-10-0.448": + elif model_name == "ast-finetuned-audioset-10-10-0.448": expected_slice = torch.tensor([-2.6128, -8.0080, -9.4344]) - elif model_name == "audio-spectrogram-transformer-finetuned-audioset-10-10-0.448-v2": + elif model_name == "ast-finetuned-audioset-10-10-0.448-v2": expected_slice = torch.tensor([-1.5080, -7.4534, -8.8917]) - elif model_name == "audio-spectrogram-transformer-finetuned-audioset-12-12-0.447": + elif model_name == "ast-finetuned-audioset-12-12-0.447": expected_slice = torch.tensor([-0.5050, -6.5833, -8.0843]) - elif model_name == "audio-spectrogram-transformer-finetuned-audioset-14-14-0.443": + elif model_name == "ast-finetuned-audioset-14-14-0.443": expected_slice = torch.tensor([-0.3826, -7.0336, -8.2413]) - elif model_name == "audio-spectrogram-transformer-finetuned-audioset-16-16-0.442": + elif model_name == "ast-finetuned-audioset-16-16-0.442": expected_slice = torch.tensor([-1.2113, -6.9101, -8.3470]) - elif model_name == "audio-spectrogram-transformer-finetuned-speech-commands-v2": + elif model_name == "ast-finetuned-speech-commands-v2": expected_slice = torch.tensor([6.1589, -8.0566, -8.7984]) else: raise ValueError("Unknown model name") @@ -268,7 +268,7 @@ def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_fo # Required parameters parser.add_argument( "--model_name", - default="audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593", + default="ast-finetuned-audioset-10-10-0.4593", type=str, help="Name of the Audio Spectrogram Transformer model you'd like to convert.", ) diff --git 
a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py index e3db344b8fe5dc..da1eff71925993 100644 --- a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py @@ -34,18 +34,18 @@ class AudioSpectrogramTransformerFeatureExtractor(SequenceFeatureExtractor): r""" Constructs a Audio Spectrogram Transformer feature extractor. - This feature extractor inherits from [`AudioSpectrogramTransformerFeatureExtractor`] which contains most of the - main methods. Users should refer to this superclass for more information regarding those methods. + This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains + most of the main methods. Users should refer to this superclass for more information regarding those methods. This class extracts mel-filter bank features from raw speech using TorchAudio and applies utterance-level cepstral mean and variance normalization to the extracted features. Args: - feature_size (`int`, defaults to 1): + feature_size (`int`, *optional*, defaults to 1): The feature dimension of the extracted features. - sampling_rate (`int`, defaults to 16000): + sampling_rate (`int`, *optional*, defaults to 16000): The sampling rate at which the audio files should be digitalized expressed in Hertz per second (Hz). - num_mel_bins (`int`, defaults to 128): + num_mel_bins (`int`, *optional*, defaults to 128): Number of Mel-frequency bins. do_normalize (`bool`, *optional*, defaults to `True`): Whether or not to normalize the log-Mel features using `mean` and `std`. 
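The feature extractor docstring above describes the log-Mel pipeline; a short usage sketch under the documented defaults (16 kHz input, 128 mel bins, AudioSet normalization statistics), assuming `torchaudio` is installed — the random waveform and the printed shape are illustrative only:

```python
import numpy as np

# renamed to ASTFeatureExtractor later in this series
from transformers import AudioSpectrogramTransformerFeatureExtractor

# two seconds of random audio at 16 kHz, purely illustrative
waveform = np.random.randn(32000).astype(np.float32)

# num_mel_bins=128 and do_normalize=True by default
feature_extractor = AudioSpectrogramTransformerFeatureExtractor()
inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")

# log-Mel filter bank features, padded or truncated to the extractor's fixed number of frames
print(inputs.input_values.shape)  # expected on the order of (1, 1024, 128)
```
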
diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index f2549e2cf1b305..efba0d46052a86 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -37,20 +37,17 @@ _FEAT_EXTRACTOR_FOR_DOC = "AudioSpectrogramTransformerFeatureExtractor" # Base docstring -# TODO update to appropriate organization -_CHECKPOINT_FOR_DOC = "nielsr/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593" +_CHECKPOINT_FOR_DOC = "MIT/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593" _EXPECTED_OUTPUT_SHAPE = [1, 1214, 768] # Audio classification docstring -# TODO update to appropriate organization -_SEQ_CLASS_CHECKPOINT = "nielsr/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593" +_SEQ_CLASS_CHECKPOINT = "MIT/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593" _SEQ_CLASS_EXPECTED_OUTPUT = "'Speech'" _SEQ_CLASS_EXPECTED_LOSS = 0.17 AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ - # TODO update to appropriate organization - "nielsr/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593", + "MIT/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593", # See all Audio Spectrogram Transformer models at https://huggingface.co/models?filter=audio-spectrogram-transformer ] @@ -530,7 +527,7 @@ def forward( @add_start_docstrings( """ Audio Spectrogram Transformer model with an audio classification head on top (a linear layer on top of the pooled - output) e.g. for AudioSet, Speech Commands v2. + output) e.g. for datasets like AudioSet, Speech Commands v2. 
""", AUDIO_SPECTROGRAM_TRANSFORMER_START_DOCSTRING, ) diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 4b57bfe2f1895b..d4226677e44ff6 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -36,13 +36,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class BeitImageProcessor(metaclass=DummyObject): - _backends = ["vision"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["vision"]) - - class CLIPFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] @@ -50,13 +43,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class CLIPImageProcessor(metaclass=DummyObject): - _backends = ["vision"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["vision"]) - - class ConditionalDetrFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] @@ -281,13 +267,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class PerceiverImageProcessor(metaclass=DummyObject): - _backends = ["vision"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["vision"]) - - class PoolFormerFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] @@ -295,13 +274,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class PoolFormerImageProcessor(metaclass=DummyObject): - _backends = ["vision"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["vision"]) - - class SegformerFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] @@ -309,13 +281,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class SegformerImageProcessor(metaclass=DummyObject): - _backends = ["vision"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["vision"]) - - class VideoMAEFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] @@ -323,13 +288,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class VideoMAEImageProcessor(metaclass=DummyObject): - _backends = ["vision"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["vision"]) - - class ViltFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] @@ -337,13 +295,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) -class ViltImageProcessor(metaclass=DummyObject): - _backends = ["vision"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["vision"]) - - class ViltProcessor(metaclass=DummyObject): _backends = ["vision"] From 341ade2e89bafaccf68db0cdc66e155f1a5259ad Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 14 Nov 2022 20:23:37 +0100 Subject: [PATCH 21/37] Update organization --- ...udio_spectrogram_transformer_original_to_pytorch.py | 4 ++-- .../modeling_audio_spectrogram_transformer.py | 8 ++++---- .../test_modeling_audio_spectrogram_transformer.py | 10 +++------- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py index 8a4b6bda088baa..eef8d472aff8fc 100644 --- a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py +++ 
b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py @@ -259,8 +259,8 @@ def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_fo if push_to_hub: print("Pushing model and feature extractor to the hub...") - model.push_to_hub(model_name, organization="nielsr") - feature_extractor.push_to_hub(model_name, organization="nielsr") + model.push_to_hub(f"MIT/{model_name}") + feature_extractor.push_to_hub(f"MIT/{model_name}") if __name__ == "__main__": diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index efba0d46052a86..81512d420bb5e5 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -37,18 +37,18 @@ _FEAT_EXTRACTOR_FOR_DOC = "AudioSpectrogramTransformerFeatureExtractor" # Base docstring -_CHECKPOINT_FOR_DOC = "MIT/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593" +_CHECKPOINT_FOR_DOC = "MIT/ast-finetuned-audioset-10-10-0.4593" _EXPECTED_OUTPUT_SHAPE = [1, 1214, 768] # Audio classification docstring -_SEQ_CLASS_CHECKPOINT = "MIT/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593" +_SEQ_CLASS_CHECKPOINT = "MIT/ast-finetuned-audioset-10-10-0.4593" _SEQ_CLASS_EXPECTED_OUTPUT = "'Speech'" _SEQ_CLASS_EXPECTED_LOSS = 0.17 AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "MIT/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593", - # See all Audio Spectrogram Transformer models at https://huggingface.co/models?filter=audio-spectrogram-transformer + "MIT/ast-finetuned-audioset-10-10-0.4593", + # See all Audio Spectrogram Transformer models at https://huggingface.co/models?filter=ast ] diff --git a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py index 4a101f677010d5..779b5b2335832b 100644 --- a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch AudioSpectrogramTransformer model. """ +""" Testing suite for the PyTorch Audio Spectrogram Transformer (AST) model. 
""" import inspect import unittest @@ -219,11 +219,8 @@ def prepare_audio(): class AudioSpectrogramTransformerModelIntegrationTest(unittest.TestCase): @cached_property def default_feature_extractor(self): - # TODO rename nielsr to appropriate organization return ( - AudioSpectrogramTransformerFeatureExtractor.from_pretrained( - "nielsr/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593" - ) + AudioSpectrogramTransformerFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593") if is_torchaudio_available() else None ) @@ -232,9 +229,8 @@ def default_feature_extractor(self): def test_inference_audio_classification(self): feature_extractor = self.default_feature_extractor - # TODO rename nielsr to appropriate organization model = AudioSpectrogramTransformerForSequenceClassification.from_pretrained( - "nielsr/audio-spectrogram-transformer-finetuned-audioset-10-10-0.4593" + "MIT/ast-finetuned-audioset-10-10-0.4593" ).to(torch_device) feature_extractor = self.default_feature_extractor From 5685d7301006ba2366fc8e10caf2a6e93d8347df Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 15 Nov 2022 09:15:53 +0100 Subject: [PATCH 22/37] Fix init --- src/transformers/__init__.py | 98 +++++++++---------- .../audio_spectrogram_transformer/__init__.py | 27 ++--- src/transformers/utils/dummy_pt_objects.py | 7 ++ .../utils/dummy_speech_objects.py | 7 -- .../utils/dummy_vision_objects.py | 56 +++++++++++ 5 files changed, 117 insertions(+), 78 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 1e4cbebcbfbe9a..8cb739049d6108 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -677,7 +677,6 @@ name for name in dir(dummy_speech_objects) if not name.startswith("_") ] else: - _import_structure["models.audio_spectrogram_transformer"].append("AudioSpectrogramTransformerFeatureExtractor") _import_structure["models.mctct"].append("MCTCTFeatureExtractor") _import_structure["models.speech_to_text"].append("Speech2TextFeatureExtractor") @@ -740,14 +739,13 @@ _import_structure["models.mobilenet_v2"].extend(["MobileNetV2FeatureExtractor", "MobileNetV2ImageProcessor"]) _import_structure["models.mobilevit"].extend(["MobileViTFeatureExtractor", "MobileViTImageProcessor"]) _import_structure["models.owlvit"].append("OwlViTFeatureExtractor") - _import_structure["models.perceiver"].append("PerceiverFeatureExtractor") - _import_structure["models.poolformer"].append("PoolFormerFeatureExtractor") - _import_structure["models.segformer"].append("SegformerFeatureExtractor") - _import_structure["models.videomae"].append("VideoMAEFeatureExtractor") - _import_structure["models.vilt"].append("ViltFeatureExtractor") - _import_structure["models.vilt"].append("ViltProcessor") - _import_structure["models.vit"].append("ViTFeatureExtractor") - _import_structure["models.yolos"].append("YolosFeatureExtractor") + _import_structure["models.perceiver"].extend(["PerceiverFeatureExtractor", "PerceiverImageProcessor"]) + _import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"]) + _import_structure["models.segformer"].extend(["SegformerFeatureExtractor", "SegformerImageProcessor"]) + _import_structure["models.videomae"].extend(["VideoMAEFeatureExtractor", "VideoMAEImageProcessor"]) + _import_structure["models.vilt"].extend(["ViltFeatureExtractor", "ViltImageProcessor", "ViltProcessor"]) + _import_structure["models.vit"].extend(["ViTFeatureExtractor", "ViTImageProcessor"]) + 
_import_structure["models.yolos"].extend(["YolosFeatureExtractor"]) # Timm-backed objects try: @@ -859,29 +857,13 @@ # PyTorch models structure - _import_structure["models.roc_bert"].extend( - [ - "ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST", - "RoCBertForMaskedLM", - "RoCBertForCausalLM", - "RoCBertForMultipleChoice", - "RoCBertForQuestionAnswering", - "RoCBertForSequenceClassification", - "RoCBertForTokenClassification", - "RoCBertLayer", - "RoCBertModel", - "RoCBertForPreTraining", - "RoCBertPreTrainedModel", - "load_tf_weights_in_roc_bert", - ] - ) - - _import_structure["models.time_series_transformer"].extend( + _import_structure["models.audio_spectrogram_transformer"].extend( [ - "TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", - "TimeSeriesTransformerForPrediction", - "TimeSeriesTransformerModel", - "TimeSeriesTransformerPreTrainedModel", + "AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "AudioSpectrogramTransformerForSequenceClassification", + "AudioSpectrogramTransformerModel", + "AudioSpectrogramTransformerPreTrainedModel", + "AudioSpectrogramTransformerFeatureExtractor", ] ) _import_structure["models.albert"].extend( @@ -1932,6 +1914,22 @@ "RobertaPreTrainedModel", ] ) + _import_structure["models.roc_bert"].extend( + [ + "ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "RoCBertForMaskedLM", + "RoCBertForCausalLM", + "RoCBertForMultipleChoice", + "RoCBertForQuestionAnswering", + "RoCBertForSequenceClassification", + "RoCBertForTokenClassification", + "RoCBertLayer", + "RoCBertModel", + "RoCBertForPreTraining", + "RoCBertPreTrainedModel", + "load_tf_weights_in_roc_bert", + ] + ) _import_structure["models.lilt"].extend( [ "LILT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -2056,6 +2054,14 @@ "load_tf_weights_in_tapas", ] ) + _import_structure["models.time_series_transformer"].extend( + [ + "TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "TimeSeriesTransformerForPrediction", + "TimeSeriesTransformerModel", + "TimeSeriesTransformerPreTrainedModel", + ] + ) _import_structure["models.t5"].extend( [ "T5_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -2165,14 +2171,6 @@ "ViTPreTrainedModel", ] ) - _import_structure["models.audio_spectrogram_transformer"].extend( - [ - "AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", - "AudioSpectrogramTransformerModel", - "AudioSpectrogramTransformerPreTrainedModel", - "AudioSpectrogramTransformerForSequenceClassification", - ] - ) _import_structure["models.vit_mae"].extend( [ "VIT_MAE_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3816,7 +3814,6 @@ except OptionalDependencyNotAvailable: from .utils.dummy_speech_objects import * else: - from .models.audio_spectrogram_transformer import AudioSpectrogramTransformerFeatureExtractor from .models.mctct import MCTCTFeatureExtractor from .models.speech_to_text import Speech2TextFeatureExtractor @@ -3845,8 +3842,8 @@ from .image_processing_utils import ImageProcessingMixin from .image_transforms import rescale, resize, to_pil_image from .image_utils import ImageFeatureExtractionMixin - from .models.beit import BeitFeatureExtractor - from .models.clip import CLIPFeatureExtractor + from .models.beit import BeitFeatureExtractor, BeitImageProcessor + from .models.clip import CLIPFeatureExtractor, CLIPImageProcessor from .models.conditional_detr import ConditionalDetrFeatureExtractor from .models.convnext import ConvNextFeatureExtractor, ConvNextImageProcessor from .models.deformable_detr import DeformableDetrFeatureExtractor @@ -3865,12 +3862,12 @@ from .models.mobilenet_v2 import 
MobileNetV2FeatureExtractor, MobileNetV2ImageProcessor from .models.mobilevit import MobileViTFeatureExtractor, MobileViTImageProcessor from .models.owlvit import OwlViTFeatureExtractor - from .models.perceiver import PerceiverFeatureExtractor - from .models.poolformer import PoolFormerFeatureExtractor - from .models.segformer import SegformerFeatureExtractor - from .models.videomae import VideoMAEFeatureExtractor - from .models.vilt import ViltFeatureExtractor, ViltProcessor - from .models.vit import ViTFeatureExtractor + from .models.perceiver import PerceiverFeatureExtractor, PerceiverImageProcessor + from .models.poolformer import PoolFormerFeatureExtractor, PoolFormerImageProcessor + from .models.segformer import SegformerFeatureExtractor, SegformerImageProcessor + from .models.videomae import VideoMAEFeatureExtractor, VideoMAEImageProcessor + from .models.vilt import ViltFeatureExtractor, ViltImageProcessor, ViltProcessor + from .models.vit import ViTFeatureExtractor, ViTImageProcessor from .models.yolos import YolosFeatureExtractor # Modeling @@ -3959,8 +3956,6 @@ top_k_top_p_filtering, ) from .modeling_utils import PreTrainedModel - - # PyTorch model imports from .models.albert import ( ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, AlbertForMaskedLM, @@ -3973,8 +3968,11 @@ AlbertPreTrainedModel, load_tf_weights_in_albert, ) + + # PyTorch model imports from .models.audio_spectrogram_transformer import ( AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + AudioSpectrogramTransformerFeatureExtractor, AudioSpectrogramTransformerForSequenceClassification, AudioSpectrogramTransformerModel, AudioSpectrogramTransformerPreTrainedModel, diff --git a/src/transformers/models/audio_spectrogram_transformer/__init__.py b/src/transformers/models/audio_spectrogram_transformer/__init__.py index 6fe7b0ff2f26fd..51c1199e85021a 100644 --- a/src/transformers/models/audio_spectrogram_transformer/__init__.py +++ b/src/transformers/models/audio_spectrogram_transformer/__init__.py @@ -2,7 +2,7 @@ # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. -# Copyright 2022 The HuggingFace Team. All rights reserved. +# Copyright 2021 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,19 +17,18 @@ # limitations under the License. 
from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_speech_available, is_torch_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available _import_structure = { "configuration_audio_spectrogram_transformer": [ "AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "AudioSpectrogramTransformerConfig", - "AudioSpectrogramTransformerOnnxConfig", ] } try: - if not is_speech_available(): + if not is_torch_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: pass @@ -37,41 +36,26 @@ _import_structure["feature_extraction_audio_spectrogram_transformer"] = [ "AudioSpectrogramTransformerFeatureExtractor" ] - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: _import_structure["modeling_audio_spectrogram_transformer"] = [ "AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "AudioSpectrogramTransformerForSequenceClassification", "AudioSpectrogramTransformerModel", "AudioSpectrogramTransformerPreTrainedModel", - "AudioSpectrogramTransformerForSequenceClassification", ] if TYPE_CHECKING: from .configuration_audio_spectrogram_transformer import ( AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, AudioSpectrogramTransformerConfig, - AudioSpectrogramTransformerOnnxConfig, ) - try: - if not is_speech_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .feature_extraction_audio_spectrogram_transformer import AudioSpectrogramTransformerFeatureExtractor - try: if not is_torch_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: pass else: + from .feature_extraction_audio_spectrogram_transformer import AudioSpectrogramTransformerFeatureExtractor from .modeling_audio_spectrogram_transformer import ( AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, AudioSpectrogramTransformerForSequenceClassification, @@ -79,6 +63,7 @@ AudioSpectrogramTransformerPreTrainedModel, ) + else: import sys diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index cff49eb0302b29..bf98edaadaaf0e 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -353,6 +353,13 @@ def load_tf_weights_in_albert(*args, **kwargs): AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None +class AudioSpectrogramTransformerFeatureExtractor(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class AudioSpectrogramTransformerForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/transformers/utils/dummy_speech_objects.py b/src/transformers/utils/dummy_speech_objects.py index cbe700be1bc8b9..ae5589292a4cf9 100644 --- a/src/transformers/utils/dummy_speech_objects.py +++ b/src/transformers/utils/dummy_speech_objects.py @@ -3,13 +3,6 @@ from ..utils import DummyObject, requires_backends -class AudioSpectrogramTransformerFeatureExtractor(metaclass=DummyObject): - _backends = ["speech"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["speech"]) - - class MCTCTFeatureExtractor(metaclass=DummyObject): _backends = ["speech"] diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index d4226677e44ff6..7ce1f1867057f3 100644 --- 
a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -36,6 +36,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class BeitImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class CLIPFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] @@ -43,6 +50,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class CLIPImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class ConditionalDetrFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] @@ -267,6 +281,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class PerceiverImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class PoolFormerFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] @@ -274,6 +295,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class PoolFormerImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class SegformerFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] @@ -281,6 +309,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class SegformerImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class VideoMAEFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] @@ -288,6 +323,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class VideoMAEImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class ViltFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] @@ -295,6 +337,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class ViltImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class ViltProcessor(metaclass=DummyObject): _backends = ["vision"] @@ -309,6 +358,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class ViTImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class YolosFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] From a0e7d506dce40e7558122f51354a6fe28711d4ca Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 15 Nov 2022 09:22:13 +0100 Subject: [PATCH 23/37] Rename model to AST --- .../audio-spectrogram-transformer.mdx | 18 +-- src/transformers/__init__.py | 20 +-- .../audio_spectrogram_transformer/__init__.py | 22 ++-- ...iguration_audio_spectrogram_transformer.py | 19 ++- ...trogram_transformer_original_to_pytorch.py | 12 +- ...xtraction_audio_spectrogram_transformer.py | 4 +- .../modeling_audio_spectrogram_transformer.py | 115 +++++++++--------- .../models/auto/configuration_auto.py | 2 +- .../models/auto/feature_extraction_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 4 +- src/transformers/utils/dummy_pt_objects.py | 8 +- ...xtraction_audio_spectrogram_transformer.py | 12 +- 
..._modeling_audio_spectrogram_transformer.py | 42 +++---- 13 files changed, 134 insertions(+), 146 deletions(-) diff --git a/docs/source/en/model_doc/audio-spectrogram-transformer.mdx b/docs/source/en/model_doc/audio-spectrogram-transformer.mdx index e43fe23b145880..ff28e9f30564bb 100644 --- a/docs/source/en/model_doc/audio-spectrogram-transformer.mdx +++ b/docs/source/en/model_doc/audio-spectrogram-transformer.mdx @@ -25,7 +25,7 @@ The abstract from the paper is the following: Tips: - When fine-tuning the Audio Spectrogram Transformer (AST) on your own dataset, it's recommended to take care of the input normalization (to make -sure the input has mean of 0 and std of 0.5). [`AudioSpectrogramTransformerFeatureExtractor`] takes care of this. Note that it uses the AudioSet +sure the input has mean of 0 and std of 0.5). [`ASTFeatureExtractor`] takes care of this. Note that it uses the AudioSet mean and std by default. You can check [`ast/src/get_norm_stats.py`](https://github.com/YuanGongND/ast/blob/master/src/get_norm_stats.py) to see how the authors compute the stats for a downstream dataset. - Note that the AST needs a low learning rate (the authors use a 10 times smaller learning rate compared to their CNN model proposed in the @@ -40,21 +40,21 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/YuanGongND/ast). -## AudioSpectrogramTransformerConfig +## ASTConfig -[[autodoc]] AudioSpectrogramTransformerConfig +[[autodoc]] ASTConfig -## AudioSpectrogramTransformerFeatureExtractor +## ASTFeatureExtractor -[[autodoc]] AudioSpectrogramTransformerFeatureExtractor +[[autodoc]] ASTFeatureExtractor - __call__ -## AudioSpectrogramTransformerModel +## ASTModel -[[autodoc]] AudioSpectrogramTransformerModel +[[autodoc]] ASTModel - forward -## AudioSpectrogramTransformerForSequenceClassification +## ASTForSequenceClassification -[[autodoc]] AudioSpectrogramTransformerForSequenceClassification +[[autodoc]] ASTForSequenceClassification - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 8cb739049d6108..aca91d7a75f762 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -124,7 +124,7 @@ "models.albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig"], "models.audio_spectrogram_transformer": [ "AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", - "AudioSpectrogramTransformerConfig", + "ASTConfig", ], "models.auto": [ "ALL_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -860,10 +860,10 @@ _import_structure["models.audio_spectrogram_transformer"].extend( [ "AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", - "AudioSpectrogramTransformerForSequenceClassification", - "AudioSpectrogramTransformerModel", - "AudioSpectrogramTransformerPreTrainedModel", - "AudioSpectrogramTransformerFeatureExtractor", + "ASTForSequenceClassification", + "ASTModel", + "ASTPreTrainedModel", + "ASTFeatureExtractor", ] ) _import_structure["models.albert"].extend( @@ -3315,7 +3315,7 @@ from .models.albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig from .models.audio_spectrogram_transformer import ( AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, - AudioSpectrogramTransformerConfig, + ASTConfig, ) from .models.auto import ( ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -3972,10 +3972,10 @@ # PyTorch model imports from .models.audio_spectrogram_transformer import ( AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, - 
AudioSpectrogramTransformerFeatureExtractor, - AudioSpectrogramTransformerForSequenceClassification, - AudioSpectrogramTransformerModel, - AudioSpectrogramTransformerPreTrainedModel, + ASTFeatureExtractor, + ASTForSequenceClassification, + ASTModel, + ASTPreTrainedModel, ) from .models.auto import ( MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING, diff --git a/src/transformers/models/audio_spectrogram_transformer/__init__.py b/src/transformers/models/audio_spectrogram_transformer/__init__.py index 51c1199e85021a..255dd1dfe9ed8e 100644 --- a/src/transformers/models/audio_spectrogram_transformer/__init__.py +++ b/src/transformers/models/audio_spectrogram_transformer/__init__.py @@ -23,7 +23,7 @@ _import_structure = { "configuration_audio_spectrogram_transformer": [ "AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", - "AudioSpectrogramTransformerConfig", + "ASTConfig", ] } @@ -33,20 +33,18 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["feature_extraction_audio_spectrogram_transformer"] = [ - "AudioSpectrogramTransformerFeatureExtractor" - ] + _import_structure["feature_extraction_audio_spectrogram_transformer"] = ["ASTFeatureExtractor"] _import_structure["modeling_audio_spectrogram_transformer"] = [ "AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", - "AudioSpectrogramTransformerForSequenceClassification", - "AudioSpectrogramTransformerModel", - "AudioSpectrogramTransformerPreTrainedModel", + "ASTForSequenceClassification", + "ASTModel", + "ASTPreTrainedModel", ] if TYPE_CHECKING: from .configuration_audio_spectrogram_transformer import ( AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, - AudioSpectrogramTransformerConfig, + ASTConfig, ) try: @@ -55,12 +53,12 @@ except OptionalDependencyNotAvailable: pass else: - from .feature_extraction_audio_spectrogram_transformer import AudioSpectrogramTransformerFeatureExtractor + from .feature_extraction_audio_spectrogram_transformer import ASTFeatureExtractor from .modeling_audio_spectrogram_transformer import ( AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, - AudioSpectrogramTransformerForSequenceClassification, - AudioSpectrogramTransformerModel, - AudioSpectrogramTransformerPreTrainedModel, + ASTForSequenceClassification, + ASTModel, + ASTPreTrainedModel, ) diff --git a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py index 97b99a963db716..b9a2f4afd9f4f1 100644 --- a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" AudioSpectrogramTransformer model configuration""" +""" Audio Spectogram Transformer (AST) model configuration""" from ...configuration_utils import PretrainedConfig @@ -28,12 +28,11 @@ } -class AudioSpectrogramTransformerConfig(PretrainedConfig): +class ASTConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`AudioSpectrogramTransformerModel`]. It is used to - instantiate an AudioSpectrogramTransformer model according to the specified arguments, defining the model - architecture. 
Instantiating a configuration with the defaults will yield a similar configuration to that of the - AudioSpectrogramTransformer + This is the configuration class to store the configuration of a [`ASTModel`]. It is used to instantiate an AST + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the AST [MIT/ast-finetuned-audioset-10-10-0.4593](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593) architecture. @@ -76,13 +75,13 @@ class AudioSpectrogramTransformerConfig(PretrainedConfig): Example: ```python - >>> from transformers import AudioSpectrogramTransformerConfig, AudioSpectrogramTransformerModel + >>> from transformers import ASTConfig, ASTModel - >>> # Initializing a AudioSpectrogramTransformer MIT/ast-finetuned-audioset-10-10-0.4593 style configuration - >>> configuration = AudioSpectrogramTransformerConfig() + >>> # Initializing a AST MIT/ast-finetuned-audioset-10-10-0.4593 style configuration + >>> configuration = ASTConfig() >>> # Initializing a model (with random weights) from the MIT/ast-finetuned-audioset-10-10-0.4593 style configuration - >>> model = AudioSpectrogramTransformerModel(configuration) + >>> model = ASTModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config diff --git a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py index eef8d472aff8fc..b28f9a660e1ce8 100644 --- a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py +++ b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py @@ -24,11 +24,7 @@ from datasets import load_dataset from huggingface_hub import hf_hub_download -from transformers import ( - AudioSpectrogramTransformerConfig, - AudioSpectrogramTransformerFeatureExtractor, - AudioSpectrogramTransformerForSequenceClassification, -) +from transformers import ASTConfig, ASTFeatureExtractor, ASTForSequenceClassification from transformers.utils import logging @@ -37,7 +33,7 @@ def get_audio_spectrogram_transformer_config(model_name): - config = AudioSpectrogramTransformerConfig() + config = ASTConfig() if "10-10" in model_name: pass @@ -197,7 +193,7 @@ def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_fo new_state_dict = convert_state_dict(state_dict, config) # load ๐Ÿค— model - model = AudioSpectrogramTransformerForSequenceClassification(config) + model = ASTForSequenceClassification(config) model.eval() model.load_state_dict(new_state_dict) @@ -206,7 +202,7 @@ def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_fo # source: https://github.com/YuanGongND/ast/blob/79e873b8a54d0a3b330dd522584ff2b9926cd581/src/run.py#L62 mean = -4.2677393 if "speech-commands" not in model_name else -6.845978 std = 4.5689974 if "speech-commands" not in model_name else 5.5654526 - feature_extractor = AudioSpectrogramTransformerFeatureExtractor(mean=mean, std=std) + feature_extractor = ASTFeatureExtractor(mean=mean, std=std) if "speech-commands" in model_name: dataset = load_dataset("speech_commands", "v0.02", split="validation") diff --git a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py 
b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py index da1eff71925993..9cb6d1885dafe5 100644 --- a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py @@ -30,7 +30,7 @@ logger = logging.get_logger(__name__) -class AudioSpectrogramTransformerFeatureExtractor(SequenceFeatureExtractor): +class ASTFeatureExtractor(SequenceFeatureExtractor): r""" Constructs a Audio Spectrogram Transformer feature extractor. @@ -55,7 +55,7 @@ class AudioSpectrogramTransformerFeatureExtractor(SequenceFeatureExtractor): The standard deviation value used to normalize the log-Mel features. Uses the AudioSet standard deviation by default. return_attention_mask (`bool`, *optional*, defaults to `False`): - Whether or not [`~AudioSpectrogramTransformerFeatureExtractor.__call__`] should return `attention_mask`. + Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`. """ model_input_names = ["input_values", "attention_mask"] diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index 81512d420bb5e5..30223928459c40 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Audio Spectrogram Transformer model.""" +""" PyTorch Audio Spectrogram Transformer (AST) model.""" import math from typing import Dict, List, Optional, Set, Tuple, Union @@ -27,14 +27,14 @@ from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging -from .configuration_audio_spectrogram_transformer import AudioSpectrogramTransformerConfig +from .configuration_audio_spectrogram_transformer import ASTConfig logger = logging.get_logger(__name__) # General docstring -_CONFIG_FOR_DOC = "AudioSpectrogramTransformerConfig" -_FEAT_EXTRACTOR_FOR_DOC = "AudioSpectrogramTransformerFeatureExtractor" +_CONFIG_FOR_DOC = "ASTConfig" +_FEAT_EXTRACTOR_FOR_DOC = "ASTFeatureExtractor" # Base docstring _CHECKPOINT_FOR_DOC = "MIT/ast-finetuned-audioset-10-10-0.4593" @@ -52,17 +52,17 @@ ] -class AudioSpectrogramTransformerEmbeddings(nn.Module): +class ASTEmbeddings(nn.Module): """ Construct the CLS token, position and patch embeddings. 
""" - def __init__(self, config: AudioSpectrogramTransformerConfig) -> None: + def __init__(self, config: ASTConfig) -> None: super().__init__() self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) self.distillation_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) - self.patch_embeddings = AudioSpectrogramTransformerPatchEmbeddings(config) + self.patch_embeddings = ASTPatchEmbeddings(config) frequency_out_dimension, time_out_dimension = self.get_shape(config) num_patches = frequency_out_dimension * time_out_dimension @@ -91,7 +91,7 @@ def forward(self, input_values: torch.Tensor) -> torch.Tensor: return embeddings -class AudioSpectrogramTransformerPatchEmbeddings(nn.Module): +class ASTPatchEmbeddings(nn.Module): """ This class turns `input_values` of shape `(batch_size, num_channels, height, width)` into the initial `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a @@ -116,9 +116,9 @@ def forward(self, input_values: torch.Tensor) -> torch.Tensor: return embeddings -# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->AudioSpectrogramTransformer -class AudioSpectrogramTransformerSelfAttention(nn.Module): - def __init__(self, config: AudioSpectrogramTransformerConfig) -> None: +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->AST +class ASTSelfAttention(nn.Module): + def __init__(self, config: ASTConfig) -> None: super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -177,14 +177,14 @@ def forward( return outputs -# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->AudioSpectrogramTransformer -class AudioSpectrogramTransformerSelfOutput(nn.Module): +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->AST +class ASTSelfOutput(nn.Module): """ - The residual connection is defined in AudioSpectrogramTransformerLayer instead of here (as is the case with other - models), due to the layernorm applied before each block. + The residual connection is defined in ASTLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. 
""" - def __init__(self, config: AudioSpectrogramTransformerConfig) -> None: + def __init__(self, config: ASTConfig) -> None: super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.dropout = nn.Dropout(config.hidden_dropout_prob) @@ -197,12 +197,12 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->AudioSpectrogramTransformer -class AudioSpectrogramTransformerAttention(nn.Module): - def __init__(self, config: AudioSpectrogramTransformerConfig) -> None: +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->AST +class ASTAttention(nn.Module): + def __init__(self, config: ASTConfig) -> None: super().__init__() - self.attention = AudioSpectrogramTransformerSelfAttention(config) - self.output = AudioSpectrogramTransformerSelfOutput(config) + self.attention = ASTSelfAttention(config) + self.output = ASTSelfOutput(config) self.pruned_heads = set() def prune_heads(self, heads: Set[int]) -> None: @@ -237,9 +237,9 @@ def forward( return outputs -# Copied from transformers.models.vit.modeling_vit.ViTIntermediate with ViT->AudioSpectrogramTransformer -class AudioSpectrogramTransformerIntermediate(nn.Module): - def __init__(self, config: AudioSpectrogramTransformerConfig) -> None: +# Copied from transformers.models.vit.modeling_vit.ViTIntermediate with ViT->AST +class ASTIntermediate(nn.Module): + def __init__(self, config: ASTConfig) -> None: super().__init__() self.dense = nn.Linear(config.hidden_size, config.intermediate_size) if isinstance(config.hidden_act, str): @@ -255,9 +255,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -# Copied from transformers.models.vit.modeling_vit.ViTOutput with ViT->AudioSpectrogramTransformer -class AudioSpectrogramTransformerOutput(nn.Module): - def __init__(self, config: AudioSpectrogramTransformerConfig) -> None: +# Copied from transformers.models.vit.modeling_vit.ViTOutput with ViT->AST +class ASTOutput(nn.Module): + def __init__(self, config: ASTConfig) -> None: super().__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) self.dropout = nn.Dropout(config.hidden_dropout_prob) @@ -271,17 +271,17 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->AudioSpectrogramTransformer -class AudioSpectrogramTransformerLayer(nn.Module): +# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->AST +class ASTLayer(nn.Module): """This corresponds to the Block class in the timm implementation.""" - def __init__(self, config: AudioSpectrogramTransformerConfig) -> None: + def __init__(self, config: ASTConfig) -> None: super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 - self.attention = AudioSpectrogramTransformerAttention(config) - self.intermediate = AudioSpectrogramTransformerIntermediate(config) - self.output = AudioSpectrogramTransformerOutput(config) + self.attention = ASTAttention(config) + self.intermediate = ASTIntermediate(config) + self.output = ASTOutput(config) self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -292,9 +292,7 @@ def forward( output_attentions: bool = False, ) -> Union[Tuple[torch.Tensor, torch.Tensor], 
Tuple[torch.Tensor]]: self_attention_outputs = self.attention( - self.layernorm_before( - hidden_states - ), # in AudioSpectrogramTransformer, layernorm is applied before self-attention + self.layernorm_before(hidden_states), # in AST, layernorm is applied before self-attention head_mask, output_attentions=output_attentions, ) @@ -304,7 +302,7 @@ def forward( # first residual connection hidden_states = attention_output + hidden_states - # in AudioSpectrogramTransformer, layernorm is also applied after self-attention + # in AST, layernorm is also applied after self-attention layer_output = self.layernorm_after(hidden_states) layer_output = self.intermediate(layer_output) @@ -316,12 +314,12 @@ def forward( return outputs -# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->AudioSpectrogramTransformer -class AudioSpectrogramTransformerEncoder(nn.Module): - def __init__(self, config: AudioSpectrogramTransformerConfig) -> None: +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->AST +class ASTEncoder(nn.Module): + def __init__(self, config: ASTConfig) -> None: super().__init__() self.config = config - self.layer = nn.ModuleList([AudioSpectrogramTransformerLayer(config) for _ in range(config.num_hidden_layers)]) + self.layer = nn.ModuleList([ASTLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( @@ -374,13 +372,13 @@ def custom_forward(*inputs): ) -class AudioSpectrogramTransformerPreTrainedModel(PreTrainedModel): +class ASTPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ - config_class = AudioSpectrogramTransformerConfig + config_class = ASTConfig base_model_prefix = "audio_spectrogram_transformer" main_input_name = "input_values" supports_gradient_checkpointing = True @@ -396,9 +394,9 @@ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> No module.bias.data.zero_() module.weight.data.fill_(1.0) - # Copied from transformers.models.vit.modeling_vit.ViTPreTrainedModel._set_gradient_checkpointing with ViT->AudioSpectrogramTransformer - def _set_gradient_checkpointing(self, module: AudioSpectrogramTransformerEncoder, value: bool = False) -> None: - if isinstance(module, AudioSpectrogramTransformerEncoder): + # Copied from transformers.models.vit.modeling_vit.ViTPreTrainedModel._set_gradient_checkpointing with ViT->AST + def _set_gradient_checkpointing(self, module: ASTEncoder, value: bool = False) -> None: + if isinstance(module, ASTEncoder): module.gradient_checkpointing = value @@ -408,7 +406,7 @@ def _set_gradient_checkpointing(self, module: AudioSpectrogramTransformerEncoder behavior. Parameters: - config ([`AudioSpectrogramTransformerConfig`]): + config ([`ASTConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. @@ -417,8 +415,8 @@ def _set_gradient_checkpointing(self, module: AudioSpectrogramTransformerEncoder AUDIO_SPECTROGRAM_TRANSFORMER_INPUTS_DOCSTRING = r""" Args: input_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`AudioSpectrogramTransformerFeatureExtractor`]. See - [`AudioSpectrogramTransformerFeatureExtractor.__call__`] for details. + Pixel values. 
Pixel values can be obtained using [`ASTFeatureExtractor`]. See + [`ASTFeatureExtractor.__call__`] for details. head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: @@ -438,24 +436,23 @@ def _set_gradient_checkpointing(self, module: AudioSpectrogramTransformerEncoder @add_start_docstrings( - "The bare AudioSpectrogramTransformer Model transformer outputting raw hidden-states without any specific head on" - " top.", + "The bare AST Model transformer outputting raw hidden-states without any specific head on top.", AUDIO_SPECTROGRAM_TRANSFORMER_START_DOCSTRING, ) -class AudioSpectrogramTransformerModel(AudioSpectrogramTransformerPreTrainedModel): - def __init__(self, config: AudioSpectrogramTransformerConfig): +class ASTModel(ASTPreTrainedModel): + def __init__(self, config: ASTConfig): super().__init__(config) self.config = config - self.embeddings = AudioSpectrogramTransformerEmbeddings(config) - self.encoder = AudioSpectrogramTransformerEncoder(config) + self.embeddings = ASTEmbeddings(config) + self.encoder = ASTEncoder(config) self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self) -> AudioSpectrogramTransformerPatchEmbeddings: + def get_input_embeddings(self) -> ASTPatchEmbeddings: return self.embeddings.patch_embeddings def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None: @@ -531,12 +528,12 @@ def forward( """, AUDIO_SPECTROGRAM_TRANSFORMER_START_DOCSTRING, ) -class AudioSpectrogramTransformerForSequenceClassification(AudioSpectrogramTransformerPreTrainedModel): - def __init__(self, config: AudioSpectrogramTransformerConfig) -> None: +class ASTForSequenceClassification(ASTPreTrainedModel): + def __init__(self, config: ASTConfig) -> None: super().__init__(config) self.num_labels = config.num_labels - self.audio_spectrogram_transformer = AudioSpectrogramTransformerModel(config) + self.audio_spectrogram_transformer = ASTModel(config) # Classifier head self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 5e772a95b627f9..0ff1dd5671e304 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -30,7 +30,7 @@ [ # Add configs here ("albert", "AlbertConfig"), - ("audio-spectrogram-transformer", "AudioSpectrogramTransformerConfig"), + ("audio-spectrogram-transformer", "ASTConfig"), ("bart", "BartConfig"), ("beit", "BeitConfig"), ("bert", "BertConfig"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 80af08d4df94fd..3deb1d838473cb 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -37,7 +37,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict( [ - ("audio-spectrogram-transformer", "AudioSpectrogramTransformerFeatureExtractor"), + ("audio-spectrogram-transformer", "ASTFeatureExtractor"), ("beit", "BeitFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), ("clipseg", "ViTFeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 7c18d45680787a..4133ab750da2ed 100644 --- 
a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -29,7 +29,7 @@ [ # Base model mapping ("albert", "AlbertModel"), - ("audio-spectrogram-transformer", "AudioSpectrogramTransformerModel"), + ("audio-spectrogram-transformer", "ASTModel"), ("bart", "BartModel"), ("beit", "BeitModel"), ("bert", "BertModel"), @@ -785,7 +785,7 @@ MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Audio Classification mapping - ("audio-spectrogram-transformer", "AudioSpectrogramTransformerForSequenceClassification"), + ("audio-spectrogram-transformer", "ASTForSequenceClassification"), ("data2vec-audio", "Data2VecAudioForSequenceClassification"), ("hubert", "HubertForSequenceClassification"), ("sew", "SEWForSequenceClassification"), diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index bf98edaadaaf0e..db5870d7fe63d6 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -353,28 +353,28 @@ def load_tf_weights_in_albert(*args, **kwargs): AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None -class AudioSpectrogramTransformerFeatureExtractor(metaclass=DummyObject): +class ASTFeatureExtractor(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class AudioSpectrogramTransformerForSequenceClassification(metaclass=DummyObject): +class ASTForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class AudioSpectrogramTransformerModel(metaclass=DummyObject): +class ASTModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class AudioSpectrogramTransformerPreTrainedModel(metaclass=DummyObject): +class ASTPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py index 9543e3632fa120..e5828d8ff3bda6 100644 --- a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py @@ -20,7 +20,7 @@ import numpy as np -from transformers import AudioSpectrogramTransformerFeatureExtractor +from transformers import ASTFeatureExtractor from transformers.testing_utils import require_torch from transformers.utils.import_utils import is_torch_available @@ -47,7 +47,7 @@ def floats_list(shape, scale=1.0, rng=None, name=None): return values -class AudioSpectrogramTransformerFeatureExtractionTester(unittest.TestCase): +class ASTFeatureExtractionTester(unittest.TestCase): def __init__( self, parent, @@ -99,12 +99,12 @@ def _flatten(list_of_lists): return speech_inputs -class AudioSpectrogramTransformerFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): +class ASTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = AudioSpectrogramTransformerFeatureExtractor + feature_extraction_class = ASTFeatureExtractor def setUp(self): - self.feat_extract_tester = AudioSpectrogramTransformerFeatureExtractionTester(self) + self.feat_extract_tester = 
ASTFeatureExtractionTester(self) def test_call(self): # Tests that all call wrap to encode_plus and batch_encode_plus @@ -159,6 +159,6 @@ def test_integration(self): # fmt: on input_speech = self._load_datasamples(1) - feaure_extractor = AudioSpectrogramTransformerFeatureExtractor() + feaure_extractor = ASTFeatureExtractor() input_values = feaure_extractor(input_speech, return_tensors="pt").input_values self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4)) diff --git a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py index 779b5b2335832b..8119d639eae025 100644 --- a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py @@ -18,7 +18,7 @@ import unittest from huggingface_hub import hf_hub_download -from transformers import AudioSpectrogramTransformerConfig +from transformers import ASTConfig from transformers.testing_utils import require_torch, require_torchaudio, slow, torch_device from transformers.utils import cached_property, is_torch_available, is_torchaudio_available @@ -30,7 +30,7 @@ import torch from torch import nn - from transformers import AudioSpectrogramTransformerForSequenceClassification, AudioSpectrogramTransformerModel + from transformers import ASTForSequenceClassification, ASTModel from transformers.models.audio_spectrogram_transformer.modeling_audio_spectrogram_transformer import ( AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, ) @@ -39,10 +39,10 @@ if is_torchaudio_available(): import torchaudio - from transformers import AudioSpectrogramTransformerFeatureExtractor + from transformers import ASTFeatureExtractor -class AudioSpectrogramTransformerModelTester: +class ASTModelTester: def __init__( self, parent, @@ -85,7 +85,7 @@ def __init__( self.frequency_stride = frequency_stride self.time_stride = time_stride - # in AudioSpectrogramTransformer, the seq length equals the number of patches + 2 (we add 2 for the [CLS] and distillation tokens) + # in AST, the seq length equals the number of patches + 2 (we add 2 for the [CLS] and distillation tokens) frequency_out_dimension = (self.frequency_dimension - self.patch_size) // self.frequency_stride + 1 time_out_dimension = (self.time_dimension - self.patch_size) // self.time_stride + 1 num_patches = frequency_out_dimension * time_out_dimension @@ -103,7 +103,7 @@ def prepare_config_and_inputs(self): return config, input_values, labels def get_config(self): - return AudioSpectrogramTransformerConfig( + return ASTConfig( patch_size=self.patch_size, time_dimension=self.time_dimension, frequency_dimension=self.frequency_dimension, @@ -121,7 +121,7 @@ def get_config(self): ) def create_and_check_model(self, config, input_values, labels): - model = AudioSpectrogramTransformerModel(config=config) + model = ASTModel(config=config) model.to(torch_device) model.eval() result = model(input_values) @@ -139,16 +139,16 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class AudioSpectrogramTransformerModelTest(ModelTesterMixin, unittest.TestCase): +class ASTModelTest(ModelTesterMixin, unittest.TestCase): """ - Here we also overwrite some of the tests of test_modeling_common.py, as AudioSpectrogramTransformer does not use input_ids, inputs_embeds, + Here we also overwrite some of the tests of test_modeling_common.py, 
as AST does not use input_ids, inputs_embeds, attention_mask and seq_length. """ all_model_classes = ( ( - AudioSpectrogramTransformerModel, - AudioSpectrogramTransformerForSequenceClassification, + ASTModel, + ASTForSequenceClassification, ) if is_torch_available() else () @@ -159,15 +159,13 @@ class AudioSpectrogramTransformerModelTest(ModelTesterMixin, unittest.TestCase): test_head_masking = False def setUp(self): - self.model_tester = AudioSpectrogramTransformerModelTester(self) - self.config_tester = ConfigTester( - self, config_class=AudioSpectrogramTransformerConfig, has_text_modality=False, hidden_size=37 - ) + self.model_tester = ASTModelTester(self) + self.config_tester = ConfigTester(self, config_class=ASTConfig, has_text_modality=False, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() - @unittest.skip(reason="AudioSpectrogramTransformer does not use inputs_embeds") + @unittest.skip(reason="AST does not use inputs_embeds") def test_inputs_embeds(self): pass @@ -199,7 +197,7 @@ def test_model(self): @slow def test_model_from_pretrained(self): for model_name in AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = AudioSpectrogramTransformerModel.from_pretrained(model_name) + model = ASTModel.from_pretrained(model_name) self.assertIsNotNone(model) @@ -216,11 +214,11 @@ def prepare_audio(): @require_torch @require_torchaudio -class AudioSpectrogramTransformerModelIntegrationTest(unittest.TestCase): +class ASTModelIntegrationTest(unittest.TestCase): @cached_property def default_feature_extractor(self): return ( - AudioSpectrogramTransformerFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593") + ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593") if is_torchaudio_available() else None ) @@ -229,9 +227,9 @@ def default_feature_extractor(self): def test_inference_audio_classification(self): feature_extractor = self.default_feature_extractor - model = AudioSpectrogramTransformerForSequenceClassification.from_pretrained( - "MIT/ast-finetuned-audioset-10-10-0.4593" - ).to(torch_device) + model = ASTForSequenceClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593").to( + torch_device + ) feature_extractor = self.default_feature_extractor audio, sampling_rate = prepare_audio() From 19cf9f6bd0e29669634e8dd260f36ad921df56a8 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 15 Nov 2022 15:45:49 +0100 Subject: [PATCH 24/37] Add require_torchaudio annotator --- .../test_feature_extraction_audio_spectrogram_transformer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py index e5828d8ff3bda6..cf6bb1d27f7920 100644 --- a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py @@ -21,7 +21,7 @@ import numpy as np from transformers import ASTFeatureExtractor -from transformers.testing_utils import require_torch +from transformers.testing_utils import require_torch, require_torchaudio from transformers.utils.import_utils import is_torch_available from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin @@ -99,6 +99,8 @@ def _flatten(list_of_lists): return speech_inputs +@require_torch 
+@require_torchaudio class ASTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): feature_extraction_class = ASTFeatureExtractor From 6588ebaa7334de7a7aae964fdddf2305cc43efc7 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 15 Nov 2022 15:58:37 +0100 Subject: [PATCH 25/37] Move import of ASTFeatureExtractor under a is_speech_available --- src/transformers/__init__.py | 4 ++-- .../audio_spectrogram_transformer/__init__.py | 20 ++++++++++++++++--- src/transformers/utils/dummy_pt_objects.py | 7 ------- .../utils/dummy_speech_objects.py | 7 +++++++ 4 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index aca91d7a75f762..1e942449544cff 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -677,6 +677,7 @@ name for name in dir(dummy_speech_objects) if not name.startswith("_") ] else: + _import_structure["models.audio_spectrogram_transformer"].append("ASTFeatureExtractor") _import_structure["models.mctct"].append("MCTCTFeatureExtractor") _import_structure["models.speech_to_text"].append("Speech2TextFeatureExtractor") @@ -863,7 +864,6 @@ "ASTForSequenceClassification", "ASTModel", "ASTPreTrainedModel", - "ASTFeatureExtractor", ] ) _import_structure["models.albert"].extend( @@ -3814,6 +3814,7 @@ except OptionalDependencyNotAvailable: from .utils.dummy_speech_objects import * else: + from .models.audio_spectrogram_transformer import ASTFeatureExtractor from .models.mctct import MCTCTFeatureExtractor from .models.speech_to_text import Speech2TextFeatureExtractor @@ -3972,7 +3973,6 @@ # PyTorch model imports from .models.audio_spectrogram_transformer import ( AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, - ASTFeatureExtractor, ASTForSequenceClassification, ASTModel, ASTPreTrainedModel, diff --git a/src/transformers/models/audio_spectrogram_transformer/__init__.py b/src/transformers/models/audio_spectrogram_transformer/__init__.py index 255dd1dfe9ed8e..008b43aea90f79 100644 --- a/src/transformers/models/audio_spectrogram_transformer/__init__.py +++ b/src/transformers/models/audio_spectrogram_transformer/__init__.py @@ -17,7 +17,7 @@ # limitations under the License. 
from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_speech_available, is_torch_available _import_structure = { @@ -33,7 +33,6 @@ except OptionalDependencyNotAvailable: pass else: - _import_structure["feature_extraction_audio_spectrogram_transformer"] = ["ASTFeatureExtractor"] _import_structure["modeling_audio_spectrogram_transformer"] = [ "AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", "ASTForSequenceClassification", @@ -41,6 +40,14 @@ "ASTPreTrainedModel", ] +try: + if not is_speech_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["feature_extraction_audio_spectrogram_transformer"] = ["ASTFeatureExtractor"] + if TYPE_CHECKING: from .configuration_audio_spectrogram_transformer import ( AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -53,7 +60,6 @@ except OptionalDependencyNotAvailable: pass else: - from .feature_extraction_audio_spectrogram_transformer import ASTFeatureExtractor from .modeling_audio_spectrogram_transformer import ( AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, ASTForSequenceClassification, @@ -61,6 +67,14 @@ ASTPreTrainedModel, ) + try: + if not is_speech_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .feature_extraction_audio_spectrogram_transformer import ASTFeatureExtractor + else: import sys diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index db5870d7fe63d6..ae12190ca4be88 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -353,13 +353,6 @@ def load_tf_weights_in_albert(*args, **kwargs): AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None -class ASTFeatureExtractor(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class ASTForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/transformers/utils/dummy_speech_objects.py b/src/transformers/utils/dummy_speech_objects.py index ae5589292a4cf9..d1929dd2853b1b 100644 --- a/src/transformers/utils/dummy_speech_objects.py +++ b/src/transformers/utils/dummy_speech_objects.py @@ -3,6 +3,13 @@ from ..utils import DummyObject, requires_backends +class ASTFeatureExtractor(metaclass=DummyObject): + _backends = ["speech"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["speech"]) + + class MCTCTFeatureExtractor(metaclass=DummyObject): _backends = ["speech"] From 3de16865fc837aae1557538c133b9575c2768b09 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 15 Nov 2022 16:05:19 +0100 Subject: [PATCH 26/37] Fix rebase --- .../modeling_audio_spectrogram_transformer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index 30223928459c40..2f23fcc019a0b6 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -387,7 +387,11 @@ class ASTPreTrainedModel(PreTrainedModel): def _init_weights(self, 
module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: """Initialize the weights""" if isinstance(module, (nn.Linear, nn.Conv2d)): - module.weight.data = nn.init.trunc_normal_(module.weight.data, mean=0.0, std=self.config.initializer_range) + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.LayerNorm): From 3b3679780d9876f29d874d32b946148a960e3871 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 15 Nov 2022 16:49:47 +0100 Subject: [PATCH 27/37] Add pipeline config --- .../modeling_audio_spectrogram_transformer.py | 5 ++--- .../test_modeling_audio_spectrogram_transformer.py | 6 ++++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index 2f23fcc019a0b6..aa119dab345652 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -93,9 +93,8 @@ def forward(self, input_values: torch.Tensor) -> torch.Tensor: class ASTPatchEmbeddings(nn.Module): """ - This class turns `input_values` of shape `(batch_size, num_channels, height, width)` into the initial - `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a - Transformer. + This class turns `input_values` into the initial `hidden_states` (patch embeddings) of shape `(batch_size, + seq_length, hidden_size)` to be consumed by a Transformer. 
""" def __init__(self, config): diff --git a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py index 8119d639eae025..54e091ee8781a8 100644 --- a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py @@ -120,6 +120,12 @@ def get_config(self): time_stride=self.time_stride, ) + def get_pipeline_config(self): + config = self.get_config() + config.time_dimension = 1024 + config.frequency_dimension = 128 + return config + def create_and_check_model(self, config, input_values, labels): model = ASTModel(config=config) model.to(torch_device) From 0a8afcae81427a957038f11e28d2dd981d0f0325 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 17 Nov 2022 15:44:16 +0100 Subject: [PATCH 28/37] Update name of classifier head --- ...ectrogram_transformer_original_to_pytorch.py | 4 ++-- .../modeling_audio_spectrogram_transformer.py | 17 +++++++++++++---- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py index b28f9a660e1ce8..48519d68cbc90e 100644 --- a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py +++ b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py @@ -98,9 +98,9 @@ def rename_key(name): name = name.replace("audio_spectrogram_transformer.norm", "audio_spectrogram_transformer.layernorm") # classifier head if "module.mlp_head.0" in name: - name = name.replace("module.mlp_head.0", "layernorm") + name = name.replace("module.mlp_head.0", "classifier.layernorm") if "module.mlp_head.1" in name: - name = name.replace("module.mlp_head.1", "classifier") + name = name.replace("module.mlp_head.1", "classifier.dense") return name diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index aa119dab345652..7974b9e523fe79 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -524,6 +524,18 @@ def forward( ) +class ASTMLPHead(nn.Module): + def __init__(self, config: ASTConfig): + super().__init__() + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dense = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + + def forward(self, hidden_state): + hidden_state = self.layernorm(hidden_state) + hidden_state = self.dense(hidden_state) + return hidden_state + + @add_start_docstrings( """ Audio Spectrogram Transformer model with an audio classification head on top (a linear layer on top of the pooled @@ -539,8 +551,7 @@ def __init__(self, config: ASTConfig) -> None: self.audio_spectrogram_transformer = ASTModel(config) # Classifier head - self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else 
nn.Identity() + self.classifier = ASTMLPHead(config) # Initialize weights and apply final processing self.post_init() @@ -581,8 +592,6 @@ def forward( ) pooled_output = outputs[1] - - pooled_output = self.layernorm(pooled_output) logits = self.classifier(pooled_output) loss = None From 4deea23b398f9644f1c49053cd205c8b0b2889d4 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 17 Nov 2022 16:23:20 +0100 Subject: [PATCH 29/37] Rename time_dimension and frequency_dimension for clarity --- ...iguration_audio_spectrogram_transformer.py | 14 ++++++------ .../modeling_audio_spectrogram_transformer.py | 6 +++-- .../audio_spectrogram_transformer/test.py | 20 +++++++++++++++++ ..._modeling_audio_spectrogram_transformer.py | 22 +++++++++---------- 4 files changed, 42 insertions(+), 20 deletions(-) create mode 100644 src/transformers/models/audio_spectrogram_transformer/test.py diff --git a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py index b9a2f4afd9f4f1..19f85189ad0dbd 100644 --- a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py @@ -67,10 +67,10 @@ class ASTConfig(PretrainedConfig): Frequency stride to use when patchifying the spectrograms. time_stride (`int`, *optional*, defaults to 10): Temporal stride to use when patchifying the spectrograms. - time_dimension (`int`, *optional*, defaults to 1024): + max_length (`int`, *optional*, defaults to 1024): Temporal dimension of the spectrograms. - frequency_dimension (`int`, *optional*, defaults to 128): - Frequency dimension of the spectrograms. + num_mel_bins (`int`, *optional*, defaults to 128): + Frequency dimension of the spectrograms (number of Mel-frequency bins). 
Example: @@ -103,8 +103,8 @@ def __init__( qkv_bias=True, frequency_stride=10, time_stride=10, - time_dimension=1024, - frequency_dimension=128, + max_length=1024, + num_mel_bins=128, **kwargs ): super().__init__(**kwargs) @@ -122,5 +122,5 @@ def __init__( self.qkv_bias = qkv_bias self.frequency_stride = frequency_stride self.time_stride = time_stride - self.time_dimension = time_dimension - self.frequency_dimension = frequency_dimension + self.max_length = max_length + self.num_mel_bins = num_mel_bins diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index 7974b9e523fe79..21bb88e4fe67b7 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -73,8 +73,8 @@ def __init__(self, config: ASTConfig) -> None: def get_shape(self, config): # see Karpathy's cs231n blog on how to calculate the output dimensions # https://cs231n.github.io/convolutional-networks/#conv - frequency_out_dimension = (config.frequency_dimension - config.patch_size) // config.frequency_stride + 1 - time_out_dimension = (config.time_dimension - config.patch_size) // config.time_stride + 1 + frequency_out_dimension = (config.num_mel_bins - config.patch_size) // config.frequency_stride + 1 + time_out_dimension = (config.max_length - config.patch_size) // config.time_stride + 1 return frequency_out_dimension, time_out_dimension @@ -583,6 +583,8 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict + print("Shape of input_values: ", input_values.shape) + outputs = self.audio_spectrogram_transformer( input_values, head_mask=head_mask, diff --git a/src/transformers/models/audio_spectrogram_transformer/test.py b/src/transformers/models/audio_spectrogram_transformer/test.py new file mode 100644 index 00000000000000..511fbd771c2e26 --- /dev/null +++ b/src/transformers/models/audio_spectrogram_transformer/test.py @@ -0,0 +1,20 @@ +from huggingface_hub import hf_hub_download +import torchaudio + +from transformers import ASTFeatureExtractor + +filepath = hf_hub_download( + repo_id="nielsr/audio-spectogram-transformer-checkpoint", + filename="sample_audio.flac", + repo_type="dataset", +) + +waveform, _ = torchaudio.load(filepath) +waveform = waveform.squeeze().numpy() + +max_length = 24 +feature_extractor = ASTFeatureExtractor(num_mel_bins=16) +inputs = feature_extractor(waveform, sampling_rate=16000, max_length=max_length, return_tensors="pt") + +for k,v in inputs.items(): + print(k,v.shape) \ No newline at end of file diff --git a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py index 54e091ee8781a8..fb41c4534ae3ca 100644 --- a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py @@ -48,8 +48,8 @@ def __init__( parent, batch_size=13, patch_size=2, - time_dimension=24, - frequency_dimension=16, + max_length=24, + num_mel_bins=16, is_training=True, use_labels=True, hidden_size=32, @@ -68,8 +68,8 @@ def __init__( self.parent = parent self.batch_size = batch_size self.patch_size = patch_size - self.time_dimension = 
time_dimension - self.frequency_dimension = frequency_dimension + self.max_length = max_length + self.num_mel_bins = num_mel_bins self.is_training = is_training self.use_labels = use_labels self.hidden_size = hidden_size @@ -86,13 +86,13 @@ def __init__( self.time_stride = time_stride # in AST, the seq length equals the number of patches + 2 (we add 2 for the [CLS] and distillation tokens) - frequency_out_dimension = (self.frequency_dimension - self.patch_size) // self.frequency_stride + 1 - time_out_dimension = (self.time_dimension - self.patch_size) // self.time_stride + 1 + frequency_out_dimension = (self.num_mel_bins - self.patch_size) // self.frequency_stride + 1 + time_out_dimension = (self.max_length - self.patch_size) // self.time_stride + 1 num_patches = frequency_out_dimension * time_out_dimension self.seq_length = num_patches + 2 def prepare_config_and_inputs(self): - input_values = floats_tensor([self.batch_size, self.time_dimension, self.frequency_dimension]) + input_values = floats_tensor([self.batch_size, self.max_length, self.num_mel_bins]) labels = None if self.use_labels: @@ -105,8 +105,8 @@ def prepare_config_and_inputs(self): def get_config(self): return ASTConfig( patch_size=self.patch_size, - time_dimension=self.time_dimension, - frequency_dimension=self.frequency_dimension, + max_length=self.max_length, + num_mel_bins=self.num_mel_bins, hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, @@ -122,8 +122,8 @@ def get_config(self): def get_pipeline_config(self): config = self.get_config() - config.time_dimension = 1024 - config.frequency_dimension = 128 + config.max_length = 1024 + config.num_mel_bins = 128 return config def create_and_check_model(self, config, input_values, labels): From 8113ed12d0d93213be6463bb1edcbfea947382da Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 17 Nov 2022 16:56:58 +0100 Subject: [PATCH 30/37] Remove print statement --- .../modeling_audio_spectrogram_transformer.py | 2 -- .../models/audio_spectrogram_transformer/test.py | 7 ++++--- .../test_modeling_audio_spectrogram_transformer.py | 6 ------ 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index 21bb88e4fe67b7..9c0483047a7a8d 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -583,8 +583,6 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict - print("Shape of input_values: ", input_values.shape) - outputs = self.audio_spectrogram_transformer( input_values, head_mask=head_mask, diff --git a/src/transformers/models/audio_spectrogram_transformer/test.py b/src/transformers/models/audio_spectrogram_transformer/test.py index 511fbd771c2e26..e45fe9aa5ece9e 100644 --- a/src/transformers/models/audio_spectrogram_transformer/test.py +++ b/src/transformers/models/audio_spectrogram_transformer/test.py @@ -1,8 +1,9 @@ -from huggingface_hub import hf_hub_download import torchaudio +from huggingface_hub import hf_hub_download from transformers import ASTFeatureExtractor + filepath = hf_hub_download( repo_id="nielsr/audio-spectogram-transformer-checkpoint", filename="sample_audio.flac", @@ -16,5 +17,5 @@ 
feature_extractor = ASTFeatureExtractor(num_mel_bins=16) inputs = feature_extractor(waveform, sampling_rate=16000, max_length=max_length, return_tensors="pt") -for k,v in inputs.items(): - print(k,v.shape) \ No newline at end of file +for k, v in inputs.items(): + print(k, v.shape) diff --git a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py index fb41c4534ae3ca..347f1f6e948837 100644 --- a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py @@ -120,12 +120,6 @@ def get_config(self): time_stride=self.time_stride, ) - def get_pipeline_config(self): - config = self.get_config() - config.max_length = 1024 - config.num_mel_bins = 128 - return config - def create_and_check_model(self, config, input_values, labels): model = ASTModel(config=config) model.to(torch_device) From 3154869b43920dfd67161caa97d4fb5c032e94df Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 18 Nov 2022 09:30:52 +0100 Subject: [PATCH 31/37] Fix pipeline test --- ..._extraction_audio_spectrogram_transformer.py | 17 +++++++++-------- tests/pipelines/test_pipelines_common.py | 4 ++++ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py index 9cb6d1885dafe5..73041b7ae48ab8 100644 --- a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py @@ -32,13 +32,13 @@ class ASTFeatureExtractor(SequenceFeatureExtractor): r""" - Constructs a Audio Spectrogram Transformer feature extractor. + Constructs a Audio Spectrogram Transformer (AST) feature extractor. This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. - This class extracts mel-filter bank features from raw speech using TorchAudio and applies utterance-level cepstral - mean and variance normalization to the extracted features. + This class extracts mel-filter bank features from raw speech using TorchAudio, pads/truncates them to a fixed + length and normalizes them using a mean and standard deviation. Args: feature_size (`int`, *optional*, defaults to 1): @@ -47,6 +47,8 @@ class ASTFeatureExtractor(SequenceFeatureExtractor): The sampling rate at which the audio files should be digitalized expressed in Hertz per second (Hz). num_mel_bins (`int`, *optional*, defaults to 128): Number of Mel-frequency bins. + max_length (`int`, *optional*, defaults to 1024): + Maximum length to which to pad/truncate the extracted features. do_normalize (`bool`, *optional*, defaults to `True`): Whether or not to normalize the log-Mel features using `mean` and `std`. 
mean (`float`, *optional*, defaults to -4.2677393): @@ -65,6 +67,7 @@ def __init__( feature_size=1, sampling_rate=16000, num_mel_bins=128, + max_length=1024, padding_value=0.0, do_normalize=True, mean=-4.2677393, @@ -74,6 +77,7 @@ def __init__( ): super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) self.num_mel_bins = num_mel_bins + self.max_length = max_length self.do_normalize = do_normalize self.mean = mean self.std = std @@ -121,7 +125,6 @@ def normalize(self, input_values: np.ndarray) -> np.ndarray: def __call__( self, raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]], - max_length: int = 1024, sampling_rate: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs @@ -133,8 +136,6 @@ def __call__( raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`): The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float values, a list of numpy arrays or a list of list of float values. - max_length (`int`, *optional*, defaults to 1024): - Maximum length of the returned list and optionally padding length (see above). sampling_rate (`int`, *optional*): The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass `sampling_rate` at the forward call to prevent silent errors. @@ -175,8 +176,8 @@ def __call__( if not is_batched: raw_speech = [raw_speech] - # extract fbank features (padded/truncated to max_length) - features = [self._extract_fbank_features(waveform, max_length=max_length) for waveform in raw_speech] + # extract fbank features and pad/truncate to max_length + features = [self._extract_fbank_features(waveform, max_length=self.max_length) for waveform in raw_speech] # convert into BatchFeature padded_inputs = BatchFeature({"input_values": features}) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 492a63a4ccd0fd..1e7a4f44ac99d8 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -155,6 +155,10 @@ def get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config, feature_ if hasattr(tiny_config, "image_size") and feature_extractor: feature_extractor = feature_extractor.__class__(size=tiny_config.image_size, crop_size=tiny_config.image_size) + # Audio Spectogram Transformer specific. + if feature_extractor.__class__.__name__ == "ASTFeatureExtractor": + feature_extractor = feature_extractor.__class__(max_length=24, num_mel_bins=16) + # Speech2TextModel specific. if hasattr(tiny_config, "input_feat_per_channel") and feature_extractor: feature_extractor = feature_extractor.__class__( From 4282f68e3c585d1b3c454bbf11eb57633e61847e Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 18 Nov 2022 14:15:12 +0100 Subject: [PATCH 32/37] Fix pipeline test --- tests/pipelines/test_pipelines_common.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 1e7a4f44ac99d8..5593da273bddc9 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -157,7 +157,9 @@ def get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config, feature_ # Audio Spectogram Transformer specific. 
if feature_extractor.__class__.__name__ == "ASTFeatureExtractor": - feature_extractor = feature_extractor.__class__(max_length=24, num_mel_bins=16) + feature_extractor = feature_extractor.__class__( + max_length=tiny_config.max_length, num_mel_bins=tiny_config.num_mel_bins + ) # Speech2TextModel specific. if hasattr(tiny_config, "input_feat_per_channel") and feature_extractor: From 7e38d962b2f580cb0af7ac7ea56e746efbc0c2cc Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 21 Nov 2022 11:13:50 +0100 Subject: [PATCH 33/37] Fix index table --- docs/source/en/index.mdx | 291 ++++++++++++++++++++------------------- 1 file changed, 146 insertions(+), 145 deletions(-) diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 3dcea7e85888b0..b202278890dfe6 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -217,150 +217,151 @@ Flax), PyTorch, and/or TensorFlow. -| Model | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support | -|:---------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:| -| ALBERT | โœ… | โœ… | โœ… | โœ… | โœ… | -| BART | โœ… | โœ… | โœ… | โœ… | โœ… | -| BEiT | โŒ | โŒ | โœ… | โŒ | โœ… | -| BERT | โœ… | โœ… | โœ… | โœ… | โœ… | -| Bert Generation | โœ… | โŒ | โœ… | โŒ | โŒ | -| BigBird | โœ… | โœ… | โœ… | โŒ | โœ… | -| BigBird-Pegasus | โŒ | โŒ | โœ… | โŒ | โŒ | -| Blenderbot | โœ… | โœ… | โœ… | โœ… | โœ… | -| BlenderbotSmall | โœ… | โœ… | โœ… | โœ… | โœ… | -| BLOOM | โŒ | โœ… | โœ… | โŒ | โŒ | -| CamemBERT | โœ… | โœ… | โœ… | โœ… | โŒ | -| CANINE | โœ… | โŒ | โœ… | โŒ | โŒ | -| CLIP | โœ… | โœ… | โœ… | โœ… | โœ… | -| CLIPSeg | โŒ | โŒ | โœ… | โŒ | โŒ | -| CodeGen | โœ… | โœ… | โœ… | โŒ | โŒ | -| Conditional DETR | โŒ | โŒ | โœ… | โŒ | โŒ | -| ConvBERT | โœ… | โœ… | โœ… | โœ… | โŒ | -| ConvNeXT | โŒ | โŒ | โœ… | โœ… | โŒ | -| CTRL | โœ… | โŒ | โœ… | โœ… | โŒ | -| CvT | โŒ | โŒ | โœ… | โœ… | โŒ | -| Data2VecAudio | โŒ | โŒ | โœ… | โŒ | โŒ | -| Data2VecText | โŒ | โŒ | โœ… | โŒ | โŒ | -| Data2VecVision | โŒ | โŒ | โœ… | โœ… | โŒ | -| DeBERTa | โœ… | โœ… | โœ… | โœ… | โŒ | -| DeBERTa-v2 | โœ… | โœ… | โœ… | โœ… | โŒ | -| Decision Transformer | โŒ | โŒ | โœ… | โŒ | โŒ | -| Deformable DETR | โŒ | โŒ | โœ… | โŒ | โŒ | -| DeiT | โŒ | โŒ | โœ… | โœ… | โŒ | -| DETR | โŒ | โŒ | โœ… | โŒ | โŒ | -| DiNAT | โŒ | โŒ | โœ… | โŒ | โŒ | -| DistilBERT | โœ… | โœ… | โœ… | โœ… | โœ… | -| DonutSwin | โŒ | โŒ | โœ… | โŒ | โŒ | -| DPR | โœ… | โœ… | โœ… | โœ… | โŒ | -| DPT | โŒ | โŒ | โœ… | โŒ | โŒ | -| ELECTRA | โœ… | โœ… | โœ… | โœ… | โœ… | -| Encoder decoder | โŒ | โŒ | โœ… | โœ… | โœ… | -| ERNIE | โŒ | โŒ | โœ… | โŒ | โŒ | -| ESM | โœ… | โŒ | โœ… | โœ… | โŒ | -| FairSeq Machine-Translation | โœ… | โŒ | โœ… | โŒ | โŒ | -| FlauBERT | โœ… | โŒ | โœ… | โœ… | โŒ | -| FLAVA | โŒ | โŒ | โœ… | โŒ | โŒ | -| FNet | โœ… | โœ… | โœ… | โŒ | โŒ | -| Funnel Transformer | โœ… | โœ… | โœ… | โœ… | โŒ | -| GLPN | โŒ | โŒ | โœ… | โŒ | โŒ | -| GPT Neo | โŒ | โŒ | โœ… | โŒ | โœ… | -| GPT NeoX | โŒ | โœ… | โœ… | โŒ | โŒ | -| GPT NeoX Japanese | โœ… | โŒ | โœ… | โŒ | โŒ | -| GPT-J | โŒ | โŒ | โœ… | โœ… | โœ… | -| GroupViT | โŒ | โŒ | โœ… | โœ… | โŒ | -| Hubert | โŒ | โŒ | โœ… | โœ… | โŒ | -| I-BERT | โŒ | โŒ | โœ… | โŒ | โŒ | -| ImageGPT | โŒ | โŒ | โœ… | โŒ | โŒ | -| Jukebox | โœ… | โŒ | โœ… | โŒ | โŒ | -| LayoutLM | โœ… | โœ… | โœ… | โœ… | โŒ | -| LayoutLMv2 | โœ… | โœ… 
| โœ… | โŒ | โŒ | -| LayoutLMv3 | โœ… | โœ… | โœ… | โœ… | โŒ | -| LED | โœ… | โœ… | โœ… | โœ… | โŒ | -| LeViT | โŒ | โŒ | โœ… | โŒ | โŒ | -| LiLT | โŒ | โŒ | โœ… | โŒ | โŒ | -| Longformer | โœ… | โœ… | โœ… | โœ… | โŒ | -| LongT5 | โŒ | โŒ | โœ… | โŒ | โœ… | -| LUKE | โœ… | โŒ | โœ… | โŒ | โŒ | -| LXMERT | โœ… | โœ… | โœ… | โœ… | โŒ | -| M-CTC-T | โŒ | โŒ | โœ… | โŒ | โŒ | -| M2M100 | โœ… | โŒ | โœ… | โŒ | โŒ | -| Marian | โœ… | โŒ | โœ… | โœ… | โœ… | -| MarkupLM | โœ… | โœ… | โœ… | โŒ | โŒ | -| MaskFormer | โŒ | โŒ | โœ… | โŒ | โŒ | -| mBART | โœ… | โœ… | โœ… | โœ… | โœ… | -| Megatron-BERT | โŒ | โŒ | โœ… | โŒ | โŒ | -| MobileBERT | โœ… | โœ… | โœ… | โœ… | โŒ | -| MobileNetV1 | โŒ | โŒ | โœ… | โŒ | โŒ | -| MobileNetV2 | โŒ | โŒ | โœ… | โŒ | โŒ | -| MobileViT | โŒ | โŒ | โœ… | โœ… | โŒ | -| MPNet | โœ… | โœ… | โœ… | โœ… | โŒ | -| MT5 | โœ… | โœ… | โœ… | โœ… | โœ… | -| MVP | โœ… | โœ… | โœ… | โŒ | โŒ | -| NAT | โŒ | โŒ | โœ… | โŒ | โŒ | -| Nezha | โŒ | โŒ | โœ… | โŒ | โŒ | -| Nystrรถmformer | โŒ | โŒ | โœ… | โŒ | โŒ | -| OpenAI GPT | โœ… | โœ… | โœ… | โœ… | โŒ | -| OpenAI GPT-2 | โœ… | โœ… | โœ… | โœ… | โœ… | -| OPT | โŒ | โŒ | โœ… | โœ… | โœ… | -| OWL-ViT | โŒ | โŒ | โœ… | โŒ | โŒ | -| Pegasus | โœ… | โœ… | โœ… | โœ… | โœ… | -| PEGASUS-X | โŒ | โŒ | โœ… | โŒ | โŒ | -| Perceiver | โœ… | โŒ | โœ… | โŒ | โŒ | -| PLBart | โœ… | โŒ | โœ… | โŒ | โŒ | -| PoolFormer | โŒ | โŒ | โœ… | โŒ | โŒ | -| ProphetNet | โœ… | โŒ | โœ… | โŒ | โŒ | -| QDQBert | โŒ | โŒ | โœ… | โŒ | โŒ | -| RAG | โœ… | โŒ | โœ… | โœ… | โŒ | -| REALM | โœ… | โœ… | โœ… | โŒ | โŒ | -| Reformer | โœ… | โœ… | โœ… | โŒ | โŒ | -| RegNet | โŒ | โŒ | โœ… | โœ… | โŒ | -| RemBERT | โœ… | โœ… | โœ… | โœ… | โŒ | -| ResNet | โŒ | โŒ | โœ… | โœ… | โŒ | -| RetriBERT | โœ… | โœ… | โœ… | โŒ | โŒ | -| RoBERTa | โœ… | โœ… | โœ… | โœ… | โœ… | -| RoCBert | โœ… | โŒ | โœ… | โŒ | โŒ | -| RoFormer | โœ… | โœ… | โœ… | โœ… | โœ… | -| SegFormer | โŒ | โŒ | โœ… | โœ… | โŒ | -| SEW | โŒ | โŒ | โœ… | โŒ | โŒ | -| SEW-D | โŒ | โŒ | โœ… | โŒ | โŒ | -| Speech Encoder decoder | โŒ | โŒ | โœ… | โŒ | โœ… | -| Speech2Text | โœ… | โŒ | โœ… | โœ… | โŒ | -| Speech2Text2 | โœ… | โŒ | โŒ | โŒ | โŒ | -| Splinter | โœ… | โœ… | โœ… | โŒ | โŒ | -| SqueezeBERT | โœ… | โœ… | โœ… | โŒ | โŒ | -| Swin Transformer | โŒ | โŒ | โœ… | โœ… | โŒ | -| Swin Transformer V2 | โŒ | โŒ | โœ… | โŒ | โŒ | -| SwitchTransformers | โŒ | โŒ | โœ… | โŒ | โŒ | -| T5 | โœ… | โœ… | โœ… | โœ… | โœ… | -| Table Transformer | โŒ | โŒ | โœ… | โŒ | โŒ | -| TAPAS | โœ… | โŒ | โœ… | โœ… | โŒ | -| Time Series Transformer | โŒ | โŒ | โœ… | โŒ | โŒ | -| Trajectory Transformer | โŒ | โŒ | โœ… | โŒ | โŒ | -| Transformer-XL | โœ… | โŒ | โœ… | โœ… | โŒ | -| TrOCR | โŒ | โŒ | โœ… | โŒ | โŒ | -| UniSpeech | โŒ | โŒ | โœ… | โŒ | โŒ | -| UniSpeechSat | โŒ | โŒ | โœ… | โŒ | โŒ | -| VAN | โŒ | โŒ | โœ… | โŒ | โŒ | -| VideoMAE | โŒ | โŒ | โœ… | โŒ | โŒ | -| ViLT | โŒ | โŒ | โœ… | โŒ | โŒ | -| Vision Encoder decoder | โŒ | โŒ | โœ… | โœ… | โœ… | -| VisionTextDualEncoder | โŒ | โŒ | โœ… | โŒ | โœ… | -| VisualBERT | โŒ | โŒ | โœ… | โŒ | โŒ | -| ViT | โŒ | โŒ | โœ… | โœ… | โœ… | -| ViTMAE | โŒ | โŒ | โœ… | โœ… | โŒ | -| ViTMSN | โŒ | โŒ | โœ… | โŒ | โŒ | -| Wav2Vec2 | โœ… | โŒ | โœ… | โœ… | โœ… | -| Wav2Vec2-Conformer | โŒ | โŒ | โœ… | โŒ | โŒ | -| WavLM | โŒ | โŒ | โœ… | โŒ | โŒ | -| Whisper | โœ… | โŒ | โœ… | โœ… | 
โŒ | -| X-CLIP | โŒ | โŒ | โœ… | โŒ | โŒ | -| XGLM | โœ… | โœ… | โœ… | โœ… | โœ… | -| XLM | โœ… | โŒ | โœ… | โœ… | โŒ | -| XLM-ProphetNet | โœ… | โŒ | โœ… | โŒ | โŒ | -| XLM-RoBERTa | โœ… | โœ… | โœ… | โœ… | โœ… | -| XLM-RoBERTa-XL | โŒ | โŒ | โœ… | โŒ | โŒ | -| XLNet | โœ… | โœ… | โœ… | โœ… | โŒ | -| YOLOS | โŒ | โŒ | โœ… | โŒ | โŒ | -| YOSO | โŒ | โŒ | โœ… | โŒ | โŒ | +| Model | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support | +|:-----------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:| +| ALBERT | โœ… | โœ… | โœ… | โœ… | โœ… | +| Audio Spectrogram Transformer | โŒ | โŒ | โœ… | โŒ | โŒ | +| BART | โœ… | โœ… | โœ… | โœ… | โœ… | +| BEiT | โŒ | โŒ | โœ… | โŒ | โœ… | +| BERT | โœ… | โœ… | โœ… | โœ… | โœ… | +| Bert Generation | โœ… | โŒ | โœ… | โŒ | โŒ | +| BigBird | โœ… | โœ… | โœ… | โŒ | โœ… | +| BigBird-Pegasus | โŒ | โŒ | โœ… | โŒ | โŒ | +| Blenderbot | โœ… | โœ… | โœ… | โœ… | โœ… | +| BlenderbotSmall | โœ… | โœ… | โœ… | โœ… | โœ… | +| BLOOM | โŒ | โœ… | โœ… | โŒ | โŒ | +| CamemBERT | โœ… | โœ… | โœ… | โœ… | โŒ | +| CANINE | โœ… | โŒ | โœ… | โŒ | โŒ | +| CLIP | โœ… | โœ… | โœ… | โœ… | โœ… | +| CLIPSeg | โŒ | โŒ | โœ… | โŒ | โŒ | +| CodeGen | โœ… | โœ… | โœ… | โŒ | โŒ | +| Conditional DETR | โŒ | โŒ | โœ… | โŒ | โŒ | +| ConvBERT | โœ… | โœ… | โœ… | โœ… | โŒ | +| ConvNeXT | โŒ | โŒ | โœ… | โœ… | โŒ | +| CTRL | โœ… | โŒ | โœ… | โœ… | โŒ | +| CvT | โŒ | โŒ | โœ… | โœ… | โŒ | +| Data2VecAudio | โŒ | โŒ | โœ… | โŒ | โŒ | +| Data2VecText | โŒ | โŒ | โœ… | โŒ | โŒ | +| Data2VecVision | โŒ | โŒ | โœ… | โœ… | โŒ | +| DeBERTa | โœ… | โœ… | โœ… | โœ… | โŒ | +| DeBERTa-v2 | โœ… | โœ… | โœ… | โœ… | โŒ | +| Decision Transformer | โŒ | โŒ | โœ… | โŒ | โŒ | +| Deformable DETR | โŒ | โŒ | โœ… | โŒ | โŒ | +| DeiT | โŒ | โŒ | โœ… | โœ… | โŒ | +| DETR | โŒ | โŒ | โœ… | โŒ | โŒ | +| DiNAT | โŒ | โŒ | โœ… | โŒ | โŒ | +| DistilBERT | โœ… | โœ… | โœ… | โœ… | โœ… | +| DonutSwin | โŒ | โŒ | โœ… | โŒ | โŒ | +| DPR | โœ… | โœ… | โœ… | โœ… | โŒ | +| DPT | โŒ | โŒ | โœ… | โŒ | โŒ | +| ELECTRA | โœ… | โœ… | โœ… | โœ… | โœ… | +| Encoder decoder | โŒ | โŒ | โœ… | โœ… | โœ… | +| ERNIE | โŒ | โŒ | โœ… | โŒ | โŒ | +| ESM | โœ… | โŒ | โœ… | โœ… | โŒ | +| FairSeq Machine-Translation | โœ… | โŒ | โœ… | โŒ | โŒ | +| FlauBERT | โœ… | โŒ | โœ… | โœ… | โŒ | +| FLAVA | โŒ | โŒ | โœ… | โŒ | โŒ | +| FNet | โœ… | โœ… | โœ… | โŒ | โŒ | +| Funnel Transformer | โœ… | โœ… | โœ… | โœ… | โŒ | +| GLPN | โŒ | โŒ | โœ… | โŒ | โŒ | +| GPT Neo | โŒ | โŒ | โœ… | โŒ | โœ… | +| GPT NeoX | โŒ | โœ… | โœ… | โŒ | โŒ | +| GPT NeoX Japanese | โœ… | โŒ | โœ… | โŒ | โŒ | +| GPT-J | โŒ | โŒ | โœ… | โœ… | โœ… | +| GroupViT | โŒ | โŒ | โœ… | โœ… | โŒ | +| Hubert | โŒ | โŒ | โœ… | โœ… | โŒ | +| I-BERT | โŒ | โŒ | โœ… | โŒ | โŒ | +| ImageGPT | โŒ | โŒ | โœ… | โŒ | โŒ | +| Jukebox | โœ… | โŒ | โœ… | โŒ | โŒ | +| LayoutLM | โœ… | โœ… | โœ… | โœ… | โŒ | +| LayoutLMv2 | โœ… | โœ… | โœ… | โŒ | โŒ | +| LayoutLMv3 | โœ… | โœ… | โœ… | โœ… | โŒ | +| LED | โœ… | โœ… | โœ… | โœ… | โŒ | +| LeViT | โŒ | โŒ | โœ… | โŒ | โŒ | +| LiLT | โŒ | โŒ | โœ… | โŒ | โŒ | +| Longformer | โœ… | โœ… | โœ… | โœ… | โŒ | +| LongT5 | โŒ | โŒ | โœ… | โŒ | โœ… | +| LUKE | โœ… | โŒ | โœ… | โŒ | โŒ | +| LXMERT | โœ… | โœ… | โœ… | โœ… | โŒ | +| M-CTC-T | โŒ | โŒ | โœ… | โŒ | โŒ | +| M2M100 | โœ… | โŒ | โœ… | โŒ | โŒ 
| +| Marian | โœ… | โŒ | โœ… | โœ… | โœ… | +| MarkupLM | โœ… | โœ… | โœ… | โŒ | โŒ | +| MaskFormer | โŒ | โŒ | โœ… | โŒ | โŒ | +| mBART | โœ… | โœ… | โœ… | โœ… | โœ… | +| Megatron-BERT | โŒ | โŒ | โœ… | โŒ | โŒ | +| MobileBERT | โœ… | โœ… | โœ… | โœ… | โŒ | +| MobileNetV1 | โŒ | โŒ | โœ… | โŒ | โŒ | +| MobileNetV2 | โŒ | โŒ | โœ… | โŒ | โŒ | +| MobileViT | โŒ | โŒ | โœ… | โœ… | โŒ | +| MPNet | โœ… | โœ… | โœ… | โœ… | โŒ | +| MT5 | โœ… | โœ… | โœ… | โœ… | โœ… | +| MVP | โœ… | โœ… | โœ… | โŒ | โŒ | +| NAT | โŒ | โŒ | โœ… | โŒ | โŒ | +| Nezha | โŒ | โŒ | โœ… | โŒ | โŒ | +| Nystrรถmformer | โŒ | โŒ | โœ… | โŒ | โŒ | +| OpenAI GPT | โœ… | โœ… | โœ… | โœ… | โŒ | +| OpenAI GPT-2 | โœ… | โœ… | โœ… | โœ… | โœ… | +| OPT | โŒ | โŒ | โœ… | โœ… | โœ… | +| OWL-ViT | โŒ | โŒ | โœ… | โŒ | โŒ | +| Pegasus | โœ… | โœ… | โœ… | โœ… | โœ… | +| PEGASUS-X | โŒ | โŒ | โœ… | โŒ | โŒ | +| Perceiver | โœ… | โŒ | โœ… | โŒ | โŒ | +| PLBart | โœ… | โŒ | โœ… | โŒ | โŒ | +| PoolFormer | โŒ | โŒ | โœ… | โŒ | โŒ | +| ProphetNet | โœ… | โŒ | โœ… | โŒ | โŒ | +| QDQBert | โŒ | โŒ | โœ… | โŒ | โŒ | +| RAG | โœ… | โŒ | โœ… | โœ… | โŒ | +| REALM | โœ… | โœ… | โœ… | โŒ | โŒ | +| Reformer | โœ… | โœ… | โœ… | โŒ | โŒ | +| RegNet | โŒ | โŒ | โœ… | โœ… | โŒ | +| RemBERT | โœ… | โœ… | โœ… | โœ… | โŒ | +| ResNet | โŒ | โŒ | โœ… | โœ… | โŒ | +| RetriBERT | โœ… | โœ… | โœ… | โŒ | โŒ | +| RoBERTa | โœ… | โœ… | โœ… | โœ… | โœ… | +| RoCBert | โœ… | โŒ | โœ… | โŒ | โŒ | +| RoFormer | โœ… | โœ… | โœ… | โœ… | โœ… | +| SegFormer | โŒ | โŒ | โœ… | โœ… | โŒ | +| SEW | โŒ | โŒ | โœ… | โŒ | โŒ | +| SEW-D | โŒ | โŒ | โœ… | โŒ | โŒ | +| Speech Encoder decoder | โŒ | โŒ | โœ… | โŒ | โœ… | +| Speech2Text | โœ… | โŒ | โœ… | โœ… | โŒ | +| Speech2Text2 | โœ… | โŒ | โŒ | โŒ | โŒ | +| Splinter | โœ… | โœ… | โœ… | โŒ | โŒ | +| SqueezeBERT | โœ… | โœ… | โœ… | โŒ | โŒ | +| Swin Transformer | โŒ | โŒ | โœ… | โœ… | โŒ | +| Swin Transformer V2 | โŒ | โŒ | โœ… | โŒ | โŒ | +| SwitchTransformers | โŒ | โŒ | โœ… | โŒ | โŒ | +| T5 | โœ… | โœ… | โœ… | โœ… | โœ… | +| Table Transformer | โŒ | โŒ | โœ… | โŒ | โŒ | +| TAPAS | โœ… | โŒ | โœ… | โœ… | โŒ | +| Time Series Transformer | โŒ | โŒ | โœ… | โŒ | โŒ | +| Trajectory Transformer | โŒ | โŒ | โœ… | โŒ | โŒ | +| Transformer-XL | โœ… | โŒ | โœ… | โœ… | โŒ | +| TrOCR | โŒ | โŒ | โœ… | โŒ | โŒ | +| UniSpeech | โŒ | โŒ | โœ… | โŒ | โŒ | +| UniSpeechSat | โŒ | โŒ | โœ… | โŒ | โŒ | +| VAN | โŒ | โŒ | โœ… | โŒ | โŒ | +| VideoMAE | โŒ | โŒ | โœ… | โŒ | โŒ | +| ViLT | โŒ | โŒ | โœ… | โŒ | โŒ | +| Vision Encoder decoder | โŒ | โŒ | โœ… | โœ… | โœ… | +| VisionTextDualEncoder | โŒ | โŒ | โœ… | โŒ | โœ… | +| VisualBERT | โŒ | โŒ | โœ… | โŒ | โŒ | +| ViT | โŒ | โŒ | โœ… | โœ… | โœ… | +| ViTMAE | โŒ | โŒ | โœ… | โœ… | โŒ | +| ViTMSN | โŒ | โŒ | โœ… | โŒ | โŒ | +| Wav2Vec2 | โœ… | โŒ | โœ… | โœ… | โœ… | +| Wav2Vec2-Conformer | โŒ | โŒ | โœ… | โŒ | โŒ | +| WavLM | โŒ | โŒ | โœ… | โŒ | โŒ | +| Whisper | โœ… | โŒ | โœ… | โœ… | โŒ | +| X-CLIP | โŒ | โŒ | โœ… | โŒ | โŒ | +| XGLM | โœ… | โœ… | โœ… | โœ… | โœ… | +| XLM | โœ… | โŒ | โœ… | โœ… | โŒ | +| XLM-ProphetNet | โœ… | โŒ | โœ… | โŒ | โŒ | +| XLM-RoBERTa | โœ… | โœ… | โœ… | โœ… | โœ… | +| XLM-RoBERTa-XL | โŒ | โŒ | โœ… | โŒ | โŒ | +| XLNet | โœ… | โœ… | โœ… | โœ… | โŒ | +| YOLOS | โŒ | โŒ | โœ… | โŒ | โŒ | +| YOSO | โŒ | โŒ | โœ… | โŒ | โŒ | From 
519481cf6d388670267697832ab785e9ad3cba8b Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 21 Nov 2022 11:29:57 +0100 Subject: [PATCH 34/37] Fix init --- src/transformers/__init__.py | 63 ++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 1e942449544cff..a0340e0a188223 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -858,12 +858,29 @@ # PyTorch models structure - _import_structure["models.audio_spectrogram_transformer"].extend( + _import_structure["models.roc_bert"].extend( [ - "AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", - "ASTForSequenceClassification", - "ASTModel", - "ASTPreTrainedModel", + "ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "RoCBertForMaskedLM", + "RoCBertForCausalLM", + "RoCBertForMultipleChoice", + "RoCBertForQuestionAnswering", + "RoCBertForSequenceClassification", + "RoCBertForTokenClassification", + "RoCBertLayer", + "RoCBertModel", + "RoCBertForPreTraining", + "RoCBertPreTrainedModel", + "load_tf_weights_in_roc_bert", + ] + ) + + _import_structure["models.time_series_transformer"].extend( + [ + "TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "TimeSeriesTransformerForPrediction", + "TimeSeriesTransformerModel", + "TimeSeriesTransformerPreTrainedModel", ] ) _import_structure["models.albert"].extend( @@ -880,6 +897,14 @@ "load_tf_weights_in_albert", ] ) + _import_structure["models.audio_spectrogram_transformer"].extend( + [ + "AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "ASTModel", + "ASTPreTrainedModel", + "ASTForSequenceClassification", + ] + ) _import_structure["models.auto"].extend( [ "MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING", @@ -1914,22 +1939,6 @@ "RobertaPreTrainedModel", ] ) - _import_structure["models.roc_bert"].extend( - [ - "ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST", - "RoCBertForMaskedLM", - "RoCBertForCausalLM", - "RoCBertForMultipleChoice", - "RoCBertForQuestionAnswering", - "RoCBertForSequenceClassification", - "RoCBertForTokenClassification", - "RoCBertLayer", - "RoCBertModel", - "RoCBertForPreTraining", - "RoCBertPreTrainedModel", - "load_tf_weights_in_roc_bert", - ] - ) _import_structure["models.lilt"].extend( [ "LILT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -2054,14 +2063,6 @@ "load_tf_weights_in_tapas", ] ) - _import_structure["models.time_series_transformer"].extend( - [ - "TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", - "TimeSeriesTransformerForPrediction", - "TimeSeriesTransformerModel", - "TimeSeriesTransformerPreTrainedModel", - ] - ) _import_structure["models.t5"].extend( [ "T5_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3957,6 +3958,8 @@ top_k_top_p_filtering, ) from .modeling_utils import PreTrainedModel + + # PyTorch model imports from .models.albert import ( ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, AlbertForMaskedLM, @@ -3969,8 +3972,6 @@ AlbertPreTrainedModel, load_tf_weights_in_albert, ) - - # PyTorch model imports from .models.audio_spectrogram_transformer import ( AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, ASTForSequenceClassification, From 5dbf8998340649323d5d83e605f85778fe52d56b Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 21 Nov 2022 11:59:19 +0100 Subject: [PATCH 35/37] Fix conversion script --- ...t_audio_spectrogram_transformer_original_to_pytorch.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py index 48519d68cbc90e..443d1f21f5c03a 100644 --- a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py +++ b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py @@ -38,7 +38,7 @@ def get_audio_spectrogram_transformer_config(model_name): if "10-10" in model_name: pass elif "speech-commands" in model_name: - config.time_dimension = 128 + config.max_length = 128 elif "12-12" in model_name: config.time_stride = 12 config.frequency_stride = 12 @@ -202,7 +202,8 @@ def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_fo # source: https://github.com/YuanGongND/ast/blob/79e873b8a54d0a3b330dd522584ff2b9926cd581/src/run.py#L62 mean = -4.2677393 if "speech-commands" not in model_name else -6.845978 std = 4.5689974 if "speech-commands" not in model_name else 5.5654526 - feature_extractor = ASTFeatureExtractor(mean=mean, std=std) + max_length = 1024 if "speech-commands" not in model_name else 128 + feature_extractor = ASTFeatureExtractor(mean=mean, std=std, max_length=max_length) if "speech-commands" in model_name: dataset = load_dataset("speech_commands", "v0.02", split="validation") @@ -217,8 +218,7 @@ def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_fo waveform, _ = torchaudio.load(filepath) waveform = waveform.squeeze().numpy() - max_length = 1024 if "speech-commands" not in model_name else 128 - inputs = feature_extractor(waveform, sampling_rate=16000, max_length=max_length, return_tensors="pt") + inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt") # forward pass outputs = model(**inputs) From 369ed798b6413ae2d72ebe3fe121f956f75212f7 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 21 Nov 2022 16:18:17 +0100 Subject: [PATCH 36/37] Rename to ForAudioClassification --- .../audio-spectrogram-transformer.mdx | 4 ++-- src/transformers/__init__.py | 4 ++-- .../audio_spectrogram_transformer/__init__.py | 4 ++-- ...trogram_transformer_original_to_pytorch.py | 4 ++-- .../modeling_audio_spectrogram_transformer.py | 2 +- .../audio_spectrogram_transformer/test.py | 21 ------------------- src/transformers/models/auto/modeling_auto.py | 2 +- src/transformers/utils/doc.py | 2 +- src/transformers/utils/dummy_pt_objects.py | 2 +- ..._modeling_audio_spectrogram_transformer.py | 8 +++---- 10 files changed, 15 insertions(+), 38 deletions(-) delete mode 100644 src/transformers/models/audio_spectrogram_transformer/test.py diff --git a/docs/source/en/model_doc/audio-spectrogram-transformer.mdx b/docs/source/en/model_doc/audio-spectrogram-transformer.mdx index ff28e9f30564bb..d6093198fc68eb 100644 --- a/docs/source/en/model_doc/audio-spectrogram-transformer.mdx +++ b/docs/source/en/model_doc/audio-spectrogram-transformer.mdx @@ -54,7 +54,7 @@ The original code can be found [here](https://github.com/YuanGongND/ast). 
[[autodoc]] ASTModel - forward -## ASTForSequenceClassification +## ASTForAudioClassification -[[autodoc]] ASTForSequenceClassification +[[autodoc]] ASTForAudioClassification - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index a0340e0a188223..9c5f33bea535a0 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -902,7 +902,7 @@ "AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", "ASTModel", "ASTPreTrainedModel", - "ASTForSequenceClassification", + "ASTForAudioClassification", ] ) _import_structure["models.auto"].extend( @@ -3974,7 +3974,7 @@ ) from .models.audio_spectrogram_transformer import ( AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, - ASTForSequenceClassification, + ASTForAudioClassification, ASTModel, ASTPreTrainedModel, ) diff --git a/src/transformers/models/audio_spectrogram_transformer/__init__.py b/src/transformers/models/audio_spectrogram_transformer/__init__.py index 008b43aea90f79..37fab5996acbce 100644 --- a/src/transformers/models/audio_spectrogram_transformer/__init__.py +++ b/src/transformers/models/audio_spectrogram_transformer/__init__.py @@ -35,7 +35,7 @@ else: _import_structure["modeling_audio_spectrogram_transformer"] = [ "AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", - "ASTForSequenceClassification", + "ASTForAudioClassification", "ASTModel", "ASTPreTrainedModel", ] @@ -62,7 +62,7 @@ else: from .modeling_audio_spectrogram_transformer import ( AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, - ASTForSequenceClassification, + ASTForAudioClassification, ASTModel, ASTPreTrainedModel, ) diff --git a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py index 443d1f21f5c03a..f339bbc6c2bf53 100644 --- a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py +++ b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py @@ -24,7 +24,7 @@ from datasets import load_dataset from huggingface_hub import hf_hub_download -from transformers import ASTConfig, ASTFeatureExtractor, ASTForSequenceClassification +from transformers import ASTConfig, ASTFeatureExtractor, ASTForAudioClassification from transformers.utils import logging @@ -193,7 +193,7 @@ def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_fo new_state_dict = convert_state_dict(state_dict, config) # load ๐Ÿค— model - model = ASTForSequenceClassification(config) + model = ASTForAudioClassification(config) model.eval() model.load_state_dict(new_state_dict) diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index 9c0483047a7a8d..6daec258b6e7d3 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -543,7 +543,7 @@ def forward(self, hidden_state): """, AUDIO_SPECTROGRAM_TRANSFORMER_START_DOCSTRING, ) -class ASTForSequenceClassification(ASTPreTrainedModel): +class ASTForAudioClassification(ASTPreTrainedModel): def __init__(self, config: ASTConfig) -> None: 
         super().__init__(config)
diff --git a/src/transformers/models/audio_spectrogram_transformer/test.py b/src/transformers/models/audio_spectrogram_transformer/test.py
deleted file mode 100644
index e45fe9aa5ece9e..00000000000000
--- a/src/transformers/models/audio_spectrogram_transformer/test.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import torchaudio
-
-from huggingface_hub import hf_hub_download
-from transformers import ASTFeatureExtractor
-
-
-filepath = hf_hub_download(
-    repo_id="nielsr/audio-spectogram-transformer-checkpoint",
-    filename="sample_audio.flac",
-    repo_type="dataset",
-)
-
-waveform, _ = torchaudio.load(filepath)
-waveform = waveform.squeeze().numpy()
-
-max_length = 24
-feature_extractor = ASTFeatureExtractor(num_mel_bins=16)
-inputs = feature_extractor(waveform, sampling_rate=16000, max_length=max_length, return_tensors="pt")
-
-for k, v in inputs.items():
-    print(k, v.shape)
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index 4133ab750da2ed..7f9d11ce0d5acf 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -785,7 +785,7 @@
 MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
     [
         # Model for Audio Classification mapping
-        ("audio-spectrogram-transformer", "ASTForSequenceClassification"),
+        ("audio-spectrogram-transformer", "ASTForAudioClassification"),
         ("data2vec-audio", "Data2VecAudioForSequenceClassification"),
         ("hubert", "HubertForSequenceClassification"),
         ("sew", "SEWForSequenceClassification"),
diff --git a/src/transformers/utils/doc.py b/src/transformers/utils/doc.py
index a5610a32ba6ba4..360d98a460ab50 100644
--- a/src/transformers/utils/doc.py
+++ b/src/transformers/utils/doc.py
@@ -1087,7 +1087,7 @@ def docstring_decorator(fn):
             expected_loss=expected_loss,
         )
 
-    if "SequenceClassification" in model_class and modality == "audio":
+    if ("SequenceClassification" in model_class or "AudioClassification" in model_class) and modality == "audio":
         code_sample = sample_docstrings["AudioClassification"]
     elif "SequenceClassification" in model_class:
         code_sample = sample_docstrings["SequenceClassification"]
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index ae12190ca4be88..09ee78c849423a 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -353,7 +353,7 @@ def load_tf_weights_in_albert(*args, **kwargs):
 AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
-class ASTForSequenceClassification(metaclass=DummyObject):
+class ASTForAudioClassification(metaclass=DummyObject):
     _backends = ["torch"]
 
     def __init__(self, *args, **kwargs):
diff --git a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py
index 347f1f6e948837..90d748ebea4a85 100644
--- a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py
+++ b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py
@@ -30,7 +30,7 @@
     import torch
     from torch import nn
 
-    from transformers import ASTForSequenceClassification, ASTModel
+    from transformers import ASTForAudioClassification, ASTModel
     from transformers.models.audio_spectrogram_transformer.modeling_audio_spectrogram_transformer import (
         AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
     )
@@ -148,7 +148,7 @@ class 
ASTModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (
         (
             ASTModel,
-            ASTForSequenceClassification,
+            ASTForAudioClassification,
         )
         if is_torch_available()
         else ()
@@ -227,9 +227,7 @@ def default_feature_extractor(self):
     def test_inference_audio_classification(self):
         feature_extractor = self.default_feature_extractor
 
-        model = ASTForSequenceClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593").to(
-            torch_device
-        )
+        model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593").to(torch_device)
 
         feature_extractor = self.default_feature_extractor
         audio, sampling_rate = prepare_audio()

From b8e31b92abe3a65e9bb6ad43e13c995f57296b28 Mon Sep 17 00:00:00 2001
From: Niels Rogge 
Date: Mon, 21 Nov 2022 16:24:03 +0100
Subject: [PATCH 37/37] Fix index table

---
 docs/source/en/index.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx
index b202278890dfe6..790ce8f4d17643 100644
--- a/docs/source/en/index.mdx
+++ b/docs/source/en/index.mdx
@@ -364,4 +364,4 @@ Flax), PyTorch, and/or TensorFlow.
 | YOLOS | ❌ | ❌ | ✅ | ❌ | ❌ |
 | YOSO | ❌ | ❌ | ✅ | ❌ | ❌ |
 
-
+ 
\ No newline at end of file
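
Usage sketch for reviewers, illustrating the renamed ASTForAudioClassification head together with ASTFeatureExtractor end to end. This is an illustrative check, not part of the diff: it assumes the converted checkpoint "MIT/ast-finetuned-audioset-10-10-0.4593" and its preprocessor config are available on the Hub (as in the slow test above), that the speech_commands validation split exposes a 16 kHz "audio" column, and that id2label was populated during conversion.

# Minimal sanity-check sketch under the assumptions stated above.
import torch
from datasets import load_dataset

from transformers import ASTFeatureExtractor, ASTForAudioClassification

checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"  # assumed to exist on the Hub
feature_extractor = ASTFeatureExtractor.from_pretrained(checkpoint)
model = ASTForAudioClassification.from_pretrained(checkpoint)
model.eval()

# Any mono 16 kHz waveform works; here we reuse the speech_commands validation split
# that the conversion script loads (assumed to expose an "audio" column with an array).
dataset = load_dataset("speech_commands", "v0.02", split="validation")
waveform = dataset[0]["audio"]["array"]

# max_length now lives on the feature extractor, so no per-call argument is needed.
inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_id = logits.argmax(-1).item()
print(model.config.id2label[predicted_id])  # assumes id2label was set during conversion

Because max_length is stored on ASTFeatureExtractor after this series, AudioSet-style checkpoints pad or truncate spectrograms to 1024 frames and Speech Commands checkpoints to 128, matching the values set in the conversion script.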