[WIP] Add FLAVA model

This PR aims to add the [FLAVA](https://arxiv.org/abs/2112.04482) model to the transformers repo. The following checklist delineates the list of things to be done for this PR to be complete: [x] Flava init [x] Flava base models [x] Flava layers [x] Flava Configs [x] Flava encoders [x] Flava pretraining models [ ] Flava classification/retrieval models (in progress) [x] Documentation updates (in progress) [x] Imports updates (in progress) [x] Argstring updates [x] Flava pretrained checkpoints (in progress) [ ] Flava tests [x] Flava processors (in progress) [x] Sanity check [x] Lint
huggingface · Apr 14, 2022 · eb4e9d5 · eb4e9d5
1 parent d55fcbc
commit eb4e9d5
Show file tree

Hide file tree

Showing 19 changed files with 5,009 additions and 0 deletions.
diff --git a/docs/source/en/model_doc/flava.mdx b/docs/source/en/model_doc/flava.mdx
@@ -0,0 +1,93 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# FLAVA
+
+## Overview
+
+The FLAVA model was proposed in [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482)
+by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela, and was accepted at CVPR 2022.
+
+The paper aims at creating a single unified foundation model that works across vision, language,
+and vision-and-language multimodal tasks.
+
+The abstract from the paper is the following:
+
+State-of-the-art vision and vision-and-language models rely on large-scale visio-linguistic pretraining for obtaining good performance on a variety 
+of downstream tasks. Generally, such models are often either cross-modal (contrastive) or multi-modal 
+(with earlier fusion) but not both; and they often only target specific modalities or tasks. A promising 
+direction would be to use a single holistic universal model, as a "foundation", that targets all modalities 
+at once -- a true vision and language foundation model should be good at vision tasks, language tasks, and 
+cross- and multi-modal vision and language tasks. We introduce FLAVA as such a model and demonstrate 
+impressive performance on a wide range of 35 tasks spanning these target modalities.
+
+
+<!-- Tips: -->
+
+This model was contributed by [aps](https://huggingface.co/aps).
+<!-- The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).-->
+
+
+## FLAVAConfig
+
+[[autodoc]] FLAVAConfig
+ - from_configs
+
+## FLAVATextConfig
+
+[[autodoc]] FLAVATextConfig
+
+## FLAVAImageConfig
+
+[[autodoc]] FLAVAImageConfig
+
+## FLAVAMultimodalConfig
+
+[[autodoc]] FLAVAMultimodalConfig
+
+## FLAVACodebookConfig
+
+[[autodoc]] FLAVACodebookConfig
+
+## FLAVAForPretraining
+
+[[autodoc]] FLAVAForPretraining
+ - forward
+
+## FLAVAModel
+
+[[autodoc]] FLAVAModel
+ - forward
+ - get_text_features
+ - get_image_features
+
+## FLAVACodebook
+
+[[autodoc]] FLAVACodebook
+ - forward
+ - get_codebook_indices
+ - get_codebook_probs
+
+## FLAVATextModel
+
+[[autodoc]] FLAVATextModel
+ - forward
+
+## FLAVAImageModel
+
+[[autodoc]] FLAVAImageModel
+ - forward
+
+## FLAVAMultimodalModel
+
+[[autodoc]] FLAVAMultimodalModel
+ - forward
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
@@ -191,6 +191,17 @@
  "models.electra": ["ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP", "ElectraConfig", "ElectraTokenizer"],
  "models.encoder_decoder": ["EncoderDecoderConfig"],
  "models.flaubert": ["FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FlaubertConfig", "FlaubertTokenizer"],
+ "models.flava": [
+ "FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP",
+ "FLAVACodebookConfig",
+ "FLAVACodebookFeatureExtractor",
+ "FLAVAConfig",
+ "FLAVAFeatureExtractor",
+ "FLAVAImageConfig",
+ "FLAVAMultimodalConfig",
+ "FLAVAProcessor",
+ "FLAVATextConfig",
+ ],
  "models.fnet": ["FNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "FNetConfig", "FNetTokenizer"],
  "models.fsmt": ["FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FSMTConfig", "FSMTTokenizer"],
  "models.funnel": ["FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP", "FunnelConfig", "FunnelTokenizer"],
@@ -986,6 +997,19 @@
  "FlaubertWithLMHeadModel",
  ]
  )
+ # Keep in sync with the `modeling_flava` entries exported by models/flava/__init__.py.
+ _import_structure["models.flava"].extend(
+ [
+ "FLAVA_PRETRAINED_MODEL_ARCHIVE_LIST",
+ "FLAVACodebook",
+ "FLAVAForPretraining",
+ "FLAVAImageModel",
+ "FLAVAModel",
+ "FLAVAMultimodalModel",
+ "FLAVAPreTrainedModel",
+ "FLAVATextModel",
+ ]
+ )
  _import_structure["models.fnet"].extend(
  [
  "FNET_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -2565,6 +2589,17 @@
  from .models.electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig, ElectraTokenizer
  from .models.encoder_decoder import EncoderDecoderConfig
  from .models.flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig, FlaubertTokenizer
+ from .models.flava import (
+ FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+ FLAVACodebookConfig,
+ FLAVACodebookFeatureExtractor,
+ FLAVAConfig,
+ FLAVAFeatureExtractor,
+ FLAVAImageConfig,
+ FLAVAMultimodalConfig,
+ FLAVAProcessor,
+ FLAVATextConfig,
+ )
  from .models.fnet import FNET_PRETRAINED_CONFIG_ARCHIVE_MAP, FNetConfig, FNetTokenizer
  from .models.fsmt import FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP, FSMTConfig, FSMTTokenizer
  from .models.funnel import FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP, FunnelConfig, FunnelTokenizer
@@ -3238,6 +3273,14 @@
  FlaubertModel,
  FlaubertWithLMHeadModel,
  )
+ # Keep in sync with the `_import_structure["models.flava"].extend(...)` list.
+ from .models.flava import (
+ FLAVA_PRETRAINED_MODEL_ARCHIVE_LIST,
+ FLAVACodebook,
+ FLAVAForPretraining,
+ FLAVAImageModel,
+ FLAVAModel,
+ FLAVAMultimodalModel,
+ FLAVAPreTrainedModel,
+ FLAVATextModel,
+ )
  from .models.fnet import (
  FNET_PRETRAINED_MODEL_ARCHIVE_LIST,
  FNetForMaskedLM,

diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
@@ -54,6 +54,7 @@
  electra,
  encoder_decoder,
  flaubert,
+ flava,
  fnet,
  fsmt,
  funnel,

diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
@@ -62,6 +62,7 @@
  ("canine", "CanineConfig"),
  ("roformer", "RoFormerConfig"),
  ("clip", "CLIPConfig"),
+ ("flava", "FLAVAConfig"),
  ("bigbird_pegasus", "BigBirdPegasusConfig"),
  ("deit", "DeiTConfig"),
  ("luke", "LukeConfig"),
@@ -164,6 +165,7 @@
  ("canine", "CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP"),
  ("roformer", "ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
  ("clip", "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"),
+ ("flava", "FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP"),
  ("bigbird_pegasus", "BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP"),
  ("deit", "DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
  ("luke", "LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -258,6 +260,7 @@
  ("canine", "Canine"),
  ("roformer", "RoFormer"),
  ("clip", "CLIP"),
+ ("flava", "FLAVA"),
  ("bigbird_pegasus", "BigBirdPegasus"),
  ("deit", "DeiT"),
  ("luke", "LUKE"),

diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py
@@ -47,6 +47,7 @@
  ("detr", "DetrFeatureExtractor"),
  ("layoutlmv2", "LayoutLMv2FeatureExtractor"),
  ("clip", "CLIPFeatureExtractor"),
+ ("flava", "FLAVAFeatureExtractor"),
  ("perceiver", "PerceiverFeatureExtractor"),
  ("swin", "ViTFeatureExtractor"),
  ("vit_mae", "ViTFeatureExtractor"),

diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
@@ -59,6 +59,7 @@
  ("canine", "CanineModel"),
  ("roformer", "RoFormerModel"),
  ("clip", "CLIPModel"),
+ ("flava", "FLAVAModel"),
  ("bigbird_pegasus", "BigBirdPegasusModel"),
  ("deit", "DeiTModel"),
  ("luke", "LukeModel"),

diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
@@ -38,6 +38,7 @@
 PROCESSOR_MAPPING_NAMES = OrderedDict(
  [
  ("clip", "CLIPProcessor"),
+ ("flava", "FLAVAProcessor"),
  ("layoutlmv2", "LayoutLMv2Processor"),
  ("layoutxlm", "LayoutXLMProcessor"),
  ("speech_to_text", "Speech2TextProcessor"),

diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
@@ -218,6 +218,13 @@
  "CLIPTokenizerFast" if is_tokenizers_available() else None,
  ),
  ),
+ # (
+ # "flava",
+ # (
+ # "CLIPTokenizer",
+ # "CLIPTokenizerFast" if is_tokenizers_available() else None,
+ # ),
+ # ),
  ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)),
  (
  "perceiver",

diff --git a/src/transformers/models/flava/__init__.py b/src/transformers/models/flava/__init__.py
@@ -0,0 +1,77 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule, is_torch_available, is_vision_available
+
+
+# Maps each submodule of this package to the public names it provides. The mapping is handed to
+# `_LazyModule` at the bottom of the file.
+_import_structure = {
+    "configuration_flava": [
+        "FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "FLAVACodebookConfig",
+        "FLAVAConfig",
+        "FLAVAImageConfig",
+        "FLAVAMultimodalConfig",
+        "FLAVATextConfig",
+    ],
+}
+
+# The feature extractors and the processor are only registered when `is_vision_available()` is true.
+if is_vision_available():
+    _import_structure["feature_extraction_flava"] = ["FLAVACodebookFeatureExtractor", "FLAVAFeatureExtractor"]
+    _import_structure["processing_flava"] = ["FLAVAProcessor"]
+
+# The modeling classes are only registered when `is_torch_available()` is true.
+if is_torch_available():
+    _import_structure["modeling_flava"] = [
+        "FLAVA_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "FLAVACodebook",
+        "FLAVAForPretraining",
+        "FLAVAImageModel",
+        "FLAVAModel",
+        "FLAVAMultimodalModel",
+        "FLAVAPreTrainedModel",
+        "FLAVATextModel",
+    ]
+
+# Static type checkers see real imports. Each import below must list exactly the names registered
+# for the same submodule in `_import_structure`.
+if TYPE_CHECKING:
+    from .configuration_flava import (
+        FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        FLAVACodebookConfig,
+        FLAVAConfig,
+        FLAVAImageConfig,
+        FLAVAMultimodalConfig,
+        FLAVATextConfig,
+    )
+
+    if is_vision_available():
+        from .feature_extraction_flava import FLAVACodebookFeatureExtractor, FLAVAFeatureExtractor
+        from .processing_flava import FLAVAProcessor
+
+    if is_torch_available():
+        from .modeling_flava import (
+            FLAVA_PRETRAINED_MODEL_ARCHIVE_LIST,
+            FLAVACodebook,
+            FLAVAForPretraining,
+            FLAVAImageModel,
+            FLAVAModel,
+            FLAVAMultimodalModel,
+            FLAVAPreTrainedModel,
+            FLAVATextModel,
+        )
+
+else:
+    import sys
+
+    # At runtime, replace this module in `sys.modules` with a `_LazyModule` built from `_import_structure`.
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)