
Demo model sweep (#216)
* Bidaf works on 2.0

* Update fine-grained-ner

* Transformer QA

* bidaf-elmo

* fgner-transformer

* Update Vilbert VQA, but the Python console steps still don't work

* Fixes vilbert-vqa

* This will not work correctly until we make a release

* ner-elmo

* Formatting

* We'll need a release before this makes sense

* Let's see if we can show instructions despite licensing restrictions.

* Slight stylistic alignment

* ELMo warnings

* Metrics for BiDAF

* Metrics for bidaf

* Metrics for fgner_transformer

* Metrics for fine-grained-ner

* Adds URLs to VQA

* Metrics for VQA

* Changed where the metrics are for ner_elmo

* Fixes the transformer_qa eval script

* Metrics for ner_elmo

* Fixes the TransformerQA eval script

* TransformerQA updates

* Adds naqanet URLs

* Fixes training paths for ?NLI

* Update all the version numbers for the models that work now

* Changelog

* Formatting
dirkgr authored Feb 24, 2021
1 parent 1d44f1a commit 1c006c5
Showing 15 changed files with 87 additions and 64 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -18,6 +18,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed

- Made `label` parameter in `TransformerMCReader.text_to_instance` optional with default of `None`.
+ - Updated many of the models for version 2.1.0. Fixed and re-trained many of the models.


## [v2.0.1](https:/allenai/allennlp-models/releases/tag/v2.0.1) - 2021-02-01

@@ -64,7 +64,7 @@
},
"model_usage": {
"archive_file": "mnli-roberta-2020-07-29.tar.gz",
"training_config": "snli_roberta.jsonnet",
"training_config": "pair_classification/mnli_roberta.jsonnet",
"install_instructions": "pip install allennlp==1.0.0 allennlp-models==1.0.0"
}
}
}
@@ -64,7 +64,7 @@
},
"model_usage": {
"archive_file": "snli-roberta-2020-07-29.tar.gz",
"training_config": "snli_roberta.jsonnet",
"training_config": "pair_classification/snli_roberta.jsonnet",
"install_instructions": "pip install allennlp==1.0.0 allennlp-models==1.0.0"
}
}
}
10 changes: 5 additions & 5 deletions allennlp_models/modelcards/rc-bidaf-elmo.json
@@ -53,18 +53,18 @@
"preprocessing": null
},
"quantitative_analyses": {
"unitary_results": null,
"unitary_results": "On the validation set:\nStart accuracy: 66%\nEnd accuracy: 69%\nOverall span accuracy: 57%\nExact match: 71%\nF1: 80%",
"intersectional_results": null
},
"model_caveats_and_recommendations": {
"caveats_and_recommendations": null
"caveats_and_recommendations": "This model is based on ELMo. ELMo is not deterministic, meaning that you will see slight differences every time you run it. Also, ELMo likes to be warmed up, so we recommend processing dummy input before processing real workloads with it."
},
"model_ethical_considerations": {
"ethical_considerations": null
},
"model_usage": {
"archive_file": "bidaf-elmo-model-2020.03.19.tar.gz",
"archive_file": "bidaf-elmo.2021-02-11.tar.gz",
"training_config": "rc/bidaf_elmo.jsonnet",
"install_instructions": "pip install allennlp==1.0.0 allennlp-models==1.0.0"
"install_instructions": "pip install allennlp==2.1.0 allennlp-models==2.1.0"
}
}
}
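An aside on the ELMo caveat introduced above: the warm-up it recommends takes only a few lines. A minimal sketch, assuming the model ID `rc-bidaf-elmo` (inferred from this modelcard's filename) and the `load_predictor` helper that this repository's tests use; the dummy passage and question are placeholders:

```python
# Minimal warm-up sketch for the caveat above. Assumptions: allennlp==2.1.0,
# allennlp-models==2.1.0, and that the model ID matches the modelcard filename.
from allennlp_models.pretrained import load_predictor

predictor = load_predictor("rc-bidaf-elmo")

# ELMo keeps internal LSTM state between calls, so its first outputs can drift
# slightly; run one throwaway prediction before any real workload.
predictor.predict(
    question="What is this passage for?",
    passage="This is a throwaway passage used only to warm up ELMo.",
)

# Subsequent calls are now representative.
result = predictor.predict(question="Who wrote it?", passage="Jane wrote the note.")
print(result["best_span_str"])
```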
8 changes: 4 additions & 4 deletions allennlp_models/modelcards/rc-bidaf.json
@@ -30,7 +30,7 @@
"evaluation_factors": null
},
"metrics": {
"model_performance_measures": "Start, end and overall span accuracy, Exact Match, F1 score",
"model_performance_measures": "Start, end, and overall span accuracy, Exact Match, F1 score",
"decision_thresholds": null,
"variation_approaches": null
},
@@ -53,7 +53,7 @@
"preprocessing": null
},
"quantitative_analyses": {
"unitary_results": null,
"unitary_results": "On the validation set:\nStart accuracy: 61%\nEnd accuracy: 66%\nOverall span accuracy: 52%\nExact match: 66%\nF1: 76%",
"intersectional_results": null
},
"model_caveats_and_recommendations": {
@@ -65,6 +65,6 @@
"model_usage": {
"archive_file": "bidaf-model-2020.03.19.tar.gz",
"training_config": "rc/bidaf.jsonnet",
"install_instructions": "pip install allennlp==1.0.0 allennlp-models==1.0.0"
"install_instructions": "pip install allennlp==2.1.0 allennlp-models==2.1.0"
}
}
}
4 changes: 2 additions & 2 deletions allennlp_models/modelcards/rc-naqanet.json
@@ -38,7 +38,7 @@
"dataset": {
"name": "DROP",
"url": "https://allennlp.org/drop",
"notes": "Please download the data from the url provided."
"processed_url": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip!drop_dataset/drop_dataset_dev.json"
},
"motivation": null,
"preprocessing": null
@@ -47,7 +47,7 @@
"dataset": {
"name": "DROP",
"url": "https://allennlp.org/drop",
"notes": "Please download the data from the url provided."
"processed_url": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip!drop_dataset/drop_dataset_train.json"
},
"motivation": null,
"preprocessing": null
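One detail of the processed URLs above deserves a note: the `!` is AllenNLP's archive-member syntax, addressing a single file inside a downloaded zip. A sketch of resolving such a path by hand, under the assumption that `cached_path` in the pinned allennlp version supports this syntax (its docstring is the authority here):

```python
# Sketch: resolving an archive-member URL like the processed_url above.
# Assumption: cached_path supports the "archive.zip!member/path" syntax in
# the pinned allennlp version.
from allennlp.common.file_utils import cached_path

url = (
    "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/"
    "drop_dataset.zip!drop_dataset/drop_dataset_dev.json"
)

# Downloads and caches the zip once, extracts it, and returns the local path
# of the member file named after the "!".
local_path = cached_path(url)
print(local_path)
```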
18 changes: 9 additions & 9 deletions allennlp_models/modelcards/rc-transformer-qa.json
@@ -30,30 +30,30 @@
"evaluation_factors": null
},
"metrics": {
"model_performance_measures": "F1-score, Span Accuracy, Exact Match. Note that the metrics that the model produces are calculated on a per-instance basis only. Since there could be more than one instance per question, these metrics are not the official numbers on the SQuAD task. To get official numbers, run the evaluation script v2.0 at https://rajpurkar.github.io/SQuAD-explorer/",
"model_performance_measures": "F1-score, Span Accuracy, Exact Match",
"decision_thresholds": null,
"variation_approaches": null
},
"evaluation_data": {
"dataset": {
"name": "SQuAD dev set",
"url": "https://rajpurkar.github.io/SQuAD-explorer/explore/1.1/dev/",
"processed_url": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/squad/squad-dev-v1.1.json"
"url": "https://rajpurkar.github.io/SQuAD-explorer/explore/2.0/dev/",
"processed_url": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/squad/squad-dev-v2.0.json"
},
"motivation": null,
"preprocessing": null
},
"training_data": {
"dataset": {
"name": "SQuAD training set",
"url": "https://rajpurkar.github.io/SQuAD-explorer/explore/1.1/dev/",
"processed_url": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/squad/squad-train-v1.1.json"
"url": "https://rajpurkar.github.io/SQuAD-explorer/explore/2.0/dev/",
"processed_url": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/squad/squad-train-v2.0.json"
},
"motivation": "For the pretrained RoBERTa model, document-level corpora were used rather than a shuffled sentence-level corpus such as the Billion Word Benchmark (Chelba et al., 2013) in order to extract long contiguous sequences",
"preprocessing": "For the pretrained RoBERTa model, only the text passages were extracted from English Wikipedia; lists, tables, and headers were ignored."
},
"quantitative_analyses": {
"unitary_results": null,
"unitary_results": "On the validation set:\nF1: 88%\nExact match: 84%\nThese are metrics using the official evaluation. Note that the metrics that the model produces while training are calculated on a per-instance basis only. Since there could be more than one instance per question, these metrics are not the official numbers on the SQuAD task. To get official numbers, run the evaluation script at allennlp_models/rc/tools/transformer_qa_eval.py.",
"intersectional_results": null
},
"model_caveats_and_recommendations": {
Expand All @@ -63,8 +63,8 @@
"ethical_considerations": null
},
"model_usage": {
"archive_file": "transformer-qa-2020-10-03.tar.gz",
"archive_file": "transformer-qa.2021-02-11.tar.gz",
"training_config": "rc/transformer_qa.jsonnet",
"install_instructions": "pip install allennlp==1.0.0 allennlp-models==1.0.0"
"install_instructions": "pip install allennlp==2.1.0 allennlp-models==2.1.0"
}
}
}
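As the unitary results above say, official SQuAD 2.0 numbers come from the standalone eval script, not from the per-instance training metrics. A hedged sketch of driving that script from Python: the flag names are guesses derived from the `args.qa_model`, `args.input_file`, and `args.cuda_device` attributes visible in the script's diff further down, and the model URL assumes the public models bucket, so verify both against the script's `--help` before use:

```python
# Sketch only: flag names and the archive URL are assumptions (see lead-in).
# The archive filename and the dataset URL come from the model card above.
import subprocess

subprocess.run(
    [
        "python", "-m", "allennlp_models.rc.tools.transformer_qa_eval",
        "--qa-model",
        "https://storage.googleapis.com/allennlp-public-models/transformer-qa.2021-02-11.tar.gz",
        "--input-file",
        "https://s3-us-west-2.amazonaws.com/allennlp/datasets/squad/squad-dev-v2.0.json",
        "--cuda-device", "0",
    ],
    check=True,
)
```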
16 changes: 9 additions & 7 deletions allennlp_models/modelcards/tagging-elmo-crf-tagger.json
@@ -38,7 +38,8 @@
"dataset": {
"name": "CoNLL-2003 NER dataset",
"url": "https://www.clips.uantwerpen.be/conll2003/ner/",
"notes": "The NER model was evaluated on the CoNLL-2003 NER dataset. Unfortunately we cannot release this data due to licensing restrictions."
"notes": "The NER model was evaluated on the CoNLL-2003 NER dataset. Unfortunately we cannot release this data due to licensing restrictions.",
"processed_url": "path/to/dataset"
},
"motivation": null,
"preprocessing": null
@@ -47,24 +48,25 @@
"dataset": {
"name": "CoNLL-2003 NER dataset",
"url": "https://www.clips.uantwerpen.be/conll2003/ner/",
"notes": "The NER model was trained on the CoNLL-2003 NER dataset. Unfortunately we cannot release this data due to licensing restrictions."
"notes": "The NER model was trained on the CoNLL-2003 NER dataset. Unfortunately we cannot release this data due to licensing restrictions.",
"processed_url": "/path/to/dataset"
},
"motivation": null,
"preprocessing": null
},
"quantitative_analyses": {
"unitary_results": null,
"unitary_results": "Achieves 99% accuracy and 96% F1 on the CoNLL-2003 validation set.",
"intersectional_results": null
},
"model_caveats_and_recommendations": {
"caveats_and_recommendations": null
"caveats_and_recommendations": "This model is based on ELMo. ELMo is not deterministic, meaning that you will see slight differences every time you run it. Also, ELMo likes to be warmed up, so we recommend processing dummy input before processing real workloads with it."
},
"model_ethical_considerations": {
"ethical_considerations": null
},
"model_usage": {
"archive_file": "ner-model-2020.02.10.tar.gz",
"archive_file": "ner-elmo.2021-02-12.tar.gz",
"training_config": "tagging/ner_elmo.jsonnet",
"install_instructions": "pip install allennlp==1.0.0 allennlp-models==1.0.0"
"install_instructions": "pip install allennlp==2.1.0 allennlp-models==2.1.0"
}
}
}
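For completeness, a short usage sketch for this tagger. Assumptions: the model ID `tagging-elmo-crf-tagger` is inferred from the modelcard filename, and the output keys `words` and `tags` follow AllenNLP's sentence-tagger conventions; the ELMo warm-up shown earlier applies here as well:

```python
# Usage sketch; model ID and output keys are assumptions (see lead-in).
from allennlp_models.pretrained import load_predictor

predictor = load_predictor("tagging-elmo-crf-tagger")

# Warm ELMo up first, as this model card's caveat recommends.
predictor.predict(sentence="A dummy sentence to warm up ELMo.")

result = predictor.predict(sentence="AllenNLP is built by the Allen Institute for AI.")
# Pair each token with its predicted BIO tag.
for word, tag in zip(result["words"], result["tags"]):
    print(word, tag)
```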
14 changes: 8 additions & 6 deletions allennlp_models/modelcards/tagging-fine-grained-crf-tagger.json
@@ -38,7 +38,8 @@
"dataset": {
"name": "Ontonotes 5.0",
"url": "https://catalog.ldc.upenn.edu/LDC2013T19",
"notes": "We cannot release this data due to licensing restrictions."
"notes": "Unfortunately we cannot release this data due to licensing restrictions.",
"processed_url": "/path/do/dataset"
},
"motivation": null,
"preprocessing": null
@@ -47,13 +48,14 @@
"dataset": {
"name": "Ontonotes 5.0",
"url": "https://catalog.ldc.upenn.edu/LDC2013T19",
"notes": "We cannot release this data due to licensing restrictions."
"notes": "Unfortunately we cannot release this data due to licensing restrictions.",
"processed_url": "/path/do/dataset"
},
"motivation": null,
"preprocessing": null
},
"quantitative_analyses": {
"unitary_results": null,
"unitary_results": "On the validation set:\nAccuracy: 97%\nF1: 88%",
"intersectional_results": null
},
"model_caveats_and_recommendations": {
@@ -63,8 +65,8 @@
"ethical_considerations": null
},
"model_usage": {
"archive_file": "fine-grained-ner.2020-06-24.tar.gz",
"archive_file": "fine-grained-ner.2021-02-11.tar.gz",
"training_config": "tagging/fine-grained-ner.jsonnet",
"install_instructions": "pip install allennlp==1.0.0 allennlp-models==1.0.0"
"install_instructions": "pip install allennlp==2.1.0 allennlp-models==2.1.0"
}
}
}
@@ -34,7 +34,7 @@
"dataset": {
"name": "Ontonotes 5.0",
"url": "https://catalog.ldc.upenn.edu/LDC2013T19",
"notes": "We cannot release this data due to licensing restrictions."
"notes": "Unfortunately we cannot release this data due to licensing restrictions."
},
"motivation": null,
"preprocessing": null
@@ -43,13 +43,13 @@
"dataset": {
"name": "Ontonotes 5.0",
"url": "https://catalog.ldc.upenn.edu/LDC2013T19",
"notes": "We cannot release this data due to licensing restrictions."
"notes": "Unfortunately we cannot release this data due to licensing restrictions."
},
"motivation": null,
"preprocessing": null
},
"quantitative_analyses": {
"unitary_results": null,
"unitary_results": "On the validation set:\nAccuracy: 98%\nF1: 88%",
"intersectional_results": null
},
"model_caveats_and_recommendations": {
@@ -59,8 +59,8 @@
"ethical_considerations": null
},
"model_usage": {
"archive_file": "fgner_transformer.2020-07-14.tar.gz",
"archive_file": "fgner-transformer.2021-02-11.tar.gz",
"training_config": "tagging/fgner_transformer.jsonnet",
"install_instructions": "pip install allennlp==1.0.0 allennlp-models==1.0.0"
"install_instructions": "pip install allennlp==2.1.0 allennlp-models==2.1.0"
}
}
}
17 changes: 10 additions & 7 deletions allennlp_models/modelcards/vqa-vilbert.json
@@ -38,21 +38,24 @@
"dataset": {
"name": "VQA dataset",
"url": "https://visualqa.org/",
"notes": "Evaluation requires a large amount of images to be accessible locally, so we cannot provide a command you can easily copy and paste."
"notes": "Evaluation requires a large amount of images to be accessible locally, so we cannot provide a command you can easily copy and paste. The first time you run it, you will get an error message that tells you how to get the rest of the data.",
"processed_url": "balanced_real_val"
},
"motivation": null,
"preprocessing": null
},
"training_data": {
"dataset": {
"name": "VQA dataset",
"url": "https://visualqa.org/"
"url": "https://visualqa.org/",
"notes": "Training requires a large amount of images to be accessible locally, so we cannot provide a command you can easily copy and paste. The first time you run it, you will get an error message that tells you how to get the rest of the data.",
"processed_url": "balanced_real_train"
},
"motivation": null,
"preprocessing": null
},
"quantitative_analyses": {
"unitary_results": null,
"unitary_results": "On the validation set:\nF1: 41%\nVQA: 52%.\nThese scores do not match the performance in the VilBERT paper. Please contact us if you want to match those scores!",
"intersectional_results": null
},
"model_ethical_considerations": {
@@ -62,8 +65,8 @@
"caveats_and_recommendations": null
},
"model_usage": {
"archive_file": "vilbert-vqa-2020.10.01.tar.gz",
"training_config": "vision/vilbert_vqa_from_huggingface.jsonnet",
"install_instructions": "pip install allennlp==1.0.0 allennlp-models==1.0.0"
"archive_file": "vilbert-vqa-pretrained.2021-02-11.tar.gz",
"training_config": "vision/vilbert_vqa_pretrained.jsonnet",
"install_instructions": "pip install allennlp==2.1.0 allennlp-models==2.1.0"
}
}
}
11 changes: 8 additions & 3 deletions allennlp_models/rc/tools/transformer_qa_eval.py
@@ -14,7 +14,7 @@
 logger = logging.getLogger(__name__)
 
 if __name__ == "__main__":
-    import allennlp_models.rc.transformer_qa  # noqa F401: Needed to register the registrables.
+    import allennlp_models.rc  # noqa F401: Needed to register the registrables.
     import argparse
 
     logging.basicConfig(level=logging.INFO)
@@ -35,7 +35,6 @@
         args.qa_model, predictor_name="transformer_qa", cuda_device=args.cuda_device
     )
     instances = predictor._dataset_reader.read(args.input_file)
-    logger.info("Running on %d instances", len(instances))
 
     # We have to make sure we put instances with the same qid all into the same batch.
     def batch_instances_by_qid(instances: Iterable[Instance]) -> Iterable[List[Instance]]:
@@ -74,10 +73,16 @@ def make_batches(
     metric = SquadEmAndF1()
     answers = {}
     for batch in make_batches(tqdm(instances, desc="Evaluating instances")):
+        gold_answers = {
+            instance["metadata"]["id"]: instance["metadata"]["answers"] for instance in batch
+        }
         for result in predictor.predict_batch_instance(batch):
             assert result["id"] not in ids_seen
             ids_seen.add(result["id"])
-            metric(result["best_span_str"], result["answers"])
+            gold_answer = gold_answers[result["id"]]
+            if len(gold_answer) == 0:
+                gold_answer = [""]  # no-answer case
+            metric(result["best_span_str"], gold_answer)
             answers[result["id"]] = result["best_span_str"]
             if time.monotonic() - last_logged_scores_time > 30:
                 exact_match, f1_score = metric.get_metric()
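The body of `batch_instances_by_qid` is collapsed out of the hunk above. The comment in the script explains why it exists: a long passage produces several instances for one question, and their predictions only combine correctly when they land in the same batch. A hypothetical reconstruction of the grouping, assuming the dataset reader emits instances for the same question consecutively (the real body may differ):

```python
# Hypothetical reconstruction of the collapsed helper; not the committed code.
from itertools import groupby
from typing import Iterable, List

from allennlp.data import Instance


def batch_instances_by_qid(instances: Iterable[Instance]) -> Iterable[List[Instance]]:
    # instance["metadata"]["id"] is the question ID, the same key the eval
    # loop above uses to look up gold answers. groupby is enough because
    # instances for one qid arrive adjacently in reading order.
    for _, group in groupby(instances, key=lambda ins: ins["metadata"]["id"]):
        yield list(group)
```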
4 changes: 2 additions & 2 deletions allennlp_models/vision/predictors/vilbert_vqa.py
@@ -11,9 +11,9 @@
 
 @Predictor.register("vilbert_vqa")
 class VilbertVqaPredictor(Predictor):
-    def predict(self, image: str, sentence: str) -> JsonDict:
+    def predict(self, image: str, question: str) -> JsonDict:
         image = cached_path(image)
-        return self.predict_json({"question": sentence, "image": image})
+        return self.predict_json({"question": question, "image": image})
 
     @overrides
     def _json_to_instance(self, json_dict: JsonDict) -> Instance:
10 changes: 10 additions & 0 deletions tests/pretrained_test.py
@@ -459,6 +459,16 @@ def test_transformer_qa(self):
         result = predictor.predict(question, passage)
         assert result["best_span_str"] == ""
 
+    def test_vilbert_vqa(self):
+        predictor = load_predictor("vqa-vilbert")
+
+        result = predictor.predict(
+            question="What game are they playing?",
+            image="https://storage.googleapis.com/allennlp-public-data/vqav2/baseball.jpg",
+        )
+        max_answer = max((prob, answer) for answer, prob in result["tokens"].items())[1]
+        assert max_answer == "baseball"
+
     @pytest.mark.parametrize("model_id, model_card", get_pretrained_models().items())
     def test_pretrained_models(self, model_id, model_card):
         assert model_card.model_usage.archive_file is not None