
Demo model sweep (#216)
* Bidaf works on 2.0

* Update fine-grained-ner

* Transformer QA

* bidaf-elmo

* fgner-transformer

* Update Vilbert VQA, but the Python console steps still don't work

* Fixes vilbert-vqa

* This will not work correctly until we make a release

* ner-elmo

* Formatting

* We'll need a release before this makes sense

* Let's see if we can show instructions despite licensing restrictions.

* Slight stylistic alignment

* ELMo warnings

* Metrics for BiDAF

* Metrics for bidaf

* Metrics for fgner_transformer

* Metrics for fine-grained-ner

* Adds URLs to VQA

* Metrics for VQA

* Changed where the metrics are for ner_elmo

* Fixes the transformer_qa eval script

* Metrics for ner_elmo

* Fixes the TransformerQA eval script

* TransformerQA updates

* Adds naqanet URLs

* Fixes training paths for ?NLI

* Update all the version numbers for the models that work now

* Changelog

* Formatting
dirkgr authored Feb 24, 2021
1 parent 1d44f1a commit 1c006c5
Showing 15 changed files with 87 additions and 64 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -18,6 +18,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed

- Made `label` parameter in `TransformerMCReader.text_to_instance` optional with default of `None`.
+ - Updated many of the models for version 2.1.0. Fixed and re-trained many of the models.


## [v2.0.1](https:/allenai/allennlp-models/releases/tag/v2.0.1) - 2021-02-01

@@ -64,7 +64,7 @@
},
"model_usage": {
"archive_file": "mnli-roberta-2020-07-29.tar.gz",
"training_config": "snli_roberta.jsonnet",
"training_config": "pair_classification/mnli_roberta.jsonnet",
"install_instructions": "pip install allennlp==1.0.0 allennlp-models==1.0.0"
}
}
}
@@ -64,7 +64,7 @@
},
"model_usage": {
"archive_file": "snli-roberta-2020-07-29.tar.gz",
"training_config": "snli_roberta.jsonnet",
"training_config": "pair_classification/snli_roberta.jsonnet",
"install_instructions": "pip install allennlp==1.0.0 allennlp-models==1.0.0"
}
}
}
10 changes: 5 additions & 5 deletions allennlp_models/modelcards/rc-bidaf-elmo.json
@@ -53,18 +53,18 @@
"preprocessing": null
},
"quantitative_analyses": {
"unitary_results": null,
"unitary_results": "On the validation set:\nStart accuracy: 66%\nEnd accuracy: 69%\nOverall span accuracy: 57%\nExact match: 71%\nF1: 80%",
"intersectional_results": null
},
"model_caveats_and_recommendations": {
"caveats_and_recommendations": null
"caveats_and_recommendations": "This model is based on ELMo. ELMo is not deterministic, meaning that you will see slight differences every time you run it. Also, ELMo likes to be warmed up, so we recommend processing dummy input before processing real workloads with it."
},
"model_ethical_considerations": {
"ethical_considerations": null
},
"model_usage": {
"archive_file": "bidaf-elmo-model-2020.03.19.tar.gz",
"archive_file": "bidaf-elmo.2021-02-11.tar.gz",
"training_config": "rc/bidaf_elmo.jsonnet",
"install_instructions": "pip install allennlp==1.0.0 allennlp-models==1.0.0"
"install_instructions": "pip install allennlp==2.1.0 allennlp-models==2.1.0"
}
}
}
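An aside on the ELMo caveat introduced above: the warm-up it recommends takes only a few lines. A minimal sketch, assuming the model ID `rc-bidaf-elmo` (inferred from this modelcard's filename) and the `load_predictor` helper that this repository's tests use; the dummy passage and question are placeholders:

```python
# Minimal warm-up sketch for the caveat above. Assumptions: allennlp==2.1.0,
# allennlp-models==2.1.0, and that the model ID matches the modelcard filename.
from allennlp_models.pretrained import load_predictor

predictor = load_predictor("rc-bidaf-elmo")

# ELMo keeps internal LSTM state between calls, so its first outputs can drift
# slightly; run one throwaway prediction before any real workload.
predictor.predict(
    question="What is this passage for?",
    passage="This is a throwaway passage used only to warm up ELMo.",
)

# Subsequent calls are now representative.
result = predictor.predict(question="Who wrote it?", passage="Jane wrote the note.")
print(result["best_span_str"])
```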
8 changes: 4 additions & 4 deletions allennlp_models/modelcards/rc-bidaf.json
@@ -30,7 +30,7 @@
"evaluation_factors": null
},
"metrics": {
"model_performance_measures": "Start, end and overall span accuracy, Exact Match, F1 score",
"model_performance_measures": "Start, end, and overall span accuracy, Exact Match, F1 score",
"decision_thresholds": null,
"variation_approaches": null
},
@@ -53,7 +53,7 @@
"preprocessing": null
},
"quantitative_analyses": {
"unitary_results": null,
"unitary_results": "On the validation set:\nStart accuracy: 61%\nEnd accuracy: 66%\nOverall span accuracy: 52%\nExact match: 66%\nF1: 76%",
"intersectional_results": null
},
"model_caveats_and_recommendations": {
@@ -65,6 +65,6 @@
"model_usage": {
"archive_file": "bidaf-model-2020.03.19.tar.gz",
"training_config": "rc/bidaf.jsonnet",
"install_instructions": "pip install allennlp==1.0.0 allennlp-models==1.0.0"
"install_instructions": "pip install allennlp==2.1.0 allennlp-models==2.1.0"
}
}
}
4 changes: 2 additions & 2 deletions allennlp_models/modelcards/rc-naqanet.json
@@ -38,7 +38,7 @@
"dataset": {
"name": "DROP",
"url": "https://allennlp.org/drop",
"notes": "Please download the data from the url provided."
"processed_url": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip!drop_dataset/drop_dataset_dev.json"
},
"motivation": null,
"preprocessing": null
@@ -47,7 +47,7 @@
"dataset": {
"name": "DROP",
"url": "https://allennlp.org/drop",
"notes": "Please download the data from the url provided."
"processed_url": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip!drop_dataset/drop_dataset_train.json"
},
"motivation": null,
"preprocessing": null
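One detail of the processed URLs above deserves a note: the `!` is AllenNLP's archive-member syntax, addressing a single file inside a downloaded zip. A sketch of resolving such a path by hand, under the assumption that `cached_path` in the pinned allennlp version supports this syntax (its docstring is the authority here):

```python
# Sketch: resolving an archive-member URL like the processed_url above.
# Assumption: cached_path supports the "archive.zip!member/path" syntax in
# the pinned allennlp version.
from allennlp.common.file_utils import cached_path

url = (
    "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/"
    "drop_dataset.zip!drop_dataset/drop_dataset_dev.json"
)

# Downloads and caches the zip once, extracts it, and returns the local path
# of the member file named after the "!".
local_path = cached_path(url)
print(local_path)
```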
18 changes: 9 additions & 9 deletions allennlp_models/modelcards/rc-transformer-qa.json
@@ -30,30 +30,30 @@
"evaluation_factors": null
},
"metrics": {
"model_performance_measures": "F1-score, Span Accuracy, Exact Match. Note that the metrics that the model produces are calculated on a per-instance basis only. Since there could be more than one instance per question, these metrics are not the official numbers on the SQuAD task. To get official numbers, run the evaluation script v2.0 at https://rajpurkar.github.io/SQuAD-explorer/",
"model_performance_measures": "F1-score, Span Accuracy, Exact Match",
"decision_thresholds": null,
"variation_approaches": null
},
"evaluation_data": {
"dataset": {
"name": "SQuAD dev set",
"url": "https://rajpurkar.github.io/SQuAD-explorer/explore/1.1/dev/",
"processed_url": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/squad/squad-dev-v1.1.json"
"url": "https://rajpurkar.github.io/SQuAD-explorer/explore/2.0/dev/",
"processed_url": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/squad/squad-dev-v2.0.json"
},
"motivation": null,
"preprocessing": null
},
"training_data": {
"dataset": {
"name": "SQuAD training set",
"url": "https://rajpurkar.github.io/SQuAD-explorer/explore/1.1/dev/",
"processed_url": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/squad/squad-train-v1.1.json"
"url": "https://rajpurkar.github.io/SQuAD-explorer/explore/2.0/dev/",
"processed_url": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/squad/squad-train-v2.0.json"
},
"motivation": "For the pretrained RoBERTa model, document-level corpora were used rather than a shuffled sentence-level corpus such as the Billion Word Benchmark (Chelba et al., 2013) in order to extract long contiguous sequences",
"preprocessing": "For the pretrained RoBERTa model, only the text passages were extracted from English Wikipedia; lists, tables, and headers were ignored."
},
"quantitative_analyses": {
"unitary_results": null,
"unitary_results": "On the validation set:\nF1: 88%\nExact match: 84%\nThese are metrics using the official evaluation. Note that the metrics that the model produces while training are calculated on a per-instance basis only. Since there could be more than one instance per question, these metrics are not the official numbers on the SQuAD task. To get official numbers, run the evaluation script at allennlp_models/rc/tools/transformer_qa_eval.py.",
"intersectional_results": null
},
"model_caveats_and_recommendations": {
Expand All @@ -63,8 +63,8 @@
"ethical_considerations": null
},
"model_usage": {
"archive_file": "transformer-qa-2020-10-03.tar.gz",
"archive_file": "transformer-qa.2021-02-11.tar.gz",
"training_config": "rc/transformer_qa.jsonnet",
"install_instructions": "pip install allennlp==1.0.0 allennlp-models==1.0.0"
"install_instructions": "pip install allennlp==2.1.0 allennlp-models==2.1.0"
}
}
}
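As the unitary results above say, official SQuAD 2.0 numbers come from the standalone eval script, not from the per-instance training metrics. A hedged sketch of driving that script from Python: the flag names are guesses derived from the `args.qa_model`, `args.input_file`, and `args.cuda_device` attributes visible in the script's diff further down, and the model URL assumes the public models bucket, so verify both against the script's `--help` before use:

```python
# Sketch only: flag names and the archive URL are assumptions (see lead-in).
# The archive filename and the dataset URL come from the model card above.
import subprocess

subprocess.run(
    [
        "python", "-m", "allennlp_models.rc.tools.transformer_qa_eval",
        "--qa-model",
        "https://storage.googleapis.com/allennlp-public-models/transformer-qa.2021-02-11.tar.gz",
        "--input-file",
        "https://s3-us-west-2.amazonaws.com/allennlp/datasets/squad/squad-dev-v2.0.json",
        "--cuda-device", "0",
    ],
    check=True,
)
```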
16 changes: 9 additions & 7 deletions allennlp_models/modelcards/tagging-elmo-crf-tagger.json
@@ -38,7 +38,8 @@
"dataset": {
"name": "CoNLL-2003 NER dataset",
"url": "https://www.clips.uantwerpen.be/conll2003/ner/",
"notes": "The NER model was evaluated on the CoNLL-2003 NER dataset. Unfortunately we cannot release this data due to licensing restrictions."
"notes": "The NER model was evaluated on the CoNLL-2003 NER dataset. Unfortunately we cannot release this data due to licensing restrictions.",
"processed_url": "path/to/dataset"
},
"motivation": null,
"preprocessing": null
@@ -47,24 +48,25 @@
"dataset": {
"name": "CoNLL-2003 NER dataset",
"url": "https://www.clips.uantwerpen.be/conll2003/ner/",
"notes": "The NER model was trained on the CoNLL-2003 NER dataset. Unfortunately we cannot release this data due to licensing restrictions."
"notes": "The NER model was trained on the CoNLL-2003 NER dataset. Unfortunately we cannot release this data due to licensing restrictions.",
"processed_url": "/path/to/dataset"
},
"motivation": null,
"preprocessing": null
},
"quantitative_analyses": {
"unitary_results": null,
"unitary_results": "Achieves 99% accuracy and 96% F1 on the CoNLL-2003 validation set.",
"intersectional_results": null
},
"model_caveats_and_recommendations": {
"caveats_and_recommendations": null
"caveats_and_recommendations": "This model is based on ELMo. ELMo is not deterministic, meaning that you will see slight differences every time you run it. Also, ELMo likes to be warmed up, so we recommend processing dummy input before processing real workloads with it."
},
"model_ethical_considerations": {
"ethical_considerations": null
},
"model_usage": {
"archive_file": "ner-model-2020.02.10.tar.gz",
"archive_file": "ner-elmo.2021-02-12.tar.gz",
"training_config": "tagging/ner_elmo.jsonnet",
"install_instructions": "pip install allennlp==1.0.0 allennlp-models==1.0.0"
"install_instructions": "pip install allennlp==2.1.0 allennlp-models==2.1.0"
}
}
}
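For completeness, a short usage sketch for this tagger. Assumptions: the model ID `tagging-elmo-crf-tagger` is inferred from the modelcard filename, and the output keys `words` and `tags` follow AllenNLP's sentence-tagger conventions; the ELMo warm-up shown earlier applies here as well:

```python
# Usage sketch; model ID and output keys are assumptions (see lead-in).
from allennlp_models.pretrained import load_predictor

predictor = load_predictor("tagging-elmo-crf-tagger")

# Warm ELMo up first, as this model card's caveat recommends.
predictor.predict(sentence="A dummy sentence to warm up ELMo.")

result = predictor.predict(sentence="AllenNLP is built by the Allen Institute for AI.")
# Pair each token with its predicted BIO tag.
for word, tag in zip(result["words"], result["tags"]):
    print(word, tag)
```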
14 changes: 8 additions & 6 deletions allennlp_models/modelcards/tagging-fine-grained-crf-tagger.json
@@ -38,7 +38,8 @@
"dataset": {
"name": "Ontonotes 5.0",
"url": "https://catalog.ldc.upenn.edu/LDC2013T19",
"notes": "We cannot release this data due to licensing restrictions."
"notes": "Unfortunately we cannot release this data due to licensing restrictions.",
"processed_url": "/path/do/dataset"
},
"motivation": null,
"preprocessing": null
@@ -47,13 +48,14 @@
"dataset": {
"name": "Ontonotes 5.0",
"url": "https://catalog.ldc.upenn.edu/LDC2013T19",
"notes": "We cannot release this data due to licensing restrictions."
"notes": "Unfortunately we cannot release this data due to licensing restrictions.",
"processed_url": "/path/do/dataset"
},
"motivation": null,
"preprocessing": null
},
"quantitative_analyses": {
"unitary_results": null,
"unitary_results": "On the validation set:\nAccuracy: 97%\nF1: 88%",
"intersectional_results": null
},
"model_caveats_and_recommendations": {
@@ -63,8 +65,8 @@
"ethical_considerations": null
},
"model_usage": {
"archive_file": "fine-grained-ner.2020-06-24.tar.gz",
"archive_file": "fine-grained-ner.2021-02-11.tar.gz",
"training_config": "tagging/fine-grained-ner.jsonnet",
"install_instructions": "pip install allennlp==1.0.0 allennlp-models==1.0.0"
"install_instructions": "pip install allennlp==2.1.0 allennlp-models==2.1.0"
}
}
}
@@ -34,7 +34,7 @@
"dataset": {
"name": "Ontonotes 5.0",
"url": "https://catalog.ldc.upenn.edu/LDC2013T19",
"notes": "We cannot release this data due to licensing restrictions."
"notes": "Unfortunately we cannot release this data due to licensing restrictions."
},
"motivation": null,
"preprocessing": null
@@ -43,13 +43,13 @@
"dataset": {
"name": "Ontonotes 5.0",
"url": "https://catalog.ldc.upenn.edu/LDC2013T19",
"notes": "We cannot release this data due to licensing restrictions."
"notes": "Unfortunately we cannot release this data due to licensing restrictions."
},
"motivation": null,
"preprocessing": null
},
"quantitative_analyses": {
"unitary_results": null,
"unitary_results": "On the validation set:\nAccuracy: 98%\nF1: 88%",
"intersectional_results": null
},
"model_caveats_and_recommendations": {
@@ -59,8 +59,8 @@
"ethical_considerations": null
},
"model_usage": {
"archive_file": "fgner_transformer.2020-07-14.tar.gz",
"archive_file": "fgner-transformer.2021-02-11.tar.gz",
"training_config": "tagging/fgner_transformer.jsonnet",
"install_instructions": "pip install allennlp==1.0.0 allennlp-models==1.0.0"
"install_instructions": "pip install allennlp==2.1.0 allennlp-models==2.1.0"
}
}
}
17 changes: 10 additions & 7 deletions allennlp_models/modelcards/vqa-vilbert.json
@@ -38,21 +38,24 @@
"dataset": {
"name": "VQA dataset",
"url": "https://visualqa.org/",
"notes": "Evaluation requires a large amount of images to be accessible locally, so we cannot provide a command you can easily copy and paste."
"notes": "Evaluation requires a large amount of images to be accessible locally, so we cannot provide a command you can easily copy and paste. The first time you run it, you will get an error message that tells you how to get the rest of the data.",
"processed_url": "balanced_real_val"
},
"motivation": null,
"preprocessing": null
},
"training_data": {
"dataset": {
"name": "VQA dataset",
"url": "https://visualqa.org/"
"url": "https://visualqa.org/",
"notes": "Training requires a large amount of images to be accessible locally, so we cannot provide a command you can easily copy and paste. The first time you run it, you will get an error message that tells you how to get the rest of the data.",
"processed_url": "balanced_real_train"
},
"motivation": null,
"preprocessing": null
},
"quantitative_analyses": {
"unitary_results": null,
"unitary_results": "On the validation set:\nF1: 41%\nVQA: 52%.\nThese scores do not match the performance in the VilBERT paper. Please contact us if you want to match those scores!",
"intersectional_results": null
},
"model_ethical_considerations": {
@@ -62,8 +65,8 @@
"caveats_and_recommendations": null
},
"model_usage": {
"archive_file": "vilbert-vqa-2020.10.01.tar.gz",
"training_config": "vision/vilbert_vqa_from_huggingface.jsonnet",
"install_instructions": "pip install allennlp==1.0.0 allennlp-models==1.0.0"
"archive_file": "vilbert-vqa-pretrained.2021-02-11.tar.gz",
"training_config": "vision/vilbert_vqa_pretrained.jsonnet",
"install_instructions": "pip install allennlp==2.1.0 allennlp-models==2.1.0"
}
}
}
11 changes: 8 additions & 3 deletions allennlp_models/rc/tools/transformer_qa_eval.py
@@ -14,7 +14,7 @@
 logger = logging.getLogger(__name__)
 
 if __name__ == "__main__":
-    import allennlp_models.rc.transformer_qa  # noqa F401: Needed to register the registrables.
+    import allennlp_models.rc  # noqa F401: Needed to register the registrables.
     import argparse
 
     logging.basicConfig(level=logging.INFO)
@@ -35,7 +35,6 @@
         args.qa_model, predictor_name="transformer_qa", cuda_device=args.cuda_device
     )
     instances = predictor._dataset_reader.read(args.input_file)
-    logger.info("Running on %d instances", len(instances))
 
     # We have to make sure we put instances with the same qid all into the same batch.
     def batch_instances_by_qid(instances: Iterable[Instance]) -> Iterable[List[Instance]]:
@@ -74,10 +73,16 @@ def make_batches(
     metric = SquadEmAndF1()
     answers = {}
     for batch in make_batches(tqdm(instances, desc="Evaluating instances")):
+        gold_answers = {
+            instance["metadata"]["id"]: instance["metadata"]["answers"] for instance in batch
+        }
         for result in predictor.predict_batch_instance(batch):
             assert result["id"] not in ids_seen
             ids_seen.add(result["id"])
-            metric(result["best_span_str"], result["answers"])
+            gold_answer = gold_answers[result["id"]]
+            if len(gold_answer) == 0:
+                gold_answer = [""]  # no-answer case
+            metric(result["best_span_str"], gold_answer)
             answers[result["id"]] = result["best_span_str"]
             if time.monotonic() - last_logged_scores_time > 30:
                 exact_match, f1_score = metric.get_metric()
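The body of `batch_instances_by_qid` is collapsed out of the hunk above. The comment in the script explains why it exists: a long passage produces several instances for one question, and their predictions only combine correctly when they land in the same batch. A hypothetical reconstruction of the grouping, assuming the dataset reader emits instances for the same question consecutively (the real body may differ):

```python
# Hypothetical reconstruction of the collapsed helper; not the committed code.
from itertools import groupby
from typing import Iterable, List

from allennlp.data import Instance


def batch_instances_by_qid(instances: Iterable[Instance]) -> Iterable[List[Instance]]:
    # instance["metadata"]["id"] is the question ID, the same key the eval
    # loop above uses to look up gold answers. groupby is enough because
    # instances for one qid arrive adjacently in reading order.
    for _, group in groupby(instances, key=lambda ins: ins["metadata"]["id"]):
        yield list(group)
```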
4 changes: 2 additions & 2 deletions allennlp_models/vision/predictors/vilbert_vqa.py
@@ -11,9 +11,9 @@
 
 @Predictor.register("vilbert_vqa")
 class VilbertVqaPredictor(Predictor):
-    def predict(self, image: str, sentence: str) -> JsonDict:
+    def predict(self, image: str, question: str) -> JsonDict:
         image = cached_path(image)
-        return self.predict_json({"question": sentence, "image": image})
+        return self.predict_json({"question": question, "image": image})
 
     @overrides
     def _json_to_instance(self, json_dict: JsonDict) -> Instance:
10 changes: 10 additions & 0 deletions tests/pretrained_test.py
@@ -459,6 +459,16 @@ def test_transformer_qa(self):
         result = predictor.predict(question, passage)
         assert result["best_span_str"] == ""
 
+    def test_vilbert_vqa(self):
+        predictor = load_predictor("vqa-vilbert")
+
+        result = predictor.predict(
+            question="What game are they playing?",
+            image="https://storage.googleapis.com/allennlp-public-data/vqav2/baseball.jpg",
+        )
+        max_answer = max((prob, answer) for answer, prob in result["tokens"].items())[1]
+        assert max_answer == "baseball"
+
     @pytest.mark.parametrize("model_id, model_card", get_pretrained_models().items())
     def test_pretrained_models(self, model_id, model_card):
         assert model_card.model_usage.archive_file is not None