Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit

Permalink
Fine grained NER (#84)
Browse files Browse the repository at this point in the history
* Config file for fine-grained NER

* RoBERTa based fine-grained NER

* Need to specify input_dim

* Adds the new fine-grained NER to the list of models

* Changelog

* Adds test for fine-grained NER
  • Loading branch information
dirkgr authored Jun 26, 2020
1 parent 4b5b939 commit 82aa9ac
Show file tree
Hide file tree
Showing 5 changed files with 253 additions and 2 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
to the tokenized source. If you want these in the tokenized source, it's up to
the source tokenizer.

### Added

- Added two models for fine-grained NER

## [v1.0.0](https://github.com/allenai/allennlp-models/releases/tag/v1.0.0) - 2020-06-16

No additional note-worthy changes since rc6.
Expand Down
14 changes: 12 additions & 2 deletions allennlp_models/pretrained.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,22 @@ def named_entity_recognition_with_elmo_peters_2018() -> SentenceTaggerPredictor:
return predictor


def fine_grained_named_entity_recognition_with_elmo_peters_2018() -> SentenceTaggerPredictor:
def fine_grained_named_entity_recognition() -> SentenceTaggerPredictor:
"""
Fine Grained Named Entity Recognition
"""
predictor = _load_predictor(
"https://storage.googleapis.com/allennlp-public-models/fine-grained-ner-model-elmo-2018.12.21.tar.gz"
"https://storage.googleapis.com/allennlp-public-models/fine-grained-ner.2020-06-24.tar.gz"
)
return predictor


def fine_grained_named_entity_recognition_transformer() -> SentenceTaggerPredictor:
    """
    Fine Grained Named Entity Recognition, transformer variant.

    Hands the archive URL of the transformer-based fine-grained NER model to
    `_load_predictor` and returns whatever predictor that call produces.
    """
    archive_url = "https://storage.googleapis.com/allennlp-public-models/fgner_transformer.2020-06-24.tar.gz"
    return _load_predictor(archive_url)

Expand Down
14 changes: 14 additions & 0 deletions tests/pretrained_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,3 +371,17 @@ def test_openie(self):
)
assert "verbs" in result
assert "words" in result

@pytest.mark.parametrize(
    "get_model_fn",
    [
        pretrained.fine_grained_named_entity_recognition,
        pretrained.fine_grained_named_entity_recognition_transformer,
    ],
)
def test_fine_grained_ner(self, get_model_fn):
    """Run each fine-grained NER loader on a sample passage and check the tags are not all identical."""
    passage = """Dwayne Haskins passed for 251 yards and three touchdowns, and Urban Meyer finished his coaching career at Ohio State with a 28-23 victory after the Buckeyes held off Washington’s thrilling fourth-quarter comeback in the 105th Rose Bowl on Tuesday. Parris Campbell, Johnnie Dixon and Rashod Berry caught TD passes in the first half for the fifth-ranked Buckeyes (13-1), who took a 25-point lead into the fourth. But Myles Gaskin threw a touchdown pass and rushed for two more scores for the No. 9 Huskies (10-4), scoring from 2 yards out with 42 seconds left. The Buckeyes intercepted Jake Browning’s pass on the 2-point conversion attempt and then recovered the Huskies’ onside kick to wrap up the final game of Meyer’s seven-year tenure. “I’m a very blessed man,” Meyer said. “I’m blessed because of my family, [but] this team, this year, I love this group as much as any I’ve ever had.”"""
    model = get_model_fn()
    prediction = model.predict_json({"sentence": passage})
    # A tagger that only ever emitted one label (e.g. all "O") would yield a
    # single distinct tag; require at least two different tags.
    distinct_tags = set(prediction["tags"])
    assert len(distinct_tags) > 1
67 changes: 67 additions & 0 deletions training_config/tagging/fgner_transformer.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Training configuration for a fine-grained NER tagger: a "crf_tagger" model on top of
// "pretrained_transformer_mismatched" embeddings of the model named in `transformer_model`.
// The external variable CONLL_DATA_PATH must be supplied; the training and validation
// sets are read from its "/train" and "/development" subdirectories.
local data_dir = std.extVar("CONLL_DATA_PATH");
// Alternative hard-coded data locations, left commented out.
// local data_dir = "/net/nfs.corp/allennlp/dirkg/data/conll-formatted-ontonotes-5.0/data";
// local data_dir = "/Users/dirkg/Documents/data/conll-formatted-ontonotes-5.0/data";

// Settings shared between the dataset reader, the model and the trainer below.
local transformer_model = "roberta-base";
// Passed as the pass-through encoder's input_dim. Presumably the hidden size of
// `transformer_model` — NOTE(review): confirm and update together if the model changes.
local transformer_hidden_dim = 768;
local epochs = 3;
local batch_size = 8;
// Given to both the token indexer and the token embedder so the two agree.
local max_length = 512;

{
"dataset_reader": {
"type": "ontonotes_ner",
// Must agree with the model's "label_encoding" further down (both are "BIOUL").
"coding_scheme": "BIOUL",
"token_indexers": {
// The key "tokens" matches the token embedder key in "text_field_embedder".
"tokens": {
"type": "pretrained_transformer_mismatched",
"model_name": transformer_model,
"max_length": max_length
},
},
},
"train_data_path": data_dir + "/train",
"validation_data_path": data_dir + "/development",
"data_loader": {
"batch_sampler": {
"type": "bucket",
"batch_size": batch_size
}
},
"model": {
"type": "crf_tagger",
// No recurrent encoder here; presumably "pass_through" forwards the transformer
// embeddings unchanged — TODO confirm against the encoder's documentation.
"encoder": {
"type": "pass_through",
"input_dim": transformer_hidden_dim,
},
"include_start_end_transitions": false,
"label_encoding": "BIOUL",
"text_field_embedder": {
"token_embedders": {
"tokens": {
"type": "pretrained_transformer_mismatched",
"model_name": transformer_model,
"max_length": max_length
}
}
},
"verbose_metrics": true
},
"trainer": {
"optimizer": {
"type": "huggingface_adamw",
"weight_decay": 0.0,
// One parameter group: names matching these regexes (biases and layer-norm weights)
// get weight_decay 0. The global weight_decay above is also 0.0, so as written the
// group assigns the same value as the default.
"parameter_groups": [[["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}]],
"lr": 1e-5,
"eps": 1e-8
},
"learning_rate_scheduler": {
"type": "slanted_triangular",
"cut_frac": 0.05,
},
"grad_norm": 1.0,
"num_epochs": epochs,
// NOTE(review): -1 presumably selects the CPU — confirm, and override for GPU training.
"cuda_device": -1,
// Presumably the leading "+" marks the metric as higher-is-better — TODO confirm.
"validation_metric": "+f1-measure-overall"
}
}
156 changes: 156 additions & 0 deletions training_config/tagging/fine-grained-ner.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
// Training configuration for a fine-grained NER tagger: a "crf_tagger" model whose input
// combines three token representations (ELMo, a character-level LSTM encoding, and
// pretrained 50-dimensional word embeddings), encoded by a stacked bidirectional LSTM.
// The external variable CONLL_DATA_PATH must be supplied; the training and validation
// sets are read from its "/train" and "/development" subdirectories.
local data_dir = std.extVar("CONLL_DATA_PATH");
// Alternative hard-coded data locations, left commented out.
// local data_dir = "/net/nfs.corp/allennlp/dirkg/data/conll-formatted-ontonotes-5.0/data";
// local data_dir = "/Users/dirkg/Documents/data/conll-formatted-ontonotes-5.0/data";

{
"dataset_reader": {
"type": "ontonotes_ner",
// Must agree with the model's "label_encoding" further down (both are "BIOUL").
"coding_scheme": "BIOUL",
// Each indexer key here ("elmo", "token_characters", "tokens") has a token embedder
// under the same key in "text_field_embedder".
"token_indexers": {
"elmo": {
"type": "elmo_characters"
},
"token_characters": {
"type": "characters"
},
"tokens": {
"type": "single_id",
"lowercase_tokens": true
}
}
},
"train_data_path": data_dir + "/train",
"validation_data_path": data_dir + "/development",
"data_loader": {
"batch_sampler": {
"type": "bucket",
"batch_size": 64
}
},
"model": {
"type": "crf_tagger",
"dropout": 0.5,
"encoder": {
"type": "stacked_bidirectional_lstm",
"hidden_size": 200,
// Presumably the concatenated width of the three token embedders below
// (50 word + 128 character-LSTM + the ELMo output width) — TODO confirm.
"input_size": 1202,
"num_layers": 2,
"recurrent_dropout_probability": 0.5,
"use_highway": true
},
"feedforward": {
"activations": "tanh",
"dropout": 0.5,
"hidden_dims": 400,
// Presumably 2 x the encoder's hidden_size (forward + backward) — TODO confirm.
"input_dim": 400,
"num_layers": 1
},
"include_start_end_transitions": false,
// Each entry pairs a parameter-name regex with the initializer to use for matches.
"initializer": {
"regexes": [
[
".*tag_projection_layer.*weight",
{
"type": "xavier_uniform"
}
],
[
".*tag_projection_layer.*bias",
{
"type": "zero"
}
],
[
".*feedforward.*weight",
{
"type": "xavier_uniform"
}
],
[
".*feedforward.*bias",
{
"type": "zero"
}
],
[
".*weight_ih.*",
{
"type": "xavier_uniform"
}
],
[
".*weight_hh.*",
{
"type": "orthogonal"
}
],
[
".*bias_ih.*",
{
"type": "zero"
}
],
[
".*bias_hh.*",
{
"type": "lstm_hidden_bias"
}
]
]
},
"label_encoding": "BIOUL",
// L2 penalty (alpha 0.001) on parameters whose names match "scalar_parameters".
// NOTE(review): presumably these are the ELMo layer-mixing weights — confirm.
"regularizer": {
"regexes": [
[
"scalar_parameters",
{
"alpha": 0.001,
"type": "l2"
}
]
]
},
"text_field_embedder": {
"token_embedders": {
"elmo": {
"type": "elmo_token_embedder",
"do_layer_norm": false,
"dropout": 0
},
"token_characters": {
"type": "character_encoding",
"embedding": {
"embedding_dim": 25,
"sparse": true,
"vocab_namespace": "token_characters"
},
// The LSTM's input_size matches the character embedding_dim above (25).
"encoder": {
"type": "lstm",
"hidden_size": 128,
"input_size": 25,
"num_layers": 1
}
},
"tokens": {
"type": "embedding",
"embedding_dim": 50,
"pretrained_file": "https://allennlp.s3.amazonaws.com/datasets/glove/glove.6B.50d.txt.gz",
"sparse": true,
"trainable": true
}
}
},
"verbose_metrics": true
},
"trainer": {
// NOTE(review): -1 presumably selects the CPU — confirm, and override for GPU training.
"cuda_device": -1,
"grad_norm": 5,
"num_epochs": 30,
// Both embeddings above set "sparse": true; presumably "dense_sparse_adam" is used
// so that sparse and dense gradients are both handled — TODO confirm.
"optimizer": {
"type": "dense_sparse_adam",
"lr": 0.001
},
"patience": 25,
// Presumably the leading "+" marks the metric as higher-is-better — TODO confirm.
"validation_metric": "+f1-measure-overall"
}
}

0 comments on commit 82aa9ac

Please sign in to comment.