Skip to content

Commit

Permalink
format with black; revise annotations
Browse files Browse the repository at this point in the history
  • Loading branch information
qinzzz committed Oct 6, 2021
1 parent 43eca0a commit d4fbf6c
Show file tree
Hide file tree
Showing 6 changed files with 51 additions and 47 deletions.
1 change: 0 additions & 1 deletion data_samples/squad_v2.0/dev-v2.0.json

This file was deleted.

2 changes: 1 addition & 1 deletion forte/data/readers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,4 @@
from forte.data.readers.ag_news_reader import *
from forte.data.readers.largemovie_reader import *
from forte.data.readers.misc_readers import *
from forte.data.readers.squad_reader import *
from forte.data.readers.squad_reader import *
40 changes: 21 additions & 19 deletions forte/datasets/mrc/squad_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,56 +28,60 @@
class SquadReader(PackReader):
r"""Reader for processing Stanford Question Answering Dataset (SQuAD).
Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset,
consisting of questions posed by crowdworkers on a set of Wikipedia articles,
where the answer to every question is a segment of text, or span.
Dataset can be downloaded at https://rajpurkar.github.io/SQuAD-explorer/.
SquadReader reads each paragraph in the dataset as a separate Document, and the questions
are concatenated behind the paragraph, form a Passage.
MRCAnswers are marked as text spans. Each MRCQuestion has a list of answers as its attribute.
"""

def _collect(self, file_path) -> Iterator[Any]: # type: ignore
r"""Should be called with param ``text_directory`` which is a path to a
folder containing txt files.
def _collect(self, file_path: str) -> Iterator[Any]: # type: ignore
r"""Given file_path to the dataset, return an iterator to every data point in it.
Args:
text_directory: text directory containing the files.
file_path: path to the JSON file
Returns: Iterator over paths to .txt files
Returns: QA pairs and the context of a paragraph of a passage in SQuAD dataset.
"""
with open(file_path, "r", encoding="utf8", errors="ignore") as file:
jsonf = json.load(file)
for dic in jsonf["data"]:
title = dic["title"]
cnt = 0
for qa_dic in dic["paragraphs"]:
yield title+str(cnt), qa_dic["qas"], qa_dic["context"]
yield title + str(cnt), qa_dic["qas"], qa_dic["context"]
cnt += 1

def _cache_key_function(self, text_file: str) -> str:
return os.path.basename(text_file)

# pylint: disable=unused-argument
def text_replace_operation(self, text: str):
    """Return the replacement operations for ``text``.

    This implementation ignores ``text`` and always returns a new empty list.

    Args:
        text: The text being loaded; not inspected here.

    Returns:
        A fresh empty list on every call.
    """
    no_replacements: list = []
    return no_replacements

def _parse_pack(self, qa_dict: Tuple[str, list, str]) -> Iterator[DataPack]:
title, qas, context = qa_dict
context_end = len(context)
offset = context_end+1
offset = context_end + 1
text = context

pack = DataPack() # one datapack for a context
pack = DataPack() # one datapack for a context
for qa in qas:
if qa["is_impossible"] == True:
if qa["is_impossible"] is True:
continue
ques_text = qa["question"]
ans = qa["answers"]
text += "\n" + ques_text
ques_end = offset + len(ques_text)
question = MRCQuestion(pack, offset, ques_end)
offset = ques_end+1
offset = ques_end + 1
for a in ans:
ans_text = a["text"]
ans_start = a["answer_start"]
answer = MRCAnswer(pack, ans_start, ans_start+len(ans_text))
answer = MRCAnswer(pack, ans_start, ans_start + len(ans_text))
question.answers.append(answer)

pack.set_text(text, replace_func=self.text_replace_operation)
pack.set_text(text)

Document(pack, 0, context_end)
passage = Passage(pack, 0, len(pack.text))
Expand All @@ -88,8 +92,7 @@ def _parse_pack(self, qa_dict: Tuple[str, list, str]) -> Iterator[DataPack]:

@classmethod
def default_configs(cls):
config = super().default_configs()
config["file_ext"] = ".txt"
config = {"file_ext": ".txt"}
return config

def record(self, record_meta: Dict[str, Set[str]]):
Expand All @@ -102,4 +105,3 @@ def record(self, record_meta: Dict[str, Set[str]]):
fill in for consistency checking.
"""
record_meta["ft.onto.base_ontology.Document"] = set()

4 changes: 3 additions & 1 deletion forte/ontology_specs/base_ontology.json
Original file line number Diff line number Diff line change
Expand Up @@ -402,11 +402,13 @@
},
{
"entry_name": "ft.onto.base_ontology.MRCAnswer",
"parent_entry": "forte.data.ontology.top.Annotation"
"parent_entry": "forte.data.ontology.top.Annotation",
"description": "An `Annotation` type entry which represents an answer to an MRC question, typically a span in the given paragraph."
},
{
"entry_name": "ft.onto.base_ontology.MRCQuestion",
"parent_entry": "forte.data.ontology.top.Annotation",
"description": "An `Annotation` type which represents an MRC question.",
"attributes": [
{
"name": "answers",
Expand Down
28 changes: 15 additions & 13 deletions ft/onto/base_ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@
"Title",
"MCOption",
"MCQuestion",
"QAQuestion"
"MRCAnswer",
"MRCQuestion",
]


Expand Down Expand Up @@ -495,26 +496,27 @@ def __init__(self, pack: DataPack, begin: int, end: int):
self.options: FList[MCOption] = FList(self)
self.answers: List[int] = []


@dataclass
class MRCAnswer(Annotation):
"""[summary]
Args:
Annotation ([type]): [description]
"""
An `Annotation` type entry which represents an answer to an MRC question, typically a span in the given paragraph.
"""

def __init__(self, pack:DataPack, begin: int, end: int):
def __init__(self, pack: DataPack, begin: int, end: int):
super().__init__(pack, begin, end)


@dataclass
class MRCQuestion(Annotation):
"""[summary]
Args:
Annotation ([type]): [description]
"""
answers: List[MRCAnswer]
An `Annotation` type which represents an MRC question.
Attributes:
answers (FList[MRCAnswer]):
"""

def __init__(self, pack:DataPack, begin: int, end: int):
answers: FList[MRCAnswer]

def __init__(self, pack: DataPack, begin: int, end: int):
super().__init__(pack, begin, end)
self.answers: List[MRCAnswer] = []
self.answers: FList[MRCAnswer] = FList(self)
23 changes: 11 additions & 12 deletions tests/forte/datasets/mrc/squad_dataset_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@
"""
Unit tests for SquadReader.
"""
# import sys
# sys.path.insert(1, '/Users/qinzzz/Desktop/forte')

import json
import os
import unittest
Expand All @@ -34,7 +31,7 @@ def setUp(self):
os.path.join(
os.path.dirname(os.path.realpath(__file__)),
*([os.path.pardir] * 4),
"data_samples/squad_v2.0/train-v2.0.json"
"data_samples/squad_v2.0/dev-v2.0-sample.json"
)
)

Expand All @@ -51,13 +48,15 @@ def test_reader_no_replace_test(self):
file_path: str = self.dataset_path
expected_file_dict = {}
with open(file_path, "r", encoding="utf8", errors="ignore") as file:
expected_json = json.load(file)
for dic in expected_json["data"]:
title = dic["title"]
cnt = 0
for qa_dic in dic["paragraphs"]:
expected_file_dict[title+str(cnt)] = qa_dic # qas, context
cnt += 1
expected_json = json.load(file)
for dic in expected_json["data"]:
title = dic["title"]
cnt = 0
for qa_dic in dic["paragraphs"]:
expected_file_dict[
title + str(cnt)
] = qa_dic # qas, context
cnt += 1

count_packs = 0
for pack in data_packs:
Expand All @@ -79,7 +78,7 @@ def test_reader_no_replace_test(self):
if not isinstance(expected_answers, list):
expected_answers = [expected_answers]
answers = question.answers

for answer, expected_answer in zip(answers, expected_answers):
self.assertEqual(answer.text, expected_answer["text"])
expected_text += "\n" + expected_question
Expand Down

0 comments on commit d4fbf6c

Please sign in to comment.