format with black; revise annotations

asyml · Oct 6, 2021 · d4fbf6c · d4fbf6c
1 parent 43eca0a
commit d4fbf6c
Show file tree

Hide file tree

Showing 6 changed files with 51 additions and 47 deletions.
diff --git a/data_samples/squad_v2.0/dev-v2.0.json b/data_samples/squad_v2.0/dev-v2.0.json
diff --git a/forte/data/readers/__init__.py b/forte/data/readers/__init__.py
@@ -30,4 +30,4 @@
 from forte.data.readers.ag_news_reader import *
 from forte.data.readers.largemovie_reader import *
 from forte.data.readers.misc_readers import *
-from forte.data.readers.squad_reader import *
+from forte.data.readers.squad_reader import *
diff --git a/forte/datasets/mrc/squad_reader.py b/forte/datasets/mrc/squad_reader.py
@@ -28,56 +28,60 @@
 class SquadReader(PackReader):
  r"""Reader for processing Stanford Question Answering Dataset (SQuAD).
 
+ Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset,
+ consisting of questions posed by crowdworkers on a set of Wikipedia articles,
+ where the answer to every question is a segment of text, or span.
+
+ Dataset can be downloaded at https://rajpurkar.github.io/SQuAD-explorer/.
+
+ SquadReader reads each paragraph in the dataset as a separate Document, and the questions
+ are concatenated after the paragraph; together they form a Passage.
+ MRCAnswers are marked as text spans. Each MRCQuestion has a list of answers as its attribute.
  """
 
- def _collect(self, file_path) -> Iterator[Any]: # type: ignore
- r"""Should be called with param ``text_directory`` which is a path to a
- folder containing txt files.
+ def _collect(self, file_path: str) -> Iterator[Any]: # type: ignore
+ r"""Given file_path to the dataset, return an iterator to every data point in it.
 
  Args:
- text_directory: text directory containing the files.
+ file_path: path to the JSON file
 
- Returns: Iterator over paths to .txt files
+ Returns: An iterator that yields, for each paragraph, an identifier (article title plus paragraph index), its list of QA pairs, and its context text.
  """
  with open(file_path, "r", encoding="utf8", errors="ignore") as file:
  jsonf = json.load(file)
  for dic in jsonf["data"]:
  title = dic["title"]
  cnt = 0
  for qa_dic in dic["paragraphs"]:
- yield title+str(cnt), qa_dic["qas"], qa_dic["context"]
+ yield title + str(cnt), qa_dic["qas"], qa_dic["context"]
  cnt += 1
 
  def _cache_key_function(self, text_file: str) -> str:
  return os.path.basename(text_file)
 
- # pylint: disable=unused-argument
- def text_replace_operation(self, text: str):
- return []
-
  def _parse_pack(self, qa_dict: Tuple[str, list, str]) -> Iterator[DataPack]:
  title, qas, context = qa_dict
  context_end = len(context)
- offset = context_end+1
+ offset = context_end + 1
  text = context
 
- pack = DataPack() # one datapack for a context
+ pack = DataPack()  # one datapack for a context
  for qa in qas:
- if qa["is_impossible"] == True:
+ if qa["is_impossible"] is True:
  continue
  ques_text = qa["question"]
  ans = qa["answers"]
  text += "\n" + ques_text
  ques_end = offset + len(ques_text)
  question = MRCQuestion(pack, offset, ques_end)
- offset = ques_end+1
+ offset = ques_end + 1
  for a in ans:
  ans_text = a["text"]
  ans_start = a["answer_start"]
- answer = MRCAnswer(pack, ans_start, ans_start+len(ans_text))
+ answer = MRCAnswer(pack, ans_start, ans_start + len(ans_text))
  question.answers.append(answer)
 
- pack.set_text(text, replace_func=self.text_replace_operation)
+ pack.set_text(text)
 
  Document(pack, 0, context_end)
  passage = Passage(pack, 0, len(pack.text))
@@ -88,8 +92,7 @@ def _parse_pack(self, qa_dict: Tuple[str, list, str]) -> Iterator[DataPack]:
 
  @classmethod
  def default_configs(cls):
- config = super().default_configs()
- config["file_ext"] = ".txt"
+ config = {"file_ext": ".txt"}
  return config
 
  def record(self, record_meta: Dict[str, Set[str]]):
@@ -102,4 +105,3 @@ def record(self, record_meta: Dict[str, Set[str]]):
  fill in for consistency checking.
  """
  record_meta["ft.onto.base_ontology.Document"] = set()
-
diff --git a/forte/ontology_specs/base_ontology.json b/forte/ontology_specs/base_ontology.json
@@ -402,11 +402,13 @@
  },
  {
  "entry_name": "ft.onto.base_ontology.MRCAnswer",
- "parent_entry": "forte.data.ontology.top.Annotation"
+ "parent_entry": "forte.data.ontology.top.Annotation",
+ "description": "An `Annotation` type entry which represents an answer to an MRC question, typically a span in the given paragraph."
  },
  {
  "entry_name": "ft.onto.base_ontology.MRCQuestion",
  "parent_entry": "forte.data.ontology.top.Annotation",
+ "description": "An `Annotation` type which represents an MRC question.",
  "attributes": [
  {
  "name": "answers",

diff --git a/ft/onto/base_ontology.py b/ft/onto/base_ontology.py
@@ -48,7 +48,8 @@
  "Title",
  "MCOption",
  "MCQuestion",
- "QAQuestion"
+ "MRCAnswer",
+ "MRCQuestion",
 ]
 
 
@@ -495,26 +496,27 @@ def __init__(self, pack: DataPack, begin: int, end: int):
  self.options: FList[MCOption] = FList(self)
  self.answers: List[int] = []
 
+
 @dataclass
 class MRCAnswer(Annotation):
- """[summary]
-
- Args:
- Annotation ([type]): [description]
+ """
+ An `Annotation` type entry which represents an answer to an MRC question, typically a span in the given paragraph.
  """
 
- def __init__(self, pack:DataPack, begin: int, end: int):
+ def __init__(self, pack: DataPack, begin: int, end: int):
  super().__init__(pack, begin, end)
 
+
 @dataclass
 class MRCQuestion(Annotation):
- """[summary]
-
- Args:
- Annotation ([type]): [description]
  """
- answers: List[MRCAnswer]
+ An `Annotation` type which represents an MRC question.
+ Attributes:
+ answers (FList[MRCAnswer]):
+ """
 
- def __init__(self, pack:DataPack, begin: int, end: int):
+ answers: FList[MRCAnswer]
+
+ def __init__(self, pack: DataPack, begin: int, end: int):
  super().__init__(pack, begin, end)
- self.answers: List[MRCAnswer] = []
+ self.answers: FList[MRCAnswer] = FList(self)
diff --git a/tests/forte/datasets/mrc/squad_dataset_test.py b/tests/forte/datasets/mrc/squad_dataset_test.py
@@ -14,9 +14,6 @@
 """
 Unit tests for SquadReader.
 """
-# import sys
-# sys.path.insert(1, '/Users/qinzzz/Desktop/forte')
-
 import json
 import os
 import unittest
@@ -34,7 +31,7 @@ def setUp(self):
  os.path.join(
  os.path.dirname(os.path.realpath(__file__)),
  *([os.path.pardir] * 4),
- "data_samples/squad_v2.0/train-v2.0.json"
+ "data_samples/squad_v2.0/dev-v2.0-sample.json"
  )
  )
 
@@ -51,13 +48,15 @@ def test_reader_no_replace_test(self):
  file_path: str = self.dataset_path
  expected_file_dict = {}
  with open(file_path, "r", encoding="utf8", errors="ignore") as file:
- expected_json = json.load(file)
- for dic in expected_json["data"]:
- title = dic["title"]
- cnt = 0
- for qa_dic in dic["paragraphs"]:
- expected_file_dict[title+str(cnt)] = qa_dic # qas, context
- cnt += 1
+ expected_json = json.load(file)
+ for dic in expected_json["data"]:
+ title = dic["title"]
+ cnt = 0
+ for qa_dic in dic["paragraphs"]:
+ expected_file_dict[
+ title + str(cnt)
+ ] = qa_dic # qas, context
+ cnt += 1
 
  count_packs = 0
  for pack in data_packs:
@@ -79,7 +78,7 @@ def test_reader_no_replace_test(self):
  if not isinstance(expected_answers, list):
  expected_answers = [expected_answers]
  answers = question.answers
- 
+
  for answer, expected_answer in zip(answers, expected_answers):
  self.assertEqual(answer.text, expected_answer["text"])
  expected_text += "\n" + expected_question