Add more augmentation methods (#519)

* fix version update error and import errors
* clear output
* fix typo in data_select example
* add typo replacement augmentation and its example
* add copyright text
* fix documentations for typo replacement
* combine op and generator files and add unit tests
* allow dict_path to be a local file or an url
* commit deleted files and fix init in generator
* fix naming, add code-blocks and format python file

Co-authored-by: wanglechuan-gif <[email protected]>
asyml · Sep 8, 2021 · 8bbf1a0 · 8bbf1a0
1 parent c65814b
commit 8bbf1a0
Show file tree

Hide file tree

Showing 4 changed files with 225 additions and 5 deletions.
diff --git a/examples/data_augmentation/tutorial_for_data_augmentation.ipynb b/examples/data_augmentation/tutorial_for_data_augmentation.ipynb
@@ -20,8 +20,9 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "from forte.processors.base.data_augment_processor import ReplacementDataAugmentProcessor\n",
+ "from forte.processors.data_augment import ReplacementDataAugmentProcessor\n",
  "from forte.pipeline import Pipeline\n",
+ "from forte.data.multi_pack import MultiPack\n",
  "\n",
  "nlp = Pipeline[MultiPack]()\n",
  "\n",
@@ -48,7 +49,43 @@
  "}\n",
  "\n",
  "processor = ReplacementDataAugmentProcessor()\n",
- "nlp.add(component=processor, configs=processor_config)"
+ "nlp.add(component=processor, config=processor_config)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here is another example for typo data augmentation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from forte.data.data_pack import DataPack\n",
+ "from ft.onto.base_ontology import Token\n",
+ "from forte.processors.data_augment.algorithms.typo_replacement_op import (\n",
+ " TypoReplacementOp,\n",
+ ")\n",
+ "\n",
+ "opr = TypoReplacementOp(\n",
+ " configs={\n",
+ " \"prob\": 0.6,\n",
+ " 'typo_generator': 'uniform',\n",
+ " 'dict_path': 'https://raw.githubusercontent.com/wanglec/temporaryJson/main/misspelling.json'\n",
+ " }\n",
+ ")\n",
+ "data_pack = DataPack()\n",
+ "data_pack.set_text(\"commonly addressable\")\n",
+ "token_1 = Token(data_pack, 0, 8)\n",
+ "token_2 = Token(data_pack, 9, 20)\n",
+ "data_pack.add_entry(token_1)\n",
+ "data_pack.add_entry(token_2)\n",
+ "print(opr.replace(token_1))\n",
+ "print(opr.replace(token_2))"
  ]
  },
  {
@@ -145,13 +182,25 @@
  "\n",
  "To see how to use these two classes to build the RL-based DA model, and to see an example that uses this algorithm for text classification, please refer to `examples/data_augmentation/reinforcemennt/README.md` for details."
  ]
- },
+ }
  ],
  "metadata": {
  "kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
  "language": "python",
  "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.11"
  }
  },
  "nbformat": 4,

diff --git a/forte/processors/base/data_selector_for_da.py b/forte/processors/base/data_selector_for_da.py
@@ -54,7 +54,7 @@ def initialize(self, resources: Resources, configs: Config):
  super().initialize(resources, configs)
  self.index = create_class_with_kwargs(
  self.configs.indexer_class,
- class_args={"config": self.configs.index_configs},
+ class_args={"config": self.configs.index_config},
  )
 
  def _create_search_key(self, data: Optional[str]) -> Dict[str, Any]:

diff --git a/forte/processors/data_augment/algorithms/typo_replacement_op.py b/forte/processors/data_augment/algorithms/typo_replacement_op.py
@@ -0,0 +1,117 @@
+# Copyright 2020 The Forte Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import random
+import json
+from typing import Tuple, Union, Dict, Any
+
+import requests
+from forte.data.ontology import Annotation
+from forte.processors.data_augment.algorithms.text_replacement_op import (
+ TextReplacementOp,
+)
+from forte.common.configuration import Config
+
+__all__ = [
+ "UniformTypoGenerator",
+ "TypoReplacementOp",
+]
+
+
class UniformTypoGenerator:
    r"""
    A uniform generator that picks a typo for a word from a typo dictionary.

    Every typo listed for a word is equally likely to be chosen.

    Args:
        dict_path: the url or the path to the pre-defined typo json file.
            A value starting with ``http://`` or ``https://`` is downloaded;
            anything else is opened as a local file.
            The key is a word we want to replace. The value is a list
            containing various typos of the corresponding key.

            .. code-block:: python

                {
                    "apparent": ["aparent", "apparant"],
                    "bankruptcy": ["bankrupcy", "banruptcy"],
                    "barbecue": ["barbeque"]
                }

    Raises:
        requests.exceptions.RequestException: if ``dict_path`` is a url and
            the download fails, times out or returns an HTTP error status.
        OSError: if ``dict_path`` is a local path that cannot be opened.
    """

    def __init__(self, dict_path: str):
        if dict_path.lower().startswith(("http://", "https://")):
            # Bound the wait and surface HTTP errors directly, instead of
            # hanging forever or falling back to opening the url as a file.
            response = requests.get(dict_path, timeout=30)
            response.raise_for_status()
            self.data = response.json()
        else:
            with open(dict_path, encoding="utf8") as json_file:
                self.data = json.load(json_file)

    def generate(self, word: str) -> str:
        r"""
        Return a randomly chosen typo for ``word``.

        Args:
            word: the word to be replaced.
        Returns:
            One of the typos listed for ``word``, or ``word`` itself when the
            dictionary has no entry (or an empty entry) for it.
        """
        candidates = self.data.get(word)
        if not candidates:
            return word
        result: str = random.choice(candidates)
        return result
+
+
class TypoReplacementOp(TextReplacementOp):
    r"""
    This class is a replacement op using a pre-defined
    spelling mistake dictionary to simulate spelling mistakes.

    Args:
        configs:
            The config should contain

            - `prob` (float): The probability of replacement,
              should fall in [0, 1].
            - `dict_path` (str): the url or the path to the pre-defined
              typo json file. The key is a word we want to replace.
              The value is a list containing various typos
              of the corresponding key. Optional; a default
              misspelling dictionary url is used when it is absent.
            - `typo_generator` (str): The name of the generator that takes
              in a word and outputs the replacement typo. Optional;
              the only valid (and default) option is ``"uniform"``.

    Raises:
        ValueError: if `typo_generator` is given and is not ``"uniform"``.
    """

    def __init__(self, configs: Union[Config, Dict[str, Any]]):
        super().__init__(configs)
        if "dict_path" in configs.keys():
            self.dict_path = configs["dict_path"]
        else:
            # default typo dictionary
            self.dict_path = (
                "https://raw.githubusercontent.com/wanglec/"
                "temporaryJson/main/misspelling.json"
            )

        # Only one generator exists, so fall back to it rather than
        # failing with a bare KeyError when the option is omitted.
        if "typo_generator" in configs.keys():
            generator_name = configs["typo_generator"]
        else:
            generator_name = "uniform"
        if generator_name != "uniform":
            raise ValueError(
                "The valid options for typo_generator are [uniform]"
            )
        self.typo_generator = UniformTypoGenerator(self.dict_path)

    def replace(self, input_anno: Annotation) -> Tuple[bool, str]:
        r"""
        This function replaces a word with a typo from the typo dictionary.

        Args:
            input_anno (Annotation): The input annotation.
        Returns:
            A tuple, where the first element is a boolean value indicating
            whether the replacement happens, and the second element is the
            replaced string. When the replacement is skipped, or the
            generator returns the word unchanged, the tuple is
            ``(False, <original text>)``.
        """
        original: str = input_anno.text
        # random.random() is in [0.0, 1.0), so ">=" makes prob == 0 never
        # replace and prob == 1 always attempt a replacement.
        if random.random() >= self.configs.prob:
            return False, original
        word: str = self.typo_generator.generate(original)
        # Report a replacement only if the text actually changed.
        return word != original, word
diff --git a/tests/forte/processors/data_augment/algorithms/typo_replacement_op_test.py b/tests/forte/processors/data_augment/algorithms/typo_replacement_op_test.py
@@ -0,0 +1,54 @@
+# Copyright 2020 The Forte Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Unit tests for typo replacement op.
+"""
+
+import unittest
+from forte.data.data_pack import DataPack
+from ft.onto.base_ontology import Token
+from forte.processors.data_augment.algorithms.typo_replacement_op import (
+ TypoReplacementOp,
+)
+
+
class TestTypoReplacementOp(unittest.TestCase):
    """Checks that TypoReplacementOp swaps known words for listed typos."""

    def setUp(self):
        # Local imports keep this block self-contained.
        import json
        import os
        import tempfile

        # Use a local fixture so the test needs no network access and its
        # expected values cannot drift with a remote file.
        self.typo_dict = {
            "auxiliary": ["auxilliary", "auxilary", "auxillary"],
            "colleague": ["collegue", "colleaque"],
        }
        fd, dict_path = tempfile.mkstemp(suffix=".json")
        with os.fdopen(fd, "w", encoding="utf8") as json_file:
            json.dump(self.typo_dict, json_file)
        self.addCleanup(os.remove, dict_path)

        self.tyre = TypoReplacementOp(
            configs={
                # prob 1.0 makes the op attempt a replacement on every call.
                "prob": 1.0,
                "typo_generator": "uniform",
                "dict_path": dict_path,
            }
        )

    def test_replace(self):
        data_pack = DataPack()
        data_pack.set_text("auxiliary colleague apple")
        token_1 = Token(data_pack, 0, 9)
        token_2 = Token(data_pack, 10, 19)
        token_3 = Token(data_pack, 20, 25)
        data_pack.add_entry(token_1)
        data_pack.add_entry(token_2)
        data_pack.add_entry(token_3)

        self.assertIn(
            self.tyre.replace(token_1)[1],
            self.typo_dict["auxiliary"],
        )
        self.assertIn(
            self.tyre.replace(token_2)[1],
            self.typo_dict["colleague"],
        )
        # "apple" has no dictionary entry, so the text comes back unchanged.
        self.assertEqual(self.tyre.replace(token_3)[1], "apple")
+
+
+if __name__ == "__main__":
+ unittest.main()