Skip to content

Commit

Permalink
Fix #86
Browse files Browse the repository at this point in the history
  • Loading branch information
makcedward committed Jan 24, 2020
1 parent b3da60b commit ab2582e
Show file tree
Hide file tree
Showing 7 changed files with 108 additions and 21 deletions.
9 changes: 7 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,9 @@ pip install librosa>=0.7.1
* Fix missing library dependency [#74](https:/makcedward/nlpaug/issues/74)
* Fix single token error when using RandomWordAug [#76](https:/makcedward/nlpaug/issues/76)
* Fix replacing character in RandomCharAug error [#77](https:/makcedward/nlpaug/issues/77)
* Enhanced word's augmenter to provide regular expression for stopwords [#81](https:/makcedward/nlpaug/issues/81)
* Enhance word's augmenter to support regular expression stopwords [#81](https://github.com/makcedward/nlpaug/issues/81)
* Enhance char's augmenter to support regular expression stopwords [#86](https://github.com/makcedward/nlpaug/issues/86)
* KeyboardAug supports Thai language [#92](https://github.com/makcedward/nlpaug/pull/92)

**0.0.11 Dec 6, 2019**
* Support color noise (pink, blue, red and violet noise) in audio's NoiseAug
Expand Down Expand Up @@ -159,4 +161,7 @@ This library uses data (e.g. capturing from internet), research (e.g. following
howpublished={\url{https:/makcedward/nlpaug}},
year={2019}
}
```
```

## Contributions (Supporting Other Languages)
- [sakares](https://github.com/sakares): Add Thai support to KeyboardAug
42 changes: 35 additions & 7 deletions nlpaug/augmenter/char/char_augmenter.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import string
import re

from nlpaug.util import Method
from nlpaug import Augmenter
from nlpaug.util import WarningException, WarningName, WarningCode, WarningMessage
Expand All @@ -6,7 +9,7 @@
class CharAugmenter(Augmenter):
def __init__(self, action, name='Char_Aug', min_char=2, aug_char_min=1, aug_char_max=10, aug_char_p=0.3,
aug_word_min=1, aug_word_max=10, aug_word_p=0.3, tokenizer=None, reverse_tokenizer=None,
stopwords=None, device='cpu', verbose=0):
stopwords=None, device='cpu', verbose=0, stopwords_regex=None):
super().__init__(
name=name, method=Method.CHAR, action=action, aug_min=None, aug_max=None, device=device, verbose=verbose)
self.aug_p = None
Expand All @@ -21,6 +24,7 @@ def __init__(self, action, name='Char_Aug', min_char=2, aug_char_min=1, aug_char
self.tokenizer = tokenizer or self._tokenizer
self.reverse_tokenizer = reverse_tokenizer or self._reverse_tokenizer
self.stopwords = stopwords
self.stopwords_regex = re.compile(stopwords_regex) if stopwords_regex is not None else stopwords_regex

@classmethod
def _tokenizer(cls, text):
Expand Down Expand Up @@ -48,21 +52,45 @@ def is_duplicate(cls, dataset, data):
def skip_aug(self, token_idxes, tokens):
    # Hook for subclasses: filter the candidate token indexes that are
    # eligible for augmentation. The base implementation keeps all of them.
    return token_idxes

def pre_skip_aug(self, tokens, tuple_idx=None):
results = []
for token_idx, token in enumerate(tokens):
if tuple_idx is not None:
_token = token[tuple_idx]
else:
_token = token
# skip punctuation
if _token in string.punctuation:
continue
"""
TODO: cannot skip word that were split by tokenizer
"""
# skip stopwords by list
if self.stopwords is not None and _token in self.stopwords:
continue

# skip stopwords by regex
if self.stopwords_regex is not None and (
self.stopwords_regex.match(_token) or self.stopwords_regex.match(' '+_token+' ') or
self.stopwords_regex.match(' '+_token) or self.stopwords_regex.match(_token+' ')):
continue

results.append(token_idx)

return results

def _get_aug_idxes(self, tokens, aug_min, aug_max, aug_p, mode):
if mode == Method.CHAR:
# If word is too short, do not augment it.
if len(tokens) < self.min_char:
return None

aug_cnt = self._generate_aug_cnt(len(tokens), aug_min, aug_max, aug_p)
idxes = [i for i, t in enumerate(tokens)]
if mode == Method.WORD:
# skip stopwords
idxes = [i for i in idxes if self.stopwords is None or tokens[i] not in self.stopwords]
# skip short word
idxes = [i for i in idxes if len(tokens[i]) >= self.min_char]

if mode == Method.WORD:
idxes = self.pre_skip_aug(tokens)
elif mode == Method.CHAR:
idxes = [i for i, t in enumerate(tokens)]
idxes = self.skip_aug(idxes, tokens)

if len(idxes) == 0:
Expand Down
5 changes: 3 additions & 2 deletions nlpaug/augmenter/char/keyboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class KeyboardAug(CharAugmenter):
calculated via aup_word_p. If calculated result from aug_p is smaller than aug_max, will use calculated result
from aug_word_p. Otherwise, using aug_max.
:param list stopwords: List of words which will be skipped from augment operation.
:param str stopwords_regex: Regular expression for matching words which will be skipped from augment operation.
:param func tokenizer: Customize tokenization process
:param func reverse_tokenizer: Customize reverse of tokenization process
:param bool special_char: Include special character
Expand All @@ -38,12 +39,12 @@ class KeyboardAug(CharAugmenter):
def __init__(self, name='Keyboard_Aug', aug_char_min=1, aug_char_max=10, aug_char_p=0.3,
aug_word_p=0.3, aug_word_min=1, aug_word_max=10, stopwords=None,
tokenizer=None, reverse_tokenizer=None, special_char=True, numeric=True,
upper_case=True, lang="en", verbose=0):
upper_case=True, lang="en", verbose=0, stopwords_regex=None):
super().__init__(
action=Action.SUBSTITUTE, name=name, aug_char_min=aug_char_min, aug_char_max=aug_char_max,
aug_char_p=aug_char_p, aug_word_min=aug_word_min, aug_word_max=aug_word_max, aug_word_p=aug_word_p,
tokenizer=tokenizer, reverse_tokenizer=reverse_tokenizer, stopwords=stopwords, device='cpu',
verbose=verbose)
verbose=verbose, stopwords_regex=stopwords_regex)

# TODO: support other type of keyboard
self.keyboard_type = 'qwerty'
Expand Down
5 changes: 3 additions & 2 deletions nlpaug/augmenter/char/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class OcrAug(CharAugmenter):
calculated via aup_word_p. If calculated result from aug_p is smaller than aug_max, will use calculated result
from aug_word_p. Otherwise, using aug_max.
:param list stopwords: List of words which will be skipped from augment operation.
:param str stopwords_regex: Regular expression for matching words which will be skipped from augment operation.
:param func tokenizer: Customize tokenization process
:param func reverse_tokenizer: Customize reverse of tokenization process
:param str name: Name of this augmenter
Expand All @@ -33,12 +34,12 @@ class OcrAug(CharAugmenter):

def __init__(self, name='OCR_Aug', aug_char_min=1, aug_char_max=10, aug_char_p=0.3,
aug_word_p=0.3, aug_word_min=1, aug_word_max=10, stopwords=None,
tokenizer=None, reverse_tokenizer=None, verbose=0):
tokenizer=None, reverse_tokenizer=None, verbose=0, stopwords_regex=None):
super().__init__(
action=Action.SUBSTITUTE, name=name, aug_char_min=aug_char_min, aug_char_max=aug_char_max,
aug_char_p=aug_char_p, aug_word_min=aug_word_min, aug_word_max=aug_word_max, aug_word_p=aug_word_p,
tokenizer=tokenizer, reverse_tokenizer=reverse_tokenizer, stopwords=stopwords, device='cpu',
verbose=verbose)
verbose=verbose, stopwords_regex=stopwords_regex)

self.model = self.get_model()

Expand Down
5 changes: 3 additions & 2 deletions nlpaug/augmenter/char/random.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class RandomCharAug(CharAugmenter):
not the first and last character of word. 'random' means swap action will be executed without constraint.
:param str spec_char: Special character may be included in augmented data.
:param list stopwords: List of words which will be skipped from augment operation.
:param str stopwords_regex: Regular expression for matching words which will be skipped from augment operation.
:param func tokenizer: Customize tokenization process
:param func reverse_tokenizer: Customize reverse of tokenization process
:param str name: Name of this augmenter.
Expand All @@ -49,12 +50,12 @@ class RandomCharAug(CharAugmenter):
def __init__(self, action=Action.SUBSTITUTE, name='RandomChar_Aug', aug_char_min=1, aug_char_max=10, aug_char_p=0.3,
aug_word_p=0.3, aug_word_min=1, aug_word_max=10, include_upper_case=True, include_lower_case=True,
include_numeric=True, min_char=4, swap_mode='adjacent', spec_char='!@#$%^&*()_+', stopwords=None,
tokenizer=None, reverse_tokenizer=None, verbose=0):
tokenizer=None, reverse_tokenizer=None, verbose=0, stopwords_regex=None):
super().__init__(
action=action, name=name, min_char=min_char, aug_char_min=aug_char_min, aug_char_max=aug_char_max,
aug_char_p=aug_char_p, aug_word_min=aug_word_min, aug_word_max=aug_word_max, aug_word_p=aug_word_p,
tokenizer=tokenizer, reverse_tokenizer=reverse_tokenizer, stopwords=stopwords, device='cpu',
verbose=verbose)
verbose=verbose, stopwords_regex=stopwords_regex)

self.include_upper_case = include_upper_case
self.include_lower_case = include_lower_case
Expand Down
32 changes: 32 additions & 0 deletions test/augmenter/char/test_char.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,35 @@ def test_multi_thread(self):
for aug in augs:
augmented_data = aug.augment(text, n=n, num_thread=num_thread)
self.assertEqual(len(augmented_data), n)

def test_stopwords(self):
    text = 'The quick brown fox jumps over the lazy dog.'
    stopwords = ['The', 'brown', 'fox', 'jumps', 'the', 'dog']

    augs = [
        nac.RandomCharAug(stopwords=stopwords),
        nac.KeyboardAug(stopwords=stopwords),
        nac.OcrAug(stopwords=stopwords)
    ]

    for aug in augs:
        for _ in range(10):
            augmented_text = aug.augment(text)
            # Only non-stopwords may be augmented, so at least one of them
            # should no longer appear verbatim in the output.
            changed = [w for w in ('quick', 'over', 'lazy') if w not in augmented_text]
            self.assertTrue(len(changed) > 0)

def test_stopwords_regex(self):
    text = 'The quick brown fox jumps over the lazy dog.'
    # Pattern covers 'fox', 'dog', 'brown', 'The'/'the' and 'jumps' (some
    # alternatives expect the surrounding spaces the augmenter probes with).
    # Fixed '[a-zA-z]' -> '[a-zA-Z]': the 'A-z' span also matches the ASCII
    # characters between 'Z' and 'a' ('[', '\\', ']', '^', '_', '`').
    stopwords_regex = "( [a-zA-Z]{1}ox | [a-z]{1}og|(brown)|[a-zA-Z]{1}he)|[a-z]{2}mps "

    augs = [
        nac.RandomCharAug(action="delete", stopwords_regex=stopwords_regex),
        nac.KeyboardAug(stopwords_regex=stopwords_regex),
        nac.OcrAug(stopwords_regex=stopwords_regex)
    ]

    for aug in augs:
        for i in range(10):
            augmented_text = aug.augment(text)
            # At least one non-stopword ('quick', 'over' or 'lazy') should
            # have been augmented away.
            self.assertTrue(
                'quick' not in augmented_text or 'over' not in augmented_text
                or 'lazy' not in augmented_text)
31 changes: 25 additions & 6 deletions test/augmenter/word/test_word.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ def test_empty_input_for_swap(self):

def test_empty_input_for_delete(self):
text = ' '
# None
augs = [
naw.RandomWordAug(action="delete"),
naw.RandomWordAug(action="delete", stopwords=['a', 'an', 'the'])
Expand Down Expand Up @@ -111,20 +110,40 @@ def test_excessive_space(self):

def test_multi_thread(self):
text = 'The quick brown fox jumps over the lazy dog.'
n = 3
augs = [
naw.RandomWordAug(),
naw.WordEmbsAug(model_type='word2vec',
model_path=os.environ["MODEL_DIR"] + 'GoogleNews-vectors-negative300.bin'),
naw.ContextualWordEmbsAug(
model_path='xlnet-base-cased', action="substitute",
skip_unknown_word=True, temperature=0.7, device='cpu')
model_path='xlnet-base-cased', action="substitute", device='cpu')
]

for num_thread in [1, 3]:
for aug in augs:
augmented_data = aug.augment(text, n=n, num_thread=num_thread)
self.assertEqual(len(augmented_data), n)
augmented_data = aug.augment(text, n=num_thread, num_thread=num_thread)
if num_thread == 1:
# return string
self.assertTrue(isinstance(augmented_data, str))
else:
self.assertEqual(len(augmented_data), num_thread)

def test_stopwords(self):
    text = 'The quick brown fox jumps over the lazy dog.'
    stopwords = ['The', 'brown', 'fox', 'jumps', 'the', 'dog']

    augs = [
        naw.RandomWordAug(action="delete", stopwords=stopwords),
        naw.ContextualWordEmbsAug(stopwords=stopwords),
        naw.WordEmbsAug(model_type='word2vec',
                        model_path=os.environ["MODEL_DIR"] + 'GoogleNews-vectors-negative300.bin',
                        stopwords=stopwords)
    ]

    for aug in augs:
        for _ in range(10):
            augmented_text = aug.augment(text)
            # Augmentation is restricted to non-stopwords, so at least one
            # of them should be missing from the result.
            modified = ('quick' not in augmented_text
                        or 'over' not in augmented_text
                        or 'lazy' not in augmented_text)
            self.assertTrue(modified)

# https:/makcedward/nlpaug/issues/81
def test_stopwords_regex(self):
Expand Down

0 comments on commit ab2582e

Please sign in to comment.