Skip to content

Commit

Permalink
Fix #86
Browse files Browse the repository at this point in the history
  • Loading branch information
makcedward committed Jan 24, 2020
1 parent b3da60b commit ab2582e
Show file tree
Hide file tree
Showing 7 changed files with 108 additions and 21 deletions.
9 changes: 7 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,9 @@ pip install librosa>=0.7.1
* Fix missing library dependency [#74](https:/makcedward/nlpaug/issues/74)
* Fix single token error when using RandomWordAug [#76](https:/makcedward/nlpaug/issues/76)
* Fix replacing character in RandomCharAug error [#77](https:/makcedward/nlpaug/issues/77)
* Enhanced word's augmenter to provide regular expression for stopwords [#81](https:/makcedward/nlpaug/issues/81)
* Enhance word's augmenter to support regular expression stopwords [#81](https://github.com/makcedward/nlpaug/issues/81)
* Enhance char's augmenter to support regular expression stopwords [#86](https://github.com/makcedward/nlpaug/issues/86)
* KeyboardAug supports Thai language [#92](https://github.com/makcedward/nlpaug/pull/92)

**0.0.11 Dec 6, 2019**
* Support color noise (pink, blue, red and violet noise) in audio's NoiseAug
Expand Down Expand Up @@ -159,4 +161,7 @@ This library uses data (e.g. capturing from internet), research (e.g. following
howpublished={\url{https:/makcedward/nlpaug}},
year={2019}
}
```
```

## Contributions (Supporting Other Languages)
- [sakares](https://github.com/sakares): Add Thai support to KeyboardAug
42 changes: 35 additions & 7 deletions nlpaug/augmenter/char/char_augmenter.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import string
import re

from nlpaug.util import Method
from nlpaug import Augmenter
from nlpaug.util import WarningException, WarningName, WarningCode, WarningMessage
Expand All @@ -6,7 +9,7 @@
class CharAugmenter(Augmenter):
def __init__(self, action, name='Char_Aug', min_char=2, aug_char_min=1, aug_char_max=10, aug_char_p=0.3,
aug_word_min=1, aug_word_max=10, aug_word_p=0.3, tokenizer=None, reverse_tokenizer=None,
stopwords=None, device='cpu', verbose=0):
stopwords=None, device='cpu', verbose=0, stopwords_regex=None):
super().__init__(
name=name, method=Method.CHAR, action=action, aug_min=None, aug_max=None, device=device, verbose=verbose)
self.aug_p = None
Expand All @@ -21,6 +24,7 @@ def __init__(self, action, name='Char_Aug', min_char=2, aug_char_min=1, aug_char
self.tokenizer = tokenizer or self._tokenizer
self.reverse_tokenizer = reverse_tokenizer or self._reverse_tokenizer
self.stopwords = stopwords
self.stopwords_regex = re.compile(stopwords_regex) if stopwords_regex is not None else stopwords_regex

@classmethod
def _tokenizer(cls, text):
Expand Down Expand Up @@ -48,21 +52,45 @@ def is_duplicate(cls, dataset, data):
def skip_aug(self, token_idxes, tokens):
    # Hook for subclasses: filter the candidate token indexes that are
    # eligible for augmentation. The base implementation keeps all of them.
    return token_idxes

def pre_skip_aug(self, tokens, tuple_idx=None):
results = []
for token_idx, token in enumerate(tokens):
if tuple_idx is not None:
_token = token[tuple_idx]
else:
_token = token
# skip punctuation
if _token in string.punctuation:
continue
"""
TODO: cannot skip word that were split by tokenizer
"""
# skip stopwords by list
if self.stopwords is not None and _token in self.stopwords:
continue

# skip stopwords by regex
if self.stopwords_regex is not None and (
self.stopwords_regex.match(_token) or self.stopwords_regex.match(' '+_token+' ') or
self.stopwords_regex.match(' '+_token) or self.stopwords_regex.match(_token+' ')):
continue

results.append(token_idx)

return results

def _get_aug_idxes(self, tokens, aug_min, aug_max, aug_p, mode):
if mode == Method.CHAR:
# If word is too short, do not augment it.
if len(tokens) < self.min_char:
return None

aug_cnt = self._generate_aug_cnt(len(tokens), aug_min, aug_max, aug_p)
idxes = [i for i, t in enumerate(tokens)]
if mode == Method.WORD:
# skip stopwords
idxes = [i for i in idxes if self.stopwords is None or tokens[i] not in self.stopwords]
# skip short word
idxes = [i for i in idxes if len(tokens[i]) >= self.min_char]

if mode == Method.WORD:
idxes = self.pre_skip_aug(tokens)
elif mode == Method.CHAR:
idxes = [i for i, t in enumerate(tokens)]
idxes = self.skip_aug(idxes, tokens)

if len(idxes) == 0:
Expand Down
5 changes: 3 additions & 2 deletions nlpaug/augmenter/char/keyboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class KeyboardAug(CharAugmenter):
calculated via aup_word_p. If calculated result from aug_p is smaller than aug_max, will use calculated result
from aug_word_p. Otherwise, using aug_max.
:param list stopwords: List of words which will be skipped from augment operation.
:param str stopwords_regex: Regular expression for matching words which will be skipped from augment operation.
:param func tokenizer: Customize tokenization process
:param func reverse_tokenizer: Customize reverse of tokenization process
:param bool special_char: Include special character
Expand All @@ -38,12 +39,12 @@ class KeyboardAug(CharAugmenter):
def __init__(self, name='Keyboard_Aug', aug_char_min=1, aug_char_max=10, aug_char_p=0.3,
aug_word_p=0.3, aug_word_min=1, aug_word_max=10, stopwords=None,
tokenizer=None, reverse_tokenizer=None, special_char=True, numeric=True,
upper_case=True, lang="en", verbose=0):
upper_case=True, lang="en", verbose=0, stopwords_regex=None):
super().__init__(
action=Action.SUBSTITUTE, name=name, aug_char_min=aug_char_min, aug_char_max=aug_char_max,
aug_char_p=aug_char_p, aug_word_min=aug_word_min, aug_word_max=aug_word_max, aug_word_p=aug_word_p,
tokenizer=tokenizer, reverse_tokenizer=reverse_tokenizer, stopwords=stopwords, device='cpu',
verbose=verbose)
verbose=verbose, stopwords_regex=stopwords_regex)

# TODO: support other type of keyboard
self.keyboard_type = 'qwerty'
Expand Down
5 changes: 3 additions & 2 deletions nlpaug/augmenter/char/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class OcrAug(CharAugmenter):
calculated via aup_word_p. If calculated result from aug_p is smaller than aug_max, will use calculated result
from aug_word_p. Otherwise, using aug_max.
:param list stopwords: List of words which will be skipped from augment operation.
:param str stopwords_regex: Regular expression for matching words which will be skipped from augment operation.
:param func tokenizer: Customize tokenization process
:param func reverse_tokenizer: Customize reverse of tokenization process
:param str name: Name of this augmenter
Expand All @@ -33,12 +34,12 @@ class OcrAug(CharAugmenter):

def __init__(self, name='OCR_Aug', aug_char_min=1, aug_char_max=10, aug_char_p=0.3,
aug_word_p=0.3, aug_word_min=1, aug_word_max=10, stopwords=None,
tokenizer=None, reverse_tokenizer=None, verbose=0):
tokenizer=None, reverse_tokenizer=None, verbose=0, stopwords_regex=None):
super().__init__(
action=Action.SUBSTITUTE, name=name, aug_char_min=aug_char_min, aug_char_max=aug_char_max,
aug_char_p=aug_char_p, aug_word_min=aug_word_min, aug_word_max=aug_word_max, aug_word_p=aug_word_p,
tokenizer=tokenizer, reverse_tokenizer=reverse_tokenizer, stopwords=stopwords, device='cpu',
verbose=verbose)
verbose=verbose, stopwords_regex=stopwords_regex)

self.model = self.get_model()

Expand Down
5 changes: 3 additions & 2 deletions nlpaug/augmenter/char/random.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class RandomCharAug(CharAugmenter):
not the first and last character of word. 'random' means swap action will be executed without constraint.
:param str spec_char: Special character may be included in augmented data.
:param list stopwords: List of words which will be skipped from augment operation.
:param str stopwords_regex: Regular expression for matching words which will be skipped from augment operation.
:param func tokenizer: Customize tokenization process
:param func reverse_tokenizer: Customize reverse of tokenization process
:param str name: Name of this augmenter.
Expand All @@ -49,12 +50,12 @@ class RandomCharAug(CharAugmenter):
def __init__(self, action=Action.SUBSTITUTE, name='RandomChar_Aug', aug_char_min=1, aug_char_max=10, aug_char_p=0.3,
aug_word_p=0.3, aug_word_min=1, aug_word_max=10, include_upper_case=True, include_lower_case=True,
include_numeric=True, min_char=4, swap_mode='adjacent', spec_char='!@#$%^&*()_+', stopwords=None,
tokenizer=None, reverse_tokenizer=None, verbose=0):
tokenizer=None, reverse_tokenizer=None, verbose=0, stopwords_regex=None):
super().__init__(
action=action, name=name, min_char=min_char, aug_char_min=aug_char_min, aug_char_max=aug_char_max,
aug_char_p=aug_char_p, aug_word_min=aug_word_min, aug_word_max=aug_word_max, aug_word_p=aug_word_p,
tokenizer=tokenizer, reverse_tokenizer=reverse_tokenizer, stopwords=stopwords, device='cpu',
verbose=verbose)
verbose=verbose, stopwords_regex=stopwords_regex)

self.include_upper_case = include_upper_case
self.include_lower_case = include_lower_case
Expand Down
32 changes: 32 additions & 0 deletions test/augmenter/char/test_char.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,35 @@ def test_multi_thread(self):
for aug in augs:
augmented_data = aug.augment(text, n=n, num_thread=num_thread)
self.assertEqual(len(augmented_data), n)

def test_stopwords(self):
    text = 'The quick brown fox jumps over the lazy dog.'
    stopwords = ['The', 'brown', 'fox', 'jumps', 'the', 'dog']

    augs = [
        nac.RandomCharAug(stopwords=stopwords),
        nac.KeyboardAug(stopwords=stopwords),
        nac.OcrAug(stopwords=stopwords)
    ]

    for aug in augs:
        for _ in range(10):
            augmented_text = aug.augment(text)
            # Only non-stopwords may be augmented, so at least one of them
            # should no longer appear verbatim in the output.
            changed = [w for w in ('quick', 'over', 'lazy') if w not in augmented_text]
            self.assertTrue(len(changed) > 0)

def test_stopwords_regex(self):
    text = 'The quick brown fox jumps over the lazy dog.'
    # Pattern covers 'fox', 'dog', 'brown', 'The'/'the' and 'jumps' (some
    # alternatives expect the surrounding spaces the augmenter probes with).
    # Fixed '[a-zA-z]' -> '[a-zA-Z]': the 'A-z' span also matches the ASCII
    # characters between 'Z' and 'a' ('[', '\\', ']', '^', '_', '`').
    stopwords_regex = "( [a-zA-Z]{1}ox | [a-z]{1}og|(brown)|[a-zA-Z]{1}he)|[a-z]{2}mps "

    augs = [
        nac.RandomCharAug(action="delete", stopwords_regex=stopwords_regex),
        nac.KeyboardAug(stopwords_regex=stopwords_regex),
        nac.OcrAug(stopwords_regex=stopwords_regex)
    ]

    for aug in augs:
        for i in range(10):
            augmented_text = aug.augment(text)
            # At least one non-stopword ('quick', 'over' or 'lazy') should
            # have been augmented away.
            self.assertTrue(
                'quick' not in augmented_text or 'over' not in augmented_text
                or 'lazy' not in augmented_text)
31 changes: 25 additions & 6 deletions test/augmenter/word/test_word.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ def test_empty_input_for_swap(self):

def test_empty_input_for_delete(self):
text = ' '
# None
augs = [
naw.RandomWordAug(action="delete"),
naw.RandomWordAug(action="delete", stopwords=['a', 'an', 'the'])
Expand Down Expand Up @@ -111,20 +110,40 @@ def test_excessive_space(self):

def test_multi_thread(self):
text = 'The quick brown fox jumps over the lazy dog.'
n = 3
augs = [
naw.RandomWordAug(),
naw.WordEmbsAug(model_type='word2vec',
model_path=os.environ["MODEL_DIR"] + 'GoogleNews-vectors-negative300.bin'),
naw.ContextualWordEmbsAug(
model_path='xlnet-base-cased', action="substitute",
skip_unknown_word=True, temperature=0.7, device='cpu')
model_path='xlnet-base-cased', action="substitute", device='cpu')
]

for num_thread in [1, 3]:
for aug in augs:
augmented_data = aug.augment(text, n=n, num_thread=num_thread)
self.assertEqual(len(augmented_data), n)
augmented_data = aug.augment(text, n=num_thread, num_thread=num_thread)
if num_thread == 1:
# return string
self.assertTrue(isinstance(augmented_data, str))
else:
self.assertEqual(len(augmented_data), num_thread)

def test_stopwords(self):
    text = 'The quick brown fox jumps over the lazy dog.'
    stopwords = ['The', 'brown', 'fox', 'jumps', 'the', 'dog']

    augs = [
        naw.RandomWordAug(action="delete", stopwords=stopwords),
        naw.ContextualWordEmbsAug(stopwords=stopwords),
        naw.WordEmbsAug(model_type='word2vec',
                        model_path=os.environ["MODEL_DIR"] + 'GoogleNews-vectors-negative300.bin',
                        stopwords=stopwords)
    ]

    for aug in augs:
        for _ in range(10):
            augmented_text = aug.augment(text)
            # Augmentation is restricted to non-stopwords, so at least one
            # of them should be missing from the result.
            modified = ('quick' not in augmented_text
                        or 'over' not in augmented_text
                        or 'lazy' not in augmented_text)
            self.assertTrue(modified)

# https:/makcedward/nlpaug/issues/81
def test_stopwords_regex(self):
Expand Down

0 comments on commit ab2582e

Please sign in to comment.