Implement Character Repetition Correction [resolves #268] #277

Open · wants to merge 6 commits into base: develop
3 changes: 2 additions & 1 deletion sadedegel/bblock/doc.py
@@ -604,7 +604,8 @@ def __init__(self, **kwargs):

self.tokenizer = WordTokenizer.factory(tokenizer_str, emoji=self.config['tokenizer'].getboolean('emoji'),
hashtag=self.config['tokenizer'].getboolean('hashtag'),
mention=self.config['tokenizer'].getboolean('mention'))
mention=self.config['tokenizer'].getboolean('mention'),
correct_repeats=self.config['tokenizer'].getboolean('correct_repeats'))

Token.set_vocabulary(self.tokenizer.vocabulary)

5 changes: 5 additions & 0 deletions sadedegel/bblock/util.py
@@ -7,6 +7,7 @@

import numpy as np
from rich.console import Console
import re

from ..about import __version__

@@ -29,6 +30,10 @@ def tr_lower(s: str) -> str:
def tr_upper(s: str) -> str:
return s.replace("i", "İ").upper()

def repetition_correct(s: str) -> str:
    # Collapse any run of three or more identical characters down to a single character.
    pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
    return pattern.sub(r"\1", s)


def space_pad(token):
return " " + token + " "
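For context: the pattern `(.)\1{2,}` matches any character followed by two or more copies of itself, and the substitution keeps only the captured character, so runs of three or more identical characters collapse to one while legitimate double letters survive. A minimal behavioral sketch (not part of the PR; import path assumed from the file above):

```python
# Minimal sketch, assuming the helper ships in sadedegel/bblock/util.py as added above.
from sadedegel.bblock.util import repetition_correct

assert repetition_correct("geliyooooooor") == "geliyor"  # 3+ repeats collapse to one
assert repetition_correct("alemiiiin") == "alemin"
assert repetition_correct("elli") == "elli"              # double letters are preserved
```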
31 changes: 18 additions & 13 deletions sadedegel/bblock/word_tokenizer.py
@@ -11,7 +11,7 @@
from rich.console import Console

from sadedegel.bblock.word_tokenizer_helper import ICUTokenizerHelper
from .util import normalize_tokenizer_name
from .util import normalize_tokenizer_name, repetition_correct
from .vocabulary import Vocabulary
from .token import Token
from .word_tokenizer_helper import word_tokenize
@@ -38,7 +38,7 @@ class TokenSpan:
class WordTokenizer(ABC):
__instances = {}

def __init__(self, mention=False, hashtag=False, emoji=False):
def __init__(self, mention=False, hashtag=False, emoji=False, correct_repeats=False):
"""

@param mention: Handle mention in tweet texts.
@@ -49,9 +49,11 @@ def __init__(self, mention=False, hashtag=False, emoji=False):
self.mention = mention
self.hashtag = hashtag
self.emoji = emoji
self.correct_repeats = correct_repeats

self.regexes = []


if self.hashtag:
console.print("Handling hashtags")
self.regexes.append(re.compile(r"(?P<hashtag>#\S+)"))
@@ -80,6 +82,9 @@ def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
def __call__(self, sentence: str) -> List[Token]:
text = str(sentence)

if self.correct_repeats:
text = repetition_correct(text)

if len(self.regexes) == 0:
return [Token(t) for t in self._tokenize(text)]
else:
@@ -130,20 +135,20 @@ def __call__(self, sentence: str) -> List[Token]:
return tokens

@staticmethod
def factory(tokenizer_name: str, mention=False, hashtag=False, emoji=False):
console.log(f"mention={mention}, hashtag={hashtag}, emoji={emoji}")
def factory(tokenizer_name: str, mention=False, hashtag=False, emoji=False, correct_repeats=False):
console.log(f"mention={mention}, hashtag={hashtag}, emoji={emoji}, correct_repeats={correct_repeats}")
normalized_name = normalize_tokenizer_name(tokenizer_name)
if normalized_name not in WordTokenizer.__instances:
if normalized_name == "bert":
return BertTokenizer(mention, hashtag, emoji)
return BertTokenizer(mention, hashtag, emoji, correct_repeats)
elif normalized_name == "simple":
warnings.warn(
("Note that SimpleTokenizer is pretty new in sadedeGel. "
"If you experience any problems, open up a issue "
"(https:/GlobalMaksimum/sadedegel/issues/new)"))
return SimpleTokenizer(mention, hashtag, emoji)
return SimpleTokenizer(mention, hashtag, emoji, correct_repeats)
elif normalized_name == "icu":
return ICUTokenizer(mention, hashtag, emoji)
return ICUTokenizer(mention, hashtag, emoji, correct_repeats)
else:
raise Exception(
(f"No word tokenizer type match with name {tokenizer_name}."
@@ -158,8 +163,8 @@ class BertTokenizer(WordTokenizer):
def convert_tokens_to_ids(self, tokens: List[Token]) -> List[int]:
return self.tokenizer.convert_tokens_to_ids([t.word for t in tokens])

def __init__(self, mention=False, hashtag=False, emoji=False):
super(BertTokenizer, self).__init__(mention, hashtag, emoji)
def __init__(self, mention=False, hashtag=False, emoji=False, correct_repeats=False):
super(BertTokenizer, self).__init__(mention, hashtag, emoji, correct_repeats)

self.tokenizer = None

@@ -190,8 +195,8 @@ def vocabulary(self):
class SimpleTokenizer(WordTokenizer):
__name__ = "SimpleTokenizer"

def __init__(self, mention=False, hashtag=False, emoji=False):
super(SimpleTokenizer, self).__init__(mention, hashtag, emoji)
def __init__(self, mention=False, hashtag=False, emoji=False, correct_repeats=False):
super(SimpleTokenizer, self).__init__(mention, hashtag, emoji, correct_repeats)
self.tokenizer = word_tokenize

def _tokenize(self, text: str) -> List[str]:
@@ -213,8 +218,8 @@ def vocabulary(self):
class ICUTokenizer(WordTokenizer):
__name__ = "ICUTokenizer"

def __init__(self, mention=False, hashtag=False, emoji=False):
super(ICUTokenizer, self).__init__(mention, hashtag, emoji)
def __init__(self, mention=False, hashtag=False, emoji=False, correct_repeats=False):
super(ICUTokenizer, self).__init__(mention, hashtag, emoji, correct_repeats)
self.tokenizer = ICUTokenizerHelper()

def _tokenize(self, text: str) -> List[str]:
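The new `correct_repeats` keyword flows from `WordTokenizer.factory` into each concrete tokenizer and, when enabled, runs `repetition_correct` on the text before any regex handling or tokenization. A short usage sketch, assuming the ICU backend is installed (import path taken from tests/context.py below; the `.word` attribute is as used in BertTokenizer.convert_tokens_to_ids):

```python
# Sketch only: mirrors the factory signature added in this file.
from sadedegel.bblock import WordTokenizer

tokenizer = WordTokenizer.factory("icu", correct_repeats=True)
tokens = tokenizer("alemiiiin kralı geliyooor")
print([t.word for t in tokens])  # repeated characters collapsed before tokenization
```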
1 change: 1 addition & 0 deletions sadedegel/default.ini
@@ -8,6 +8,7 @@ drop_punct = false
hashtag = false
mention = false
emoji = false
correct_repeats = false

[bert]
avg_document_length = 42.37
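The ini default keeps the feature off; `Doc.__init__` reads it with `getboolean('correct_repeats')` (see the doc.py hunk above), so it can also be switched on per builder instance. A hedged sketch of such an override, assuming `DocBuilder` is exposed from `sadedegel.bblock.doc` and accepts the `tokenizer__<option>` keywords used in the sklearn extension below:

```python
# Hypothetical per-instance override of the new default (correct_repeats = false).
from sadedegel.bblock.doc import DocBuilder

Doc = DocBuilder(tokenizer="icu", tokenizer__correct_repeats=True)
doc = Doc("alemiiiin kralı geliyooor")
print(doc.tokens)  # tokens with character repetitions collapsed
```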
10 changes: 6 additions & 4 deletions sadedegel/extension/sklearn.py
@@ -40,24 +40,26 @@ def partial_fit(self, X, y=None, **kwargs):
class Text2Doc(BaseEstimator, TransformerMixin):
Doc = None

def __init__(self, tokenizer="icu", hashtag=False, mention=False, emoji=False, progress_tracking=True):
def __init__(self, tokenizer="icu", hashtag=False, mention=False, emoji=False, correct_repeats=False, progress_tracking=True):
self.tokenizer = tokenizer
self.hashtag = hashtag
self.mention = mention
self.emoji = emoji
self.correct_repeats = correct_repeats
self.progress_tracking = progress_tracking
# TODO: Add sadedegel version

self.init()

def init(self):
if Text2Doc.Doc is None:
if hasattr(self, 'hashtag') and hasattr(self, 'mention') and hasattr(self, 'emoji'):
if hasattr(self, 'hashtag') and hasattr(self, 'mention') and hasattr(self, 'emoji') and hasattr(self, 'correct_repeats'):
Text2Doc.Doc = DocBuilder(tokenizer=self.tokenizer, tokenizer__hashtag=self.hashtag,
tokenizer__mention=self.mention, tokenizer__emoji=self.emoji)
tokenizer__mention=self.mention, tokenizer__emoji=self.emoji,
tokenizer__correct_repeats=self.correct_repeats)
else:
Text2Doc.Doc = DocBuilder(tokenizer=self.tokenizer, tokenizer__hashtag=False,
tokenizer__mention=False, tokenizer__emoji=False)
tokenizer__mention=False, tokenizer__emoji=False, tokenizer__correct_repeats=False)

def fit(self, X, y=None):
return self
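Because `correct_repeats` is stored under the same attribute name as its `__init__` parameter, scikit-learn's parameter machinery picks it up automatically. A small sketch (standard `get_params`/`clone` behavior assumed, not shown in the diff):

```python
from sklearn.base import clone
from sadedegel.extension.sklearn import Text2Doc

t2d = Text2Doc(tokenizer="icu", correct_repeats=True)
print(t2d.get_params()["correct_repeats"])         # True
print(clone(t2d).get_params()["correct_repeats"])  # flag survives cloning / grid search
```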
1 change: 1 addition & 0 deletions tests/context.py
@@ -7,6 +7,7 @@
from sadedegel.summarize import RandomSummarizer, PositionSummarizer, LengthSummarizer, BandSummarizer, Rouge1Summarizer, LexRankSummarizer # noqa # pylint: disable=unused-import, wrong-import-position, line-too-long
from sadedegel.tokenize import NLTKPunctTokenizer, RegexpSentenceTokenizer # noqa # pylint: disable=unused-import, wrong-import-position
from sadedegel.bblock import Doc, Sentences, BertTokenizer, SimpleTokenizer, WordTokenizer, ICUTokenizer # noqa # pylint: disable=unused-import, wrong-import-position
from sadedegel.extension.sklearn import Text2Doc # noqa # pylint: disable=unused-import, wrong-import-position
from sadedegel import Token # noqa # pylint: disable=unused-import, wrong-import-position
from sadedegel.bblock.util import tr_upper, tr_lower, __tr_lower__, __tr_upper__ # noqa # pylint: disable=unused-import, wrong-import-position
from sadedegel.bblock.util import flatten, is_eos # noqa # pylint: disable=unused-import, wrong-import-position
25 changes: 25 additions & 0 deletions tests/test_repetition.py
@@ -0,0 +1,25 @@
import pytest

from .context import SimpleTokenizer, BertTokenizer, ICUTokenizer, Text2Doc


@pytest.mark.parametrize('toker, text, tokens_true', [
(ICUTokenizer, 'alemiiiin kralı geliyooor geliyooooooor', ['alemin', 'kralı', 'geliyor', 'geliyor']),
(SimpleTokenizer, 'alemiiiin kralı geliyooor geliyooooooor', ['alemin', 'kralı', 'geliyor', 'geliyor']),
(BertTokenizer, 'alemiiiin kralı geliyooor geliyooooooor', ['alem', '##in', 'kralı', 'geliyor', 'geliyor'])
])
def test_tokenizer_repeat(text, tokens_true, toker):
tokenizer = toker(correct_repeats=True)
tokens_pred = tokenizer(text)
assert tokens_pred == tokens_true


@pytest.mark.parametrize('toker, text, tokens_true', [
('icu', ['alemiiiin kralı geliyooor geliyooooooor'], ['alemin', 'kralı', 'geliyor', 'geliyor']),
('simple', ['alemiiiin kralı geliyooor geliyooooooor'], ['alemin', 'kralı', 'geliyor', 'geliyor']),
('bert', ['alemiiiin kralı geliyooor geliyooooooor'], ['alem', '##in', 'kralı', 'geliyor', 'geliyor'])
])
def test_t2d_repeat(text, tokens_true, toker):
tokenizer = Text2Doc(tokenizer=toker, correct_repeats=True)
tokens_pred = tokenizer.transform(text)
assert tokens_pred[0].tokens == tokens_true