Fix Issue #360: Tokenizer failed when the infix regex matched the start of the string while trying to tokenize multi-infix tokens.
honnibal committed May 9, 2016
1 parent eab2376 commit cc8bf62
Showing 2 changed files with 8 additions and 0 deletions.
6 changes: 6 additions & 0 deletions spacy/tests/tokenizer/test_infix.py
@@ -24,6 +24,12 @@ def test_ellipsis(en_tokenizer):
    tokens = en_tokenizer('best...known')
    assert len(tokens) == 3

def test_big_ellipsis(en_tokenizer):
    '''Test regression identified in Issue #360'''
    tokens = en_tokenizer(u'$45...............Asking')
    assert len(tokens) > 2


def test_email(en_tokenizer):
    tokens = en_tokenizer('[email protected]')
2 changes: 2 additions & 0 deletions spacy/tokenizer.pyx
@@ -227,6 +227,8 @@ cdef class Tokenizer:
        for match in matches:
            infix_start = match.start()
            infix_end = match.end()
            if infix_start == start:
                continue
            span = string[start:infix_start]
            tokens.push_back(self.vocab.get(tokens.mem, span), False)

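For context, here is a minimal pure-Python sketch of the kind of multi-infix splitting loop the patch touches. The regex (a hypothetical fixed-width ellipsis pattern) and the helper name split_on_infixes are assumptions for illustration only; the real implementation is Cython in spacy/tokenizer.pyx and pushes lexemes into the token array rather than building a list. The point is the new guard: when a match starts exactly at `start`, there is no preceding text to emit.

import re

# Hypothetical fixed-width infix pattern; back-to-back matches inside a
# long run of dots are what trigger the Issue #360 case.
INFIX_RE = re.compile(r'\.\.\.')

def split_on_infixes(string):
    '''Sketch of multi-infix splitting with the infix_start == start guard.'''
    tokens = []
    start = 0
    for match in INFIX_RE.finditer(string):
        infix_start, infix_end = match.start(), match.end()
        if infix_start == start:
            # The match begins exactly where the previous one ended, so
            # there is no preceding text.  Skipping avoids emitting an
            # empty span; the dots are picked up by a later iteration
            # because `start` is not advanced.
            continue
        tokens.append(string[start:infix_start])      # text before the infix
        tokens.append(string[infix_start:infix_end])  # the infix itself
        start = infix_end
    if start < len(string):
        tokens.append(string[start:])                 # trailing text
    return tokens

print(split_on_infixes(u'$45...............Asking'))
# ['$45', '...', '...', '...', '...', '...', 'Asking']

Without the guard, the iterations whose match starts at `start` would emit string[start:start], an empty span, which is what the tokenizer's push path failed on in the reported regression.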
