Fix Issue #360: Tokenizer failed when the infix regex matched the start of the string while trying to tokenize multi-infix tokens.
honnibal committed May 9, 2016
1 parent eab2376 commit cc8bf62
Showing 2 changed files with 8 additions and 0 deletions.
6 changes: 6 additions & 0 deletions spacy/tests/tokenizer/test_infix.py
@@ -24,6 +24,12 @@ def test_ellipsis(en_tokenizer):
    tokens = en_tokenizer('best...known')
    assert len(tokens) == 3

def test_big_ellipsis(en_tokenizer):
    '''Test regression identified in Issue #360'''
    tokens = en_tokenizer(u'$45...............Asking')
    assert len(tokens) > 2


def test_email(en_tokenizer):
    tokens = en_tokenizer('[email protected]')
2 changes: 2 additions & 0 deletions spacy/tokenizer.pyx
@@ -227,6 +227,8 @@ cdef class Tokenizer:
        for match in matches:
            infix_start = match.start()
            infix_end = match.end()
            if infix_start == start:
                continue
            span = string[start:infix_start]
            tokens.push_back(self.vocab.get(tokens.mem, span), False)

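For context, here is a minimal pure-Python sketch of the kind of multi-infix splitting loop the patch touches. The regex (a hypothetical fixed-width ellipsis pattern) and the helper name split_on_infixes are assumptions for illustration only; the real implementation is Cython in spacy/tokenizer.pyx and pushes lexemes into the token array rather than building a list. The point is the new guard: when a match starts exactly at `start`, there is no preceding text to emit.

import re

# Hypothetical fixed-width infix pattern; back-to-back matches inside a
# long run of dots are what trigger the Issue #360 case.
INFIX_RE = re.compile(r'\.\.\.')

def split_on_infixes(string):
    '''Sketch of multi-infix splitting with the infix_start == start guard.'''
    tokens = []
    start = 0
    for match in INFIX_RE.finditer(string):
        infix_start, infix_end = match.start(), match.end()
        if infix_start == start:
            # The match begins exactly where the previous one ended, so
            # there is no preceding text.  Skipping avoids emitting an
            # empty span; the dots are picked up by a later iteration
            # because `start` is not advanced.
            continue
        tokens.append(string[start:infix_start])      # text before the infix
        tokens.append(string[infix_start:infix_end])  # the infix itself
        start = infix_end
    if start < len(string):
        tokens.append(string[start:])                 # trailing text
    return tokens

print(split_on_infixes(u'$45...............Asking'))
# ['$45', '...', '...', '...', '...', '...', 'Asking']

Without the guard, the iterations whose match starts at `start` would emit string[start:start], an empty span, which is what the tokenizer's push path failed on in the reported regression.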
