Skip to content

Commit

Permalink
Fix behaviour of Matcher's ? quantifier for v2.1 (#3105)
Browse files Browse the repository at this point in the history
* Add failing test for matcher bug #3009

* Deduplicate matches from Matcher

* Update matcher ? quantifier test

* Fix bug with ? quantifier in Matcher

The ? quantifier indicates a token may occur zero or one times. If the
token pattern fit, the matcher would fail to consider valid matches
where the token pattern did not fit. Consider a simple regex like:

.?b

If we have the string 'b', the .? part will fit --- but then the 'b' in
the pattern will not fit, leaving us with no match. The same bug left us
with too few matches in some cases. For instance, consider:

.?.?

If we have a string of length two, like 'ab', we actually have three
possible matches here: [a, b, ab]. We were only recovering 'ab'. This
should now be fixed. Note that the fix also uncovered another bug, where
we weren't deduplicating the matches. There are actually two ways we
might match 'a' and two ways we might match 'b': as the second token of the pattern,
or as the first token of the pattern. This ambiguity is spurious, so we
need to deduplicate.

Closes #2464 and #3009

* Fix Python2
  • Loading branch information
honnibal authored Dec 29, 2018
1 parent e808bdd commit 174e854
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 6 deletions.
36 changes: 30 additions & 6 deletions spacy/matcher.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ cdef enum action_t:
ADVANCE = 0100
RETRY = 0010
RETRY_EXTEND = 0011
RETRY_ADVANCE = 0110
MATCH_EXTEND = 1001
MATCH_REJECT = 2000

Expand Down Expand Up @@ -89,8 +90,21 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc):
transition_states(states, matches, &doc.c[i], extra_attrs[i])
# Handle matches that end in 0-width patterns
finish_states(matches, states)
return [(matches[i].pattern_id, matches[i].start, matches[i].start+matches[i].length)
for i in range(matches.size())]
output = []
seen = set()
for i in range(matches.size()):
match = (
matches[i].pattern_id,
matches[i].start,
matches[i].start+matches[i].length
)
# We need to deduplicate, because we could otherwise arrive at the same
# match through two paths, e.g. .?.? matching 'a'. Are we matching the
# first .?, or the second .? -- it doesn't matter, it's just one match.
if match not in seen:
output.append(match)
seen.add(match)
return output


cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
Expand All @@ -114,11 +128,17 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
continue
state = states[i]
states[q] = state
while action in (RETRY, RETRY_EXTEND):
while action in (RETRY, RETRY_ADVANCE, RETRY_EXTEND):
if action == RETRY_EXTEND:
# This handles the 'extend'
new_states.push_back(
PatternStateC(pattern=state.pattern, start=state.start,
length=state.length+1))
if action == RETRY_ADVANCE:
# This handles the 'advance'
new_states.push_back(
PatternStateC(pattern=state.pattern+1, start=state.start,
length=state.length+1))
states[q].pattern += 1
action = get_action(states[q], token, extra_attrs)
if action == REJECT:
Expand Down Expand Up @@ -209,14 +229,15 @@ cdef action_t get_action(PatternStateC state, const TokenC* token, const attr_t*
No, non-final:
0010
Possible combinations: 1000, 0100, 0000, 1001, 0011, 0010,
Possible combinations: 1000, 0100, 0000, 1001, 0110, 0011, 0010,
We'll name the bits "match", "advance", "retry", "extend"
REJECT = 0000
MATCH = 1000
ADVANCE = 0100
RETRY = 0010
MATCH_EXTEND = 1001
RETRY_ADVANCE = 0110
RETRY_EXTEND = 0011
MATCH_REJECT = 2000 # Match, but don't include last token
Expand Down Expand Up @@ -259,8 +280,11 @@ cdef action_t get_action(PatternStateC state, const TokenC* token, const attr_t*
# Yes, final: 1000
return MATCH
elif is_match and not is_final:
# Yes, non-final: 0100
return ADVANCE
# Yes, non-final: 0110
# We need both branches here, consider a pair like:
# pattern: .?b string: b
# If we 'ADVANCE' on the .?, we miss the match.
return RETRY_ADVANCE
elif not is_match and is_final:
# No, final 2000 (note: Don't include last token!)
return MATCH_REJECT
Expand Down
66 changes: 66 additions & 0 deletions spacy/tests/regression/test_issue3009.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest
from spacy.matcher import Matcher
from spacy.tokens import Doc


PATTERNS = [
("1", [[{"LEMMA": "have"}, {"LOWER": "to"}, {"LOWER": "do"}, {"POS": "ADP"}]]),
(
"2",
[
[
{"LEMMA": "have"},
{"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"},
{"LOWER": "to"},
{"LOWER": "do"},
{"POS": "ADP"},
]
],
),
(
"3",
[
[
{"LEMMA": "have"},
{"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"},
{"LOWER": "to"},
{"LOWER": "do"},
{"POS": "ADP"},
]
],
),
]


@pytest.fixture
def doc(en_tokenizer):
doc = en_tokenizer("also has to do with")
doc[0].tag_ = "RB"
doc[1].tag_ = "VBZ"
doc[2].tag_ = "TO"
doc[3].tag_ = "VB"
doc[4].tag_ = "IN"
return doc


@pytest.fixture
def matcher(en_tokenizer):
return Matcher(en_tokenizer.vocab)


@pytest.mark.parametrize("pattern", PATTERNS)
def test_issue3009(doc, matcher, pattern):
"""Test problem with matcher quantifiers"""
matcher.add(pattern[0], None, *pattern[1])
matches = matcher(doc)
assert matches

def test_issue2464(matcher):
"""Test problem with successive ?. This is the same bug, so putting it here."""
doc = Doc(matcher.vocab, words=['a', 'b'])
matcher.add('4', None, [{'OP': '?'}, {'OP': '?'}])
matches = matcher(doc)
assert len(matches) == 3

0 comments on commit 174e854

Please sign in to comment.