forked from explosion/spaCy
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix dependency copy for as_doc (explosion#3969)
* failing unit test for issue 3962 * attempt to fix Issue explosion#3962 * create artificial unit test example * using length instead of self.length * sp * reformat with black * find better ancestor within span and use generic 'dep' * attach to span.root if there is no appropriate ancestor * comment span text * clean up ancestor code * reconstruct dep tree to keep same number of sentences
- Loading branch information
Showing
3 changed files
with
157 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
# coding: utf8 | ||
from __future__ import unicode_literals | ||
|
||
import pytest | ||
|
||
from ..util import get_doc | ||
|
||
|
||
@pytest.fixture
def doc(en_tokenizer):
    """Return a parsed single-sentence Doc for the issue 3962 regression test.

    The words come from running ``en_tokenizer`` over the sentence; the heads
    and dependency labels are supplied by hand and passed to ``get_doc``.
    The head values look like offsets relative to each token's own position
    (0 for the root) -- confirm against ``get_doc`` if that matters.
    """
    # One (head, dep) pair per token. The token named in each trailing comment
    # assumes the tokenizer splits punctuation into separate tokens.
    annotation = [
        (1, "nsubj"),  # He
        (6, "ccomp"),  # jests
        (-1, "prep"),  # at
        (-1, "pobj"),  # scars
        (3, "punct"),  # ,
        (2, "nsubj"),  # that
        (1, "neg"),  # never
        (0, "ROOT"),  # felt
        (1, "det"),  # a
        (-2, "dobj"),  # wound
        (-3, "punct"),  # .
    ]
    tokenized = en_tokenizer("He jests at scars, that never felt a wound.")
    words = [token.text for token in tokenized]
    head_offsets = [head for head, _ in annotation]
    dep_labels = [dep for _, dep in annotation]
    return get_doc(tokenized.vocab, words=words, heads=head_offsets, deps=dep_labels)
|
||
|
||
def test_issue3962(doc):
    """Regression test for issue 3962: ``Span.as_doc`` must not leave heads
    pointing outside the new doc.

    A token whose head lies outside the span is expected to be re-attached
    inside the span with the generic "dep" label, or to become the new
    artificial root (its own head) when it has no suitable ancestor there.
    """
    # --- doc[1:5]: "jests at scars ," ---
    doc2 = doc[1:5].as_doc()
    assert doc2.to_json()

    # Expected (head text, dep label) for each token, in token order.
    expected2 = [
        ("jests", "dep"),  # head set to itself, being the new artificial root
        ("jests", "prep"),
        ("at", "pobj"),
        ("jests", "dep"),  # head set to the new artificial root
    ]
    for index, (head_text, dep_label) in enumerate(expected2):
        assert doc2[index].head.text == head_text
        assert doc2[index].dep_ == dep_label

    # We should still have 1 sentence
    assert len(list(doc2.sents)) == 1

    # --- doc[6:9]: "never felt a" ---
    doc3 = doc[6:9].as_doc()
    assert doc3.to_json()

    expected3 = [
        ("felt", "neg"),
        ("felt", "ROOT"),
        ("felt", "dep"),  # head set to ancestor
    ]
    for index, (head_text, dep_label) in enumerate(expected3):
        assert doc3[index].head.text == head_text
        assert doc3[index].dep_ == dep_label

    # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
    assert len(list(doc3.sents)) == 1
|
||
|
||
@pytest.fixture
def two_sent_doc(en_tokenizer):
    """Return a parsed two-sentence Doc for the issue 3962 regression test.

    The words come from running ``en_tokenizer`` over the text; the heads and
    dependency labels are supplied by hand and passed to ``get_doc``. There
    are two "ROOT" entries, one per sentence. The head values look like
    offsets relative to each token's own position -- confirm against
    ``get_doc`` if that matters.
    """
    # One (head, dep) pair per token. The token named in each trailing comment
    # assumes the tokenizer splits punctuation into separate tokens.
    annotation = [
        (1, "nsubj"),  # He
        (0, "ROOT"),  # jests
        (-1, "prep"),  # at
        (-1, "pobj"),  # scars
        (-3, "punct"),  # .
        (2, "nsubj"),  # They
        (1, "neg"),  # never
        (0, "ROOT"),  # felt
        (1, "det"),  # a
        (-2, "dobj"),  # wound
        (-3, "punct"),  # .
    ]
    tokenized = en_tokenizer("He jests at scars. They never felt a wound.")
    words = [token.text for token in tokenized]
    head_offsets = [head for head, _ in annotation]
    dep_labels = [dep for _, dep in annotation]
    return get_doc(tokenized.vocab, words=words, heads=head_offsets, deps=dep_labels)
|
||
|
||
def test_issue3962_long(two_sent_doc):
    """Regression test for issue 3962 on a span that crosses a sentence boundary.

    ``Span.as_doc`` must not leave heads pointing outside the new doc, and the
    resulting doc is expected to keep the same number of sentences as the
    span covered. Tokens whose head lies outside the span either become a new
    artificial root (their own head) or are attached to such a root, with the
    generic "dep" label.
    """
    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json

    # Sentence 1: "jests" was already the ROOT, so its subtree is unchanged.
    assert doc2[0].head.text == "jests"  # head set to itself, being the new artificial root (in sentence 1)
    assert doc2[0].dep_ == "ROOT"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"
    assert doc2[3].dep_ == "punct"

    # Sentence 2: the original head of both tokens ("felt") is outside the span.
    assert doc2[4].head.text == "They"  # head set to itself, being the new artificial root (in sentence 2)
    assert doc2[4].dep_ == "dep"
    # Check the last token ("never", index 5); the original repeated the
    # index-4 assertions here and so never verified this token.
    assert doc2[5].head.text == "They"  # head set to the new artificial head (in sentence 2)
    assert doc2[5].dep_ == "dep"

    # We should still have 2 sentences
    sents = list(doc2.sents)
    assert len(sents) == 2
    assert sents[0].text == "jests at scars ."
    assert sents[1].text == "They never"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters