Fix dependency copy for as_doc (explosion#3969)

* failing unit test for issue 3962 * attempt to fix Issue explosion#3962 * create artificial unit test example * using length instead of self.length * sp * reformat with black * find better ancestor within span and use generic 'dep' * attach to span.root if there is no appropriate ancestor * comment span text * clean up ancestor code * reconstruct dep tree to keep same number of sentences
polm · Aug 18, 2019 · c1a3be7 · c1a3be7
1 parent 9d682dc
commit c1a3be7
Show file tree

Hide file tree

Showing 3 changed files with 157 additions and 5 deletions.
diff --git a/spacy/tests/regression/test_issue3962.py b/spacy/tests/regression/test_issue3962.py
@@ -0,0 +1,112 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+from ..util import get_doc
+
+
+@pytest.fixture
+def doc(en_tokenizer):
+ text = "He jests at scars, that never felt a wound."
+ heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
+ deps = [
+ "nsubj",
+ "ccomp",
+ "prep",
+ "pobj",
+ "punct",
+ "nsubj",
+ "neg",
+ "ROOT",
+ "det",
+ "dobj",
+ "punct",
+ ]
+ tokens = en_tokenizer(text)
+ return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+
+
+def test_issue3962(doc):
+ """ Ensure that as_doc does not result in out-of-bound access of tokens.
+ This is achieved by setting the head to itself if it would lie out of the span otherwise."""
+ span2 = doc[1:5] # "jests at scars ,"
+ doc2 = span2.as_doc()
+ doc2_json = doc2.to_json()
+ assert doc2_json
+
+ assert doc2[0].head.text == "jests" # head set to itself, being the new artificial root
+ assert doc2[0].dep_ == "dep"
+ assert doc2[1].head.text == "jests"
+ assert doc2[1].dep_ == "prep"
+ assert doc2[2].head.text == "at"
+ assert doc2[2].dep_ == "pobj"
+ assert doc2[3].head.text == "jests" # head set to the new artificial root
+ assert doc2[3].dep_ == "dep"
+
+ # We should still have 1 sentence
+ assert len(list(doc2.sents)) == 1
+
+ span3 = doc[6:9] # "never felt a"
+ doc3 = span3.as_doc()
+ doc3_json = doc3.to_json()
+ assert doc3_json
+
+ assert doc3[0].head.text == "felt"
+ assert doc3[0].dep_ == "neg"
+ assert doc3[1].head.text == "felt"
+ assert doc3[1].dep_ == "ROOT"
+ assert doc3[2].head.text == "felt" # head set to ancestor
+ assert doc3[2].dep_ == "dep"
+
+ # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
+ assert len(list(doc3.sents)) == 1
+
+
+@pytest.fixture
+def two_sent_doc(en_tokenizer):
+ text = "He jests at scars. They never felt a wound."
+ heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
+ deps = [
+ "nsubj",
+ "ROOT",
+ "prep",
+ "pobj",
+ "punct",
+ "nsubj",
+ "neg",
+ "ROOT",
+ "det",
+ "dobj",
+ "punct",
+ ]
+ tokens = en_tokenizer(text)
+ return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+
+
+def test_issue3962_long(two_sent_doc):
+ """ Ensure that as_doc does not result in out-of-bound access of tokens.
+ This is achieved by setting the head to itself if it would lie out of the span otherwise."""
+ span2 = two_sent_doc[1:7] # "jests at scars. They never"
+ doc2 = span2.as_doc()
+ doc2_json = doc2.to_json()
+ assert doc2_json
+
+ assert doc2[0].head.text == "jests" # head set to itself, being the new artificial root (in sentence 1)
+ assert doc2[0].dep_ == "ROOT"
+ assert doc2[1].head.text == "jests"
+ assert doc2[1].dep_ == "prep"
+ assert doc2[2].head.text == "at"
+ assert doc2[2].dep_ == "pobj"
+ assert doc2[3].head.text == "jests"
+ assert doc2[3].dep_ == "punct"
+ assert doc2[4].head.text == "They" # head set to itself, being the new artificial root (in sentence 2)
+ assert doc2[4].dep_ == "dep"
+ assert doc2[4].head.text == "They" # head set to the new artificial head (in sentence 2)
+ assert doc2[4].dep_ == "dep"
+
+ # We should still have 2 sentences
+ sents = list(doc2.sents)
+ assert len(sents) == 2
+ assert sents[0].text == "jests at scars ."
+ assert sents[1].text == "They never"
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
@@ -794,7 +794,7 @@ cdef class Doc:
  if array[i, col] != 0:
  self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
  # Now load the data
- for i in range(self.length):
+ for i in range(length):
  token = &self.c[i]
  for j in range(n_attrs):
  if attr_ids[j] != TAG:
@@ -804,7 +804,7 @@ cdef class Doc:
  self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)
  # If document is parsed, set children
  if self.is_parsed:
- set_children_from_heads(self.c, self.length)
+ set_children_from_heads(self.c, length)
  return self
 
  def get_lca_matrix(self):

diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
@@ -17,6 +17,7 @@ from ..attrs cimport attr_id_t
 from ..parts_of_speech cimport univ_pos_t
 from ..attrs cimport *
 from ..lexeme cimport Lexeme
+from ..symbols cimport dep
 
 from ..util import normalize_slice
 from ..compat import is_config, basestring_
@@ -206,7 +207,6 @@ cdef class Span:
 
  DOCS: https://spacy.io/api/span#as_doc
  """
- # TODO: Fix!
  words = [t.text for t in self]
  spaces = [bool(t.whitespace_) for t in self]
  cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces)
@@ -220,7 +220,9 @@ cdef class Span:
  else:
  array_head.append(SENT_START)
  array = self.doc.to_array(array_head)
- doc.from_array(array_head, array[self.start : self.end])
+ array = array[self.start : self.end]
+ self._fix_dep_copy(array_head, array)
+ doc.from_array(array_head, array)
  doc.noun_chunks_iterator = self.doc.noun_chunks_iterator
  doc.user_hooks = self.doc.user_hooks
  doc.user_span_hooks = self.doc.user_span_hooks
@@ -235,6 +237,44 @@ cdef class Span:
  doc.cats[cat_label] = value
  return doc
 
+ def _fix_dep_copy(self, attrs, array):
+ """ Rewire dependency links to make sure their heads fall into the span
+ while still keeping the correct number of sentences. """
+ cdef int length = len(array)
+ cdef attr_t value
+ cdef int i, head_col, ancestor_i
+ old_to_new_root = dict()
+ if HEAD in attrs:
+ head_col = attrs.index(HEAD)
+ for i in range(length):
+ # if the HEAD refers to a token outside this span, find a more appropriate ancestor
+ token = self[i]
+ ancestor_i = token.head.i - self.start # span offset
+ if ancestor_i not in range(length):
+ if DEP in attrs:
+ array[i, attrs.index(DEP)] = dep
+
+ # try finding an ancestor within this span
+ ancestors = token.ancestors
+ for ancestor in ancestors:
+ ancestor_i = ancestor.i - self.start
+ if ancestor_i in range(length):
+ array[i, head_col] = ancestor_i - i
+
+ # if there is no appropriate ancestor, define a new artificial root
+ value = array[i, head_col]
+ if (i+value) not in range(length):
+ new_root = old_to_new_root.get(ancestor_i, None)
+ if new_root is not None:
+ # take the same artificial root as a previous token from the same sentence
+ array[i, head_col] = new_root - i
+ else:
+ # set this token as the new artificial root
+ array[i, head_col] = 0
+ old_to_new_root[ancestor_i] = i
+
+ return array
+
  def merge(self, *args, **attributes):
  """Retokenize the document, such that the span is merged into a single
  token.
@@ -500,7 +540,7 @@ cdef class Span:
  if "root" in self.doc.user_span_hooks:
  return self.doc.user_span_hooks["root"](self)
  # This should probably be called 'head', and the other one called
- # 'gov'. But we went with 'head' elsehwhere, and now we're stuck =/
+ # 'gov'. But we went with 'head' elsewhere, and now we're stuck =/
  cdef int i
  # First, we scan through the Span, and check whether there's a word
  # with head==0, i.e. a sentence root. If so, we can return it. The