From c1a3be7175ba79702cdd4a8c3ba7cab78dd3955a Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Tue, 23 Jul 2019 18:28:55 +0200
Subject: [PATCH] Fix dependency copy for as_doc (#3969)

* failing unit test for issue 3962

* attempt to fix Issue #3962

* create artificial unit test example

* using length instead of self.length

* sp

* reformat with black

* find better ancestor within span and use generic 'dep'

* attach to span.root if there is no appropriate ancestor

* comment span text

* clean up ancestor code

* reconstruct dep tree to keep same number of sentences
---
 spacy/tests/regression/test_issue3962.py | 112 ++++++++++++++++++++++
 spacy/tokens/doc.pyx                     |   4 +-
 spacy/tokens/span.pyx                    |  47 +++++++++-
 3 files changed, 158 insertions(+), 5 deletions(-)
 create mode 100644 spacy/tests/regression/test_issue3962.py

diff --git a/spacy/tests/regression/test_issue3962.py b/spacy/tests/regression/test_issue3962.py
new file mode 100644
index 00000000000..c7979c2f358
--- /dev/null
+++ b/spacy/tests/regression/test_issue3962.py
@@ -0,0 +1,112 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+from ..util import get_doc
+
+
+@pytest.fixture
+def doc(en_tokenizer):
+    text = "He jests at scars, that never felt a wound."
+    heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
+    deps = [
+        "nsubj",
+        "ccomp",
+        "prep",
+        "pobj",
+        "punct",
+        "nsubj",
+        "neg",
+        "ROOT",
+        "det",
+        "dobj",
+        "punct",
+    ]
+    tokens = en_tokenizer(text)
+    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+
+
+def test_issue3962(doc):
+    """ Ensure that as_doc does not result in out-of-bounds access of tokens.
+    Heads outside the span are remapped to an in-span ancestor, or the token becomes its own head (an artificial root)."""
+    span2 = doc[1:5]  # "jests at scars ,"
+    doc2 = span2.as_doc()
+    doc2_json = doc2.to_json()
+    assert doc2_json
+
+    assert doc2[0].head.text == "jests"  # head set to itself, being the new artificial root
+    assert doc2[0].dep_ == "dep"
+    assert doc2[1].head.text == "jests"
+    assert doc2[1].dep_ == "prep"
+    assert doc2[2].head.text == "at"
+    assert doc2[2].dep_ == "pobj"
+    assert doc2[3].head.text == "jests"  # head set to the new artificial root
+    assert doc2[3].dep_ == "dep"
+
+    # We should still have 1 sentence
+    assert len(list(doc2.sents)) == 1
+
+    span3 = doc[6:9]  # "never felt a"
+    doc3 = span3.as_doc()
+    doc3_json = doc3.to_json()
+    assert doc3_json
+
+    assert doc3[0].head.text == "felt"
+    assert doc3[0].dep_ == "neg"
+    assert doc3[1].head.text == "felt"
+    assert doc3[1].dep_ == "ROOT"
+    assert doc3[2].head.text == "felt"  # head set to ancestor
+    assert doc3[2].dep_ == "dep"
+
+    # We should still have 1 sentence, as "a" can be attached to "felt" instead of "wound"
+    assert len(list(doc3.sents)) == 1
+
+
+@pytest.fixture
+def two_sent_doc(en_tokenizer):
+    text = "He jests at scars. They never felt a wound."
+    heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
+    deps = [
+        "nsubj",
+        "ROOT",
+        "prep",
+        "pobj",
+        "punct",
+        "nsubj",
+        "neg",
+        "ROOT",
+        "det",
+        "dobj",
+        "punct",
+    ]
+    tokens = en_tokenizer(text)
+    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+
+
+def test_issue3962_long(two_sent_doc):
+    """ Ensure that as_doc does not result in out-of-bounds access of tokens.
+    Heads outside the span are remapped to an in-span ancestor, or the token becomes its own head (an artificial root)."""
+    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
+    doc2 = span2.as_doc()
+    doc2_json = doc2.to_json()
+    assert doc2_json
+
+    assert doc2[0].head.text == "jests"  # head set to itself, as it already was the root of sentence 1
+    assert doc2[0].dep_ == "ROOT"
+    assert doc2[1].head.text == "jests"
+    assert doc2[1].dep_ == "prep"
+    assert doc2[2].head.text == "at"
+    assert doc2[2].dep_ == "pobj"
+    assert doc2[3].head.text == "jests"
+    assert doc2[3].dep_ == "punct"
+    assert doc2[4].head.text == "They"  # head set to itself, being the new artificial root (in sentence 2)
+    assert doc2[4].dep_ == "dep"
+    assert doc2[5].head.text == "They"  # head set to the new artificial root (in sentence 2)
+    assert doc2[5].dep_ == "dep"
+
+    # We should still have 2 sentences
+    sents = list(doc2.sents)
+    assert len(sents) == 2
+    assert sents[0].text == "jests at scars ."
+    assert sents[1].text == "They never"
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index c1883f9c09e..7ab1563e95f 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -794,7 +794,7 @@ cdef class Doc:
             if array[i, col] != 0:
                 self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
         # Now load the data
-        for i in range(self.length):
+        for i in range(length):
             token = &self.c[i]
             for j in range(n_attrs):
                 if attr_ids[j] != TAG:
@@ -804,7 +804,7 @@ cdef class Doc:
         self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)
         # If document is parsed, set children
         if self.is_parsed:
-            set_children_from_heads(self.c, self.length)
+            set_children_from_heads(self.c, length)
         return self
 
     def get_lca_matrix(self):
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 3f4f4418b95..42fb9852dd4 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -17,6 +17,7 @@ from ..attrs cimport attr_id_t
 from ..parts_of_speech cimport univ_pos_t
 from ..attrs cimport *
 from ..lexeme cimport Lexeme
+from ..symbols cimport dep
 
 from ..util import normalize_slice
 from ..compat import is_config, basestring_
@@ -206,7 +207,6 @@ cdef class Span:
 
         DOCS: https://spacy.io/api/span#as_doc
         """
-        # TODO: Fix!
         words = [t.text for t in self]
         spaces = [bool(t.whitespace_) for t in self]
         cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces)
@@ -220,7 +220,9 @@ cdef class Span:
             else:
                 array_head.append(SENT_START)
         array = self.doc.to_array(array_head)
-        doc.from_array(array_head, array[self.start : self.end])
+        array = array[self.start : self.end]
+        self._fix_dep_copy(array_head, array)
+        doc.from_array(array_head, array)
         doc.noun_chunks_iterator = self.doc.noun_chunks_iterator
         doc.user_hooks = self.doc.user_hooks
         doc.user_span_hooks = self.doc.user_span_hooks
@@ -235,6 +237,45 @@ cdef class Span:
             doc.cats[cat_label] = value
         return doc
 
+    def _fix_dep_copy(self, attrs, array):
+        """ Rewire dependency links to make sure their heads fall into the span
+        while still keeping the correct number of sentences. """
+        cdef int length = len(array)
+        cdef attr_t value
+        cdef int i, head_col, ancestor_i
+        old_to_new_root = dict()
+        if HEAD in attrs:
+            head_col = attrs.index(HEAD)
+            for i in range(length):
+                # if the HEAD refers to a token outside this span, find a more appropriate ancestor
+                token = self[i]
+                ancestor_i = token.head.i - self.start  # span offset
+                if ancestor_i not in range(length):
+                    if DEP in attrs:
+                        array[i, attrs.index(DEP)] = dep
+
+                    # try finding the nearest ancestor within this span
+                    ancestors = token.ancestors
+                    for ancestor in ancestors:
+                        ancestor_i = ancestor.i - self.start  # span offset
+                        if ancestor_i in range(length):
+                            array[i, head_col] = ancestor_i - i
+                            break
+
+                # if there is no appropriate ancestor, define a new artificial root
+                value = array[i, head_col]
+                if (i+value) not in range(length):
+                    new_root = old_to_new_root.get(ancestor_i, None)
+                    if new_root is not None:
+                        # take the same artificial root as a previous token from the same sentence
+                        array[i, head_col] = new_root - i
+                    else:
+                        # set this token as the new artificial root
+                        array[i, head_col] = 0
+                        old_to_new_root[ancestor_i] = i
+
+        return array
+
     def merge(self, *args, **attributes):
         """Retokenize the document, such that the span is merged into a single
         token.
@@ -500,7 +541,7 @@ cdef class Span:
         if "root" in self.doc.user_span_hooks:
             return self.doc.user_span_hooks["root"](self)
         # This should probably be called 'head', and the other one called
-        # 'gov'. But we went with 'head' elsehwhere, and now we're stuck =/
+        # 'gov'. But we went with 'head' elsewhere, and now we're stuck =/
         cdef int i
         # First, we scan through the Span, and check whether there's a word
        # with head==0, i.e. a sentence root. If so, we can return it. The