From da36b352ad9cea0e3347cee17a633d830032fe92 Mon Sep 17 00:00:00 2001
From: "C. Grivaz" <cgrivaz@protonmail.com>
Date: Fri, 8 Feb 2019 15:29:24 -0500
Subject: [PATCH 01/10] Add splitting of one token into several (resolves #2838)

---
 spacy/errors.py                   |   3 +
 spacy/tests/doc/test_doc_spilt.py | 112 ++++++++++++++++++++++++++++++
 spacy/tokens/_retokenize.pyx      | 111 +++++++++++++++++++++++++++--
 3 files changed, 222 insertions(+), 4 deletions(-)
 create mode 100644 spacy/tests/doc/test_doc_spilt.py

diff --git a/spacy/errors.py b/spacy/errors.py
index 138de0f5716..981755de412 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -253,6 +253,9 @@ class Errors(object):
     E098 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token"
             " can only be part of one entity, so make sure the entities you're "
             "setting don't overlap.")
+    E099 = ("The splitted token can only have one root (head = 0).")
+    E100 = ("The splitted token needs to have a root (head = 0)")
+    E101 = ("All subtokens must have associated heads")
 
 @add_codes
 class TempErrors(object):
diff --git a/spacy/tests/doc/test_doc_spilt.py b/spacy/tests/doc/test_doc_spilt.py
new file mode 100644
index 00000000000..5f80b3ee0e3
--- /dev/null
+++ b/spacy/tests/doc/test_doc_spilt.py
@@ -0,0 +1,112 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from ..util import get_doc
+from ...vocab import Vocab
+from ...tokens import Doc
+from ...tokens import Span
+
+import pytest
+
+
+def test_doc_split(en_tokenizer):
+    text = "LosAngeles start."
+    heads = [1, 1, 0]
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
+
+    assert len(doc) == 3
+    assert len(str(doc)) == 19
+    assert doc[0].head.text == 'start'
+    assert doc[1].head.text == '.'
+
+    with doc.retokenize() as retokenizer:
+        retokenizer.split(0, ["Los", "Angeles"], [1, 0], attrs={'tag':'NNP', 'lemma':'Los Angeles', 'ent_type':'GPE'})
+
+    assert len(doc) == 4
+    assert doc[0].text == 'Los'
+    assert doc[0].head.text == 'Angeles'
+
+    assert doc[1].text == 'Angeles'
+    assert doc[1].head.text == 'start'
+
+    assert doc[2].text == 'start'
+    assert doc[2].head.text == '.'
+
+    assert doc[3].text == '.'
+    assert doc[3].head.text == '.'
+
+    assert len(str(doc)) == 19
+
+def test_split_dependencies(en_tokenizer):
+    text = "LosAngeles start."
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens])
+    dep1 = doc.vocab.strings.add('amod')
+    dep2 = doc.vocab.strings.add('subject')
+    with doc.retokenize() as retokenizer:
+        retokenizer.split(0, ["Los", "Angeles"], [1, 0], [dep1, dep2])
+
+    assert doc[0].dep == dep1
+    assert doc[1].dep == dep2
+
+
+
+def test_split_heads_error(en_tokenizer):
+    text = "LosAngeles start."
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens])
+    #Not enough heads
+    with pytest.raises(ValueError):
+        with doc.retokenize() as retokenizer:
+            retokenizer.split(0, ["Los", "Angeles"], [0])
+
+    #Too many heads
+    with pytest.raises(ValueError):
+        with doc.retokenize() as retokenizer:
+            retokenizer.split(0, ["Los", "Angeles"], [1, 1, 0])
+
+    #No token head
+    with pytest.raises(ValueError):
+        with doc.retokenize() as retokenizer:
+            retokenizer.split(0, ["Los", "Angeles"], [1, 1])
+
+    #Several token heads
+    with pytest.raises(ValueError):
+        with doc.retokenize() as retokenizer:
+            retokenizer.split(0, ["Los", "Angeles"], [0, 0])
+
+
+def test_spans_entity_merge_iob():
+    # Test entity IOB stays consistent after splitting a token inside an entity
+    words = ["abc", "d", "e"]
+    doc = Doc(Vocab(), words=words)
+    doc.ents = [(doc.vocab.strings.add('ent-abcd'), 0, 2)]
+    assert doc[0].ent_iob_ == "B"
+    assert doc[1].ent_iob_ == "I"
+
+    with doc.retokenize() as retokenizer:
+        retokenizer.split(0, ["a", "b", "c"], [1, 1, 0])
+    assert doc[0].ent_iob_ == "B"
+    assert doc[1].ent_iob_ == "I"
+    assert doc[2].ent_iob_ == "I"
+    assert doc[3].ent_iob_ == "I"
+
+def test_spans_sentence_update_after_merge(en_tokenizer):
+    text = "StewartLee is a stand up comedian. He lives in England and loves JoePasquale."
+    heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2]
+    deps = ['nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
+            'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
+            'compound', 'punct']
+
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
+    sent1, sent2 = list(doc.sents)
+    init_len = len(sent1)
+    init_len2 = len(sent2)
+    with doc.retokenize() as retokenizer:
+        retokenizer.split(0, ["Stewart", "Lee"], [1, 0])
+        retokenizer.split(14, ["Joe", "Pasquale"], [1, 0])
+
+    assert len(sent1) == init_len + 1
+    assert len(sent2) == init_len2 + 1
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index 60ed63ee7eb..a6e2631c17a 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -43,12 +43,12 @@ cdef class Retokenizer:
         attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
         self.merges.append((span, attrs))
 
-    def split(self, Token token, orths, attrs=SimpleFrozenDict()):
+    def split(self, token_index, orths, heads, deps=[], attrs=SimpleFrozenDict()):
         """Mark a Token for splitting, into the specified orths. The attrs
         will be applied to each subtoken.
         """
         attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
-        self.splits.append((token.start_char, orths, attrs))
+        self.splits.append((token_index, orths, heads, deps, attrs))
 
     def __enter__(self):
         self.merges = []
@@ -65,8 +65,8 @@ cdef class Retokenizer:
             end = span.end
             _merge(self.doc, start, end, attrs)
 
-        for start_char, orths, attrs in self.splits:
-            raise NotImplementedError
+        for token_index, orths, heads, deps, attrs in self.splits:
+             _split(self.doc, token_index, orths, heads, deps, attrs)
 
 def _merge(Doc doc, int start, int end, attributes):
     """Retokenize the document, such that the span at
@@ -279,3 +279,106 @@ def _bulk_merge(Doc doc, merges):
 
     # Return the merged Python object
     return doc[spans[0].start]
+
+
+def _split(Doc doc, int token_index, orths, heads, deps, attrs):
+    """Retokenize the document, such that the token at
+    `doc.text[token_index]` is split into tokens with the orth 'orths'
+    token_index(int): token index of the token to split.
+    orths: IDs of the verbatim text content of the tokens to create
+    **attributes: Attributes to assign to each of the newly created tokens. By default,
+        attributes are inherited from the original token.
+    RETURNS: None. `doc` is modified in place; raises ValueError for invalid `heads`.
+    """
+    cdef int nb_subtokens = len(orths)
+    cdef const LexemeC* lex
+    cdef TokenC* token
+    cdef TokenC orig_token = doc.c[token_index]
+    # `heads` must hold one entry per subtoken, exactly one of which is the root (0).
+    if(len(heads) != nb_subtokens):
+        raise ValueError(Errors.E101)
+    token_head_index = -1
+    for index, head in enumerate(heads):
+        if head == 0:
+            if token_head_index != -1:
+                raise ValueError(Errors.E099)
+            token_head_index = index
+    if token_head_index == -1:
+        raise ValueError(Errors.E100)
+
+    # First, make the dependencies absolute, and adjust all possible dependencies before
+    # creating the tokens
+
+    for i in range(doc.length):
+        doc.c[i].head += i
+
+    # Adjust dependencies (absolute indices): re-attach to the root subtoken, shift the rest
+    offset = nb_subtokens - 1
+    for i in range(doc.length):
+        head_idx = doc.c[i].head
+        if head_idx == token_index:
+            doc.c[i].head = token_index + token_head_index
+        elif head_idx > token_index:
+            doc.c[i].head += offset
+
+    new_token_head = doc.c[token_index].head
+
+    # Double doc.c max_length if necessary (until big enough for all new tokens)
+    while doc.length + nb_subtokens - 1 >= doc.max_length:
+        doc._realloc(doc.max_length * 2)
+
+    # Move tokens after the split to create space for the new tokens
+    doc.length = len(doc) + nb_subtokens -1
+    for token_to_move in range(doc.length - nb_subtokens, token_index, -1):
+        doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move]
+
+    # Host the tokens in the newly created space
+    for i, orth in enumerate(orths):
+
+        token = &doc.c[token_index + i]
+        lex = doc.vocab.get(doc.mem, orth)
+        token.lex = lex
+
+        # Set token.spacy to False for all non-last splited tokens, and
+        # to origToken.spacy for the last token
+        if (i < nb_subtokens - 1):
+            token.spacy = False
+        else:
+            token.spacy = orig_token.spacy
+
+        # Apply attrs to each subtoken
+        for attr_name, attr_value in attrs.items():
+            if attr_name == TAG:
+                doc.vocab.morphology.assign_tag(token, attr_value)
+            else:
+                Token.set_struct_attr(token, attr_name, attr_value)
+
+        # Make IOB consistent
+        if (orig_token.ent_iob == 3):
+            if i == 0:
+                token.ent_iob = 3
+            else:
+                token.ent_iob = 1
+        else:
+            # In all other cases subtokens inherit iob from origToken
+            token.ent_iob = orig_token.ent_iob
+
+        # Use the head of the new token everywhere. This will be partially overwritten later on.
+        token.head = new_token_head
+
+    # Transform the dependencies into relative ones again
+    for i in range(doc.length):
+        doc.c[i].head -= i
+
+    # Assign correct dependencies to the inner token
+    for i, head in enumerate(heads):
+        if head != 0:
+            # the token's head's head is already correct
+            doc.c[token_index + i].head = head
+
+    for i, dep in enumerate(deps):
+        doc[token_index + i].dep = dep
+
+    # set children from head
+    set_children_from_heads(doc.c, doc.length)
+

From 9864236a5413e17caddf8c1e0fbb88a0e52c672c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 14 Feb 2019 02:11:41 +1100
Subject: [PATCH 02/10] Improve error message for token splitting

---
 spacy/errors.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 981755de412..00204d8e324 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -253,8 +253,8 @@ class Errors(object):
     E098 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token"
             " can only be part of one entity, so make sure the entities you're "
             "setting don't overlap.")
-    E099 = ("The splitted token can only have one root (head = 0).")
-    E100 = ("The splitted token needs to have a root (head = 0)")
+    E099 = ("The newly split token can only have one root (head = 0).")
+    E100 = ("The newly split token needs to have a root (head = 0)")
     E101 = ("All subtokens must have associated heads")
 
 @add_codes

From 4e36c4158c0cbaddff90663df543e4710698b740 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 14 Feb 2019 02:14:31 +1100
Subject: [PATCH 03/10] Make retokenizer.split() tests use a Token object

Change retokenizer.split() to use a Token object, instead of an index.
---
 spacy/tests/doc/test_doc_spilt.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/spacy/tests/doc/test_doc_spilt.py b/spacy/tests/doc/test_doc_spilt.py
index 5f80b3ee0e3..1126f2b4740 100644
--- a/spacy/tests/doc/test_doc_spilt.py
+++ b/spacy/tests/doc/test_doc_spilt.py
@@ -21,7 +21,7 @@ def test_doc_split(en_tokenizer):
     assert doc[1].head.text == '.'
 
     with doc.retokenize() as retokenizer:
-        retokenizer.split(0, ["Los", "Angeles"], [1, 0], attrs={'tag':'NNP', 'lemma':'Los Angeles', 'ent_type':'GPE'})
+        retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], attrs={'tag':'NNP', 'lemma':'Los Angeles', 'ent_type':'GPE'})
 
     assert len(doc) == 4
     assert doc[0].text == 'Los'
@@ -45,7 +45,7 @@ def test_split_dependencies(en_tokenizer):
     dep1 = doc.vocab.strings.add('amod')
     dep2 = doc.vocab.strings.add('subject')
     with doc.retokenize() as retokenizer:
-        retokenizer.split(0, ["Los", "Angeles"], [1, 0], [dep1, dep2])
+        retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], [dep1, dep2])
 
     assert doc[0].dep == dep1
     assert doc[1].dep == dep2
@@ -59,22 +59,22 @@ def test_split_heads_error(en_tokenizer):
     #Not enough heads
     with pytest.raises(ValueError):
         with doc.retokenize() as retokenizer:
-            retokenizer.split(0, ["Los", "Angeles"], [0])
+            retokenizer.split(doc[0], ["Los", "Angeles"], [0])
 
     #Too many heads
     with pytest.raises(ValueError):
         with doc.retokenize() as retokenizer:
-            retokenizer.split(0, ["Los", "Angeles"], [1, 1, 0])
+            retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1, 0])
 
     #No token head
     with pytest.raises(ValueError):
         with doc.retokenize() as retokenizer:
-            retokenizer.split(0, ["Los", "Angeles"], [1, 1])
+            retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1])
 
     #Several token heads
     with pytest.raises(ValueError):
         with doc.retokenize() as retokenizer:
-            retokenizer.split(0, ["Los", "Angeles"], [0, 0])
+            retokenizer.split(doc[0], ["Los", "Angeles"], [0, 0])
 
 
 def test_spans_entity_merge_iob():
@@ -86,7 +86,7 @@ def test_spans_entity_merge_iob():
     assert doc[1].ent_iob_ == "I"
 
     with doc.retokenize() as retokenizer:
-        retokenizer.split(0, ["a", "b", "c"], [1, 1, 0])
+        retokenizer.split(doc[0], ["a", "b", "c"], [1, 1, 0])
     assert doc[0].ent_iob_ == "B"
     assert doc[1].ent_iob_ == "I"
     assert doc[2].ent_iob_ == "I"
@@ -105,8 +105,8 @@ def test_spans_sentence_update_after_merge(en_tokenizer):
     init_len = len(sent1)
     init_len2 = len(sent2)
     with doc.retokenize() as retokenizer:
-        retokenizer.split(0, ["Stewart", "Lee"], [1, 0])
-        retokenizer.split(14, ["Joe", "Pasquale"], [1, 0])
+        retokenizer.split(doc[0], ["Stewart", "Lee"], [1, 0])
+        retokenizer.split(doc[14], ["Joe", "Pasquale"], [1, 0])
 
     assert len(sent1) == init_len + 1
     assert len(sent2) == init_len2 + 1

From 822ccd51472ec0c3b4ee9d45323761f07586ab77 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 14 Feb 2019 02:18:00 +1100
Subject: [PATCH 04/10] Pass Token into retokenize.split()

Tweak retokenize.split() API so that we pass the `Token` object, not the index.
---
 spacy/tokens/_retokenize.pyx | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index a6e2631c17a..68f1d7e0ca2 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -43,12 +43,12 @@ cdef class Retokenizer:
         attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
         self.merges.append((span, attrs))
 
-    def split(self, token_index, orths, heads, deps=[], attrs=SimpleFrozenDict()):
+    def split(self, Token token, orths, heads, deps=[], attrs=SimpleFrozenDict()):
         """Mark a Token for splitting, into the specified orths. The attrs
         will be applied to each subtoken.
         """
         attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
-        self.splits.append((token_index, orths, heads, deps, attrs))
+        self.splits.append((token, orths, heads, deps, attrs))
 
     def __enter__(self):
         self.merges = []
@@ -65,8 +65,8 @@ cdef class Retokenizer:
             end = span.end
             _merge(self.doc, start, end, attrs)
 
-        for token_index, orths, heads, deps, attrs in self.splits:
-             _split(self.doc, token_index, orths, heads, deps, attrs)
+        for token, orths, heads, deps, attrs in self.splits:
+             _split(self.doc, token, orths, heads, deps, attrs)
 
 def _merge(Doc doc, int start, int end, attributes):
     """Retokenize the document, such that the span at
@@ -283,7 +283,7 @@ def _bulk_merge(Doc doc, merges):
 
 def _split(Doc doc, int token_index, orths, heads, deps, attrs):
     """Retokenize the document, such that the token at
-    `doc.text[token_index]` is split into tokens with the orth 'orths'
+    `doc[token_index]` is split into tokens with the orth 'orths'
     token_index(int): token index of the token to split.
     orths: IDs of the verbatim text content of the tokens to create
     **attributes: Attributes to assign to each of the newly created tokens. By default,
@@ -339,7 +339,7 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs):
         lex = doc.vocab.get(doc.mem, orth)
         token.lex = lex
 
-        # Set token.spacy to False for all non-last splited tokens, and
+        # Set token.spacy to False for all non-last split tokens, and
         # to origToken.spacy for the last token
         if (i < nb_subtokens - 1):
             token.spacy = False

From be9bbfb1f7f6e1d7a3961d80181b9c9e2b32988c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 14 Feb 2019 02:27:42 +1100
Subject: [PATCH 05/10] Fix token.idx in retokenize.split()

---
 spacy/tokens/_retokenize.pyx | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index 68f1d7e0ca2..a8bb560409b 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -48,7 +48,7 @@ cdef class Retokenizer:
         will be applied to each subtoken.
         """
         attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
-        self.splits.append((token, orths, heads, deps, attrs))
+        self.splits.append((token.i, orths, heads, deps, attrs))
 
     def __enter__(self):
         self.merges = []
@@ -65,8 +65,12 @@ cdef class Retokenizer:
             end = span.end
             _merge(self.doc, start, end, attrs)
 
-        for token, orths, heads, deps, attrs in self.splits:
-             _split(self.doc, token, orths, heads, deps, attrs)
+        offset = 0
+        # Iterate in order, to keep the offset simple.
+        for token_index, orths, heads, deps, attrs in sorted(self.splits):
+             _split(self.doc, token_index + offset, orths, heads, deps, attrs)
+             # Adjust for the previous tokens
+             offset += len(orths)
 
 def _merge(Doc doc, int start, int end, attributes):
     """Retokenize the document, such that the span at
@@ -333,11 +337,15 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs):
         doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move]
 
     # Host the tokens in the newly created space
+    cdef int idx_offset = 0
     for i, orth in enumerate(orths):
 
         token = &doc.c[token_index + i]
         lex = doc.vocab.get(doc.mem, orth)
         token.lex = lex
+        # Update the character offset of the subtokens
+        token.idx += idx_offset
+        idx_offset += len(orth)
 
         # Set token.spacy to False for all non-last split tokens, and
         # to origToken.spacy for the last token

From afc5b3e62d4d9df0ceacc99efca69bd38aa06a0b Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 14 Feb 2019 02:29:08 +1100
Subject: [PATCH 06/10] Test that token.idx is correct after split

---
 spacy/tests/doc/test_doc_spilt.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/tests/doc/test_doc_spilt.py b/spacy/tests/doc/test_doc_spilt.py
index 1126f2b4740..b72141b83c7 100644
--- a/spacy/tests/doc/test_doc_spilt.py
+++ b/spacy/tests/doc/test_doc_spilt.py
@@ -26,6 +26,8 @@ def test_doc_split(en_tokenizer):
     assert len(doc) == 4
     assert doc[0].text == 'Los'
     assert doc[0].head.text == 'Angeles'
+    assert doc[0].idx == 0
+    assert doc[1].idx == 3
 
     assert doc[1].text == 'Angeles'
     assert doc[1].head.text == 'start'

From 7356410ad03de476b35610986cacebd2a73edf16 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 14 Feb 2019 02:46:27 +1100
Subject: [PATCH 07/10] Fix token.idx for split tokens

---
 spacy/tokens/_retokenize.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index a8bb560409b..acf82e0c50d 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -344,7 +344,8 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs):
         lex = doc.vocab.get(doc.mem, orth)
         token.lex = lex
         # Update the character offset of the subtokens
-        token.idx += idx_offset
+        if i != 0:
+            token.idx = doc.c[token_index].idx + idx_offset
         idx_offset += len(orth)
 
         # Set token.spacy to False for all non-last split tokens, and

From e7dc0eca6966661b08920aca92cfd5d53da4a044 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 14 Feb 2019 03:12:00 +1100
Subject: [PATCH 08/10] Fix retokenize.split()

---
 spacy/tokens/_retokenize.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index acf82e0c50d..ceb8a502786 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -70,7 +70,7 @@ cdef class Retokenizer:
         for token_index, orths, heads, deps, attrs in sorted(self.splits):
              _split(self.doc, token_index + offset, orths, heads, deps, attrs)
              # Adjust for the previous tokens
-             offset += len(orths)
+             offset += len(orths)-1
 
 def _merge(Doc doc, int start, int end, attributes):
     """Retokenize the document, such that the span at

From 906022ef8760d14451816f7a27175bd29b7fe855 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 14 Feb 2019 03:26:34 +1100
Subject: [PATCH 09/10] Fix retokenize.split

---
 spacy/tokens/_retokenize.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index ceb8a502786..e0dc4bdf4ab 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -345,7 +345,7 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs):
         token.lex = lex
         # Update the character offset of the subtokens
         if i != 0:
-            token.idx = doc.c[token_index].idx + idx_offset
+            token.idx = orig_token.idx + idx_offset
         idx_offset += len(orth)
 
         # Set token.spacy to False for all non-last split tokens, and

From 7841edd92f4d2b3af49cf851a80bab2ede99a6e2 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 14 Feb 2019 03:27:08 +1100
Subject: [PATCH 10/10] Fix retokenize.split() test

---
 spacy/tests/doc/test_doc_spilt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/doc/test_doc_spilt.py b/spacy/tests/doc/test_doc_spilt.py
index b72141b83c7..827fd565e36 100644
--- a/spacy/tests/doc/test_doc_spilt.py
+++ b/spacy/tests/doc/test_doc_spilt.py
@@ -109,6 +109,6 @@ def test_spans_sentence_update_after_merge(en_tokenizer):
     with doc.retokenize() as retokenizer:
         retokenizer.split(doc[0], ["Stewart", "Lee"], [1, 0])
         retokenizer.split(doc[14], ["Joe", "Pasquale"], [1, 0])
-
+    sent1, sent2 = list(doc.sents)
     assert len(sent1) == init_len + 1
     assert len(sent2) == init_len2 + 1