From da36b352ad9cea0e3347cee17a633d830032fe92 Mon Sep 17 00:00:00 2001 From: "C. Grivaz" Date: Fri, 8 Feb 2019 15:29:24 -0500 Subject: [PATCH 01/10] Add split one token into several (resolves #2838) --- spacy/errors.py | 3 + spacy/tests/doc/test_doc_spilt.py | 112 ++++++++++++++++++++++++++++++ spacy/tokens/_retokenize.pyx | 111 +++++++++++++++++++++++++++-- 3 files changed, 222 insertions(+), 4 deletions(-) create mode 100644 spacy/tests/doc/test_doc_spilt.py diff --git a/spacy/errors.py b/spacy/errors.py index 138de0f5716..981755de412 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -253,6 +253,9 @@ class Errors(object): E098 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token" " can only be part of one entity, so make sure the entities you're " "setting don't overlap.") + E099 = ("The splitted token can only have one root (head = 0).") + E100 = ("The splitted token needs to have a root (head = 0)") + E101 = ("All subtokens must have associated heads") @add_codes class TempErrors(object): diff --git a/spacy/tests/doc/test_doc_spilt.py b/spacy/tests/doc/test_doc_spilt.py new file mode 100644 index 00000000000..5f80b3ee0e3 --- /dev/null +++ b/spacy/tests/doc/test_doc_spilt.py @@ -0,0 +1,112 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ..util import get_doc +from ...vocab import Vocab +from ...tokens import Doc +from ...tokens import Span + +import pytest + + +def test_doc_split(en_tokenizer): + text = "LosAngeles start." + heads = [1, 1, 0] + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads) + + assert len(doc) == 3 + assert len(str(doc)) == 19 + assert doc[0].head.text == 'start' + assert doc[1].head.text == '.' + + with doc.retokenize() as retokenizer: + retokenizer.split(0, ["Los", "Angeles"], [1, 0], attrs={'tag':'NNP', 'lemma':'Los Angeles', 'ent_type':'GPE'}) + + assert len(doc) == 4 + assert doc[0].text == 'Los' + assert doc[0].head.text == 'Angeles' + + assert doc[1].text == 'Angeles' + assert doc[1].head.text == 'start' + + assert doc[2].text == 'start' + assert doc[2].head.text == '.' + + assert doc[3].text == '.' + assert doc[3].head.text == '.' + + assert len(str(doc)) == 19 + +def test_split_dependencies(en_tokenizer): + text = "LosAngeles start." + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens]) + dep1 = doc.vocab.strings.add('amod') + dep2 = doc.vocab.strings.add('subject') + with doc.retokenize() as retokenizer: + retokenizer.split(0, ["Los", "Angeles"], [1, 0], [dep1, dep2]) + + assert doc[0].dep == dep1 + assert doc[1].dep == dep2 + + + +def test_split_heads_error(en_tokenizer): + text = "LosAngeles start." + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens]) + #Not enough heads + with pytest.raises(ValueError): + with doc.retokenize() as retokenizer: + retokenizer.split(0, ["Los", "Angeles"], [0]) + + #Too many heads + with pytest.raises(ValueError): + with doc.retokenize() as retokenizer: + retokenizer.split(0, ["Los", "Angeles"], [1, 1, 0]) + + #No token head + with pytest.raises(ValueError): + with doc.retokenize() as retokenizer: + retokenizer.split(0, ["Los", "Angeles"], [1, 1]) + + #Several token heads + with pytest.raises(ValueError): + with doc.retokenize() as retokenizer: + retokenizer.split(0, ["Los", "Angeles"], [0, 0]) + + +def test_spans_entity_merge_iob(): + # Test entity IOB stays consistent after merging + words = ["abc", "d", "e"] + doc = Doc(Vocab(), words=words) + doc.ents = [(doc.vocab.strings.add('ent-abcd'), 0, 2)] + assert doc[0].ent_iob_ == "B" + assert doc[1].ent_iob_ == "I" + + with doc.retokenize() as retokenizer: + retokenizer.split(0, ["a", "b", "c"], [1, 1, 0]) + assert doc[0].ent_iob_ == "B" + assert doc[1].ent_iob_ == "I" + assert doc[2].ent_iob_ == "I" + assert doc[3].ent_iob_ == "I" + +def test_spans_sentence_update_after_merge(en_tokenizer): + text = "StewartLee is a stand up comedian. He lives in England and loves JoePasquale." + heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2] + deps = ['nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr', + 'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj', + 'compound', 'punct'] + + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps) + sent1, sent2 = list(doc.sents) + init_len = len(sent1) + init_len2 = len(sent2) + with doc.retokenize() as retokenizer: + retokenizer.split(0, ["Stewart", "Lee"], [1, 0]) + retokenizer.split(14, ["Joe", "Pasquale"], [1, 0]) + + assert len(sent1) == init_len + 1 + assert len(sent2) == init_len2 + 1 diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 60ed63ee7eb..a6e2631c17a 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -43,12 +43,12 @@ cdef class Retokenizer: attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) self.merges.append((span, attrs)) - def split(self, Token token, orths, attrs=SimpleFrozenDict()): + def split(self, token_index, orths, heads, deps=[], attrs=SimpleFrozenDict()): """Mark a Token for splitting, into the specified orths. The attrs will be applied to each subtoken. """ attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) - self.splits.append((token.start_char, orths, attrs)) + self.splits.append((token_index, orths, heads, deps, attrs)) def __enter__(self): self.merges = [] @@ -65,8 +65,8 @@ cdef class Retokenizer: end = span.end _merge(self.doc, start, end, attrs) - for start_char, orths, attrs in self.splits: - raise NotImplementedError + for token_index, orths, heads, deps, attrs in self.splits: + _split(self.doc, token_index, orths, heads, deps, attrs) def _merge(Doc doc, int start, int end, attributes): """Retokenize the document, such that the span at @@ -279,3 +279,106 @@ def _bulk_merge(Doc doc, merges): # Return the merged Python object return doc[spans[0].start] + + +def _split(Doc doc, int token_index, orths, heads, deps, attrs): + """Retokenize the document, such that the token at + `doc.text[token_index]` is split into tokens with the orth 'orths' + token_index(int): token index of the token to split. + orths: IDs of the verbatim text content of the tokens to create + **attributes: Attributes to assign to each of the newly created tokens. By default, + attributes are inherited from the original token. + RETURNS (Token): The first newly created token. + """ + cdef int nb_subtokens = len(orths) + cdef const LexemeC* lex + cdef TokenC* token + cdef TokenC orig_token = doc.c[token_index] + + if(len(heads) != nb_subtokens): + raise ValueError(Errors.E101) + token_head_index = -1 + for index, head in enumerate(heads): + if head == 0: + if token_head_index != -1: + raise ValueError(Errors.E098) + token_head_index = index + if token_head_index == -1: + raise ValueError(Errors.E099) + + # First, make the dependencies absolutes, and adjust all possible dependencies before + # creating the tokens + + for i in range(doc.length): + doc.c[i].head += i + + # Adjust dependencies + offset = nb_subtokens - 1 + for i in range(doc.length): + head_idx = doc.c[i].head + if head_idx == token_index: + doc.c[i].head = token_head_index + elif head_idx > token_index: + doc.c[i].head += offset + + new_token_head = doc.c[token_index].head + + # Double doc.c max_length if necessary (until big enough for all new tokens) + while doc.length + nb_subtokens - 1 >= doc.max_length: + doc._realloc(doc.length * 2) + + # Move tokens after the split to create space for the new tokens + doc.length = len(doc) + nb_subtokens -1 + for token_to_move in range(doc.length - 1, token_index, -1): + doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move] + + # Host the tokens in the newly created space + for i, orth in enumerate(orths): + + token = &doc.c[token_index + i] + lex = doc.vocab.get(doc.mem, orth) + token.lex = lex + + # Set token.spacy to False for all non-last splited tokens, and + # to origToken.spacy for the last token + if (i < nb_subtokens - 1): + token.spacy = False + else: + token.spacy = orig_token.spacy + + # Apply attrs to each subtoken + for attr_name, attr_value in attrs.items(): + if attr_name == TAG: + doc.vocab.morphology.assign_tag(token, attr_value) + else: + Token.set_struct_attr(token, attr_name, attr_value) + + # Make IOB consistent + if (orig_token.ent_iob == 3): + if i == 0: + token.ent_iob = 3 + else: + token.ent_iob = 1 + else: + # In all other cases subtokens inherit iob from origToken + token.ent_iob = orig_token.ent_iob + + # Use the head of the new token everywhere. This will be partially overwritten later on. + token.head = new_token_head + + # Transform the dependencies into relative ones again + for i in range(doc.length): + doc.c[i].head -= i + + # Assign correct dependencies to the inner token + for i, head in enumerate(heads): + if head != 0: + # the token's head's head is already correct + doc.c[token_index + i].head = head + + for i, dep in enumerate(deps): + doc[token_index + i].dep = dep + + # set children from head + set_children_from_heads(doc.c, doc.length) + From 9864236a5413e17caddf8c1e0fbb88a0e52c672c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 14 Feb 2019 02:11:41 +1100 Subject: [PATCH 02/10] Improve error message for token splitting --- spacy/errors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 981755de412..00204d8e324 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -253,8 +253,8 @@ class Errors(object): E098 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token" " can only be part of one entity, so make sure the entities you're " "setting don't overlap.") - E099 = ("The splitted token can only have one root (head = 0).") - E100 = ("The splitted token needs to have a root (head = 0)") + E099 = ("The newly split token can only have one root (head = 0).") + E100 = ("The newly split token needs to have a root (head = 0)") E101 = ("All subtokens must have associated heads") @add_codes From 4e36c4158c0cbaddff90663df543e4710698b740 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 14 Feb 2019 02:14:31 +1100 Subject: [PATCH 03/10] Make retokenizer.split() tests use a Token object Change retokenizer.split() to use a Token object, instead of an index. --- spacy/tests/doc/test_doc_spilt.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/spacy/tests/doc/test_doc_spilt.py b/spacy/tests/doc/test_doc_spilt.py index 5f80b3ee0e3..1126f2b4740 100644 --- a/spacy/tests/doc/test_doc_spilt.py +++ b/spacy/tests/doc/test_doc_spilt.py @@ -21,7 +21,7 @@ def test_doc_split(en_tokenizer): assert doc[1].head.text == '.' with doc.retokenize() as retokenizer: - retokenizer.split(0, ["Los", "Angeles"], [1, 0], attrs={'tag':'NNP', 'lemma':'Los Angeles', 'ent_type':'GPE'}) + retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], attrs={'tag':'NNP', 'lemma':'Los Angeles', 'ent_type':'GPE'}) assert len(doc) == 4 assert doc[0].text == 'Los' @@ -45,7 +45,7 @@ def test_split_dependencies(en_tokenizer): dep1 = doc.vocab.strings.add('amod') dep2 = doc.vocab.strings.add('subject') with doc.retokenize() as retokenizer: - retokenizer.split(0, ["Los", "Angeles"], [1, 0], [dep1, dep2]) + retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], [dep1, dep2]) assert doc[0].dep == dep1 assert doc[1].dep == dep2 @@ -59,22 +59,22 @@ def test_split_heads_error(en_tokenizer): #Not enough heads with pytest.raises(ValueError): with doc.retokenize() as retokenizer: - retokenizer.split(0, ["Los", "Angeles"], [0]) + retokenizer.split(doc[0], ["Los", "Angeles"], [0]) #Too many heads with pytest.raises(ValueError): with doc.retokenize() as retokenizer: - retokenizer.split(0, ["Los", "Angeles"], [1, 1, 0]) + retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1, 0]) #No token head with pytest.raises(ValueError): with doc.retokenize() as retokenizer: - retokenizer.split(0, ["Los", "Angeles"], [1, 1]) + retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1]) #Several token heads with pytest.raises(ValueError): with doc.retokenize() as retokenizer: - retokenizer.split(0, ["Los", "Angeles"], [0, 0]) + retokenizer.split(doc[0], ["Los", "Angeles"], [0, 0]) def test_spans_entity_merge_iob(): @@ -86,7 +86,7 @@ def test_spans_entity_merge_iob(): assert doc[1].ent_iob_ == "I" with doc.retokenize() as retokenizer: - retokenizer.split(0, ["a", "b", "c"], [1, 1, 0]) + retokenizer.split(doc[0], ["a", "b", "c"], [1, 1, 0]) assert doc[0].ent_iob_ == "B" assert doc[1].ent_iob_ == "I" assert doc[2].ent_iob_ == "I" @@ -105,8 +105,8 @@ def test_spans_sentence_update_after_merge(en_tokenizer): init_len = len(sent1) init_len2 = len(sent2) with doc.retokenize() as retokenizer: - retokenizer.split(0, ["Stewart", "Lee"], [1, 0]) - retokenizer.split(14, ["Joe", "Pasquale"], [1, 0]) + retokenizer.split(doc[0], ["Stewart", "Lee"], [1, 0]) + retokenizer.split(doc[14], ["Joe", "Pasquale"], [1, 0]) assert len(sent1) == init_len + 1 assert len(sent2) == init_len2 + 1 From 822ccd51472ec0c3b4ee9d45323761f07586ab77 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 14 Feb 2019 02:18:00 +1100 Subject: [PATCH 04/10] Pass Token into retokenize.split() Tweak retokenize.split() API so that we pass the `Token` object, not the index. --- spacy/tokens/_retokenize.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index a6e2631c17a..68f1d7e0ca2 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -43,12 +43,12 @@ cdef class Retokenizer: attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) self.merges.append((span, attrs)) - def split(self, token_index, orths, heads, deps=[], attrs=SimpleFrozenDict()): + def split(self, Token token, orths, heads, deps=[], attrs=SimpleFrozenDict()): """Mark a Token for splitting, into the specified orths. The attrs will be applied to each subtoken. """ attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) - self.splits.append((token_index, orths, heads, deps, attrs)) + self.splits.append((token, orths, heads, deps, attrs)) def __enter__(self): self.merges = [] @@ -65,8 +65,8 @@ cdef class Retokenizer: end = span.end _merge(self.doc, start, end, attrs) - for token_index, orths, heads, deps, attrs in self.splits: - _split(self.doc, token_index, orths, heads, deps, attrs) + for token, orths, heads, deps, attrs in self.splits: + _split(self.doc, token, orths, heads, deps, attrs) def _merge(Doc doc, int start, int end, attributes): """Retokenize the document, such that the span at @@ -283,7 +283,7 @@ def _bulk_merge(Doc doc, merges): def _split(Doc doc, int token_index, orths, heads, deps, attrs): """Retokenize the document, such that the token at - `doc.text[token_index]` is split into tokens with the orth 'orths' + `doc[token_index]` is split into tokens with the orth 'orths' token_index(int): token index of the token to split. orths: IDs of the verbatim text content of the tokens to create **attributes: Attributes to assign to each of the newly created tokens. By default, @@ -339,7 +339,7 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs): lex = doc.vocab.get(doc.mem, orth) token.lex = lex - # Set token.spacy to False for all non-last splited tokens, and + # Set token.spacy to False for all non-last split tokens, and # to origToken.spacy for the last token if (i < nb_subtokens - 1): token.spacy = False From be9bbfb1f7f6e1d7a3961d80181b9c9e2b32988c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 14 Feb 2019 02:27:42 +1100 Subject: [PATCH 05/10] Fix token.idx in retokenize.split() --- spacy/tokens/_retokenize.pyx | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 68f1d7e0ca2..a8bb560409b 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -48,7 +48,7 @@ cdef class Retokenizer: will be applied to each subtoken. """ attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) - self.splits.append((token, orths, heads, deps, attrs)) + self.splits.append((token.i, orths, heads, deps, attrs)) def __enter__(self): self.merges = [] @@ -65,8 +65,12 @@ cdef class Retokenizer: end = span.end _merge(self.doc, start, end, attrs) - for token, orths, heads, deps, attrs in self.splits: - _split(self.doc, token, orths, heads, deps, attrs) + offset = 0 + # Iterate in order, to keep the offset simple. + for token_index, orths, heads, deps, attrs in sorted(self.splits): + _split(self.doc, token_index + offset, orths, heads, deps, attrs) + # Adjust for the previous tokens + offset += len(orths) def _merge(Doc doc, int start, int end, attributes): """Retokenize the document, such that the span at @@ -333,11 +337,15 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs): doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move] # Host the tokens in the newly created space + cdef int idx_offset = 0 for i, orth in enumerate(orths): token = &doc.c[token_index + i] lex = doc.vocab.get(doc.mem, orth) token.lex = lex + # Update the character offset of the subtokens + token.idx += idx_offset + idx_offset += len(orth) # Set token.spacy to False for all non-last split tokens, and # to origToken.spacy for the last token From afc5b3e62d4d9df0ceacc99efca69bd38aa06a0b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 14 Feb 2019 02:29:08 +1100 Subject: [PATCH 06/10] Test that token.idx is correct after split --- spacy/tests/doc/test_doc_spilt.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/tests/doc/test_doc_spilt.py b/spacy/tests/doc/test_doc_spilt.py index 1126f2b4740..b72141b83c7 100644 --- a/spacy/tests/doc/test_doc_spilt.py +++ b/spacy/tests/doc/test_doc_spilt.py @@ -26,6 +26,8 @@ def test_doc_split(en_tokenizer): assert len(doc) == 4 assert doc[0].text == 'Los' assert doc[0].head.text == 'Angeles' + assert doc[0].idx == 0 + assert doc[1].idx == 3 assert doc[1].text == 'Angeles' assert doc[1].head.text == 'start' From 7356410ad03de476b35610986cacebd2a73edf16 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 14 Feb 2019 02:46:27 +1100 Subject: [PATCH 07/10] Fix token.idx for split tokens --- spacy/tokens/_retokenize.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index a8bb560409b..acf82e0c50d 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -344,7 +344,8 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs): lex = doc.vocab.get(doc.mem, orth) token.lex = lex # Update the character offset of the subtokens - token.idx += idx_offset + if i != 0: + token.idx = doc.c[token_index].idx + idx_offset idx_offset += len(orth) # Set token.spacy to False for all non-last split tokens, and From e7dc0eca6966661b08920aca92cfd5d53da4a044 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 14 Feb 2019 03:12:00 +1100 Subject: [PATCH 08/10] Fix retokenize.split() --- spacy/tokens/_retokenize.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index acf82e0c50d..ceb8a502786 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -70,7 +70,7 @@ cdef class Retokenizer: for token_index, orths, heads, deps, attrs in sorted(self.splits): _split(self.doc, token_index + offset, orths, heads, deps, attrs) # Adjust for the previous tokens - offset += len(orths) + offset += len(orths)-1 def _merge(Doc doc, int start, int end, attributes): """Retokenize the document, such that the span at From 906022ef8760d14451816f7a27175bd29b7fe855 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 14 Feb 2019 03:26:34 +1100 Subject: [PATCH 09/10] Fix retokenize.split --- spacy/tokens/_retokenize.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index ceb8a502786..e0dc4bdf4ab 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -345,7 +345,7 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs): token.lex = lex # Update the character offset of the subtokens if i != 0: - token.idx = doc.c[token_index].idx + idx_offset + token.idx = orig_token.idx + idx_offset idx_offset += len(orth) # Set token.spacy to False for all non-last split tokens, and From 7841edd92f4d2b3af49cf851a80bab2ede99a6e2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 14 Feb 2019 03:27:08 +1100 Subject: [PATCH 10/10] Fix retokenize.split() test --- spacy/tests/doc/test_doc_spilt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/doc/test_doc_spilt.py b/spacy/tests/doc/test_doc_spilt.py index b72141b83c7..827fd565e36 100644 --- a/spacy/tests/doc/test_doc_spilt.py +++ b/spacy/tests/doc/test_doc_spilt.py @@ -109,6 +109,6 @@ def test_spans_sentence_update_after_merge(en_tokenizer): with doc.retokenize() as retokenizer: retokenizer.split(doc[0], ["Stewart", "Lee"], [1, 0]) retokenizer.split(doc[14], ["Joe", "Pasquale"], [1, 0]) - + sent1, sent2 = list(doc.sents) assert len(sent1) == init_len + 1 assert len(sent2) == init_len2 + 1