Add split one token into several (resolves #2838) #3253

Merged
merged 10 commits
Feb 14, 2019
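For orientation, here is a minimal usage sketch of the API this PR adds, based on the tests below. The `heads` values are relative offsets between subtokens, and `0` marks the subtoken that keeps the original token's head; the bare `Doc`/`Vocab` setup is an assumption of the sketch, the tests themselves use the test utilities.

from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=["LosAngeles", "start", "."])
with doc.retokenize() as retokenizer:
    # Split token 0 into two subtokens: "Los" attaches to the subtoken one
    # position to its right ("Angeles"); head == 0 means "Angeles" keeps the
    # original token's head. attrs are applied to every subtoken.
    retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0],
                      attrs={"ent_type": "GPE"})
assert [t.text for t in doc] == ["Los", "Angeles", "start", "."]
assert doc[0].ent_type_ == doc[1].ent_type_ == "GPE"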
3 changes: 3 additions & 0 deletions spacy/errors.py
@@ -253,6 +253,9 @@ class Errors(object):
E098 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token"
" can only be part of one entity, so make sure the entities you're "
"setting don't overlap.")
E099 = ("The newly split token can only have one root (head = 0).")
E100 = ("The newly split token needs to have a root (head = 0).")
E101 = ("All subtokens must have associated heads.")

@add_codes
class TempErrors(object):
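Roughly, the three new codes correspond to the following misuses of the `heads` argument, mirroring `test_split_heads_error` below; each one surfaces as a `ValueError` when the `retokenize` block exits.

# Inside a `with doc.retokenize() as retokenizer:` block:
retokenizer.split(doc[0], ["Los", "Angeles"], [0])      # E101: one head for two subtokens
retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1])   # E100: no subtoken has head == 0
retokenizer.split(doc[0], ["Los", "Angeles"], [0, 0])   # E099: two subtokens claim head == 0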
114 changes: 114 additions & 0 deletions spacy/tests/doc/test_doc_spilt.py
@@ -0,0 +1,114 @@
# coding: utf-8
from __future__ import unicode_literals

from ..util import get_doc
from ...vocab import Vocab
from ...tokens import Doc
from ...tokens import Span

import pytest


def test_doc_split(en_tokenizer):
text = "LosAngeles start."
heads = [1, 1, 0]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)

assert len(doc) == 3
assert len(str(doc)) == 19
assert doc[0].head.text == 'start'
assert doc[1].head.text == '.'

with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], attrs={"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"})

assert len(doc) == 4
assert doc[0].text == 'Los'
assert doc[0].head.text == 'Angeles'
assert doc[0].idx == 0
assert doc[1].idx == 3

assert doc[1].text == 'Angeles'
assert doc[1].head.text == 'start'

assert doc[2].text == 'start'
assert doc[2].head.text == '.'

assert doc[3].text == '.'
assert doc[3].head.text == '.'

assert len(str(doc)) == 19

def test_split_dependencies(en_tokenizer):
text = "LosAngeles start."
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens])
dep1 = doc.vocab.strings.add('amod')
dep2 = doc.vocab.strings.add('subject')
with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], [dep1, dep2])

assert doc[0].dep == dep1
assert doc[1].dep == dep2



def test_split_heads_error(en_tokenizer):
text = "LosAngeles start."
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens])
# Not enough heads
with pytest.raises(ValueError):
with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["Los", "Angeles"], [0])

# Too many heads
with pytest.raises(ValueError):
with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1, 0])

# No subtoken with head == 0 (no root)
with pytest.raises(ValueError):
with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["Los", "Angeles"], [1, 1])

# More than one subtoken with head == 0 (several roots)
with pytest.raises(ValueError):
with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["Los", "Angeles"], [0, 0])


def test_spans_entity_split_iob():
# Test entity IOB stays consistent after splitting
words = ["abc", "d", "e"]
doc = Doc(Vocab(), words=words)
doc.ents = [(doc.vocab.strings.add('ent-abcd'), 0, 2)]
assert doc[0].ent_iob_ == "B"
assert doc[1].ent_iob_ == "I"

with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["a", "b", "c"], [1, 1, 0])
assert doc[0].ent_iob_ == "B"
assert doc[1].ent_iob_ == "I"
assert doc[2].ent_iob_ == "I"
assert doc[3].ent_iob_ == "I"

def test_spans_sentence_update_after_split(en_tokenizer):
text = "StewartLee is a stand up comedian. He lives in England and loves JoePasquale."
heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2]
deps = ['nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
'compound', 'punct']

tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
sent1, sent2 = list(doc.sents)
init_len = len(sent1)
init_len2 = len(sent2)
with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["Stewart", "Lee"], [1, 0])
retokenizer.split(doc[13], ["Joe", "Pasquale"], [1, 0])
sent1, sent2 = list(doc.sents)
assert len(sent1) == init_len + 1
assert len(sent2) == init_len2 + 1
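A companion sketch for the dependency labels, mirroring `test_split_dependencies` above. Note that in this version of the API the labels are passed as string-store hashes, one per subtoken; the bare `Doc`/`Vocab` setup is again an assumption of the sketch.

from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=["LosAngeles", "start", "."])
amod = doc.vocab.strings.add("amod")
nsubj = doc.vocab.strings.add("nsubj")
with doc.retokenize() as retokenizer:
    # One dependency label per subtoken, in the same order as the orths
    retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], [amod, nsubj])
assert doc[0].dep_ == "amod"
assert doc[1].dep_ == "nsubj"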
120 changes: 116 additions & 4 deletions spacy/tokens/_retokenize.pyx
@@ -43,12 +43,12 @@ cdef class Retokenizer:
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
self.merges.append((span, attrs))

def split(self, Token token, orths, attrs=SimpleFrozenDict()):
def split(self, Token token, orths, heads, deps=[], attrs=SimpleFrozenDict()):
"""Mark a Token for splitting, into the specified orths. The attrs
will be applied to each subtoken.
"""
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
self.splits.append((token.start_char, orths, attrs))
self.splits.append((token.i, orths, heads, deps, attrs))

def __enter__(self):
self.merges = []
@@ -65,8 +65,12 @@
end = span.end
_merge(self.doc, start, end, attrs)

for start_char, orths, attrs in self.splits:
raise NotImplementedError
offset = 0
# Iterate in order, to keep the offset simple.
for token_index, orths, heads, deps, attrs in sorted(self.splits):
_split(self.doc, token_index + offset, orths, heads, deps, attrs)
# Adjust for the tokens created by the previous splits
offset += len(orths) - 1
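# For example, if token 0 is split into two subtokens and token 13 was also
# marked for splitting, the second split is applied at index 13 + 1 = 14,
# because the first split shifted every later token one slot to the right.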

def _merge(Doc doc, int start, int end, attributes):
"""Retokenize the document, such that the span at
@@ -279,3 +283,111 @@ def _bulk_merge(Doc doc, merges):

# Return the merged Python object
return doc[spans[0].start]


def _split(Doc doc, int token_index, orths, heads, deps, attrs):
"""Retokenize the document, such that the token at
`doc[token_index]` is split into tokens with the orth 'orths'
token_index(int): token index of the token to split.
orths: IDs of the verbatim text content of the tokens to create
**attributes: Attributes to assign to each of the newly created tokens. By default,
attributes are inherited from the original token.
RETURNS (Token): The first newly created token.
"""
cdef int nb_subtokens = len(orths)
cdef const LexemeC* lex
cdef TokenC* token
cdef TokenC orig_token = doc.c[token_index]

if len(heads) != nb_subtokens:
raise ValueError(Errors.E101)
token_head_index = -1
for index, head in enumerate(heads):
if head == 0:
if token_head_index != -1:
raise ValueError(Errors.E099)
token_head_index = index
if token_head_index == -1:
raise ValueError(Errors.E100)

# First, make the dependency heads absolute, and adjust all existing heads
# to the post-split indexing, before creating the new tokens

for i in range(doc.length):
doc.c[i].head += i
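# e.g. for "LosAngeles start ." with relative heads [1, 1, 0], the heads
# are now the absolute indices [1, 2, 2]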

# Adjust the heads to the post-split (absolute) indexing
offset = nb_subtokens - 1
for i in range(doc.length):
head_idx = doc.c[i].head
if head_idx == token_index:
# Tokens that pointed at the split token now point at its root subtoken
doc.c[i].head = token_index + token_head_index
elif head_idx > token_index:
doc.c[i].head += offset

new_token_head = doc.c[token_index].head

# Double doc.c max_length if necessary (until big enough for all new tokens)
while doc.length + nb_subtokens - 1 >= doc.max_length:
doc._realloc(doc.length * 2)

# Move tokens after the split to create space for the new tokens,
# iterating over the original indices so that no stale slot is copied
for token_to_move in range(doc.length - 1, token_index, -1):
doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move]
doc.length += nb_subtokens - 1
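# e.g. splitting token 0 of the 3-token "LosAngeles start ." into two
# subtokens moves "start" and "." one slot to the right, so that the two
# subtokens can be written into slots 0 and 1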

# Write the subtokens into the newly created space
cdef int idx_offset = 0
for i, orth in enumerate(orths):

token = &doc.c[token_index + i]
lex = doc.vocab.get(doc.mem, orth)
token.lex = lex
# Update the character offset of the subtokens
if i != 0:
token.idx = orig_token.idx + idx_offset
idx_offset += len(orth)

# Set token.spacy to False for all non-final subtokens, and
# to orig_token.spacy for the final one
if i < nb_subtokens - 1:
token.spacy = False
else:
token.spacy = orig_token.spacy

# Apply attrs to each subtoken
for attr_name, attr_value in attrs.items():
if attr_name == TAG:
doc.vocab.morphology.assign_tag(token, attr_value)
else:
Token.set_struct_attr(token, attr_name, attr_value)

# Make IOB consistent: if the original token begins an entity (B),
# only the first subtoken keeps B and the rest continue it (I)
if orig_token.ent_iob == 3:
if i == 0:
token.ent_iob = 3
else:
token.ent_iob = 1
else:
# In all other cases subtokens inherit ent_iob from orig_token
token.ent_iob = orig_token.ent_iob

# Point every subtoken at the original token's (adjusted) head.
# The non-root subtokens are re-assigned their own heads further down.
token.head = new_token_head

# Transform the dependencies into relative ones again
for i in range(doc.length):
doc.c[i].head -= i
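# e.g. for the split "Los Angeles start .", the absolute heads [2, 2, 3, 3]
# become the relative heads [2, 1, 1, 0] at this point; the loop below then
# rewrites "Los" to point at "Angeles"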

# Assign the relative heads to the subtokens
for i, head in enumerate(heads):
if head != 0:
# The root subtoken (head == 0) already points at the original token's head
doc.c[token_index + i].head = head

for i, dep in enumerate(deps):
doc[token_index + i].dep = dep

# Recalculate the left/right children from the new heads
set_children_from_heads(doc.c, doc.length)
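Finally, a small sketch of the character-offset behaviour the tests above rely on: the subtokens share the original token's character span, so the document text is unchanged by a split. The bare `Doc`/`Vocab` setup is again an assumption of the sketch.

from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=["LosAngeles", "start", "."])
text_before = doc.text
with doc.retokenize() as retokenizer:
    retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0])
# "Los" keeps the original character offset, "Angeles" starts right after it,
# and only the last subtoken keeps the original trailing-space flag.
assert doc[0].idx == 0
assert doc[1].idx == 3
assert doc.text == text_before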