-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
/
fix_space_entities.py
27 lines (24 loc) · 940 Bytes
/
fix_space_entities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
'''Demonstrate adding a rule-based component that forces some tokens to not
be entities, before the NER tagger is applied. This is used to hotfix the issue
in https://github.com/explosion/spaCy/issues/2870 , present as of spaCy v2.0.16.
'''
import spacy
from spacy.attrs import ENT_IOB
def fix_space_tags(doc):
    """Mark every whitespace token in ``doc`` as being outside any entity.

    The document's ENT_IOB column is exported, the entries belonging to
    tokens whose ``is_space`` flag is set are overwritten with the 'O'
    code, and the column is loaded back into the document.

    doc: the document to patch; it is modified in place.
    RETURNS: the same ``doc`` object, so the function can be used as a
        pipeline component.
    """
    # ENT_IOB codes: 0 means "no value", 1 is 'I', 2 is 'O'.
    outside_code = 2

    iob_column = doc.to_array([ENT_IOB])
    space_positions = [pos for pos, tok in enumerate(doc) if tok.is_space]
    for pos in space_positions:
        iob_column[pos] = outside_code

    # from_array is handed one row per token and one column per attribute.
    as_single_column = iob_column.reshape((len(doc), 1))
    doc.from_array([ENT_IOB], as_single_column)
    return doc
def main():
    """Print the entities found in a sample sentence before and after the fix.

    Loads the 'en_core_web_sm' model, prints the entities of the sample
    text, registers ``fix_space_tags`` under the name 'fix-ner' ahead of the
    'ner' component, then processes the same text again and prints the
    entities once more.
    """
    pipeline = spacy.load('en_core_web_sm')
    sample = u'''This is some crazy test where I dont need an Apple Watch to make things bug'''

    # Baseline run: the stock pipeline, without the rule-based component.
    print('Before', pipeline(sample).ents)

    # Insert the fix so that it runs before the NER tagger sees the doc.
    pipeline.add_pipe(fix_space_tags, name='fix-ner', before='ner')
    print('After', pipeline(sample).ents)
# Run the demonstration only when executed as a script, not when imported.
if __name__ == '__main__':
    main()