From 680d615bb3f3662ce2a9c9cbe0bcbe1561077746 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 5 Apr 2020 17:47:04 +0200 Subject: [PATCH 1/3] Use inline flags in token_match patterns Use inline flags in `token_match` patterns so that serializing does not lose the flag information. --- spacy/lang/fr/tokenizer_exceptions.py | 2 +- spacy/lang/tokenizer_exceptions.py | 2 +- spacy/tokenizer.pyx | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index dfcb2756e4f..7899cfc9bdb 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -461,5 +461,5 @@ def lower_first_letter(text): TOKENIZER_EXCEPTIONS = _exc TOKEN_MATCH = re.compile( - "|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE | re.UNICODE + "(?iu:" + "|".join("(?:{})".format(m) for m in _regular_exp) + ")" ).match diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 385afb8bd3d..0de554f9a7a 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -58,7 +58,7 @@ # fmt: on ).strip() -TOKEN_MATCH = re.compile(URL_PATTERN, re.UNICODE).match +TOKEN_MATCH = re.compile("(?u:" + URL_PATTERN + ")").match BASE_EXCEPTIONS = {} diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 4da08125976..62b8bbf4a8e 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -567,7 +567,7 @@ cdef class Tokenizer: )) exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) - for key in ["prefix_search", "suffix_search", "infix_finditer"]: + for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match"]: if key in data: data[key] = unescape_unicode(data[key]) if "prefix_search" in data and isinstance(data["prefix_search"], basestring_): From 8eb23dd4e4de63ae2d0b81e76a55cd911750bcb8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 6 Apr 2020 09:59:08 +0200 Subject: [PATCH 2/3] Modify inline flag --- spacy/lang/tokenizer_exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 0de554f9a7a..29ce754429c 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -58,7 +58,7 @@ # fmt: on ).strip() -TOKEN_MATCH = re.compile("(?u:" + URL_PATTERN + ")").match +TOKEN_MATCH = re.compile("(?u)" + URL_PATTERN).match BASE_EXCEPTIONS = {} From c1cf5da232756179a4e26f161b3ee75e4010f37a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 6 Apr 2020 10:03:52 +0200 Subject: [PATCH 3/3] Modify inline flag --- spacy/lang/fr/tokenizer_exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index 7899cfc9bdb..cb17023009e 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -461,5 +461,5 @@ def lower_first_letter(text): TOKENIZER_EXCEPTIONS = _exc TOKEN_MATCH = re.compile( - "(?iu:" + "|".join("(?:{})".format(m) for m in _regular_exp) + ")" + "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp) ).match