diff --git a/.gitignore b/.gitignore index 7bbc71c..df61408 100644 --- a/.gitignore +++ b/.gitignore @@ -99,3 +99,4 @@ ENV/ # mypy .mypy_cache/ +.idea/ diff --git a/README.md b/README.md index 76ce1fb..af2505e 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,17 @@ # spaCy-CLD: Bringing simple language detection to spaCy -This package is a [spaCy 2.0 extension](https://spacy.io/usage/processing-pipelines#section-extensions) that adds language detection to spaCy's text processing pipeline. Inspired from a discussion [here](https://github.com/explosion/spaCy/issues/1172). +This package is a +[spaCy 2.0 extension](https://spacy.io/usage/processing-pipelines#section-extensions) +that adds language detection to spaCy's text processing pipeline. +Inspired by a discussion [here](https://github.com/explosion/spaCy/issues/1172). ## Installation -`pip install spacy_cld` +`python setup.py install` + +If you can't compile it, retry after + +`export CFLAGS="-Wno-narrowing"` ## Usage @@ -23,10 +30,20 @@ doc._.languages # ['en'] doc._.language_scores['en'] # 0.96 ``` -spaCy-CLD operates on `Doc` and `Span` spaCy objects. When called on a `Doc` or `Span`, the object is given two attributes: `languages` (a list of up to 3 language codes) and `language_scores` (a dictionary mapping language codes to confidence scores between 0 and 1). +spaCy-CLD operates on `Doc` and `Span` spaCy objects. When called on a `Doc` or `Span`, +the object is given two attributes: `languages` (a list of up to 3 language codes) +and `language_scores` (a dictionary mapping language codes to confidence scores between +0 and 1). ## Under the hood -spacy-cld is a little extension that wraps the [PYCLD2](https://github.com/aboSamoor/pycld2) Python library, which in turn wraps the [Compact Language Detector 2](https://github.com/CLD2Owners/cld2) C library originally built at Google for the Chromium project. 
CLD2 uses character n-grams as features and a Naive Bayes classifier to identify 80+ languages from Unicode text strings (or XML/HTML). It can detect up to 3 different languages in a given document, and reports a confidence score (reported in with each language. +spacy-cld is a little extension that wraps the +[CLD2-CFFI](https://github.com/GregBowyer/cld2-cffi) Python library, which in turn +wraps the [Compact Language Detector 2](https://github.com/CLD2Owners/cld2) +C++ library originally built at Google for the Chromium project. +CLD2 uses character n-grams as features and a Naive Bayes classifier to identify +80+ languages from Unicode text strings (or XML/HTML). +It can detect up to 3 different languages in a given document, and reports +a confidence score (between 0 and 1) with each language. -For additional details, see the linked project pages for PYCLD2 and CLD2. +For additional details, see the linked project pages for CLD2-CFFI and CLD2. diff --git a/requirements.txt b/requirements.txt index 72ce02b..684cd3c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ +cld2-cffi>=0.1.4 spacy>=2.0.0 -pycld2==0.31 diff --git a/setup.py b/setup.py index de9fded..52de177 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ def setup_package(): packages=find_packages(), install_requires=[ 'spacy>=2.0.0,<3.0.0', - 'pycld2>=0.31'], + 'cld2-cffi>=0.1.4'], zip_safe=False, ) diff --git a/spacy_cld/spacy_cld.py b/spacy_cld/spacy_cld.py index 3451b31..8d4d14a 100644 --- a/spacy_cld/spacy_cld.py +++ b/spacy_cld/spacy_cld.py @@ -1,4 +1,4 @@ -from pycld2 import detect, error as pycld_error +from cld2 import detect from spacy.tokens import Doc, Span @@ -19,7 +19,7 @@ def get_scores(text, cld_results=None): def detect_languages(text): try: _, _, results = detect(text.text) - except pycld_error as err: + except (ValueError, MemoryError): results = [[None, "error", 0.0, None]] return results