From 6e71a9f09abfad572b7d56108ca61c0345be45d8 Mon Sep 17 00:00:00 2001 From: Dennis Aumiller Date: Fri, 3 Feb 2023 16:58:33 +0100 Subject: [PATCH 01/10] Add test case to check whether empty root processing is correct. --- tests/local_test.py | 0 tests/test_EntityLinker.py | 10 ++++++++++ 2 files changed, 10 insertions(+) create mode 100644 tests/local_test.py diff --git a/tests/local_test.py b/tests/local_test.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_EntityLinker.py b/tests/test_EntityLinker.py index 0120dd5..9a00ee0 100644 --- a/tests/test_EntityLinker.py +++ b/tests/test_EntityLinker.py @@ -22,3 +22,13 @@ def test_initialization(self): sent._.linkedEntities.pretty_print() self.nlp.remove_pipe("entityLinker") + + def test_empty_root(self): + self.nlp.add_pipe("entityLinker", last=True) + + doc = self.nlp( + 'I was right."\n\n "To that extent."\n\n "But that was all."\n\n "No, no, m') + for sent in doc.sents: + sent._.linkedEntities.pretty_print() + + self.nlp.remove_pipe("entityLinker") From 89c4038dfa910dace36f988bea0484a368d9a8d2 Mon Sep 17 00:00:00 2001 From: Dennis Aumiller Date: Fri, 3 Feb 2023 17:01:06 +0100 Subject: [PATCH 02/10] Move installation instructions further up. --- README.md | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 964bd6b..8054ee8 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,22 @@ It also comes along with a number of disadvantages: - no context sensitivity due to the implementation of the "max-prior method" for entitiy disambiguation (an improved method for this is in progress) + +## Installation + +To install the package, run: +```bash +pip install spacy-entity-linker +``` + +Afterwards, the knowledge base (Wikidata) must be downloaded. 
This can be done by calling + +```bash +python -m spacy_entity_linker "download_knowledge_base" +``` + +This will download and extract a ~1.3GB file that contains a preprocessed version of Wikidata. + ## Use ```python @@ -138,16 +154,6 @@ The current implementation supports only Sqlite. This is advantageous for develo any special setup and configuration. However, for more performance critical usecases, a different database with in-memory access (e.g. Redis) should be used. This may be implemented in the future. -## Installation - -To install the package run: pip install spacy-entity-linker - -Afterwards, the knowledge base (Wikidata) must be downloaded. This can be done by calling - -python -m spacy_entity_linker "download_knowledge_base" - -This will download and extract a ~500mb file that contains a preprocessed version of Wikidata - ## Data the knowledge base was derived from this dataset: https://www.kaggle.com/kenshoresearch/kensho-derived-wikimedia-data From e2a265aebc3292e9b7bafe547aa02363f286f613 Mon Sep 17 00:00:00 2001 From: Dennis Aumiller Date: Fri, 3 Feb 2023 17:14:48 +0100 Subject: [PATCH 03/10] Refactor of the __main__.py file. 1. Adds a download progress bar to get response during the call, which helps especially with slower connections to recognize timeouts etc. 2. Separates out the logic for download_knowledge_base(), which effectively allows downloads to be called from other parts of the library. 3. More expressive error messaging in case attributes passed to calls like python -m spacy_entity_linker are incorrect. 
--- spacy_entity_linker/__main__.py | 60 ++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/spacy_entity_linker/__main__.py b/spacy_entity_linker/__main__.py index e6a30d2..6ba5725 100644 --- a/spacy_entity_linker/__main__.py +++ b/spacy_entity_linker/__main__.py @@ -1,28 +1,48 @@ +import sys +import tarfile +import urllib.request +import tqdm +import os + + +class DownloadProgressBar(tqdm.tqdm): + """ + Code taken from https://stackoverflow.com/questions/15644964/python-progress-bar-and-downloads + """ + def update_to(self, chunk_id=1, max_chunk_size=1, total_size=None): + if total_size is not None: + self.total = total_size + self.update(chunk_id * max_chunk_size - self.n) + + +def download_knowledge_base( + file_url="https://huggingface.co/MartinoMensio/spaCy-entity-linker/resolve/main/knowledge_base.tar.gz" +): + OUTPUT_TAR_FILE = os.path.abspath( + os.path.dirname(__file__)) + '/../data_spacy_entity_linker/wikidb_filtered.tar.gz' + OUTPUT_DB_PATH = os.path.abspath(os.path.dirname(__file__)) + '/../data_spacy_entity_linker' + if not os.path.exists(OUTPUT_DB_PATH): + os.makedirs(OUTPUT_DB_PATH) + with DownloadProgressBar(unit='B', unit_scale=True, miniters=1) as dpb: + urllib.request.urlretrieve(file_url, filename=OUTPUT_TAR_FILE, reporthook=dpb.update_to) + + tar = tarfile.open(OUTPUT_TAR_FILE) + tar.extractall(OUTPUT_DB_PATH) + tar.close() + + os.remove(OUTPUT_TAR_FILE) + + if __name__ == "__main__": - import sys - import urllib - import urllib.request - import tarfile - import os if len(sys.argv) < 2: - print("No arguments given") + print("No arguments given.") pass command = sys.argv.pop(1) if command == "download_knowledge_base": - FILE_URL = "https://huggingface.co/MartinoMensio/spaCy-entity-linker/resolve/main/knowledge_base.tar.gz" - - OUTPUT_TAR_FILE = os.path.abspath( - os.path.dirname(__file__)) + '/../data_spacy_entity_linker/wikidb_filtered.tar.gz' - OUTPUT_DB_PATH = 
os.path.abspath(os.path.dirname(__file__)) + '/../data_spacy_entity_linker' - if not os.path.exists(OUTPUT_DB_PATH): - os.makedirs(OUTPUT_DB_PATH) - urllib.request.urlretrieve(FILE_URL, OUTPUT_TAR_FILE) - - tar = tarfile.open(OUTPUT_TAR_FILE) - tar.extractall(OUTPUT_DB_PATH) - tar.close() - - os.remove(OUTPUT_TAR_FILE) + download_knowledge_base() + else: + raise ValueError("Unrecognized command given. If you are trying to install the knowledge base, run " + "'python -m spacy_entity_linker \"download_knowledge_base\"'.") From 399b6d2bed08853f382439cbf1f053bf2e665494 Mon Sep 17 00:00:00 2001 From: Dennis Aumiller Date: Fri, 3 Feb 2023 17:15:02 +0100 Subject: [PATCH 04/10] Required updates to requirements for download bar to work. --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ecf975e..a0dc9ff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ --e . \ No newline at end of file +-e . +tqdm \ No newline at end of file From ed84ebf7014dab71e26fd00df613a9d845672ed9 Mon Sep 17 00:00:00 2001 From: Martino Mensio Date: Fri, 3 Feb 2023 17:16:14 +0100 Subject: [PATCH 05/10] Delete local_test.py --- tests/local_test.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/local_test.py diff --git a/tests/local_test.py b/tests/local_test.py deleted file mode 100644 index e69de29..0000000 From c62d2d7f5541994d2707fc7afba0e33c62b42f02 Mon Sep 17 00:00:00 2001 From: Dennis Aumiller Date: Fri, 3 Feb 2023 17:17:20 +0100 Subject: [PATCH 06/10] Refactor of DatabaseConnection. 1. Automatically attempts to resolve errors that are likely caused by a missing KB; will download and then reconnect. 2. Also refactors some of the critical patterns, e.g., mutable default arguments, None-checking and use of internal attribute names (property). 
--- README.md | 3 +- spacy_entity_linker/DatabaseConnection.py | 39 ++++++++++++++++------- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 8054ee8..0ae7ca7 100644 --- a/README.md +++ b/README.md @@ -30,12 +30,13 @@ To install the package, run: pip install spacy-entity-linker ``` -Afterwards, the knowledge base (Wikidata) must be downloaded. This can be done by calling +Afterwards, the knowledge base (Wikidata) must be downloaded. This can either be done by manually calling ```bash python -m spacy_entity_linker "download_knowledge_base" ``` +or when you first access the entity linker through spacy. This will download and extract a ~1.3GB file that contains a preprocessed version of Wikidata. ## Use diff --git a/spacy_entity_linker/DatabaseConnection.py b/spacy_entity_linker/DatabaseConnection.py index 400f52c..1ec90af 100644 --- a/spacy_entity_linker/DatabaseConnection.py +++ b/spacy_entity_linker/DatabaseConnection.py @@ -1,6 +1,8 @@ import sqlite3 import os +from .__main__ import download_knowledge_base + MAX_DEPTH_CHAIN = 10 P_INSTANCE_OF = 31 P_SUBCLASS = 279 @@ -11,7 +13,7 @@ entity_cache = {} chain_cache = {} -DB_DEFAULT_PATH = os.path.abspath(__file__ + '/../../data_spacy_entity_linker/wikidb_filtered.db') +DB_DEFAULT_PATH = os.path.abspath(os.path.join(__file__, "../../data_spacy_entity_linker/wikidb_filtered.db")) wikidata_instance = None @@ -49,7 +51,13 @@ def _add_to_cache(self, cache_type, key, value): self.cache[cache_type][key] = value def init_database_connection(self, path=DB_DEFAULT_PATH): - self.conn = sqlite3.connect(path) + try: + self.conn = sqlite3.connect(path) + except sqlite3.OperationalError: + # Automatically download the knowledge base if it isn't already + download_knowledge_base() + # ... 
and retry the connection after completion + self.conn = sqlite3.connect(path) def clear_cache(self): self.cache["entity"].clear() @@ -61,9 +69,9 @@ def get_entities_from_alias(self, alias): if self._is_cached("entity", alias): return self._get_cached_value("entity", alias).copy() - query_alias = """SELECT j.item_id,j.en_label, j.en_description,j.views,j.inlinks,a.en_alias from aliases as a - LEFT JOIN joined as j ON a.item_id = j.item_id - WHERE a.en_alias_lowercase = ? and j.item_id NOT NULL""" + query_alias = """SELECT j.item_id,j.en_label, j.en_description,j.views,j.inlinks,a.en_alias + FROM aliases as a LEFT JOIN joined as j ON a.item_id = j.item_id + WHERE a.en_alias_lowercase = ? AND j.item_id NOT NULL""" c.execute(query_alias, [alias.lower()]) fetched_rows = c.fetchall() @@ -92,7 +100,7 @@ def get_entity_name(self, item_id): res = c.fetchone() if res and len(res): - if res[0] == None: + if res[0] is None: self._add_to_cache("name", item_id, 'no label') else: self._add_to_cache("name", item_id, res[0]) @@ -148,10 +156,14 @@ def get_recursive_edges(self, item_id): self._append_chain_elements(self, item_id, 0, chain, edges) return edges - def _append_chain_elements(self, item_id, level=0, chain=[], edges=[], max_depth=10, property=P_INSTANCE_OF): - properties = property - if type(property) != list: - properties = [property] + def _append_chain_elements(self, item_id, level=0, chain=None, edges=None, max_depth=10, prop=P_INSTANCE_OF): + if chain is None: + chain = [] + if edges is None: + edges = [] + properties = prop + if type(prop) != list: + properties = [prop] if self._is_cached("chain", (item_id, max_depth)): chain += self._get_cached_value("chain", (item_id, max_depth)).copy() @@ -176,9 +188,12 @@ def _append_chain_elements(self, item_id, level=0, chain=[], edges=[], max_depth if not (target_item[0] in chain_ids): chain += [(target_item[0], level + 1)] edges.append((item_id, target_item[0], target_item[1])) - self._append_chain_elements(target_item[0], 
level=level + 1, chain=chain, edges=edges, + self._append_chain_elements(target_item[0], + level=level + 1, + chain=chain, + edges=edges, max_depth=max_depth, - property=property) + prop=prop) self._add_to_cache("chain", (item_id, max_depth), chain) From f77cdb14618fed6b6a4974278817298f15e7b74e Mon Sep 17 00:00:00 2001 From: Martino Mensio Date: Fri, 3 Feb 2023 17:37:30 +0100 Subject: [PATCH 07/10] added another example (empty document) that was failing before --- tests/test_EntityLinker.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_EntityLinker.py b/tests/test_EntityLinker.py index 9a00ee0..1fe3641 100644 --- a/tests/test_EntityLinker.py +++ b/tests/test_EntityLinker.py @@ -24,11 +24,16 @@ def test_initialization(self): self.nlp.remove_pipe("entityLinker") def test_empty_root(self): + # test empty lists of roots (#9) self.nlp.add_pipe("entityLinker", last=True) doc = self.nlp( 'I was right."\n\n "To that extent."\n\n "But that was all."\n\n "No, no, m') for sent in doc.sents: sent._.linkedEntities.pretty_print() + # empty document + doc = self.nlp('\n\n') + for sent in doc.sents: + sent._.linkedEntities.pretty_print() self.nlp.remove_pipe("entityLinker") From 54c2eb88a76a09c94f0e89121a56a947b0846795 Mon Sep 17 00:00:00 2001 From: Martino Mensio Date: Fri, 3 Feb 2023 17:58:22 +0100 Subject: [PATCH 08/10] moved tqdm to setup.py setup.py is the default when installing from pip, instead requirements.txt is more for development requirements --- requirements.txt | 3 +-- setup.py | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index a0dc9ff..ecf975e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1 @@ --e . -tqdm \ No newline at end of file +-e . 
\ No newline at end of file diff --git a/setup.py b/setup.py index d4de19d..c8e1d57 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,8 @@ def open_file(fname): zip_safe=True, install_requires=[ 'spacy>=3.0.0', - 'numpy>=1.0.0' + 'numpy>=1.0.0', + 'tqdm' ], entry_points={ 'spacy_factories': 'entityLinker = spacy_entity_linker.EntityLinker:EntityLinker' From b97a46f96694c4048db74e2479097cfbfc1e2d02 Mon Sep 17 00:00:00 2001 From: Martino Mensio Date: Fri, 3 Feb 2023 18:07:33 +0100 Subject: [PATCH 09/10] check if database exists and download, instead of catching sqlite3.OperationalError --- spacy_entity_linker/DatabaseConnection.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/spacy_entity_linker/DatabaseConnection.py b/spacy_entity_linker/DatabaseConnection.py index 1ec90af..f72e4f3 100644 --- a/spacy_entity_linker/DatabaseConnection.py +++ b/spacy_entity_linker/DatabaseConnection.py @@ -51,13 +51,11 @@ def _add_to_cache(self, cache_type, key, value): self.cache[cache_type][key] = value def init_database_connection(self, path=DB_DEFAULT_PATH): - try: - self.conn = sqlite3.connect(path) - except sqlite3.OperationalError: + # check if the database exists + if not os.path.exists(DB_DEFAULT_PATH): # Automatically download the knowledge base if it isn't already download_knowledge_base() - # ... 
and retry the connection after completion - self.conn = sqlite3.connect(path) + self.conn = sqlite3.connect(path) def clear_cache(self): self.cache["entity"].clear() From 8fadfd914642efb1048eb06eaef401e899044c71 Mon Sep 17 00:00:00 2001 From: Martino Mensio Date: Fri, 3 Feb 2023 18:08:02 +0100 Subject: [PATCH 10/10] description of download --- spacy_entity_linker/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy_entity_linker/__main__.py b/spacy_entity_linker/__main__.py index 6ba5725..85ba6a5 100644 --- a/spacy_entity_linker/__main__.py +++ b/spacy_entity_linker/__main__.py @@ -23,7 +23,7 @@ def download_knowledge_base( OUTPUT_DB_PATH = os.path.abspath(os.path.dirname(__file__)) + '/../data_spacy_entity_linker' if not os.path.exists(OUTPUT_DB_PATH): os.makedirs(OUTPUT_DB_PATH) - with DownloadProgressBar(unit='B', unit_scale=True, miniters=1) as dpb: + with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc='Downloading knowledge base') as dpb: urllib.request.urlretrieve(file_url, filename=OUTPUT_TAR_FILE, reporthook=dpb.update_to) tar = tarfile.open(OUTPUT_TAR_FILE)