Commit

Initial commit
dglttr committed Apr 26, 2021
0 parents commit d0b40e8
Showing 28 changed files with 3,199 additions and 0 deletions.
19 changes: 19 additions & 0 deletions .gitignore
@@ -0,0 +1,19 @@
# virtualenv
venv/
.venv/

# auto-created .idea directory
.idea/

# auto-created .vscode directory
.vscode/

# Build directories
dist/
build/
scrawler.egg-info/

# Pycaches
scrawler/__pycache__/
scrawler/backends/__pycache__/
scrawler/utils/__pycache__/
22 changes: 22 additions & 0 deletions LICENSE
@@ -0,0 +1,22 @@
MIT License

Copyright (c) 2021 Daniel Glatter
Developed while working at Fraunhofer Institute for Systems and Innovation Research.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
332 changes: 332 additions & 0 deletions README.md

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions scrawler/__init__.py
@@ -0,0 +1,9 @@
import logging

from scrawler.crawling import Crawler
from scrawler.scraping import Scraper
from scrawler.website import Website

__all__ = ["Crawler", "Scraper", "Website"]

logging.getLogger('readability').setLevel(logging.CRITICAL) # to avoid a lot of noise in the output
165 changes: 165 additions & 0 deletions scrawler/attributes.py
@@ -0,0 +1,165 @@
"""Specifies the attribute objects used by crawlers and scrapers."""
from typing import Tuple, Union, Callable
from inspect import signature
import os

import pandas as pd

from scrawler.defaults import (DEFAULT_CSV_ENCODING, DEFAULT_CSV_SEPARATOR, DEFAULT_CSV_QUOTING, DEFAULT_CSV_ESCAPECHAR,
                               DEFAULT_PAUSE_TIME)
from scrawler.website import Website
from scrawler.data_extractors import BaseExtractor
from scrawler.utils.web_utils import is_same_host, extract_same_host_pattern


class SearchAttributes:
    def __init__(self, *args: BaseExtractor, validate: bool = True):
        """Specify which data to collect/search for in the website.
        :param args: Data extractors (see scrawler.data_extractors for possibilities).
        :param validate: Whether to make sure that input parameters are valid.
        """
        if validate:
            for extractor in args:
                if not isinstance(extractor, BaseExtractor):
                    raise TypeError(f"{extractor.__class__} does not inherit from BaseExtractor.")

        self.attributes: Tuple[BaseExtractor] = args
        self.n_return_values: int = sum([extractor.n_return_values for extractor in self.attributes])

    def extract_all_attrs_from_website(self, website: Website, index: int = None) -> list:
        """Extract data from a website using the data extractors specified in the SearchAttributes definition.
        :param website: Website object to collect the data points specified in search_attrs from.
        :param index: Optionally pass an index for data extractors that index into passed parameters.
        """
        extracted_data = []

        for extractor in self.attributes:
            # Case handling for extractors using an index
            if (index is not None) and extractor.dynamic_parameters:
                result = extractor.run(website, index)
            else:
                result = extractor.run(website)

            # Case handling for extractors with multiple return values
            if extractor.n_return_values != 1:
                extracted_data.extend(result)
            else:
                extracted_data.append(result)

        return extracted_data


class ExportAttributes:
    def __init__(self, directory: str, fn: Union[str, list],
                 header: Union[list, str, bool] = None, encoding: str = DEFAULT_CSV_ENCODING,
                 separator: str = DEFAULT_CSV_SEPARATOR, quoting: int = DEFAULT_CSV_QUOTING,
                 escapechar: str = DEFAULT_CSV_ESCAPECHAR, validate: bool = True, **kwargs):
        """Specify how and where to export the collected data.
        :param directory: Folder where the file(s) will be saved to.
        :param fn: Name(s) of the file(s) containing the crawled data, without file extension.
        :param header: Whether the final CSV file should have a header. Possible values:
            If None or False, no header will be written.
            Use "first-row" to use the first row of data as header.
            Else, pass a list of strings of appropriate length.
        :param encoding: Encoding to use to create the CSV file.
        :param separator: Column separator or delimiter to use for creating the CSV file.
        :param quoting: Puts quotes around cells that contain the separator character.
        :param escapechar: Escapes the separator character.
        :param validate: Whether to make sure that input parameters are valid.
        :param kwargs: Any parameter supported by pandas.DataFrame.to_csv() can be passed. See the pandas documentation: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html
        """
        if validate:
            # Check that the export directory exists
            if not os.path.isdir(directory):
                raise NotADirectoryError(f"Export directory does not exist on this system ({directory}).")

            # Check that keyword arguments are allowed for pandas.DataFrame.to_csv()
            for key, value in kwargs.items():
                if key not in signature(pd.DataFrame.to_csv).parameters:
                    raise ValueError(f'Invalid keyword argument passed to ExportAttributes: "{key}"')

        self.directory = directory
        self.fn = fn  # Filename(s)

        self.header = header
        self.encoding = encoding
        self.separator = separator
        self.quoting = quoting
        self.escapechar = escapechar

        for key, value in kwargs.items():  # Add keyword arguments as attributes
            self.__setattr__(key, value)


class CrawlingAttributes:
    def __init__(self,
                 filter_non_standard_schemes: bool = True,
                 filter_media_files: bool = True,
                 blocklist: tuple = (),
                 filter_foreign_urls: Union[str, Callable] = "auto",
                 strip_url_parameters: bool = False,
                 strip_url_fragments: bool = True,

                 max_no_urls: int = None,
                 max_distance_from_start_url: int = None,
                 max_subdirectory_depth: int = None,

                 pause_time: float = DEFAULT_PAUSE_TIME,
                 respect_robots_txt: bool = True,

                 validate: bool = True
                 ):
        """Specify how to conduct the crawling, e.g. how to filter irrelevant URLs or limit the number of URLs crawled.
        :param filter_non_standard_schemes: Filter URLs starting with schemes other than http: or https: (e.g., mailto: or javascript:).
        :param filter_media_files: Whether to filter media files. Recommended: True to avoid long runtimes.
        :param blocklist: Filter URLs that contain one or more of the parts specified here. Has to be a list.
        :param filter_foreign_urls: Filter URLs that do not belong to the same host (foreign URLs).
            Can either be a string that is passed to is_same_host(), or a custom callable that takes two arguments, `url1` and `url2`.
            The following string values are permitted:
            If "auto", a matching pattern will be extracted from the start URL (will be either `fld` or `subdomainX`, see below).
            Alternatively, you can manually set the mode to one of these options:
            Either any one of the attributes of the ParsedUrl class (e.g. `domain`, `hostname`, `fld`).
            Or `subdomainX`, with `X` representing an integer number up to which subdomain the URLs should be compared. E.g., comparing http://www.sub.example.com and http://blog.sub.example.com, 'sub' is the first level, while the second levels are 'www' and 'blog', respectively.
            Or `directoryX`, with `X` representing an integer number up to which directory the URLs should be compared. E.g., for http://example.com/dir1/dir2/index.html, `directory2` would include all files in "dir2".
        :param strip_url_parameters: Whether to strip URL query parameters (prefixed by '?') from the URL.
        :param strip_url_fragments: Whether to strip URL fragments (prefixed by '#') from the URL.
        :param max_no_urls: Maximum number of URLs to be crawled per domain (safety limit for very large crawls). Set to None if you want all URLs to be crawled.
        :param max_distance_from_start_url: Maximum number of links that have to be followed to arrive at a certain URL from the start_url.
        :param max_subdirectory_depth: Maximum sub-level of the host up to which to crawl. E.g., consider this schema: hostname/sub-directory1/sub-siteA.
            If you want to crawl all URLs of the same level as 'sub-directory1', specify 1.
            sub-siteA will then not be found, but a site hostname/sub-directory2 or hostname/sub-siteB will be.
        :param pause_time: Time to wait between the crawling of two URLs (in seconds). This pause is mostly to avoid being flagged as a spammer by websites.
        :param respect_robots_txt: Whether to respect the specifications made in the website's robots.txt file.
        :param validate: Whether to make sure that input parameters are valid.
        """
        if validate:
            # Check that a valid input is passed to parameter filter_foreign_urls
            TEST_URL = "https://www.example.com"
            try:
                if not isinstance(filter_foreign_urls, Callable):
                    test_mode = extract_same_host_pattern(TEST_URL) if (filter_foreign_urls == "auto") else filter_foreign_urls
                    assert is_same_host(TEST_URL, TEST_URL, mode=test_mode), "is_same_host() should be True if the same URL is used."
                else:
                    assert filter_foreign_urls(TEST_URL, TEST_URL), f"Error when testing your custom foreign URL filter function ({filter_foreign_urls.__name__}): Should be True if the same URL is used for both input arguments."
            except (ValueError, TypeError, AssertionError) as e:
                raise ValueError(f"Parameter filter_foreign_urls is not correctly specified: {filter_foreign_urls}. The following error occurred during validation: {e}")

        self.filter_non_standard_schemes = filter_non_standard_schemes
        self.filter_media = filter_media_files
        self.blocklist = blocklist
        self.filter_foreign_urls = filter_foreign_urls
        self.strip_url_parameters = strip_url_parameters
        self.strip_url_fragments = strip_url_fragments

        self.max_no_urls = max_no_urls if (max_no_urls is not None) else float("inf")
        self.max_distance_from_start_url = max_distance_from_start_url if (max_distance_from_start_url is not None) else float("inf")
        self.max_subdirectory_depth = max_subdirectory_depth if (max_subdirectory_depth is not None) else float("inf")

        self.pause_time = pause_time
        self.respect_robots_txt = respect_robots_txt
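
To make the three constructors above concrete, here is a minimal usage sketch (not part of the commit). It relies only on what this file shows; the directory, file name, extra to_csv keyword, and the custom same_hostname filter are illustrative assumptions, and no data extractors are instantiated because their concrete classes (subclasses of BaseExtractor in scrawler.data_extractors) live elsewhere in the repository.

    from urllib.parse import urlparse

    from scrawler.attributes import SearchAttributes, ExportAttributes, CrawlingAttributes


    def same_hostname(url1: str, url2: str) -> bool:
        # A custom foreign-URL filter: any callable taking two URLs and returning
        # True when they should be treated as belonging to the same host.
        return urlparse(url1).hostname == urlparse(url2).hostname


    # SearchAttributes would receive BaseExtractor instances from scrawler.data_extractors;
    # none are constructed here because those classes are not part of this file.
    export_attrs = ExportAttributes(
        directory=".",                 # must exist, otherwise NotADirectoryError is raised
        fn="crawl_results",            # file name without extension
        header=["url", "title"],
        date_format="%Y-%m-%d",        # extra kwargs are checked against pandas.DataFrame.to_csv()
    )

    crawling_attrs = CrawlingAttributes(
        filter_foreign_urls=same_hostname,  # validated by calling it with the same URL twice
        max_no_urls=1000,                   # None would be stored as float("inf")
        pause_time=0.5,
    )
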
2 changes: 2 additions & 0 deletions scrawler/backends/__init__.py
@@ -0,0 +1,2 @@
ASYNCIO = "asyncio"
MULTITHREADING = "multithreading"