diff --git a/README.md b/README.md index 8820792..da014ec 100644 --- a/README.md +++ b/README.md @@ -106,6 +106,8 @@ This instruction is originally written by [@lahoffm](https://github.com/lahoffm) - multiple_tables (bool, optional): - (Experimental) Extract multiple tables. - This option uses JSON as an intermediate format, so if tabula-java output format will change, this option doesn't work. +- user_agent (str, optional): + - Set a custom User-Agent header when downloading a PDF from a URL. Otherwise the default urllib.request User-Agent is used. ## FAQ diff --git a/tabula/file_util.py b/tabula/file_util.py index e6fa311..dc5c0ff 100644 --- a/tabula/file_util.py +++ b/tabula/file_util.py @@ -6,12 +6,12 @@ PY3 = sys.version_info[0] >= 3 if PY3: - from urllib.request import urlopen + from urllib.request import urlopen, Request from urllib.parse import urlparse as parse_url from urllib.parse import uses_relative, uses_netloc, uses_params text_type = str else: - from urllib2 import urlopen + from urllib2 import urlopen, Request from urlparse import urlparse as parse_url from urlparse import uses_relative, uses_netloc, uses_params text_type = unicode @@ -21,7 +21,7 @@ _VALID_URLS.discard('') -def localize_file(path_or_buffer): +def localize_file(path_or_buffer, user_agent=None): '''Ensure localize target file. If the target file is remote, this function fetches into local storage. 
@@ -38,7 +38,10 @@ def localize_file(path_or_buffer): path_or_buffer = _stringify_path(path_or_buffer) if _is_url(path_or_buffer): - req = urlopen(path_or_buffer) + if user_agent: + req = urlopen(_create_request(path_or_buffer, user_agent)) + else: + req = urlopen(path_or_buffer) filename = os.path.basename(req.geturl()) if os.path.splitext(filename)[-1] is not ".pdf": pid = os.getpid() @@ -71,6 +74,10 @@ def _is_url(url): return False +def _create_request(path_or_buffer, user_agent): + req_headers = {'User-Agent': user_agent} + return Request(path_or_buffer, headers=req_headers) + def is_file_like(obj): '''Check file like object diff --git a/tabula/wrapper.py b/tabula/wrapper.py index 37801fe..0dabaf9 100644 --- a/tabula/wrapper.py +++ b/tabula/wrapper.py @@ -127,7 +127,9 @@ def read_pdf(input_path, if not any(filter(r.find, java_options)): java_options = java_options + ['-Dfile.encoding=UTF8'] - path, temporary = localize_file(input_path) + user_agent = kwargs.pop('user_agent', None) + + path, temporary = localize_file(input_path, user_agent) if not os.path.exists(path): raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path) diff --git a/tests/test_read_pdf_table.py b/tests/test_read_pdf_table.py index 3907535..efe30ee 100644 --- a/tests/test_read_pdf_table.py +++ b/tests/test_read_pdf_table.py @@ -12,9 +12,11 @@ try: FileNotFoundError from unittest.mock import patch + from urllib.request import Request except NameError: FileNotFoundError = IOError from mock import patch + from urllib2 import Request class TestReadPdfTable(unittest.TestCase): @@ -30,6 +32,12 @@ def test_read_remote_pdf(self): df = tabula.read_pdf(uri) self.assertTrue(isinstance(df, pd.DataFrame)) + def test_read_remote_pdf_with_custom_user_agent(self): + uri = "https://github.com/tabulapdf/tabula-java/raw/master/src/test/resources/technology/tabula/12s0324.pdf" + + df = tabula.read_pdf(uri, user_agent='Mozilla/5.0') + self.assertTrue(isinstance(df, pd.DataFrame)) + def 
test_read_pdf_into_json(self): pdf_path = 'tests/resources/data.pdf' expected_json = 'tests/resources/data_1.json' diff --git a/tests/test_util.py b/tests/test_util.py index e4825d9..ab523c4 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,11 +1,36 @@ import unittest import tabula +try: + FileNotFoundError + from unittest.mock import patch, MagicMock + from urllib.request import Request +except NameError: + FileNotFoundError = IOError + from mock import patch, MagicMock + from urllib2 import Request + class TestUtil(unittest.TestCase): def test_environment_info(self): self.assertEqual(tabula.environment_info(), None) + @patch('tabula.file_util.shutil.copyfileobj') + @patch('tabula.file_util.urlopen') + @patch('tabula.file_util._create_request') + def test_localize_file_with_user_agent(self, mock_fun, mock_urlopen, mock_copyfileobj): + uri = "https://github.com/tabulapdf/tabula-java/raw/master/src/test/resources/technology/tabula/12s0324.pdf" + user_agent='Mozilla/5.0' + + cm = MagicMock() + cm.getcode.return_value = 200 + cm.read.return_value = b'contents' + cm.geturl.return_value = uri + mock_urlopen.return_value = cm + + tabula.file_util.localize_file(uri, user_agent=user_agent) + mock_fun.assert_called_with(uri, user_agent) + if __name__ == '__main__': unittest.main()