Skip to content

Commit

Permalink
Merge pull request #151 from chezou/user-agent
Browse files Browse the repository at this point in the history
Add User agent handling
  • Loading branch information
chezou authored Jun 13, 2019
2 parents baacafe + 6d942a8 commit 60182cd
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 5 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ This instruction is originally written by [@lahoffm](https://github.com/lahoffm)
- multiple_tables (bool, optional):
- (Experimental) Extract multiple tables.
- This option uses JSON as an intermediate format, so if the tabula-java output format changes, this option may stop working.
- user_agent (str, optional):
- Set a custom user-agent when downloading a PDF from a URL. Otherwise, the default urllib.request user-agent is used.


## FAQ
Expand Down
15 changes: 11 additions & 4 deletions tabula/file_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
PY3 = sys.version_info[0] >= 3

if PY3:
from urllib.request import urlopen
from urllib.request import urlopen, Request
from urllib.parse import urlparse as parse_url
from urllib.parse import uses_relative, uses_netloc, uses_params
text_type = str
else:
from urllib2 import urlopen
from urllib2 import urlopen, Request
from urlparse import urlparse as parse_url
from urlparse import uses_relative, uses_netloc, uses_params
text_type = unicode
Expand All @@ -21,7 +21,7 @@
_VALID_URLS.discard('')


def localize_file(path_or_buffer):
def localize_file(path_or_buffer, user_agent=None):
'''Ensure localize target file.
If the target file is remote, this function fetches into local storage.
Expand All @@ -38,7 +38,10 @@ def localize_file(path_or_buffer):
path_or_buffer = _stringify_path(path_or_buffer)

if _is_url(path_or_buffer):
req = urlopen(path_or_buffer)
if user_agent:
req = urlopen(_create_request(path_or_buffer, user_agent))
else:
req = urlopen(path_or_buffer)
filename = os.path.basename(req.geturl())
if os.path.splitext(filename)[-1] is not ".pdf":
pid = os.getpid()
Expand Down Expand Up @@ -71,6 +74,10 @@ def _is_url(url):
return False


def _create_request(path_or_buffer, user_agent):
req_headers = {'User-Agent': user_agent}
return Request(path_or_buffer, headers=req_headers)

def is_file_like(obj):
'''Check file like object
Expand Down
4 changes: 3 additions & 1 deletion tabula/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,9 @@ def read_pdf(input_path,
if not any(filter(r.find, java_options)):
java_options = java_options + ['-Dfile.encoding=UTF8']

path, temporary = localize_file(input_path)
user_agent = kwargs.pop('user_agent', None)

path, temporary = localize_file(input_path, user_agent)

if not os.path.exists(path):
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)
Expand Down
8 changes: 8 additions & 0 deletions tests/test_read_pdf_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,11 @@
try:
FileNotFoundError
from unittest.mock import patch
from urllib.request import Request
except NameError:
FileNotFoundError = IOError
from mock import patch
from urllib2 import Request


class TestReadPdfTable(unittest.TestCase):
Expand All @@ -30,6 +32,12 @@ def test_read_remote_pdf(self):
df = tabula.read_pdf(uri)
self.assertTrue(isinstance(df, pd.DataFrame))

def test_read_remote_pdf_with_custom_user_agent(self):
    '''Read a remote PDF while sending a custom User-Agent header.

    The result of ``tabula.read_pdf`` is expected to be a DataFrame.
    This test needs network access.
    '''
    # The URL must carry a host; "https:/tabulapdf/..." cannot be fetched.
    uri = "https://github.com/tabulapdf/tabula-java/raw/master/src/test/resources/technology/tabula/12s0324.pdf"

    df = tabula.read_pdf(uri, user_agent='Mozilla/5.0')
    self.assertTrue(isinstance(df, pd.DataFrame))

def test_read_pdf_into_json(self):
pdf_path = 'tests/resources/data.pdf'
expected_json = 'tests/resources/data_1.json'
Expand Down
25 changes: 25 additions & 0 deletions tests/test_util.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,36 @@
import unittest
import tabula

try:
FileNotFoundError
from unittest.mock import patch, MagicMock
from urllib.request import Request
except NameError:
FileNotFoundError = IOError
from mock import patch, MagicMock
from urllib2 import Request


class TestUtil(unittest.TestCase):
    '''Tests for tabula utility helpers.'''

    def test_environment_info(self):
        '''``environment_info`` is expected to return None.'''
        self.assertEqual(tabula.environment_info(), None)

    # Decorators are applied bottom-up, so the mock arguments arrive in the
    # order: _create_request, urlopen, copyfileobj.
    @patch('tabula.file_util.shutil.copyfileobj')
    @patch('tabula.file_util.urlopen')
    @patch('tabula.file_util._create_request')
    def test_localize_file_with_user_agent(self, mock_fun, mock_urlopen, mock_copyfileobj):
        '''``localize_file`` builds the request through ``_create_request``
        when a user agent is given, and opens that request.
        '''
        uri = "https://github.com/tabulapdf/tabula-java/raw/master/src/test/resources/technology/tabula/12s0324.pdf"
        user_agent = 'Mozilla/5.0'

        # Stand-in for the response object returned by urlopen.
        cm = MagicMock()
        cm.getcode.return_value = 200
        cm.read.return_value = b'contents'
        cm.geturl.return_value = uri
        mock_urlopen.return_value = cm

        # NOTE(review): localize_file may still create a local file even
        # though copyfileobj is mocked -- confirm and clean up if so.
        tabula.file_util.localize_file(uri, user_agent=user_agent)

        mock_fun.assert_called_once_with(uri, user_agent)
        # The request built by _create_request must be the one handed to urlopen.
        mock_urlopen.assert_called_once_with(mock_fun.return_value)


# Run the test suite when this file is executed directly.
if __name__ == '__main__':
    unittest.main()

0 comments on commit 60182cd

Please sign in to comment.