From 6856b23631acbe7eaffbb900102d999c2e842ba4 Mon Sep 17 00:00:00 2001 From: IgorTatarnikov Date: Thu, 19 Oct 2023 14:46:09 +0100 Subject: [PATCH 1/8] Added a new function get_download_size that's called if the content-length header is missing. --- bg_atlasapi/utils.py | 58 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/bg_atlasapi/utils.py b/bg_atlasapi/utils.py index a1eecbb9..eb315646 100644 --- a/bg_atlasapi/utils.py +++ b/bg_atlasapi/utils.py @@ -105,7 +105,7 @@ def atlas_name_from_repr(name, resolution, major_vers=None, minor_vers=None): def check_internet_connection( - url="http://www.google.com/", timeout=5, raise_error=True + url="http://www.google.com/", timeout=5, raise_error=True ): """Check that there is an internet connection url : str @@ -131,9 +131,9 @@ def check_internet_connection( def retrieve_over_http( - url, - output_file_path, - fn_update: Optional[Callable[[int, int], None]] = None, + url, + output_file_path, + fn_update: Optional[Callable[[int, int], None]] = None, ): """Download file from remote location, with progress bar. @@ -167,6 +167,10 @@ def retrieve_over_http( try: with progress: tot = int(response.headers.get("content-length", 0)) + + if tot == 0: + tot = get_download_size(url) + task_id = progress.add_task( "download", filename=output_file_path.name, @@ -193,6 +197,52 @@ def retrieve_over_http( ) +def get_download_size(url: str) -> int: + """Get file size based on the MB value found on the "src" page of each atlas + + Parameters + ---------- + url : str + atlas file url (in a repo, make sure the "raw" url is passed) + + Returns + ------- + int + size of the file to download + + Raises + ------ + requests.exceptions.HTTPError: If there's an issue with the HTTP request. + ValueError: If the file size cannot be extracted from the response. + + """ + # Replace the 'raw' in the url with 'src' + url_split = url.split('/') + url_split[5] = 'src' + url = '/'.join(url_split) + + try: + response = requests.get(url) + response.raise_for_status() + + start_marker = b">" + end_marker = b" MB" + end = response.content.find(end_marker) + start = response.content[:end].rfind(start_marker) + 1 + + if start == -1 or end == -1: + raise ValueError("File size information not found in the response") + + size = float(response.content[start:end]) * 1e6 + + return int(size) + + except requests.exceptions.HTTPError as e: + raise e + except ValueError as e: + raise ValueError("File size information not found in the response.") + + def conf_from_url(url): """Read conf file from an URL. Parameters From 7243a52c3284cd78eaa6788e8c23b7fd6e904414 Mon Sep 17 00:00:00 2001 From: IgorTatarnikov Date: Thu, 19 Oct 2023 14:46:56 +0100 Subject: [PATCH 2/8] Fixed a bug where the progress bar went over full when the last chunk was added. --- bg_atlasapi/utils.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/bg_atlasapi/utils.py b/bg_atlasapi/utils.py index eb315646..19c7d957 100644 --- a/bg_atlasapi/utils.py +++ b/bg_atlasapi/utils.py @@ -105,7 +105,7 @@ def atlas_name_from_repr(name, resolution, major_vers=None, minor_vers=None): def check_internet_connection( - url="http://www.google.com/", timeout=5, raise_error=True + url="http://www.google.com/", timeout=5, raise_error=True ): """Check that there is an internet connection url : str @@ -131,9 +131,9 @@ def check_internet_connection( def retrieve_over_http( - url, - output_file_path, - fn_update: Optional[Callable[[int, int], None]] = None, + url, + output_file_path, + fn_update: Optional[Callable[[int, int], None]] = None, ): """Download file from remote location, with progress bar. @@ -179,16 +179,16 @@ def retrieve_over_http( ) with open(output_file_path, "wb") as fout: - advanced = 0 + completed = 0 for chunk in response.iter_content(chunk_size=CHUNK_SIZE): fout.write(chunk) adv = len(chunk) - progress.update(task_id, advance=adv, refresh=True) + completed += adv + progress.update(task_id, completed=min(completed, tot), refresh=True) if fn_update: # update handler with completed and total bytes - advanced += adv - fn_update(advanced, tot) + fn_update(completed, tot) except requests.exceptions.ConnectionError: output_file_path.unlink() From 17f5ac8bec021cebaae7ab81c337c5b5cb0541e9 Mon Sep 17 00:00:00 2001 From: IgorTatarnikov Date: Thu, 19 Oct 2023 17:14:46 +0100 Subject: [PATCH 3/8] Switched to using regex to account for GB vs MB. Added basic tests. --- bg_atlasapi/utils.py | 39 +++++++++++++++++++++++++++------------ tests/test_utils.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 12 deletions(-) diff --git a/bg_atlasapi/utils.py b/bg_atlasapi/utils.py index 19c7d957..559c1405 100644 --- a/bg_atlasapi/utils.py +++ b/bg_atlasapi/utils.py @@ -1,6 +1,7 @@ import configparser import json import logging +import re from typing import Callable, Optional import requests @@ -169,7 +170,10 @@ def retrieve_over_http( tot = int(response.headers.get("content-length", 0)) if tot == 0: - tot = get_download_size(url) + try: + tot = get_download_size(url) + except Exception as e: + tot = 0 task_id = progress.add_task( "download", @@ -216,24 +220,33 @@ def get_download_size(url: str) -> int: ValueError: If the file size cannot be extracted from the response. """ - # Replace the 'raw' in the url with 'src' - url_split = url.split('/') - url_split[5] = 'src' - url = '/'.join(url_split) - try: + # Replace the 'raw' in the url with 'src' + url_split = url.split('/') + url_split[5] = 'src' + url = '/'.join(url_split) + response = requests.get(url) response.raise_for_status() - start_marker = b">" - end_marker = b" MB" - end = response.content.find(end_marker) - start = response.content[:end].rfind(start_marker) + 1 + response_string = response.content.decode("utf-8") + size_string = re.search('([0-9]+.[0-9] [MGK]B)|([0-9]+ [MGK]B)', response_string) - if start == -1 or end == -1: + if not size_string: raise ValueError("File size information not found in the response") - size = float(response.content[start:end]) * 1e6 + size_string = size_string.group() + size = float(size_string[:-3]) + prefix = size_string[-2] + + if prefix == 'G': + size *= 1e9 + elif prefix == 'M': + size *= 1e6 + elif prefix == 'K': + size *= 1e3 + else: + raise ValueError("File size information not found in the response") return int(size) @@ -241,6 +254,8 @@ def get_download_size(url: str) -> int: raise e except ValueError as e: raise ValueError("File size information not found in the response.") + except IndexError as e: + raise IndexError("Improperly formatted URL") def conf_from_url(url): diff --git a/tests/test_utils.py b/tests/test_utils.py index 9928ebea..597bc07d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,4 +1,5 @@ import pytest +import requests.exceptions from bg_atlasapi import utils @@ -14,3 +15,31 @@ def test_http_check(): assert not utils.check_internet_connection( url="http://asd", raise_error=False ) + + +def test_get_download_size_bad_url(): + with pytest.raises(IndexError) as e: + utils.get_download_size(url="http://asd") + + +def test_get_download_size_no_size_url(): + with pytest.raises(ValueError) as e: + utils.get_download_size( + "https://gin.g-node.org/BrainGlobe/atlases/src/master/last_versions.conf" + ) + + +@pytest.mark.parametrize("url, real_size", + [ + ("https://gin.g-node.org/BrainGlobe/atlases/raw/master/example_mouse_100um_v1.2.tar.gz", 7.3), + ("https://gin.g-node.org/BrainGlobe/atlases/raw/master/allen_mouse_100um_v1.2.tar.gz", 61), + ("https://gin.g-node.org/BrainGlobe/atlases/raw/master/admba_3d_p56_mouse_25um_v1.0.tar.gz", 335), + ("https://gin.g-node.org/BrainGlobe/atlases/raw/master/osten_mouse_10um_v1.1.tar.gz", 3600) + ], +) +def test_get_download_size(url, real_size): + size = utils.get_download_size(url) + + real_size = real_size * 1e6 + + assert size == real_size From 55e67a4a73f6812dd13b4488e820e7a1b3a75e96 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 19 Oct 2023 16:40:55 +0000 Subject: [PATCH 4/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- bg_atlasapi/utils.py | 26 +++++++++++++++----------- tests/test_utils.py | 28 ++++++++++++++++++++-------- 2 files changed, 35 insertions(+), 19 deletions(-) diff --git a/bg_atlasapi/utils.py b/bg_atlasapi/utils.py index 559c1405..3b67ed34 100644 --- a/bg_atlasapi/utils.py +++ b/bg_atlasapi/utils.py @@ -172,7 +172,7 @@ def retrieve_over_http( if tot == 0: try: tot = get_download_size(url) - except Exception as e: + except Exception: tot = 0 task_id = progress.add_task( @@ -188,7 +188,9 @@ def retrieve_over_http( fout.write(chunk) adv = len(chunk) completed += adv - progress.update(task_id, completed=min(completed, tot), refresh=True) + progress.update( + task_id, completed=min(completed, tot), refresh=True + ) if fn_update: # update handler with completed and total bytes @@ -222,15 +224,17 @@ def get_download_size(url: str) -> int: """ try: # Replace the 'raw' in the url with 'src' - url_split = url.split('/') - url_split[5] = 'src' - url = '/'.join(url_split) + url_split = url.split("/") + url_split[5] = "src" + url = "/".join(url_split) response = requests.get(url) response.raise_for_status() response_string = response.content.decode("utf-8") - size_string = re.search('([0-9]+.[0-9] [MGK]B)|([0-9]+ [MGK]B)', response_string) + size_string = re.search( + "([0-9]+.[0-9] [MGK]B)|([0-9]+ [MGK]B)", response_string + ) if not size_string: raise ValueError("File size information not found in the response") @@ -239,11 +243,11 @@ def get_download_size(url: str) -> int: size = float(size_string[:-3]) prefix = size_string[-2] - if prefix == 'G': + if prefix == "G": size *= 1e9 - elif prefix == 'M': + elif prefix == "M": size *= 1e6 - elif prefix == 'K': + elif prefix == "K": size *= 1e3 else: raise ValueError("File size information not found in the response") @@ -252,9 +256,9 @@ def get_download_size(url: str) -> int: except requests.exceptions.HTTPError as e: raise e - except ValueError as e: + except ValueError: raise ValueError("File size information not found in the response.") - except IndexError as e: + except IndexError: raise IndexError("Improperly formatted URL") diff --git a/tests/test_utils.py b/tests/test_utils.py index 597bc07d..63b3616b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,5 +1,4 @@ import pytest -import requests.exceptions from bg_atlasapi import utils @@ -18,23 +17,36 @@ def test_http_check(): def test_get_download_size_bad_url(): - with pytest.raises(IndexError) as e: + with pytest.raises(IndexError): utils.get_download_size(url="http://asd") def test_get_download_size_no_size_url(): - with pytest.raises(ValueError) as e: + with pytest.raises(ValueError): utils.get_download_size( "https://gin.g-node.org/BrainGlobe/atlases/src/master/last_versions.conf" ) -@pytest.mark.parametrize("url, real_size", +@pytest.mark.parametrize( + "url, real_size", [ - ("https://gin.g-node.org/BrainGlobe/atlases/raw/master/example_mouse_100um_v1.2.tar.gz", 7.3), - ("https://gin.g-node.org/BrainGlobe/atlases/raw/master/allen_mouse_100um_v1.2.tar.gz", 61), - ("https://gin.g-node.org/BrainGlobe/atlases/raw/master/admba_3d_p56_mouse_25um_v1.0.tar.gz", 335), - ("https://gin.g-node.org/BrainGlobe/atlases/raw/master/osten_mouse_10um_v1.1.tar.gz", 3600) + ( + "https://gin.g-node.org/BrainGlobe/atlases/raw/master/example_mouse_100um_v1.2.tar.gz", + 7.3, + ), + ( + "https://gin.g-node.org/BrainGlobe/atlases/raw/master/allen_mouse_100um_v1.2.tar.gz", + 61, + ), + ( + "https://gin.g-node.org/BrainGlobe/atlases/raw/master/admba_3d_p56_mouse_25um_v1.0.tar.gz", + 335, + ), + ( + "https://gin.g-node.org/BrainGlobe/atlases/raw/master/osten_mouse_10um_v1.1.tar.gz", + 3600, + ), ], ) def test_get_download_size(url, real_size): From 2195913cc060ee15ec8a385ee4b94bd8f11574fa Mon Sep 17 00:00:00 2001 From: Igor Tatarnikov Date: Fri, 20 Oct 2023 12:06:11 +0100 Subject: [PATCH 5/8] Amended code to satisfy pre-commit hooks --- bg_atlasapi/utils.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/bg_atlasapi/utils.py b/bg_atlasapi/utils.py index 3b67ed34..2987dd77 100644 --- a/bg_atlasapi/utils.py +++ b/bg_atlasapi/utils.py @@ -204,7 +204,7 @@ def retrieve_over_http( def get_download_size(url: str) -> int: - """Get file size based on the MB value found on the "src" page of each atlas + """Get file size based on the MB value on the "src" page of each atlas Parameters ---------- @@ -218,7 +218,7 @@ def get_download_size(url: str) -> int: Raises ------ - requests.exceptions.HTTPError: If there's an issue with the HTTP request. + requests.exceptions.HTTPError: If there's an issue with HTTP request. ValueError: If the file size cannot be extracted from the response. """ @@ -232,14 +232,16 @@ def get_download_size(url: str) -> int: response.raise_for_status() response_string = response.content.decode("utf-8") - size_string = re.search( + search_result = re.search( "([0-9]+.[0-9] [MGK]B)|([0-9]+ [MGK]B)", response_string ) - if not size_string: - raise ValueError("File size information not found in the response") + assert search_result is not None + + size_string = search_result.group() + + assert size_string is not None - size_string = size_string.group() size = float(size_string[:-3]) prefix = size_string[-2] From 77c213d85e6f734ddfed3f9cfd4f924b59650198 Mon Sep 17 00:00:00 2001 From: IgorTatarnikov Date: Sun, 22 Oct 2023 15:11:22 +0100 Subject: [PATCH 6/8] Fixed test to catch the correct exception thrown when the url contains no text matching the regex. --- tests/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 63b3616b..2f7e98fc 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -22,7 +22,7 @@ def test_get_download_size_bad_url(): def test_get_download_size_no_size_url(): - with pytest.raises(ValueError): + with pytest.raises(AssertionError): utils.get_download_size( "https://gin.g-node.org/BrainGlobe/atlases/src/master/last_versions.conf" ) From 61879c6f26535df11b50dea7aa889d248344e2e1 Mon Sep 17 00:00:00 2001 From: IgorTatarnikov Date: Mon, 23 Oct 2023 13:05:32 +0100 Subject: [PATCH 7/8] Added tests to improve coverage of exceptional cases. --- bg_atlasapi/utils.py | 7 +++---- tests/test_utils.py | 41 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/bg_atlasapi/utils.py b/bg_atlasapi/utils.py index 2987dd77..591ab491 100644 --- a/bg_atlasapi/utils.py +++ b/bg_atlasapi/utils.py @@ -220,6 +220,7 @@ def get_download_size(url: str) -> int: ------ requests.exceptions.HTTPError: If there's an issue with HTTP request. ValueError: If the file size cannot be extracted from the response. + IndexError: If the url is not formatted as expected """ try: @@ -233,7 +234,7 @@ def get_download_size(url: str) -> int: response_string = response.content.decode("utf-8") search_result = re.search( - "([0-9]+.[0-9] [MGK]B)|([0-9]+ [MGK]B)", response_string + r"([0-9]+\.[0-9] [MGK]B)|([0-9]+ [MGK]B)", response_string ) assert search_result is not None @@ -251,14 +252,12 @@ def get_download_size(url: str) -> int: size *= 1e6 elif prefix == "K": size *= 1e3 - else: - raise ValueError("File size information not found in the response") return int(size) except requests.exceptions.HTTPError as e: raise e - except ValueError: + except AssertionError: raise ValueError("File size information not found in the response.") except IndexError: raise IndexError("Improperly formatted URL") diff --git a/tests/test_utils.py b/tests/test_utils.py index 2f7e98fc..60e6f6b6 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,14 @@ +from pathlib import Path +from unittest import mock + import pytest +import requests +from requests import HTTPError from bg_atlasapi import utils +test_url = "https://gin.g-node.org/BrainGlobe/atlases/raw/master/example_mouse_100um_v1.2.tar.gz" + def test_http_check(): assert utils.check_internet_connection() @@ -22,7 +29,7 @@ def test_get_download_size_bad_url(): def test_get_download_size_no_size_url(): - with pytest.raises(AssertionError): + with pytest.raises(ValueError): utils.get_download_size( "https://gin.g-node.org/BrainGlobe/atlases/src/master/last_versions.conf" ) @@ -55,3 +62,35 @@ def test_get_download_size(url, real_size): real_size = real_size * 1e6 assert size == real_size + + +def test_get_download_size_kb(): + with mock.patch("requests.get", autospec=True) as mock_request: + mock_response = mock.Mock(spec=requests.Response) + mock_response.status_code = 200 + mock_response.content = b"asd 24.7 KB 123sd" + mock_request.return_value = mock_response + + size = utils.get_download_size(test_url) + + assert size == 24700 + + +def test_get_download_size_HTTPError(): + with mock.patch("requests.get", autospec=True) as mock_request: + mock_request.side_effect = HTTPError() + + with pytest.raises(HTTPError): + utils.get_download_size(test_url) + + +def test_retrieve_over_http_get_download_size_exception(): + with mock.patch("requests.get", autospec=True) as mock_request: + mock_response = mock.Mock(spec=requests.Response) + mock_response.status_code = 200 + mock_response.content = b"1234" + mock_response.headers = {} + mock_request.return_value = mock_response + + with pytest.raises(FileNotFoundError): + utils.retrieve_over_http(test_url, Path("/tmp/path")) From 43185a7e8a3665df9d72e2faa4a0c89ad6c2ae28 Mon Sep 17 00:00:00 2001 From: IgorTatarnikov Date: Mon, 23 Oct 2023 13:22:00 +0100 Subject: [PATCH 8/8] Removed retrieve_over_http test --- tests/test_utils.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 60e6f6b6..1d194872 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,4 +1,3 @@ -from pathlib import Path from unittest import mock import pytest @@ -82,15 +81,3 @@ def test_get_download_size_HTTPError(): with pytest.raises(HTTPError): utils.get_download_size(test_url) - - -def test_retrieve_over_http_get_download_size_exception(): - with mock.patch("requests.get", autospec=True) as mock_request: - mock_response = mock.Mock(spec=requests.Response) - mock_response.status_code = 200 - mock_response.content = b"1234" - mock_response.headers = {} - mock_request.return_value = mock_response - - with pytest.raises(FileNotFoundError): - utils.retrieve_over_http(test_url, Path("/tmp/path"))