Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix the URL quoting in _clean_link() for IPv6 addresses #6245

Merged
merged 1 commit into from
Apr 8, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions news/6285.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix incorrect URL quoting of IPv6 addresses.
21 changes: 17 additions & 4 deletions src/pip/_internal/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -939,15 +939,28 @@ def _get_encoding_from_headers(headers):
return None


_CLEAN_LINK_RE = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)


def _clean_link(url):
# type: (str) -> str
"""Makes sure a link is fully encoded. That is, if a ' ' shows up in
the link, it will be rewritten to %20 (while not over-quoting
% or other characters)."""
return _CLEAN_LINK_RE.sub(lambda match: '%%%2x' % ord(match.group(0)), url)
# Split the URL into parts according to the general structure
# `scheme://netloc/path;parameters?query#fragment`. Note that the
# `netloc` can be empty and the URI will then refer to a local
# filesystem path.
result = urllib_parse.urlparse(url)
# In both cases below we unquote prior to quoting to make sure
# nothing is double quoted.
if result.netloc == "":
nicolasbock marked this conversation as resolved.
Show resolved Hide resolved
# On Windows the path part might contain a drive letter which
# should not be quoted. On Linux where drive letters do not
# exist, the colon should be quoted. We rely on urllib.request
# to do the right thing here.
path = urllib_request.pathname2url(
urllib_request.url2pathname(result.path))
else:
path = urllib_parse.quote(urllib_parse.unquote(result.path))
return urllib_parse.urlunparse(result._replace(path=path))


class HTMLPage(object):
Expand Down
64 changes: 63 additions & 1 deletion tests/unit/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from pip._internal.download import PipSession
from pip._internal.index import (
Link, PackageFinder, _determine_base_url, _egg_info_matches,
Link, PackageFinder, _clean_link, _determine_base_url, _egg_info_matches,
_find_name_version_sep, _get_html_page,
)

Expand Down Expand Up @@ -280,3 +280,65 @@ def test_request_retries(caplog):
'Could not fetch URL http://localhost: Retry error - skipping'
in caplog.text
)


@pytest.mark.parametrize(
("url", "clean_url"),
[
# URL with hostname and port. Port separator should not be quoted.
("https://localhost.localdomain:8181/path/with space/",
"https://localhost.localdomain:8181/path/with%20space/"),
# URL that is already properly quoted. The quoting `%`
# characters should not be quoted again.
("https://localhost.localdomain:8181/path/with%20quoted%20space/",
"https://localhost.localdomain:8181/path/with%20quoted%20space/"),
# URL with IPv4 address and port.
("https://127.0.0.1:8181/path/with space/",
"https://127.0.0.1:8181/path/with%20space/"),
# URL with IPv6 address and port. The `[]` brackets around the
# IPv6 address should not be quoted.
("https://[fd00:0:0:236::100]:8181/path/with space/",
"https://[fd00:0:0:236::100]:8181/path/with%20space/"),
# URL with query. The leading `?` should not be quoted.
("https://localhost.localdomain:8181/path/with/query?request=test",
"https://localhost.localdomain:8181/path/with/query?request=test"),
# URL with colon in the path portion.
("https://localhost.localdomain:8181/path:/with:/colon",
"https://localhost.localdomain:8181/path%3A/with%3A/colon"),
# URL with something that looks like a drive letter, but is
# not. The `:` should be quoted.
("https://localhost.localdomain/T:/path/",
"https://localhost.localdomain/T%3A/path/")
]
)
def test_clean_link(url, clean_url):
assert(_clean_link(url) == clean_url)


@pytest.mark.parametrize(
("url", "clean_url"),
[
# URL with Windows drive letter. The `:` after the drive
# letter should not be quoted. The trailing `/` should be
# removed.
("file:///T:/path/with spaces/",
"file:///T:/path/with%20spaces")
]
)
@pytest.mark.skipif("sys.platform != 'win32'")
def test_clean_link_windows(url, clean_url):
assert(_clean_link(url) == clean_url)


@pytest.mark.parametrize(
("url", "clean_url"),
[
# URL with Windows drive letter, running on non-windows
# platform. The `:` after the drive should be quoted.
("file:///T:/path/with spaces/",
"file:///T%3A/path/with%20spaces/")
]
)
@pytest.mark.skipif("sys.platform == 'win32'")
def test_clean_link_non_windows(url, clean_url):
assert(_clean_link(url) == clean_url)