Skip to content

Commit

Permalink
Smarter (and looser) link equivalency logic
Browse files Browse the repository at this point in the history
  • Loading branch information
uranusjr committed Jun 18, 2021
1 parent 7c3abcc commit c55d17c
Showing 1 changed file with 46 additions and 3 deletions.
49 changes: 46 additions & 3 deletions src/pip/_internal/models/link.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import posixpath
import re
import urllib.parse
from typing import TYPE_CHECKING, Optional, Tuple, Union
from typing import TYPE_CHECKING, Dict, List, NamedTuple, Optional, Tuple, Union

from pip._internal.utils.filetypes import WHEEL_EXTENSION
from pip._internal.utils.hashes import Hashes
Expand Down Expand Up @@ -242,7 +242,50 @@ def is_hash_allowed(self, hashes):
return hashes.is_hash_allowed(self.hash_name, hex_digest=self.hash)


# TODO: Relax this comparison logic to ignore, for example, fragments.
class _CleanResult(NamedTuple):
    """Normalized form of a link, used for equivalency checks.

    This is used in the resolver to check whether two URL-specified
    requirements likely point to the same distribution and can be considered
    equivalent. This equivalency logic avoids comparing URLs literally, which
    can be too strict (e.g. "a=1&b=2" vs "b=2&a=1") and produce conflicts
    that are unexpected to users.

    Currently this does four things:

    1. Drop the basic auth part. This is technically wrong since a server can
       serve different content based on auth, but if it does that, it is even
       impossible to guarantee two URLs without auth are equivalent, since
       the user can input different auth information when prompted. So the
       practical solution is to assume the auth doesn't affect the response.
    2. Treat an empty host in a "file:" URL as "localhost", so that
       "file:///path" and "file://localhost/path" compare equal (RFC 8089).
    3. Parse the query to avoid the ordering issue.
    4. Parse the fragment, and explicitly drop the "egg=" part since it is
       commonly provided as the project name for compatibility. This is wrong
       in the strictest sense, but too many people are doing it.

    Note that query value ordering under the same key in query and fragment
    are NOT cleaned; i.e. "a=1&a=2" and "a=2&a=1" are still considered
    different.
    """

    # Split URL with auth removed and with query/fragment blanked out; those
    # two parts are compared through the parsed mappings below instead.
    parsed: urllib.parse.SplitResult
    # Query string parsed into key -> list of values (key order-insensitive).
    query: Dict[str, List[str]]
    # Fragment parsed the same way, with any "egg" key removed.
    fragment: Dict[str, List[str]]

    @classmethod
    def from_link(cls, link: "Link") -> "_CleanResult":
        """Build the normalized comparison key for *link*.

        Only ``link._parsed_url`` is read; it is used as a
        ``urllib.parse.SplitResult``.
        """
        parsed = link._parsed_url

        # Keep only what follows the last "@", i.e. strip "user:password@".
        netloc = parsed.netloc.rsplit("@", 1)[-1]
        # According to RFC 8089, an empty host in file: means localhost.
        if parsed.scheme == "file" and not netloc:
            netloc = "localhost"

        # The fragment does not necessarily use the query string format
        # (it's a pip-specific syntax), so we set keep_blank_values to keep
        # a fragment that's not a key-value pair (e.g. "#title_1").
        frag_qs = urllib.parse.parse_qs(parsed.fragment, keep_blank_values=True)
        frag_qs.pop("egg", None)

        return cls(
            parsed=parsed._replace(netloc=netloc, query="", fragment=""),
            query=urllib.parse.parse_qs(parsed.query),
            fragment=frag_qs,
        )


def links_equivalent(link1, link2):
    # type: (Link, Link) -> bool
    """Return whether two links likely point to the same distribution.

    The links are compared through their ``_CleanResult`` normalized forms
    rather than literally. The previous literal ``link1 == link2`` return is
    removed because it ran first and made this comparison unreachable.
    """
    return _CleanResult.from_link(link1) == _CleanResult.from_link(link2)

0 comments on commit c55d17c

Please sign in to comment.