Preserve the case of the userinfo component in canonicalize_url.

tests/test_url.py: add test_userinfo_is_case_sensitive, which expects the
host to be lowercased while the "UsEr:PaSsWoRd" userinfo is left unchanged.
w3lib/url.py: lowercase and strip the trailing ":" only on the part of the
netloc after the last "@", instead of on the whole netloc.

diff --git a/tests/test_url.py b/tests/test_url.py
index ca84745..f969cff 100644
--- a/tests/test_url.py
+++ b/tests/test_url.py
@@ -1384,6 +1384,11 @@ def test_domains_are_case_insensitive(self):
             canonicalize_url("http://www.EXAMPLE.com/"), "http://www.example.com/"
         )
 
+    def test_userinfo_is_case_sensitive(self):
+        self.assertEqual(
+            canonicalize_url("sftp://UsEr:PaSsWoRd@www.EXAMPLE.com/"), "sftp://UsEr:PaSsWoRd@www.example.com/"
+        )
+
     def test_canonicalize_idns(self):
         self.assertEqual(
             canonicalize_url("http://www.bücher.de?q=bücher"),
diff --git a/w3lib/url.py b/w3lib/url.py
index 28e70cb..bb6486c 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -654,9 +654,14 @@ def canonicalize_url(
 
     fragment = "" if not keep_fragments else fragment
 
+    # Apply lowercase to the domain, but not to the userinfo.
+    netloc_parts = netloc.split("@")
+    netloc_parts[-1] = netloc_parts[-1].lower().rstrip(":")
+    netloc = "@".join(netloc_parts)
+
     # every part should be safe already
     return urlunparse(
-        (scheme, netloc.lower().rstrip(":"), path, params, query, fragment)
+        (scheme, netloc, path, params, query, fragment)
     )
 
 