diff --git a/CHANGES/10047.bugfix.rst b/CHANGES/10047.bugfix.rst
new file mode 100644
index 00000000000..9ed1e0a1b0d
--- /dev/null
+++ b/CHANGES/10047.bugfix.rst
@@ -0,0 +1,8 @@
+Fixed redirect following when the server sends a ``Location`` header containing
+raw non-ASCII bytes (e.g. ``\xf8`` for ``ø``). Previously, these were decoded
+via UTF-8 surrogateescape, producing lone surrogates that broke URL parsing and
+caused 404 errors. The redirect URL is now recovered by re-decoding the raw
+bytes as UTF-8, then with the charset returned by
+:paramref:`ClientSession.fallback_charset_resolver`, and finally as ``latin-1``
+(the historical HTTP/1.1 header encoding per :rfc:`7230`)
+-- by :user:`lichuang9890-star`.
diff --git a/aiohttp/client.py b/aiohttp/client.py
index c3e874e650d..c040dc4446d 100644
--- a/aiohttp/client.py
+++ b/aiohttp/client.py
@@ -268,6 +268,44 @@ def __post_init__(self) -> None:
 _CharsetResolver = Callable[[ClientResponse, bytes], str]
 
 
+def _has_surrogate_escapes(value: str) -> bool:
+    """Return ``True`` if *value* holds ``surrogateescape`` code points.
+
+    These are the lone surrogates ``\\udc80``–``\\udcff`` that stand in for
+    raw bytes ``0x80``–``0xff`` that could not be decoded as UTF-8.
+    """
+    return any("\udc80" <= ch <= "\udcff" for ch in value)
+
+
+def _recover_redirect_location(r_url: str, charset: str = "latin-1") -> str:
+    """Recover a redirect Location URL that contains surrogates.
+
+    When servers send non-ASCII bytes in Location headers, Python's HTTP
+    parser decodes them as UTF-8 with ``surrogateescape``, producing lone
+    surrogates (``\\udc80``–``\\udcff``). This helper recovers the
+    original URL by first attempting a lossless UTF-8 round-trip, then
+    falling back to *charset* (which defaults to ``latin-1``, the
+    historical HTTP/1.1 header encoding per :rfc:`7230`).
+
+    *charset* is typically obtained from
+    :paramref:`ClientSession.fallback_charset_resolver`. If it names an
+    unknown codec or cannot decode the bytes, ``latin-1`` is used instead.
+
+    A *r_url* without such surrogates is returned unchanged.
+    """
+    if not _has_surrogate_escapes(r_url):
+        return r_url
+    # Turn the escaped surrogates back into the bytes the server sent.
+    raw = r_url.encode("utf-8", "surrogateescape")
+    for encoding in ("utf-8", charset):
+        try:
+            return raw.decode(encoding)
+        except (UnicodeDecodeError, LookupError):
+            continue
+    # latin-1 maps every byte to a code point, so this decode cannot fail.
+    return raw.decode("latin-1")
+
+
 @final
 class ClientSession:
     """First-class interface for making HTTP requests."""
@@ -847,6 +885,13 @@ async def _connect_and_send_request(
                             # response is forbidden
                             resp.release()
 
+                        if _has_surrogate_escapes(r_url):
+                            # Consult the user-supplied resolver only when the
+                            # header really carried raw non-ASCII bytes.
+                            _raw = r_url.encode("utf-8", "surrogateescape")
+                            _charset = self._resolve_charset(resp, _raw)
+                            r_url = _recover_redirect_location(r_url, _charset)
+
                         try:
                             parsed_redirect_url = URL(
                                 r_url, encoded=not self._requote_redirect_url
diff --git a/tests/test_client_functional.py b/tests/test_client_functional.py
index 8ee45330bb5..8a5501e557a 100644
--- a/tests/test_client_functional.py
+++ b/tests/test_client_functional.py
@@ -41,6 +41,7 @@
 import aiohttp
 from aiohttp import Fingerprint, ServerFingerprintMismatch, hdrs, payload, web
 from aiohttp.abc import AbstractResolver, ResolveResult
+from aiohttp.client import _recover_redirect_location
 from aiohttp.client_exceptions import (
     ClientResponseError,
     InvalidURL,
@@ -3016,6 +3017,97 @@ async def handler_redirect(request: web.Request) -> web.Response:
     assert data == body
 
 
+@pytest.mark.parametrize(
+    ("raw_location", "expected"),
+    (
+        ("https://cornelius-k.dk/synspr\udcf8ve", "https://cornelius-k.dk/synsprøve"),
+        (
+            "https://cornelius-k.dk/synspr\udcc3\udcb8ve",
+            "https://cornelius-k.dk/synsprøve",
+        ),
+        (
+            "https://cornelius-k.dk/synspr%C3%B8ve",
+            "https://cornelius-k.dk/synspr%C3%B8ve",
+        ),
+    ),
+)
+def test_recover_redirect_location(raw_location: str, expected: str) -> None:
+    assert _recover_redirect_location(raw_location) == expected
+
+
+@pytest.mark.parametrize(
+    ("raw_location", "charset", "expected"),
+    (
+        # valid UTF-8 bytes are recovered by the UTF-8 round-trip
+        (
+            "https://example.com/\udce4\udcbd\udca0\udce5\udca5\udcbd",
+            "utf-8",
+            "https://example.com/你好",
+        ),
+        # charset resolver provides the correct charset directly
+        (
+            "https://cornelius-k.dk/synspr\udcf8ve",
+            "latin-1",
+            "https://cornelius-k.dk/synsprøve",
+        ),
+        # charset resolver returns an unknown charset; falls back to latin-1
+        (
+            "https://cornelius-k.dk/synspr\udcf8ve",
+            "no-such-codec",
+            "https://cornelius-k.dk/synsprøve",
+        ),
+        # UTF-8 fails and the resolved charset differs from latin-1
+        (
+            "https://example.com/\udcef\udcf0",
+            "cp1251",
+            "https://example.com/пр",
+        ),
+        # resolved charset cannot decode the bytes; falls back to latin-1
+        (
+            "https://cornelius-k.dk/synspr\udcf8ve",
+            "ascii",
+            "https://cornelius-k.dk/synsprøve",
+        ),
+    ),
+)
+def test_recover_redirect_location_with_charset(
+    raw_location: str, charset: str, expected: str
+) -> None:
+    assert _recover_redirect_location(raw_location, charset) == expected
+
+
+async def test_redirect_ascii_location_skips_fallback_charset_resolver(
+    aiohttp_client: AiohttpClient,
+) -> None:
+    """Test that an ASCII Location does not invoke fallback_charset_resolver."""
+
+    resolver_calls: list[bytes] = []
+
+    def resolver(response: aiohttp.ClientResponse, body: bytes) -> str:
+        resolver_calls.append(body)
+        return "latin-1"
+
+    async def redirect_handler(request: web.Request) -> web.Response:
+        # Plain ASCII Location: nothing to recover. Recovery of raw non-ASCII
+        # bytes is covered by the test_recover_redirect_location* unit tests.
+        return web.Response(
+            status=301,
+            headers={"Location": "/ok"},
+        )
+
+    async def ok_handler(request: web.Request) -> web.Response:
+        return web.Response(text="OK")
+
+    app = web.Application()
+    app.router.add_get("/redirect", redirect_handler)
+    app.router.add_get("/ok", ok_handler)
+
+    client = await aiohttp_client(app, fallback_charset_resolver=resolver)
+    async with client.get("/redirect") as resp:
+        assert resp.status == 200
+    assert resolver_calls == []
+
+
 INVALID_URL_WITH_ERROR_MESSAGE_YARL_NEW = (
     # yarl.URL.__new__ raises ValueError
     ("http://:/", "http://:/"),