Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGES/10047.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Fixed redirect following when the server sends a ``Location`` header containing
raw non-ASCII bytes (e.g. ``\xf8`` for ``ø``). Previously, these were decoded
via UTF-8 surrogateescape, producing lone surrogates that broke URL parsing and
caused 404 errors. The redirect URL is now recovered by re-decoding the raw
header bytes: first as strict UTF-8, then with the charset returned by
:paramref:`ClientSession.fallback_charset_resolver`, and finally as
``latin-1`` (the historical HTTP/1.1 header encoding per :rfc:`7230`)
-- by :user:`lichuang9890-star`.
29 changes: 29 additions & 0 deletions aiohttp/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,31 @@ def __post_init__(self) -> None:
_CharsetResolver = Callable[[ClientResponse, bytes], str]


def _recover_redirect_location(r_url: str, charset: str = "latin-1") -> str:
"""Recover a redirect Location URL that contains surrogates.

When servers send non-ASCII bytes in Location headers, Python's HTTP
parser decodes them as UTF-8 with ``surrogateescape``, producing lone
surrogates (``\\udc80``–``\\udcff``). This helper recovers the
original URL by first attempting a lossless UTF-8 round-trip, then
falling back to *charset* (which defaults to ``latin-1``, the
historical HTTP/1.1 header encoding per :rfc:`7230`).

*charset* is typically obtained from
:paramref:`ClientSession.fallback_charset_resolver`.
"""
if not any("\udc80" <= ch <= "\udcff" for ch in r_url):
return r_url
raw = r_url.encode("utf-8", "surrogateescape")
try:
return raw.decode("utf-8")
except UnicodeDecodeError:
try:
return raw.decode(charset)
except (UnicodeDecodeError, LookupError):
return raw.decode("latin-1")


@final
class ClientSession:
"""First-class interface for making HTTP requests."""
Expand Down Expand Up @@ -847,6 +872,10 @@ async def _connect_and_send_request(
# response is forbidden
resp.release()

_raw = r_url.encode("utf-8", "surrogateescape")
_charset = self._resolve_charset(resp, _raw)
r_url = _recover_redirect_location(r_url, _charset)
Comment on lines +875 to +877
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Surely we just decode it with the charset and lose the new function..?


try:
parsed_redirect_url = URL(
r_url, encoded=not self._requote_redirect_url
Expand Down
75 changes: 75 additions & 0 deletions tests/test_client_functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import aiohttp
from aiohttp import Fingerprint, ServerFingerprintMismatch, hdrs, payload, web
from aiohttp.abc import AbstractResolver, ResolveResult
from aiohttp.client import _recover_redirect_location
from aiohttp.client_exceptions import (
ClientResponseError,
InvalidURL,
Expand Down Expand Up @@ -3016,6 +3017,80 @@ async def handler_redirect(request: web.Request) -> web.Response:
assert data == body


@pytest.mark.parametrize(
    ["raw_location", "expected"],
    [
        # One escaped byte (0xF8): not valid UTF-8, decoded via the default
        # latin-1 charset.
        ("https://cornelius-k.dk/synspr\udcf8ve", "https://cornelius-k.dk/synsprøve"),
        # Two escaped bytes (0xC3 0xB8) forming a valid UTF-8 sequence.
        (
            "https://cornelius-k.dk/synspr\udcc3\udcb8ve",
            "https://cornelius-k.dk/synsprøve",
        ),
        # Percent-encoded ASCII contains no surrogates and passes through.
        (
            "https://cornelius-k.dk/synspr%C3%B8ve",
            "https://cornelius-k.dk/synspr%C3%B8ve",
        ),
    ],
)
def test_recover_redirect_location(raw_location: str, expected: str) -> None:
    """Check recovery of escaped Location values with the default charset."""
    recovered = _recover_redirect_location(raw_location)
    assert recovered == expected


@pytest.mark.parametrize(
    ["raw_location", "charset", "expected"],
    [
        # charset resolver returns a non-latin-1 encoding
        (
            "https://example.com/\udce4\udcbd\udca0\udce5\udca5\udcbd",
            "utf-8",
            "https://example.com/你好",
        ),
        # charset resolver provides the correct charset directly
        (
            "https://cornelius-k.dk/synspr\udcf8ve",
            "latin-1",
            "https://cornelius-k.dk/synsprøve",
        ),
        # charset resolver returns an unknown charset; falls back to latin-1
        (
            "https://cornelius-k.dk/synspr\udcf8ve",
            "no-such-codec",
            "https://cornelius-k.dk/synsprøve",
        ),
    ],
)
def test_recover_redirect_location_with_charset(
    raw_location: str, charset: str, expected: str
) -> None:
    """Check recovery of escaped Location values with an explicit charset."""
    recovered = _recover_redirect_location(raw_location, charset)
    assert recovered == expected


async def test_redirect_recover_with_fallback_charset_resolver(
    aiohttp_client: AiohttpClient,
) -> None:
    """Follow a 301 redirect on a client built with ``fallback_charset_resolver``.

    NOTE(review): the redirect handler below sends the plain-ASCII value
    ``/ok`` in ``Location``, so this test only checks that a redirect still
    succeeds when a resolver is configured; it does not send non-ASCII bytes.
    """

    async def redirect_handler(request: web.Request) -> web.Response:
        # NOTE(review): this Location value is ASCII-only. To cover the
        # non-ASCII recovery case the server would have to emit raw
        # non-ASCII header bytes -- TODO confirm how to do that here.
        return web.Response(
            status=301,
            headers={"Location": "/ok"},
        )

    async def ok_handler(request: web.Request) -> web.Response:
        # Redirect target; a 200 from here means the redirect was followed.
        return web.Response(text="OK")

    app = web.Application()
    app.router.add_get("/redirect", redirect_handler)
    app.router.add_get("/ok", ok_handler)

    # The resolver ignores its arguments and always answers "latin-1".
    client = await aiohttp_client(
        app, fallback_charset_resolver=lambda r, b: "latin-1"
    )
    async with client.get("/redirect") as resp:
        assert resp.status == 200


INVALID_URL_WITH_ERROR_MESSAGE_YARL_NEW = (
# yarl.URL.__new__ raises ValueError
("http://:/", "http://:/"),
Expand Down
Loading