diff --git a/Doc/library/email.policy.rst b/Doc/library/email.policy.rst index 8f6e4218c97b38..816d02d86f4fc4 100644 --- a/Doc/library/email.policy.rst +++ b/Doc/library/email.policy.rst @@ -403,11 +403,26 @@ added matters. To illustrate:: .. attribute:: utf8 If ``False``, follow :rfc:`5322`, supporting non-ASCII characters in - headers by encoding them as "encoded words". If ``True``, follow - :rfc:`6532` and use ``utf-8`` encoding for headers. Messages + headers by encoding them as :rfc:`2047` "encoded words". If ``True``, + follow :rfc:`6532` and use ``utf-8`` encoding for headers. Messages formatted in this way may be passed to SMTP servers that support the ``SMTPUTF8`` extension (:rfc:`6531`). + When ``False``, the generator will raise + :exc:`~email.errors.HeaderWriteError` if any header includes non-ASCII + characters in a context where :rfc:`2047` does not permit encoded words. + This particularly applies to mailboxes ("addr-spec") with non-ASCII + characters, which can be created via + :class:`~email.headerregistry.Address`. To use a mailbox with a non-ASCII + domain name with ``utf8=False``, first encode the domain using the + third-party :pypi:`idna` or :pypi:`uts46` module or with + :mod:`encodings.idna`. It is not possible to use a non-ASCII username + ("local-part") in a mailbox when ``utf8=False``. + + .. versionchanged:: 3.15 + Can trigger the raising of :exc:`~email.errors.HeaderWriteError`. + (Earlier versions incorrectly applied :rfc:`2047` in certain contexts, + mostly notably in addr-specs.) .. attribute:: refold_source diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 90d24bf96afeb4..2de824744cedb8 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -914,6 +914,16 @@ faulthandler (Contributed by Eric Froemling in :gh:`149085`.) +email +----- + +* Email generators now raise an error when an :class:`.EmailMessage` cannot be + accurately flattened due to a non-ASCII email address (mailbox) in an address + header. Options for supporting Email Address Internationalization (EAI) are + discussed in :attr:`.EmailPolicy.utf8`. + (Contributed by R David Murray and Mike Edmunds in :gh:`122540`.) + + functools --------- diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 4c5394ab6353ac..43216b0af84326 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -157,10 +157,7 @@ def all_defects(self): def startswith_fws(self): return self[0].startswith_fws() - @property - def as_ew_allowed(self): - """True if all top level tokens of this part may be RFC2047 encoded.""" - return all(part.as_ew_allowed for part in self) + as_ew_allowed = True @property def comments(self): @@ -429,6 +426,7 @@ def addr_spec(self): class AngleAddr(TokenList): token_type = 'angle-addr' + as_ew_allowed = False @property def local_part(self): @@ -847,26 +845,22 @@ def params(self): class ContentType(ParameterizedHeaderValue): token_type = 'content-type' - as_ew_allowed = False maintype = 'text' subtype = 'plain' class ContentDisposition(ParameterizedHeaderValue): token_type = 'content-disposition' - as_ew_allowed = False content_disposition = None class ContentTransferEncoding(TokenList): token_type = 'content-transfer-encoding' - as_ew_allowed = False cte = '7bit' class HeaderLabel(TokenList): token_type = 'header-label' - as_ew_allowed = False class MsgID(TokenList): @@ -2838,13 +2832,68 @@ def _steal_trailing_WSP_if_exists(lines): def _refold_parse_tree(parse_tree, *, policy): - """Return string of contents of parse_tree folded according to RFC rules. - - """ # max_line_length 0/None means no limit, ie: infinitely long. maxlen = policy.max_line_length or sys.maxsize encoding = 'utf-8' if policy.utf8 else 'us-ascii' lines = [''] # Folded lines to be output + if parse_tree.as_ew_allowed: + _refold_with_ew(parse_tree, lines, maxlen, encoding, policy=policy) + else: + _refold_without_ew(parse_tree, lines, maxlen, encoding, policy=policy) + return policy.linesep.join(lines) + policy.linesep + +def _refold_without_ew(parse_tree, lines, maxlen, encoding, *, policy): + parts = list(parse_tree) + while parts: + part = parts.pop(0) + tstr = str(part) + try: + tstr.encode(encoding) + except UnicodeEncodeError: + if any(isinstance(x, errors.UndecodableBytesDefect) + for x in part.all_defects): + # There is garbage data from parsing a message in binary mode, + # just pass it through. Not good, but the best we can do. + pass + elif policy.utf8: + # If this happens, it's a programmer error. + raise + else: + raise errors.HeaderWriteError( + f"Non-ASCII {part.token_type} '{part}' is invalid" + " under current policy setting (utf8=False)" + ) + if len(tstr) <= maxlen - len(lines[-1]): + lines[-1] += tstr + continue + # This part is too long to fit. The RFC wants us to break at + # "major syntactic breaks", so unless we don't consider this + # to be one, check if it will fit on the next line by itself. + if (part.syntactic_break and + len(tstr) + 1 <= maxlen): + newline = _steal_trailing_WSP_if_exists(lines) + if newline or part.startswith_fws(): + lines.append(newline + tstr) + continue + if not hasattr(part, 'encode'): + # It's not a terminal, try folding the subparts. + newparts = list(part) + parts = newparts + parts + continue + # We can't figure out how to wrap, it, so give up. + newline = _steal_trailing_WSP_if_exists(lines) + if newline or part.startswith_fws(): + lines.append(newline + tstr) + else: + # We can't fold it onto the next line either... + lines[-1] += tstr + return + + +def _refold_with_ew(parse_tree, lines, maxlen, encoding, *, policy): + """Return string of contents of parse_tree folded according to RFC rules. + + """ last_word_is_ew = False last_ew = None # if there is an encoded word in the last line of lines, # points to the encoded word's first character @@ -2858,6 +2907,11 @@ def _refold_parse_tree(parse_tree, *, policy): if part is end_ew_not_allowed: wrap_as_ew_blocked -= 1 continue + if part.token_type == 'mime-parameters': + # Mime parameter folding (using RFC2231) is extra special. + _fold_mime_parameters(part, lines, maxlen, encoding) + last_word_is_ew = False + continue tstr = str(part) if not want_encoding: if part.token_type in ('ptext', 'vtext'): @@ -2879,14 +2933,11 @@ def _refold_parse_tree(parse_tree, *, policy): charset = 'utf-8' want_encoding = True - if part.token_type == 'mime-parameters': - # Mime parameter folding (using RFC2231) is extra special. - _fold_mime_parameters(part, lines, maxlen, encoding) - last_word_is_ew = False - continue - if want_encoding and not wrap_as_ew_blocked: - if not part.as_ew_allowed: + if any( + not x.as_ew_allowed for x in part + if hasattr(x, 'as_ew_allowed') + ): want_encoding = False last_ew = None if part.syntactic_break: @@ -2967,6 +3018,8 @@ def _refold_parse_tree(parse_tree, *, policy): [ValueTerminal(make_quoted_pairs(p), 'ptext') for p in newparts] + [ValueTerminal('"', 'ptext')]) + _refold_without_ew(newparts, lines, maxlen, encoding, policy=policy) + continue if part.token_type == 'comment': newparts = ( [ValueTerminal('(', 'ptext')] + @@ -2994,7 +3047,7 @@ def _refold_parse_tree(parse_tree, *, policy): lines[-1] += tstr last_word_is_ew = last_word_is_ew and not bool(tstr.strip(_WSP)) - return policy.linesep.join(lines) + policy.linesep + return def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, last_word_is_ew): """Fold string to_encode into lines as encoded word, combining if allowed. diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index e28fe3892015b9..f8f5c41b4474c8 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -3364,10 +3364,12 @@ def test_fold_unfoldable_element_stealing_whitespace(self): self._test(token, expected, policy=policy) def test_encoded_word_with_undecodable_bytes(self): - self._test(parser.get_address_list( - ' =?utf-8?Q?=E5=AE=A2=E6=88=B6=E6=AD=A3=E8=A6=8F=E4=BA=A4=E7?=' + self._test( + parser.get_address_list( + ' =?utf-8?Q?=E5=AE=A2=E6=88=B6=E6=AD=A3=E8=A6=8F=E4=BA=A4=E7?=' + ' ' )[0], - ' =?unknown-8bit?b?5a6i5oi25q2j6KaP5Lqk5w==?=\n', + ' =?unknown-8bit?b?5a6i5oi25q2j6KaP5Lqk5w==?= \n', ) diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py index 3c9a86f3e8cf29..8d912738029f78 100644 --- a/Lib/test/test_email/test_generator.py +++ b/Lib/test/test_email/test_generator.py @@ -1,4 +1,5 @@ import io +import re import textwrap import unittest import random @@ -295,6 +296,69 @@ def test_keep_long_encoded_newlines(self): g.flatten(msg) self.assertEqual(s.getvalue(), self.typ(expected)) + def test_non_ascii_addr_spec_raises(self): + # non-ascii is not permitted in any part of an addr-spec. If the + # programmer generated it, it's an error. (See also + # test_non_ascii_addr_spec_preserved below.) + p = self.policy.clone(utf8=False, max_line_length=20) + g = self.genclass(self.ioclass(), policy=p) + # XXX The particular part detected here isn't part of a behavioral + # spec and may change in the future. + cases = [ + ('wők@example.com', 'wők', 'local-part'), + ('wok@exàmple.com', 'exàmple.com', 'domain'), + ('wők@exàmple.com', 'wők', 'local-part'), + ( + '"Name, for display" ', + 'wők@example.com', + 'addr-spec', + ), + ( + 'Näyttönimi ', + 'wők@example.com', + 'addr-spec', + ), + ( + '"a lőng quoted string as the local part"@example.com', + 'a lőng quoted string as the local part', + 'local-part', + ), + + ] + for address, badtoken, partname in cases: + with self.subTest(address=address): + msg = EmailMessage() + msg['To'] = address + expected_error = ( + fr"(?i)(?=.*non-ascii)" + fr"(?=.*{re.escape(badtoken)})" + fr"(?=.*{partname})" + fr"(?=.*policy.*utf8)" + ) + with self.assertRaisesRegex( + email.errors.HeaderWriteError, expected_error + ): + g.flatten(msg) + + def test_local_part_quoted_string_wrapped_correctly(self): + msg = self.msgmaker(self.typ(textwrap.dedent("""\ + To: <"a long local part in a quoted string"@example.com> + Subject: test + + None + """)), policy=self.policy.clone(max_line_length=20)) + expected = textwrap.dedent("""\ + To: <"a long local part in a + quoted string"@example.com> + Subject: test + + None + """) + s = self.ioclass() + g = self.genclass(s, policy=self.policy.clone(max_line_length=30)) + g.flatten(msg) + self.assertEqual(s.getvalue(), self.typ(expected)) + def _test_boundary_detection(self, linesep): # Generate a boundary token in the same way as _make_boundary token = random.randrange(sys.maxsize) @@ -515,12 +579,12 @@ def test_cte_type_7bit_transforms_8bit_cte(self): def test_smtputf8_policy(self): msg = EmailMessage() - msg['From'] = "Páolo " + msg['From'] = "Páolo " msg['To'] = 'Dinsdale' msg['Subject'] = 'Nudge nudge, wink, wink \u1F609' msg.set_content("oh là là, know what I mean, know what I mean?") expected = textwrap.dedent("""\ - From: Páolo + From: Páolo To: Dinsdale Subject: Nudge nudge, wink, wink \u1F609 Content-Type: text/plain; charset="utf-8" @@ -555,6 +619,37 @@ def test_smtp_policy(self): g.flatten(msg) self.assertEqual(s.getvalue(), expected) + def test_non_ascii_addr_spec_preserved(self): + # A defective non-ASCII addr-spec parsed from the original + # message is left unchanged when flattening. + # (See also test_non_ascii_addr_spec_raises above.) + source = ( + 'To: jörg@example.com, "But a long name still works with refold_source" ' + ).encode() + expected = ( + b'To: j\xc3\xb6rg@example.com,\n' + b' "But a long name still works with refold_source" \n' + b'\n' + ) + msg = message_from_bytes(source, policy=policy.default) + s = io.BytesIO() + g = BytesGenerator(s, policy=policy.default) + g.flatten(msg) + self.assertEqual(s.getvalue(), expected) + + def test_idna_encoding_preserved(self): + # Nothing tries to decode a pre-encoded IDNA domain. + msg = EmailMessage() + msg["To"] = Address( + username='jörg', + domain='☕.example'.encode('idna').decode() # IDNA 2003 + ) + expected = 'To: jörg@xn--53h.example\n\n'.encode() + s = io.BytesIO() + g = BytesGenerator(s, policy=policy.default.clone(utf8=True)) + g.flatten(msg) + self.assertEqual(s.getvalue(), expected) + if __name__ == '__main__': unittest.main() diff --git a/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst b/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst new file mode 100644 index 00000000000000..7082c72f685b05 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst @@ -0,0 +1,8 @@ +The :mod:`email` module no longer incorrectly uses :rfc:`2047` encoding for +a mailbox with non-ASCII characters in its domain. Under a policy with +:attr:`~email.policy.EmailPolicy.utf8` set ``False``, attempting to serialize +such a message will now raise an :exc:`~email.errors.HeaderWriteError`. +Either apply an appropriate IDNA encoding to convert the domain to ASCII before +serialization, or use :data:`email.policy.SMTPUTF8` (or another policy with +``utf8=True``) to correctly pass through the internationalized domain name +as Unicode characters. diff --git a/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst b/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst new file mode 100644 index 00000000000000..29c076d3a746c6 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst @@ -0,0 +1,7 @@ +The :mod:`email` module no longer incorrectly uses :rfc:`2047` encoding for +a mailbox with non-ASCII characters in its local-part. Under a policy with +:attr:`~email.policy.EmailPolicy.utf8` set ``False``, attempting to serialize +such a message will now raise an :exc:`~email.errors.HeaderWriteError`. +There is no valid 7-bit encoding for an internationalized local-part. Use +:data:`email.policy.SMTPUTF8` (or another policy with ``utf8=True``) to +correctly pass through the local-part as Unicode characters.