Skip to content

Commit 2821af0

Browse files
gh-148792: Add support for locales with @-modifiers on Windows
locale.setlocale() now supports Unix-like locale names with @-modifiers on Windows. For example: "ca_ES@valencia", "sr_RS@latin", "uz_UZ@cyrillic" and "ks_IN@devanagari".
1 parent bfe6f9f commit 2821af0

File tree

3 files changed

+94
-54
lines changed

3 files changed

+94
-54
lines changed

Doc/whatsnew/3.15.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -895,7 +895,7 @@ locale
895895
* :func:`~locale.setlocale` now supports language codes with ``@``-modifiers.
896896
``@``-modifiers are no longer silently removed in :func:`~locale.getlocale`,
897897
but included in the language code.
898-
(Contributed by Serhiy Storchaka in :gh:`137729`.)
898+
(Contributed by Serhiy Storchaka in :gh:`137729` and :gh:`148792`.)
899899

900900
* Undeprecate the :func:`locale.getdefaultlocale` function.
901901
(Contributed by Victor Stinner in :gh:`130796`.)

Lib/locale.py

Lines changed: 90 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
1111
"""
1212

13+
import os
1314
import sys
1415
import encodings
1516
import encodings.aliases
@@ -468,6 +469,30 @@ def normalize(localename):
468469

469470
return localename
470471

472+
def _conv_to_windows(locale):
    """Rewrite a Unix-like locale name in the hyphenated Windows style.

    Every underscore in *locale* is first replaced with a hyphen.  If the
    result carries an ``@modifier``, the name is split into language,
    territory, encoding and modifier, and reassembled as follows:

    * the modifier is lower-cased;
    * ``valencia`` is appended after the territory as ``-valencia``;
    * any other non-empty modifier is looked up in ``_modifier_to_script``
      (falling back to the title-cased modifier) and inserted between
      the language and the territory;
    * the encoding, if any, is re-attached at the end after a ``.``.

    For example, ``ca_ES@valencia`` becomes ``ca-ES-valencia`` and
    ``sr_RS.UTF-8@latin`` becomes ``sr-Latn-RS.UTF-8``.

    Names without ``@`` are returned with only the underscore
    replacement applied.
    """
    name = locale.replace('_', '-')
    if '@' not in name:
        # Nothing to translate beyond the separator.
        return name

    base, modifier = name.split('@', 1)
    base, _, codeset = base.partition('.')
    language, _, region = base.partition('-')
    modifier = modifier.lower()

    # Hyphen-separated components: language[-script][-territory].
    components = [language]
    trailer = ''
    if modifier == 'valencia':
        # Kept as a trailing variant rather than treated as a script.
        trailer = '-' + modifier
    elif modifier:
        if modifier in _modifier_to_script:
            script = _modifier_to_script[modifier]
        else:
            # Unknown modifiers are passed through title-cased
            # (e.g. 'hans' -> 'Hans').
            script = modifier.title()
        components.append(script)
    if region:
        components.append(region)

    converted = '-'.join(components) + trailer
    if codeset:
        converted += '.' + codeset
    return converted
495+
471496
def _parse_localename(localename):
472497

473498
""" Parses the locale code for localename and returns the
@@ -621,6 +646,8 @@ def setlocale(category, locale=None):
621646
if locale and not isinstance(locale, _builtin_str):
622647
# convert to string
623648
locale = normalize(_build_localename(locale))
649+
if os.name == 'nt':
650+
locale = _conv_to_windows(locale)
624651
return _setlocale(category, locale)
625652

626653

@@ -1546,9 +1573,9 @@ def getpreferredencoding(do_setlocale=True):
15461573
0x004d: "as", # Assamese
15471574
0x044d: "as_IN", # Assamese - India
15481575
0x002c: "az", # Azerbaijani (Latin)
1549-
0x742c: "az", # Azerbaijani (Cyrillic)
1550-
0x782c: "az", # Azerbaijani (Latin)
1551-
0x042c: "az_AZ", # Azerbaijani (Latin) - Azerbaijan
1576+
0x742c: "az@cyrillic", # Azerbaijani (Cyrillic)
1577+
0x782c: "az@latin", # Azerbaijani (Latin)
1578+
0x042c: "az_AZ@latin", # Azerbaijani (Latin) - Azerbaijan
15521579
0x0045: "bn", # Bangla
15531580
0x0445: "bn_IN", # Bangla - India
15541581
0x0845: "bn_BD", # Bangla - Bangladesh
@@ -1558,10 +1585,10 @@ def getpreferredencoding(do_setlocale=True):
15581585
0x042d: "eu_ES", # Basque - Spain
15591586
0x0023: "be", # Belarusian
15601587
0x0423: "be_BY", # Belarusian - Belarus
1561-
0x641a: "bs", # Bosnian (Cyrillic)
1562-
0x681a: "bs", # Bosnian (Latin)
1563-
0x141a: "bs_BA", # Bosnian (Latin) - Bosnia and Herzegovina
1564-
0x201a: "bs_BA", # Bosnian (Cyrillic) - Bosnia and Herzegovina
1588+
0x641a: "bs@cyrillic", # Bosnian (Cyrillic)
1589+
0x681a: "bs@latin", # Bosnian (Latin)
1590+
0x141a: "bs_BA@latin", # Bosnian (Latin) - Bosnia and Herzegovina
1591+
0x201a: "bs_BA@cyrillic", # Bosnian (Cyrillic) - Bosnia and Herzegovina
15651592
0x781a: "bs", # Bosnian (Latin)
15661593
0x007e: "br", # Breton
15671594
0x047e: "br_FR", # Breton - France
@@ -1571,16 +1598,16 @@ def getpreferredencoding(do_setlocale=True):
15711598
0x0455: "my_MM", # Burmese - Myanmar
15721599
0x0003: "ca", # Catalan
15731600
0x0403: "ca_ES", # Catalan - Spain
1574-
0x0803: "ca_ES", # Valencian - Spain
1601+
0x0803: "ca_ES@valencia", # Valencian - Spain
15751602
0x0092: "ku", # Central Kurdish
1576-
0x7c92: "ku", # Central Kurdish
1577-
0x0492: "ku_IQ", # Central Kurdish - Iraq
1603+
0x7c92: "ku@arabic", # Central Kurdish
1604+
0x0492: "ku_IQ@arabic", # Central Kurdish - Iraq
15781605
0x005c: "chr", # Cherokee
1579-
0x7c5c: "chr", # Cherokee
1580-
0x045c: "chr_US", # Cherokee - United States
1581-
0x0004: "zh", # Chinese (Simplified)
1606+
0x7c5c: "chr@Cher", # Cherokee
1607+
0x045c: "chr_US@Cher", # Cherokee - United States
1608+
0x0004: "zh@Hans", # Chinese (Simplified)
15821609
0x7804: "zh", # Chinese (Simplified)
1583-
0x7c04: "zh", # Chinese (Traditional)
1610+
0x7c04: "zh@Hant", # Chinese (Traditional)
15841611
0x0404: "zh_TW", # Chinese (Traditional) - Taiwan
15851612
0x0804: "zh_CN", # Chinese (Simplified) - People's Republic of China
15861613
0x0c04: "zh_HK", # Chinese (Traditional) - Hong Kong S.A.R.
@@ -1648,9 +1675,9 @@ def getpreferredencoding(do_setlocale=True):
16481675
0x0062: "fy", # Frisian
16491676
0x0462: "fy_NL", # Frisian - Netherlands
16501677
0x0067: "ff", # Fulah
1651-
0x7c67: "ff", # Fulah (Latin)
1652-
0x0467: "ff_NG",
1653-
0x0867: "ff_SN", # Fulah - Senegal
1678+
0x7c67: "ff@latin", # Fulah (Latin)
1679+
0x0467: "ff_NG@latin",
1680+
0x0867: "ff_SN@latin", # Fulah - Senegal
16541681
0x0056: "gl", # Galician
16551682
0x0456: "gl_ES", # Galician - Spain
16561683
0x0037: "ka", # Georgian
@@ -1670,8 +1697,8 @@ def getpreferredencoding(do_setlocale=True):
16701697
0x0047: "gu", # Gujarati
16711698
0x0447: "gu_IN", # Gujarati - India
16721699
0x0068: "ha", # Hausa (Latin)
1673-
0x7c68: "ha", # Hausa (Latin)
1674-
0x0468: "ha_NG", # Hausa (Latin) - Nigeria
1700+
0x7c68: "ha@latin", # Hausa (Latin)
1701+
0x0468: "ha_NG@latin", # Hausa (Latin) - Nigeria
16751702
0x0075: "haw", # Hawaiian
16761703
0x0475: "haw_US", # Hawaiian - United States
16771704
0x000d: "he", # Hebrew
@@ -1687,10 +1714,10 @@ def getpreferredencoding(do_setlocale=True):
16871714
0x0021: "id", # Indonesian
16881715
0x0421: "id_ID", # Indonesian - Indonesia
16891716
0x005d: "iu", # Inuktitut (Latin)
1690-
0x785d: "iu", # Inuktitut (Syllabics)
1691-
0x7c5d: "iu", # Inuktitut (Latin)
1692-
0x045d: "iu_CA", # Inuktitut (Syllabics) - Canada
1693-
0x085d: "iu_CA", # Inuktitut (Latin) - Canada
1717+
0x785d: "iu@Cans", # Inuktitut (Syllabics)
1718+
0x7c5d: "iu@latin", # Inuktitut (Latin)
1719+
0x045d: "iu_CA@Cans", # Inuktitut (Syllabics) - Canada
1720+
0x085d: "iu_CA@latin", # Inuktitut (Latin) - Canada
16941721
0x003c: "ga", # Irish
16951722
0x083c: "ga_IE", # Irish - Ireland
16961723
0x0010: "it", # Italian
@@ -1700,10 +1727,10 @@ def getpreferredencoding(do_setlocale=True):
17001727
0x0411: "ja_JP", # Japanese - Japan
17011728
0x004b: "kn", # Kannada
17021729
0x044b: "kn_IN", # Kannada - India
1703-
0x0471: "kr_NG", # Kanuri (Latin) - Nigeria
1730+
0x0471: "kr_NG@latin", # Kanuri (Latin) - Nigeria
17041731
0x0060: "ks", # Kashmiri
1705-
0x0460: "ks", # Kashmiri - Perso_Arabic
1706-
0x0860: "ks_IN", # Kashmiri (Devanagari) - India
1732+
0x0460: "ks@arabic", # Kashmiri - Perso_Arabic
1733+
0x0860: "ks_IN@devanagari", # Kashmiri (Devanagari) - India
17071734
0x003f: "kk", # Kazakh
17081735
0x043f: "kk_KZ", # Kazakh - Kazakhstan
17091736
0x0053: "km", # Khmer
@@ -1747,10 +1774,10 @@ def getpreferredencoding(do_setlocale=True):
17471774
0x007c: "moh", # Mohawk
17481775
0x047c: "moh_CA", # Mohawk - Canada
17491776
0x0050: "mn", # Mongolian (Cyrillic)
1750-
0x7850: "mn", # Mongolian (Cyrillic)
1751-
0x7c50: "mn", # Mongolian (Traditional Mongolian)
1777+
0x7850: "mn@cyrillic", # Mongolian (Cyrillic)
1778+
0x7c50: "mn@Mong", # Mongolian (Traditional Mongolian)
17521779
0x0450: "mn_MN", # Mongolian (Cyrillic) - Mongolia
1753-
0x0c50: "mn_MN", # Mongolian (Traditional Mongolian) - Mongolia
1780+
0x0c50: "mn_MN@Mong", # Mongolian (Traditional Mongolian) - Mongolia
17541781
0x0061: "ne", # Nepali
17551782
0x0461: "ne_NP", # Nepali - Nepal
17561783
0x0861: "ne_IN", # Nepali - India
@@ -1775,9 +1802,9 @@ def getpreferredencoding(do_setlocale=True):
17751802
0x0416: "pt_BR", # Portuguese - Brazil
17761803
0x0816: "pt_PT", # Portuguese - Portugal
17771804
0x0046: "pa", # Punjabi
1778-
0x7c46: "pa", # Punjabi
1805+
0x7c46: "pa@arabic", # Punjabi
17791806
0x0446: "pa_IN", # Punjabi - India
1780-
0x0846: "pa_PK", # Punjabi - Islamic Republic of Pakistan
1807+
0x0846: "pa_PK@arabic", # Punjabi - Islamic Republic of Pakistan
17811808
0x006b: "quz", # Quechua
17821809
0x046b: "quz_BO", # Quechua - Bolivia
17831810
0x086b: "quz_EC", # Quechua - Ecuador
@@ -1810,25 +1837,25 @@ def getpreferredencoding(do_setlocale=True):
18101837
0x044f: "sa_IN", # Sanskrit - India
18111838
0x0091: "gd", # Scottish Gaelic
18121839
0x0491: "gd_GB", # Scottish Gaelic - United Kingdom
1813-
0x6c1a: "sr", # Serbian (Cyrillic)
1814-
0x701a: "sr", # Serbian (Latin)
1840+
0x6c1a: "sr@cyrillic", # Serbian (Cyrillic)
1841+
0x701a: "sr@latin", # Serbian (Latin)
18151842
0x7c1a: "sr", # Serbian (Latin)
1816-
0x081a: "sr_CS", # Serbian (Latin) - Serbia and Montenegro (Former)
1817-
0x0c1a: "sr_CS", # Serbian (Cyrillic) - Serbia and Montenegro (Former)
1818-
0x181a: "sr_BA", # Serbian (Latin) - Bosnia and Herzegovina
1819-
0x1c1a: "sr_BA", # Serbian (Cyrillic) - Bosnia and Herzegovina
1820-
0x241a: "sr_RS", # Serbian (Latin) - Serbia
1821-
0x281a: "sr_RS", # Serbian (Cyrillic) - Serbia
1822-
0x2c1a: "sr_ME", # Serbian (Latin) - Montenegro
1823-
0x301a: "sr_ME", # Serbian (Cyrillic) - Montenegro
1843+
0x081a: "sr_CS@latin", # Serbian (Latin) - Serbia and Montenegro (Former)
1844+
0x0c1a: "sr_CS@cyrillic", # Serbian (Cyrillic) - Serbia and Montenegro (Former)
1845+
0x181a: "sr_BA@latin", # Serbian (Latin) - Bosnia and Herzegovina
1846+
0x1c1a: "sr_BA@cyrillic", # Serbian (Cyrillic) - Bosnia and Herzegovina
1847+
0x241a: "sr_RS@latin", # Serbian (Latin) - Serbia
1848+
0x281a: "sr_RS@cyrillic", # Serbian (Cyrillic) - Serbia
1849+
0x2c1a: "sr_ME@latin", # Serbian (Latin) - Montenegro
1850+
0x301a: "sr_ME@cyrillic", # Serbian (Cyrillic) - Montenegro
18241851
0x006c: "nso", # Sesotho sa Leboa
18251852
0x046c: "nso_ZA", # Sesotho sa Leboa - South Africa
18261853
0x0032: "tn", # Setswana
18271854
0x0432: "tn_ZA", # Setswana - South Africa
18281855
0x0832: "tn_BW", # Setswana - Botswana
18291856
0x0059: "sd", # Sindhi
1830-
0x7c59: "sd", # Sindhi
1831-
0x0859: "sd_PK", # Sindhi - Islamic Republic of Pakistan
1857+
0x7c59: "sd@arabic", # Sindhi
1858+
0x0859: "sd_PK@arabic", # Sindhi - Islamic Republic of Pakistan
18321859
0x005b: "si", # Sinhala
18331860
0x045b: "si_LK", # Sinhala - Sri Lanka
18341861
0x001b: "sk", # Slovak
@@ -1867,14 +1894,14 @@ def getpreferredencoding(do_setlocale=True):
18671894
0x005a: "syr", # Syriac
18681895
0x045a: "syr_SY", # Syriac - Syria
18691896
0x0028: "tg", # Tajik (Cyrillic)
1870-
0x7c28: "tg", # Tajik (Cyrillic)
1871-
0x0428: "tg_TJ", # Tajik (Cyrillic) - Tajikistan
1897+
0x7c28: "tg@cyrillic", # Tajik (Cyrillic)
1898+
0x0428: "tg_TJ@cyrillic", # Tajik (Cyrillic) - Tajikistan
18721899
0x005f: "tzm", # Tamazight (Latin)
1873-
0x785f: "tzm",
1874-
0x7c5f: "tzm", # Tamazight (Latin)
1875-
0x085f: "tzm_DZ", # Tamazight (Latin) - Algeria
1876-
0x045f: "tzm_MA", # Central Atlas Tamazight (Arabic) - Morocco
1877-
0x105f: "tzm_MA",
1900+
0x785f: "tzm@Tfng",
1901+
0x7c5f: "tzm@latin", # Tamazight (Latin)
1902+
0x085f: "tzm_DZ@latin", # Tamazight (Latin) - Algeria
1903+
0x045f: "tzm_MA@arabic", # Central Atlas Tamazight (Arabic) - Morocco
1904+
0x105f: "tzm_MA@Tfng",
18781905
0x0049: "ta", # Tamil
18791906
0x0449: "ta_IN", # Tamil - India
18801907
0x0849: "ta_LK", # Tamil - Sri Lanka
@@ -1905,9 +1932,9 @@ def getpreferredencoding(do_setlocale=True):
19051932
0x0080: "ug", # Uyghur
19061933
0x0480: "ug_CN", # Uyghur - People's Republic of China
19071934
0x0043: "uz", # Uzbek (Latin)
1908-
0x7843: "uz", # Uzbek (Cyrillic)
1909-
0x7c43: "uz", # Uzbek (Latin)
1910-
0x0443: "uz_UZ", # Uzbek (Latin) - Uzbekistan
1935+
0x7843: "uz@cyrillic", # Uzbek (Cyrillic)
1936+
0x7c43: "uz@latin", # Uzbek (Latin)
1937+
0x0443: "uz_UZ@latin", # Uzbek (Latin) - Uzbekistan
19111938
0x0033: "ve", # Venda
19121939
0x0433: "ve_ZA", # Venda - South Africa
19131940
0x002a: "vi", # Vietnamese
@@ -1943,6 +1970,16 @@ def getpreferredencoding(do_setlocale=True):
19431970
0x00051004: "zh_SG",
19441971
}
19451972

1973+
# Translation table from Unix-like locale modifiers (lower case) to
# ISO 15924 four-letter script codes.
# https://www.unicode.org/iso15924/iso15924.txt

_modifier_to_script = dict(
    arabic='Arab',
    cyrillic='Cyrl',
    devanagari='Deva',
    latin='Latn',
)
1982+
19461983
def _print_locale():
19471984

19481985
""" Test function.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
:func:`locale.setlocale` now supports Unix-like locale names with
2+
@-modifiers on Windows. For example: "ca_ES@valencia", "sr_RS@latin",
3+
"uz_UZ@cyrillic" and "ks_IN@devanagari".

0 commit comments

Comments
 (0)