Skip to content

Commit b69fdfc

Browse files
Add lookup tests and fix case-insensitivity for Tangut ideographs.
1 parent 2ae016c commit b69fdfc

2 files changed

Lines changed: 30 additions & 5 deletions

File tree

Lib/test/test_ucn.py

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -111,6 +111,30 @@ def test_cjk_unified_ideographs(self):
111111
self.checkletter("cjK UniFIeD idEogRAph-2aBcD", "\U0002abcd")
112112
self.checkletter("CJk uNIfiEd IDeOGraPH-2AbCd", "\U0002abcd")
113113

114+
def test_tangut_ideographs(self):
115+
self.checkletter("TANGUT IDEOGRAPH-17000", "\U00017000")
116+
self.checkletter("TANGUT IDEOGRAPH-187FF", "\U000187ff")
117+
self.checkletter("TANGUT IDEOGRAPH-18D00", "\U00018D00")
118+
self.checkletter("TANGUT IDEOGRAPH-18D1E", "\U00018d1e")
119+
self.checkletter("tangut ideograph-18d1e", "\U00018d1e")
120+
121+
def test_egyptian_hieroglyphs(self):
122+
self.checkletter("EGYPTIAN HIEROGLYPH-13460", "\U00013460")
123+
self.checkletter("EGYPTIAN HIEROGLYPH-143FA", "\U000143fa")
124+
self.checkletter("egyptian hieroglyph-143fa", "\U000143fa")
125+
126+
def test_khitan_small_script_characters(self):
127+
self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18B00", "\U00018b00")
128+
self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18CD5", "\U00018cd5")
129+
self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18CFF", "\U00018cff")
130+
self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18CFF", "\U00018cff")
131+
self.checkletter("khitan small script character-18cff", "\U00018cff")
132+
133+
def test_nushu_characters(self):
134+
self.checkletter("NUSHU CHARACTER-1B170", "\U0001b170")
135+
self.checkletter("NUSHU CHARACTER-1B2FB", "\U0001b2fb")
136+
self.checkletter("nushu character-1b2fb", "\U0001b2fb")
137+
114 138
def test_bmp_characters(self):
115 139
for code in range(0x10000):
116 140
char = chr(code)

Modules/unicodedata.c

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1498,7 +1498,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
14981498
}
14991499

15001500
/* Check for Tangut ideographs. */
1501-
if (strncmp(name, "TANGUT IDEOGRAPH-", 17) == 0) {
1501+
if (PyOS_strnicmp(name, "TANGUT IDEOGRAPH-", 17) == 0) {
15021502
/* Five hexdigits must follow. */
15031503
unsigned int v = 0;
15041504
name += 17;
@@ -1507,10 +1507,11 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
15071507
return 0;
15081508
while (namelen--) {
15091509
v *= 16;
1510-
if (*name >= '0' && *name <= '9')
1511-
v += *name - '0';
1512-
else if (*name >= 'A' && *name <= 'F')
1513-
v += *name - 'A' + 10;
1510+
Py_UCS1 c = Py_TOUPPER(*name);
1511+
if (c >= '0' && c <= '9')
1512+
v += c - '0';
1513+
else if (c >= 'A' && c <= 'F')
1514+
v += c - 'A' + 10;
1514 1515
else
1515 1516
return 0;
1516 1517
name++;

0 commit comments

Comments
 (0)