@@ -1031,7 +1031,7 @@ static const char * const hangul_syllables[][3] = {
10311031
10321032/* These ranges need to match makeunicodedata.py:cjk_ranges. */
10331033static int
1034- is_unified_ideograph (Py_UCS4 code )
1034+ is_cjk_unified_ideograph (Py_UCS4 code )
10351035{
10361036 return
10371037 (0x3400 <= code && code <= 0x4DBF ) || /* CJK Ideograph Extension A */
@@ -1045,6 +1045,15 @@ is_unified_ideograph(Py_UCS4 code)
10451045 (0x31350 <= code && code <= 0x323AF ); /* CJK Ideograph Extension H */
10461046}
10471047
1048+ /* These ranges need to match makeunicodedata.py:tangut_ranges. */
1049+ static int
1050+ is_tangut_ideograph (Py_UCS4 code )
1051+ {
1052+ return
1053+ (0x17000 <= code && code <= 0x187F7 ) || /* Tangut */
1054+ (0x18D00 <= code && code <= 0x18D08 ); /* Tangut Supplement */
1055+ }
1056+
10481057/* macros used to determine if the given code point is in the PUA range that
10491058 * we are using to store aliases and named sequences */
10501059#define IS_ALIAS (cp ) ((cp >= aliases_start) && (cp < aliases_end))
@@ -1104,14 +1113,22 @@ _getucname(PyObject *self,
11041113 return 1 ;
11051114 }
11061115
1107- if (is_unified_ideograph (code )) {
1116+ if (is_cjk_unified_ideograph (code )) {
11081117 if (buflen < 28 )
11091118 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
11101119 return 0 ;
11111120 sprintf (buffer , "CJK UNIFIED IDEOGRAPH-%X" , code );
11121121 return 1 ;
11131122 }
11141123
1124+ if (is_tangut_ideograph (code )) {
1125+ if (buflen < 23 )
1126+ /* Worst case: TANGUT IDEOGRAPH-18D08 */
1127+ return 0 ;
1128+ sprintf (buffer , "TANGUT IDEOGRAPH-%X" , code );
1129+ return 1 ;
1130+ }
1131+
11151132 /* get offset into phrasebook */
11161133 offset = phrasebook_offset1 [(code >>phrasebook_shift )];
11171134 offset = phrasebook_offset2 [(offset <<phrasebook_shift ) +
@@ -1242,7 +1259,7 @@ _getcode(PyObject* self,
12421259 return 0 ;
12431260 }
12441261
1245- /* Check for unified ideographs. */
1262+ /* Check for CJK unified ideographs. */
12461263 if (strncmp (name , "CJK UNIFIED IDEOGRAPH-" , 22 ) == 0 ) {
12471264 /* Four or five hexdigits must follow. */
12481265 v = 0 ;
@@ -1260,12 +1277,38 @@ _getcode(PyObject* self,
12601277 return 0 ;
12611278 name ++ ;
12621279 }
1263- if (!is_unified_ideograph (v ))
1280+ if (!is_cjk_unified_ideograph (v ))
1281+ return 0 ;
1282+ * code = v ;
1283+ return 1 ;
1284+ }
1285+
1286+
1287+ /* Check for Tangut ideographs. */
1288+ if (strncmp (name , "TANGUT IDEOGRAPH-" , 17 ) == 0 ) {
1289+ /* Five hexdigits must follow. */
1290+ v = 0 ;
1291+ name += 17 ;
1292+ namelen -= 17 ;
1293+ if (namelen != 5 )
1294+ return 0 ;
1295+ while (namelen -- ) {
1296+ v *= 16 ;
1297+ if (* name >= '0' && * name <= '9' )
1298+ v += * name - '0' ;
1299+ else if (* name >= 'A' && * name <= 'F' )
1300+ v += * name - 'A' + 10 ;
1301+ else
1302+ return 0 ;
1303+ name ++ ;
1304+ }
1305+ if (!is_tangut_ideograph (v ))
12641306 return 0 ;
12651307 * code = v ;
12661308 return 1 ;
12671309 }
12681310
1311+
12691312 /* the following is the same as python's dictionary lookup, with
12701313 only minor changes. see the makeunicodedata script for more
12711314 details */
0 commit comments