Skip to content

Commit 36cd313

Browse files
committed
Convert more Unicode tables to packtab
Replace the remaining split-bin Unicode lookup tables in the unicodedata path with packtab-generated helpers for:

- decomposition indexes
- NFC composition pairs
- Unicode name inverse codepoint lookup
- legacy 3.2.0 change indexes

Measured on macOS arm64 builds versus clean HEAD:

- python.exe: 6163528 -> 6092632 bytes (-70896, -1.15%)
- unicodedata.so: 772352 -> 673344 bytes (-99008, -12.82%)
- combined shipped: 6935880 -> 6765976 bytes (-169904, -2.45%)
1 parent f302acc commit 36cd313

4 files changed

Lines changed: 5391 additions & 8543 deletions

File tree

Modules/unicodedata.c

Lines changed: 6 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -89,7 +89,7 @@ _getrecord_ex(Py_UCS4 code)
8989
if (code >= 0x110000)
9090
index = 0;
9191
else {
92-
index = unicodedata_get_record_index(code);
92+
index = unicodedata_record_get_record_index(code);
9393
}
9494

9595
return &_PyUnicode_Database_Records[index];
@@ -492,9 +492,7 @@ unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
492492
if (code < 0 || code >= 0x110000)
493493
index = 0;
494494
else {
495-
index = decomp_index1[(code>>DECOMP_SHIFT)];
496-
index = decomp_index2[(index<<DECOMP_SHIFT)+
497-
(code&((1<<DECOMP_SHIFT)-1))];
495+
index = unicodedata_decomp_get_decomp_index(code);
498496
}
499497

500498
/* high byte is number of hex bytes (usually one or two), low byte
@@ -538,9 +536,7 @@ get_decomp_record(PyObject *self, Py_UCS4 code,
538536
*index = 0;
539537
}
540538
else {
541-
*index = decomp_index1[(code>>DECOMP_SHIFT)];
542-
*index = decomp_index2[(*index<<DECOMP_SHIFT)+
543-
(code&((1<<DECOMP_SHIFT)-1))];
539+
*index = unicodedata_decomp_get_decomp_index(code);
544540
}
545541

546542
/* high byte is number of hex bytes (usually one or two), low byte
@@ -710,7 +706,7 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
710706
const void *data;
711707
Py_UCS4 *output;
712708
Py_ssize_t i, i1, o, len;
713-
int f,l,index,index1,comb;
709+
int f,l,index,comb;
714710
Py_UCS4 code;
715711
Py_ssize_t skipped[20];
716712
int cskipped = 0;
@@ -809,9 +805,7 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
809805
continue;
810806
}
811807
index = f*TOTAL_LAST + l;
812-
index1 = comp_index[index >> COMP_SHIFT];
813-
code = comp_data[(index1<<COMP_SHIFT)+
814-
(index&((1<<COMP_SHIFT)-1))];
808+
code = unicodedata_comp_get_comp_data(index);
815809
if (code == 0)
816810
goto not_combinable;
817811

@@ -1395,9 +1389,7 @@ _getucname(PyObject *self,
13951389
}
13961390

13971391
/* get position of codepoint in order of names in the dawg */
1398-
offset = dawg_codepoint_to_pos_index1[(code>>DAWG_CODEPOINT_TO_POS_SHIFT)];
1399-
offset = dawg_codepoint_to_pos_index2[(offset<<DAWG_CODEPOINT_TO_POS_SHIFT) +
1400-
(code&((1<<DAWG_CODEPOINT_TO_POS_SHIFT)-1))];
1392+
offset = unicodename_get_dawg_codepoint_pos(code);
14011393
if (offset == DAWG_CODEPOINT_TO_POS_NOTFOUND)
14021394
return 0;
14031395

0 commit comments

Comments
 (0)