Skip to content

Commit 507cfb0

Browse files
authored
Add librt functionality for lazy deserialization (#21158)
I am going to add actual lazy deserialization in a separate PR, probably next week. It will require one more preparatory PR setting up global state for `lookup_fully_qualified()` that I will probably add on the weekend.
1 parent 4c85ff0 commit 507cfb0

10 files changed

Lines changed: 305 additions & 13 deletions

File tree

.github/workflows/test.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ jobs:
139139
timeout-minutes: 60
140140
env:
141141
TOX_SKIP_MISSING_INTERPRETERS: False
142+
VIRTUALENV_SYSTEM_SITE_PACKAGES: ${{ matrix.test_mypyc && 1 || 0 }}
142143
# Rich (pip) -- Disable color for windows + pytest
143144
FORCE_COLOR: ${{ !(startsWith(matrix.os, 'windows-') && startsWith(matrix.toxenv, 'py')) && 1 || 0 }}
144145
# Tox
@@ -209,8 +210,10 @@ jobs:
209210
210211
- name: Compiled with mypyc
211212
if: ${{ matrix.test_mypyc }}
213+
# Use local version of librt during self-compilation in tests.
212214
run: |
213215
pip install -r test-requirements.txt
216+
pip install -U mypyc/lib-rt
214217
CC=clang MYPYC_OPT_LEVEL=0 MYPY_USE_MYPYC=1 pip install -e .
215218
216219
- name: Setup tox environment

mypy-requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@ typing_extensions>=4.6.0
44
mypy_extensions>=1.0.0
55
pathspec>=1.0.0
66
tomli>=1.1.0; python_version<'3.11'
7-
librt>=0.8.0; platform_python_implementation != 'PyPy'
7+
librt>=0.9.0; platform_python_implementation != 'PyPy'

mypy/cache.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ def read(cls, data: ReadBuffer, data_file: str) -> CacheMeta | None:
239239
# Always use this type alias to refer to type tags.
240240
Tag = u8
241241

242+
# Note: all tags should be kept in sync with lib-rt/internal/librt_internal.c.
242243
# Primitives.
243244
LITERAL_FALSE: Final[Tag] = 0
244245
LITERAL_TRUE: Final[Tag] = 1
@@ -264,6 +265,7 @@ def read(cls, data: ReadBuffer, data_file: str) -> CacheMeta | None:
264265
# Four integers representing source file (line, column) range.
265266
LOCATION: Final[Tag] = 152
266267

268+
RESERVED: Final[Tag] = 254
267269
END_TAG: Final[Tag] = 255
268270

269271

mypy/nodes.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
)
2222

2323
from librt.internal import (
24+
extract_symbol,
2425
read_float as read_float_bare,
2526
read_int as read_int_bare,
2627
read_str as read_str_bare,
@@ -4953,8 +4954,15 @@ def read(cls, data: ReadBuffer) -> SymbolTableNode:
49534954
sym.plugin_generated = read_bool(data)
49544955
cross_ref = read_str_opt(data)
49554956
if cross_ref is None:
4956-
sym._node = read_symbol(data)
4957-
if not isinstance(sym._node, TypeInfo):
4957+
tag = read_tag(data)
4958+
if tag == TYPE_INFO:
4959+
sym._node = TypeInfo.read(data)
4960+
else:
4961+
# This logic is temporary, to make sure we don't introduce
4962+
# regressions until we have proper lazy deserialization.
4963+
# It has negligible performance impact.
4964+
node_bytes = extract_symbol(data)
4965+
sym._node = read_symbol(ReadBuffer(node_bytes), tag)
49584966
sym.unfixed = True
49594967
else:
49604968
sym.cross_ref = cross_ref
@@ -5374,17 +5382,14 @@ def set_info(node: SymbolNode, info: TypeInfo) -> None:
53745382
TSTRING_EXPR: Final[Tag] = 229
53755383

53765384

5377-
def read_symbol(data: ReadBuffer) -> SymbolNode:
5378-
tag = read_tag(data)
5385+
def read_symbol(data: ReadBuffer, tag: Tag) -> SymbolNode:
53795386
# The branches here are ordered manually by type "popularity".
53805387
if tag == VAR:
53815388
return Var.read(data)
53825389
if tag == FUNC_DEF:
53835390
return FuncDef.read(data)
53845391
if tag == DECORATOR:
53855392
return Decorator.read(data)
5386-
if tag == TYPE_INFO:
5387-
return TypeInfo.read(data)
53885393
if tag == OVERLOADED_FUNC_DEF:
53895394
return OverloadedFuncDef.read(data)
53905395
if tag == TYPE_VAR_EXPR:

mypy/typeshed/stubs/librt/librt/internal.pyi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,4 @@ def read_int(data: ReadBuffer, /) -> int: ...
1919
def write_tag(data: WriteBuffer, value: u8, /) -> None: ...
2020
def read_tag(data: ReadBuffer, /) -> u8: ...
2121
def cache_version() -> u8: ...
22+
def extract_symbol(data: ReadBuffer, /) -> bytes: ...

mypyc/lib-rt/internal/librt_internal.c

Lines changed: 272 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -804,7 +804,7 @@ _write_long_int(PyObject *data, CPyTagged value) {
804804
}
805805

806806
// Write absolute integer value as byte array in a variable-length little endian format.
807-
int i;
807+
Py_ssize_t i;
808808
for (i = len; i > 1; i -= 2) {
809809
if (write_tag_internal(
810810
data, hex_to_int(str[i - 1]) | (hex_to_int(str[i - 2]) << 4)) == CPY_NONE_ERROR)
@@ -920,6 +920,275 @@ write_tag(PyObject *self, PyObject *const *args, size_t nargs) {
920920
return Py_None;
921921
}
922922

923+
// All tags must be kept in sync with cache.py, nodes.py, and types.py.
924+
// Primitive types.
925+
#define LITERAL_FALSE 0
926+
#define LITERAL_TRUE 1
927+
#define LITERAL_NONE 2
928+
#define LITERAL_INT 3
929+
#define LITERAL_STR 4
930+
#define LITERAL_BYTES 5
931+
#define LITERAL_FLOAT 6
932+
#define LITERAL_COMPLEX 7
933+
934+
// Supported builtin collections.
935+
#define LIST_GEN 20
936+
#define LIST_INT 21
937+
#define LIST_STR 22
938+
#define LIST_BYTES 23
939+
#define TUPLE_GEN 24
940+
#define DICT_STR_GEN 30
941+
942+
// This is the smallest custom class tag.
943+
#define MYPY_FILE 50
944+
945+
// Instance class has special formats.
946+
#define INSTANCE 80
947+
#define INSTANCE_SIMPLE 81
948+
#define INSTANCE_GENERIC 82
949+
#define INSTANCE_STR 83
950+
#define INSTANCE_FUNCTION 84
951+
#define INSTANCE_INT 85
952+
#define INSTANCE_BOOL 86
953+
#define INSTANCE_OBJECT 87
954+
955+
#define RESERVED 254
956+
#define END_TAG 255
957+
958+
// Forward declaration.
959+
static char _skip_object(PyObject *data, uint8_t tag);
960+
961+
static inline char
962+
_skip(PyObject *data, Py_ssize_t size) {
963+
// We are careful about error conditions, so all
964+
// _skip_xxx() functions can return an error value.
965+
_CHECK_READ(data, size, CPY_NONE_ERROR)
966+
((ReadBufferObject *)data)->ptr += size;
967+
return CPY_NONE;
968+
}
969+
970+
static inline char
971+
_skip_short_int(PyObject *data, uint8_t first) {
972+
if ((first & TWO_BYTES_INT_BIT) == 0)
973+
return CPY_NONE;
974+
if ((first & FOUR_BYTES_INT_BIT) == 0)
975+
return _skip(data, 1);
976+
return _skip(data, 3);
977+
}
978+
979+
static inline char
980+
_skip_int(PyObject *data) {
981+
_CHECK_READ(data, 1, CPY_NONE_ERROR)
982+
983+
uint8_t first;
984+
_READ(&first, data, uint8_t);
985+
if (likely(first != LONG_INT_TRAILER)) {
986+
return _skip_short_int(data, first);
987+
}
988+
989+
_CHECK_READ(data, 1, CPY_NONE_ERROR)
990+
_READ(&first, data, uint8_t);
991+
Py_ssize_t size_and_sign = _read_short_int(data, first);
992+
if (size_and_sign == CPY_INT_TAG)
993+
return CPY_NONE_ERROR;
994+
if ((Py_ssize_t)size_and_sign < 0) {
995+
PyErr_SetString(PyExc_ValueError, "invalid int data");
996+
return CPY_NONE_ERROR;
997+
}
998+
Py_ssize_t size = size_and_sign >> 2;
999+
return _skip(data, size);
1000+
}
1001+
1002+
// This is essentially a wrapper around _read_short_int() that makes
1003+
// sure the result is valid.
1004+
static inline Py_ssize_t
1005+
_read_size(PyObject *data) {
1006+
_CHECK_READ(data, 1, -1)
1007+
uint8_t first;
1008+
_READ(&first, data, uint8_t);
1009+
// We actually allow serializing lists/dicts with over 4 billion items,
1010+
// but we don't really need to, fail with ValueError just in case.
1011+
if (unlikely(first == LONG_INT_TRAILER)) {
1012+
PyErr_SetString(PyExc_ValueError, "unsupported size");
1013+
return -1;
1014+
}
1015+
CPyTagged tagged_size = _read_short_int(data, first);
1016+
if (tagged_size == CPY_INT_TAG)
1017+
return -1;
1018+
if ((Py_ssize_t)tagged_size < 0) {
1019+
PyErr_SetString(PyExc_ValueError, "invalid size");
1020+
return -1;
1021+
}
1022+
Py_ssize_t size = tagged_size >> 1;
1023+
return size;
1024+
}
1025+
1026+
static inline char
1027+
_skip_str_bytes(PyObject *data) {
1028+
Py_ssize_t size = _read_size(data);
1029+
if (size < 0)
1030+
return CPY_NONE_ERROR;
1031+
return _skip(data, size);
1032+
}
1033+
1034+
// List/dict logic should be kept in sync with mypy/cache.py
1035+
static inline char
1036+
_skip_list_gen(PyObject *data) {
1037+
Py_ssize_t size = _read_size(data);
1038+
if (size < 0)
1039+
return CPY_NONE_ERROR;
1040+
Py_ssize_t i;
1041+
for (i = 0; i < size; i++) {
1042+
uint8_t tag = read_tag_internal(data);
1043+
if (unlikely(tag == CPY_LL_UINT_ERROR && PyErr_Occurred())) {
1044+
return CPY_NONE_ERROR;
1045+
}
1046+
if (unlikely(_skip_object(data, tag) == CPY_NONE_ERROR))
1047+
return CPY_NONE_ERROR;
1048+
}
1049+
return CPY_NONE;
1050+
}
1051+
1052+
static inline char
1053+
_skip_list_int(PyObject *data) {
1054+
Py_ssize_t size = _read_size(data);
1055+
if (size < 0)
1056+
return CPY_NONE_ERROR;
1057+
Py_ssize_t i;
1058+
for (i = 0; i < size; i++) {
1059+
if (unlikely(_skip_int(data) == CPY_NONE_ERROR))
1060+
return CPY_NONE_ERROR;
1061+
}
1062+
return CPY_NONE;
1063+
}
1064+
1065+
static inline char
1066+
_skip_list_str_bytes(PyObject *data) {
1067+
Py_ssize_t size = _read_size(data);
1068+
if (size < 0)
1069+
return CPY_NONE_ERROR;
1070+
Py_ssize_t i;
1071+
for (i = 0; i < size; i++) {
1072+
if (unlikely(_skip_str_bytes(data) == CPY_NONE_ERROR))
1073+
return CPY_NONE_ERROR;
1074+
}
1075+
return CPY_NONE;
1076+
}
1077+
1078+
static inline char
1079+
_skip_dict_str_gen(PyObject *data) {
1080+
Py_ssize_t size = _read_size(data);
1081+
if (size < 0)
1082+
return CPY_NONE_ERROR;
1083+
Py_ssize_t i;
1084+
for (i = 0; i < size; i++) {
1085+
// Bare key followed by tagged value.
1086+
if (unlikely(_skip_str_bytes(data) == CPY_NONE_ERROR))
1087+
return CPY_NONE_ERROR;
1088+
uint8_t tag = read_tag_internal(data);
1089+
if (unlikely(tag == CPY_LL_UINT_ERROR && PyErr_Occurred())) {
1090+
return CPY_NONE_ERROR;
1091+
}
1092+
if (unlikely(_skip_object(data, tag) == CPY_NONE_ERROR))
1093+
return CPY_NONE_ERROR;
1094+
}
1095+
return CPY_NONE;
1096+
}
1097+
1098+
// Similar to mypy/cache.py, the convention is that the caller reads
1099+
// the opening tag for custom classes.
1100+
static inline char
1101+
_skip_class(PyObject *data) {
1102+
while (1) {
1103+
uint8_t tag = read_tag_internal(data);
1104+
if (unlikely(tag == CPY_LL_UINT_ERROR && PyErr_Occurred())) {
1105+
return CPY_NONE_ERROR;
1106+
}
1107+
if (tag == END_TAG) {
1108+
return CPY_NONE;
1109+
}
1110+
if (unlikely(_skip_object(data, tag) == CPY_NONE_ERROR)) {
1111+
return CPY_NONE_ERROR;
1112+
}
1113+
}
1114+
}
1115+
1116+
// Instance has special compact layout (as an important optimization).
1117+
static inline char
1118+
_skip_instance(PyObject *data) {
1119+
uint8_t second_tag = read_tag_internal(data);
1120+
if (unlikely(second_tag == CPY_LL_UINT_ERROR && PyErr_Occurred())) {
1121+
return CPY_NONE_ERROR;
1122+
}
1123+
if (second_tag >= INSTANCE_STR && second_tag <= INSTANCE_OBJECT) {
1124+
return CPY_NONE;
1125+
}
1126+
if (second_tag == INSTANCE_SIMPLE) {
1127+
return _skip_str_bytes(data);
1128+
}
1129+
if (second_tag == INSTANCE_GENERIC) {
1130+
return _skip_class(data);
1131+
}
1132+
PyErr_Format(PyExc_ValueError, "Unexpected instance tag: %d", second_tag);
1133+
return CPY_NONE_ERROR;
1134+
}
1135+
1136+
// This is the main dispatch point. Branches are ordered manually
1137+
// based roughly on frequency in self-check.
1138+
static char
1139+
_skip_object(PyObject *data, uint8_t tag) {
1140+
if (tag == LITERAL_STR || tag == LITERAL_BYTES)
1141+
return _skip_str_bytes(data);
1142+
if (tag == LITERAL_NONE || tag == LITERAL_FALSE || tag == LITERAL_TRUE)
1143+
return CPY_NONE;
1144+
if (tag == LIST_GEN || tag == TUPLE_GEN)
1145+
return _skip_list_gen(data);
1146+
if (tag == LITERAL_INT)
1147+
return _skip_int(data);
1148+
if (tag == INSTANCE)
1149+
return _skip_instance(data);
1150+
// We intentionally exclude MypyFile as a sanity check. Module symbols should
1151+
// be always handled via cross_ref, and never appear in a symbol table as is.
1152+
if (tag > MYPY_FILE && tag < RESERVED)
1153+
return _skip_class(data);
1154+
if (tag == LIST_INT)
1155+
return _skip_list_int(data);
1156+
if (tag == LIST_STR || tag == LIST_BYTES)
1157+
return _skip_list_str_bytes(data);
1158+
if (tag == DICT_STR_GEN)
1159+
return _skip_dict_str_gen(data);
1160+
if (tag == LITERAL_FLOAT)
1161+
return _skip(data, 8);
1162+
if (tag == LITERAL_COMPLEX)
1163+
return _skip(data, 16);
1164+
PyErr_Format(PyExc_ValueError, "Unsupported tag: %d", tag);
1165+
return CPY_NONE_ERROR;
1166+
}
1167+
1168+
// Return a new bytes object holding the raw serialized body of the symbol at
// the current read position, and advance `data` past it. The span is found by
// skipping one class body, so the caller is expected to have consumed the
// opening tag already.
// Returns NULL on failure (skip error or bytes allocation failure).
static PyObject*
extract_symbol_internal(PyObject *data) {
    ReadBufferObject *buf = (ReadBufferObject *)data;
    char *start = buf->ptr;

    if (unlikely(_skip_class(data) == CPY_NONE_ERROR))
        return NULL;

    // buf->ptr now sits just past the symbol; copy [start, buf->ptr).
    return PyBytes_FromStringAndSize(start, buf->ptr - start);
}
1179+
1180+
// Python-level entry point (METH_FASTCALL): extract_symbol(data) -> bytes.
// Validates the argument count and that the argument passes
// _CHECK_READ_BUFFER, then delegates to extract_symbol_internal().
// Returns NULL on failure; a wrong argument count raises TypeError.
static PyObject*
extract_symbol(PyObject *self, PyObject *const *args, size_t nargs) {
    if (likely(nargs == 1)) {
        PyObject *data = args[0];
        _CHECK_READ_BUFFER(data, NULL)
        return extract_symbol_internal(data);
    }
    PyErr_Format(PyExc_TypeError,
                 "extract_symbol() takes exactly 1 argument (%zu given)", nargs);
    return NULL;
}
1191+
9231192
static uint8_t
9241193
cache_version_internal(void) {
9251194
return 0;
@@ -954,6 +1223,7 @@ static PyMethodDef librt_internal_module_methods[] = {
9541223
{"write_tag", (PyCFunction)write_tag, METH_FASTCALL, PyDoc_STR("write a short int")},
9551224
{"read_tag", (PyCFunction)read_tag, METH_FASTCALL, PyDoc_STR("read a short int")},
9561225
{"cache_version", (PyCFunction)cache_version, METH_NOARGS, PyDoc_STR("cache format version")},
1226+
{"extract_symbol", (PyCFunction)extract_symbol, METH_FASTCALL, PyDoc_STR("extract bytes for a mypy symbol")},
9571227
{NULL, NULL, 0, NULL}
9581228
};
9591229

@@ -1005,6 +1275,7 @@ librt_internal_module_exec(PyObject *m)
10051275
(void *)ReadBuffer_type_internal,
10061276
(void *)WriteBuffer_type_internal,
10071277
(void *)NativeInternal_API_Version,
1278+
(void *)extract_symbol_internal
10081279
};
10091280
PyObject *c_api_object = PyCapsule_New((void *)NativeInternal_API, "librt.internal._C_API", NULL);
10101281
if (PyModule_Add(m, "_C_API", c_api_object) < 0) {

0 commit comments

Comments
 (0)