diff --git a/Doc/c-api/perfmaps.rst b/Doc/c-api/perfmaps.rst index 76a1e9f528dc70..bd05e628faaaa1 100644 --- a/Doc/c-api/perfmaps.rst +++ b/Doc/c-api/perfmaps.rst @@ -31,7 +31,7 @@ Note that holding an :term:`attached thread state` is not required for these API or ``-2`` on failure to create a lock. Check ``errno`` for more information about the cause of a failure. -.. c:function:: int PyUnstable_WritePerfMapEntry(const void *code_addr, unsigned int code_size, const char *entry_name) +.. c:function:: int PyUnstable_WritePerfMapEntry(const void *code_addr, size_t code_size, const char *entry_name) Write one single entry to the ``/tmp/perf-$pid.map`` file. This function is thread safe. Here is what an example entry looks like:: diff --git a/Include/cpython/ceval.h b/Include/cpython/ceval.h index bbab8d35b75cb2..5b66fa1040d738 100644 --- a/Include/cpython/ceval.h +++ b/Include/cpython/ceval.h @@ -38,7 +38,7 @@ typedef struct { PyAPI_FUNC(int) PyUnstable_PerfMapState_Init(void); PyAPI_FUNC(int) PyUnstable_WritePerfMapEntry( const void *code_addr, - unsigned int code_size, + size_t code_size, const char *entry_name); PyAPI_FUNC(void) PyUnstable_PerfMapState_Fini(void); PyAPI_FUNC(int) PyUnstable_CopyPerfMapFile(const char* parent_filename); diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h index 9fd3be74404907..fd98db16650507 100644 --- a/Include/internal/pycore_ceval.h +++ b/Include/internal/pycore_ceval.h @@ -94,7 +94,7 @@ typedef struct { void* (*init_state)(void); // Callback to register every trampoline being created void (*write_state)(void* state, const void *code_addr, - unsigned int code_size, PyCodeObject* code); + size_t code_size, PyCodeObject* code); // Callback to free the trampoline state int (*free_state)(void* state); } _PyPerf_Callbacks; @@ -108,6 +108,10 @@ extern PyStatus _PyPerfTrampoline_AfterFork_Child(void); #ifdef PY_HAVE_PERF_TRAMPOLINE extern _PyPerf_Callbacks _Py_perfmap_callbacks; extern _PyPerf_Callbacks _Py_perfmap_jit_callbacks; +extern void _PyPerfJit_WriteNamedCode(const void *code_addr, + size_t code_size, + const char *entry, + const char *filename); #endif static inline PyObject* diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index f76d4f41c55119..a1304dbb298c8c 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -69,7 +69,7 @@ struct code_arena_st; struct trampoline_api_st { void* (*init_state)(void); void (*write_state)(void* state, const void *code_addr, - unsigned int code_size, PyCodeObject* code); + size_t code_size, PyCodeObject* code); int (*free_state)(void* state); void *state; Py_ssize_t code_padding; diff --git a/Include/internal/pycore_jit_unwind.h b/Include/internal/pycore_jit_unwind.h new file mode 100644 index 00000000000000..99f3ac585bf650 --- /dev/null +++ b/Include/internal/pycore_jit_unwind.h @@ -0,0 +1,90 @@ +#ifndef Py_INTERNAL_JIT_UNWIND_H +#define Py_INTERNAL_JIT_UNWIND_H + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include +#include + +/* + * Compiler-emitted CFI for the shim region (GDB path only). + * + * Captured at build time by Tools/jit from the shim's compiled .eh_frame + * so the runtime CIE/FDE can describe whatever prologue the compiler + * chose, without hand-rolling DWARF. Executors pass NULL and fall back + * to the invariant-based steady-state rule that the CIE emits by hand. + * + * The struct is defined unconditionally so jit_record_code() in Python/jit.c + * has a valid pointer type on every platform — callers on non-(Linux+ELF) + * always pass NULL, matching jit_record_code()'s internal #ifdef. + */ +typedef struct { + const uint8_t *cie_init_cfi; + size_t cie_init_cfi_size; + const uint8_t *fde_cfi; + size_t fde_cfi_size; + uint32_t code_align; + int32_t data_align; + uint32_t ra_column; +} _PyJitUnwind_ShimCfi; + +#if defined(PY_HAVE_PERF_TRAMPOLINE) || (defined(__linux__) && defined(__ELF__)) + +/* DWARF exception-handling pointer encodings shared by JIT unwind users. */ +enum { + DWRF_EH_PE_absptr = 0x00, + DWRF_EH_PE_omit = 0xff, + + /* Data type encodings */ + DWRF_EH_PE_uleb128 = 0x01, + DWRF_EH_PE_udata2 = 0x02, + DWRF_EH_PE_udata4 = 0x03, + DWRF_EH_PE_udata8 = 0x04, + DWRF_EH_PE_sleb128 = 0x09, + DWRF_EH_PE_sdata2 = 0x0a, + DWRF_EH_PE_sdata4 = 0x0b, + DWRF_EH_PE_sdata8 = 0x0c, + DWRF_EH_PE_signed = 0x08, + + /* Reference type encodings */ + DWRF_EH_PE_pcrel = 0x10, + DWRF_EH_PE_textrel = 0x20, + DWRF_EH_PE_datarel = 0x30, + DWRF_EH_PE_funcrel = 0x40, + DWRF_EH_PE_aligned = 0x50, + DWRF_EH_PE_indirect = 0x80 +}; + +/* Return the size of the generated .eh_frame data for the given encoding. */ +size_t _PyJitUnwind_EhFrameSize(int absolute_addr, + const _PyJitUnwind_ShimCfi *shim_cfi); + +/* + * Build DWARF .eh_frame data for JIT code; returns size written or 0 on error. + * absolute_addr selects the FDE address encoding: + * - 0: PC-relative offsets (perf jitdump synthesized DSO). + * - nonzero: absolute addresses (GDB JIT in-memory ELF). + * + * shim_cfi selects which JIT region the CFI describes (GDB path only): + * - NULL: executor trace; steady-state rule in the CIE applies at every PC. + * - non-NULL: compile_shim() output; the captured CIE/FDE CFI bytes are + * spliced in so unwinding is valid at every PC in the shim. + */ +size_t _PyJitUnwind_BuildEhFrame(uint8_t *buffer, size_t buffer_size, + const void *code_addr, size_t code_size, + int absolute_addr, + const _PyJitUnwind_ShimCfi *shim_cfi); + +void *_PyJitUnwind_GdbRegisterCode(const void *code_addr, + size_t code_size, + const char *entry, + const char *filename, + const _PyJitUnwind_ShimCfi *shim_cfi); + +void _PyJitUnwind_GdbUnregisterCode(void *handle); + +#endif // defined(PY_HAVE_PERF_TRAMPOLINE) || (defined(__linux__) && defined(__ELF__)) + +#endif // Py_INTERNAL_JIT_UNWIND_H diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index 2986afb142b5d1..df6467e41cc208 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -151,6 +151,7 @@ typedef struct _PyExecutorObject { uint32_t code_size; size_t jit_size; void *jit_code; + void *jit_gdb_handle; _PyExitData exits[1]; } _PyExecutorObject; diff --git a/Lib/test/test_gdb/gdb_jit_sample.py b/Lib/test/test_gdb/gdb_jit_sample.py new file mode 100644 index 00000000000000..b439e82e8b312f --- /dev/null +++ b/Lib/test/test_gdb/gdb_jit_sample.py @@ -0,0 +1,27 @@ +# Sample script for use by test_gdb.test_jit + +import _testinternalcapi +import operator + + +WARMUP_ITERATIONS = _testinternalcapi.TIER2_THRESHOLD + 10 + + +def jit_bt_hot(depth, warming_up_caller=False): + if depth == 0: + if not warming_up_caller: + id(42) + return + + for iteration in range(WARMUP_ITERATIONS): + operator.call( + jit_bt_hot, + depth - 1, + warming_up_caller or iteration + 1 != WARMUP_ITERATIONS, + ) + + +# Warm the shared shim once without hitting builtin_id so the real run uses +# the steady-state shim path when GDB breaks inside id(42). +jit_bt_hot(1, warming_up_caller=True) +jit_bt_hot(1) diff --git a/Lib/test/test_gdb/test_jit.py b/Lib/test/test_gdb/test_jit.py new file mode 100644 index 00000000000000..39e09f6c8b4a9f --- /dev/null +++ b/Lib/test/test_gdb/test_jit.py @@ -0,0 +1,204 @@ +import os +import platform +import re +import sys +import unittest + +from .util import setup_module, DebuggerTests + + +JIT_SAMPLE_SCRIPT = os.path.join(os.path.dirname(__file__), "gdb_jit_sample.py") +# In batch GDB, break in builtin_id() while it is running under JIT, +# then repeatedly "finish" until the selected frame is the JIT entry. +# That gives a deterministic backtrace starting with py::jit_entry:. +# +# builtin_id() sits only a few helper frames above the JIT entry on this path. +# This bound is just a generous upper limit so the test fails clearly if the +# expected stack shape changes. +MAX_FINISH_STEPS = 20 +# Break directly on the lazy shim entry in the binary, then single-step just +# enough to let it install the compiled JIT entry and set a temporary +# breakpoint on the resulting address. +MAX_ENTRY_SETUP_STEPS = 20 +# After landing on the JIT entry frame, single-step a bounded number of +# instructions further into the blob so the backtrace is taken from JIT code +# itself rather than the immediate helper-return site. The exact number of +# steps is not significant: each step is cross-checked against the selected +# frame's symbol so the test fails loudly if stepping escapes the registered +# JIT region, instead of asserting against a misleading backtrace. +MAX_JIT_ENTRY_STEPS = 4 +EVAL_FRAME_RE = r"(_PyEval_EvalFrameDefault|_PyEval_Vector)" +BACKTRACE_FRAME_RE = re.compile(r"^#\d+\s+.*$", re.MULTILINE) + +FINISH_TO_JIT_ENTRY = ( + "python exec(\"import gdb\\n" + "target = 'py::jit_entry:'\\n" + f"for _ in range({MAX_FINISH_STEPS}):\\n" + " frame = gdb.selected_frame()\\n" + " if frame is not None and frame.name() == target:\\n" + " break\\n" + " gdb.execute('finish')\\n" + "else:\\n" + " raise RuntimeError('did not reach %s' % target)\\n\")" +) +BREAK_IN_COMPILED_JIT_ENTRY = ( + "python exec(\"import gdb\\n" + "lazy = int(gdb.parse_and_eval('(void*)_Py_LazyJitShim'))\\n" + f"for _ in range({MAX_ENTRY_SETUP_STEPS}):\\n" + " entry = int(gdb.parse_and_eval('(void*)_Py_jit_entry'))\\n" + " if entry != lazy:\\n" + " gdb.execute('tbreak *0x%x' % entry)\\n" + " break\\n" + " gdb.execute('next')\\n" + "else:\\n" + " raise RuntimeError('compiled JIT entry was not installed')\\n\")" +) +STEP_INSIDE_JIT_ENTRY = ( + "python exec(\"import gdb\\n" + "target = 'py::jit_entry:'\\n" + f"for _ in range({MAX_JIT_ENTRY_STEPS}):\\n" + " frame = gdb.selected_frame()\\n" + " if frame is None or frame.name() != target:\\n" + " raise RuntimeError('left JIT region during stepping: '\\n" + " + repr(frame and frame.name()))\\n" + " gdb.execute('si')\\n" + "frame = gdb.selected_frame()\\n" + "if frame is None or frame.name() != target:\\n" + " raise RuntimeError('stepped out of JIT region after si')\\n\")" +) + + +def setUpModule(): + setup_module() + + +# The GDB JIT interface registration is gated on __linux__ && __ELF__ in +# Python/jit_unwind.c, and the synthetic EH-frame is only implemented for +# x86_64 and AArch64 (a #error fires otherwise). Skip cleanly on other +# platforms or architectures instead of producing timeouts / empty backtraces. +# is_enabled() implies is_available() and also implies that the runtime has +# JIT execution active; interpreter-only tier 2 builds don't hit this path. +@unittest.skipUnless(sys.platform == "linux", + "GDB JIT interface is only implemented for Linux + ELF") +@unittest.skipUnless(platform.machine() in ("x86_64", "aarch64"), + "GDB JIT CFI emitter only supports x86_64 and AArch64") +@unittest.skipUnless(hasattr(sys, "_jit") and sys._jit.is_enabled(), + "requires a JIT-enabled build with JIT execution active") +class JitBacktraceTests(DebuggerTests): + def get_stack_trace(self, **kwargs): + # These tests validate the JIT-relevant part of the backtrace via + # _assert_jit_backtrace_shape, so an unrelated "?? ()" frame below + # the JIT/eval segment (e.g. libc without debug info) is tolerable. + kwargs.setdefault("skip_on_truncation", False) + return super().get_stack_trace(**kwargs) + + def _extract_backtrace_frames(self, gdb_output): + frames = BACKTRACE_FRAME_RE.findall(gdb_output) + self.assertGreater( + len(frames), 0, + f"expected at least one GDB backtrace frame in output:\n{gdb_output}", + ) + return frames + + def _assert_jit_backtrace_shape(self, gdb_output, *, anchor_at_top): + # Shape assertions applied to every JIT backtrace we produce: + # 1. The synthetic JIT symbol appears exactly once. A second + # py::jit_entry: frame would mean the unwinder is + # materializing two native frames for a single logical JIT + # region, or failing to unwind out of the region entirely. + # 2. At least one _PyEval_EvalFrameDefault / _PyEval_Vector + # frame appears after the JIT frame, proving the unwinder + # climbs back out of the JIT region into the eval loop. + # Helper frames from inside the JITted region may still + # appear above the synthetic JIT frame in the backtrace. + # 4. For tests that assert a specific entry PC, the JIT frame + # is also at #0. + frames = self._extract_backtrace_frames(gdb_output) + backtrace = "\n".join(frames) + + jit_frames = [frame for frame in frames if "py::jit_entry:" in frame] + jit_count = len(jit_frames) + self.assertEqual( + jit_count, 1, + f"expected exactly 1 py::jit_entry: frame, got {jit_count}\n" + f"backtrace:\n{backtrace}", + ) + eval_frames = [frame for frame in frames if re.search(EVAL_FRAME_RE, frame)] + eval_count = len(eval_frames) + self.assertGreaterEqual( + eval_count, 1, + f"expected at least one _PyEval_* frame, got {eval_count}\n" + f"backtrace:\n{backtrace}", + ) + jit_frame_index = next( + i for i, frame in enumerate(frames) if "py::jit_entry:" in frame + ) + eval_after_jit = any( + re.search(EVAL_FRAME_RE, frame) + for frame in frames[jit_frame_index + 1:] + ) + self.assertTrue( + eval_after_jit, + f"expected an eval frame after the JIT frame\n" + f"backtrace:\n{backtrace}", + ) + relevant_end = max( + i + for i, frame in enumerate(frames) + if "py::jit_entry:" in frame or re.search(EVAL_FRAME_RE, frame) + ) + truncated_frames = [ + frame for frame in frames[: relevant_end + 1] + if " ?? ()" in frame + ] + self.assertFalse( + truncated_frames, + "unexpected truncated frame before the validated JIT/eval segment\n" + f"backtrace:\n{backtrace}", + ) + if anchor_at_top: + self.assertRegex( + frames[0], + re.compile(r"^#0\s+py::jit_entry:"), + ) + + def test_bt_shows_compiled_jit_entry(self): + gdb_output = self.get_stack_trace( + script=JIT_SAMPLE_SCRIPT, + breakpoint="_Py_LazyJitShim", + cmds_after_breakpoint=[ + BREAK_IN_COMPILED_JIT_ENTRY, + "continue", + "bt", + ], + PYTHON_JIT="1", + ) + # GDB registers the compiled JIT entry and per-trace JIT regions under + # the same synthetic symbol name; breaking at the entry PC pins the + # JIT frame at #0. + self._assert_jit_backtrace_shape(gdb_output, anchor_at_top=True) + + def test_bt_unwinds_through_jit_frames(self): + gdb_output = self.get_stack_trace( + script=JIT_SAMPLE_SCRIPT, + cmds_after_breakpoint=["bt"], + PYTHON_JIT="1", + ) + # The executor should appear as a named JIT frame and unwind back into + # the eval loop. Whether GDB also materializes a separate shim frame is + # an implementation detail of the synthetic executor CFI. + self._assert_jit_backtrace_shape(gdb_output, anchor_at_top=False) + + def test_bt_unwinds_from_inside_jit_entry(self): + gdb_output = self.get_stack_trace( + script=JIT_SAMPLE_SCRIPT, + cmds_after_breakpoint=[ + FINISH_TO_JIT_ENTRY, + STEP_INSIDE_JIT_ENTRY, + "bt", + ], + PYTHON_JIT="1", + ) + # Once the selected PC is inside the JIT entry, we require that GDB + # identifies the JIT frame at #0 and keeps unwinding into _PyEval_*. + self._assert_jit_backtrace_shape(gdb_output, anchor_at_top=True) diff --git a/Lib/test/test_gdb/util.py b/Lib/test/test_gdb/util.py index 8097fd52ababe6..d903adcf2903f3 100644 --- a/Lib/test/test_gdb/util.py +++ b/Lib/test/test_gdb/util.py @@ -20,6 +20,27 @@ PYTHONHASHSEED = '123' +# gh-91960, bpo-40019: gdb reports these when the optimizer has dropped +# python-frame debug info; the test can't read what's not there. +_OPTIMIZED_OUT_PATTERNS = ( + '(frame information optimized out)', + 'Unable to read information on python frame', + '(unable to read python frame information)', +) +# gdb prints this when the unwinder genuinely failed to walk a frame — +# i.e. the CFI (ours or a library's) is wrong. Treat as a hard failure, +# not a skip, so regressions in our own unwind info don't hide. +_UNWIND_FAILURE_PATTERNS = ( + 'Backtrace stopped: frame did not save the PC', +) +# gh-104736: " ?? ()" in the bt usually means the unwinder bailed early, +# but can also be unrelated frames without debug info (e.g. libc). Tests +# that validate the JIT-relevant part of the backtrace themselves can +# opt out via skip_on_truncation=False. +_TRUNCATED_BACKTRACE_PATTERNS = ( + ' ?? ()', +) + def clean_environment(): # Remove PYTHON* environment variables such as PYTHONHOME @@ -160,7 +181,9 @@ def get_stack_trace(self, source=None, script=None, breakpoint=BREAKPOINT_FN, cmds_after_breakpoint=None, import_site=False, - ignore_stderr=False): + ignore_stderr=False, + skip_on_truncation=True, + **env_vars): ''' Run 'python -c SOURCE' under gdb with a breakpoint. @@ -239,7 +262,7 @@ def get_stack_trace(self, source=None, script=None, args += [script] # Use "args" to invoke gdb, capturing stdout, stderr: - out, err = run_gdb(*args, PYTHONHASHSEED=PYTHONHASHSEED) + out, err = run_gdb(*args, PYTHONHASHSEED=PYTHONHASHSEED, **env_vars) if not ignore_stderr: for line in err.splitlines(): @@ -255,26 +278,20 @@ def get_stack_trace(self, source=None, script=None, " because the Program Counter is" " not present") + for pattern in _UNWIND_FAILURE_PATTERNS: + if pattern in out: + raise AssertionError( + f"gdb unwinder failed ({pattern!r}) — CFI bug in our " + f"generated code or in a linked library.\n" + f"Full gdb output:\n{out}" + ) + # bpo-40019: Skip the test if gdb failed to read debug information # because the Python binary is optimized. - for pattern in ( - '(frame information optimized out)', - 'Unable to read information on python frame', - - # gh-91960: On Python built with "clang -Og", gdb gets - # "frame=" for _PyEval_EvalFrameDefault() parameter - '(unable to read python frame information)', - - # gh-104736: On Python built with "clang -Og" on ppc64le, - # "py-bt" displays a truncated or not traceback, but "where" - # logs this error message: - 'Backtrace stopped: frame did not save the PC', - - # gh-104736: When "bt" command displays something like: - # "#1 0x0000000000000000 in ?? ()", the traceback is likely - # truncated or wrong. - ' ?? ()', - ): + patterns = _OPTIMIZED_OUT_PATTERNS + if skip_on_truncation: + patterns = patterns + _TRUNCATED_BACKTRACE_PATTERNS + for pattern in patterns: if pattern in out: raise unittest.SkipTest(f"{pattern!r} found in gdb output") diff --git a/Lib/test/test_perfmaps.py b/Lib/test/test_perfmaps.py index 647c32656abd6d..ee4eb50033c470 100644 --- a/Lib/test/test_perfmaps.py +++ b/Lib/test/test_perfmaps.py @@ -1,4 +1,5 @@ import os +import sys import sysconfig import unittest @@ -17,6 +18,9 @@ def supports_trampoline_profiling(): raise unittest.SkipTest("perf trampoline profiling not supported") class TestPerfMapWriting(unittest.TestCase): + def tearDown(self): + perf_map_state_teardown() + def test_write_perf_map_entry(self): self.assertEqual(write_perf_map_entry(0x1234, 5678, "entry1"), 0) self.assertEqual(write_perf_map_entry(0x2345, 6789, "entry2"), 0) @@ -24,4 +28,15 @@ def test_write_perf_map_entry(self): perf_file_contents = f.read() self.assertIn("1234 162e entry1", perf_file_contents) self.assertIn("2345 1a85 entry2", perf_file_contents) - perf_map_state_teardown() + + @unittest.skipIf(sys.maxsize <= 2**32, "requires size_t wider than unsigned int") + def test_write_perf_map_entry_large_size(self): + code_addr = 0x3456 + code_size = 1 << 33 + entry_name = "entry_big" + + self.assertEqual(write_perf_map_entry(code_addr, code_size, entry_name), 0) + with open(f"/tmp/perf-{os.getpid()}.map") as f: + perf_file_contents = f.read() + self.assertIn(f"{code_addr:x} {code_size:x} {entry_name}", + perf_file_contents) diff --git a/Makefile.pre.in b/Makefile.pre.in index 354580aa482d25..b7342da861fb68 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -510,6 +510,7 @@ PYTHON_OBJS= \ Python/suggestions.o \ Python/perf_trampoline.o \ Python/perf_jit_trampoline.o \ + Python/jit_unwind.o \ Python/remote_debugging.o \ Python/$(DYNLOADFILE) \ $(LIBOBJS) \ diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-03-17-20-30-17.gh-issue-126910.NaUwmD.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-03-17-20-30-17.gh-issue-126910.NaUwmD.rst new file mode 100644 index 00000000000000..57e897cabc494b --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-03-17-20-30-17.gh-issue-126910.NaUwmD.rst @@ -0,0 +1 @@ +Add support for unwinding JIT frames using GDB. Patch by Diego Russo diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index 7f6ea621f87145..7a37a526ffb53e 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -1210,16 +1210,22 @@ write_perf_map_entry(PyObject *self, PyObject *args) { PyObject *code_addr_v; const void *code_addr; - unsigned int code_size; + PyObject *code_size_s; + size_t code_size; const char *entry_name; - if (!PyArg_ParseTuple(args, "OIs", &code_addr_v, &code_size, &entry_name)) + if (!PyArg_ParseTuple(args, "OOs", &code_addr_v, &code_size_s, &entry_name)) return NULL; code_addr = PyLong_AsVoidPtr(code_addr_v); if (code_addr == NULL) { return NULL; } + code_size = PyLong_AsSize_t(code_size_s); + if (code_size == (size_t)-1 && PyErr_Occurred()) { + return NULL; + } + int ret = PyUnstable_WritePerfMapEntry(code_addr, code_size, entry_name); if (ret < 0) { PyErr_SetFromErrno(PyExc_OSError); diff --git a/Python/jit.c b/Python/jit.c index 4990c743224d3c..fd0a53673d21da 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -15,6 +15,7 @@ #include "pycore_interpframe.h" #include "pycore_interpolation.h" #include "pycore_intrinsics.h" +#include "pycore_jit_unwind.h" #include "pycore_lazyimportobject.h" #include "pycore_list.h" #include "pycore_long.h" @@ -60,7 +61,36 @@ jit_error(const char *message) PyErr_Format(PyExc_RuntimeWarning, "JIT %s (%d)", message, hint); } +static void * +jit_record_code(const void *code_addr, size_t code_size, + const char *entry, const char *filename, + const _PyJitUnwind_ShimCfi *shim_cfi) +{ +#ifdef PY_HAVE_PERF_TRAMPOLINE + _PyPerf_Callbacks callbacks; + _PyPerfTrampoline_GetCallbacks(&callbacks); + if (callbacks.write_state == _Py_perfmap_jit_callbacks.write_state) { + _PyPerfJit_WriteNamedCode( + code_addr, code_size, entry, filename); + return NULL; + } +#endif + +#if defined(__linux__) && defined(__ELF__) + return _PyJitUnwind_GdbRegisterCode( + code_addr, code_size, entry, filename, shim_cfi); +#else + (void)code_addr; + (void)code_size; + (void)entry; + (void)filename; + (void)shim_cfi; + return NULL; +#endif +} + static size_t _Py_jit_shim_size = 0; +static void *_Py_jit_shim_gdb_handle = NULL; static int address_in_executor_array(_PyExecutorObject **ptrs, size_t count, uintptr_t addr) @@ -731,6 +761,11 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz } executor->jit_code = memory; executor->jit_size = total_size; + executor->jit_gdb_handle = jit_record_code(memory, + code_size + state.trampolines.size, + "jit_entry", + "", + /*shim_cfi=*/NULL); return 0; } @@ -781,6 +816,29 @@ compile_shim(void) return NULL; } _Py_jit_shim_size = total_size; + /* GDB JIT unwind info (and the captured-.eh_frame blob that feeds it) + * is only wired up on Linux+ELF — see jit_record_code() below. Even + * there, user-provided JIT CFLAGS can suppress the shim's .eh_frame, + * so jit_stencils.h advertises whether the captured CFI blob exists. */ +#if defined(__linux__) && defined(__ELF__) && _Py_JIT_HAS_SHIM_CFI + static const _PyJitUnwind_ShimCfi shim_cfi = { + .cie_init_cfi = _Py_jit_shim_cie_init_cfi, + .cie_init_cfi_size = sizeof(_Py_jit_shim_cie_init_cfi), + .fde_cfi = _Py_jit_shim_fde_cfi, + .fde_cfi_size = sizeof(_Py_jit_shim_fde_cfi), + .code_align = _Py_jit_shim_code_align, + .data_align = _Py_jit_shim_data_align, + .ra_column = _Py_jit_shim_ra_column, + }; + const _PyJitUnwind_ShimCfi *shim_cfi_ptr = &shim_cfi; +#else + const _PyJitUnwind_ShimCfi *shim_cfi_ptr = NULL; +#endif + _Py_jit_shim_gdb_handle = jit_record_code(memory, + code_size + state.trampolines.size, + "jit_entry", + "", + shim_cfi_ptr); return (_PyJitEntryFuncPtr)memory; } @@ -812,6 +870,12 @@ _PyJIT_Free(_PyExecutorObject *executor) if (memory) { executor->jit_code = NULL; executor->jit_size = 0; +#if defined(__linux__) && defined(__ELF__) + if (executor->jit_gdb_handle != NULL) { + _PyJitUnwind_GdbUnregisterCode(executor->jit_gdb_handle); + executor->jit_gdb_handle = NULL; + } +#endif if (jit_free(memory, size)) { PyErr_FormatUnraisable("Exception ignored while " "freeing JIT memory"); @@ -829,6 +893,12 @@ _PyJIT_Fini(void) if (size) { _Py_jit_entry = _Py_LazyJitShim; _Py_jit_shim_size = 0; +#if defined(__linux__) && defined(__ELF__) + if (_Py_jit_shim_gdb_handle != NULL) { + _PyJitUnwind_GdbUnregisterCode(_Py_jit_shim_gdb_handle); + _Py_jit_shim_gdb_handle = NULL; + } +#endif if (jit_free(memory, size)) { PyErr_FormatUnraisable("Exception ignored while " "freeing JIT entry code"); diff --git a/Python/jit_unwind.c b/Python/jit_unwind.c new file mode 100644 index 00000000000000..0e8c61129d5609 --- /dev/null +++ b/Python/jit_unwind.c @@ -0,0 +1,1017 @@ +/* + * Python JIT - DWARF .eh_frame builder + * + * This file contains the DWARF CFI generator used to build .eh_frame + * data for JIT code (perf jitdump and other unwinders). + */ + +#include "Python.h" +#include "pycore_jit_unwind.h" +#include "pycore_lock.h" + +#if defined(PY_HAVE_PERF_TRAMPOLINE) || (defined(__linux__) && defined(__ELF__)) + +#if defined(__linux__) +# include +#endif +#include +#include + +// ============================================================================= +// DWARF CONSTANTS +// ============================================================================= + +/* + * DWARF (Debug With Arbitrary Record Formats) constants + * + * DWARF is a debugging data format used to provide stack unwinding information. + * These constants define the various encoding types and opcodes used in + * DWARF Call Frame Information (CFI) records. + */ + +/* DWARF Call Frame Information version */ +#define DWRF_CIE_VERSION 1 + +/* DWARF CFA (Call Frame Address) opcodes */ +enum { + DWRF_CFA_nop = 0x0, // No operation + DWRF_CFA_offset_extended = 0x5, // Extended offset instruction + DWRF_CFA_def_cfa = 0xc, // Define CFA rule + DWRF_CFA_def_cfa_register = 0xd, // Define CFA register + DWRF_CFA_def_cfa_offset = 0xe, // Define CFA offset + DWRF_CFA_offset_extended_sf = 0x11, // Extended signed offset + DWRF_CFA_advance_loc = 0x40, // Advance location counter + DWRF_CFA_offset = 0x80, // Simple offset instruction + DWRF_CFA_restore = 0xc0 // Restore register +}; + +/* + * Architecture-specific DWARF register numbers + * + * These constants define the register numbering scheme used by DWARF + * for each supported architecture. The numbers must match the ABI + * specification for proper stack unwinding. + */ +enum { +#ifdef __x86_64__ + /* x86_64 register numbering (note: order is defined by x86_64 ABI) */ + DWRF_REG_AX, // RAX + DWRF_REG_DX, // RDX + DWRF_REG_CX, // RCX + DWRF_REG_BX, // RBX + DWRF_REG_SI, // RSI + DWRF_REG_DI, // RDI + DWRF_REG_BP, // RBP + DWRF_REG_SP, // RSP + DWRF_REG_8, // R8 + DWRF_REG_9, // R9 + DWRF_REG_10, // R10 + DWRF_REG_11, // R11 + DWRF_REG_12, // R12 + DWRF_REG_13, // R13 + DWRF_REG_14, // R14 + DWRF_REG_15, // R15 + DWRF_REG_RA, // Return address (RIP) +#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) + /* AArch64 register numbering */ + DWRF_REG_FP = 29, // Frame Pointer + DWRF_REG_RA = 30, // Link register (return address) + DWRF_REG_SP = 31, // Stack pointer +#else +# error "Unsupported target architecture" +#endif +}; + +// ============================================================================= +// ELF OBJECT CONTEXT +// ============================================================================= + +/* + * Context for building ELF/DWARF structures + * + * This structure maintains state while constructing DWARF unwind information. + * It acts as a simple buffer manager with pointers to track current position + * and important landmarks within the buffer. + */ +typedef struct ELFObjectContext { + uint8_t* p; // Current write position in buffer + uint8_t* startp; // Start of buffer (for offset calculations) + uint8_t* fde_p; // Start of FDE data (for PC-relative calculations) + uintptr_t code_addr; // Address of the code section + size_t code_size; // Size of the code section + const _PyJitUnwind_ShimCfi* shim_cfi; // GDB emitter: NULL => executor +} ELFObjectContext; + +// ============================================================================= +// DWARF GENERATION UTILITIES +// ============================================================================= + +/* + * Append a null-terminated string to the ELF context buffer. + * + * Args: + * ctx: ELF object context + * str: String to append (must be null-terminated) + * + * Returns: Offset from start of buffer where string was written + */ +static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) { + uint8_t* p = ctx->p; + uint32_t ofs = (uint32_t)(p - ctx->startp); + + /* Copy string including null terminator */ + do { + *p++ = (uint8_t)*str; + } while (*str++); + + ctx->p = p; + return ofs; +} + +/* + * Append a SLEB128 (Signed Little Endian Base 128) value + * + * SLEB128 is a variable-length encoding used extensively in DWARF. + * It efficiently encodes small numbers in fewer bytes. + * + * Args: + * ctx: ELF object context + * v: Signed value to encode + */ +static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) { + uint8_t* p = ctx->p; + + /* Encode 7 bits at a time, with continuation bit in MSB */ + for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) { + *p++ = (uint8_t)((v & 0x7f) | 0x80); // Set continuation bit + } + *p++ = (uint8_t)(v & 0x7f); // Final byte without continuation bit + + ctx->p = p; +} + +/* + * Append a ULEB128 (Unsigned Little Endian Base 128) value + * + * Similar to SLEB128 but for unsigned values. + * + * Args: + * ctx: ELF object context + * v: Unsigned value to encode + */ +static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) { + uint8_t* p = ctx->p; + + /* Encode 7 bits at a time, with continuation bit in MSB */ + for (; v >= 0x80; v >>= 7) { + *p++ = (char)((v & 0x7f) | 0x80); // Set continuation bit + } + *p++ = (char)v; // Final byte without continuation bit + + ctx->p = p; +} + +/* + * Macros for generating DWARF structures + * + * These macros provide a convenient way to write various data types + * to the DWARF buffer while automatically advancing the pointer. + */ +#define DWRF_U8(x) (*p++ = (x)) // Write unsigned 8-bit +#define DWRF_I8(x) (*(int8_t*)p = (x), p++) // Write signed 8-bit +#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2) // Write unsigned 16-bit +#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4) // Write unsigned 32-bit +#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t)) // Write address +#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) // Write ULEB128 +#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) // Write SLEB128 +#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) // Write string +#define DWRF_BYTES(buf, len) (memcpy(p, (buf), (len)), p += (len)) // Splice in raw bytes + +/* Align to specified boundary with NOP instructions */ +#define DWRF_ALIGNNOP(s) \ + while ((uintptr_t)p & ((s)-1)) { \ + *p++ = DWRF_CFA_nop; \ + } + +/* Write a DWARF section with automatic size calculation */ +#define DWRF_SECTION(name, stmt) \ + { \ + uint32_t* szp_##name = (uint32_t*)p; \ + p += 4; \ + stmt; \ + *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \ + } + +// ============================================================================= +// DWARF EH FRAME GENERATION +// ============================================================================= + +static void elf_init_ehframe_perf(ELFObjectContext* ctx); +static void elf_init_ehframe_gdb(ELFObjectContext* ctx); + +static inline void elf_init_ehframe(ELFObjectContext* ctx, int absolute_addr) { + if (absolute_addr) { + elf_init_ehframe_gdb(ctx); + } + else { + elf_init_ehframe_perf(ctx); + } +} + +size_t +_PyJitUnwind_EhFrameSize(int absolute_addr, + const _PyJitUnwind_ShimCfi *shim_cfi) +{ + /* The .eh_frame we emit is small and bounded; keep a generous buffer. */ + uint8_t scratch[512]; + _Static_assert(sizeof(scratch) >= 256, + "scratch buffer may be too small for elf_init_ehframe"); + ELFObjectContext ctx; + ctx.code_size = 1; + ctx.code_addr = 0; + ctx.startp = ctx.p = scratch; + ctx.fde_p = NULL; + ctx.shim_cfi = shim_cfi; + /* Generate once into scratch to learn the required size. */ + elf_init_ehframe(&ctx, absolute_addr); + ptrdiff_t size = ctx.p - ctx.startp; + assert(size <= (ptrdiff_t)sizeof(scratch)); + return (size_t)size; +} + +size_t +_PyJitUnwind_BuildEhFrame(uint8_t *buffer, size_t buffer_size, + const void *code_addr, size_t code_size, + int absolute_addr, + const _PyJitUnwind_ShimCfi *shim_cfi) +{ + if (buffer == NULL || code_addr == NULL || code_size == 0) { + return 0; + } + /* Generate the frame twice: once to size-check, once to write. */ + size_t required = _PyJitUnwind_EhFrameSize(absolute_addr, shim_cfi); + if (required == 0 || required > buffer_size) { + return 0; + } + ELFObjectContext ctx; + ctx.code_size = code_size; + ctx.code_addr = (uintptr_t)code_addr; + ctx.startp = ctx.p = buffer; + ctx.fde_p = NULL; + ctx.shim_cfi = shim_cfi; + elf_init_ehframe(&ctx, absolute_addr); + size_t written = (size_t)(ctx.p - ctx.startp); + /* The frame size is independent of code_addr/code_size (fixed-width fields). */ + assert(written == required); + return written; +} + +/* + * Generate a minimal .eh_frame for a single JIT code region. + * + * The .eh_frame section contains Call Frame Information (CFI) that describes + * how to unwind the stack at any point in the code. This is essential for + * unwinding through JIT-generated code. + * + * The generated data contains: + * 1. A CIE (Common Information Entry) describing the calling convention. + * 2. An FDE (Frame Description Entry) describing how to unwind the JIT frame. + * + * Two flavors are emitted, dispatched on the absolute_addr flag: + * + * - absolute_addr == 0 (elf_init_ehframe_perf): PC-relative FDE address + * encoding for perf's synthesized DSO layout. The CIE describes the + * trampoline's entry state and the FDE walks through the prologue and + * epilogue with advance_loc instructions. This matches the pre-existing + * perf_jit_trampoline behavior byte-for-byte. + * + * - absolute_addr == 1 (elf_init_ehframe_gdb): absolute FDE address + * encoding for the GDB JIT in-memory ELF. The CIE describes the + * steady-state frame layout (CFA = %rbp+16 / x29+16, with saved fp and + * return-address column at fixed offsets) and the FDE emits no further + * CFI. The same rule applies at every PC in the registered region, + * which is correct for executor stencils (they pin the frame pointer + * across the region). This is the GDB-side fix; see elf_init_ehframe_gdb + * for details. + */ +static void elf_init_ehframe_perf(ELFObjectContext* ctx) { + int fde_ptr_enc = DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4; + uint8_t* p = ctx->p; + uint8_t* framep = p; // Remember start of frame data + + /* + * DWARF Unwind Table for Trampoline Function + * + * This section defines DWARF Call Frame Information (CFI) using encoded macros + * like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function + * preserves and restores registers. This is used by profiling tools (e.g., `perf`) + * and debuggers for stack unwinding in JIT-compiled code. + * + * ------------------------------------------------- + * TO REGENERATE THIS TABLE FROM GCC OBJECTS: + * ------------------------------------------------- + * + * 1. Create a trampoline source file (e.g., `trampoline.c`): + * + * #include + * typedef PyObject* (*py_evaluator)(void*, void*, int); + * PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) { + * return evaluator(ts, f, throwflag); + * } + * + * 2. Compile to an object file with frame pointer preservation: + * + * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c + * + * 3. Extract DWARF unwind info from the object file: + * + * readelf -w trampoline.o + * + * Example output from `.eh_frame`: + * + * 00000000 CIE + * Version: 1 + * Augmentation: "zR" + * Code alignment factor: 4 + * Data alignment factor: -8 + * Return address column: 30 + * DW_CFA_def_cfa: r31 (sp) ofs 0 + * + * 00000014 FDE cie=00000000 pc=0..14 + * DW_CFA_advance_loc: 4 + * DW_CFA_def_cfa_offset: 16 + * DW_CFA_offset: r29 at cfa-16 + * DW_CFA_offset: r30 at cfa-8 + * DW_CFA_advance_loc: 12 + * DW_CFA_restore: r30 + * DW_CFA_restore: r29 + * DW_CFA_def_cfa_offset: 0 + * + * -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`. + * + * ---------------------------------- + * HOW TO TRANSLATE TO DWRF_* MACROS: + * ---------------------------------- + * + * After compiling your trampoline with: + * + * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c + * + * run: + * + * readelf -w trampoline.o + * + * to inspect the generated `.eh_frame` data. You will see two main components: + * + * 1. A CIE (Common Information Entry): shared configuration used by all FDEs. + * 2. An FDE (Frame Description Entry): function-specific unwind instructions. + * + * --------------------- + * Translating the CIE: + * --------------------- + * From `readelf -w`, you might see: + * + * 00000000 0000000000000010 00000000 CIE + * Version: 1 + * Augmentation: "zR" + * Code alignment factor: 4 + * Data alignment factor: -8 + * Return address column: 30 + * Augmentation data: 1b + * DW_CFA_def_cfa: r31 (sp) ofs 0 + * + * Map this to: + * + * DWRF_SECTION(CIE, + * DWRF_U32(0); // CIE ID (always 0 for CIEs) + * DWRF_U8(DWRF_CIE_VERSION); // Version: 1 + * DWRF_STR("zR"); // Augmentation string "zR" + * DWRF_UV(4); // Code alignment factor = 4 + * DWRF_SV(-8); // Data alignment factor = -8 + * DWRF_U8(DWRF_REG_RA); // Return address register (e.g., x30 = 30) + * DWRF_UV(1); // Augmentation data length = 1 + * DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers + * + * DWRF_U8(DWRF_CFA_def_cfa); // DW_CFA_def_cfa + * DWRF_UV(DWRF_REG_SP); // Register: SP (r31) + * DWRF_UV(0); // Offset = 0 + * + * DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer size boundary + * ) + * + * Notes: + * - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128. + * - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants. + * + * --------------------- + * Translating the FDE: + * --------------------- + * From `readelf -w`: + * + * 00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014 + * DW_CFA_advance_loc: 4 + * DW_CFA_def_cfa_offset: 16 + * DW_CFA_offset: r29 at cfa-16 + * DW_CFA_offset: r30 at cfa-8 + * DW_CFA_advance_loc: 12 + * DW_CFA_restore: r30 + * DW_CFA_restore: r29 + * DW_CFA_def_cfa_offset: 0 + * + * Map the FDE header and instructions to: + * + * DWRF_SECTION(FDE, + * DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (relative from here) + * DWRF_U32(pc_relative_offset); // PC-relative location of the code (calculated dynamically) + * DWRF_U32(ctx->code_size); // Code range covered by this FDE + * DWRF_U8(0); // Augmentation data length (none) + * + * DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance location by 1 unit (1 * 4 = 4 bytes) + * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 + * DWRF_UV(16); + * + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer) + * DWRF_UV(2); // At offset 2 * 8 = 16 bytes + * + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address) + * DWRF_UV(1); // At offset 1 * 8 = 8 bytes + * + * DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance location by 3 units (3 * 4 = 12 bytes) + * + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore x30 + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore x29 + * + * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + * DWRF_UV(0); + * ) + * + * To regenerate: + * 1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE. + * 2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as + * the code is in a different address space every time. + * 3. For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro: + * - `DW_CFA_def_cfa_offset` → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value) + * - `DW_CFA_offset: rX` → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(offset) + * - `DW_CFA_restore: rX` → DWRF_U8(DWRF_CFA_offset | reg) // restore is same as reusing offset + * - `DW_CFA_advance_loc: N` → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor)) + * 4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers. + * 5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment. + */ + + /* + * Emit DWARF EH CIE (Common Information Entry) + * + * The CIE describes the calling conventions and basic unwinding rules + * that apply to all functions in this compilation unit. + */ + DWRF_SECTION(CIE, + DWRF_U32(0); // CIE ID (0 indicates this is a CIE) + DWRF_U8(DWRF_CIE_VERSION); // CIE version (1) + DWRF_STR("zR"); // Augmentation string ("zR" = has LSDA) +#ifdef __x86_64__ + DWRF_UV(1); // Code alignment factor (x86_64: 1 byte) +#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) + DWRF_UV(4); // Code alignment factor (AArch64: 4 bytes per instruction) +#endif + DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative) + DWRF_U8(DWRF_REG_RA); // Return address register number + DWRF_UV(1); // Augmentation data length + DWRF_U8(fde_ptr_enc); // FDE pointer encoding + + /* Initial CFI instructions - describe default calling convention */ +#ifdef __x86_64__ + /* x86_64 initial CFI state */ + DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address) + DWRF_UV(DWRF_REG_SP); // CFA = SP register + DWRF_UV(sizeof(uintptr_t)); // CFA = SP + pointer_size + DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved + DWRF_UV(1); // At offset 1 from CFA +#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) + /* AArch64 initial CFI state */ + DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address) + DWRF_UV(DWRF_REG_SP); // CFA = SP register + DWRF_UV(0); // CFA = SP + 0 (AArch64 starts with offset 0) + // No initial register saves in AArch64 CIE +#endif + DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary + ) + + /* + * Emit DWARF EH FDE (Frame Description Entry) + * + * The FDE describes unwinding information specific to this function. + * It references the CIE and provides function-specific CFI instructions. + * + * The PC-relative offset is calculated after the entire EH frame is built + * to ensure accurate positioning relative to the synthesized DSO layout. + */ + DWRF_SECTION(FDE, + DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (backwards reference) + /* + * In perf jitdump mode the FDE PC field is encoded PC-relative and + * points back to code_start. Record where that field lives so we can + * patch in the final offset after the rest of the synthetic DSO + * layout is known. + */ + ctx->fde_p = p; // Remember where PC offset field is located for later calculation + DWRF_U32(0); // Placeholder for PC-relative offset (calculated below) + DWRF_U32(ctx->code_size); // Address range covered by this FDE (code length) + DWRF_U8(0); // Augmentation data length (none) + + /* + * Architecture-specific CFI instructions + * + * These instructions describe how registers are saved and restored + * during function calls. Each architecture has different calling + * conventions and register usage patterns. + */ +#ifdef __x86_64__ + /* x86_64 calling convention unwinding rules */ +# if defined(__CET__) && (__CET__ & 1) + DWRF_U8(DWRF_CFA_advance_loc | 4); // Advance past endbr64 (4 bytes) +# endif + DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance past push %rbp (1 byte) + DWRF_U8(DWRF_CFA_def_cfa_offset); // def_cfa_offset 16 + DWRF_UV(16); // New offset: SP + 16 + DWRF_U8(DWRF_CFA_offset | DWRF_REG_BP); // offset r6 at cfa-16 + DWRF_UV(2); // Offset factor: 2 * 8 = 16 bytes + DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance past mov %rsp,%rbp (3 bytes) + DWRF_U8(DWRF_CFA_def_cfa_register); // def_cfa_register r6 + DWRF_UV(DWRF_REG_BP); // Use base pointer register + DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance past call *%rcx (2 bytes) + pop %rbp (1 byte) = 3 + DWRF_U8(DWRF_CFA_def_cfa); // def_cfa r7 ofs 8 + DWRF_UV(DWRF_REG_SP); // Use stack pointer register + DWRF_UV(8); // New offset: SP + 8 +#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) + /* AArch64 calling convention unwinding rules */ + DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance by 1 instruction (4 bytes) + DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 + DWRF_UV(16); // Stack pointer moved by 16 bytes + DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // x29 (frame pointer) saved + DWRF_UV(2); // At CFA-16 (2 * 8 = 16 bytes from CFA) + DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // x30 (link register) saved + DWRF_UV(1); // At CFA-8 (1 * 8 = 8 bytes from CFA) + DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance by 3 instructions (12 bytes) + DWRF_U8(DWRF_CFA_def_cfa_register); // CFA = FP (x29) + 16 + DWRF_UV(DWRF_REG_FP); + DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA); // Restore x30 - NO DWRF_UV() after this! + DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP); // Restore x29 - NO DWRF_UV() after this! + DWRF_U8(DWRF_CFA_def_cfa); // CFA = SP + 0 (stack restored) + DWRF_UV(DWRF_REG_SP); + DWRF_UV(0); + +#else +# error "Unsupported target architecture" +#endif + + DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary + ) + + ctx->p = p; // Update context pointer to end of generated data + + /* Calculate and update the PC-relative offset in the FDE + * + * When perf processes the jitdump, it creates a synthesized DSO with this layout: + * + * Synthesized DSO Memory Layout: + * ┌─────────────────────────────────────────────────────────────┐ < code_start + * │ Code Section │ + * │ (round_up(code_size, 8) bytes) │ + * ├─────────────────────────────────────────────────────────────┤ < start of EH frame data + * │ EH Frame Data │ + * │ ┌─────────────────────────────────────────────────────┐ │ + * │ │ CIE data │ │ + * │ └─────────────────────────────────────────────────────┘ │ + * │ ┌─────────────────────────────────────────────────────┐ │ + * │ │ FDE Header: │ │ + * │ │ - CIE offset (4 bytes) │ │ + * │ │ - PC offset (4 bytes) <─ fde_offset_in_frame ─────┼────┼─> points to code_start + * │ │ - address range (4 bytes) │ │ (this specific field) + * │ │ CFI Instructions... │ │ + * │ └─────────────────────────────────────────────────────┘ │ + * ├─────────────────────────────────────────────────────────────┤ < reference_point + * │ EhFrameHeader │ + * │ (navigation metadata) │ + * └─────────────────────────────────────────────────────────────┘ + * + * The PC offset field in the FDE must contain the distance from itself to code_start: + * + * distance = code_start - fde_pc_field + * + * Where: + * fde_pc_field_location = reference_point - eh_frame_size + fde_offset_in_frame + * code_start_location = reference_point - eh_frame_size - round_up(code_size, 8) + * + * Therefore: + * distance = code_start_location - fde_pc_field_location + * = (ref - eh_frame_size - rounded_code_size) - (ref - eh_frame_size + fde_offset_in_frame) + * = -rounded_code_size - fde_offset_in_frame + * = -(round_up(code_size, 8) + fde_offset_in_frame) + * + * Note: fde_offset_in_frame is the offset from EH frame start to the PC offset field. + * + */ + int32_t rounded_code_size = + (int32_t)_Py_SIZE_ROUND_UP(ctx->code_size, 8); + int32_t fde_offset_in_frame = (int32_t)(ctx->fde_p - framep); + *(int32_t *)ctx->fde_p = -(rounded_code_size + fde_offset_in_frame); +} + +/* + * Build .eh_frame data for the GDB JIT interface. + * + * Two region kinds share the same ELF layout but diverge on CFI source: + * + * - Executor (shim_cfi == NULL). The CIE's initial CFI hard-codes the + * pinned-frame-pointer steady state (CFA = %rbp+16 / x29+96 with the + * frame record saved there). Executor stencils never touch the frame + * pointer — enforced by Tools/jit/_optimizers.py _validate() and + * -mframe-pointer=reserved — so that rule is valid at every PC and + * the FDE body is empty. + * + * - Shim (shim_cfi != NULL). Tools/jit captures the shim's compiled + * .eh_frame at build time; we splice the CIE-initial-CFI and FDE-CFI + * byte blobs into a freshly-built ELF with our chosen (absolute) FDE + * pointer encoding. Whatever prologue clang emits is described + * accurately, regardless of target or flags. + */ +static void elf_init_ehframe_gdb(ELFObjectContext* ctx) { + int fde_ptr_enc = DWRF_EH_PE_absptr; + uint8_t* p = ctx->p; + uint8_t* framep = p; + const _PyJitUnwind_ShimCfi* shim = ctx->shim_cfi; + + DWRF_SECTION(CIE, + DWRF_U32(0); // CIE ID + DWRF_U8(DWRF_CIE_VERSION); + DWRF_STR("zR"); // aug data length + FDE ptr encoding follow + if (shim != NULL) { + DWRF_UV(shim->code_align); + DWRF_SV(shim->data_align); + DWRF_U8(shim->ra_column); + } + else { +#ifdef __x86_64__ + DWRF_UV(1); // x86_64: 1 byte per instruction +#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) + DWRF_UV(4); // AArch64: 4 bytes per instruction +#endif + DWRF_SV(-(int64_t)sizeof(uintptr_t)); + DWRF_U8(DWRF_REG_RA); + } + DWRF_UV(1); // Augmentation data length + DWRF_U8(fde_ptr_enc); // FDE pointer encoding + + if (shim != NULL) { + DWRF_BYTES(shim->cie_init_cfi, shim->cie_init_cfi_size); + } + else { + /* Executor steady-state rule (our invariant, not the compiler's). */ +#ifdef __x86_64__ + DWRF_U8(DWRF_CFA_def_cfa); // CFA = %rbp + 16 + DWRF_UV(DWRF_REG_BP); + DWRF_UV(16); + DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); + DWRF_UV(1); + DWRF_U8(DWRF_CFA_offset | DWRF_REG_BP); + DWRF_UV(2); +#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) + DWRF_U8(DWRF_CFA_def_cfa); // CFA = x29 + 96 + DWRF_UV(DWRF_REG_FP); + DWRF_UV(96); + DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); + DWRF_UV(12); // saved x29 at cfa-96 + DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); + DWRF_UV(11); // x30 at cfa-88 +#else +# error "Unsupported target architecture" +#endif + } + DWRF_ALIGNNOP(sizeof(uintptr_t)); + ) + + DWRF_SECTION(FDE, + DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (backwards reference) + DWRF_ADDR(ctx->code_addr); // Absolute code start + DWRF_ADDR((uintptr_t)ctx->code_size); // Code range covered + DWRF_U8(0); // Augmentation data length (none) + if (shim != NULL) { + DWRF_BYTES(shim->fde_cfi, shim->fde_cfi_size); + } + DWRF_ALIGNNOP(sizeof(uintptr_t)); + ) + + ctx->p = p; +} + +#if defined(__linux__) && defined(__ELF__) +enum { + JIT_NOACTION = 0, + JIT_REGISTER_FN = 1, + JIT_UNREGISTER_FN = 2, +}; + +struct jit_code_entry { + struct jit_code_entry *next; + struct jit_code_entry *prev; + const char *symfile_addr; + uint64_t symfile_size; + const void *code_addr; +}; + +struct jit_descriptor { + uint32_t version; + uint32_t action_flag; + struct jit_code_entry *relevant_entry; + struct jit_code_entry *first_entry; +}; + +static PyMutex jit_debug_mutex = {0}; + +Py_EXPORTED_SYMBOL volatile struct jit_descriptor __jit_debug_descriptor = { + 1, JIT_NOACTION, NULL, NULL +}; + +Py_EXPORTED_SYMBOL void __attribute__((noinline)) +__jit_debug_register_code(void) +{ + /* Keep this call visible to debuggers and not optimized away. */ + (void)__jit_debug_descriptor.action_flag; +#if defined(__GNUC__) || defined(__clang__) + __asm__ __volatile__("" ::: "memory"); +#endif +} + +static uint16_t +gdb_jit_machine_id(void) +{ + /* Map the current target to ELF e_machine; return 0 to skip registration. */ +#if defined(__x86_64__) || defined(_M_X64) + return EM_X86_64; +#elif defined(__aarch64__) && !defined(__ILP32__) + return EM_AARCH64; +#else + return 0; +#endif +} + +static struct jit_code_entry * +gdb_jit_register_code( + const void *code_addr, + size_t code_size, + const char *symname, + const uint8_t *eh_frame, + size_t eh_frame_size +) +{ + /* + * Build a minimal in-memory ELF for GDB's JIT interface and link it into + * __jit_debug_descriptor so debuggers can resolve JIT code. + */ + if (code_addr == NULL || code_size == 0 || symname == NULL) { + return NULL; + } + + const uint16_t machine = gdb_jit_machine_id(); + if (machine == 0) { + return NULL; + } + + enum { + SH_NULL = 0, + SH_TEXT, + SH_EH_FRAME, + SH_SHSTRTAB, + SH_STRTAB, + SH_SYMTAB, + SH_NUM, + }; + static const char shstrtab[] = + "\0.text\0.eh_frame\0.shstrtab\0.strtab\0.symtab"; + _Static_assert(sizeof(shstrtab) == + 1 + sizeof(".text") + sizeof(".eh_frame") + + sizeof(".shstrtab") + sizeof(".strtab") + sizeof(".symtab"), + "shstrtab size mismatch"); + const size_t shstrtab_size = sizeof(shstrtab); + const size_t sh_text = 1; + const size_t sh_eh_frame = sh_text + sizeof(".text"); + const size_t sh_shstrtab = sh_eh_frame + sizeof(".eh_frame"); + const size_t sh_strtab = sh_shstrtab + sizeof(".shstrtab"); + const size_t sh_symtab = sh_strtab + sizeof(".strtab"); + const size_t text_size = code_size; + const size_t text_padded = _Py_SIZE_ROUND_UP(text_size, 8); + const size_t strtab_size = 1 + strlen(symname) + 1; + const size_t symtab_size = 3 * sizeof(Elf64_Sym); + + size_t offset = sizeof(Elf64_Ehdr); + offset = _Py_SIZE_ROUND_UP(offset, 16); + const size_t text_off = offset; + const size_t eh_off = text_off + text_padded; + offset = eh_off + eh_frame_size; + const size_t shstr_off = offset; + offset += shstrtab_size; + const size_t str_off = offset; + offset += strtab_size; + offset = _Py_SIZE_ROUND_UP(offset, sizeof(Elf64_Sym)); + const size_t sym_off = offset; + offset += symtab_size; + offset = _Py_SIZE_ROUND_UP(offset, sizeof(Elf64_Shdr)); + const size_t sh_off = offset; + + const size_t shnum = SH_NUM; + const size_t total_size = sh_off + shnum * sizeof(Elf64_Shdr); + uint8_t *buf = (uint8_t *)PyMem_RawMalloc(total_size); + if (buf == NULL) { + return NULL; + } + memset(buf, 0, total_size); + + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)buf; + memcpy(ehdr->e_ident, ELFMAG, SELFMAG); + ehdr->e_ident[EI_CLASS] = ELFCLASS64; + ehdr->e_ident[EI_DATA] = ELFDATA2LSB; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + ehdr->e_ident[EI_OSABI] = ELFOSABI_NONE; + ehdr->e_type = ET_DYN; + ehdr->e_machine = machine; + ehdr->e_version = EV_CURRENT; + ehdr->e_entry = 0; + ehdr->e_phoff = 0; + ehdr->e_shoff = sh_off; + ehdr->e_ehsize = sizeof(Elf64_Ehdr); + ehdr->e_shentsize = sizeof(Elf64_Shdr); + ehdr->e_shnum = shnum; + ehdr->e_shstrndx = SH_SHSTRTAB; + + memcpy(buf + text_off, code_addr, text_size); + memcpy(buf + eh_off, eh_frame, eh_frame_size); + + char *shstr = (char *)(buf + shstr_off); + memcpy(shstr, shstrtab, shstrtab_size); + + char *strtab = (char *)(buf + str_off); + strtab[0] = '\0'; + memcpy(strtab + 1, symname, strlen(symname)); + strtab[strtab_size - 1] = '\0'; + + Elf64_Sym *syms = (Elf64_Sym *)(buf + sym_off); + memset(syms, 0, symtab_size); + /* Section symbol for .text (local) */ + syms[1].st_info = ELF64_ST_INFO(STB_LOCAL, STT_SECTION); + syms[1].st_shndx = 1; + /* Function symbol */ + syms[2].st_name = 1; + syms[2].st_info = ELF64_ST_INFO(STB_GLOBAL, STT_FUNC); + syms[2].st_other = STV_DEFAULT; + syms[2].st_shndx = 1; + /* For ET_DYN/ET_EXEC, st_value is the absolute virtual address. */ + syms[2].st_value = (Elf64_Addr)(uintptr_t)code_addr; + syms[2].st_size = code_size; + + Elf64_Shdr *shdrs = (Elf64_Shdr *)(buf + sh_off); + memset(shdrs, 0, shnum * sizeof(Elf64_Shdr)); + + shdrs[SH_TEXT].sh_name = sh_text; + shdrs[SH_TEXT].sh_type = SHT_PROGBITS; + shdrs[SH_TEXT].sh_flags = SHF_ALLOC | SHF_EXECINSTR; + shdrs[SH_TEXT].sh_addr = (Elf64_Addr)(uintptr_t)code_addr; + shdrs[SH_TEXT].sh_offset = text_off; + shdrs[SH_TEXT].sh_size = text_size; + shdrs[SH_TEXT].sh_addralign = 16; + + shdrs[SH_EH_FRAME].sh_name = sh_eh_frame; + shdrs[SH_EH_FRAME].sh_type = SHT_PROGBITS; + shdrs[SH_EH_FRAME].sh_flags = SHF_ALLOC; + shdrs[SH_EH_FRAME].sh_addr = + (Elf64_Addr)((uintptr_t)code_addr + text_padded); + shdrs[SH_EH_FRAME].sh_offset = eh_off; + shdrs[SH_EH_FRAME].sh_size = eh_frame_size; + shdrs[SH_EH_FRAME].sh_addralign = 8; + + shdrs[SH_SHSTRTAB].sh_name = sh_shstrtab; + shdrs[SH_SHSTRTAB].sh_type = SHT_STRTAB; + shdrs[SH_SHSTRTAB].sh_offset = shstr_off; + shdrs[SH_SHSTRTAB].sh_size = shstrtab_size; + shdrs[SH_SHSTRTAB].sh_addralign = 1; + + shdrs[SH_STRTAB].sh_name = sh_strtab; + shdrs[SH_STRTAB].sh_type = SHT_STRTAB; + shdrs[SH_STRTAB].sh_offset = str_off; + shdrs[SH_STRTAB].sh_size = strtab_size; + shdrs[SH_STRTAB].sh_addralign = 1; + + shdrs[SH_SYMTAB].sh_name = sh_symtab; + shdrs[SH_SYMTAB].sh_type = SHT_SYMTAB; + shdrs[SH_SYMTAB].sh_offset = sym_off; + shdrs[SH_SYMTAB].sh_size = symtab_size; + shdrs[SH_SYMTAB].sh_link = SH_STRTAB; + shdrs[SH_SYMTAB].sh_info = 2; + shdrs[SH_SYMTAB].sh_addralign = 8; + shdrs[SH_SYMTAB].sh_entsize = sizeof(Elf64_Sym); + + struct jit_code_entry *entry = PyMem_RawMalloc(sizeof(*entry)); + if (entry == NULL) { + PyMem_RawFree(buf); + return NULL; + } + entry->symfile_addr = (const char *)buf; + entry->symfile_size = total_size; + entry->code_addr = code_addr; + + PyMutex_Lock(&jit_debug_mutex); + entry->prev = NULL; + entry->next = __jit_debug_descriptor.first_entry; + if (entry->next != NULL) { + entry->next->prev = entry; + } + __jit_debug_descriptor.first_entry = entry; + __jit_debug_descriptor.relevant_entry = entry; + __jit_debug_descriptor.action_flag = JIT_REGISTER_FN; + __jit_debug_register_code(); + __jit_debug_descriptor.action_flag = JIT_NOACTION; + __jit_debug_descriptor.relevant_entry = NULL; + PyMutex_Unlock(&jit_debug_mutex); + return entry; +} +#endif // __linux__ && __ELF__ + +void * +_PyJitUnwind_GdbRegisterCode(const void *code_addr, + size_t code_size, + const char *entry, + const char *filename, + const _PyJitUnwind_ShimCfi *shim_cfi) +{ +#if defined(__linux__) && defined(__ELF__) + /* GDB expects a stable symbol name and absolute addresses in .eh_frame. */ + if (entry == NULL) { + entry = ""; + } + if (filename == NULL) { + filename = ""; + } + size_t name_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1; + char *name = (char *)PyMem_RawMalloc(name_size); + if (name == NULL) { + return NULL; + } + snprintf(name, name_size, "py::%s:%s", entry, filename); + + uint8_t buffer[1024]; + size_t eh_frame_size = _PyJitUnwind_BuildEhFrame( + buffer, sizeof(buffer), code_addr, code_size, 1, shim_cfi); + if (eh_frame_size == 0) { + PyMem_RawFree(name); + return NULL; + } + + void *handle = gdb_jit_register_code(code_addr, code_size, name, + buffer, eh_frame_size); + PyMem_RawFree(name); + return handle; +#else + (void)code_addr; + (void)code_size; + (void)entry; + (void)filename; + (void)shim_cfi; + return NULL; +#endif +} + +void +_PyJitUnwind_GdbUnregisterCode(void *handle) +{ +#if defined(__linux__) && defined(__ELF__) + struct jit_code_entry *entry = (struct jit_code_entry *)handle; + if (entry == NULL) { + return; + } + + PyMutex_Lock(&jit_debug_mutex); + if (entry->prev != NULL) { + entry->prev->next = entry->next; + } + else { + __jit_debug_descriptor.first_entry = entry->next; + } + if (entry->next != NULL) { + entry->next->prev = entry->prev; + } + + __jit_debug_descriptor.relevant_entry = entry; + __jit_debug_descriptor.action_flag = JIT_UNREGISTER_FN; + __jit_debug_register_code(); + __jit_debug_descriptor.action_flag = JIT_NOACTION; + __jit_debug_descriptor.relevant_entry = NULL; + + PyMutex_Unlock(&jit_debug_mutex); + + PyMem_RawFree((void *)entry->symfile_addr); + PyMem_RawFree(entry); +#else + (void)handle; +#endif +} + +#endif // defined(PY_HAVE_PERF_TRAMPOLINE) || (defined(__linux__) && defined(__ELF__)) diff --git a/Python/optimizer.c b/Python/optimizer.c index f09bf778587b12..ee8cff46481b71 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -1296,6 +1296,7 @@ allocate_executor(int exit_count, int length) res->trace = (_PyUOpInstruction *)(res->exits + exit_count); res->code_size = length; res->exit_count = exit_count; + res->jit_gdb_handle = NULL; return res; } @@ -1442,6 +1443,7 @@ make_executor_from_uops(_PyThreadStateImpl *tstate, _PyUOpInstruction *buffer, i // This is initialized to false so we can prevent the executor // from being immediately detected as cold and invalidated. executor->vm_data.cold = false; + executor->jit_gdb_handle = NULL; if (_PyJIT_Compile(executor, executor->trace, length)) { Py_DECREF(executor); return NULL; @@ -1674,6 +1676,7 @@ make_cold_executor(uint16_t opcode) #ifdef _Py_JIT cold->jit_code = NULL; cold->jit_size = 0; + cold->jit_gdb_handle = NULL; if (_PyJIT_Compile(cold, cold->trace, 1)) { Py_DECREF(cold); Py_FatalError("Cannot allocate core JIT code"); diff --git a/Python/perf_jit_trampoline.c b/Python/perf_jit_trampoline.c index 0ba856ea610e59..942c20fc2c69af 100644 --- a/Python/perf_jit_trampoline.c +++ b/Python/perf_jit_trampoline.c @@ -62,6 +62,7 @@ #include "pycore_frame.h" #include "pycore_interp.h" #include "pycore_mmap.h" // _PyAnnotateMemoryMap() +#include "pycore_jit_unwind.h" #include "pycore_runtime.h" // _PyRuntime #ifdef PY_HAVE_PERF_TRAMPOLINE @@ -73,6 +74,7 @@ #include // File control operations #include // Standard I/O operations #include // Standard library functions +#include // memcpy, strlen #include // Memory mapping functions (mmap) #include // System data types #include // System calls (sysconf, getpid) @@ -246,6 +248,25 @@ typedef struct { */ } CodeUnwindingInfoEvent; +/* + * EH Frame Header structure for DWARF unwinding + * + * This header provides metadata about the .eh_frame data that follows. + * It uses PC-relative and data-relative encodings to keep the synthesized + * DSO self-contained when perf injects it. + */ +typedef struct __attribute__((packed)) { + uint8_t version; + uint8_t eh_frame_ptr_enc; + uint8_t fde_count_enc; + uint8_t table_enc; + int32_t eh_frame_ptr; + uint32_t eh_fde_count; + int32_t from; + int32_t to; +} EhFrameHeader; +_Static_assert(sizeof(EhFrameHeader) == 20, "EhFrameHeader layout mismatch"); + // ============================================================================= // GLOBAL STATE MANAGEMENT // ============================================================================= @@ -262,7 +283,8 @@ typedef struct { PyThread_type_lock map_lock; // Thread synchronization lock void* mapped_buffer; // Memory-mapped region (signals perf we're active) size_t mapped_size; // Size of the mapped region - int code_id; // Counter for unique code region identifiers + uint32_t code_id; // Counter for unique code region identifiers + uint64_t build_id_salt; // Per-process salt for unique synthetic DSOs } PerfMapJitState; /* Global singleton instance */ @@ -316,40 +338,6 @@ static int64_t get_current_time_microseconds(void) { return ((int64_t)(tv.tv_sec) * 1000000) + tv.tv_usec; } -// ============================================================================= -// UTILITY FUNCTIONS -// ============================================================================= - -/* - * Round up a value to the next multiple of a given number - * - * This is essential for maintaining proper alignment requirements in the - * jitdump format. Many structures need to be aligned to specific boundaries - * (typically 8 or 16 bytes) for efficient processing by perf. - * - * Args: - * value: The value to round up - * multiple: The multiple to round up to - * - * Returns: The smallest value >= input that is a multiple of 'multiple' - */ -static size_t round_up(int64_t value, int64_t multiple) { - if (multiple == 0) { - return value; // Avoid division by zero - } - - int64_t remainder = value % multiple; - if (remainder == 0) { - return value; // Already aligned - } - - /* Calculate how much to add to reach the next multiple */ - int64_t difference = multiple - remainder; - int64_t rounded_up_value = value + difference; - - return rounded_up_value; -} - // ============================================================================= // FILE I/O UTILITIES // ============================================================================= @@ -406,623 +394,6 @@ static void perf_map_jit_write_header(int pid, FILE* out_file) { perf_map_jit_write_fully(&header, sizeof(header)); } -// ============================================================================= -// DWARF CONSTANTS AND UTILITIES -// ============================================================================= - -/* - * DWARF (Debug With Arbitrary Record Formats) constants - * - * DWARF is a debugging data format used to provide stack unwinding information. - * These constants define the various encoding types and opcodes used in - * DWARF Call Frame Information (CFI) records. - */ - -/* DWARF Call Frame Information version */ -#define DWRF_CIE_VERSION 1 - -/* DWARF CFA (Call Frame Address) opcodes */ -enum { - DWRF_CFA_nop = 0x0, // No operation - DWRF_CFA_offset_extended = 0x5, // Extended offset instruction - DWRF_CFA_def_cfa = 0xc, // Define CFA rule - DWRF_CFA_def_cfa_register = 0xd, // Define CFA register - DWRF_CFA_def_cfa_offset = 0xe, // Define CFA offset - DWRF_CFA_offset_extended_sf = 0x11, // Extended signed offset - DWRF_CFA_advance_loc = 0x40, // Advance location counter - DWRF_CFA_offset = 0x80, // Simple offset instruction - DWRF_CFA_restore = 0xc0 // Restore register -}; - -/* DWARF Exception Handling pointer encodings */ -enum { - DWRF_EH_PE_absptr = 0x00, // Absolute pointer - DWRF_EH_PE_omit = 0xff, // Omitted value - - /* Data type encodings */ - DWRF_EH_PE_uleb128 = 0x01, // Unsigned LEB128 - DWRF_EH_PE_udata2 = 0x02, // Unsigned 2-byte - DWRF_EH_PE_udata4 = 0x03, // Unsigned 4-byte - DWRF_EH_PE_udata8 = 0x04, // Unsigned 8-byte - DWRF_EH_PE_sleb128 = 0x09, // Signed LEB128 - DWRF_EH_PE_sdata2 = 0x0a, // Signed 2-byte - DWRF_EH_PE_sdata4 = 0x0b, // Signed 4-byte - DWRF_EH_PE_sdata8 = 0x0c, // Signed 8-byte - DWRF_EH_PE_signed = 0x08, // Signed flag - - /* Reference type encodings */ - DWRF_EH_PE_pcrel = 0x10, // PC-relative - DWRF_EH_PE_textrel = 0x20, // Text-relative - DWRF_EH_PE_datarel = 0x30, // Data-relative - DWRF_EH_PE_funcrel = 0x40, // Function-relative - DWRF_EH_PE_aligned = 0x50, // Aligned - DWRF_EH_PE_indirect = 0x80 // Indirect -}; - -/* Additional DWARF constants for debug information */ -enum { DWRF_TAG_compile_unit = 0x11 }; -enum { DWRF_children_no = 0, DWRF_children_yes = 1 }; -enum { - DWRF_AT_name = 0x03, // Name attribute - DWRF_AT_stmt_list = 0x10, // Statement list - DWRF_AT_low_pc = 0x11, // Low PC address - DWRF_AT_high_pc = 0x12 // High PC address -}; -enum { - DWRF_FORM_addr = 0x01, // Address form - DWRF_FORM_data4 = 0x06, // 4-byte data - DWRF_FORM_string = 0x08 // String form -}; - -/* Line number program opcodes */ -enum { - DWRF_LNS_extended_op = 0, // Extended opcode - DWRF_LNS_copy = 1, // Copy operation - DWRF_LNS_advance_pc = 2, // Advance program counter - DWRF_LNS_advance_line = 3 // Advance line number -}; - -/* Line number extended opcodes */ -enum { - DWRF_LNE_end_sequence = 1, // End of sequence - DWRF_LNE_set_address = 2 // Set address -}; - -/* - * Architecture-specific DWARF register numbers - * - * These constants define the register numbering scheme used by DWARF - * for each supported architecture. The numbers must match the ABI - * specification for proper stack unwinding. - */ -enum { -#ifdef __x86_64__ - /* x86_64 register numbering (note: order is defined by x86_64 ABI) */ - DWRF_REG_AX, // RAX - DWRF_REG_DX, // RDX - DWRF_REG_CX, // RCX - DWRF_REG_BX, // RBX - DWRF_REG_SI, // RSI - DWRF_REG_DI, // RDI - DWRF_REG_BP, // RBP - DWRF_REG_SP, // RSP - DWRF_REG_8, // R8 - DWRF_REG_9, // R9 - DWRF_REG_10, // R10 - DWRF_REG_11, // R11 - DWRF_REG_12, // R12 - DWRF_REG_13, // R13 - DWRF_REG_14, // R14 - DWRF_REG_15, // R15 - DWRF_REG_RA, // Return address (RIP) -#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) - /* AArch64 register numbering */ - DWRF_REG_FP = 29, // Frame Pointer - DWRF_REG_RA = 30, // Link register (return address) - DWRF_REG_SP = 31, // Stack pointer -#else -# error "Unsupported target architecture" -#endif -}; - -/* DWARF encoding constants used in EH frame headers */ -static const uint8_t DwarfUData4 = 0x03; // Unsigned 4-byte data -static const uint8_t DwarfSData4 = 0x0b; // Signed 4-byte data -static const uint8_t DwarfPcRel = 0x10; // PC-relative encoding -static const uint8_t DwarfDataRel = 0x30; // Data-relative encoding - -// ============================================================================= -// ELF OBJECT CONTEXT -// ============================================================================= - -/* - * Context for building ELF/DWARF structures - * - * This structure maintains state while constructing DWARF unwind information. - * It acts as a simple buffer manager with pointers to track current position - * and important landmarks within the buffer. - */ -typedef struct ELFObjectContext { - uint8_t* p; // Current write position in buffer - uint8_t* startp; // Start of buffer (for offset calculations) - uint8_t* eh_frame_p; // Start of EH frame data (for relative offsets) - uint8_t* fde_p; // Start of FDE data (for PC-relative calculations) - uint32_t code_size; // Size of the code being described -} ELFObjectContext; - -/* - * EH Frame Header structure for DWARF unwinding - * - * This structure provides metadata about the DWARF unwinding information - * that follows. It's required by the perf jitdump format to enable proper - * stack unwinding during profiling. - */ -typedef struct { - unsigned char version; // EH frame version (always 1) - unsigned char eh_frame_ptr_enc; // Encoding of EH frame pointer - unsigned char fde_count_enc; // Encoding of FDE count - unsigned char table_enc; // Encoding of table entries - int32_t eh_frame_ptr; // Pointer to EH frame data - int32_t eh_fde_count; // Number of FDEs (Frame Description Entries) - int32_t from; // Start address of code range - int32_t to; // End address of code range -} EhFrameHeader; - -// ============================================================================= -// DWARF GENERATION UTILITIES -// ============================================================================= - -/* - * Append a null-terminated string to the ELF context buffer - * - * Args: - * ctx: ELF object context - * str: String to append (must be null-terminated) - * - * Returns: Offset from start of buffer where string was written - */ -static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) { - uint8_t* p = ctx->p; - uint32_t ofs = (uint32_t)(p - ctx->startp); - - /* Copy string including null terminator */ - do { - *p++ = (uint8_t)*str; - } while (*str++); - - ctx->p = p; - return ofs; -} - -/* - * Append a SLEB128 (Signed Little Endian Base 128) value - * - * SLEB128 is a variable-length encoding used extensively in DWARF. - * It efficiently encodes small numbers in fewer bytes. - * - * Args: - * ctx: ELF object context - * v: Signed value to encode - */ -static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) { - uint8_t* p = ctx->p; - - /* Encode 7 bits at a time, with continuation bit in MSB */ - for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) { - *p++ = (uint8_t)((v & 0x7f) | 0x80); // Set continuation bit - } - *p++ = (uint8_t)(v & 0x7f); // Final byte without continuation bit - - ctx->p = p; -} - -/* - * Append a ULEB128 (Unsigned Little Endian Base 128) value - * - * Similar to SLEB128 but for unsigned values. - * - * Args: - * ctx: ELF object context - * v: Unsigned value to encode - */ -static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) { - uint8_t* p = ctx->p; - - /* Encode 7 bits at a time, with continuation bit in MSB */ - for (; v >= 0x80; v >>= 7) { - *p++ = (char)((v & 0x7f) | 0x80); // Set continuation bit - } - *p++ = (char)v; // Final byte without continuation bit - - ctx->p = p; -} - -/* - * Macros for generating DWARF structures - * - * These macros provide a convenient way to write various data types - * to the DWARF buffer while automatically advancing the pointer. - */ -#define DWRF_U8(x) (*p++ = (x)) // Write unsigned 8-bit -#define DWRF_I8(x) (*(int8_t*)p = (x), p++) // Write signed 8-bit -#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2) // Write unsigned 16-bit -#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4) // Write unsigned 32-bit -#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t)) // Write address -#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) // Write ULEB128 -#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) // Write SLEB128 -#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) // Write string - -/* Align to specified boundary with NOP instructions */ -#define DWRF_ALIGNNOP(s) \ - while ((uintptr_t)p & ((s)-1)) { \ - *p++ = DWRF_CFA_nop; \ - } - -/* Write a DWARF section with automatic size calculation */ -#define DWRF_SECTION(name, stmt) \ - { \ - uint32_t* szp_##name = (uint32_t*)p; \ - p += 4; \ - stmt; \ - *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \ - } - -// ============================================================================= -// DWARF EH FRAME GENERATION -// ============================================================================= - -static void elf_init_ehframe(ELFObjectContext* ctx); - -/* - * Initialize DWARF .eh_frame section for a code region - * - * The .eh_frame section contains Call Frame Information (CFI) that describes - * how to unwind the stack at any point in the code. This is essential for - * proper profiling as it allows perf to generate accurate call graphs. - * - * The function generates two main components: - * 1. CIE (Common Information Entry) - describes calling conventions - * 2. FDE (Frame Description Entry) - describes specific function unwinding - * - * Args: - * ctx: ELF object context containing code size and buffer pointers - */ -static size_t calculate_eh_frame_size(void) { - /* Calculate the EH frame size for the trampoline function */ - extern void *_Py_trampoline_func_start; - extern void *_Py_trampoline_func_end; - - size_t code_size = (char*)&_Py_trampoline_func_end - (char*)&_Py_trampoline_func_start; - - ELFObjectContext ctx; - char buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) - ctx.code_size = code_size; - ctx.startp = ctx.p = (uint8_t*)buffer; - ctx.fde_p = NULL; - - elf_init_ehframe(&ctx); - return ctx.p - ctx.startp; -} - -static void elf_init_ehframe(ELFObjectContext* ctx) { - uint8_t* p = ctx->p; - uint8_t* framep = p; // Remember start of frame data - - /* - * DWARF Unwind Table for Trampoline Function - * - * This section defines DWARF Call Frame Information (CFI) using encoded macros - * like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function - * preserves and restores registers. This is used by profiling tools (e.g., `perf`) - * and debuggers for stack unwinding in JIT-compiled code. - * - * ------------------------------------------------- - * TO REGENERATE THIS TABLE FROM GCC OBJECTS: - * ------------------------------------------------- - * - * 1. Create a trampoline source file (e.g., `trampoline.c`): - * - * #include - * typedef PyObject* (*py_evaluator)(void*, void*, int); - * PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) { - * return evaluator(ts, f, throwflag); - * } - * - * 2. Compile to an object file with frame pointer preservation: - * - * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c - * - * 3. Extract DWARF unwind info from the object file: - * - * readelf -w trampoline.o - * - * Example output from `.eh_frame`: - * - * 00000000 CIE - * Version: 1 - * Augmentation: "zR" - * Code alignment factor: 4 - * Data alignment factor: -8 - * Return address column: 30 - * DW_CFA_def_cfa: r31 (sp) ofs 0 - * - * 00000014 FDE cie=00000000 pc=0..14 - * DW_CFA_advance_loc: 4 - * DW_CFA_def_cfa_offset: 16 - * DW_CFA_offset: r29 at cfa-16 - * DW_CFA_offset: r30 at cfa-8 - * DW_CFA_advance_loc: 12 - * DW_CFA_restore: r30 - * DW_CFA_restore: r29 - * DW_CFA_def_cfa_offset: 0 - * - * -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`. - * - * ---------------------------------- - * HOW TO TRANSLATE TO DWRF_* MACROS: - * ---------------------------------- - * - * After compiling your trampoline with: - * - * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c - * - * run: - * - * readelf -w trampoline.o - * - * to inspect the generated `.eh_frame` data. You will see two main components: - * - * 1. A CIE (Common Information Entry): shared configuration used by all FDEs. - * 2. An FDE (Frame Description Entry): function-specific unwind instructions. - * - * --------------------- - * Translating the CIE: - * --------------------- - * From `readelf -w`, you might see: - * - * 00000000 0000000000000010 00000000 CIE - * Version: 1 - * Augmentation: "zR" - * Code alignment factor: 4 - * Data alignment factor: -8 - * Return address column: 30 - * Augmentation data: 1b - * DW_CFA_def_cfa: r31 (sp) ofs 0 - * - * Map this to: - * - * DWRF_SECTION(CIE, - * DWRF_U32(0); // CIE ID (always 0 for CIEs) - * DWRF_U8(DWRF_CIE_VERSION); // Version: 1 - * DWRF_STR("zR"); // Augmentation string "zR" - * DWRF_UV(4); // Code alignment factor = 4 - * DWRF_SV(-8); // Data alignment factor = -8 - * DWRF_U8(DWRF_REG_RA); // Return address register (e.g., x30 = 30) - * DWRF_UV(1); // Augmentation data length = 1 - * DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers - * - * DWRF_U8(DWRF_CFA_def_cfa); // DW_CFA_def_cfa - * DWRF_UV(DWRF_REG_SP); // Register: SP (r31) - * DWRF_UV(0); // Offset = 0 - * - * DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer size boundary - * ) - * - * Notes: - * - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128. - * - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants. - * - * --------------------- - * Translating the FDE: - * --------------------- - * From `readelf -w`: - * - * 00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014 - * DW_CFA_advance_loc: 4 - * DW_CFA_def_cfa_offset: 16 - * DW_CFA_offset: r29 at cfa-16 - * DW_CFA_offset: r30 at cfa-8 - * DW_CFA_advance_loc: 12 - * DW_CFA_restore: r30 - * DW_CFA_restore: r29 - * DW_CFA_def_cfa_offset: 0 - * - * Map the FDE header and instructions to: - * - * DWRF_SECTION(FDE, - * DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (relative from here) - * DWRF_U32(pc_relative_offset); // PC-relative location of the code (calculated dynamically) - * DWRF_U32(ctx->code_size); // Code range covered by this FDE - * DWRF_U8(0); // Augmentation data length (none) - * - * DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance location by 1 unit (1 * 4 = 4 bytes) - * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 - * DWRF_UV(16); - * - * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer) - * DWRF_UV(2); // At offset 2 * 8 = 16 bytes - * - * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address) - * DWRF_UV(1); // At offset 1 * 8 = 8 bytes - * - * DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance location by 3 units (3 * 4 = 12 bytes) - * - * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore x30 - * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore x29 - * - * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP - * DWRF_UV(0); - * ) - * - * To regenerate: - * 1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE. - * 2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as - * the code is in a different address space every time. - * 3. For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro: - * - `DW_CFA_def_cfa_offset` → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value) - * - `DW_CFA_offset: rX` → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(offset) - * - `DW_CFA_restore: rX` → DWRF_U8(DWRF_CFA_offset | reg) // restore is same as reusing offset - * - `DW_CFA_advance_loc: N` → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor)) - * 4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers. - * 5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment. - */ - - /* - * Emit DWARF EH CIE (Common Information Entry) - * - * The CIE describes the calling conventions and basic unwinding rules - * that apply to all functions in this compilation unit. - */ - DWRF_SECTION(CIE, - DWRF_U32(0); // CIE ID (0 indicates this is a CIE) - DWRF_U8(DWRF_CIE_VERSION); // CIE version (1) - DWRF_STR("zR"); // Augmentation string ("zR" = has LSDA) -#ifdef __x86_64__ - DWRF_UV(1); // Code alignment factor (x86_64: 1 byte) -#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) - DWRF_UV(4); // Code alignment factor (AArch64: 4 bytes per instruction) -#endif - DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative) - DWRF_U8(DWRF_REG_RA); // Return address register number - DWRF_UV(1); // Augmentation data length - DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // FDE pointer encoding - - /* Initial CFI instructions - describe default calling convention */ -#ifdef __x86_64__ - /* x86_64 initial CFI state */ - DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address) - DWRF_UV(DWRF_REG_SP); // CFA = SP register - DWRF_UV(sizeof(uintptr_t)); // CFA = SP + pointer_size - DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved - DWRF_UV(1); // At offset 1 from CFA -#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) - /* AArch64 initial CFI state */ - DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address) - DWRF_UV(DWRF_REG_SP); // CFA = SP register - DWRF_UV(0); // CFA = SP + 0 (AArch64 starts with offset 0) - // No initial register saves in AArch64 CIE -#endif - DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary - ) - - ctx->eh_frame_p = p; // Remember start of FDE data - - /* - * Emit DWARF EH FDE (Frame Description Entry) - * - * The FDE describes unwinding information specific to this function. - * It references the CIE and provides function-specific CFI instructions. - * - * The PC-relative offset is calculated after the entire EH frame is built - * to ensure accurate positioning relative to the synthesized DSO layout. - */ - DWRF_SECTION(FDE, - DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (backwards reference) - ctx->fde_p = p; // Remember where PC offset field is located for later calculation - DWRF_U32(0); // Placeholder for PC-relative offset (calculated at end of elf_init_ehframe) - DWRF_U32(ctx->code_size); // Address range covered by this FDE (code length) - DWRF_U8(0); // Augmentation data length (none) - - /* - * Architecture-specific CFI instructions - * - * These instructions describe how registers are saved and restored - * during function calls. Each architecture has different calling - * conventions and register usage patterns. - */ -#ifdef __x86_64__ - /* x86_64 calling convention unwinding rules with frame pointer */ -# if defined(__CET__) && (__CET__ & 1) - DWRF_U8(DWRF_CFA_advance_loc | 4); // Advance past endbr64 (4 bytes) -# endif - DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance past push %rbp (1 byte) - DWRF_U8(DWRF_CFA_def_cfa_offset); // def_cfa_offset 16 - DWRF_UV(16); // New offset: SP + 16 - DWRF_U8(DWRF_CFA_offset | DWRF_REG_BP); // offset r6 at cfa-16 - DWRF_UV(2); // Offset factor: 2 * 8 = 16 bytes - DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance past mov %rsp,%rbp (3 bytes) - DWRF_U8(DWRF_CFA_def_cfa_register); // def_cfa_register r6 - DWRF_UV(DWRF_REG_BP); // Use base pointer register - DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance past call *%rcx (2 bytes) + pop %rbp (1 byte) = 3 - DWRF_U8(DWRF_CFA_def_cfa); // def_cfa r7 ofs 8 - DWRF_UV(DWRF_REG_SP); // Use stack pointer register - DWRF_UV(8); // New offset: SP + 8 -#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) - /* AArch64 calling convention unwinding rules */ - DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance by 1 instruction (4 bytes) - DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 - DWRF_UV(16); // Stack pointer moved by 16 bytes - DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // x29 (frame pointer) saved - DWRF_UV(2); // At CFA-16 (2 * 8 = 16 bytes from CFA) - DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // x30 (link register) saved - DWRF_UV(1); // At CFA-8 (1 * 8 = 8 bytes from CFA) - DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance by 3 instructions (12 bytes) - DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA); // Restore x30 - NO DWRF_UV() after this! - DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP); // Restore x29 - NO DWRF_UV() after this! - DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 0 (stack restored) - DWRF_UV(0); // Back to original stack position -#else -# error "Unsupported target architecture" -#endif - - DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary - ) - - ctx->p = p; // Update context pointer to end of generated data - - /* Calculate and update the PC-relative offset in the FDE - * - * When perf processes the jitdump, it creates a synthesized DSO with this layout: - * - * Synthesized DSO Memory Layout: - * ┌─────────────────────────────────────────────────────────────┐ < code_start - * │ Code Section │ - * │ (round_up(code_size, 8) bytes) │ - * ├─────────────────────────────────────────────────────────────┤ < start of EH frame data - * │ EH Frame Data │ - * │ ┌─────────────────────────────────────────────────────┐ │ - * │ │ CIE data │ │ - * │ └─────────────────────────────────────────────────────┘ │ - * │ ┌─────────────────────────────────────────────────────┐ │ - * │ │ FDE Header: │ │ - * │ │ - CIE offset (4 bytes) │ │ - * │ │ - PC offset (4 bytes) <─ fde_offset_in_frame ─────┼────┼─> points to code_start - * │ │ - address range (4 bytes) │ │ (this specific field) - * │ │ CFI Instructions... │ │ - * │ └─────────────────────────────────────────────────────┘ │ - * ├─────────────────────────────────────────────────────────────┤ < reference_point - * │ EhFrameHeader │ - * │ (navigation metadata) │ - * └─────────────────────────────────────────────────────────────┘ - * - * The PC offset field in the FDE must contain the distance from itself to code_start: - * - * distance = code_start - fde_pc_field - * - * Where: - * fde_pc_field_location = reference_point - eh_frame_size + fde_offset_in_frame - * code_start_location = reference_point - eh_frame_size - round_up(code_size, 8) - * - * Therefore: - * distance = code_start_location - fde_pc_field_location - * = (ref - eh_frame_size - rounded_code_size) - (ref - eh_frame_size + fde_offset_in_frame) - * = -rounded_code_size - fde_offset_in_frame - * = -(round_up(code_size, 8) + fde_offset_in_frame) - * - * Note: fde_offset_in_frame is the offset from EH frame start to the PC offset field, - * - */ - if (ctx->fde_p != NULL) { - int32_t fde_offset_in_frame = (ctx->fde_p - ctx->startp); - int32_t rounded_code_size = round_up(ctx->code_size, 8); - int32_t pc_relative_offset = -(rounded_code_size + fde_offset_in_frame); - - - // Update the PC-relative offset in the FDE - *(int32_t*)ctx->fde_p = pc_relative_offset; - } -} - // ============================================================================= // JITDUMP INITIALIZATION // ============================================================================= @@ -1128,11 +499,13 @@ static void* perf_map_jit_init(void) { /* Initialize code ID counter */ perf_jit_map_state.code_id = 0; + perf_jit_map_state.build_id_salt = + ((uint64_t)pid << 32) ^ (uint64_t)get_current_monotonic_ticks(); /* Calculate padding size based on actual unwind info requirements */ - size_t eh_frame_size = calculate_eh_frame_size(); + size_t eh_frame_size = _PyJitUnwind_EhFrameSize(0, NULL); size_t unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size; - trampoline_api.code_padding = round_up(unwind_data_size, 16); + trampoline_api.code_padding = _Py_SIZE_ROUND_UP(unwind_data_size, 16); trampoline_api.code_alignment = 32; return &perf_jit_map_state; @@ -1143,30 +516,19 @@ static void* perf_map_jit_init(void) { // ============================================================================= /* - * Write a complete jitdump entry for a Python function - * - * This is the main function called by Python's trampoline system whenever - * a new piece of JIT-compiled code needs to be recorded. It writes both - * the unwinding information and the code load event to the jitdump file. - * - * The function performs these steps: - * 1. Initialize jitdump system if not already done - * 2. Extract function name and filename from Python code object - * 3. Generate DWARF unwinding information - * 4. Write unwinding info event to jitdump file - * 5. Write code load event to jitdump file + * Write a complete jitdump entry for a code region with a provided name. * - * Args: - * state: Jitdump state (currently unused, uses global state) - * code_addr: Address where the compiled code resides - * code_size: Size of the compiled code in bytes - * co: Python code object containing metadata - * - * IMPORTANT: This function signature is part of Python's internal API - * and must not be changed without coordinating with core Python development. + * This shares the same implementation as the trampoline callback, but + * allows callers that don't have a PyCodeObject to reuse the jitdump + * infrastructure. */ -static void perf_map_jit_write_entry(void *state, const void *code_addr, - unsigned int code_size, PyCodeObject *co) +static void perf_map_jit_write_entry_with_name( + void *state, + const void *code_addr, + size_t code_size, + const char *entry, + const char *filename +) { /* Initialize jitdump system on first use */ if (perf_jit_map_state.perf_map == NULL) { @@ -1176,21 +538,11 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, } } - /* - * Extract function information from Python code object - * - * We create a human-readable function name by combining the qualified - * name (includes class/module context) with the filename. This helps - * developers identify functions in perf reports. - */ - const char *entry = ""; - if (co->co_qualname != NULL) { - entry = PyUnicode_AsUTF8(co->co_qualname); + if (entry == NULL) { + entry = ""; } - - const char *filename = ""; - if (co->co_filename != NULL) { - filename = PyUnicode_AsUTF8(co->co_filename); + if (filename == NULL) { + filename = ""; } /* @@ -1218,16 +570,13 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, * Without it, perf cannot generate accurate call graphs, especially * in optimized code where frame pointers may be omitted. */ - ELFObjectContext ctx; - char buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) - ctx.code_size = code_size; - ctx.startp = ctx.p = (uint8_t*)buffer; - ctx.fde_p = NULL; // Initialize to NULL, will be set when FDE is written - - /* Generate EH frame (Exception Handling frame) data */ - elf_init_ehframe(&ctx); - int eh_frame_size = ctx.p - ctx.startp; - + uint8_t buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) + size_t eh_frame_size = _PyJitUnwind_BuildEhFrame( + buffer, sizeof(buffer), code_addr, code_size, 0, NULL); + if (eh_frame_size == 0) { + PyMem_RawFree(perf_map_entry); + return; + } /* * Write Code Unwinding Information Event * @@ -1244,12 +593,12 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, assert(ev2.unwind_data_size <= (uint64_t)trampoline_api.code_padding); ev2.eh_frame_hdr_size = sizeof(EhFrameHeader); - ev2.mapped_size = round_up(ev2.unwind_data_size, 16); // 16-byte alignment + ev2.mapped_size = _Py_SIZE_ROUND_UP(ev2.unwind_data_size, 16); // 16-byte alignment /* Calculate total event size with padding */ - int content_size = sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size; - int padding_size = round_up(content_size, 8) - content_size; // 8-byte align - ev2.base.size = content_size + padding_size; + int content_size = (int)(sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size); + int padding_size = (int)_Py_SIZE_ROUND_UP((size_t)content_size, 8) - content_size; // 8-byte align + ev2.base.size = (uint32_t)(content_size + padding_size); /* Write the unwinding info event header */ perf_map_jit_write_fully(&ev2, sizeof(ev2)); @@ -1263,20 +612,21 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, */ EhFrameHeader f; f.version = 1; - f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel; // PC-relative signed 4-byte - f.fde_count_enc = DwarfUData4; // Unsigned 4-byte count - f.table_enc = DwarfSData4 | DwarfDataRel; // Data-relative signed 4-byte + f.eh_frame_ptr_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_pcrel; + f.fde_count_enc = DWRF_EH_PE_udata4; + f.table_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_datarel; /* Calculate relative offsets for EH frame navigation */ - f.eh_frame_ptr = -(eh_frame_size + 4 * sizeof(unsigned char)); + f.eh_frame_ptr = -(int32_t)(eh_frame_size + 4 * sizeof(unsigned char)); f.eh_fde_count = 1; // We generate exactly one FDE per function - f.from = -(round_up(code_size, 8) + eh_frame_size); - - int cie_size = ctx.eh_frame_p - ctx.startp; - f.to = -(eh_frame_size - cie_size); + f.from = -(int32_t)(_Py_SIZE_ROUND_UP(code_size, 8) + eh_frame_size); + uint32_t cie_payload_size; + memcpy(&cie_payload_size, buffer, sizeof(cie_payload_size)); + int cie_size = (int)(sizeof(cie_payload_size) + cie_payload_size); + f.to = -(int32_t)(eh_frame_size - cie_size); /* Write EH frame data and header */ - perf_map_jit_write_fully(ctx.startp, eh_frame_size); + perf_map_jit_write_fully(buffer, eh_frame_size); perf_map_jit_write_fully(&f, sizeof(f)); /* Write padding to maintain alignment */ @@ -1313,12 +663,85 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, /* Write code load event and associated data */ perf_map_jit_write_fully(&ev, sizeof(ev)); perf_map_jit_write_fully(perf_map_entry, name_length+1); // Include null terminator - perf_map_jit_write_fully((void*)(base), size); // Copy actual machine code + /* + * Ensure each synthetic DSO has unique .text bytes. + * + * perf merges DSOs that share a build-id. Since trampolines can share + * identical code and unwind bytes, perf may resolve all JIT frames to + * the first symbol it saw (including entries from previous runs when + * build-id caching is enabled). Patch a small marker in the emitted + * bytes to make the build-id depend on a per-process salt and code id + * without modifying the live code. + */ + uint64_t marker = perf_jit_map_state.build_id_salt ^ + ((uint64_t)perf_jit_map_state.code_id << 32) ^ + (uint64_t)code_size; + if (size >= sizeof(marker)) { + size_t prefix = size - sizeof(marker); + perf_map_jit_write_fully((void *)(base), prefix); + perf_map_jit_write_fully(&marker, sizeof(marker)); + } + else if (size > 0) { + uint8_t tmp[sizeof(marker)]; + memcpy(tmp, (void *)(base), size); + for (size_t i = 0; i < size; i++) { + tmp[i] ^= (uint8_t)(marker >> (i * 8)); + } + perf_map_jit_write_fully(tmp, size); + } /* Clean up allocated memory */ PyMem_RawFree(perf_map_entry); } +/* + * Write a complete jitdump entry for a Python function + * + * This is the main function called by Python's trampoline system whenever + * a new piece of JIT-compiled code needs to be recorded. It writes both + * the unwinding information and the code load event to the jitdump file. + * + * The function performs these steps: + * 1. Initialize jitdump system if not already done + * 2. Extract function name and filename from Python code object + * 3. Generate DWARF unwinding information + * 4. Write unwinding info event to jitdump file + * 5. Write code load event to jitdump file + * + * Args: + * state: Jitdump state (currently unused, uses global state) + * code_addr: Address where the compiled code resides + * code_size: Size of the compiled code in bytes + * co: Python code object containing metadata + * + * IMPORTANT: This function signature is part of Python's internal API + * and must not be changed without coordinating with core Python development. + */ +static void perf_map_jit_write_entry(void *state, const void *code_addr, + size_t code_size, PyCodeObject *co) +{ + const char *entry = ""; + const char *filename = ""; + if (co != NULL) { + if (co->co_qualname != NULL) { + entry = PyUnicode_AsUTF8(co->co_qualname); + } + if (co->co_filename != NULL) { + filename = PyUnicode_AsUTF8(co->co_filename); + } + } + perf_map_jit_write_entry_with_name(state, code_addr, code_size, + entry, filename); +} + +void +_PyPerfJit_WriteNamedCode(const void *code_addr, size_t code_size, + const char *entry, const char *filename) +{ + perf_map_jit_write_entry_with_name( + NULL, code_addr, code_size, entry, filename); +} + // ============================================================================= // CLEANUP AND FINALIZATION // ============================================================================= diff --git a/Python/perf_trampoline.c b/Python/perf_trampoline.c index 0d835f3b7f56a9..58c61e64bfc4e9 100644 --- a/Python/perf_trampoline.c +++ b/Python/perf_trampoline.c @@ -243,7 +243,7 @@ perf_trampoline_code_watcher(PyCodeEvent event, PyCodeObject *co) static void perf_map_write_entry(void *state, const void *code_addr, - unsigned int code_size, PyCodeObject *co) + size_t code_size, PyCodeObject *co) { const char *entry = ""; if (co->co_qualname != NULL) { diff --git a/Python/sysmodule.c b/Python/sysmodule.c index ce9c03bda7bd57..dfc3158f702bfe 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -2706,7 +2706,7 @@ PyAPI_FUNC(int) PyUnstable_PerfMapState_Init(void) { PyAPI_FUNC(int) PyUnstable_WritePerfMapEntry( const void *code_addr, - unsigned int code_size, + size_t code_size, const char *entry_name ) { #ifndef MS_WINDOWS @@ -2717,7 +2717,7 @@ PyAPI_FUNC(int) PyUnstable_WritePerfMapEntry( } } PyThread_acquire_lock(perf_map_state.map_lock, 1); - fprintf(perf_map_state.perf_map, "%" PRIxPTR " %x %s\n", (uintptr_t) code_addr, code_size, entry_name); + fprintf(perf_map_state.perf_map, "%" PRIxPTR " %zx %s\n", (uintptr_t) code_addr, code_size, entry_name); fflush(perf_map_state.perf_map); PyThread_release_lock(perf_map_state.map_lock); #endif diff --git a/Tools/build/smelly.py b/Tools/build/smelly.py index 7197d70bc8bd0c..17547d4d916e9f 100755 --- a/Tools/build/smelly.py +++ b/Tools/build/smelly.py @@ -25,6 +25,8 @@ # "Legacy": some old symbols are prefixed by "PY_". EXCEPTIONS = frozenset({ 'PY_TIMEOUT_MAX', + '__jit_debug_descriptor', + '__jit_debug_register_code', }) IGNORED_EXTENSION = "_ctypes_test" diff --git a/Tools/c-analyzer/cpython/_parser.py b/Tools/c-analyzer/cpython/_parser.py index a251a045b91144..a16d5773d5544c 100644 --- a/Tools/c-analyzer/cpython/_parser.py +++ b/Tools/c-analyzer/cpython/_parser.py @@ -324,8 +324,10 @@ def format_tsv_lines(lines): _abs('Objects/stringlib/unicode_format.h'): (10_000, 400), _abs('Objects/typeobject.c'): (380_000, 13_000), _abs('Python/compile.c'): (20_000, 500), + _abs('Python/jit_unwind.c'): (20_000, 300), _abs('Python/optimizer.c'): (100_000, 5_000), _abs('Python/parking_lot.c'): (40_000, 1000), + _abs('Python/perf_jit_trampoline.c'): (40_000, 1000), _abs('Python/pylifecycle.c'): (750_000, 5000), _abs('Python/pystate.c'): (750_000, 5000), _abs('Python/initconfig.c'): (50_000, 500), diff --git a/Tools/c-analyzer/cpython/ignored.tsv b/Tools/c-analyzer/cpython/ignored.tsv index d2489387f46caa..1b1a208dcfde0f 100644 --- a/Tools/c-analyzer/cpython/ignored.tsv +++ b/Tools/c-analyzer/cpython/ignored.tsv @@ -386,6 +386,8 @@ Python/intrinsics.c - _PyIntrinsics_UnaryFunctions - Python/intrinsics.c - _PyIntrinsics_BinaryFunctions - Python/lock.c - TIME_TO_BE_FAIR_NS - Python/opcode_targets.h - opcode_targets - +Python/jit_unwind.c - __jit_debug_descriptor - +Python/jit_unwind.c - jit_debug_mutex - Python/perf_trampoline.c - _Py_perfmap_callbacks - Python/perf_jit_trampoline.c - _Py_perfmap_jit_callbacks - Python/perf_jit_trampoline.c - perf_jit_map_state - diff --git a/Tools/jit/_eh_frame.py b/Tools/jit/_eh_frame.py new file mode 100644 index 00000000000000..a211c1e352a726 --- /dev/null +++ b/Tools/jit/_eh_frame.py @@ -0,0 +1,112 @@ +"""Minimal DWARF .eh_frame parser for the JIT shim's CFI extraction. + +Reads a compiled object's .eh_frame section and returns the bytes we +need to splice into the runtime-emitted EH frame in Python/jit_unwind.c: +the CIE's initial CFI, the FDE's CFI, and the CIE's alignment factors +and RA column. +""" + +import _stencils + + +def _read_uleb128(data: bytes, pos: int) -> tuple[int, int]: + result = shift = n = 0 + while True: + b = data[pos + n] + n += 1 + result |= (b & 0x7F) << shift + if not (b & 0x80): + return result, n + shift += 7 + + +def _read_sleb128(data: bytes, pos: int) -> tuple[int, int]: + result = shift = n = 0 + while True: + b = data[pos + n] + n += 1 + result |= (b & 0x7F) << shift + shift += 7 + if not (b & 0x80): + if b & 0x40: + result -= 1 << shift + return result, n + + +def parse(data: bytes) -> _stencils.ShimCfi: + """ + Parse a compiled .eh_frame section and return the shim's CFI bytes. + + Extracts the CIE's initial CFI (state at function entry) and the FDE's + CFI (prologue transitions) as raw bytes, plus the CIE's CAF/DAF/RA + column so jit_unwind.c can emit a matching synthetic CIE at runtime. + + Assumptions (verified as they're read): + - The object has exactly one CIE covering one FDE (the shim). + - FDE pointer encoding is DW_EH_PE_absptr or sdata4 (compilers always + pick one of these for an unlinked object); we skip the PC+range + fields either way since we re-emit them at runtime. + """ + pos = 0 + cie_init_cfi: bytes | None = None + code_align = data_align = ra_column = None + fde_cfi: bytes | None = None + cie_has_z = False + # sdata4 default; adjusted below if the CIE's R augmentation says absptr. + fde_ptr_len = 4 + while pos < len(data): + length = int.from_bytes(data[pos : pos + 4], "little") + if length == 0: + break + entry_end = pos + 4 + length + pos += 4 + cie_id = int.from_bytes(data[pos : pos + 4], "little") + pos += 4 + if cie_id == 0: + assert data[pos] == 1, "only DWARF CIE version 1 supported" + pos += 1 + aug_end = data.index(0, pos) + augmentation = data[pos:aug_end].decode("ascii") + pos = aug_end + 1 + code_align, n = _read_uleb128(data, pos) + pos += n + data_align, n = _read_sleb128(data, pos) + pos += n + ra_column = data[pos] + pos += 1 + if "z" in augmentation: + cie_has_z = True + aug_data_len, n = _read_uleb128(data, pos) + pos += n + aug_data_start = pos + for ch in augmentation[1:]: # skip 'z' + if ch == "R": + enc = data[pos] + if enc == 0x00: # DW_EH_PE_absptr + fde_ptr_len = 8 + pos += 1 + elif ch in ("L", "P"): + raise AssertionError( + f"shim .eh_frame has augmentation {ch!r}; unsupported" + ) + pos = aug_data_start + aug_data_len + # Trailing 0-bytes are DW_CFA_nop alignment padding; strip + # them so the splice at runtime re-aligns without double-pad. + cie_init_cfi = bytes(data[pos:entry_end]).rstrip(b"\x00") + else: + assert cie_has_z, "FDE before CIE" + pos += fde_ptr_len # pc_start (we re-emit with absolute encoding) + pos += fde_ptr_len # pc_range + aug_data_len, n = _read_uleb128(data, pos) + pos += n + pos += aug_data_len + fde_cfi = bytes(data[pos:entry_end]).rstrip(b"\x00") # nop padding + pos = entry_end + assert cie_init_cfi is not None and fde_cfi is not None, "no CIE/FDE in .eh_frame" + return _stencils.ShimCfi( + cie_init_cfi=cie_init_cfi, + fde_cfi=fde_cfi, + code_align=code_align, + data_align=data_align, + ra_column=ra_column, + ) diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index 258de8ab3136a4..ad50933b92b605 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -246,6 +246,22 @@ def pad(self, alignment: int) -> None: self.body.extend([0] * padding) +@dataclasses.dataclass +class ShimCfi: + """ + DWARF CFI bytes captured from the compiled shim's .eh_frame. + + These let jit_unwind.c emit an accurate GDB EH frame for the shim + without hand-rolling DWARF that tracks the compiler's prologue. + """ + + cie_init_cfi: bytes + fde_cfi: bytes + code_align: int + data_align: int + ra_column: int + + @dataclasses.dataclass class StencilGroup: """ @@ -259,6 +275,7 @@ class StencilGroup: symbols: dict[int | str, tuple[HoleValue, int]] = dataclasses.field( default_factory=dict, init=False ) + shim_cfi: ShimCfi | None = dataclasses.field(default=None, init=False) _jit_symbol_table: dict[str, int] = dataclasses.field( default_factory=dict, init=False ) diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index 787fcf53260f3d..e94f351966f1d3 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -12,6 +12,7 @@ import typing import shlex +import _eh_frame import _llvm import _optimizers import _schema @@ -163,10 +164,10 @@ async def _compile( # __FILE__ macro and assert failure messages) for reproducibility: f"-ffile-prefix-map={CPYTHON}=.", f"-ffile-prefix-map={tempdir}=.", - # This debug info isn't necessary, and bloats out the JIT'ed code. - # We *may* be able to re-enable this, process it, and JIT it for a - # nicer debugging experience... but that needs a lot more research: - "-fno-asynchronous-unwind-tables", + # Unwind info is per-stencil (see below): disabled for + # executors (bloats the JIT'ed code with info we'd have to + # process and re-JIT), enabled for the shim because its + # .eh_frame is what jit_unwind.c ships to GDB. # Don't call built-in functions that we can't find or patch: "-fno-builtin", # Don't call stack-smashing canaries that we can't find or patch: @@ -177,6 +178,16 @@ async def _compile( f"{c}", ] is_shim = opname == "shim" + # Executors ride the pinned-frame-pointer invariant so we can + # synthesize their CFI by hand. Only the Linux/ELF shim consumes + # compiler-emitted unwind tables; enabling them on COFF/Mach-O + # would introduce platform unwind relocations our parsers do not + # handle. + args_s.append( + "-fasynchronous-unwind-tables" + if is_shim and isinstance(self, _ELF) + else "-fno-asynchronous-unwind-tables" + ) if self.frame_pointers: frame_pointer = "all" if is_shim else "reserved" args_s += ["-Xclang", f"-mframe-pointer={frame_pointer}"] @@ -388,6 +399,7 @@ def _handle_section( self, section: _schema.ELFSection, group: _stencils.StencilGroup ) -> None: section_type = section["Type"]["Name"] + section_name = section["Name"]["Name"] flags = {flag["Name"] for flag in section["Flags"]["Flags"]} if section_type == "SHT_RELA": assert "SHF_INFO_LINK" in flags, flags @@ -406,6 +418,15 @@ def _handle_section( relocation = wrapped_relocation["Relocation"] hole = self._handle_relocation(base, relocation, stencil.body) stencil.holes.append(hole) + elif section_name == ".eh_frame": + if "SHF_ALLOC" not in flags: + return + # LLVM 21 emits x86_64 .eh_frame as SHT_X86_64_UNWIND. + assert section_type in {"SHT_PROGBITS", "SHT_X86_64_UNWIND"}, ( + section_type + ) + group.shim_cfi = _eh_frame.parse(bytes(section["SectionData"]["Bytes"])) + return elif section_type == "SHT_PROGBITS": if "SHF_ALLOC" not in flags: return diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py index 5fd9a2ee2d6e58..2768811ab75a9d 100644 --- a/Tools/jit/_writer.py +++ b/Tools/jit/_writer.py @@ -7,6 +7,38 @@ import _stencils +def _byte_rows(blob: bytes, per_row: int) -> typing.Iterator[str]: + for i in range(0, len(blob), per_row): + yield " ".join(f"{b:#04x}," for b in blob[i : i + per_row]) + + +def _dump_shim_cfi(group: _stencils.StencilGroup) -> typing.Iterator[str]: + # Only ELF objects carry a .eh_frame we can capture; on Mach-O and + # COFF the unwind info lives in different sections (and the GDB JIT + # interface this feeds is Linux+ELF-only anyway — see the #ifdef in + # Python/jit.c jit_record_code()). Custom CFLAGS may also suppress + # the shim's .eh_frame even on Linux+ELF, so always emit a feature + # guard before any optional symbol definitions. + cfi = group.shim_cfi + if cfi is None: + yield "#define _Py_JIT_HAS_SHIM_CFI 0" + yield "" + return + yield "#define _Py_JIT_HAS_SHIM_CFI 1" + for name, blob in ( + ("_Py_jit_shim_cie_init_cfi", cfi.cie_init_cfi), + ("_Py_jit_shim_fde_cfi", cfi.fde_cfi), + ): + yield f"static const uint8_t {name}[{len(blob)}] = {{" + for row in _byte_rows(blob, per_row=12): + yield f" {row}" + yield "};" + yield f"#define _Py_jit_shim_code_align {cfi.code_align}" + yield f"#define _Py_jit_shim_data_align {cfi.data_align}" + yield f"#define _Py_jit_shim_ra_column {cfi.ra_column}" + yield "" + + def _dump_footer( groups: dict[str, _stencils.StencilGroup], symbols: dict[str, int] ) -> typing.Iterator[str]: @@ -23,6 +55,7 @@ def _dump_footer( yield " symbol_mask got_mask;" yield "} StencilGroup;" yield "" + yield from _dump_shim_cfi(groups["shim"]) yield f"static const StencilGroup shim = {groups['shim'].as_c('shim')};" yield "" yield "static const StencilGroup stencil_groups[MAX_UOP_REGS_ID + 1] = {" @@ -53,8 +86,7 @@ def _dump_stencil(opname: str, group: _stencils.StencilGroup) -> typing.Iterator stripped = stencil.body.rstrip(b"\x00") if stripped: yield f" const unsigned char {part}_body[{len(stencil.body)}] = {{" - for i in range(0, len(stripped), 8): - row = " ".join(f"{byte:#04x}," for byte in stripped[i : i + 8]) + for row in _byte_rows(stripped, per_row=8): yield f" {row}" yield " };" # Data is written first (so relaxations in the code work properly):