diff --git a/Doc/c-api/perfmaps.rst b/Doc/c-api/perfmaps.rst
index 76a1e9f528dc70..bd05e628faaaa1 100644
--- a/Doc/c-api/perfmaps.rst
+++ b/Doc/c-api/perfmaps.rst
@@ -31,7 +31,7 @@ Note that holding an :term:`attached thread state` is not required for these API
    or ``-2`` on failure to create a lock. Check ``errno`` for more information
    about the cause of a failure.
 
-.. c:function:: int PyUnstable_WritePerfMapEntry(const void *code_addr, unsigned int code_size, const char *entry_name)
+.. c:function:: int PyUnstable_WritePerfMapEntry(const void *code_addr, size_t code_size, const char *entry_name)
 
    Write one single entry to the ``/tmp/perf-$pid.map`` file. This function is
    thread safe. Here is what an example entry looks like::
diff --git a/Include/cpython/ceval.h b/Include/cpython/ceval.h
index bbab8d35b75cb2..5b66fa1040d738 100644
--- a/Include/cpython/ceval.h
+++ b/Include/cpython/ceval.h
@@ -38,7 +38,7 @@ typedef struct {
 PyAPI_FUNC(int) PyUnstable_PerfMapState_Init(void);
 PyAPI_FUNC(int) PyUnstable_WritePerfMapEntry(
     const void *code_addr,
-    unsigned int code_size,
+    size_t code_size,
     const char *entry_name);
 PyAPI_FUNC(void) PyUnstable_PerfMapState_Fini(void);
 PyAPI_FUNC(int) PyUnstable_CopyPerfMapFile(const char* parent_filename);
diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h
index 9fd3be74404907..fd98db16650507 100644
--- a/Include/internal/pycore_ceval.h
+++ b/Include/internal/pycore_ceval.h
@@ -94,7 +94,7 @@ typedef struct {
     void* (*init_state)(void);
     // Callback to register every trampoline being created
     void (*write_state)(void* state, const void *code_addr,
-                        unsigned int code_size, PyCodeObject* code);
+                        size_t code_size, PyCodeObject* code);
     // Callback to free the trampoline state
     int (*free_state)(void* state);
 } _PyPerf_Callbacks;
@@ -108,6 +108,10 @@ extern PyStatus _PyPerfTrampoline_AfterFork_Child(void);
 #ifdef PY_HAVE_PERF_TRAMPOLINE
 extern _PyPerf_Callbacks _Py_perfmap_callbacks;
 extern _PyPerf_Callbacks _Py_perfmap_jit_callbacks;
+extern void _PyPerfJit_WriteNamedCode(const void *code_addr,
+                                      size_t code_size,
+                                      const char *entry,
+                                      const char *filename);
 #endif
 
 static inline PyObject*
diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h
index f76d4f41c55119..a1304dbb298c8c 100644
--- a/Include/internal/pycore_interp_structs.h
+++ b/Include/internal/pycore_interp_structs.h
@@ -69,7 +69,7 @@ struct code_arena_st;
 struct trampoline_api_st {
     void* (*init_state)(void);
     void (*write_state)(void* state, const void *code_addr,
-                        unsigned int code_size, PyCodeObject* code);
+                        size_t code_size, PyCodeObject* code);
     int (*free_state)(void* state);
     void *state;
     Py_ssize_t code_padding;
diff --git a/Include/internal/pycore_jit_unwind.h b/Include/internal/pycore_jit_unwind.h
new file mode 100644
index 00000000000000..99f3ac585bf650
--- /dev/null
+++ b/Include/internal/pycore_jit_unwind.h
@@ -0,0 +1,90 @@
+#ifndef Py_INTERNAL_JIT_UNWIND_H
+#define Py_INTERNAL_JIT_UNWIND_H
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+
+/*
+ * Compiler-emitted CFI for the shim region (GDB path only).
+ *
+ * Captured at build time by Tools/jit from the shim's compiled .eh_frame
+ * so the runtime CIE/FDE can describe whatever prologue the compiler
+ * chose, without hand-rolling DWARF. Executors pass NULL and fall back
+ * to the invariant-based steady-state rule that the CIE emits by hand.
+ *
+ * The struct is defined unconditionally so jit_record_code() in Python/jit.c
+ * has a valid pointer type on every platform — callers on non-(Linux+ELF)
+ * always pass NULL, matching jit_record_code()'s internal #ifdef.
+ */
+typedef struct {
+    const uint8_t *cie_init_cfi;
+    size_t         cie_init_cfi_size;
+    const uint8_t *fde_cfi;
+    size_t         fde_cfi_size;
+    uint32_t       code_align;
+    int32_t        data_align;
+    uint32_t       ra_column;
+} _PyJitUnwind_ShimCfi;
+
+#if defined(PY_HAVE_PERF_TRAMPOLINE) || (defined(__linux__) && defined(__ELF__))
+
+/* DWARF exception-handling pointer encodings shared by JIT unwind users. */
+enum {
+    DWRF_EH_PE_absptr = 0x00,
+    DWRF_EH_PE_omit = 0xff,
+
+    /* Data type encodings */
+    DWRF_EH_PE_uleb128 = 0x01,
+    DWRF_EH_PE_udata2 = 0x02,
+    DWRF_EH_PE_udata4 = 0x03,
+    DWRF_EH_PE_udata8 = 0x04,
+    DWRF_EH_PE_sleb128 = 0x09,
+    DWRF_EH_PE_sdata2 = 0x0a,
+    DWRF_EH_PE_sdata4 = 0x0b,
+    DWRF_EH_PE_sdata8 = 0x0c,
+    DWRF_EH_PE_signed = 0x08,
+
+    /* Reference type encodings */
+    DWRF_EH_PE_pcrel = 0x10,
+    DWRF_EH_PE_textrel = 0x20,
+    DWRF_EH_PE_datarel = 0x30,
+    DWRF_EH_PE_funcrel = 0x40,
+    DWRF_EH_PE_aligned = 0x50,
+    DWRF_EH_PE_indirect = 0x80
+};
+
+/* Return the size of the generated .eh_frame data for the given encoding. */
+size_t _PyJitUnwind_EhFrameSize(int absolute_addr,
+                                const _PyJitUnwind_ShimCfi *shim_cfi);
+
+/*
+ * Build DWARF .eh_frame data for JIT code; returns size written or 0 on error.
+ * absolute_addr selects the FDE address encoding:
+ * - 0: PC-relative offsets (perf jitdump synthesized DSO).
+ * - nonzero: absolute addresses (GDB JIT in-memory ELF).
+ *
+ * shim_cfi selects which JIT region the CFI describes (GDB path only):
+ * - NULL: executor trace; steady-state rule in the CIE applies at every PC.
+ * - non-NULL: compile_shim() output; the captured CIE/FDE CFI bytes are
+ *            spliced in so unwinding is valid at every PC in the shim.
+ */
+size_t _PyJitUnwind_BuildEhFrame(uint8_t *buffer, size_t buffer_size,
+                                 const void *code_addr, size_t code_size,
+                                 int absolute_addr,
+                                 const _PyJitUnwind_ShimCfi *shim_cfi);
+
+void *_PyJitUnwind_GdbRegisterCode(const void *code_addr,
+                                  size_t code_size,
+                                  const char *entry,
+                                  const char *filename,
+                                  const _PyJitUnwind_ShimCfi *shim_cfi);
+
+void _PyJitUnwind_GdbUnregisterCode(void *handle);
+
+#endif  // defined(PY_HAVE_PERF_TRAMPOLINE) || (defined(__linux__) && defined(__ELF__))
+
+#endif  // Py_INTERNAL_JIT_UNWIND_H
diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h
index 2986afb142b5d1..df6467e41cc208 100644
--- a/Include/internal/pycore_optimizer.h
+++ b/Include/internal/pycore_optimizer.h
@@ -151,6 +151,7 @@ typedef struct _PyExecutorObject {
     uint32_t code_size;
     size_t jit_size;
     void *jit_code;
+    void *jit_gdb_handle;
     _PyExitData exits[1];
 } _PyExecutorObject;
 
diff --git a/Lib/test/test_gdb/gdb_jit_sample.py b/Lib/test/test_gdb/gdb_jit_sample.py
new file mode 100644
index 00000000000000..b439e82e8b312f
--- /dev/null
+++ b/Lib/test/test_gdb/gdb_jit_sample.py
@@ -0,0 +1,27 @@
+# Sample script for use by test_gdb.test_jit
+
+import _testinternalcapi
+import operator
+
+
+WARMUP_ITERATIONS = _testinternalcapi.TIER2_THRESHOLD + 10
+
+
+def jit_bt_hot(depth, warming_up_caller=False):
+    if depth == 0:
+        if not warming_up_caller:
+            id(42)
+        return
+
+    for iteration in range(WARMUP_ITERATIONS):
+        operator.call(
+            jit_bt_hot,
+            depth - 1,
+            warming_up_caller or iteration + 1 != WARMUP_ITERATIONS,
+        )
+
+
+# Warm the shared shim once without hitting builtin_id so the real run uses
+# the steady-state shim path when GDB breaks inside id(42).
+jit_bt_hot(1, warming_up_caller=True)
+jit_bt_hot(1)
diff --git a/Lib/test/test_gdb/test_jit.py b/Lib/test/test_gdb/test_jit.py
new file mode 100644
index 00000000000000..39e09f6c8b4a9f
--- /dev/null
+++ b/Lib/test/test_gdb/test_jit.py
@@ -0,0 +1,204 @@
+import os
+import platform
+import re
+import sys
+import unittest
+
+from .util import setup_module, DebuggerTests
+
+
+JIT_SAMPLE_SCRIPT = os.path.join(os.path.dirname(__file__), "gdb_jit_sample.py")
+# In batch GDB, break in builtin_id() while it is running under JIT,
+# then repeatedly "finish" until the selected frame is the JIT entry.
+# That gives a deterministic backtrace starting with py::jit_entry:<jit>.
+#
+# builtin_id() sits only a few helper frames above the JIT entry on this path.
+# This bound is just a generous upper limit so the test fails clearly if the
+# expected stack shape changes.
+MAX_FINISH_STEPS = 20
+# Break directly on the lazy shim entry in the binary, then single-step just
+# enough to let it install the compiled JIT entry and set a temporary
+# breakpoint on the resulting address.
+MAX_ENTRY_SETUP_STEPS = 20
+# After landing on the JIT entry frame, single-step a bounded number of
+# instructions further into the blob so the backtrace is taken from JIT code
+# itself rather than the immediate helper-return site. The exact number of
+# steps is not significant: each step is cross-checked against the selected
+# frame's symbol so the test fails loudly if stepping escapes the registered
+# JIT region, instead of asserting against a misleading backtrace.
+MAX_JIT_ENTRY_STEPS = 4
+EVAL_FRAME_RE = r"(_PyEval_EvalFrameDefault|_PyEval_Vector)"
+BACKTRACE_FRAME_RE = re.compile(r"^#\d+\s+.*$", re.MULTILINE)
+
+FINISH_TO_JIT_ENTRY = (
+    "python exec(\"import gdb\\n"
+    "target = 'py::jit_entry:<jit>'\\n"
+    f"for _ in range({MAX_FINISH_STEPS}):\\n"
+    "    frame = gdb.selected_frame()\\n"
+    "    if frame is not None and frame.name() == target:\\n"
+    "        break\\n"
+    "    gdb.execute('finish')\\n"
+    "else:\\n"
+    "    raise RuntimeError('did not reach %s' % target)\\n\")"
+)
+BREAK_IN_COMPILED_JIT_ENTRY = (
+    "python exec(\"import gdb\\n"
+    "lazy = int(gdb.parse_and_eval('(void*)_Py_LazyJitShim'))\\n"
+    f"for _ in range({MAX_ENTRY_SETUP_STEPS}):\\n"
+    "    entry = int(gdb.parse_and_eval('(void*)_Py_jit_entry'))\\n"
+    "    if entry != lazy:\\n"
+    "        gdb.execute('tbreak *0x%x' % entry)\\n"
+    "        break\\n"
+    "    gdb.execute('next')\\n"
+    "else:\\n"
+    "    raise RuntimeError('compiled JIT entry was not installed')\\n\")"
+)
+STEP_INSIDE_JIT_ENTRY = (
+    "python exec(\"import gdb\\n"
+    "target = 'py::jit_entry:<jit>'\\n"
+    f"for _ in range({MAX_JIT_ENTRY_STEPS}):\\n"
+    "    frame = gdb.selected_frame()\\n"
+    "    if frame is None or frame.name() != target:\\n"
+    "        raise RuntimeError('left JIT region during stepping: '\\n"
+    "                           + repr(frame and frame.name()))\\n"
+    "    gdb.execute('si')\\n"
+    "frame = gdb.selected_frame()\\n"
+    "if frame is None or frame.name() != target:\\n"
+    "    raise RuntimeError('stepped out of JIT region after si')\\n\")"
+)
+
+
+def setUpModule():
+    setup_module()
+
+
+# The GDB JIT interface registration is gated on __linux__ && __ELF__ in
+# Python/jit_unwind.c, and the synthetic EH-frame is only implemented for
+# x86_64 and AArch64 (a #error fires otherwise). Skip cleanly on other
+# platforms or architectures instead of producing timeouts / empty backtraces.
+# is_enabled() implies is_available() and also implies that the runtime has
+# JIT execution active; interpreter-only tier 2 builds don't hit this path.
+@unittest.skipUnless(sys.platform == "linux",
+                     "GDB JIT interface is only implemented for Linux + ELF")
+@unittest.skipUnless(platform.machine() in ("x86_64", "aarch64"),
+                     "GDB JIT CFI emitter only supports x86_64 and AArch64")
+@unittest.skipUnless(hasattr(sys, "_jit") and sys._jit.is_enabled(),
+                     "requires a JIT-enabled build with JIT execution active")
+class JitBacktraceTests(DebuggerTests):
+    def get_stack_trace(self, **kwargs):
+        # These tests validate the JIT-relevant part of the backtrace via
+        # _assert_jit_backtrace_shape, so an unrelated "?? ()" frame below
+        # the JIT/eval segment (e.g. libc without debug info) is tolerable.
+        kwargs.setdefault("skip_on_truncation", False)
+        return super().get_stack_trace(**kwargs)
+
+    def _extract_backtrace_frames(self, gdb_output):
+        frames = BACKTRACE_FRAME_RE.findall(gdb_output)
+        self.assertGreater(
+            len(frames), 0,
+            f"expected at least one GDB backtrace frame in output:\n{gdb_output}",
+        )
+        return frames
+
+    def _assert_jit_backtrace_shape(self, gdb_output, *, anchor_at_top):
+        # Shape assertions applied to every JIT backtrace we produce:
+        #   1. The synthetic JIT symbol appears exactly once. A second
+        #      py::jit_entry:<jit> frame would mean the unwinder is
+        #      materializing two native frames for a single logical JIT
+        #      region, or failing to unwind out of the region entirely.
+        #   2. At least one _PyEval_EvalFrameDefault / _PyEval_Vector
+        #      frame appears after the JIT frame, proving the unwinder
+        #      climbs back out of the JIT region into the eval loop.
+        #      Helper frames from inside the JITted region may still
+        #      appear above the synthetic JIT frame in the backtrace.
+        #   3. For tests that assert a specific entry PC, the JIT frame
+        #      is also at #0.
+        frames = self._extract_backtrace_frames(gdb_output)
+        backtrace = "\n".join(frames)
+
+        jit_frames = [frame for frame in frames if "py::jit_entry:<jit>" in frame]
+        jit_count = len(jit_frames)
+        self.assertEqual(
+            jit_count, 1,
+            f"expected exactly 1 py::jit_entry:<jit> frame, got {jit_count}\n"
+            f"backtrace:\n{backtrace}",
+        )
+        eval_frames = [frame for frame in frames if re.search(EVAL_FRAME_RE, frame)]
+        eval_count = len(eval_frames)
+        self.assertGreaterEqual(
+            eval_count, 1,
+            f"expected at least one _PyEval_* frame, got {eval_count}\n"
+            f"backtrace:\n{backtrace}",
+        )
+        jit_frame_index = next(
+            i for i, frame in enumerate(frames) if "py::jit_entry:<jit>" in frame
+        )
+        eval_after_jit = any(
+            re.search(EVAL_FRAME_RE, frame)
+            for frame in frames[jit_frame_index + 1:]
+        )
+        self.assertTrue(
+            eval_after_jit,
+            f"expected an eval frame after the JIT frame\n"
+            f"backtrace:\n{backtrace}",
+        )
+        relevant_end = max(
+            i
+            for i, frame in enumerate(frames)
+            if "py::jit_entry:<jit>" in frame or re.search(EVAL_FRAME_RE, frame)
+        )
+        truncated_frames = [
+            frame for frame in frames[: relevant_end + 1]
+            if " ?? ()" in frame
+        ]
+        self.assertFalse(
+            truncated_frames,
+            "unexpected truncated frame before the validated JIT/eval segment\n"
+            f"backtrace:\n{backtrace}",
+        )
+        if anchor_at_top:
+            self.assertRegex(
+                frames[0],
+                re.compile(r"^#0\s+py::jit_entry:<jit>"),
+            )
+
+    def test_bt_shows_compiled_jit_entry(self):
+        gdb_output = self.get_stack_trace(
+            script=JIT_SAMPLE_SCRIPT,
+            breakpoint="_Py_LazyJitShim",
+            cmds_after_breakpoint=[
+                BREAK_IN_COMPILED_JIT_ENTRY,
+                "continue",
+                "bt",
+            ],
+            PYTHON_JIT="1",
+        )
+        # GDB registers the compiled JIT entry and per-trace JIT regions under
+        # the same synthetic symbol name; breaking at the entry PC pins the
+        # JIT frame at #0.
+        self._assert_jit_backtrace_shape(gdb_output, anchor_at_top=True)
+
+    def test_bt_unwinds_through_jit_frames(self):
+        gdb_output = self.get_stack_trace(
+            script=JIT_SAMPLE_SCRIPT,
+            cmds_after_breakpoint=["bt"],
+            PYTHON_JIT="1",
+        )
+        # The executor should appear as a named JIT frame and unwind back into
+        # the eval loop. Whether GDB also materializes a separate shim frame is
+        # an implementation detail of the synthetic executor CFI.
+        self._assert_jit_backtrace_shape(gdb_output, anchor_at_top=False)
+
+    def test_bt_unwinds_from_inside_jit_entry(self):
+        gdb_output = self.get_stack_trace(
+            script=JIT_SAMPLE_SCRIPT,
+            cmds_after_breakpoint=[
+                FINISH_TO_JIT_ENTRY,
+                STEP_INSIDE_JIT_ENTRY,
+                "bt",
+            ],
+            PYTHON_JIT="1",
+        )
+        # Once the selected PC is inside the JIT entry, we require that GDB
+        # identifies the JIT frame at #0 and keeps unwinding into _PyEval_*.
+        self._assert_jit_backtrace_shape(gdb_output, anchor_at_top=True)
diff --git a/Lib/test/test_gdb/util.py b/Lib/test/test_gdb/util.py
index 8097fd52ababe6..d903adcf2903f3 100644
--- a/Lib/test/test_gdb/util.py
+++ b/Lib/test/test_gdb/util.py
@@ -20,6 +20,27 @@
 
 PYTHONHASHSEED = '123'
 
+# gh-91960, bpo-40019: gdb reports these when the optimizer has dropped
+# python-frame debug info; the test can't read what's not there.
+_OPTIMIZED_OUT_PATTERNS = (
+    '(frame information optimized out)',
+    'Unable to read information on python frame',
+    '(unable to read python frame information)',
+)
+# gdb prints this when the unwinder genuinely failed to walk a frame —
+# i.e. the CFI (ours or a library's) is wrong. Treat as a hard failure,
+# not a skip, so regressions in our own unwind info don't hide.
+_UNWIND_FAILURE_PATTERNS = (
+    'Backtrace stopped: frame did not save the PC',
+)
+# gh-104736: " ?? ()" in the bt usually means the unwinder bailed early,
+# but can also be unrelated frames without debug info (e.g. libc). Tests
+# that validate the JIT-relevant part of the backtrace themselves can
+# opt out via skip_on_truncation=False.
+_TRUNCATED_BACKTRACE_PATTERNS = (
+    ' ?? ()',
+)
+
 
 def clean_environment():
     # Remove PYTHON* environment variables such as PYTHONHOME
@@ -160,7 +181,9 @@ def get_stack_trace(self, source=None, script=None,
                         breakpoint=BREAKPOINT_FN,
                         cmds_after_breakpoint=None,
                         import_site=False,
-                        ignore_stderr=False):
+                        ignore_stderr=False,
+                        skip_on_truncation=True,
+                        **env_vars):
         '''
         Run 'python -c SOURCE' under gdb with a breakpoint.
 
@@ -239,7 +262,7 @@ def get_stack_trace(self, source=None, script=None,
             args += [script]
 
         # Use "args" to invoke gdb, capturing stdout, stderr:
-        out, err = run_gdb(*args, PYTHONHASHSEED=PYTHONHASHSEED)
+        out, err = run_gdb(*args, PYTHONHASHSEED=PYTHONHASHSEED, **env_vars)
 
         if not ignore_stderr:
             for line in err.splitlines():
@@ -255,26 +278,20 @@ def get_stack_trace(self, source=None, script=None,
                                     " because the Program Counter is"
                                     " not present")
 
+        for pattern in _UNWIND_FAILURE_PATTERNS:
+            if pattern in out:
+                raise AssertionError(
+                    f"gdb unwinder failed ({pattern!r}) — CFI bug in our "
+                    f"generated code or in a linked library.\n"
+                    f"Full gdb output:\n{out}"
+                )
+
         # bpo-40019: Skip the test if gdb failed to read debug information
         # because the Python binary is optimized.
-        for pattern in (
-            '(frame information optimized out)',
-            'Unable to read information on python frame',
-
-            # gh-91960: On Python built with "clang -Og", gdb gets
-            # "frame=<optimized out>" for _PyEval_EvalFrameDefault() parameter
-            '(unable to read python frame information)',
-
-            # gh-104736: On Python built with "clang -Og" on ppc64le,
-            # "py-bt" displays a truncated or not traceback, but "where"
-            # logs this error message:
-            'Backtrace stopped: frame did not save the PC',
-
-            # gh-104736: When "bt" command displays something like:
-            # "#1  0x0000000000000000 in ?? ()", the traceback is likely
-            # truncated or wrong.
-            ' ?? ()',
-        ):
+        patterns = _OPTIMIZED_OUT_PATTERNS
+        if skip_on_truncation:
+            patterns = patterns + _TRUNCATED_BACKTRACE_PATTERNS
+        for pattern in patterns:
             if pattern in out:
                 raise unittest.SkipTest(f"{pattern!r} found in gdb output")
 
diff --git a/Lib/test/test_perfmaps.py b/Lib/test/test_perfmaps.py
index 647c32656abd6d..ee4eb50033c470 100644
--- a/Lib/test/test_perfmaps.py
+++ b/Lib/test/test_perfmaps.py
@@ -1,4 +1,5 @@
 import os
+import sys
 import sysconfig
 import unittest
 
@@ -17,6 +18,9 @@ def supports_trampoline_profiling():
     raise unittest.SkipTest("perf trampoline profiling not supported")
 
 class TestPerfMapWriting(unittest.TestCase):
+    def tearDown(self):
+        perf_map_state_teardown()
+
     def test_write_perf_map_entry(self):
         self.assertEqual(write_perf_map_entry(0x1234, 5678, "entry1"), 0)
         self.assertEqual(write_perf_map_entry(0x2345, 6789, "entry2"), 0)
@@ -24,4 +28,15 @@ def test_write_perf_map_entry(self):
             perf_file_contents = f.read()
             self.assertIn("1234 162e entry1", perf_file_contents)
             self.assertIn("2345 1a85 entry2", perf_file_contents)
-        perf_map_state_teardown()
+
+    @unittest.skipIf(sys.maxsize <= 2**32, "requires size_t wider than unsigned int")
+    def test_write_perf_map_entry_large_size(self):
+        code_addr = 0x3456
+        code_size = 1 << 33
+        entry_name = "entry_big"
+
+        self.assertEqual(write_perf_map_entry(code_addr, code_size, entry_name), 0)
+        with open(f"/tmp/perf-{os.getpid()}.map") as f:
+            perf_file_contents = f.read()
+            self.assertIn(f"{code_addr:x} {code_size:x} {entry_name}",
+                          perf_file_contents)
diff --git a/Makefile.pre.in b/Makefile.pre.in
index 354580aa482d25..b7342da861fb68 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -510,6 +510,7 @@ PYTHON_OBJS=	\
 		Python/suggestions.o \
 		Python/perf_trampoline.o \
 		Python/perf_jit_trampoline.o \
+		Python/jit_unwind.o \
 		Python/remote_debugging.o \
 		Python/$(DYNLOADFILE) \
 		$(LIBOBJS) \
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-03-17-20-30-17.gh-issue-126910.NaUwmD.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-03-17-20-30-17.gh-issue-126910.NaUwmD.rst
new file mode 100644
index 00000000000000..57e897cabc494b
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-03-17-20-30-17.gh-issue-126910.NaUwmD.rst
@@ -0,0 +1 @@
+Add support for unwinding JIT frames using GDB. Patch by Diego Russo.
diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c
index 7f6ea621f87145..7a37a526ffb53e 100644
--- a/Modules/_testinternalcapi.c
+++ b/Modules/_testinternalcapi.c
@@ -1210,16 +1210,22 @@ write_perf_map_entry(PyObject *self, PyObject *args)
 {
     PyObject *code_addr_v;
     const void *code_addr;
-    unsigned int code_size;
+    PyObject *code_size_s;
+    size_t code_size;
     const char *entry_name;
 
-    if (!PyArg_ParseTuple(args, "OIs", &code_addr_v, &code_size, &entry_name))
+    if (!PyArg_ParseTuple(args, "OOs", &code_addr_v, &code_size_s, &entry_name))
         return NULL;
     code_addr = PyLong_AsVoidPtr(code_addr_v);
     if (code_addr == NULL) {
         return NULL;
     }
 
+    code_size = PyLong_AsSize_t(code_size_s);
+    if (code_size == (size_t)-1 && PyErr_Occurred()) {
+        return NULL;
+    }
+
     int ret = PyUnstable_WritePerfMapEntry(code_addr, code_size, entry_name);
     if (ret < 0) {
         PyErr_SetFromErrno(PyExc_OSError);
diff --git a/Python/jit.c b/Python/jit.c
index 4990c743224d3c..fd0a53673d21da 100644
--- a/Python/jit.c
+++ b/Python/jit.c
@@ -15,6 +15,7 @@
 #include "pycore_interpframe.h"
 #include "pycore_interpolation.h"
 #include "pycore_intrinsics.h"
+#include "pycore_jit_unwind.h"
 #include "pycore_lazyimportobject.h"
 #include "pycore_list.h"
 #include "pycore_long.h"
@@ -60,7 +61,36 @@ jit_error(const char *message)
     PyErr_Format(PyExc_RuntimeWarning, "JIT %s (%d)", message, hint);
 }
 
+static void *
+jit_record_code(const void *code_addr, size_t code_size,
+                const char *entry, const char *filename,
+                const _PyJitUnwind_ShimCfi *shim_cfi)
+{
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+    _PyPerf_Callbacks callbacks;
+    _PyPerfTrampoline_GetCallbacks(&callbacks);
+    if (callbacks.write_state == _Py_perfmap_jit_callbacks.write_state) {
+        _PyPerfJit_WriteNamedCode(
+            code_addr, code_size, entry, filename);
+        return NULL;
+    }
+#endif
+
+#if defined(__linux__) && defined(__ELF__)
+    return _PyJitUnwind_GdbRegisterCode(
+        code_addr, code_size, entry, filename, shim_cfi);
+#else
+    (void)code_addr;
+    (void)code_size;
+    (void)entry;
+    (void)filename;
+    (void)shim_cfi;
+    return NULL;
+#endif
+}
+
 static size_t _Py_jit_shim_size = 0;
+static void *_Py_jit_shim_gdb_handle = NULL;
 
 static int
 address_in_executor_array(_PyExecutorObject **ptrs, size_t count, uintptr_t addr)
@@ -731,6 +761,11 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
     }
     executor->jit_code = memory;
     executor->jit_size = total_size;
+    executor->jit_gdb_handle = jit_record_code(memory,
+                    code_size + state.trampolines.size,
+                    "jit_entry",
+                    "<jit>",
+                    /*shim_cfi=*/NULL);
     return 0;
 }
 
@@ -781,6 +816,29 @@ compile_shim(void)
         return NULL;
     }
     _Py_jit_shim_size = total_size;
+    /* GDB JIT unwind info (and the captured-.eh_frame blob that feeds it)
+     * is only wired up on Linux+ELF — see jit_record_code() above. Even
+     * there, user-provided JIT CFLAGS can suppress the shim's .eh_frame,
+     * so jit_stencils.h advertises whether the captured CFI blob exists. */
+#if defined(__linux__) && defined(__ELF__) && _Py_JIT_HAS_SHIM_CFI
+    static const _PyJitUnwind_ShimCfi shim_cfi = {
+        .cie_init_cfi      = _Py_jit_shim_cie_init_cfi,
+        .cie_init_cfi_size = sizeof(_Py_jit_shim_cie_init_cfi),
+        .fde_cfi           = _Py_jit_shim_fde_cfi,
+        .fde_cfi_size      = sizeof(_Py_jit_shim_fde_cfi),
+        .code_align        = _Py_jit_shim_code_align,
+        .data_align        = _Py_jit_shim_data_align,
+        .ra_column         = _Py_jit_shim_ra_column,
+    };
+    const _PyJitUnwind_ShimCfi *shim_cfi_ptr = &shim_cfi;
+#else
+    const _PyJitUnwind_ShimCfi *shim_cfi_ptr = NULL;
+#endif
+    _Py_jit_shim_gdb_handle = jit_record_code(memory,
+                                              code_size + state.trampolines.size,
+                                              "jit_entry",
+                                              "<jit>",
+                                              shim_cfi_ptr);
     return (_PyJitEntryFuncPtr)memory;
 }
 
@@ -812,6 +870,12 @@ _PyJIT_Free(_PyExecutorObject *executor)
     if (memory) {
         executor->jit_code = NULL;
         executor->jit_size = 0;
+#if defined(__linux__) && defined(__ELF__)
+        if (executor->jit_gdb_handle != NULL) {
+            _PyJitUnwind_GdbUnregisterCode(executor->jit_gdb_handle);
+            executor->jit_gdb_handle = NULL;
+        }
+#endif
         if (jit_free(memory, size)) {
             PyErr_FormatUnraisable("Exception ignored while "
                                    "freeing JIT memory");
@@ -829,6 +893,12 @@ _PyJIT_Fini(void)
     if (size) {
         _Py_jit_entry = _Py_LazyJitShim;
         _Py_jit_shim_size = 0;
+#if defined(__linux__) && defined(__ELF__)
+        if (_Py_jit_shim_gdb_handle != NULL) {
+            _PyJitUnwind_GdbUnregisterCode(_Py_jit_shim_gdb_handle);
+            _Py_jit_shim_gdb_handle = NULL;
+        }
+#endif
         if (jit_free(memory, size)) {
             PyErr_FormatUnraisable("Exception ignored while "
                                    "freeing JIT entry code");
diff --git a/Python/jit_unwind.c b/Python/jit_unwind.c
new file mode 100644
index 00000000000000..0e8c61129d5609
--- /dev/null
+++ b/Python/jit_unwind.c
@@ -0,0 +1,1017 @@
+/*
+ * Python JIT - DWARF .eh_frame builder
+ *
+ * This file contains the DWARF CFI generator used to build .eh_frame
+ * data for JIT code (perf jitdump and other unwinders).
+ */
+
+#include "Python.h"
+#include "pycore_jit_unwind.h"
+#include "pycore_lock.h"
+
+#if defined(PY_HAVE_PERF_TRAMPOLINE) || (defined(__linux__) && defined(__ELF__))
+
+#if defined(__linux__)
+#  include <elf.h>
+#endif
+#include <stdio.h>
+#include <string.h>
+
+// =============================================================================
+//                              DWARF CONSTANTS
+// =============================================================================
+
+/*
+ * DWARF (Debug With Arbitrary Record Formats) constants
+ *
+ * DWARF is a debugging data format used to provide stack unwinding information.
+ * These constants define the various encoding types and opcodes used in
+ * DWARF Call Frame Information (CFI) records.
+ */
+
+/* DWARF Call Frame Information version */
+#define DWRF_CIE_VERSION 1
+
+/* DWARF CFA (Call Frame Address) opcodes */
+enum {
+    DWRF_CFA_nop = 0x0,                    // No operation (also used as alignment padding)
+    DWRF_CFA_offset_extended = 0x5,        // Extended offset instruction
+    DWRF_CFA_def_cfa = 0xc,               // Define CFA rule (register, then offset)
+    DWRF_CFA_def_cfa_register = 0xd,      // Define CFA register
+    DWRF_CFA_def_cfa_offset = 0xe,        // Define CFA offset
+    DWRF_CFA_offset_extended_sf = 0x11,   // Extended signed offset
+    DWRF_CFA_advance_loc = 0x40,          // Advance location counter (delta is OR'ed in)
+    DWRF_CFA_offset = 0x80,               // Simple offset instruction (register is OR'ed in)
+    DWRF_CFA_restore = 0xc0               // Restore register (register is OR'ed in)
+};
+
+/*
+ * Architecture-specific DWARF register numbers
+ *
+ * These constants define the register numbering scheme used by DWARF
+ * for each supported architecture. The numbers must match the ABI
+ * specification for proper stack unwinding.
+ */
+enum {
+#ifdef __x86_64__
+    /* x86_64 register numbering, counted from 0 (order is defined by x86_64 ABI) */
+    DWRF_REG_AX,    // RAX (0)
+    DWRF_REG_DX,    // RDX
+    DWRF_REG_CX,    // RCX
+    DWRF_REG_BX,    // RBX
+    DWRF_REG_SI,    // RSI
+    DWRF_REG_DI,    // RDI
+    DWRF_REG_BP,    // RBP (6)
+    DWRF_REG_SP,    // RSP (7)
+    DWRF_REG_8,     // R8
+    DWRF_REG_9,     // R9
+    DWRF_REG_10,    // R10
+    DWRF_REG_11,    // R11
+    DWRF_REG_12,    // R12
+    DWRF_REG_13,    // R13
+    DWRF_REG_14,    // R14
+    DWRF_REG_15,    // R15
+    DWRF_REG_RA,    // Return address (RIP) column (16)
+#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
+    /* AArch64 register numbering (only the registers this file refers to) */
+    DWRF_REG_FP = 29,  // Frame Pointer (x29)
+    DWRF_REG_RA = 30,  // Link register (x30, return address)
+    DWRF_REG_SP = 31,  // Stack pointer
+#else
+#    error "Unsupported target architecture"
+#endif
+};
+
+// =============================================================================
+//                              ELF OBJECT CONTEXT
+// =============================================================================
+
+/*
+ * Context for building ELF/DWARF structures
+ *
+ * This structure maintains state while constructing DWARF unwind information.
+ * It acts as a simple buffer manager with pointers to track current position
+ * and important landmarks within the buffer.
+ */
+typedef struct ELFObjectContext {
+    uint8_t* p;            // Current write position in buffer
+    uint8_t* startp;       // Start of buffer (for offset calculations)
+    uint8_t* fde_p;        // perf flavor: location of the FDE's PC-offset field (patched last)
+    uintptr_t code_addr;   // Address of the code section
+    size_t code_size;      // Size of the code section
+    const _PyJitUnwind_ShimCfi* shim_cfi;  // GDB emitter only: NULL => executor, else shim CFI
+} ELFObjectContext;
+
+// =============================================================================
+//                              DWARF GENERATION UTILITIES
+// =============================================================================
+
+/*
+ * Copy a C string, terminator included, to the current write position.
+ *
+ * Args:
+ *   ctx: ELF object context (ctx->p is advanced past the copied bytes)
+ *   str: String to append (must be null-terminated)
+ *
+ * Returns: Offset of the copied string from the start of the buffer
+ */
+static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) {
+    uint8_t* dst = ctx->p;
+    const size_t nbytes = strlen(str) + 1;  /* +1 keeps the terminator */
+
+    /* The offset is taken before the write position moves. */
+    const uint32_t start_ofs = (uint32_t)(dst - ctx->startp);
+
+    memcpy(dst, str, nbytes);
+    ctx->p = dst + nbytes;
+
+    return start_ofs;
+}
+
+/*
+ * Append a SLEB128 (Signed Little Endian Base 128) value
+ *
+ * Emits 7 payload bits per byte, least-significant group first, and
+ * advances ctx->p past the bytes written (at most 5 for an int32_t).
+ *
+ * Args:
+ *   ctx: ELF object context
+ *   v: Signed value to encode
+ */
+static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) {
+    uint8_t* p = ctx->p;
+    /* Continue while v lies outside [-64, 63]. The test is unsigned on
+     * purpose: signed v + 0x40 overflows (UB) for v near INT32_MAX. */
+    for (; ((uint32_t)v + 0x40u) >= 0x80u; v >>= 7) {
+        *p++ = (uint8_t)((v & 0x7f) | 0x80);  // Set continuation bit
+    }
+    *p++ = (uint8_t)(v & 0x7f);  // Final byte without continuation bit
+
+    ctx->p = p;
+}
+
+/*
+ * Append a ULEB128 (Unsigned Little Endian Base 128) value
+ *
+ * Emits 7 payload bits per byte, low group first (at most 5 bytes).
+ *
+ * Args:
+ *   ctx: ELF object context (ctx->p is advanced past the bytes written)
+ *   v: Unsigned value to encode
+ */
+static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) {
+    uint8_t* p = ctx->p;
+
+    /* The buffer is uint8_t, so cast to uint8_t rather than plain char */
+    for (; v >= 0x80; v >>= 7) {
+        *p++ = (uint8_t)((v & 0x7f) | 0x80);  // Set continuation bit
+    }
+    *p++ = (uint8_t)v;  // Final byte without continuation bit (v < 0x80)
+
+    ctx->p = p;
+}
+
+/*
+ * Macros for generating DWARF structures
+ *
+ * These macros provide a convenient way to write various data types
+ * to the DWARF buffer while automatically advancing the pointer.
+ */
+#define DWRF_U8(x) (*p++ = (x))                                    // Write unsigned 8-bit
+#define DWRF_I8(x) (*(int8_t*)p = (x), p++)                       // Write signed 8-bit
+#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2)                 // Write unsigned 16-bit (direct store)
+#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4)                 // Write unsigned 32-bit (direct store)
+#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t)) // Write pointer-sized address
+#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) // Write ULEB128
+#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) // Write SLEB128
+#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) // Write string
+#define DWRF_BYTES(buf, len) (memcpy(p, (buf), (len)), p += (len))                // Splice in raw bytes
+
+/* Pad with DW_CFA_nop until the address held in p is a multiple of s (a power of two) */
+#define DWRF_ALIGNNOP(s)                                          \
+    while ((uintptr_t)p & ((s)-1)) {                              \
+        *p++ = DWRF_CFA_nop;                                       \
+    }
+
+/* Reserve a 4-byte length, run stmt, then backfill the length (prefix itself excluded) */
+#define DWRF_SECTION(name, stmt)                                  \
+    {                                                             \
+        uint32_t* szp_##name = (uint32_t*)p;                      \
+        p += 4;                                                   \
+        stmt;                                                     \
+        *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \
+    }
+
+// =============================================================================
+//                              DWARF EH FRAME GENERATION
+// =============================================================================
+
+static void elf_init_ehframe_perf(ELFObjectContext* ctx);
+static void elf_init_ehframe_gdb(ELFObjectContext* ctx);
+
+/* Pick the emitter: perf (PC-relative) when absolute_addr is 0, else GDB. */
+static inline void elf_init_ehframe(ELFObjectContext* ctx, int absolute_addr) {
+    if (!absolute_addr) {
+        elf_init_ehframe_perf(ctx);
+        return;
+    }
+    elf_init_ehframe_gdb(ctx);
+}
+
+size_t
+_PyJitUnwind_EhFrameSize(int absolute_addr,
+                         const _PyJitUnwind_ShimCfi *shim_cfi)
+{
+    /* Pointer-aligned so DWRF_ALIGNNOP (pads by address) is deterministic. */
+    _Alignas(uintptr_t) uint8_t scratch[512];
+    _Static_assert(sizeof(scratch) >= 256,
+                   "scratch buffer may be too small for elf_init_ehframe");
+    const size_t room = sizeof(scratch) - 64;  /* 64 bounds the fixed fields */
+    if (absolute_addr && shim_cfi != NULL
+        && (shim_cfi->cie_init_cfi_size > room
+            || shim_cfi->fde_cfi_size > room - shim_cfi->cie_init_cfi_size)) {
+        return 0;  /* spliced shim CFI would overrun scratch: report failure */
+    }
+    ELFObjectContext ctx = {.p = scratch, .startp = scratch, .fde_p = NULL,
+                            .code_size = 1, .shim_cfi = shim_cfi};
+    elf_init_ehframe(&ctx, absolute_addr);  /* dry run to learn the size */
+    assert((size_t)(ctx.p - ctx.startp) <= sizeof(scratch));
+    return (size_t)(ctx.p - ctx.startp);
+}
+
+size_t
+_PyJitUnwind_BuildEhFrame(uint8_t *buffer, size_t buffer_size,
+                        const void *code_addr, size_t code_size,
+                        int absolute_addr,
+                        const _PyJitUnwind_ShimCfi *shim_cfi)
+{
+    if (!buffer || !code_addr || !code_size) {
+        return 0;
+    }
+    /* First pass (dry run) yields the byte count the real pass will need. */
+    const size_t needed = _PyJitUnwind_EhFrameSize(absolute_addr, shim_cfi);
+    if (needed == 0 || buffer_size < needed) {
+        return 0;
+    }
+    /* Second pass: emit straight into the caller's buffer. */
+    ELFObjectContext out = {
+        .p = buffer, .startp = buffer, .fde_p = NULL,
+        .code_addr = (uintptr_t)code_addr, .code_size = code_size,
+        .shim_cfi = shim_cfi,
+    };
+    elf_init_ehframe(&out, absolute_addr);
+    const size_t emitted = (size_t)(out.p - out.startp);
+    /* Only fixed-width fields depend on code_addr/code_size. */
+    assert(emitted == needed);
+    return emitted;
+}
+
+/*
+ * Generate a minimal .eh_frame for a single JIT code region.
+ *
+ * The .eh_frame section contains Call Frame Information (CFI) that describes
+ * how to unwind the stack at any point in the code. This is essential for
+ * unwinding through JIT-generated code.
+ *
+ * The generated data contains:
+ * 1. A CIE (Common Information Entry) describing the calling convention.
+ * 2. An FDE (Frame Description Entry) describing how to unwind the JIT frame.
+ *
+ * Two flavors are emitted, dispatched on the absolute_addr flag:
+ *
+ * - absolute_addr == 0 (elf_init_ehframe_perf): PC-relative FDE address
+ *   encoding for perf's synthesized DSO layout. The CIE describes the
+ *   trampoline's entry state and the FDE walks through the prologue and
+ *   epilogue with advance_loc instructions. This matches the pre-existing
+ *   perf_jit_trampoline behavior byte-for-byte.
+ *
+ * - absolute_addr == 1 (elf_init_ehframe_gdb): absolute FDE address
+ *   encoding for the GDB JIT in-memory ELF. The CIE describes the
+ *   steady-state frame layout (CFA = %rbp+16 / x29+96, with saved fp and
+ *   return-address column at fixed offsets) and the FDE emits no further
+ *   CFI. The same rule applies at every PC in the registered region,
+ *   which is correct for executor stencils (they pin the frame pointer
+ *   across the region). This is the GDB-side fix; see elf_init_ehframe_gdb
+ *   for details.
+ */
+static void elf_init_ehframe_perf(ELFObjectContext* ctx) {
+    int fde_ptr_enc = DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4;
+    uint8_t* p = ctx->p;
+    uint8_t* framep = p;  // Remember start of frame data
+
+    /*
+    * DWARF Unwind Table for Trampoline Function
+    *
+    * This section defines DWARF Call Frame Information (CFI) using encoded macros
+    * like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function
+    * preserves and restores registers. This is used by profiling tools (e.g., `perf`)
+    * and debuggers for stack unwinding in JIT-compiled code.
+    *
+    * -------------------------------------------------
+    * TO REGENERATE THIS TABLE FROM GCC OBJECTS:
+    * -------------------------------------------------
+    *
+    * 1. Create a trampoline source file (e.g., `trampoline.c`):
+    *
+    *      #include <Python.h>
+    *      typedef PyObject* (*py_evaluator)(void*, void*, int);
+    *      PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) {
+    *          return evaluator(ts, f, throwflag);
+    *      }
+    *
+    * 2. Compile to an object file with frame pointer preservation:
+    *
+    *      gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c
+    *
+    * 3. Extract DWARF unwind info from the object file:
+    *
+    *      readelf -w trampoline.o
+    *
+    *    Example output from `.eh_frame`:
+    *
+    *      00000000 CIE
+    *        Version:               1
+    *        Augmentation:          "zR"
+    *        Code alignment factor: 4
+    *        Data alignment factor: -8
+    *        Return address column: 30
+    *        DW_CFA_def_cfa: r31 (sp) ofs 0
+    *
+    *      00000014 FDE cie=00000000 pc=0..14
+    *        DW_CFA_advance_loc: 4
+    *        DW_CFA_def_cfa_offset: 16
+    *        DW_CFA_offset: r29 at cfa-16
+    *        DW_CFA_offset: r30 at cfa-8
+    *        DW_CFA_advance_loc: 12
+    *        DW_CFA_restore: r30
+    *        DW_CFA_restore: r29
+    *        DW_CFA_def_cfa_offset: 0
+    *
+    * -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`.
+    *
+    * ----------------------------------
+    * HOW TO TRANSLATE TO DWRF_* MACROS:
+    * ----------------------------------
+    *
+    * After compiling your trampoline with:
+    *
+    *     gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c
+    *
+    * run:
+    *
+    *     readelf -w trampoline.o
+    *
+    * to inspect the generated `.eh_frame` data. You will see two main components:
+    *
+    *     1. A CIE (Common Information Entry): shared configuration used by all FDEs.
+    *     2. An FDE (Frame Description Entry): function-specific unwind instructions.
+    *
+    * ---------------------
+    * Translating the CIE:
+    * ---------------------
+    * From `readelf -w`, you might see:
+    *
+    *   00000000 0000000000000010 00000000 CIE
+    *     Version:               1
+    *     Augmentation:          "zR"
+    *     Code alignment factor: 4
+    *     Data alignment factor: -8
+    *     Return address column: 30
+    *     Augmentation data:     1b
+    *     DW_CFA_def_cfa: r31 (sp) ofs 0
+    *
+    * Map this to:
+    *
+    *     DWRF_SECTION(CIE,
+    *         DWRF_U32(0);                             // CIE ID (always 0 for CIEs)
+    *         DWRF_U8(DWRF_CIE_VERSION);              // Version: 1
+    *         DWRF_STR("zR");                         // Augmentation string "zR"
+    *         DWRF_UV(4);                             // Code alignment factor = 4
+    *         DWRF_SV(-8);                            // Data alignment factor = -8
+    *         DWRF_U8(DWRF_REG_RA);                   // Return address register (e.g., x30 = 30)
+    *         DWRF_UV(1);                             // Augmentation data length = 1
+    *         DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers
+    *
+    *         DWRF_U8(DWRF_CFA_def_cfa);              // DW_CFA_def_cfa
+    *         DWRF_UV(DWRF_REG_SP);                   // Register: SP (r31)
+    *         DWRF_UV(0);                             // Offset = 0
+    *
+    *         DWRF_ALIGNNOP(sizeof(uintptr_t));       // Align to pointer size boundary
+    *     )
+    *
+    * Notes:
+    *   - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128.
+    *   - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants.
+    *
+    * ---------------------
+    * Translating the FDE:
+    * ---------------------
+    * From `readelf -w`:
+    *
+    *   00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014
+    *     DW_CFA_advance_loc: 4
+    *     DW_CFA_def_cfa_offset: 16
+    *     DW_CFA_offset: r29 at cfa-16
+    *     DW_CFA_offset: r30 at cfa-8
+    *     DW_CFA_advance_loc: 12
+    *     DW_CFA_restore: r30
+    *     DW_CFA_restore: r29
+    *     DW_CFA_def_cfa_offset: 0
+    *
+    * Map the FDE header and instructions to:
+    *
+    *     DWRF_SECTION(FDE,
+    *         DWRF_U32((uint32_t)(p - framep));       // Offset to CIE (relative from here)
+    *         DWRF_U32(pc_relative_offset);           // PC-relative location of the code (calculated dynamically)
+    *         DWRF_U32(ctx->code_size);               // Code range covered by this FDE
+    *         DWRF_U8(0);                             // Augmentation data length (none)
+    *
+    *         DWRF_U8(DWRF_CFA_advance_loc | 1);      // Advance location by 1 unit (1 * 4 = 4 bytes)
+    *         DWRF_U8(DWRF_CFA_def_cfa_offset);       // CFA = SP + 16
+    *         DWRF_UV(16);
+    *
+    *         DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer)
+    *         DWRF_UV(2);                             // At offset 2 * 8 = 16 bytes
+    *
+    *         DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address)
+    *         DWRF_UV(1);                             // At offset 1 * 8 = 8 bytes
+    *
+    *         DWRF_U8(DWRF_CFA_advance_loc | 3);      // Advance location by 3 units (3 * 4 = 12 bytes)
+    *
+    *         DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA); // Restore x30 (no operand follows)
+    *         DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP); // Restore x29 (no operand follows)
+    *
+    *         DWRF_U8(DWRF_CFA_def_cfa_offset);       // CFA = SP
+    *         DWRF_UV(0);
+    *     )
+    *
+    * To regenerate:
+    *   1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE.
+    *   2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as
+    *      the code is in a different address space every time.
+    *   3. For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro:
+    *        - `DW_CFA_def_cfa_offset`     → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value)
+    *        - `DW_CFA_offset: rX`         → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(offset)
+    *        - `DW_CFA_restore: rX`        → DWRF_U8(DWRF_CFA_restore | reg) // single byte, no operand follows
+    *        - `DW_CFA_advance_loc: N`     → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor))
+    *   4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers.
+    *   5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment.
+    */
+
+    /*
+     * Emit DWARF EH CIE (Common Information Entry)
+     *
+     * The CIE describes the calling conventions and basic unwinding rules
+     * that apply to all functions in this compilation unit.
+     */
+    DWRF_SECTION(CIE,
+        DWRF_U32(0);                           // CIE ID (0 indicates this is a CIE)
+        DWRF_U8(DWRF_CIE_VERSION);            // CIE version (1)
+        DWRF_STR("zR");                       // Augmentation string ("zR" = aug data holds the FDE pointer encoding)
+#ifdef __x86_64__
+        DWRF_UV(1);                           // Code alignment factor (x86_64: 1 byte)
+#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
+        DWRF_UV(4);                           // Code alignment factor (AArch64: 4 bytes per instruction)
+#endif
+        DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative)
+        DWRF_U8(DWRF_REG_RA);                 // Return address register number
+        DWRF_UV(1);                           // Augmentation data length
+        DWRF_U8(fde_ptr_enc);                 // FDE pointer encoding
+
+        /* Initial CFI instructions - describe default calling convention */
+#ifdef __x86_64__
+        /* x86_64 initial CFI state */
+        DWRF_U8(DWRF_CFA_def_cfa);            // Define CFA (Call Frame Address)
+        DWRF_UV(DWRF_REG_SP);                 // CFA = SP register
+        DWRF_UV(sizeof(uintptr_t));           // CFA = SP + pointer_size
+        DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved
+        DWRF_UV(1);                           // At offset 1 from CFA
+#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
+        /* AArch64 initial CFI state */
+        DWRF_U8(DWRF_CFA_def_cfa);            // Define CFA (Call Frame Address)
+        DWRF_UV(DWRF_REG_SP);                 // CFA = SP register
+        DWRF_UV(0);                           // CFA = SP + 0 (AArch64 starts with offset 0)
+        // No initial register saves in AArch64 CIE
+#endif
+        DWRF_ALIGNNOP(sizeof(uintptr_t));     // Align to pointer boundary
+    )
+
+    /*
+     * Emit DWARF EH FDE (Frame Description Entry)
+     *
+     * The FDE describes unwinding information specific to this function.
+     * It references the CIE and provides function-specific CFI instructions.
+     *
+     * The PC-relative offset is calculated after the entire EH frame is built
+     * to ensure accurate positioning relative to the synthesized DSO layout.
+     */
+    DWRF_SECTION(FDE,
+        DWRF_U32((uint32_t)(p - framep));     // Offset to CIE (backwards reference)
+        /*
+         * In perf jitdump mode the FDE PC field is encoded PC-relative and
+         * points back to code_start. Record where that field lives so we can
+         * patch in the final offset after the rest of the synthetic DSO
+         * layout is known.
+         */
+        ctx->fde_p = p;                       // Remember where PC offset field is located for later calculation
+        DWRF_U32(0);                          // Placeholder for PC-relative offset (calculated below)
+        DWRF_U32(ctx->code_size);             // Address range covered by this FDE (code length)
+        DWRF_U8(0);                           // Augmentation data length (none)
+
+        /*
+         * Architecture-specific CFI instructions
+         *
+         * These instructions describe how registers are saved and restored
+         * during function calls. Each architecture has different calling
+         * conventions and register usage patterns.
+         */
+#ifdef __x86_64__
+        /* x86_64 calling convention unwinding rules */
+#  if defined(__CET__) && (__CET__ & 1)
+        DWRF_U8(DWRF_CFA_advance_loc | 4);    // Advance past endbr64 (4 bytes)
+#  endif
+        DWRF_U8(DWRF_CFA_advance_loc | 1);    // Advance past push %rbp (1 byte)
+        DWRF_U8(DWRF_CFA_def_cfa_offset);     // def_cfa_offset 16
+        DWRF_UV(16);                          // New offset: SP + 16
+        DWRF_U8(DWRF_CFA_offset | DWRF_REG_BP); // offset r6 at cfa-16
+        DWRF_UV(2);                           // Offset factor: 2 * 8 = 16 bytes
+        DWRF_U8(DWRF_CFA_advance_loc | 3);    // Advance past mov %rsp,%rbp (3 bytes)
+        DWRF_U8(DWRF_CFA_def_cfa_register);   // def_cfa_register r6
+        DWRF_UV(DWRF_REG_BP);                 // Use base pointer register
+        DWRF_U8(DWRF_CFA_advance_loc | 3);    // Advance past call *%rcx (2 bytes) + pop %rbp (1 byte) = 3
+        DWRF_U8(DWRF_CFA_def_cfa);            // def_cfa r7 ofs 8
+        DWRF_UV(DWRF_REG_SP);                 // Use stack pointer register
+        DWRF_UV(8);                           // New offset: SP + 8
+#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
+        /* AArch64 calling convention unwinding rules */
+        DWRF_U8(DWRF_CFA_advance_loc | 1);        // Advance by 1 instruction (4 bytes)
+        DWRF_U8(DWRF_CFA_def_cfa_offset);         // CFA = SP + 16
+        DWRF_UV(16);                              // Stack pointer moved by 16 bytes
+        DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP);   // x29 (frame pointer) saved
+        DWRF_UV(2);                               // At CFA-16 (2 * 8 = 16 bytes from CFA)
+        DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA);   // x30 (link register) saved
+        DWRF_UV(1);                               // At CFA-8 (1 * 8 = 8 bytes from CFA)
+        DWRF_U8(DWRF_CFA_advance_loc | 3);        // Advance by 3 instructions (12 bytes)
+        DWRF_U8(DWRF_CFA_def_cfa_register);       // CFA = FP (x29) + 16
+        DWRF_UV(DWRF_REG_FP);
+        DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA);  // Restore x30 - NO DWRF_UV() after this!
+        DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP);  // Restore x29 - NO DWRF_UV() after this!
+        DWRF_U8(DWRF_CFA_def_cfa);                // CFA = SP + 0 (stack restored)
+        DWRF_UV(DWRF_REG_SP);
+        DWRF_UV(0);
+
+#else
+#    error "Unsupported target architecture"
+#endif
+
+        DWRF_ALIGNNOP(sizeof(uintptr_t));     // Align to pointer boundary
+    )
+
+    ctx->p = p;  // Update context pointer to end of generated data
+
+    /* Calculate and update the PC-relative offset in the FDE
+     *
+     * When perf processes the jitdump, it creates a synthesized DSO with this layout:
+     *
+     *     Synthesized DSO Memory Layout:
+     *     ┌─────────────────────────────────────────────────────────────┐ < code_start
+     *     │                        Code Section                         │
+     *     │                    (round_up(code_size, 8) bytes)           │
+     *     ├─────────────────────────────────────────────────────────────┤ < start of EH frame data
+     *     │                      EH Frame Data                          │
+     *     │  ┌─────────────────────────────────────────────────────┐    │
+     *     │  │                 CIE data                            │    │
+     *     │  └─────────────────────────────────────────────────────┘    │
+     *     │  ┌─────────────────────────────────────────────────────┐    │
+     *     │  │ FDE Header:                                         │    │
+     *     │  │   - CIE offset (4 bytes)                            │    │
+     *     │  │   - PC offset (4 bytes) <─ fde_offset_in_frame ─────┼────┼─> points to code_start
+     *     │  │   - address range (4 bytes)                         │    │   (this specific field)
+     *     │  │ CFI Instructions...                                 │    │
+     *     │  └─────────────────────────────────────────────────────┘    │
+     *     ├─────────────────────────────────────────────────────────────┤ < reference_point
+     *     │                    EhFrameHeader                            │
+     *     │                 (navigation metadata)                       │
+     *     └─────────────────────────────────────────────────────────────┘
+     *
+     * The PC offset field in the FDE must contain the distance from itself to code_start:
+     *
+     *   distance = code_start - fde_pc_field
+     *
+     * Where:
+     *   fde_pc_field_location = reference_point - eh_frame_size + fde_offset_in_frame
+     *   code_start_location = reference_point - eh_frame_size - round_up(code_size, 8)
+     *
+     * Therefore:
+     *   distance = code_start_location - fde_pc_field_location
+     *            = (ref - eh_frame_size - rounded_code_size) - (ref - eh_frame_size + fde_offset_in_frame)
+     *            = -rounded_code_size - fde_offset_in_frame
+     *            = -(round_up(code_size, 8) + fde_offset_in_frame)
+     *
+     * Note: fde_offset_in_frame is the offset from EH frame start to the PC offset field.
+     *
+     */
+    int32_t rounded_code_size =
+        (int32_t)_Py_SIZE_ROUND_UP(ctx->code_size, 8);
+    int32_t fde_offset_in_frame = (int32_t)(ctx->fde_p - framep);
+    *(int32_t *)ctx->fde_p = -(rounded_code_size + fde_offset_in_frame);
+}
+
+/*
+ * Build .eh_frame data for the GDB JIT interface.
+ *
+ * Two region kinds share the same ELF layout but diverge on CFI source:
+ *
+ * - Executor (shim_cfi == NULL). The CIE's initial CFI hard-codes the
+ *   pinned-frame-pointer steady state (CFA = %rbp+16 / x29+96 with the
+ *   frame record saved there). Executor stencils never touch the frame
+ *   pointer — enforced by Tools/jit/_optimizers.py _validate() and
+ *   -mframe-pointer=reserved — so that rule is valid at every PC and
+ *   the FDE body is empty.
+ *
+ * - Shim (shim_cfi != NULL). Tools/jit captures the shim's compiled
+ *   .eh_frame at build time; we splice the CIE-initial-CFI and FDE-CFI
+ *   byte blobs into a freshly-built ELF with our chosen (absolute) FDE
+ *   pointer encoding. Whatever prologue clang emits is described
+ *   accurately, regardless of target or flags.
+ */
+static void elf_init_ehframe_gdb(ELFObjectContext* ctx) {
+    int fde_ptr_enc = DWRF_EH_PE_absptr;
+    uint8_t* p = ctx->p;
+    uint8_t* framep = p;                      // Start of frame data (CIE offset base)
+    const _PyJitUnwind_ShimCfi* shim = ctx->shim_cfi;  // NULL => executor region
+
+    DWRF_SECTION(CIE,
+        DWRF_U32(0);                          // CIE ID
+        DWRF_U8(DWRF_CIE_VERSION);
+        DWRF_STR("zR");                       // aug data length + FDE ptr encoding follow
+        if (shim != NULL) {
+            DWRF_UV(shim->code_align);        // Code alignment factor captured for the shim
+            DWRF_SV(shim->data_align);        // Data alignment factor captured for the shim
+            DWRF_U8(shim->ra_column);         // Return address column captured for the shim
+        }
+        else {
+#ifdef __x86_64__
+            DWRF_UV(1);                       // Code alignment factor (x86_64: 1)
+#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
+            DWRF_UV(4);                       // Code alignment factor (AArch64: 4-byte instructions)
+#endif
+            DWRF_SV(-(int64_t)sizeof(uintptr_t));  // Data alignment factor (-pointer size)
+            DWRF_U8(DWRF_REG_RA);             // Return address column
+        }
+        DWRF_UV(1);                           // Augmentation data length
+        DWRF_U8(fde_ptr_enc);                 // FDE pointer encoding
+
+        if (shim != NULL) {
+            DWRF_BYTES(shim->cie_init_cfi, shim->cie_init_cfi_size);  // Captured initial CFI, verbatim
+        }
+        else {
+            /* Executor steady-state rule (our invariant, not the compiler's). */
+#ifdef __x86_64__
+            DWRF_U8(DWRF_CFA_def_cfa);        // CFA = %rbp + 16
+            DWRF_UV(DWRF_REG_BP);
+            DWRF_UV(16);
+            DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA);
+            DWRF_UV(1);                       // return address at cfa-8
+            DWRF_U8(DWRF_CFA_offset | DWRF_REG_BP);
+            DWRF_UV(2);                       // saved %rbp at cfa-16
+#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
+            DWRF_U8(DWRF_CFA_def_cfa);        // CFA = x29 + 96
+            DWRF_UV(DWRF_REG_FP);
+            DWRF_UV(96);
+            DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP);
+            DWRF_UV(12);                      // saved x29 at cfa-96
+            DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA);
+            DWRF_UV(11);                      // x30 at cfa-88
+#else
+#    error "Unsupported target architecture"
+#endif
+        }
+        DWRF_ALIGNNOP(sizeof(uintptr_t));     // Pad CIE to a pointer boundary
+    )
+
+    DWRF_SECTION(FDE,
+        DWRF_U32((uint32_t)(p - framep));     // Offset to CIE (backwards reference)
+        DWRF_ADDR(ctx->code_addr);            // Absolute code start
+        DWRF_ADDR((uintptr_t)ctx->code_size); // Code range covered
+        DWRF_U8(0);                           // Augmentation data length (none)
+        if (shim != NULL) {
+            DWRF_BYTES(shim->fde_cfi, shim->fde_cfi_size);  // Captured FDE CFI, verbatim
+        }
+        DWRF_ALIGNNOP(sizeof(uintptr_t));     // Pad FDE to a pointer boundary
+    )
+
+    ctx->p = p;  // Publish the end of the generated data
+}
+
+#if defined(__linux__) && defined(__ELF__)
+enum {  /* Values stored in jit_descriptor.action_flag */
+    JIT_NOACTION = 0,       // Initial value of __jit_debug_descriptor.action_flag
+    JIT_REGISTER_FN = 1,    // NOTE(review): presumably "relevant_entry was added" - confirm vs GDB docs
+    JIT_UNREGISTER_FN = 2,  // NOTE(review): presumably "relevant_entry is being removed" - confirm
+};
+
+struct jit_code_entry {  /* One registered code region; node of a doubly linked list */
+    struct jit_code_entry *next;  // Next entry in the list
+    struct jit_code_entry *prev;  // Previous entry in the list
+    const char *symfile_addr;     // Start of the in-memory symbol file (ELF image)
+    uint64_t symfile_size;        // Size of that symbol file in bytes
+    const void *code_addr;        // NOTE(review): looks like our own bookkeeping field - confirm
+};
+
+struct jit_descriptor {  /* Type of the exported __jit_debug_descriptor */
+    uint32_t version;                       // Interface version (initialized to 1 below)
+    uint32_t action_flag;                   // One of the JIT_* action values
+    struct jit_code_entry *relevant_entry;  // Entry the current action refers to
+    struct jit_code_entry *first_entry;     // Head of the entry list
+};
+
+static PyMutex jit_debug_mutex = {0};  // NOTE(review): presumably guards descriptor updates - users not in view
+
+Py_EXPORTED_SYMBOL volatile struct jit_descriptor __jit_debug_descriptor = {
+    1, JIT_NOACTION, NULL, NULL  // version, action_flag, relevant_entry, first_entry
+};
+
+Py_EXPORTED_SYMBOL void __attribute__((noinline))
+__jit_debug_register_code(void)
+{
+    /* Does no real work: keep this call visible to debuggers and not optimized away. */
+    (void)__jit_debug_descriptor.action_flag;  // volatile read, result discarded
+#if defined(__GNUC__) || defined(__clang__)
+    __asm__ __volatile__("" ::: "memory");     // empty asm with a "memory" clobber
+#endif
+}
+
+static uint16_t
+gdb_jit_machine_id(void)
+{
+    /* ELF e_machine for the build target; 0 tells the caller to skip registration. */
+#if defined(__x86_64__) || defined(_M_X64)
+    return EM_X86_64;
+#elif defined(__aarch64__) && !defined(__ILP32__)
+    return EM_AARCH64;  // AArch64, excluding the ILP32 ABI
+#else
+    return 0;           // any other target: unsupported
+#endif
+}
+
+static struct jit_code_entry *
+gdb_jit_register_code(
+    const void *code_addr,    // start of the JIT code region
+    size_t code_size,         // length of that region in bytes
+    const char *symname,      // function symbol name (copied into .strtab)
+    const uint8_t *eh_frame,  // prebuilt .eh_frame bytes (copied)
+    size_t eh_frame_size      // number of bytes at eh_frame
+)
+{
+    /*
+     * Build a minimal in-memory ELF64 image for GDB's JIT interface and push
+     * it onto __jit_debug_descriptor. Returns the entry, or NULL on failure.
+     */
+    if (code_addr == NULL || code_size == 0 || symname == NULL) {
+        return NULL;  // NOTE(review): eh_frame itself is not validated here
+    }
+
+    const uint16_t machine = gdb_jit_machine_id();
+    if (machine == 0) {
+        return NULL;  // unsupported target: nothing gets registered
+    }
+
+    enum {  // indices into the section header table
+        SH_NULL = 0,
+        SH_TEXT,
+        SH_EH_FRAME,
+        SH_SHSTRTAB,
+        SH_STRTAB,
+        SH_SYMTAB,
+        SH_NUM,
+    };
+    static const char shstrtab[] =
+        "\0.text\0.eh_frame\0.shstrtab\0.strtab\0.symtab";
+    _Static_assert(sizeof(shstrtab) ==
+        1 + sizeof(".text") + sizeof(".eh_frame") +
+            sizeof(".shstrtab") + sizeof(".strtab") + sizeof(".symtab"),
+        "shstrtab size mismatch");
+    const size_t shstrtab_size = sizeof(shstrtab);
+    const size_t sh_text = 1;  // name offsets in shstrtab; 0 is the leading NUL
+    const size_t sh_eh_frame = sh_text + sizeof(".text");
+    const size_t sh_shstrtab = sh_eh_frame + sizeof(".eh_frame");
+    const size_t sh_strtab = sh_shstrtab + sizeof(".shstrtab");
+    const size_t sh_symtab = sh_strtab + sizeof(".strtab");
+    const size_t text_size = code_size;
+    const size_t text_padded = _Py_SIZE_ROUND_UP(text_size, 8);  // keeps eh_off 8-aligned
+    const size_t strtab_size = 1 + strlen(symname) + 1;  // NUL + name + NUL
+    const size_t symtab_size = 3 * sizeof(Elf64_Sym);    // [0] zeroed, [1] section, [2] function
+
+    size_t offset = sizeof(Elf64_Ehdr);  // file layout starts with the ELF header
+    offset = _Py_SIZE_ROUND_UP(offset, 16);
+    const size_t text_off = offset;                // copy of the machine code
+    const size_t eh_off = text_off + text_padded;  // .eh_frame bytes
+    offset = eh_off + eh_frame_size;
+    const size_t shstr_off = offset;               // section name strings
+    offset += shstrtab_size;
+    const size_t str_off = offset;                 // symbol name strings
+    offset += strtab_size;
+    offset = _Py_SIZE_ROUND_UP(offset, sizeof(Elf64_Sym));
+    const size_t sym_off = offset;                 // symbol table
+    offset += symtab_size;
+    offset = _Py_SIZE_ROUND_UP(offset, sizeof(Elf64_Shdr));
+    const size_t sh_off = offset;                  // section header table comes last
+
+    const size_t shnum = SH_NUM;
+    const size_t total_size = sh_off + shnum * sizeof(Elf64_Shdr);
+    uint8_t *buf = (uint8_t *)PyMem_RawMalloc(total_size);
+    if (buf == NULL) {
+        return NULL;
+    }
+    memset(buf, 0, total_size);  // padding and unset fields stay zero
+
+    Elf64_Ehdr *ehdr = (Elf64_Ehdr *)buf;
+    memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
+    ehdr->e_ident[EI_CLASS] = ELFCLASS64;
+    ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
+    ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+    ehdr->e_ident[EI_OSABI] = ELFOSABI_NONE;
+    ehdr->e_type = ET_DYN;
+    ehdr->e_machine = machine;
+    ehdr->e_version = EV_CURRENT;
+    ehdr->e_entry = 0;
+    ehdr->e_phoff = 0;       // no program header table is emitted
+    ehdr->e_shoff = sh_off;
+    ehdr->e_ehsize = sizeof(Elf64_Ehdr);
+    ehdr->e_shentsize = sizeof(Elf64_Shdr);
+    ehdr->e_shnum = shnum;
+    ehdr->e_shstrndx = SH_SHSTRTAB;
+
+    memcpy(buf + text_off, code_addr, text_size);    // snapshot of the code bytes
+    memcpy(buf + eh_off, eh_frame, eh_frame_size);
+
+    char *shstr = (char *)(buf + shstr_off);
+    memcpy(shstr, shstrtab, shstrtab_size);
+
+    char *strtab = (char *)(buf + str_off);
+    strtab[0] = '\0';
+    memcpy(strtab + 1, symname, strlen(symname));  // name starts at offset 1
+    strtab[strtab_size - 1] = '\0';
+
+    Elf64_Sym *syms = (Elf64_Sym *)(buf + sym_off);
+    memset(syms, 0, symtab_size);  // syms[0] is left all-zero
+    /* Section symbol for .text (local) */
+    syms[1].st_info = ELF64_ST_INFO(STB_LOCAL, STT_SECTION);
+    syms[1].st_shndx = 1;  // same value as SH_TEXT
+    /* Function symbol */
+    syms[2].st_name = 1;   // offset of symname inside strtab
+    syms[2].st_info = ELF64_ST_INFO(STB_GLOBAL, STT_FUNC);
+    syms[2].st_other = STV_DEFAULT;
+    syms[2].st_shndx = 1;  // same value as SH_TEXT
+    /* For ET_DYN/ET_EXEC, st_value is the absolute virtual address. */
+    syms[2].st_value = (Elf64_Addr)(uintptr_t)code_addr;
+    syms[2].st_size = code_size;
+
+    Elf64_Shdr *shdrs = (Elf64_Shdr *)(buf + sh_off);
+    memset(shdrs, 0, shnum * sizeof(Elf64_Shdr));  // shdrs[SH_NULL] stays zeroed
+
+    shdrs[SH_TEXT].sh_name = sh_text;
+    shdrs[SH_TEXT].sh_type = SHT_PROGBITS;
+    shdrs[SH_TEXT].sh_flags = SHF_ALLOC | SHF_EXECINSTR;
+    shdrs[SH_TEXT].sh_addr = (Elf64_Addr)(uintptr_t)code_addr;  // real code address
+    shdrs[SH_TEXT].sh_offset = text_off;
+    shdrs[SH_TEXT].sh_size = text_size;
+    shdrs[SH_TEXT].sh_addralign = 16;
+
+    shdrs[SH_EH_FRAME].sh_name = sh_eh_frame;
+    shdrs[SH_EH_FRAME].sh_type = SHT_PROGBITS;
+    shdrs[SH_EH_FRAME].sh_flags = SHF_ALLOC;
+    shdrs[SH_EH_FRAME].sh_addr =
+        (Elf64_Addr)((uintptr_t)code_addr + text_padded);  // just past the padded code
+    shdrs[SH_EH_FRAME].sh_offset = eh_off;
+    shdrs[SH_EH_FRAME].sh_size = eh_frame_size;
+    shdrs[SH_EH_FRAME].sh_addralign = 8;
+
+    shdrs[SH_SHSTRTAB].sh_name = sh_shstrtab;
+    shdrs[SH_SHSTRTAB].sh_type = SHT_STRTAB;
+    shdrs[SH_SHSTRTAB].sh_offset = shstr_off;
+    shdrs[SH_SHSTRTAB].sh_size = shstrtab_size;
+    shdrs[SH_SHSTRTAB].sh_addralign = 1;
+
+    shdrs[SH_STRTAB].sh_name = sh_strtab;
+    shdrs[SH_STRTAB].sh_type = SHT_STRTAB;
+    shdrs[SH_STRTAB].sh_offset = str_off;
+    shdrs[SH_STRTAB].sh_size = strtab_size;
+    shdrs[SH_STRTAB].sh_addralign = 1;
+
+    shdrs[SH_SYMTAB].sh_name = sh_symtab;
+    shdrs[SH_SYMTAB].sh_type = SHT_SYMTAB;
+    shdrs[SH_SYMTAB].sh_offset = sym_off;
+    shdrs[SH_SYMTAB].sh_size = symtab_size;
+    shdrs[SH_SYMTAB].sh_link = SH_STRTAB;  // symbol names live in .strtab
+    shdrs[SH_SYMTAB].sh_info = 2;          // index of the first non-local symbol
+    shdrs[SH_SYMTAB].sh_addralign = 8;
+    shdrs[SH_SYMTAB].sh_entsize = sizeof(Elf64_Sym);
+
+    struct jit_code_entry *entry = PyMem_RawMalloc(sizeof(*entry));
+    if (entry == NULL) {
+        PyMem_RawFree(buf);  // do not leak the image when the node cannot be made
+        return NULL;
+    }
+    entry->symfile_addr = (const char *)buf;  // the entry now owns buf
+    entry->symfile_size = total_size;
+    entry->code_addr = code_addr;
+
+    PyMutex_Lock(&jit_debug_mutex);
+    entry->prev = NULL;                                   // insert at the list head
+    entry->next = __jit_debug_descriptor.first_entry;
+    if (entry->next != NULL) {
+        entry->next->prev = entry;
+    }
+    __jit_debug_descriptor.first_entry = entry;
+    __jit_debug_descriptor.relevant_entry = entry;
+    __jit_debug_descriptor.action_flag = JIT_REGISTER_FN;
+    __jit_debug_register_code();                          // notify the debugger
+    __jit_debug_descriptor.action_flag = JIT_NOACTION;    // back to idle
+    __jit_debug_descriptor.relevant_entry = NULL;
+    PyMutex_Unlock(&jit_debug_mutex);
+    return entry;
+}
+#endif  // __linux__ && __ELF__
+
+void *  // opaque handle for _PyJitUnwind_GdbUnregisterCode(), or NULL
+_PyJitUnwind_GdbRegisterCode(const void *code_addr,   // start of the JIT code
+                             size_t code_size,        // its length in bytes
+                             const char *entry,       // NULL is treated as ""
+                             const char *filename,    // NULL is treated as ""
+                             const _PyJitUnwind_ShimCfi *shim_cfi)  // forwarded to the EH frame builder
+{
+#if defined(__linux__) && defined(__ELF__)
+    /* GDB expects a stable symbol name and absolute addresses in .eh_frame. */
+    if (entry == NULL) {
+        entry = "";
+    }
+    if (filename == NULL) {
+        filename = "";
+    }
+    size_t name_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1;
+    char *name = (char *)PyMem_RawMalloc(name_size);  // formatted length + NUL
+    if (name == NULL) {  // (a negative snprintf() result is not checked above)
+        return NULL;
+    }
+    snprintf(name, name_size, "py::%s:%s", entry, filename);
+
+    uint8_t buffer[1024];  // stack scratch space for the generated .eh_frame
+    size_t eh_frame_size = _PyJitUnwind_BuildEhFrame(
+        buffer, sizeof(buffer), code_addr, code_size, 1, shim_cfi);
+    if (eh_frame_size == 0) {  // a zero size is treated as failure
+        PyMem_RawFree(name);
+        return NULL;
+    }
+
+    void *handle = gdb_jit_register_code(code_addr, code_size, name,
+                                         buffer, eh_frame_size);
+    PyMem_RawFree(name);  // safe: the name was copied into the ELF image
+    return handle;        // NULL if registration failed
+#else
+    (void)code_addr;      // other platforms: arguments unused, nothing registered
+    (void)code_size;
+    (void)entry;
+    (void)filename;
+    (void)shim_cfi;
+    return NULL;
+#endif
+}
+
+void
+_PyJitUnwind_GdbUnregisterCode(void *handle)  // handle: value returned by _PyJitUnwind_GdbRegisterCode()
+{
+#if defined(__linux__) && defined(__ELF__)
+    struct jit_code_entry *entry = (struct jit_code_entry *)handle;
+    if (entry == NULL) {
+        return;  // NULL handle: nothing to undo
+    }
+
+    PyMutex_Lock(&jit_debug_mutex);
+    if (entry->prev != NULL) {
+        entry->prev->next = entry->next;  // unlink from the middle or the tail
+    }
+    else {
+        __jit_debug_descriptor.first_entry = entry->next;  // entry was the head
+    }
+    if (entry->next != NULL) {
+        entry->next->prev = entry->prev;
+    }
+
+    __jit_debug_descriptor.relevant_entry = entry;  // already unlinked at this point
+    __jit_debug_descriptor.action_flag = JIT_UNREGISTER_FN;
+    __jit_debug_register_code();                    // notify the debugger
+    __jit_debug_descriptor.action_flag = JIT_NOACTION;
+    __jit_debug_descriptor.relevant_entry = NULL;
+
+    PyMutex_Unlock(&jit_debug_mutex);
+
+    PyMem_RawFree((void *)entry->symfile_addr);  // ELF image, freed after the notification
+    PyMem_RawFree(entry);
+#else
+    (void)handle;  // other platforms: no-op
+#endif
+}
+
+#endif  // defined(PY_HAVE_PERF_TRAMPOLINE) || (defined(__linux__) && defined(__ELF__))
diff --git a/Python/optimizer.c b/Python/optimizer.c
index f09bf778587b12..ee8cff46481b71 100644
--- a/Python/optimizer.c
+++ b/Python/optimizer.c
@@ -1296,6 +1296,7 @@ allocate_executor(int exit_count, int length)
     res->trace = (_PyUOpInstruction *)(res->exits + exit_count);
     res->code_size = length;
     res->exit_count = exit_count;
+    res->jit_gdb_handle = NULL;
     return res;
 }
 
@@ -1442,6 +1443,7 @@ make_executor_from_uops(_PyThreadStateImpl *tstate, _PyUOpInstruction *buffer, i
     // This is initialized to false so we can prevent the executor
     // from being immediately detected as cold and invalidated.
     executor->vm_data.cold = false;
+    executor->jit_gdb_handle = NULL;
     if (_PyJIT_Compile(executor, executor->trace, length)) {
         Py_DECREF(executor);
         return NULL;
@@ -1674,6 +1676,7 @@ make_cold_executor(uint16_t opcode)
 #ifdef _Py_JIT
     cold->jit_code = NULL;
     cold->jit_size = 0;
+    cold->jit_gdb_handle = NULL;
     if (_PyJIT_Compile(cold, cold->trace, 1)) {
         Py_DECREF(cold);
         Py_FatalError("Cannot allocate core JIT code");
diff --git a/Python/perf_jit_trampoline.c b/Python/perf_jit_trampoline.c
index 0ba856ea610e59..942c20fc2c69af 100644
--- a/Python/perf_jit_trampoline.c
+++ b/Python/perf_jit_trampoline.c
@@ -62,6 +62,7 @@
 #include "pycore_frame.h"
 #include "pycore_interp.h"
 #include "pycore_mmap.h"          // _PyAnnotateMemoryMap()
+#include "pycore_jit_unwind.h"
 #include "pycore_runtime.h"       // _PyRuntime
 
 #ifdef PY_HAVE_PERF_TRAMPOLINE
@@ -73,6 +74,7 @@
 #include <fcntl.h>                // File control operations
 #include <stdio.h>                // Standard I/O operations
 #include <stdlib.h>               // Standard library functions
+#include <string.h>               // memcpy, strlen
 #include <sys/mman.h>             // Memory mapping functions (mmap)
 #include <sys/types.h>            // System data types
 #include <unistd.h>               // System calls (sysconf, getpid)
@@ -246,6 +248,25 @@ typedef struct {
      */
 } CodeUnwindingInfoEvent;
 
+/*
+ * EH Frame Header structure for DWARF unwinding
+ *
+ * This header provides metadata about the .eh_frame data that follows.
+ * It uses PC-relative and data-relative encodings to keep the synthesized
+ * DSO self-contained when perf injects it.
+ */
+typedef struct __attribute__((packed)) {
+    uint8_t version;           // EH frame header version
+    uint8_t eh_frame_ptr_enc;  // Encoding of eh_frame_ptr
+    uint8_t fde_count_enc;     // Encoding of eh_fde_count
+    uint8_t table_enc;         // Encoding of the table entries
+    int32_t eh_frame_ptr;      // Pointer to the EH frame data
+    uint32_t eh_fde_count;     // Number of FDEs (Frame Description Entries)
+    int32_t from;              // Table entry field; meaning is set where the header is filled in
+    int32_t to;                // Table entry field; NOTE(review): confirm against the writer
+} EhFrameHeader;
+_Static_assert(sizeof(EhFrameHeader) == 20, "EhFrameHeader layout mismatch");  // 4 x 1 byte + 4 x 4 bytes
+
 // =============================================================================
 //                              GLOBAL STATE MANAGEMENT
 // =============================================================================
@@ -262,7 +283,8 @@ typedef struct {
     PyThread_type_lock map_lock;  // Thread synchronization lock
     void* mapped_buffer;     // Memory-mapped region (signals perf we're active)
     size_t mapped_size;      // Size of the mapped region
-    int code_id;             // Counter for unique code region identifiers
+    uint32_t code_id;        // Counter for unique code region identifiers
+    uint64_t build_id_salt;  // Per-process salt for unique synthetic DSOs
 } PerfMapJitState;
 
 /* Global singleton instance */
@@ -316,40 +338,6 @@ static int64_t get_current_time_microseconds(void) {
     return ((int64_t)(tv.tv_sec) * 1000000) + tv.tv_usec;
 }
 
-// =============================================================================
-//                              UTILITY FUNCTIONS
-// =============================================================================
-
-/*
- * Round up a value to the next multiple of a given number
- *
- * This is essential for maintaining proper alignment requirements in the
- * jitdump format. Many structures need to be aligned to specific boundaries
- * (typically 8 or 16 bytes) for efficient processing by perf.
- *
- * Args:
- *   value: The value to round up
- *   multiple: The multiple to round up to
- *
- * Returns: The smallest value >= input that is a multiple of 'multiple'
- */
-static size_t round_up(int64_t value, int64_t multiple) {
-    if (multiple == 0) {
-        return value;  // Avoid division by zero
-    }
-
-    int64_t remainder = value % multiple;
-    if (remainder == 0) {
-        return value;  // Already aligned
-    }
-
-    /* Calculate how much to add to reach the next multiple */
-    int64_t difference = multiple - remainder;
-    int64_t rounded_up_value = value + difference;
-
-    return rounded_up_value;
-}
-
 // =============================================================================
 //                              FILE I/O UTILITIES
 // =============================================================================
@@ -406,623 +394,6 @@ static void perf_map_jit_write_header(int pid, FILE* out_file) {
     perf_map_jit_write_fully(&header, sizeof(header));
 }
 
-// =============================================================================
-//                              DWARF CONSTANTS AND UTILITIES
-// =============================================================================
-
-/*
- * DWARF (Debug With Arbitrary Record Formats) constants
- *
- * DWARF is a debugging data format used to provide stack unwinding information.
- * These constants define the various encoding types and opcodes used in
- * DWARF Call Frame Information (CFI) records.
- */
-
-/* DWARF Call Frame Information version */
-#define DWRF_CIE_VERSION 1
-
-/* DWARF CFA (Call Frame Address) opcodes */
-enum {
-    DWRF_CFA_nop = 0x0,                    // No operation
-    DWRF_CFA_offset_extended = 0x5,        // Extended offset instruction
-    DWRF_CFA_def_cfa = 0xc,               // Define CFA rule
-    DWRF_CFA_def_cfa_register = 0xd,      // Define CFA register
-    DWRF_CFA_def_cfa_offset = 0xe,        // Define CFA offset
-    DWRF_CFA_offset_extended_sf = 0x11,   // Extended signed offset
-    DWRF_CFA_advance_loc = 0x40,          // Advance location counter
-    DWRF_CFA_offset = 0x80,               // Simple offset instruction
-    DWRF_CFA_restore = 0xc0               // Restore register
-};
-
-/* DWARF Exception Handling pointer encodings */
-enum {
-    DWRF_EH_PE_absptr = 0x00,             // Absolute pointer
-    DWRF_EH_PE_omit = 0xff,               // Omitted value
-
-    /* Data type encodings */
-    DWRF_EH_PE_uleb128 = 0x01,            // Unsigned LEB128
-    DWRF_EH_PE_udata2 = 0x02,             // Unsigned 2-byte
-    DWRF_EH_PE_udata4 = 0x03,             // Unsigned 4-byte
-    DWRF_EH_PE_udata8 = 0x04,             // Unsigned 8-byte
-    DWRF_EH_PE_sleb128 = 0x09,            // Signed LEB128
-    DWRF_EH_PE_sdata2 = 0x0a,             // Signed 2-byte
-    DWRF_EH_PE_sdata4 = 0x0b,             // Signed 4-byte
-    DWRF_EH_PE_sdata8 = 0x0c,             // Signed 8-byte
-    DWRF_EH_PE_signed = 0x08,             // Signed flag
-
-    /* Reference type encodings */
-    DWRF_EH_PE_pcrel = 0x10,              // PC-relative
-    DWRF_EH_PE_textrel = 0x20,            // Text-relative
-    DWRF_EH_PE_datarel = 0x30,            // Data-relative
-    DWRF_EH_PE_funcrel = 0x40,            // Function-relative
-    DWRF_EH_PE_aligned = 0x50,            // Aligned
-    DWRF_EH_PE_indirect = 0x80            // Indirect
-};
-
-/* Additional DWARF constants for debug information */
-enum { DWRF_TAG_compile_unit = 0x11 };
-enum { DWRF_children_no = 0, DWRF_children_yes = 1 };
-enum {
-    DWRF_AT_name = 0x03,         // Name attribute
-    DWRF_AT_stmt_list = 0x10,    // Statement list
-    DWRF_AT_low_pc = 0x11,       // Low PC address
-    DWRF_AT_high_pc = 0x12       // High PC address
-};
-enum {
-    DWRF_FORM_addr = 0x01,       // Address form
-    DWRF_FORM_data4 = 0x06,      // 4-byte data
-    DWRF_FORM_string = 0x08      // String form
-};
-
-/* Line number program opcodes */
-enum {
-    DWRF_LNS_extended_op = 0,    // Extended opcode
-    DWRF_LNS_copy = 1,           // Copy operation
-    DWRF_LNS_advance_pc = 2,     // Advance program counter
-    DWRF_LNS_advance_line = 3    // Advance line number
-};
-
-/* Line number extended opcodes */
-enum {
-    DWRF_LNE_end_sequence = 1,   // End of sequence
-    DWRF_LNE_set_address = 2     // Set address
-};
-
-/*
- * Architecture-specific DWARF register numbers
- *
- * These constants define the register numbering scheme used by DWARF
- * for each supported architecture. The numbers must match the ABI
- * specification for proper stack unwinding.
- */
-enum {
-#ifdef __x86_64__
-    /* x86_64 register numbering (note: order is defined by x86_64 ABI) */
-    DWRF_REG_AX,    // RAX
-    DWRF_REG_DX,    // RDX
-    DWRF_REG_CX,    // RCX
-    DWRF_REG_BX,    // RBX
-    DWRF_REG_SI,    // RSI
-    DWRF_REG_DI,    // RDI
-    DWRF_REG_BP,    // RBP
-    DWRF_REG_SP,    // RSP
-    DWRF_REG_8,     // R8
-    DWRF_REG_9,     // R9
-    DWRF_REG_10,    // R10
-    DWRF_REG_11,    // R11
-    DWRF_REG_12,    // R12
-    DWRF_REG_13,    // R13
-    DWRF_REG_14,    // R14
-    DWRF_REG_15,    // R15
-    DWRF_REG_RA,    // Return address (RIP)
-#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
-    /* AArch64 register numbering */
-    DWRF_REG_FP = 29,  // Frame Pointer
-    DWRF_REG_RA = 30,  // Link register (return address)
-    DWRF_REG_SP = 31,  // Stack pointer
-#else
-#    error "Unsupported target architecture"
-#endif
-};
-
-/* DWARF encoding constants used in EH frame headers */
-static const uint8_t DwarfUData4 = 0x03;     // Unsigned 4-byte data
-static const uint8_t DwarfSData4 = 0x0b;     // Signed 4-byte data
-static const uint8_t DwarfPcRel = 0x10;      // PC-relative encoding
-static const uint8_t DwarfDataRel = 0x30;    // Data-relative encoding
-
-// =============================================================================
-//                              ELF OBJECT CONTEXT
-// =============================================================================
-
-/*
- * Context for building ELF/DWARF structures
- *
- * This structure maintains state while constructing DWARF unwind information.
- * It acts as a simple buffer manager with pointers to track current position
- * and important landmarks within the buffer.
- */
-typedef struct ELFObjectContext {
-    uint8_t* p;            // Current write position in buffer
-    uint8_t* startp;       // Start of buffer (for offset calculations)
-    uint8_t* eh_frame_p;   // Start of EH frame data (for relative offsets)
-    uint8_t* fde_p;        // Start of FDE data (for PC-relative calculations)
-    uint32_t code_size;    // Size of the code being described
-} ELFObjectContext;
-
-/*
- * EH Frame Header structure for DWARF unwinding
- *
- * This structure provides metadata about the DWARF unwinding information
- * that follows. It's required by the perf jitdump format to enable proper
- * stack unwinding during profiling.
- */
-typedef struct {
-    unsigned char version;           // EH frame version (always 1)
-    unsigned char eh_frame_ptr_enc;  // Encoding of EH frame pointer
-    unsigned char fde_count_enc;     // Encoding of FDE count
-    unsigned char table_enc;         // Encoding of table entries
-    int32_t eh_frame_ptr;           // Pointer to EH frame data
-    int32_t eh_fde_count;           // Number of FDEs (Frame Description Entries)
-    int32_t from;                   // Start address of code range
-    int32_t to;                     // End address of code range
-} EhFrameHeader;
-
-// =============================================================================
-//                              DWARF GENERATION UTILITIES
-// =============================================================================
-
-/*
- * Append a null-terminated string to the ELF context buffer
- *
- * Args:
- *   ctx: ELF object context
- *   str: String to append (must be null-terminated)
- *
- * Returns: Offset from start of buffer where string was written
- */
-static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) {
-    uint8_t* p = ctx->p;
-    uint32_t ofs = (uint32_t)(p - ctx->startp);
-
-    /* Copy string including null terminator */
-    do {
-        *p++ = (uint8_t)*str;
-    } while (*str++);
-
-    ctx->p = p;
-    return ofs;
-}
-
-/*
- * Append a SLEB128 (Signed Little Endian Base 128) value
- *
- * SLEB128 is a variable-length encoding used extensively in DWARF.
- * It efficiently encodes small numbers in fewer bytes.
- *
- * Args:
- *   ctx: ELF object context
- *   v: Signed value to encode
- */
-static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) {
-    uint8_t* p = ctx->p;
-
-    /* Encode 7 bits at a time, with continuation bit in MSB */
-    for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) {
-        *p++ = (uint8_t)((v & 0x7f) | 0x80);  // Set continuation bit
-    }
-    *p++ = (uint8_t)(v & 0x7f);  // Final byte without continuation bit
-
-    ctx->p = p;
-}
-
-/*
- * Append a ULEB128 (Unsigned Little Endian Base 128) value
- *
- * Similar to SLEB128 but for unsigned values.
- *
- * Args:
- *   ctx: ELF object context
- *   v: Unsigned value to encode
- */
-static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) {
-    uint8_t* p = ctx->p;
-
-    /* Encode 7 bits at a time, with continuation bit in MSB */
-    for (; v >= 0x80; v >>= 7) {
-        *p++ = (char)((v & 0x7f) | 0x80);  // Set continuation bit
-    }
-    *p++ = (char)v;  // Final byte without continuation bit
-
-    ctx->p = p;
-}
-
-/*
- * Macros for generating DWARF structures
- *
- * These macros provide a convenient way to write various data types
- * to the DWARF buffer while automatically advancing the pointer.
- */
-#define DWRF_U8(x) (*p++ = (x))                                    // Write unsigned 8-bit
-#define DWRF_I8(x) (*(int8_t*)p = (x), p++)                       // Write signed 8-bit
-#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2)                 // Write unsigned 16-bit
-#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4)                 // Write unsigned 32-bit
-#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t)) // Write address
-#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) // Write ULEB128
-#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) // Write SLEB128
-#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) // Write string
-
-/* Align to specified boundary with NOP instructions */
-#define DWRF_ALIGNNOP(s)                                          \
-    while ((uintptr_t)p & ((s)-1)) {                              \
-        *p++ = DWRF_CFA_nop;                                       \
-    }
-
-/* Write a DWARF section with automatic size calculation */
-#define DWRF_SECTION(name, stmt)                                  \
-    {                                                             \
-        uint32_t* szp_##name = (uint32_t*)p;                      \
-        p += 4;                                                   \
-        stmt;                                                     \
-        *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \
-    }
-
-// =============================================================================
-//                              DWARF EH FRAME GENERATION
-// =============================================================================
-
-static void elf_init_ehframe(ELFObjectContext* ctx);
-
-/*
- * Initialize DWARF .eh_frame section for a code region
- *
- * The .eh_frame section contains Call Frame Information (CFI) that describes
- * how to unwind the stack at any point in the code. This is essential for
- * proper profiling as it allows perf to generate accurate call graphs.
- *
- * The function generates two main components:
- * 1. CIE (Common Information Entry) - describes calling conventions
- * 2. FDE (Frame Description Entry) - describes specific function unwinding
- *
- * Args:
- *   ctx: ELF object context containing code size and buffer pointers
- */
-static size_t calculate_eh_frame_size(void) {
-    /* Calculate the EH frame size for the trampoline function */
-    extern void *_Py_trampoline_func_start;
-    extern void *_Py_trampoline_func_end;
-
-    size_t code_size = (char*)&_Py_trampoline_func_end - (char*)&_Py_trampoline_func_start;
-
-    ELFObjectContext ctx;
-    char buffer[1024];  // Buffer for DWARF data (1KB should be sufficient)
-    ctx.code_size = code_size;
-    ctx.startp = ctx.p = (uint8_t*)buffer;
-    ctx.fde_p = NULL;
-
-    elf_init_ehframe(&ctx);
-    return ctx.p - ctx.startp;
-}
-
-static void elf_init_ehframe(ELFObjectContext* ctx) {
-    uint8_t* p = ctx->p;
-    uint8_t* framep = p;  // Remember start of frame data
-
-    /*
-    * DWARF Unwind Table for Trampoline Function
-    *
-    * This section defines DWARF Call Frame Information (CFI) using encoded macros
-    * like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function
-    * preserves and restores registers. This is used by profiling tools (e.g., `perf`)
-    * and debuggers for stack unwinding in JIT-compiled code.
-    *
-    * -------------------------------------------------
-    * TO REGENERATE THIS TABLE FROM GCC OBJECTS:
-    * -------------------------------------------------
-    *
-    * 1. Create a trampoline source file (e.g., `trampoline.c`):
-    *
-    *      #include <Python.h>
-    *      typedef PyObject* (*py_evaluator)(void*, void*, int);
-    *      PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) {
-    *          return evaluator(ts, f, throwflag);
-    *      }
-    *
-    * 2. Compile to an object file with frame pointer preservation:
-    *
-    *      gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c
-    *
-    * 3. Extract DWARF unwind info from the object file:
-    *
-    *      readelf -w trampoline.o
-    *
-    *    Example output from `.eh_frame`:
-    *
-    *      00000000 CIE
-    *        Version:               1
-    *        Augmentation:          "zR"
-    *        Code alignment factor: 4
-    *        Data alignment factor: -8
-    *        Return address column: 30
-    *        DW_CFA_def_cfa: r31 (sp) ofs 0
-    *
-    *      00000014 FDE cie=00000000 pc=0..14
-    *        DW_CFA_advance_loc: 4
-    *        DW_CFA_def_cfa_offset: 16
-    *        DW_CFA_offset: r29 at cfa-16
-    *        DW_CFA_offset: r30 at cfa-8
-    *        DW_CFA_advance_loc: 12
-    *        DW_CFA_restore: r30
-    *        DW_CFA_restore: r29
-    *        DW_CFA_def_cfa_offset: 0
-    *
-    * -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`.
-    *
-    * ----------------------------------
-    * HOW TO TRANSLATE TO DWRF_* MACROS:
-    * ----------------------------------
-    *
-    * After compiling your trampoline with:
-    *
-    *     gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c
-    *
-    * run:
-    *
-    *     readelf -w trampoline.o
-    *
-    * to inspect the generated `.eh_frame` data. You will see two main components:
-    *
-    *     1. A CIE (Common Information Entry): shared configuration used by all FDEs.
-    *     2. An FDE (Frame Description Entry): function-specific unwind instructions.
-    *
-    * ---------------------
-    * Translating the CIE:
-    * ---------------------
-    * From `readelf -w`, you might see:
-    *
-    *   00000000 0000000000000010 00000000 CIE
-    *     Version:               1
-    *     Augmentation:          "zR"
-    *     Code alignment factor: 4
-    *     Data alignment factor: -8
-    *     Return address column: 30
-    *     Augmentation data:     1b
-    *     DW_CFA_def_cfa: r31 (sp) ofs 0
-    *
-    * Map this to:
-    *
-    *     DWRF_SECTION(CIE,
-    *         DWRF_U32(0);                             // CIE ID (always 0 for CIEs)
-    *         DWRF_U8(DWRF_CIE_VERSION);              // Version: 1
-    *         DWRF_STR("zR");                         // Augmentation string "zR"
-    *         DWRF_UV(4);                             // Code alignment factor = 4
-    *         DWRF_SV(-8);                            // Data alignment factor = -8
-    *         DWRF_U8(DWRF_REG_RA);                   // Return address register (e.g., x30 = 30)
-    *         DWRF_UV(1);                             // Augmentation data length = 1
-    *         DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers
-    *
-    *         DWRF_U8(DWRF_CFA_def_cfa);              // DW_CFA_def_cfa
-    *         DWRF_UV(DWRF_REG_SP);                   // Register: SP (r31)
-    *         DWRF_UV(0);                             // Offset = 0
-    *
-    *         DWRF_ALIGNNOP(sizeof(uintptr_t));       // Align to pointer size boundary
-    *     )
-    *
-    * Notes:
-    *   - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128.
-    *   - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants.
-    *
-    * ---------------------
-    * Translating the FDE:
-    * ---------------------
-    * From `readelf -w`:
-    *
-    *   00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014
-    *     DW_CFA_advance_loc: 4
-    *     DW_CFA_def_cfa_offset: 16
-    *     DW_CFA_offset: r29 at cfa-16
-    *     DW_CFA_offset: r30 at cfa-8
-    *     DW_CFA_advance_loc: 12
-    *     DW_CFA_restore: r30
-    *     DW_CFA_restore: r29
-    *     DW_CFA_def_cfa_offset: 0
-    *
-    * Map the FDE header and instructions to:
-    *
-    *     DWRF_SECTION(FDE,
-    *         DWRF_U32((uint32_t)(p - framep));       // Offset to CIE (relative from here)
-    *         DWRF_U32(pc_relative_offset);           // PC-relative location of the code (calculated dynamically)
-    *         DWRF_U32(ctx->code_size);               // Code range covered by this FDE
-    *         DWRF_U8(0);                             // Augmentation data length (none)
-    *
-    *         DWRF_U8(DWRF_CFA_advance_loc | 1);      // Advance location by 1 unit (1 * 4 = 4 bytes)
-    *         DWRF_U8(DWRF_CFA_def_cfa_offset);       // CFA = SP + 16
-    *         DWRF_UV(16);
-    *
-    *         DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer)
-    *         DWRF_UV(2);                             // At offset 2 * 8 = 16 bytes
-    *
-    *         DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address)
-    *         DWRF_UV(1);                             // At offset 1 * 8 = 8 bytes
-    *
-    *         DWRF_U8(DWRF_CFA_advance_loc | 3);      // Advance location by 3 units (3 * 4 = 12 bytes)
-    *
-    *         DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore x30
-    *         DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore x29
-    *
-    *         DWRF_U8(DWRF_CFA_def_cfa_offset);       // CFA = SP
-    *         DWRF_UV(0);
-    *     )
-    *
-    * To regenerate:
-    *   1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE.
-    *   2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as
-    *      the code is in a different address space every time.
-    *   3. For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro:
-    *        - `DW_CFA_def_cfa_offset`     → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value)
-    *        - `DW_CFA_offset: rX`         → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(offset)
-    *        - `DW_CFA_restore: rX`        → DWRF_U8(DWRF_CFA_offset | reg) // restore is same as reusing offset
-    *        - `DW_CFA_advance_loc: N`     → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor))
-    *   4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers.
-    *   5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment.
-    */
-
-    /*
-     * Emit DWARF EH CIE (Common Information Entry)
-     *
-     * The CIE describes the calling conventions and basic unwinding rules
-     * that apply to all functions in this compilation unit.
-     */
-    DWRF_SECTION(CIE,
-        DWRF_U32(0);                           // CIE ID (0 indicates this is a CIE)
-        DWRF_U8(DWRF_CIE_VERSION);            // CIE version (1)
-        DWRF_STR("zR");                       // Augmentation string ("zR" = has LSDA)
-#ifdef __x86_64__
-        DWRF_UV(1);                           // Code alignment factor (x86_64: 1 byte)
-#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
-        DWRF_UV(4);                           // Code alignment factor (AArch64: 4 bytes per instruction)
-#endif
-        DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative)
-        DWRF_U8(DWRF_REG_RA);                 // Return address register number
-        DWRF_UV(1);                           // Augmentation data length
-        DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // FDE pointer encoding
-
-        /* Initial CFI instructions - describe default calling convention */
-#ifdef __x86_64__
-        /* x86_64 initial CFI state */
-        DWRF_U8(DWRF_CFA_def_cfa);            // Define CFA (Call Frame Address)
-        DWRF_UV(DWRF_REG_SP);                 // CFA = SP register
-        DWRF_UV(sizeof(uintptr_t));           // CFA = SP + pointer_size
-        DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved
-        DWRF_UV(1);                           // At offset 1 from CFA
-#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
-        /* AArch64 initial CFI state */
-        DWRF_U8(DWRF_CFA_def_cfa);            // Define CFA (Call Frame Address)
-        DWRF_UV(DWRF_REG_SP);                 // CFA = SP register
-        DWRF_UV(0);                           // CFA = SP + 0 (AArch64 starts with offset 0)
-        // No initial register saves in AArch64 CIE
-#endif
-        DWRF_ALIGNNOP(sizeof(uintptr_t));     // Align to pointer boundary
-    )
-
-    ctx->eh_frame_p = p;  // Remember start of FDE data
-
-    /*
-     * Emit DWARF EH FDE (Frame Description Entry)
-     *
-     * The FDE describes unwinding information specific to this function.
-     * It references the CIE and provides function-specific CFI instructions.
-     *
-     * The PC-relative offset is calculated after the entire EH frame is built
-     * to ensure accurate positioning relative to the synthesized DSO layout.
-     */
-    DWRF_SECTION(FDE,
-        DWRF_U32((uint32_t)(p - framep));     // Offset to CIE (backwards reference)
-        ctx->fde_p = p;                        // Remember where PC offset field is located for later calculation
-        DWRF_U32(0);                           // Placeholder for PC-relative offset (calculated at end of elf_init_ehframe)
-        DWRF_U32(ctx->code_size);             // Address range covered by this FDE (code length)
-        DWRF_U8(0);                           // Augmentation data length (none)
-
-        /*
-         * Architecture-specific CFI instructions
-         *
-         * These instructions describe how registers are saved and restored
-         * during function calls. Each architecture has different calling
-         * conventions and register usage patterns.
-         */
-#ifdef __x86_64__
-        /* x86_64 calling convention unwinding rules with frame pointer */
-#  if defined(__CET__) && (__CET__ & 1)
-        DWRF_U8(DWRF_CFA_advance_loc | 4);    // Advance past endbr64 (4 bytes)
-#  endif
-        DWRF_U8(DWRF_CFA_advance_loc | 1);    // Advance past push %rbp (1 byte)
-        DWRF_U8(DWRF_CFA_def_cfa_offset);     // def_cfa_offset 16
-        DWRF_UV(16);                          // New offset: SP + 16
-        DWRF_U8(DWRF_CFA_offset | DWRF_REG_BP); // offset r6 at cfa-16
-        DWRF_UV(2);                           // Offset factor: 2 * 8 = 16 bytes
-        DWRF_U8(DWRF_CFA_advance_loc | 3);    // Advance past mov %rsp,%rbp (3 bytes)
-        DWRF_U8(DWRF_CFA_def_cfa_register);   // def_cfa_register r6
-        DWRF_UV(DWRF_REG_BP);                 // Use base pointer register
-        DWRF_U8(DWRF_CFA_advance_loc | 3);    // Advance past call *%rcx (2 bytes) + pop %rbp (1 byte) = 3
-        DWRF_U8(DWRF_CFA_def_cfa);            // def_cfa r7 ofs 8
-        DWRF_UV(DWRF_REG_SP);                 // Use stack pointer register
-        DWRF_UV(8);                           // New offset: SP + 8
-#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
-        /* AArch64 calling convention unwinding rules */
-        DWRF_U8(DWRF_CFA_advance_loc | 1);        // Advance by 1 instruction (4 bytes)
-        DWRF_U8(DWRF_CFA_def_cfa_offset);         // CFA = SP + 16
-        DWRF_UV(16);                              // Stack pointer moved by 16 bytes
-        DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP);   // x29 (frame pointer) saved
-        DWRF_UV(2);                               // At CFA-16 (2 * 8 = 16 bytes from CFA)
-        DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA);   // x30 (link register) saved
-        DWRF_UV(1);                               // At CFA-8 (1 * 8 = 8 bytes from CFA)
-        DWRF_U8(DWRF_CFA_advance_loc | 3);        // Advance by 3 instructions (12 bytes)
-        DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA);  // Restore x30 - NO DWRF_UV() after this!
-        DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP);  // Restore x29 - NO DWRF_UV() after this!
-        DWRF_U8(DWRF_CFA_def_cfa_offset);         // CFA = SP + 0 (stack restored)
-        DWRF_UV(0);                               // Back to original stack position
-#else
-#    error "Unsupported target architecture"
-#endif
-
-        DWRF_ALIGNNOP(sizeof(uintptr_t));     // Align to pointer boundary
-    )
-
-    ctx->p = p;  // Update context pointer to end of generated data
-
-    /* Calculate and update the PC-relative offset in the FDE
-     *
-     * When perf processes the jitdump, it creates a synthesized DSO with this layout:
-     *
-     *     Synthesized DSO Memory Layout:
-     *     ┌─────────────────────────────────────────────────────────────┐ < code_start
-     *     │                        Code Section                         │
-     *     │                    (round_up(code_size, 8) bytes)           │
-     *     ├─────────────────────────────────────────────────────────────┤ < start of EH frame data
-     *     │                      EH Frame Data                          │
-     *     │  ┌─────────────────────────────────────────────────────┐    │
-     *     │  │                 CIE data                            │    │
-     *     │  └─────────────────────────────────────────────────────┘    │
-     *     │  ┌─────────────────────────────────────────────────────┐    │
-     *     │  │ FDE Header:                                         │    │
-     *     │  │   - CIE offset (4 bytes)                            │    │
-     *     │  │   - PC offset (4 bytes) <─ fde_offset_in_frame ─────┼────┼─> points to code_start
-     *     │  │   - address range (4 bytes)                         │    │   (this specific field)
-     *     │  │ CFI Instructions...                                 │    │
-     *     │  └─────────────────────────────────────────────────────┘    │
-     *     ├─────────────────────────────────────────────────────────────┤ < reference_point
-     *     │                    EhFrameHeader                            │
-     *     │                 (navigation metadata)                       │
-     *     └─────────────────────────────────────────────────────────────┘
-     *
-     * The PC offset field in the FDE must contain the distance from itself to code_start:
-     *
-     *   distance = code_start - fde_pc_field
-     *
-     * Where:
-     *   fde_pc_field_location = reference_point - eh_frame_size + fde_offset_in_frame
-     *   code_start_location = reference_point - eh_frame_size - round_up(code_size, 8)
-     *
-     * Therefore:
-     *   distance = code_start_location - fde_pc_field_location
-     *            = (ref - eh_frame_size - rounded_code_size) - (ref - eh_frame_size + fde_offset_in_frame)
-     *            = -rounded_code_size - fde_offset_in_frame
-     *            = -(round_up(code_size, 8) + fde_offset_in_frame)
-     *
-     * Note: fde_offset_in_frame is the offset from EH frame start to the PC offset field,
-     *
-     */
-    if (ctx->fde_p != NULL) {
-        int32_t fde_offset_in_frame = (ctx->fde_p - ctx->startp);
-        int32_t rounded_code_size = round_up(ctx->code_size, 8);
-        int32_t pc_relative_offset = -(rounded_code_size + fde_offset_in_frame);
-
-
-        // Update the PC-relative offset in the FDE
-        *(int32_t*)ctx->fde_p = pc_relative_offset;
-    }
-}
-
 // =============================================================================
 //                              JITDUMP INITIALIZATION
 // =============================================================================
@@ -1128,11 +499,13 @@ static void* perf_map_jit_init(void) {
 
     /* Initialize code ID counter */
     perf_jit_map_state.code_id = 0;
+    perf_jit_map_state.build_id_salt =
+        ((uint64_t)pid << 32) ^ (uint64_t)get_current_monotonic_ticks();
 
     /* Calculate padding size based on actual unwind info requirements */
-    size_t eh_frame_size = calculate_eh_frame_size();
+    size_t eh_frame_size = _PyJitUnwind_EhFrameSize(0, NULL);
     size_t unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;
-    trampoline_api.code_padding = round_up(unwind_data_size, 16);
+    trampoline_api.code_padding = _Py_SIZE_ROUND_UP(unwind_data_size, 16);
     trampoline_api.code_alignment = 32;
 
     return &perf_jit_map_state;
@@ -1143,30 +516,19 @@ static void* perf_map_jit_init(void) {
 // =============================================================================
 
 /*
- * Write a complete jitdump entry for a Python function
- *
- * This is the main function called by Python's trampoline system whenever
- * a new piece of JIT-compiled code needs to be recorded. It writes both
- * the unwinding information and the code load event to the jitdump file.
- *
- * The function performs these steps:
- * 1. Initialize jitdump system if not already done
- * 2. Extract function name and filename from Python code object
- * 3. Generate DWARF unwinding information
- * 4. Write unwinding info event to jitdump file
- * 5. Write code load event to jitdump file
+ * Write a complete jitdump entry for a code region with a provided name.
  *
- * Args:
- *   state: Jitdump state (currently unused, uses global state)
- *   code_addr: Address where the compiled code resides
- *   code_size: Size of the compiled code in bytes
- *   co: Python code object containing metadata
- *
- * IMPORTANT: This function signature is part of Python's internal API
- * and must not be changed without coordinating with core Python development.
+ * This shares the same implementation as the trampoline callback, but
+ * allows callers that don't have a PyCodeObject to reuse the jitdump
+ * infrastructure.
  */
-static void perf_map_jit_write_entry(void *state, const void *code_addr,
-                                    unsigned int code_size, PyCodeObject *co)
+static void perf_map_jit_write_entry_with_name(
+    void *state,
+    const void *code_addr,
+    size_t code_size,
+    const char *entry,
+    const char *filename
+)
 {
     /* Initialize jitdump system on first use */
     if (perf_jit_map_state.perf_map == NULL) {
@@ -1176,21 +538,11 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr,
         }
     }
 
-    /*
-     * Extract function information from Python code object
-     *
-     * We create a human-readable function name by combining the qualified
-     * name (includes class/module context) with the filename. This helps
-     * developers identify functions in perf reports.
-     */
-    const char *entry = "";
-    if (co->co_qualname != NULL) {
-        entry = PyUnicode_AsUTF8(co->co_qualname);
+    if (entry == NULL) {
+        entry = "";
     }
-
-    const char *filename = "";
-    if (co->co_filename != NULL) {
-        filename = PyUnicode_AsUTF8(co->co_filename);
+    if (filename == NULL) {
+        filename = "";
     }
 
     /*
@@ -1218,16 +570,13 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr,
      * Without it, perf cannot generate accurate call graphs, especially
      * in optimized code where frame pointers may be omitted.
      */
-    ELFObjectContext ctx;
-    char buffer[1024];  // Buffer for DWARF data (1KB should be sufficient)
-    ctx.code_size = code_size;
-    ctx.startp = ctx.p = (uint8_t*)buffer;
-    ctx.fde_p = NULL;  // Initialize to NULL, will be set when FDE is written
-
-    /* Generate EH frame (Exception Handling frame) data */
-    elf_init_ehframe(&ctx);
-    int eh_frame_size = ctx.p - ctx.startp;
-
+    uint8_t buffer[1024];  // Buffer for DWARF data (1KB should be sufficient)
+    size_t eh_frame_size = _PyJitUnwind_BuildEhFrame(
+        buffer, sizeof(buffer), code_addr, code_size, 0, NULL);
+    if (eh_frame_size == 0) {
+        PyMem_RawFree(perf_map_entry);
+        return;
+    }
     /*
      * Write Code Unwinding Information Event
      *
@@ -1244,12 +593,12 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr,
     assert(ev2.unwind_data_size <= (uint64_t)trampoline_api.code_padding);
 
     ev2.eh_frame_hdr_size = sizeof(EhFrameHeader);
-    ev2.mapped_size = round_up(ev2.unwind_data_size, 16);  // 16-byte alignment
+    ev2.mapped_size = _Py_SIZE_ROUND_UP(ev2.unwind_data_size, 16);  // 16-byte alignment
 
     /* Calculate total event size with padding */
-    int content_size = sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size;
-    int padding_size = round_up(content_size, 8) - content_size;  // 8-byte align
-    ev2.base.size = content_size + padding_size;
+    int content_size = (int)(sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size);
+    int padding_size = (int)_Py_SIZE_ROUND_UP((size_t)content_size, 8) - content_size;  // 8-byte align
+    ev2.base.size = (uint32_t)(content_size + padding_size);
 
     /* Write the unwinding info event header */
     perf_map_jit_write_fully(&ev2, sizeof(ev2));
@@ -1263,20 +612,21 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr,
      */
     EhFrameHeader f;
     f.version = 1;
-    f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel;  // PC-relative signed 4-byte
-    f.fde_count_enc = DwarfUData4;                  // Unsigned 4-byte count
-    f.table_enc = DwarfSData4 | DwarfDataRel;       // Data-relative signed 4-byte
+    f.eh_frame_ptr_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_pcrel;
+    f.fde_count_enc = DWRF_EH_PE_udata4;
+    f.table_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_datarel;
 
     /* Calculate relative offsets for EH frame navigation */
-    f.eh_frame_ptr = -(eh_frame_size + 4 * sizeof(unsigned char));
+    f.eh_frame_ptr = -(int32_t)(eh_frame_size + 4 * sizeof(unsigned char));
     f.eh_fde_count = 1;  // We generate exactly one FDE per function
-    f.from = -(round_up(code_size, 8) + eh_frame_size);
-
-    int cie_size = ctx.eh_frame_p - ctx.startp;
-    f.to = -(eh_frame_size - cie_size);
+    f.from = -(int32_t)(_Py_SIZE_ROUND_UP(code_size, 8) + eh_frame_size);
+    uint32_t cie_payload_size;
+    memcpy(&cie_payload_size, buffer, sizeof(cie_payload_size));
+    int cie_size = (int)(sizeof(cie_payload_size) + cie_payload_size);
+    f.to = -(int32_t)(eh_frame_size - cie_size);
 
     /* Write EH frame data and header */
-    perf_map_jit_write_fully(ctx.startp, eh_frame_size);
+    perf_map_jit_write_fully(buffer, eh_frame_size);
     perf_map_jit_write_fully(&f, sizeof(f));
 
     /* Write padding to maintain alignment */
@@ -1313,12 +663,85 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr,
     /* Write code load event and associated data */
     perf_map_jit_write_fully(&ev, sizeof(ev));
     perf_map_jit_write_fully(perf_map_entry, name_length+1);  // Include null terminator
-    perf_map_jit_write_fully((void*)(base), size);           // Copy actual machine code
+    /*
+     * Ensure each synthetic DSO has unique .text bytes.
+     *
+     * perf merges DSOs that share a build-id. Since trampolines can share
+     * identical code and unwind bytes, perf may resolve all JIT frames to
+     * the first symbol it saw (including entries from previous runs when
+     * build-id caching is enabled). Patch a small marker in the emitted
+     * bytes to make the build-id depend on a per-process salt and code id
+     * without modifying the live code.
+     */
+    uint64_t marker = perf_jit_map_state.build_id_salt ^
+        ((uint64_t)perf_jit_map_state.code_id << 32) ^
+        (uint64_t)code_size;
+    if (size >= sizeof(marker)) {
+        size_t prefix = size - sizeof(marker);
+        perf_map_jit_write_fully((void *)(base), prefix);
+        perf_map_jit_write_fully(&marker, sizeof(marker));
+    }
+    else if (size > 0) {
+        uint8_t tmp[sizeof(marker)];
+        memcpy(tmp, (void *)(base), size);
+        for (size_t i = 0; i < size; i++) {
+            tmp[i] ^= (uint8_t)(marker >> (i * 8));
+        }
+        perf_map_jit_write_fully(tmp, size);
+    }
 
     /* Clean up allocated memory */
     PyMem_RawFree(perf_map_entry);
 }
 
+/*
+ * Trampoline callback: write a jitdump entry for a Python code object.
+ *
+ * This is the function called by Python's trampoline system whenever a new
+ * piece of JIT-compiled code needs to be recorded. It only resolves the
+ * names to report; initializing the jitdump state, generating the unwind
+ * data and writing the events are all done by
+ * perf_map_jit_write_entry_with_name().
+ *
+ * Name resolution:
+ * - the entry name is the UTF-8 form of co->co_qualname
+ * - the file name is the UTF-8 form of co->co_filename
+ * - a name stays "" when `co` or the corresponding attribute is NULL
+ * - a NULL result from PyUnicode_AsUTF8() is forwarded as is; the helper
+ *   substitutes "" for NULL names
+ *
+ * Args:
+ *   state: Jitdump state, forwarded unchanged to the helper
+ *   code_addr: Address where the compiled code resides
+ *   code_size: Size of the compiled code in bytes
+ *   co: Python code object containing metadata (may be NULL)
+ *
+ * IMPORTANT: This function signature is part of Python's internal API
+ * and must not be changed without coordinating with core Python development.
+ */
+static void perf_map_jit_write_entry(void *state, const void *code_addr,
+                                     size_t code_size, PyCodeObject *co)
+{
+    const char *qualname_utf8 = "";
+    const char *filename_utf8 = "";
+    if (co != NULL && co->co_qualname != NULL) {
+        qualname_utf8 = PyUnicode_AsUTF8(co->co_qualname);
+    }
+    if (co != NULL && co->co_filename != NULL) {
+        filename_utf8 = PyUnicode_AsUTF8(co->co_filename);
+    }
+    perf_map_jit_write_entry_with_name(
+        state, code_addr, code_size, qualname_utf8, filename_utf8);
+}
+
+/* Emit a jitdump entry for a named code region without a PyCodeObject. */
+void
+_PyPerfJit_WriteNamedCode(const void *code_addr, size_t code_size,
+                          const char *entry, const char *filename)
+{
+    perf_map_jit_write_entry_with_name(NULL, code_addr, code_size, entry, filename);
+}
+
 // =============================================================================
 //                              CLEANUP AND FINALIZATION
 // =============================================================================
diff --git a/Python/perf_trampoline.c b/Python/perf_trampoline.c
index 0d835f3b7f56a9..58c61e64bfc4e9 100644
--- a/Python/perf_trampoline.c
+++ b/Python/perf_trampoline.c
@@ -243,7 +243,7 @@ perf_trampoline_code_watcher(PyCodeEvent event, PyCodeObject *co)
 
 static void
 perf_map_write_entry(void *state, const void *code_addr,
-                         unsigned int code_size, PyCodeObject *co)
+                         size_t code_size, PyCodeObject *co)
 {
     const char *entry = "";
     if (co->co_qualname != NULL) {
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index ce9c03bda7bd57..dfc3158f702bfe 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -2706,7 +2706,7 @@ PyAPI_FUNC(int) PyUnstable_PerfMapState_Init(void) {
 
 PyAPI_FUNC(int) PyUnstable_WritePerfMapEntry(
     const void *code_addr,
-    unsigned int code_size,
+    size_t code_size,
     const char *entry_name
 ) {
 #ifndef MS_WINDOWS
@@ -2717,7 +2717,7 @@ PyAPI_FUNC(int) PyUnstable_WritePerfMapEntry(
         }
     }
     PyThread_acquire_lock(perf_map_state.map_lock, 1);
-    fprintf(perf_map_state.perf_map, "%" PRIxPTR " %x %s\n", (uintptr_t) code_addr, code_size, entry_name);
+    fprintf(perf_map_state.perf_map, "%" PRIxPTR " %zx %s\n", (uintptr_t) code_addr, code_size, entry_name);
     fflush(perf_map_state.perf_map);
     PyThread_release_lock(perf_map_state.map_lock);
 #endif
diff --git a/Tools/build/smelly.py b/Tools/build/smelly.py
index 7197d70bc8bd0c..17547d4d916e9f 100755
--- a/Tools/build/smelly.py
+++ b/Tools/build/smelly.py
@@ -25,6 +25,8 @@
 # "Legacy": some old symbols are prefixed by "PY_".
 EXCEPTIONS = frozenset({
     'PY_TIMEOUT_MAX',
+    '__jit_debug_descriptor',  # GDB JIT interface symbol; name dictated by GDB
+    '__jit_debug_register_code',  # GDB JIT interface symbol; name dictated by GDB
 })
 
 IGNORED_EXTENSION = "_ctypes_test"
diff --git a/Tools/c-analyzer/cpython/_parser.py b/Tools/c-analyzer/cpython/_parser.py
index a251a045b91144..a16d5773d5544c 100644
--- a/Tools/c-analyzer/cpython/_parser.py
+++ b/Tools/c-analyzer/cpython/_parser.py
@@ -324,8 +324,10 @@ def format_tsv_lines(lines):
     _abs('Objects/stringlib/unicode_format.h'): (10_000, 400),
     _abs('Objects/typeobject.c'): (380_000, 13_000),
     _abs('Python/compile.c'): (20_000, 500),
+    _abs('Python/jit_unwind.c'): (20_000, 300),
     _abs('Python/optimizer.c'): (100_000, 5_000),
     _abs('Python/parking_lot.c'): (40_000, 1000),
+    _abs('Python/perf_jit_trampoline.c'): (40_000, 1000),
     _abs('Python/pylifecycle.c'): (750_000, 5000),
     _abs('Python/pystate.c'): (750_000, 5000),
     _abs('Python/initconfig.c'): (50_000, 500),
diff --git a/Tools/c-analyzer/cpython/ignored.tsv b/Tools/c-analyzer/cpython/ignored.tsv
index d2489387f46caa..1b1a208dcfde0f 100644
--- a/Tools/c-analyzer/cpython/ignored.tsv
+++ b/Tools/c-analyzer/cpython/ignored.tsv
@@ -386,6 +386,8 @@ Python/intrinsics.c	-	_PyIntrinsics_UnaryFunctions	-
 Python/intrinsics.c	-	_PyIntrinsics_BinaryFunctions	-
 Python/lock.c	-	TIME_TO_BE_FAIR_NS	-
 Python/opcode_targets.h	-	opcode_targets	-
+Python/jit_unwind.c	-	__jit_debug_descriptor	-
+Python/jit_unwind.c	-	jit_debug_mutex	-
 Python/perf_trampoline.c	-	_Py_perfmap_callbacks	-
 Python/perf_jit_trampoline.c	-	_Py_perfmap_jit_callbacks	-
 Python/perf_jit_trampoline.c	-	perf_jit_map_state	-
diff --git a/Tools/jit/_eh_frame.py b/Tools/jit/_eh_frame.py
new file mode 100644
index 00000000000000..a211c1e352a726
--- /dev/null
+++ b/Tools/jit/_eh_frame.py
@@ -0,0 +1,112 @@
+"""Minimal DWARF .eh_frame parser for the JIT shim's CFI extraction.
+
+Reads a compiled object's .eh_frame section and returns the bytes we
+need to splice into the runtime-emitted EH frame in Python/jit_unwind.c:
+the CIE's initial CFI, the FDE's CFI, and the CIE's alignment factors
+and RA column.
+"""
+
+import _stencils
+
+
+def _read_uleb128(data: bytes, pos: int) -> tuple[int, int]:
+    """Decode the unsigned LEB128 at data[pos]; return (value, bytes consumed)."""
+    value, used = 0, 0
+    while True:
+        byte = data[pos + used]
+        value |= (byte & 0x7F) << (7 * used)
+        used += 1
+        if byte < 0x80:
+            return value, used
+
+
+def _read_sleb128(data: bytes, pos: int) -> tuple[int, int]:
+    """Decode the signed LEB128 at data[pos]; return (value, bytes consumed)."""
+    value, used = 0, 0
+    while True:
+        byte = data[pos + used]
+        value |= (byte & 0x7F) << (7 * used)
+        used += 1
+        if byte & 0x80:
+            continue
+        sign_bit = 1 << (7 * used - 1)  # bit 6 of the final byte
+        return (value - (sign_bit << 1) if value & sign_bit else value), used
+
+
+def parse(data: bytes) -> _stencils.ShimCfi:
+    """
+    Parse a compiled .eh_frame section and return the shim's CFI bytes.
+
+    Extracts the CIE's initial CFI (state at function entry) and the FDE's
+    CFI (prologue transitions) as raw bytes, plus the CIE's CAF/DAF/RA
+    column so jit_unwind.c can emit a matching synthetic CIE at runtime.
+
+    Assumptions:
+    - The object has exactly one CIE covering one FDE (the shim).
+    - The CIE is version 1 with "z" augmentation, and the FDE pointer
+      encoding is fixed-width (absptr, [su]data4 or [su]data8); these are
+      asserted. PC+range are skipped since we re-emit them at runtime.
+    """
+    pos = 0
+    cie_init_cfi: bytes | None = None
+    code_align = data_align = ra_column = None
+    fde_cfi: bytes | None = None
+    cie_has_z = False
+    # Byte width per DW_EH_PE value format (low nibble); absptr assumes 64-bit.
+    ptr_widths = {0x00: 8, 0x03: 4, 0x04: 8, 0x0B: 4, 0x0C: 8}
+    fde_ptr_len = ptr_widths[0x00]  # absptr unless an "R" entry overrides it
+    while pos < len(data):
+        length = int.from_bytes(data[pos : pos + 4], "little")
+        if length == 0:
+            break
+        entry_end = pos + 4 + length
+        pos += 4
+        cie_id = int.from_bytes(data[pos : pos + 4], "little")
+        pos += 4
+        if cie_id == 0:
+            assert data[pos] == 1, "only DWARF CIE version 1 supported"
+            pos += 1
+            aug_end = data.index(0, pos)
+            augmentation = data[pos:aug_end].decode("ascii")
+            pos = aug_end + 1
+            code_align, n = _read_uleb128(data, pos)
+            pos += n
+            data_align, n = _read_sleb128(data, pos)
+            pos += n
+            ra_column = data[pos]
+            pos += 1
+            if "z" in augmentation:
+                cie_has_z = True
+                aug_data_len, n = _read_uleb128(data, pos)
+                pos += n
+                aug_data_start = pos
+                for ch in augmentation[1:]:  # skip 'z'
+                    if ch == "R":
+                        # pc_start and pc_range both use this value format.
+                        fde_ptr_len = ptr_widths.get(data[pos] & 0x0F, 0)
+                        assert fde_ptr_len, f"unsupported FDE encoding {data[pos]:#x}"
+                        pos += 1
+                    elif ch in ("L", "P"):
+                        raise AssertionError(
+                            f"shim .eh_frame has augmentation {ch!r}; unsupported"
+                        )
+                pos = aug_data_start + aug_data_len
+            # Trailing 0-bytes are DW_CFA_nop alignment padding; strip
+            # them so the splice at runtime re-aligns without double-pad.
+            cie_init_cfi = bytes(data[pos:entry_end]).rstrip(b"\x00")
+        else:
+            assert cie_init_cfi is not None, "FDE before CIE"
+            assert cie_has_z, "CIE without 'z' augmentation is unsupported"
+            pos += 2 * fde_ptr_len  # pc_start + pc_range (re-emitted at runtime)
+            aug_data_len, n = _read_uleb128(data, pos)
+            pos += n + aug_data_len
+            fde_cfi = bytes(data[pos:entry_end]).rstrip(b"\x00")  # nop padding
+        pos = entry_end
+    assert cie_init_cfi is not None and fde_cfi is not None, "no CIE/FDE in .eh_frame"
+    return _stencils.ShimCfi(
+        cie_init_cfi=cie_init_cfi,
+        fde_cfi=fde_cfi,
+        code_align=code_align,
+        data_align=data_align,
+        ra_column=ra_column,
+    )
diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py
index 258de8ab3136a4..ad50933b92b605 100644
--- a/Tools/jit/_stencils.py
+++ b/Tools/jit/_stencils.py
@@ -246,6 +246,22 @@ def pad(self, alignment: int) -> None:
         self.body.extend([0] * padding)
 
 
+@dataclasses.dataclass
+class ShimCfi:
+    """
+    DWARF CFI bytes captured from the compiled shim's .eh_frame.
+
+    These let jit_unwind.c emit an accurate GDB EH frame for the shim
+    without hand-rolling DWARF that tracks the compiler's prologue.
+    """
+
+    cie_init_cfi: bytes  # CIE initial instructions, trailing nop padding stripped
+    fde_cfi: bytes  # FDE instructions, trailing nop padding stripped
+    code_align: int  # CIE code alignment factor (ULEB128 in the section)
+    data_align: int  # CIE data alignment factor (SLEB128, may be negative)
+    ra_column: int  # CIE return-address register column
+
+
 @dataclasses.dataclass
 class StencilGroup:
     """
@@ -259,6 +275,7 @@ class StencilGroup:
     symbols: dict[int | str, tuple[HoleValue, int]] = dataclasses.field(
         default_factory=dict, init=False
     )
+    shim_cfi: ShimCfi | None = dataclasses.field(default=None, init=False)
     _jit_symbol_table: dict[str, int] = dataclasses.field(
         default_factory=dict, init=False
     )
diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py
index 787fcf53260f3d..e94f351966f1d3 100644
--- a/Tools/jit/_targets.py
+++ b/Tools/jit/_targets.py
@@ -12,6 +12,7 @@
 import typing
 import shlex
 
+import _eh_frame
 import _llvm
 import _optimizers
 import _schema
@@ -163,10 +164,10 @@ async def _compile(
             # __FILE__ macro and assert failure messages) for reproducibility:
             f"-ffile-prefix-map={CPYTHON}=.",
             f"-ffile-prefix-map={tempdir}=.",
-            # This debug info isn't necessary, and bloats out the JIT'ed code.
-            # We *may* be able to re-enable this, process it, and JIT it for a
-            # nicer debugging experience... but that needs a lot more research:
-            "-fno-asynchronous-unwind-tables",
+            # Unwind info is per-stencil (see below): disabled for
+            # executors (bloats the JIT'ed code with info we'd have to
+            # process and re-JIT), enabled for the shim because its
+            # .eh_frame is what jit_unwind.c ships to GDB.
             # Don't call built-in functions that we can't find or patch:
             "-fno-builtin",
             # Don't call stack-smashing canaries that we can't find or patch:
@@ -177,6 +178,16 @@ async def _compile(
             f"{c}",
         ]
         is_shim = opname == "shim"
+        # Executors ride the pinned-frame-pointer invariant so we can
+        # synthesize their CFI by hand. Only the Linux/ELF shim consumes
+        # compiler-emitted unwind tables; enabling them on COFF/Mach-O
+        # would introduce platform unwind relocations our parsers do not
+        # handle.
+        args_s.append(
+            "-fasynchronous-unwind-tables"
+            if is_shim and isinstance(self, _ELF)
+            else "-fno-asynchronous-unwind-tables"
+        )
         if self.frame_pointers:
             frame_pointer = "all" if is_shim else "reserved"
             args_s += ["-Xclang", f"-mframe-pointer={frame_pointer}"]
@@ -388,6 +399,7 @@ def _handle_section(
         self, section: _schema.ELFSection, group: _stencils.StencilGroup
     ) -> None:
         section_type = section["Type"]["Name"]
+        section_name = section["Name"]["Name"]
         flags = {flag["Name"] for flag in section["Flags"]["Flags"]}
         if section_type == "SHT_RELA":
             assert "SHF_INFO_LINK" in flags, flags
@@ -406,6 +418,15 @@ def _handle_section(
                 relocation = wrapped_relocation["Relocation"]
                 hole = self._handle_relocation(base, relocation, stencil.body)
                 stencil.holes.append(hole)
+        elif section_name == ".eh_frame":
+            if "SHF_ALLOC" not in flags:
+                return
+            # LLVM 21 emits x86_64 .eh_frame as SHT_X86_64_UNWIND.
+            assert section_type in {"SHT_PROGBITS", "SHT_X86_64_UNWIND"}, (
+                section_type
+            )
+            group.shim_cfi = _eh_frame.parse(bytes(section["SectionData"]["Bytes"]))
+            return
         elif section_type == "SHT_PROGBITS":
             if "SHF_ALLOC" not in flags:
                 return
diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py
index 5fd9a2ee2d6e58..2768811ab75a9d 100644
--- a/Tools/jit/_writer.py
+++ b/Tools/jit/_writer.py
@@ -7,6 +7,38 @@
 import _stencils
 
 
+def _byte_rows(blob: bytes, per_row: int) -> typing.Iterator[str]:
+    for i in range(0, len(blob), per_row):
+        yield " ".join(f"{b:#04x}," for b in blob[i : i + per_row])
+
+
+def _dump_shim_cfi(group: _stencils.StencilGroup) -> typing.Iterator[str]:
+    # Only ELF objects carry a .eh_frame we can capture; on Mach-O and
+    # COFF the unwind info lives in different sections (and the GDB JIT
+    # interface this feeds is Linux+ELF-only anyway — see the #ifdef in
+    # Python/jit.c jit_record_code()). Custom CFLAGS may also suppress
+    # the shim's .eh_frame even on Linux+ELF, so always emit a feature
+    # guard before any optional symbol definitions.
+    cfi = group.shim_cfi
+    if cfi is None:
+        yield "#define _Py_JIT_HAS_SHIM_CFI 0"
+        yield ""
+        return
+    yield "#define _Py_JIT_HAS_SHIM_CFI 1"
+    for name, blob in (
+        ("_Py_jit_shim_cie_init_cfi", cfi.cie_init_cfi),
+        ("_Py_jit_shim_fde_cfi",      cfi.fde_cfi),
+    ):
+        yield f"static const uint8_t {name}[{len(blob)}] = {{"
+        for row in _byte_rows(blob, per_row=12):
+            yield f"    {row}"
+        yield "};"
+    yield f"#define _Py_jit_shim_code_align {cfi.code_align}"
+    yield f"#define _Py_jit_shim_data_align {cfi.data_align}"
+    yield f"#define _Py_jit_shim_ra_column  {cfi.ra_column}"
+    yield ""
+
+
 def _dump_footer(
     groups: dict[str, _stencils.StencilGroup], symbols: dict[str, int]
 ) -> typing.Iterator[str]:
@@ -23,6 +55,7 @@ def _dump_footer(
     yield "    symbol_mask got_mask;"
     yield "} StencilGroup;"
     yield ""
+    yield from _dump_shim_cfi(groups["shim"])
     yield f"static const StencilGroup shim = {groups['shim'].as_c('shim')};"
     yield ""
     yield "static const StencilGroup stencil_groups[MAX_UOP_REGS_ID + 1] = {"
@@ -53,8 +86,7 @@ def _dump_stencil(opname: str, group: _stencils.StencilGroup) -> typing.Iterator
         stripped = stencil.body.rstrip(b"\x00")
         if stripped:
             yield f"    const unsigned char {part}_body[{len(stencil.body)}] = {{"
-            for i in range(0, len(stripped), 8):
-                row = " ".join(f"{byte:#04x}," for byte in stripped[i : i + 8])
+            for row in _byte_rows(stripped, per_row=8):
                 yield f"        {row}"
             yield "    };"
     # Data is written first (so relaxations in the code work properly):