Skip to content

Commit 1554a94

Browse files
Merge branch 'main' into stackref_tests
2 parents 1cb4607 + 209eaff commit 1554a94

44 files changed

Lines changed: 2479 additions & 1078 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/jit.yml

Lines changed: 14 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -57,9 +57,10 @@ jobs:
5757
fail-fast: false
5858
matrix:
5959
target:
60-
- i686-pc-windows-msvc/msvc
61-
- x86_64-pc-windows-msvc/msvc
62-
- aarch64-pc-windows-msvc/msvc
60+
# To re-enable later when we support these.
61+
# - i686-pc-windows-msvc/msvc
62+
# - x86_64-pc-windows-msvc/msvc
63+
# - aarch64-pc-windows-msvc/msvc
6364
- x86_64-apple-darwin/clang
6465
- aarch64-apple-darwin/clang
6566
- x86_64-unknown-linux-gnu/gcc
@@ -70,15 +71,16 @@ jobs:
7071
llvm:
7172
- 21
7273
include:
73-
- target: i686-pc-windows-msvc/msvc
74-
architecture: Win32
75-
runner: windows-2022
76-
- target: x86_64-pc-windows-msvc/msvc
77-
architecture: x64
78-
runner: windows-2022
79-
- target: aarch64-pc-windows-msvc/msvc
80-
architecture: ARM64
81-
runner: windows-11-arm
74+
# To re-enable later when we support these.
75+
# - target: i686-pc-windows-msvc/msvc
76+
# architecture: Win32
77+
# runner: windows-2022
78+
# - target: x86_64-pc-windows-msvc/msvc
79+
# architecture: x64
80+
# runner: windows-2022
81+
# - target: aarch64-pc-windows-msvc/msvc
82+
# architecture: ARM64
83+
# runner: windows-11-arm
8284
- target: x86_64-apple-darwin/clang
8385
architecture: x86_64
8486
runner: macos-15-intel

Include/cpython/pystats.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -150,6 +150,8 @@ typedef struct _optimization_stats {
150150
uint64_t optimized_trace_length_hist[_Py_UOP_HIST_SIZE];
151151
uint64_t optimizer_attempts;
152152
uint64_t optimizer_successes;
153+
uint64_t optimizer_contradiction;
154+
uint64_t optimizer_frame_overflow;
153155
uint64_t optimizer_failure_reason_no_memory;
154156
uint64_t remove_globals_builtins_changed;
155157
uint64_t remove_globals_incorrect_keys;

Include/internal/pycore_backoff.h

Lines changed: 15 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -95,11 +95,24 @@ backoff_counter_triggers(_Py_BackoffCounter counter)
9595
return counter.value_and_backoff < UNREACHABLE_BACKOFF;
9696
}
9797

98+
static inline _Py_BackoffCounter
99+
trigger_backoff_counter(void)
100+
{
101+
_Py_BackoffCounter result;
102+
result.value_and_backoff = 0;
103+
return result;
104+
}
105+
98106
// Initial JUMP_BACKWARD counter.
99107
// Must be larger than ADAPTIVE_COOLDOWN_VALUE, otherwise when JIT code is
100108
// invalidated we may construct a new trace before the bytecode has properly
101109
// re-specialized:
102-
#define JUMP_BACKWARD_INITIAL_VALUE 4095
110+
// Note: this should be one less than a prime number, which increases the
111+
// likelihood of finding a "good" loop iteration to trace.
112+
// For example, 4095 does not work for the nqueens benchmark in pyperformance,
113+
// as we always end up tracing the loop's exhaustion iteration,
114+
// which aborts our current tracer.
115+
#define JUMP_BACKWARD_INITIAL_VALUE 4000
103116
#define JUMP_BACKWARD_INITIAL_BACKOFF 12
104117
static inline _Py_BackoffCounter
105118
initial_jump_backoff_counter(void)
@@ -112,7 +125,7 @@ initial_jump_backoff_counter(void)
112125
* Must be larger than ADAPTIVE_COOLDOWN_VALUE,
113126
* otherwise when a side exit warms up we may construct
114127
* a new trace before the Tier 1 code has properly re-specialized. */
115-
#define SIDE_EXIT_INITIAL_VALUE 4095
128+
#define SIDE_EXIT_INITIAL_VALUE 4000
116129
#define SIDE_EXIT_INITIAL_BACKOFF 12
117130

118131
static inline _Py_BackoffCounter

Include/internal/pycore_ceval.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -392,6 +392,8 @@ _PyForIter_VirtualIteratorNext(PyThreadState* tstate, struct _PyInterpreterFrame
392392
#define SPECIAL___AEXIT__ 3
393393
#define SPECIAL_MAX 3
394394

395+
PyAPI_DATA(const _Py_CODEUNIT *) _Py_INTERPRETER_TRAMPOLINE_INSTRUCTIONS_PTR;
396+
395397
#ifdef __cplusplus
396398
}
397399
#endif

Include/internal/pycore_interp_structs.h

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -14,8 +14,6 @@ extern "C" {
1414
#include "pycore_structs.h" // PyHamtObject
1515
#include "pycore_tstate.h" // _PyThreadStateImpl
1616
#include "pycore_typedefs.h" // _PyRuntimeState
17-
#include "pycore_uop.h" // struct _PyUOpInstruction
18-
1917

2018
#define CODE_MAX_WATCHERS 8
2119
#define CONTEXT_MAX_WATCHERS 8
@@ -934,10 +932,10 @@ struct _is {
934932
PyObject *common_consts[NUM_COMMON_CONSTANTS];
935933
bool jit;
936934
bool compiling;
937-
struct _PyUOpInstruction *jit_uop_buffer;
938935
struct _PyExecutorObject *executor_list_head;
939936
struct _PyExecutorObject *executor_deletion_list_head;
940937
struct _PyExecutorObject *cold_executor;
938+
struct _PyExecutorObject *cold_dynamic_executor;
941939
int executor_deletion_list_remaining_capacity;
942940
size_t executor_creation_counter;
943941
_rare_events rare_events;

Include/internal/pycore_opcode_metadata.h

Lines changed: 39 additions & 32 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Include/internal/pycore_optimizer.h

Lines changed: 25 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -21,14 +21,6 @@ typedef struct _PyExecutorLinkListNode {
2121
} _PyExecutorLinkListNode;
2222

2323

24-
/* Bloom filter with m = 256
25-
* https://en.wikipedia.org/wiki/Bloom_filter */
26-
#define _Py_BLOOM_FILTER_WORDS 8
27-
28-
typedef struct {
29-
uint32_t bits[_Py_BLOOM_FILTER_WORDS];
30-
} _PyBloomFilter;
31-
3224
typedef struct {
3325
uint8_t opcode;
3426
uint8_t oparg;
@@ -44,7 +36,9 @@ typedef struct {
4436

4537
typedef struct _PyExitData {
4638
uint32_t target;
47-
uint16_t index;
39+
uint16_t index:14;
40+
uint16_t is_dynamic:1;
41+
uint16_t is_control_flow:1;
4842
_Py_BackoffCounter temperature;
4943
struct _PyExecutorObject *executor;
5044
} _PyExitData;
@@ -94,9 +88,8 @@ PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyInterpreterState *interp);
9488
// This value is arbitrary and was not optimized.
9589
#define JIT_CLEANUP_THRESHOLD 1000
9690

97-
#define TRACE_STACK_SIZE 5
98-
99-
int _Py_uop_analyze_and_optimize(_PyInterpreterFrame *frame,
91+
int _Py_uop_analyze_and_optimize(
92+
PyFunctionObject *func,
10093
_PyUOpInstruction *trace, int trace_len, int curr_stackentries,
10194
_PyBloomFilter *dependencies);
10295

@@ -130,7 +123,7 @@ static inline uint16_t uop_get_error_target(const _PyUOpInstruction *inst)
130123
#define TY_ARENA_SIZE (UOP_MAX_TRACE_LENGTH * 5)
131124

132125
// Need extras for root frame and for overflow frame (see TRACE_STACK_PUSH())
133-
#define MAX_ABSTRACT_FRAME_DEPTH (TRACE_STACK_SIZE + 2)
126+
#define MAX_ABSTRACT_FRAME_DEPTH (16)
134127

135128
// The maximum number of side exits that we can take before requiring forward
136129
// progress (and inserting a new ENTER_EXECUTOR instruction). In practice, this
@@ -258,6 +251,7 @@ struct _Py_UOpsAbstractFrame {
258251
int stack_len;
259252
int locals_len;
260253
PyFunctionObject *func;
254+
PyCodeObject *code;
261255

262256
JitOptRef *stack_pointer;
263257
JitOptRef *stack;
@@ -333,11 +327,11 @@ extern _Py_UOpsAbstractFrame *_Py_uop_frame_new(
333327
int curr_stackentries,
334328
JitOptRef *args,
335329
int arg_len);
336-
extern int _Py_uop_frame_pop(JitOptContext *ctx);
330+
extern int _Py_uop_frame_pop(JitOptContext *ctx, PyCodeObject *co, int curr_stackentries);
337331

338332
PyAPI_FUNC(PyObject *) _Py_uop_symbols_test(PyObject *self, PyObject *ignored);
339333

340-
PyAPI_FUNC(int) _PyOptimizer_Optimize(_PyInterpreterFrame *frame, _Py_CODEUNIT *start, _PyExecutorObject **exec_ptr, int chain_depth);
334+
PyAPI_FUNC(int) _PyOptimizer_Optimize(_PyInterpreterFrame *frame, PyThreadState *tstate);
341335

342336
static inline _PyExecutorObject *_PyExecutor_FromExit(_PyExitData *exit)
343337
{
@@ -346,6 +340,7 @@ static inline _PyExecutorObject *_PyExecutor_FromExit(_PyExitData *exit)
346340
}
347341

348342
extern _PyExecutorObject *_PyExecutor_GetColdExecutor(void);
343+
extern _PyExecutorObject *_PyExecutor_GetColdDynamicExecutor(void);
349344

350345
PyAPI_FUNC(void) _PyExecutor_ClearExit(_PyExitData *exit);
351346

@@ -354,7 +349,9 @@ static inline int is_terminator(const _PyUOpInstruction *uop)
354349
int opcode = uop->opcode;
355350
return (
356351
opcode == _EXIT_TRACE ||
357-
opcode == _JUMP_TO_TOP
352+
opcode == _DEOPT ||
353+
opcode == _JUMP_TO_TOP ||
354+
opcode == _DYNAMIC_EXIT
358355
);
359356
}
360357

@@ -365,6 +362,18 @@ PyAPI_FUNC(int) _PyDumpExecutors(FILE *out);
365362
extern void _Py_ClearExecutorDeletionList(PyInterpreterState *interp);
366363
#endif
367364

365+
int _PyJit_translate_single_bytecode_to_trace(PyThreadState *tstate, _PyInterpreterFrame *frame, _Py_CODEUNIT *next_instr, bool stop_tracing);
366+
367+
int
368+
_PyJit_TryInitializeTracing(PyThreadState *tstate, _PyInterpreterFrame *frame,
369+
_Py_CODEUNIT *curr_instr, _Py_CODEUNIT *start_instr,
370+
_Py_CODEUNIT *close_loop_instr, int curr_stackdepth, int chain_depth, _PyExitData *exit,
371+
int oparg);
372+
373+
void _PyJit_FinalizeTracing(PyThreadState *tstate);
374+
375+
void _PyJit_Tracer_InvalidateDependency(PyThreadState *old_tstate, void *obj);
376+
368377
#ifdef __cplusplus
369378
}
370379
#endif

Include/internal/pycore_tstate.h

Lines changed: 37 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,8 @@ extern "C" {
1212
#include "pycore_freelist_state.h" // struct _Py_freelists
1313
#include "pycore_mimalloc.h" // struct _mimalloc_thread_state
1414
#include "pycore_qsbr.h" // struct qsbr
15-
15+
#include "pycore_uop.h" // struct _PyUOpInstruction
16+
#include "pycore_structs.h"
1617

1718
#ifdef Py_GIL_DISABLED
1819
struct _gc_thread_state {
@@ -21,6 +22,38 @@ struct _gc_thread_state {
2122
};
2223
#endif
2324

25+
#if _Py_TIER2
26+
typedef struct _PyJitTracerInitialState {
27+
int stack_depth;
28+
int chain_depth;
29+
struct _PyExitData *exit;
30+
PyCodeObject *code; // Strong
31+
PyFunctionObject *func; // Strong
32+
_Py_CODEUNIT *start_instr;
33+
_Py_CODEUNIT *close_loop_instr;
34+
_Py_CODEUNIT *jump_backward_instr;
35+
} _PyJitTracerInitialState;
36+
37+
typedef struct _PyJitTracerPreviousState {
38+
bool dependencies_still_valid;
39+
bool instr_is_super;
40+
int code_max_size;
41+
int code_curr_size;
42+
int instr_oparg;
43+
int instr_stacklevel;
44+
_Py_CODEUNIT *instr;
45+
PyCodeObject *instr_code; // Strong
46+
struct _PyInterpreterFrame *instr_frame;
47+
_PyBloomFilter dependencies;
48+
} _PyJitTracerPreviousState;
49+
50+
typedef struct _PyJitTracerState {
51+
_PyUOpInstruction *code_buffer;
52+
_PyJitTracerInitialState initial_state;
53+
_PyJitTracerPreviousState prev_state;
54+
} _PyJitTracerState;
55+
#endif
56+
2457
// Every PyThreadState is actually allocated as a _PyThreadStateImpl. The
2558
// PyThreadState fields are exposed as part of the C API, although most fields
2659
// are intended to be private. The _PyThreadStateImpl fields are not exposed.
@@ -85,7 +118,9 @@ typedef struct _PyThreadStateImpl {
85118
#if defined(Py_REF_DEBUG) && defined(Py_GIL_DISABLED)
86119
Py_ssize_t reftotal; // this thread's total refcount operations
87120
#endif
88-
121+
#if _Py_TIER2
122+
_PyJitTracerState jit_tracer_state;
123+
#endif
89124
} _PyThreadStateImpl;
90125

91126
#ifdef __cplusplus

Include/internal/pycore_uop.h

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -35,10 +35,18 @@ typedef struct _PyUOpInstruction{
3535
#endif
3636
} _PyUOpInstruction;
3737

38-
// This is the length of the trace we project initially.
39-
#define UOP_MAX_TRACE_LENGTH 1200
38+
// This is the length of the trace we translate initially.
39+
#define UOP_MAX_TRACE_LENGTH 3000
4040
#define UOP_BUFFER_SIZE (UOP_MAX_TRACE_LENGTH * sizeof(_PyUOpInstruction))
4141

42+
/* Bloom filter with m = 256
43+
* https://en.wikipedia.org/wiki/Bloom_filter */
44+
#define _Py_BLOOM_FILTER_WORDS 8
45+
46+
typedef struct {
47+
uint32_t bits[_Py_BLOOM_FILTER_WORDS];
48+
} _PyBloomFilter;
49+
4250
#ifdef __cplusplus
4351
}
4452
#endif

0 commit comments

Comments
 (0)