Skip to content

Commit e0cf241

Browse files
committed
Address code review
1 parent a08df36 commit e0cf241

File tree

2 files changed

+103
-34
lines changed

2 files changed

+103
-34
lines changed

Python/jit.c

Lines changed: 99 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -582,16 +582,29 @@ combine_symbol_mask(const symbol_mask src, symbol_mask dest)
582582

583583
// Manual code emission for _LOAD_FAST_BORROW.
584584
// Instead of using stencils, we directly encode the load + borrow-tag sequence.
585-
// This covers ALL oparg values with no data section.
586-
// See https://godbolt.org/z/P66e5dP3z for reference.
585+
// Stencil generation for _LOAD_FAST_BORROW register variants is skipped
586+
// in Tools/jit/_targets.py since manual codegen handles all reachable opargs.
587+
// References:
588+
// x86-64: https://godbolt.org/z/5oWMTeqod
589+
// i686: https://godbolt.org/z/1EEP75Wda
590+
// TODO: With dynasm, the per-architecture #ifdef branches below could be
591+
// replaced by a single portable emission sequence.
592+
593+
// Max oparg for manual codegen. Conservative (AArch64 imm12 limit).
594+
// TODO: could be set per-architecture (x86 disp32 has a much higher limit).
595+
#define _LOAD_FAST_BORROW_MAX_OPARG 4085
587596

588597
// Decode a _LOAD_FAST_BORROW* opcode into register variant and oparg.
589598
// Returns 1 if the opcode is a _LOAD_FAST_BORROW variant, 0 otherwise.
599+
// Falls back to stencil pipeline for oparg values too large for manual codegen.
590600
static int
591601
_decode_load_fast_borrow(uint16_t opcode, uint16_t insn_oparg,
592602
int *reg_variant, int *oparg)
593603
{
594604
if (opcode >= _LOAD_FAST_BORROW_r01 && opcode <= _LOAD_FAST_BORROW_r23) {
605+
if (insn_oparg > _LOAD_FAST_BORROW_MAX_OPARG) {
606+
return 0;
607+
}
595608
*reg_variant = opcode - _LOAD_FAST_BORROW_r01;
596609
*oparg = insn_oparg;
597610
return 1;
@@ -603,7 +616,13 @@ _decode_load_fast_borrow(uint16_t opcode, uint16_t insn_oparg,
603616

604617
// AArch64: ldr x8, [x21, #off] ; orr xDST, x8, #1 (8 bytes, no data)
605618
// preserve_none CC: x21=frame, x24/x25/x26=cache0/1/2
606-
#define LOAD_FAST_BORROW_CODE_SIZE 8
619+
620+
// Size in bytes of the manually emitted _LOAD_FAST_BORROW sequence.
// On AArch64 the sequence is two fixed-width instructions (ldr + orr),
// so the size never depends on the operand offset.
static int
_load_fast_borrow_code_size(int oparg_unused)
{
    (void)oparg_unused;  // every oparg encodes in the same 8 bytes
    return 8;
}
607626

608627
// preserve_none CC cache registers on AArch64: x24/x25/x26 hold cache0/1/2.
static const uint32_t _aarch64_cache_regs[3] = {24, 25, 26};
609628

@@ -627,9 +646,11 @@ _emit_load_fast_borrow(unsigned char *code, int reg_variant, int oparg)
627646

628647
#elif defined(__x86_64__) || defined(_M_X64)
629648

630-
// x86_64: mov rDST, [r13 + disp32] ; or rDST, 1 (11 bytes, no data)
649+
// x86_64: mov rDST, [r13 + disp] ; or rDST, 1
650+
// disp8 (8 bytes) when byte_offset <= 127, disp32 (11 bytes) otherwise.
631651
// preserve_none CC (Clang 19+): r13=frame, rdi/rsi/rdx=cache0/1/2
632-
#define LOAD_FAST_BORROW_CODE_SIZE 11
652+
#define LOAD_FAST_BORROW_CODE_SIZE_DISP8 8
653+
#define LOAD_FAST_BORROW_CODE_SIZE_DISP32 11
633654

634655
// 3-bit register encodings for ModRM
635656
static const uint8_t _x86_64_cache_regs[3] = {
@@ -638,33 +659,60 @@ static const uint8_t _x86_64_cache_regs[3] = {
638659
2, // RDX (cache2, r23)
639660
};
640661

662+
static int
663+
_load_fast_borrow_code_size(int oparg)
664+
{
665+
uint32_t byte_offset = (uint32_t)(offsetof(_PyInterpreterFrame, localsplus)
666+
+ (unsigned)oparg * sizeof(_PyStackRef));
667+
return byte_offset <= 127
668+
? LOAD_FAST_BORROW_CODE_SIZE_DISP8
669+
: LOAD_FAST_BORROW_CODE_SIZE_DISP32;
670+
}
671+
641672
// Emit the machine code for one _LOAD_FAST_BORROW into `code` (x86-64):
//   mov rDST, [r13 + disp]   ; load localsplus[oparg] from the frame (r13)
//   or  rDST, 1              ; set the borrow tag bit
// Uses disp8 addressing when the offset fits in a signed byte, disp32
// otherwise. The caller must have reserved exactly
// _load_fast_borrow_code_size(oparg) bytes at `code`.
static void
_emit_load_fast_borrow(unsigned char *code, int reg_variant, int oparg)
{
    uint32_t byte_offset = (uint32_t)(offsetof(_PyInterpreterFrame, localsplus)
                                      + (unsigned)oparg * sizeof(_PyStackRef));
    uint8_t dst = _x86_64_cache_regs[reg_variant];

    if (byte_offset <= 127) {
        // mov rDST, [r13 + disp8]
        code[0] = 0x49;                  // REX.W=1, REX.B=1 (r13)
        code[1] = 0x8B;                  // MOV r64, r/m64
        code[2] = 0x45 | (dst << 3);     // ModRM: mod=01, reg=dst, r/m=101(r13)
        code[3] = (uint8_t)byte_offset;  // disp8

        // or rDST, 1
        code[4] = 0x48;                  // REX.W=1
        code[5] = 0x83;                  // OR r/m64, imm8
        code[6] = 0xC8 | dst;            // ModRM: mod=11, reg=001(/1), r/m=dst
        code[7] = 0x01;                  // imm8 = 1
    }
    else {
        // mov rDST, [r13 + disp32]
        code[0] = 0x49;                  // REX.W=1, REX.B=1 (r13)
        code[1] = 0x8B;                  // MOV r64, r/m64
        code[2] = 0x85 | (dst << 3);     // ModRM: mod=10, reg=dst, r/m=101(r13)
        memcpy(code + 3, &byte_offset, 4); // disp32 (x86 is little-endian)

        // or rDST, 1
        code[7] = 0x48;                  // REX.W=1
        code[8] = 0x83;                  // OR r/m64, imm8
        code[9] = 0xC8 | dst;            // ModRM: mod=11, reg=001(/1), r/m=dst
        code[10] = 0x01;                 // imm8 = 1
    }
}
660706

661707
#elif defined(_M_IX86) || defined(__i386__)
662708

663709
// i686: movl 8(%esp),%ecx ; movl off(%ecx),%ecx ; orl $1,%ecx ;
664-
// movl %ecx,cache(%esp) (17 bytes, no data)
710+
// movl %ecx,cache(%esp)
711+
// disp8 (14 bytes) when byte_offset <= 127, disp32 (17 bytes) otherwise.
665712
// i686 does not use preserve_none (unsupported by MSVC).
666713
// Stack layout: 8(%esp)=frame, 20/24/28(%esp)=cache0/1/2
667-
#define LOAD_FAST_BORROW_CODE_SIZE 17
714+
#define LOAD_FAST_BORROW_CODE_SIZE_DISP8 14
715+
#define LOAD_FAST_BORROW_CODE_SIZE_DISP32 17
668716

669717
// Stack offsets for cache slots (from %esp)
670718
static const uint8_t _i686_cache_offsets[3] = {
@@ -673,35 +721,52 @@ static const uint8_t _i686_cache_offsets[3] = {
673721
28, // c2 (r23)
674722
};
675723

724+
static int
725+
_load_fast_borrow_code_size(int oparg)
726+
{
727+
uint32_t byte_offset = (uint32_t)(offsetof(_PyInterpreterFrame, localsplus)
728+
+ (unsigned)oparg * sizeof(_PyStackRef));
729+
return byte_offset <= 127
730+
? LOAD_FAST_BORROW_CODE_SIZE_DISP8
731+
: LOAD_FAST_BORROW_CODE_SIZE_DISP32;
732+
}
733+
676734
// Emit the machine code for one _LOAD_FAST_BORROW into `code` (i686; no
// preserve_none here — frame and cache slots live on the stack,
// 8(%esp)=frame, cache slot at _i686_cache_offsets[reg_variant](%esp)):
//   movl 8(%esp), %ecx      ; frame pointer
//   movl off(%ecx), %ecx    ; localsplus[oparg]
//   orl  $1, %ecx           ; borrow tag
//   movl %ecx, cache(%esp)  ; store into the cache slot
// The middle load uses disp8 when the offset fits in a signed byte, disp32
// otherwise; `p` tracks the write cursor so both forms share the tail.
static void
_emit_load_fast_borrow(unsigned char *code, int reg_variant, int oparg)
{
    uint32_t byte_offset = (uint32_t)(offsetof(_PyInterpreterFrame, localsplus)
                                      + (unsigned)oparg * sizeof(_PyStackRef));
    uint8_t cache_off = _i686_cache_offsets[reg_variant];
    int p = 0;

    // movl 8(%esp), %ecx — load frame
    code[p++] = 0x8B;  // MOV r32, r/m32
    code[p++] = 0x4C;  // ModRM: mod=01, reg=ecx(001), r/m=100(SIB)
    code[p++] = 0x24;  // SIB: scale=00, index=100(none), base=100(esp)
    code[p++] = 0x08;  // disp8 = 8

    // movl byte_offset(%ecx), %ecx — load localsplus[oparg]
    code[p++] = 0x8B;  // MOV r32, r/m32
    if (byte_offset <= 127) {
        code[p++] = 0x49;                  // ModRM: mod=01(disp8), reg=ecx(001), r/m=001(ecx)
        code[p++] = (uint8_t)byte_offset;  // disp8
    }
    else {
        code[p++] = 0x89;                  // ModRM: mod=10(disp32), reg=ecx(001), r/m=001(ecx)
        memcpy(code + p, &byte_offset, 4); // disp32 (x86 is little-endian)
        p += 4;
    }

    // orl $1, %ecx — borrow tag
    code[p++] = 0x83;  // OR r/m32, imm8
    code[p++] = 0xC9;  // ModRM: mod=11, reg=001(/1), r/m=001(ecx)
    code[p++] = 0x01;  // imm8 = 1

    // movl %ecx, cache_off(%esp) — write to cache slot
    code[p++] = 0x89;  // MOV r/m32, r32
    code[p++] = 0x4C;  // ModRM: mod=01, reg=ecx(001), r/m=100(SIB)
    code[p++] = 0x24;  // SIB: scale=00, index=100(none), base=100(esp)
    code[p++] = cache_off;  // disp8
}
706771

707772
#else
@@ -723,7 +788,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
723788
int _lfb_reg, _lfb_oparg;
724789
if (_decode_load_fast_borrow(instruction->opcode, instruction->oparg,
725790
&_lfb_reg, &_lfb_oparg)) {
726-
code_size += LOAD_FAST_BORROW_CODE_SIZE;
791+
code_size += _load_fast_borrow_code_size(_lfb_oparg);
727792
continue;
728793
}
729794
group = &stencil_groups[instruction->opcode];
@@ -778,7 +843,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
778843
if (_decode_load_fast_borrow(instruction->opcode, instruction->oparg,
779844
&_lfb_reg, &_lfb_oparg)) {
780845
_emit_load_fast_borrow(code, _lfb_reg, _lfb_oparg);
781-
code += LOAD_FAST_BORROW_CODE_SIZE;
846+
code += _load_fast_borrow_code_size(_lfb_oparg);
782847
continue;
783848
}
784849
group = &stencil_groups[instruction->opcode];

Tools/jit/_targets.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,10 @@ async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]:
215215
tasks.append(group.create_task(coro, name="shim"))
216216
template = TOOLS_JIT_TEMPLATE_C.read_text()
217217
for case, opname in cases_and_opnames:
218+
# _LOAD_FAST_BORROW uses manual codegen in jit.c,
219+
# so skip stencil generation for its register variants.
220+
if opname.startswith("_LOAD_FAST_BORROW_r"):
221+
continue
218222
# Write out a copy of the template with *only* this case
219223
# inserted. This is about twice as fast as #include'ing all
220224
# of executor_cases.c.h each time we compile (since the C

0 commit comments

Comments
 (0)