@@ -582,16 +582,29 @@ combine_symbol_mask(const symbol_mask src, symbol_mask dest)
582582
// Manual code emission for _LOAD_FAST_BORROW.
// Instead of using stencils, we directly encode the load + borrow-tag sequence.
// Stencil generation for _LOAD_FAST_BORROW register variants is skipped
// in Tools/jit/_targets.py since manual codegen handles all reachable opargs.
// References:
//   x86-64: https://godbolt.org/z/5oWMTeqod
//   i686:   https://godbolt.org/z/1EEP75Wda
// TODO: With dynasm, the per-architecture #ifdef branches below could be
// replaced by a single portable emission sequence.
592+
593+ // Max oparg for manual codegen. Conservative (AArch64 imm12 limit).
594+ // TODO: could be set per-architecture (x86 disp32 has a much higher limit).
595+ #define _LOAD_FAST_BORROW_MAX_OPARG 4085
587596
588597// Decode a _LOAD_FAST_BORROW* opcode into register variant and oparg.
589598// Returns 1 if the opcode is a _LOAD_FAST_BORROW variant, 0 otherwise.
599+ // Falls back to stencil pipeline for oparg values too large for manual codegen.
590600static int
591601_decode_load_fast_borrow (uint16_t opcode , uint16_t insn_oparg ,
592602 int * reg_variant , int * oparg )
593603{
594604 if (opcode >= _LOAD_FAST_BORROW_r01 && opcode <= _LOAD_FAST_BORROW_r23 ) {
605+ if (insn_oparg > _LOAD_FAST_BORROW_MAX_OPARG ) {
606+ return 0 ;
607+ }
595608 * reg_variant = opcode - _LOAD_FAST_BORROW_r01 ;
596609 * oparg = insn_oparg ;
597610 return 1 ;
@@ -603,7 +616,13 @@ _decode_load_fast_borrow(uint16_t opcode, uint16_t insn_oparg,
603616
604617// AArch64: ldr x8, [x21, #off] ; orr xDST, x8, #1 (8 bytes, no data)
605618// preserve_none CC: x21=frame, x24/x25/x26=cache0/1/2
606- #define LOAD_FAST_BORROW_CODE_SIZE 8
619+
620+ static int
621+ _load_fast_borrow_code_size (int oparg )
622+ {
623+ (void )oparg ;
624+ return 8 ;
625+ }
607626
608627static const uint32_t _aarch64_cache_regs [3 ] = {24 , 25 , 26 };
609628
@@ -627,9 +646,11 @@ _emit_load_fast_borrow(unsigned char *code, int reg_variant, int oparg)
627646
628647#elif defined(__x86_64__ ) || defined(_M_X64 )
629648
630- // x86_64: mov rDST, [r13 + disp32] ; or rDST, 1 (11 bytes, no data)
649+ // x86_64: mov rDST, [r13 + disp] ; or rDST, 1
650+ // disp8 (8 bytes) when byte_offset <= 127, disp32 (11 bytes) otherwise.
631651// preserve_none CC (Clang 19+): r13=frame, rdi/rsi/rdx=cache0/1/2
632- #define LOAD_FAST_BORROW_CODE_SIZE 11
652+ #define LOAD_FAST_BORROW_CODE_SIZE_DISP8 8
653+ #define LOAD_FAST_BORROW_CODE_SIZE_DISP32 11
633654
634655// 3-bit register encodings for ModRM
635656static const uint8_t _x86_64_cache_regs [3 ] = {
@@ -638,33 +659,60 @@ static const uint8_t _x86_64_cache_regs[3] = {
638659 2 , // RDX (cache2, r23)
639660};
640661
662+ static int
663+ _load_fast_borrow_code_size (int oparg )
664+ {
665+ uint32_t byte_offset = (uint32_t )(offsetof(_PyInterpreterFrame , localsplus )
666+ + (unsigned )oparg * sizeof (_PyStackRef ));
667+ return byte_offset <= 127
668+ ? LOAD_FAST_BORROW_CODE_SIZE_DISP8
669+ : LOAD_FAST_BORROW_CODE_SIZE_DISP32 ;
670+ }
671+
641672static void
642673_emit_load_fast_borrow (unsigned char * code , int reg_variant , int oparg )
643674{
644675 uint32_t byte_offset = (uint32_t )(offsetof(_PyInterpreterFrame , localsplus )
645676 + (unsigned )oparg * sizeof (_PyStackRef ));
646677 uint8_t dst = _x86_64_cache_regs [reg_variant ];
647678
648- // mov rDST, [r13 + disp32]
649- code [0 ] = 0x49 ; // REX.W=1, REX.B=1 (r13)
650- code [1 ] = 0x8B ; // MOV r64, r/m64
651- code [2 ] = 0x85 | (dst << 3 ); // ModRM: mod=10, reg=dst, r/m=101(r13)
652- memcpy (code + 3 , & byte_offset , 4 ); // disp32
679+ if (byte_offset <= 127 ) {
680+ // mov rDST, [r13 + disp8]
681+ code [0 ] = 0x49 ; // REX.W=1, REX.B=1 (r13)
682+ code [1 ] = 0x8B ; // MOV r64, r/m64
683+ code [2 ] = 0x45 | (dst << 3 ); // ModRM: mod=01, reg=dst, r/m=101(r13)
684+ code [3 ] = (uint8_t )byte_offset ; // disp8
653685
654- // or rDST, 1
655- code [7 ] = 0x48 ; // REX.W=1
656- code [8 ] = 0x83 ; // OR r/m64, imm8
657- code [9 ] = 0xC8 | dst ; // ModRM: mod=11, reg=001(/1), r/m=dst
658- code [10 ] = 0x01 ; // imm8 = 1
686+ // or rDST, 1
687+ code [4 ] = 0x48 ; // REX.W=1
688+ code [5 ] = 0x83 ; // OR r/m64, imm8
689+ code [6 ] = 0xC8 | dst ; // ModRM: mod=11, reg=001(/1), r/m=dst
690+ code [7 ] = 0x01 ; // imm8 = 1
691+ }
692+ else {
693+ // mov rDST, [r13 + disp32]
694+ code [0 ] = 0x49 ; // REX.W=1, REX.B=1 (r13)
695+ code [1 ] = 0x8B ; // MOV r64, r/m64
696+ code [2 ] = 0x85 | (dst << 3 ); // ModRM: mod=10, reg=dst, r/m=101(r13)
697+ memcpy (code + 3 , & byte_offset , 4 ); // disp32
698+
699+ // or rDST, 1
700+ code [7 ] = 0x48 ; // REX.W=1
701+ code [8 ] = 0x83 ; // OR r/m64, imm8
702+ code [9 ] = 0xC8 | dst ; // ModRM: mod=11, reg=001(/1), r/m=dst
703+ code [10 ] = 0x01 ; // imm8 = 1
704+ }
659705}
660706
661707#elif defined(_M_IX86 ) || defined(__i386__ )
662708
663709// i686: movl 8(%esp),%ecx ; movl off(%ecx),%ecx ; orl $1,%ecx ;
664- // movl %ecx,cache(%esp) (17 bytes, no data)
710+ // movl %ecx,cache(%esp)
711+ // disp8 (14 bytes) when byte_offset <= 127, disp32 (17 bytes) otherwise.
665712// i686 does not use preserve_none (unsupported by MSVC).
666713// Stack layout: 8(%esp)=frame, 20/24/28(%esp)=cache0/1/2
667- #define LOAD_FAST_BORROW_CODE_SIZE 17
714+ #define LOAD_FAST_BORROW_CODE_SIZE_DISP8 14
715+ #define LOAD_FAST_BORROW_CODE_SIZE_DISP32 17
668716
669717// Stack offsets for cache slots (from %esp)
670718static const uint8_t _i686_cache_offsets [3 ] = {
@@ -673,35 +721,52 @@ static const uint8_t _i686_cache_offsets[3] = {
673721 28 , // c2 (r23)
674722};
675723
724+ static int
725+ _load_fast_borrow_code_size (int oparg )
726+ {
727+ uint32_t byte_offset = (uint32_t )(offsetof(_PyInterpreterFrame , localsplus )
728+ + (unsigned )oparg * sizeof (_PyStackRef ));
729+ return byte_offset <= 127
730+ ? LOAD_FAST_BORROW_CODE_SIZE_DISP8
731+ : LOAD_FAST_BORROW_CODE_SIZE_DISP32 ;
732+ }
733+
676734static void
677735_emit_load_fast_borrow (unsigned char * code , int reg_variant , int oparg )
678736{
679737 uint32_t byte_offset = (uint32_t )(offsetof(_PyInterpreterFrame , localsplus )
680738 + (unsigned )oparg * sizeof (_PyStackRef ));
681739 uint8_t cache_off = _i686_cache_offsets [reg_variant ];
740+ int p = 0 ;
682741
683742 // movl 8(%esp), %ecx — load frame
684- code [0 ] = 0x8B ; // MOV r32, r/m32
685- code [1 ] = 0x4C ; // ModRM: mod=01, reg=ecx(001), r/m=100(SIB)
686- code [2 ] = 0x24 ; // SIB: scale=00, index=100(none), base=100(esp)
687- code [3 ] = 0x08 ; // disp8 = 8
743+ code [p ++ ] = 0x8B ; // MOV r32, r/m32
744+ code [p ++ ] = 0x4C ; // ModRM: mod=01, reg=ecx(001), r/m=100(SIB)
745+ code [p ++ ] = 0x24 ; // SIB: scale=00, index=100(none), base=100(esp)
746+ code [p ++ ] = 0x08 ; // disp8 = 8
688747
689748 // movl byte_offset(%ecx), %ecx — load localsplus[oparg]
690- code [4 ] = 0x8B ; // MOV r32, r/m32
691- code [5 ] = 0x89 ; // ModRM: mod=10(disp32), reg=ecx(001), r/m=001(ecx)
692- memcpy (code + 6 , & byte_offset , 4 ); // disp32
749+ code [p ++ ] = 0x8B ; // MOV r32, r/m32
750+ if (byte_offset <= 127 ) {
751+ code [p ++ ] = 0x49 ; // ModRM: mod=01(disp8), reg=ecx(001), r/m=001(ecx)
752+ code [p ++ ] = (uint8_t )byte_offset ; // disp8
753+ }
754+ else {
755+ code [p ++ ] = 0x89 ; // ModRM: mod=10(disp32), reg=ecx(001), r/m=001(ecx)
756+ memcpy (code + p , & byte_offset , 4 ); // disp32
757+ p += 4 ;
758+ }
693759
694760 // orl $1, %ecx — borrow tag
695- code [10 ] = 0x83 ; // OR r/m32, imm8
696- code [11 ] = 0xC9 ; // ModRM: mod=11, reg=001(/1), r/m=001(ecx)
697- code [12 ] = 0x01 ; // imm8 = 1
761+ code [p ++ ] = 0x83 ; // OR r/m32, imm8
762+ code [p ++ ] = 0xC9 ; // ModRM: mod=11, reg=001(/1), r/m=001(ecx)
763+ code [p ++ ] = 0x01 ; // imm8 = 1
698764
699765 // movl %ecx, cache_off(%esp) — write to cache slot
700- code [13 ] = 0x89 ; // MOV r/m32, r32
701- code [14 ] = 0x4C ; // ModRM: mod=01, reg=ecx(001), r/m=100(SIB)
702- code [15 ] = 0x24 ; // SIB: scale=00, index=100(none), base=100(esp)
703- code [16 ] = cache_off ; // disp8
704-
766+ code [p ++ ] = 0x89 ; // MOV r/m32, r32
767+ code [p ++ ] = 0x4C ; // ModRM: mod=01, reg=ecx(001), r/m=100(SIB)
768+ code [p ++ ] = 0x24 ; // SIB: scale=00, index=100(none), base=100(esp)
769+ code [p ++ ] = cache_off ; // disp8
705770}
706771
707772#else
@@ -723,7 +788,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
723788 int _lfb_reg , _lfb_oparg ;
724789 if (_decode_load_fast_borrow (instruction -> opcode , instruction -> oparg ,
725790 & _lfb_reg , & _lfb_oparg )) {
726- code_size += LOAD_FAST_BORROW_CODE_SIZE ;
791+ code_size += _load_fast_borrow_code_size ( _lfb_oparg ) ;
727792 continue ;
728793 }
729794 group = & stencil_groups [instruction -> opcode ];
@@ -778,7 +843,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
778843 if (_decode_load_fast_borrow (instruction -> opcode , instruction -> oparg ,
779844 & _lfb_reg , & _lfb_oparg )) {
780845 _emit_load_fast_borrow (code , _lfb_reg , _lfb_oparg );
781- code += LOAD_FAST_BORROW_CODE_SIZE ;
846+ code += _load_fast_borrow_code_size ( _lfb_oparg ) ;
782847 continue ;
783848 }
784849 group = & stencil_groups [instruction -> opcode ];
0 commit comments