@@ -582,16 +582,29 @@ combine_symbol_mask(const symbol_mask src, symbol_mask dest)
582582
// Manual code emission for _LOAD_FAST_BORROW.
// Instead of using stencils, we directly encode the load + borrow-tag sequence.
// Stencil generation for _LOAD_FAST_BORROW register variants is skipped
// in Tools/jit/_targets.py since manual codegen handles all reachable opargs.
// References:
//   x86-64: https://godbolt.org/z/5oWMTeqod
//   i686:   https://godbolt.org/z/1EEP75Wda
// TODO: With dynasm, the per-architecture #ifdef branches below could be
// replaced by a single portable emission sequence.
592+
593+ // Max oparg for manual codegen. Conservative (AArch64 imm12 limit).
594+ // TODO: could be set per-architecture (x86 disp32 has a much higher limit).
595+ #define _LOAD_FAST_BORROW_MAX_OPARG 4085
587596
588597// Decode a _LOAD_FAST_BORROW* opcode into register variant and oparg.
589598// Returns 1 if the opcode is a _LOAD_FAST_BORROW variant, 0 otherwise.
599+ // Falls back to stencil pipeline for oparg values too large for manual codegen.
590600static int
591601_decode_load_fast_borrow (uint16_t opcode , uint16_t insn_oparg ,
592602 int * reg_variant , int * oparg )
593603{
594604 if (opcode >= _LOAD_FAST_BORROW_r01 && opcode <= _LOAD_FAST_BORROW_r23 ) {
605+ if (insn_oparg > _LOAD_FAST_BORROW_MAX_OPARG ) {
606+ return 0 ;
607+ }
595608 * reg_variant = opcode - _LOAD_FAST_BORROW_r01 ;
596609 * oparg = insn_oparg ;
597610 return 1 ;
@@ -603,7 +616,13 @@ _decode_load_fast_borrow(uint16_t opcode, uint16_t insn_oparg,
603616
604617// AArch64: ldr x8, [x21, #off] ; orr xDST, x8, #1 (8 bytes, no data)
605618// preserve_none CC: x21=frame, x24/x25/x26=cache0/1/2
606- #define LOAD_FAST_BORROW_CODE_SIZE 8
619+
620+ static int
621+ _load_fast_borrow_code_size (int oparg )
622+ {
623+ (void )oparg ;
624+ return 8 ;
625+ }
607626
608627static const uint32_t _aarch64_cache_regs [3 ] = {24 , 25 , 26 };
609628
@@ -627,9 +646,11 @@ _emit_load_fast_borrow(unsigned char *code, int reg_variant, int oparg)
627646
628647#elif defined(__x86_64__ ) || defined(_M_X64 )
629648
630- // x86_64: mov rDST, [r13 + disp32] ; or rDST, 1 (11 bytes, no data)
649+ // x86_64: mov rDST, [r13 + disp] ; or rDST, 1
650+ // disp8 (8 bytes) when byte_offset <= 127, disp32 (11 bytes) otherwise.
631651// preserve_none CC (Clang 19+): r13=frame, rdi/rsi/rdx=cache0/1/2
632- #define LOAD_FAST_BORROW_CODE_SIZE 11
652+ #define LOAD_FAST_BORROW_CODE_SIZE_DISP8 8
653+ #define LOAD_FAST_BORROW_CODE_SIZE_DISP32 11
633654
634655// 3-bit register encodings for ModRM
635656static const uint8_t _x86_64_cache_regs [3 ] = {
@@ -638,33 +659,60 @@ static const uint8_t _x86_64_cache_regs[3] = {
638659 2 , // RDX (cache2, r23)
639660};
640661
662+ static int
663+ _load_fast_borrow_code_size (int oparg )
664+ {
665+ uint32_t byte_offset = (uint32_t )(offsetof(_PyInterpreterFrame , localsplus )
666+ + (unsigned )oparg * sizeof (_PyStackRef ));
667+ return byte_offset <= 127
668+ ? LOAD_FAST_BORROW_CODE_SIZE_DISP8
669+ : LOAD_FAST_BORROW_CODE_SIZE_DISP32 ;
670+ }
671+
641672static void
642673_emit_load_fast_borrow (unsigned char * code , int reg_variant , int oparg )
643674{
644675 uint32_t byte_offset = (uint32_t )(offsetof(_PyInterpreterFrame , localsplus )
645676 + (unsigned )oparg * sizeof (_PyStackRef ));
646677 uint8_t dst = _x86_64_cache_regs [reg_variant ];
647678
648- // mov rDST, [r13 + disp32]
649- code [0 ] = 0x49 ; // REX.W=1, REX.B=1 (r13)
650- code [1 ] = 0x8B ; // MOV r64, r/m64
651- code [2 ] = 0x85 | (dst << 3 ); // ModRM: mod=10, reg=dst, r/m=101(r13)
652- memcpy (code + 3 , & byte_offset , 4 ); // disp32
679+ if (byte_offset <= 127 ) {
680+ // mov rDST, [r13 + disp8]
681+ code [0 ] = 0x49 ; // REX.W=1, REX.B=1 (r13)
682+ code [1 ] = 0x8B ; // MOV r64, r/m64
683+ code [2 ] = 0x45 | (dst << 3 ); // ModRM: mod=01, reg=dst, r/m=101(r13)
684+ code [3 ] = (uint8_t )byte_offset ; // disp8
653685
654- // or rDST, 1
655- code [7 ] = 0x48 ; // REX.W=1
656- code [8 ] = 0x83 ; // OR r/m64, imm8
657- code [9 ] = 0xC8 | dst ; // ModRM: mod=11, reg=001(/1), r/m=dst
658- code [10 ] = 0x01 ; // imm8 = 1
686+ // or rDST, 1
687+ code [4 ] = 0x48 ; // REX.W=1
688+ code [5 ] = 0x83 ; // OR r/m64, imm8
689+ code [6 ] = 0xC8 | dst ; // ModRM: mod=11, reg=001(/1), r/m=dst
690+ code [7 ] = 0x01 ; // imm8 = 1
691+ }
692+ else {
693+ // mov rDST, [r13 + disp32]
694+ code [0 ] = 0x49 ; // REX.W=1, REX.B=1 (r13)
695+ code [1 ] = 0x8B ; // MOV r64, r/m64
696+ code [2 ] = 0x85 | (dst << 3 ); // ModRM: mod=10, reg=dst, r/m=101(r13)
697+ memcpy (code + 3 , & byte_offset , 4 ); // disp32
698+
699+ // or rDST, 1
700+ code [7 ] = 0x48 ; // REX.W=1
701+ code [8 ] = 0x83 ; // OR r/m64, imm8
702+ code [9 ] = 0xC8 | dst ; // ModRM: mod=11, reg=001(/1), r/m=dst
703+ code [10 ] = 0x01 ; // imm8 = 1
704+ }
659705}
660706
661707#elif defined(_M_IX86 ) || defined(__i386__ )
662708
663709// i686: movl 8(%esp),%ecx ; movl off(%ecx),%ecx ; orl $1,%ecx ;
664- // movl %ecx,cache(%esp) (17 bytes, no data)
710+ // movl %ecx,cache(%esp)
711+ // disp8 (14 bytes) when byte_offset <= 127, disp32 (17 bytes) otherwise.
665712// i686 does not use preserve_none (unsupported by MSVC).
666713// Stack layout: 8(%esp)=frame, 20/24/28(%esp)=cache0/1/2
667- #define LOAD_FAST_BORROW_CODE_SIZE 17
714+ #define LOAD_FAST_BORROW_CODE_SIZE_DISP8 14
715+ #define LOAD_FAST_BORROW_CODE_SIZE_DISP32 17
668716
669717// Stack offsets for cache slots (from %esp)
670718static const uint8_t _i686_cache_offsets [3 ] = {
@@ -673,35 +721,52 @@ static const uint8_t _i686_cache_offsets[3] = {
673721 28 , // c2 (r23)
674722};
675723
724+ static int
725+ _load_fast_borrow_code_size (int oparg )
726+ {
727+ uint32_t byte_offset = (uint32_t )(offsetof(_PyInterpreterFrame , localsplus )
728+ + (unsigned )oparg * sizeof (_PyStackRef ));
729+ return byte_offset <= 127
730+ ? LOAD_FAST_BORROW_CODE_SIZE_DISP8
731+ : LOAD_FAST_BORROW_CODE_SIZE_DISP32 ;
732+ }
733+
676734static void
677735_emit_load_fast_borrow (unsigned char * code , int reg_variant , int oparg )
678736{
679737 uint32_t byte_offset = (uint32_t )(offsetof(_PyInterpreterFrame , localsplus )
680738 + (unsigned )oparg * sizeof (_PyStackRef ));
681739 uint8_t cache_off = _i686_cache_offsets [reg_variant ];
740+ int p = 0 ;
682741
683742 // movl 8(%esp), %ecx — load frame
684- code [0 ] = 0x8B ; // MOV r32, r/m32
685- code [1 ] = 0x4C ; // ModRM: mod=01, reg=ecx(001), r/m=100(SIB)
686- code [2 ] = 0x24 ; // SIB: scale=00, index=100(none), base=100(esp)
687- code [3 ] = 0x08 ; // disp8 = 8
743+ code [p ++ ] = 0x8B ; // MOV r32, r/m32
744+ code [p ++ ] = 0x4C ; // ModRM: mod=01, reg=ecx(001), r/m=100(SIB)
745+ code [p ++ ] = 0x24 ; // SIB: scale=00, index=100(none), base=100(esp)
746+ code [p ++ ] = 0x08 ; // disp8 = 8
688747
689748 // movl byte_offset(%ecx), %ecx — load localsplus[oparg]
690- code [4 ] = 0x8B ; // MOV r32, r/m32
691- code [5 ] = 0x89 ; // ModRM: mod=10(disp32), reg=ecx(001), r/m=001(ecx)
692- memcpy (code + 6 , & byte_offset , 4 ); // disp32
749+ code [p ++ ] = 0x8B ; // MOV r32, r/m32
750+ if (byte_offset <= 127 ) {
751+ code [p ++ ] = 0x49 ; // ModRM: mod=01(disp8), reg=ecx(001), r/m=001(ecx)
752+ code [p ++ ] = (uint8_t )byte_offset ; // disp8
753+ }
754+ else {
755+ code [p ++ ] = 0x89 ; // ModRM: mod=10(disp32), reg=ecx(001), r/m=001(ecx)
756+ memcpy (code + p , & byte_offset , 4 ); // disp32
757+ p += 4 ;
758+ }
693759
694760 // orl $1, %ecx — borrow tag
695- code [10 ] = 0x83 ; // OR r/m32, imm8
696- code [11 ] = 0xC9 ; // ModRM: mod=11, reg=001(/1), r/m=001(ecx)
697- code [12 ] = 0x01 ; // imm8 = 1
761+ code [p ++ ] = 0x83 ; // OR r/m32, imm8
762+ code [p ++ ] = 0xC9 ; // ModRM: mod=11, reg=001(/1), r/m=001(ecx)
763+ code [p ++ ] = 0x01 ; // imm8 = 1
698764
699765 // movl %ecx, cache_off(%esp) — write to cache slot
700- code [13 ] = 0x89 ; // MOV r/m32, r32
701- code [14 ] = 0x4C ; // ModRM: mod=01, reg=ecx(001), r/m=100(SIB)
702- code [15 ] = 0x24 ; // SIB: scale=00, index=100(none), base=100(esp)
703- code [16 ] = cache_off ; // disp8
704-
766+ code [p ++ ] = 0x89 ; // MOV r/m32, r32
767+ code [p ++ ] = 0x4C ; // ModRM: mod=01, reg=ecx(001), r/m=100(SIB)
768+ code [p ++ ] = 0x24 ; // SIB: scale=00, index=100(none), base=100(esp)
769+ code [p ++ ] = cache_off ; // disp8
705770}
706771
707772#else
@@ -723,7 +788,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
723788 int _lfb_reg , _lfb_oparg ;
724789 if (_decode_load_fast_borrow (instruction -> opcode , instruction -> oparg ,
725790 & _lfb_reg , & _lfb_oparg )) {
726- code_size += LOAD_FAST_BORROW_CODE_SIZE ;
791+ code_size += _load_fast_borrow_code_size ( _lfb_oparg ) ;
727792 continue ;
728793 }
729794 group = & stencil_groups [instruction -> opcode ];
@@ -778,7 +843,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
778843 if (_decode_load_fast_borrow (instruction -> opcode , instruction -> oparg ,
779844 & _lfb_reg , & _lfb_oparg )) {
780845 _emit_load_fast_borrow (code , _lfb_reg , _lfb_oparg );
781- code += LOAD_FAST_BORROW_CODE_SIZE ;
846+ code += _load_fast_borrow_code_size ( _lfb_oparg ) ;
782847 continue ;
783848 }
784849 group = & stencil_groups [instruction -> opcode ];
0 commit comments