Skip to content

Commit a591e3b

Browse files
authored
feat: optimize frame layout for tail-call-only functions (#11608)
* feat: optimize frame layout for tail-call-only functions Reduce frame size from 16 to 8 bytes for functions that only make tail calls (FunctionCalls::TailOnly). This optimization: - Uses single register operations (str/ldr fp) instead of pair operations (stp/ldp fp,lr) - Applies when no other frame requirements exist (no frame pointers, stack args, etc.) - Is instruction-based: functions containing only return_call instructions get optimized - Maintains ABI compatibility and includes comprehensive test coverage * fix(aarch64): update tail-call filetest expectations Update AArch64 return-call and pointer-auth filetest outputs to match frameless tail-call lowering from the previous frame-layout change. This restores arm64 filetest stability by checking the emitted direct branch sequences.
1 parent 6ad2458 commit a591e3b

10 files changed

Lines changed: 188 additions & 92 deletions

File tree

cranelift/codegen/src/isa/aarch64/abi.rs

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1199,15 +1199,17 @@ impl ABIMachineSpec for AArch64MachineDeps {
11991199
// Compute clobber size.
12001200
let clobber_size = compute_clobber_size(call_conv, &regs);
12011201

1202+
let needs_linkage_frame = flags.preserve_frame_pointers()
1203+
// The function arguments that are passed on the stack are addressed
1204+
// relative to the Frame Pointer.
1205+
|| incoming_args_size > 0
1206+
|| tail_args_size > incoming_args_size
1207+
|| clobber_size > 0
1208+
|| fixed_frame_storage_size > 0
1209+
|| outgoing_args_size > 0;
1210+
12021211
// Compute linkage frame size.
1203-
let setup_area_size = if flags.preserve_frame_pointers()
1204-
|| function_calls != FunctionCalls::None
1205-
// The function arguments that are passed on the stack are addressed
1206-
// relative to the Frame Pointer.
1207-
|| incoming_args_size > 0
1208-
|| clobber_size > 0
1209-
|| fixed_frame_storage_size > 0
1210-
{
1212+
let setup_area_size = if needs_linkage_frame || function_calls == FunctionCalls::Regular {
12111213
16 // FP, LR
12121214
} else {
12131215
0

cranelift/codegen/src/isa/aarch64/inst/emit.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3692,7 +3692,9 @@ fn emit_return_call_common_sequence<T>(
36923692
}
36933693
}
36943694

3695-
if let Some(key) = info.key {
3695+
if (setup_area_size > 0 || info.sign_return_address_all)
3696+
&& let Some(key) = info.key
3697+
{
36963698
sink.put4(key.enc_auti_hint());
36973699
}
36983700
}

cranelift/codegen/src/isa/aarch64/inst/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ pub struct ReturnCallInfo<T> {
8888
pub new_stack_arg_size: u32,
8989
/// API key to use to restore the return address, if any.
9090
pub key: Option<APIKey>,
91+
/// Whether pointer-auth return addresses are signed even without frame setup.
92+
pub sign_return_address_all: bool,
9193
}
9294

9395
fn count_zero_half_words(mut value: u64, num_half_words: u8) -> usize {

cranelift/codegen/src/isa/aarch64/lower/isle.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> {
135135
dest,
136136
uses,
137137
key,
138+
sign_return_address_all: self.backend.isa_flags.sign_return_address_all(),
138139
new_stack_arg_size,
139140
})
140141
}
@@ -157,6 +158,7 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> {
157158
dest,
158159
uses,
159160
key,
161+
sign_return_address_all: self.backend.isa_flags.sign_return_address_all(),
160162
new_stack_arg_size,
161163
})
162164
}

cranelift/filetests/filetests/isa/aarch64/call-pauth-bkey.clif

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -95,24 +95,15 @@ block0(v0: i64):
9595
}
9696

9797
; VCode:
98-
; pacibz
99-
; stp fp, lr, [sp, #-16]!
100-
; mov fp, sp
10198
; block0:
10299
; load_ext_name_far x1, TestCase(%g)+0
103100
; return_call_ind x1 new_stack_arg_size:0 x2=x2
104101
;
105102
; Disassembled:
106103
; block0: ; offset 0x0
107-
; pacibz
108-
; stp x29, x30, [sp, #-0x10]!
109-
; mov x29, sp
110-
; block1: ; offset 0xc
111-
; ldr x1, #0x14
112-
; b #0x1c
104+
; ldr x1, #8
105+
; b #0x10
113106
; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %g 0
114107
; .byte 0x00, 0x00, 0x00, 0x00
115-
; ldp x29, x30, [sp], #0x10
116-
; autibz
117108
; br x1
118109

cranelift/filetests/filetests/isa/aarch64/call-pauth.clif

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -96,24 +96,15 @@ block0(v0: i64):
9696
}
9797

9898
; VCode:
99-
; paciaz
100-
; stp fp, lr, [sp, #-16]!
101-
; mov fp, sp
10299
; block0:
103100
; load_ext_name_far x1, TestCase(%g)+0
104101
; return_call_ind x1 new_stack_arg_size:0 x2=x2
105102
;
106103
; Disassembled:
107104
; block0: ; offset 0x0
108-
; paciaz
109-
; stp x29, x30, [sp, #-0x10]!
110-
; mov x29, sp
111-
; block1: ; offset 0xc
112-
; ldr x1, #0x14
113-
; b #0x1c
105+
; ldr x1, #8
106+
; b #0x10
114107
; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %g 0
115108
; .byte 0x00, 0x00, 0x00, 0x00
116-
; ldp x29, x30, [sp], #0x10
117-
; autiaz
118109
; br x1
119110

cranelift/filetests/filetests/isa/aarch64/return-call-indirect.clif

Lines changed: 6 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -30,22 +30,16 @@ block0(v0: i64):
3030
}
3131

3232
; VCode:
33-
; stp fp, lr, [sp, #-16]!
34-
; mov fp, sp
3533
; block0:
3634
; load_ext_name_far x1, TestCase(%callee_i64)+0
3735
; return_call_ind x1 new_stack_arg_size:0 x2=x2
3836
;
3937
; Disassembled:
4038
; block0: ; offset 0x0
41-
; stp x29, x30, [sp, #-0x10]!
42-
; mov x29, sp
43-
; block1: ; offset 0x8
44-
; ldr x1, #0x10
45-
; b #0x18
39+
; ldr x1, #8
40+
; b #0x10
4641
; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %callee_i64 0
4742
; .byte 0x00, 0x00, 0x00, 0x00
48-
; ldp x29, x30, [sp], #0x10
4943
; br x1
5044

5145
;;;; Test colocated tail calls ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -60,20 +54,14 @@ block0(v0: i64):
6054
}
6155

6256
; VCode:
63-
; stp fp, lr, [sp, #-16]!
64-
; mov fp, sp
6557
; block0:
6658
; load_ext_name_near x1, TestCase(%callee_i64)+0
6759
; return_call_ind x1 new_stack_arg_size:0 x2=x2
6860
;
6961
; Disassembled:
7062
; block0: ; offset 0x0
71-
; stp x29, x30, [sp, #-0x10]!
72-
; mov x29, sp
73-
; block1: ; offset 0x8
7463
; adrp x1, #0 ; reloc_external Aarch64AdrPrelPgHi21 %callee_i64 0
7564
; add x1, x1, #0 ; reloc_external Aarch64AddAbsLo12Nc %callee_i64 0
76-
; ldp x29, x30, [sp], #0x10
7765
; br x1
7866

7967
;;;; Test passing `f64`s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -107,22 +95,16 @@ block0(v0: f64):
10795
}
10896

10997
; VCode:
110-
; stp fp, lr, [sp, #-16]!
111-
; mov fp, sp
11298
; block0:
11399
; load_ext_name_far x1, TestCase(%callee_f64)+0
114100
; return_call_ind x1 new_stack_arg_size:0 v0=v0
115101
;
116102
; Disassembled:
117103
; block0: ; offset 0x0
118-
; stp x29, x30, [sp, #-0x10]!
119-
; mov x29, sp
120-
; block1: ; offset 0x8
121-
; ldr x1, #0x10
122-
; b #0x18
104+
; ldr x1, #8
105+
; b #0x10
123106
; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %callee_f64 0
124107
; .byte 0x00, 0x00, 0x00, 0x00
125-
; ldp x29, x30, [sp], #0x10
126108
; br x1
127109

128110
;;;; Test passing `i8`s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -158,22 +140,16 @@ block0(v0: i8):
158140
}
159141

160142
; VCode:
161-
; stp fp, lr, [sp, #-16]!
162-
; mov fp, sp
163143
; block0:
164144
; load_ext_name_far x1, TestCase(%callee_i8)+0
165145
; return_call_ind x1 new_stack_arg_size:0 x2=x2
166146
;
167147
; Disassembled:
168148
; block0: ; offset 0x0
169-
; stp x29, x30, [sp, #-0x10]!
170-
; mov x29, sp
171-
; block1: ; offset 0x8
172-
; ldr x1, #0x10
173-
; b #0x18
149+
; ldr x1, #8
150+
; b #0x10
174151
; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %callee_i8 0
175152
; .byte 0x00, 0x00, 0x00, 0x00
176-
; ldp x29, x30, [sp], #0x10
177153
; br x1
178154

179155
;;;; Test passing many arguments on stack ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

cranelift/filetests/filetests/isa/aarch64/return-call.clif

Lines changed: 7 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -28,22 +28,16 @@ block0(v0: i64):
2828
}
2929

3030
; VCode:
31-
; stp fp, lr, [sp, #-16]!
32-
; mov fp, sp
3331
; block0:
3432
; load_ext_name_far x1, TestCase(%callee_i64)+0
3533
; return_call_ind x1 new_stack_arg_size:0 x2=x2
3634
;
3735
; Disassembled:
3836
; block0: ; offset 0x0
39-
; stp x29, x30, [sp, #-0x10]!
40-
; mov x29, sp
41-
; block1: ; offset 0x8
42-
; ldr x1, #0x10
43-
; b #0x18
37+
; ldr x1, #8
38+
; b #0x10
4439
; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %callee_i64 0
4540
; .byte 0x00, 0x00, 0x00, 0x00
46-
; ldp x29, x30, [sp], #0x10
4741
; br x1
4842

4943
;;;; Test colocated tail calls ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -56,18 +50,12 @@ block0(v0: i64):
5650
}
5751

5852
; VCode:
59-
; stp fp, lr, [sp, #-16]!
60-
; mov fp, sp
6153
; block0:
6254
; return_call TestCase(%callee_i64) new_stack_arg_size:0 x2=x2
6355
;
6456
; Disassembled:
6557
; block0: ; offset 0x0
66-
; stp x29, x30, [sp, #-0x10]!
67-
; mov x29, sp
68-
; block1: ; offset 0x8
69-
; ldp x29, x30, [sp], #0x10
70-
; b #0xc ; reloc_external Call %callee_i64 0
58+
; b #0 ; reloc_external Call %callee_i64 0
7159

7260
;;;; Test passing `f64`s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
7361

@@ -98,22 +86,16 @@ block0(v0: f64):
9886
}
9987

10088
; VCode:
101-
; stp fp, lr, [sp, #-16]!
102-
; mov fp, sp
10389
; block0:
10490
; load_ext_name_far x1, TestCase(%callee_f64)+0
10591
; return_call_ind x1 new_stack_arg_size:0 v0=v0
10692
;
10793
; Disassembled:
10894
; block0: ; offset 0x0
109-
; stp x29, x30, [sp, #-0x10]!
110-
; mov x29, sp
111-
; block1: ; offset 0x8
112-
; ldr x1, #0x10
113-
; b #0x18
95+
; ldr x1, #8
96+
; b #0x10
11497
; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %callee_f64 0
11598
; .byte 0x00, 0x00, 0x00, 0x00
116-
; ldp x29, x30, [sp], #0x10
11799
; br x1
118100

119101
;;;; Test passing `i8`s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -147,22 +129,16 @@ block0(v0: i8):
147129
}
148130

149131
; VCode:
150-
; stp fp, lr, [sp, #-16]!
151-
; mov fp, sp
152132
; block0:
153133
; load_ext_name_far x1, TestCase(%callee_i8)+0
154134
; return_call_ind x1 new_stack_arg_size:0 x2=x2
155135
;
156136
; Disassembled:
157137
; block0: ; offset 0x0
158-
; stp x29, x30, [sp, #-0x10]!
159-
; mov x29, sp
160-
; block1: ; offset 0x8
161-
; ldr x1, #0x10
162-
; b #0x18
138+
; ldr x1, #8
139+
; b #0x10
163140
; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %callee_i8 0
164141
; .byte 0x00, 0x00, 0x00, 0x00
165-
; ldp x29, x30, [sp], #0x10
166142
; br x1
167143

168144
;;;; Test passing many arguments on stack ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
test compile precise-output
2+
set unwind_info=true
3+
set preserve_frame_pointers=false
4+
target aarch64
5+
6+
;; With unwind enabled, tail-only functions that do not need frame storage
7+
;; should still avoid FP/LR frame setup.
8+
function %tail_only_with_unwind() -> i64 tail {
9+
fn0 = colocated %target_func() -> i64 tail
10+
11+
block0:
12+
return_call fn0()
13+
}
14+
15+
; VCode:
16+
; block0:
17+
; return_call TestCase(%target_func) new_stack_arg_size:0
18+
;
19+
; Disassembled:
20+
; block0: ; offset 0x0
21+
; b #0 ; reloc_external Call %target_func 0
22+
23+
function %target_func() -> i64 {
24+
block0:
25+
v0 = iconst.i64 42
26+
return v0
27+
}
28+
29+
; VCode:
30+
; block0:
31+
; movz x0, #42
32+
; ret
33+
;
34+
; Disassembled:
35+
; block0: ; offset 0x0
36+
; mov x0, #0x2a
37+
; ret

0 commit comments

Comments
 (0)