Skip to content

Commit d3f444c

Browse files
authored
x64: Fix sinking loads into band/bor/bxor (#13071)
This commit fixes the x64 backend in Cranelift when sinking loads into band/bor/bxor instructions. If this happens for scalar types like `f32` and `f64` this means that the generated instruction will load too many bytes, similar to #13011 for example. These rules aren't reachable from WebAssembly but are still good to have fixed.
1 parent 40da1d8 commit d3f444c

File tree

2 files changed

+189
-9
lines changed

2 files changed

+189
-9
lines changed

cranelift/codegen/src/isa/x64/lower.isle

Lines changed: 16 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -335,8 +335,11 @@
335335

336336
;; f32 and f64
337337

338+
;; Note that `x` and `y` are both constrained to being in registers here because
339+
;; x86 instructions with a memory operand will load 128 bits, not the desired
340+
;; bit width of 32 or 64 bits here.
338341
(rule 5 (lower (has_type (ty_scalar_float ty) (band _ x y)))
339-
(sse_and ty x y))
342+
(sse_and ty x (put_in_xmm y)))
340343

341344
;; SSE.
342345

@@ -459,8 +462,11 @@
459462

460463
;; f32 and f64
461464

465+
;; Note that `x` and `y` are both constrained to being in registers here because
466+
;; x86 instructions with a memory operand will load 128 bits, not the desired
467+
;; bit width of 32 or 64 bits here.
462468
(rule 5 (lower (has_type (ty_scalar_float ty) (bor _ x y)))
463-
(sse_or ty x y))
469+
(sse_or ty x (put_in_xmm y)))
464470

465471
;; SSE.
466472

@@ -530,8 +536,11 @@
530536

531537
;; f32 and f64
532538

539+
;; Note that `x` and `y` are both constrained to being in registers here because
540+
;; x86 instructions with a memory operand will load 128 bits, not the desired
541+
;; bit width of 32 or 64 bits here.
533542
(rule 5 (lower (has_type (ty_scalar_float ty) (bxor _ x y)))
534-
(x64_xor_vector ty x y))
543+
(x64_xor_vector ty x (put_in_xmm y)))
535544

536545
;; SSE.
537546

@@ -624,7 +633,7 @@
624633
(unmasked Xmm (x64_psllw src (mov_rmi_to_xmm masked_amt)))
625634
(mask_addr SyntheticAmode (ishl_i8x16_mask masked_amt))
626635
(mask Reg (x64_movdqu_load mask_addr)))
627-
(sse_and $I8X16 unmasked (RegMem.Reg mask))))
636+
(x64_pand unmasked (RegMem.Reg mask))))
628637

629638
;; Get the address of the mask to use when fixing up the lanes that weren't
630639
;; correctly generated by the 16x8 shift.
@@ -728,9 +737,7 @@
728737
;; correct for half of the lanes; the others must be fixed up with
729738
;; the mask below.
730739
(unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm masked_amt))))
731-
(sse_and $I8X16
732-
unmasked
733-
(ushr_i8x16_mask masked_amt))))
740+
(x64_pand unmasked (ushr_i8x16_mask masked_amt))))
734741

735742
;; Get the address of the mask to use when fixing up the lanes that weren't
736743
;; correctly generated by the 16x8 shift.
@@ -2491,12 +2498,12 @@
24912498
(rule 1 (lower (has_type $I8X16 (popcnt _ src)))
24922499
(if-let true (has_ssse3))
24932500
(let ((low_mask XmmMem (emit_u128_le_const 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f))
2494-
(low_nibbles Xmm (sse_and $I8X16 src low_mask))
2501+
(low_nibbles Xmm (x64_pand src low_mask))
24952502
;; Note that this is a 16x8 shift, but that's OK; we mask
24962503
;; off anything that traverses from one byte to the next
24972504
;; with the low_mask below.
24982505
(shifted_src Xmm (x64_psrlw src (xmi_imm 4)))
2499-
(high_nibbles Xmm (sse_and $I8X16 shifted_src low_mask))
2506+
(high_nibbles Xmm (x64_pand shifted_src low_mask))
25002507
(lookup Xmm (x64_xmm_load_const $I8X16
25012508
(emit_u128_le_const 0x04030302_03020201_03020201_02010100)))
25022509
(bit_counts_low Xmm (x64_pshufb lookup low_nibbles))
Lines changed: 173 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,173 @@
1+
test compile precise-output
2+
target x86_64
3+
4+
function %f32_band(i64, f32) -> f32 {
5+
block0(v0: i64, v1: f32):
6+
v2 = load.f32 aligned v0
7+
v3 = band v1, v2
8+
return v3
9+
}
10+
11+
; VCode:
12+
; pushq %rbp
13+
; movq %rsp, %rbp
14+
; block0:
15+
; movss (%rdi), %xmm4
16+
; andps %xmm4, %xmm0
17+
; movq %rbp, %rsp
18+
; popq %rbp
19+
; retq
20+
;
21+
; Disassembled:
22+
; block0: ; offset 0x0
23+
; pushq %rbp
24+
; movq %rsp, %rbp
25+
; block1: ; offset 0x4
26+
; movss (%rdi), %xmm4 ; trap: heap_oob
27+
; andps %xmm4, %xmm0
28+
; movq %rbp, %rsp
29+
; popq %rbp
30+
; retq
31+
32+
33+
function %f64_band(i64, f64) -> f64 {
34+
block0(v0: i64, v1: f64):
35+
v2 = load.f64 aligned v0
36+
v3 = band v1, v2
37+
return v3
38+
}
39+
40+
; VCode:
41+
; pushq %rbp
42+
; movq %rsp, %rbp
43+
; block0:
44+
; movsd (%rdi), %xmm4
45+
; andpd %xmm4, %xmm0
46+
; movq %rbp, %rsp
47+
; popq %rbp
48+
; retq
49+
;
50+
; Disassembled:
51+
; block0: ; offset 0x0
52+
; pushq %rbp
53+
; movq %rsp, %rbp
54+
; block1: ; offset 0x4
55+
; movsd (%rdi), %xmm4 ; trap: heap_oob
56+
; andpd %xmm4, %xmm0
57+
; movq %rbp, %rsp
58+
; popq %rbp
59+
; retq
60+
61+
62+
function %f32_bor(i64, f32) -> f32 {
63+
block0(v0: i64, v1: f32):
64+
v2 = load.f32 aligned v0
65+
v3 = bor v1, v2
66+
return v3
67+
}
68+
69+
; VCode:
70+
; pushq %rbp
71+
; movq %rsp, %rbp
72+
; block0:
73+
; movss (%rdi), %xmm4
74+
; orps %xmm4, %xmm0
75+
; movq %rbp, %rsp
76+
; popq %rbp
77+
; retq
78+
;
79+
; Disassembled:
80+
; block0: ; offset 0x0
81+
; pushq %rbp
82+
; movq %rsp, %rbp
83+
; block1: ; offset 0x4
84+
; movss (%rdi), %xmm4 ; trap: heap_oob
85+
; orps %xmm4, %xmm0
86+
; movq %rbp, %rsp
87+
; popq %rbp
88+
; retq
89+
90+
function %f64_bor(i64, f64) -> f64 {
91+
block0(v0: i64, v1: f64):
92+
v2 = load.f64 aligned v0
93+
v3 = bor v1, v2
94+
return v3
95+
}
96+
97+
; VCode:
98+
; pushq %rbp
99+
; movq %rsp, %rbp
100+
; block0:
101+
; movsd (%rdi), %xmm4
102+
; orpd %xmm4, %xmm0
103+
; movq %rbp, %rsp
104+
; popq %rbp
105+
; retq
106+
;
107+
; Disassembled:
108+
; block0: ; offset 0x0
109+
; pushq %rbp
110+
; movq %rsp, %rbp
111+
; block1: ; offset 0x4
112+
; movsd (%rdi), %xmm4 ; trap: heap_oob
113+
; orpd %xmm4, %xmm0
114+
; movq %rbp, %rsp
115+
; popq %rbp
116+
; retq
117+
118+
function %f32_bxor(i64, f32) -> f32 {
119+
block0(v0: i64, v1: f32):
120+
v2 = load.f32 aligned v0
121+
v3 = bxor v1, v2
122+
return v3
123+
}
124+
125+
; VCode:
126+
; pushq %rbp
127+
; movq %rsp, %rbp
128+
; block0:
129+
; movss (%rdi), %xmm4
130+
; xorps %xmm4, %xmm0
131+
; movq %rbp, %rsp
132+
; popq %rbp
133+
; retq
134+
;
135+
; Disassembled:
136+
; block0: ; offset 0x0
137+
; pushq %rbp
138+
; movq %rsp, %rbp
139+
; block1: ; offset 0x4
140+
; movss (%rdi), %xmm4 ; trap: heap_oob
141+
; xorps %xmm4, %xmm0
142+
; movq %rbp, %rsp
143+
; popq %rbp
144+
; retq
145+
146+
function %f64_bxor(i64, f64) -> f64 {
147+
block0(v0: i64, v1: f64):
148+
v2 = load.f64 aligned v0
149+
v3 = bxor v1, v2
150+
return v3
151+
}
152+
153+
; VCode:
154+
; pushq %rbp
155+
; movq %rsp, %rbp
156+
; block0:
157+
; movsd (%rdi), %xmm4
158+
; xorpd %xmm4, %xmm0
159+
; movq %rbp, %rsp
160+
; popq %rbp
161+
; retq
162+
;
163+
; Disassembled:
164+
; block0: ; offset 0x0
165+
; pushq %rbp
166+
; movq %rsp, %rbp
167+
; block1: ; offset 0x4
168+
; movsd (%rdi), %xmm4 ; trap: heap_oob
169+
; xorpd %xmm4, %xmm0
170+
; movq %rbp, %rsp
171+
; popq %rbp
172+
; retq
173+

0 commit comments

Comments (0)