Skip to content

Commit d3f444c

Browse files
authored
x64: Fix sinking loads into band/bor/bxor (#13071)
This commit fixes the x64 backend in Cranelift when sinking loads into band/bor/bxor instructions. If this happens for scalar types like `f32` and `f64` this means that the generated instruction will load too many bytes, similar to #13011 for example. These rules aren't reachable from WebAssembly but are still good to have fixed.
1 parent 40da1d8 commit d3f444c

File tree

2 files changed

+189
-9
lines changed

2 files changed

+189
-9
lines changed

cranelift/codegen/src/isa/x64/lower.isle

Lines changed: 16 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -335,8 +335,11 @@
335335

336336
;; f32 and f64
337337

338+
;; Note that `x` and `y` are both constrained to being in registers here because
339+
;; x86 instructions with a memory operand will load 128 bits, not the desired
340+
;; bit width of 32 or 64 bits here.
338341
(rule 5 (lower (has_type (ty_scalar_float ty) (band _ x y)))
339-
(sse_and ty x y))
342+
(sse_and ty x (put_in_xmm y)))
340343

341344
;; SSE.
342345

@@ -459,8 +462,11 @@
459462

460463
;; f32 and f64
461464

465+
;; Note that `x` and `y` are both constrained to being in registers here because
466+
;; x86 instructions with a memory operand will load 128 bits, not the desired
467+
;; bit width of 32 or 64 bits here.
462468
(rule 5 (lower (has_type (ty_scalar_float ty) (bor _ x y)))
463-
(sse_or ty x y))
469+
(sse_or ty x (put_in_xmm y)))
464470

465471
;; SSE.
466472

@@ -530,8 +536,11 @@
530536

531537
;; f32 and f64
532538

539+
;; Note that `x` and `y` are both constrained to being in registers here because
540+
;; x86 instructions with a memory operand will load 128 bits, not the desired
541+
;; bit width of 32 or 64 bits here.
533542
(rule 5 (lower (has_type (ty_scalar_float ty) (bxor _ x y)))
534-
(x64_xor_vector ty x y))
543+
(x64_xor_vector ty x (put_in_xmm y)))
535544

536545
;; SSE.
537546

@@ -624,7 +633,7 @@
624633
(unmasked Xmm (x64_psllw src (mov_rmi_to_xmm masked_amt)))
625634
(mask_addr SyntheticAmode (ishl_i8x16_mask masked_amt))
626635
(mask Reg (x64_movdqu_load mask_addr)))
627-
(sse_and $I8X16 unmasked (RegMem.Reg mask))))
636+
(x64_pand unmasked (RegMem.Reg mask))))
628637

629638
;; Get the address of the mask to use when fixing up the lanes that weren't
630639
;; correctly generated by the 16x8 shift.
@@ -728,9 +737,7 @@
728737
;; correct for half of the lanes; the others must be fixed up with
729738
;; the mask below.
730739
(unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm masked_amt))))
731-
(sse_and $I8X16
732-
unmasked
733-
(ushr_i8x16_mask masked_amt))))
740+
(x64_pand unmasked (ushr_i8x16_mask masked_amt))))
734741

735742
;; Get the address of the mask to use when fixing up the lanes that weren't
736743
;; correctly generated by the 16x8 shift.
@@ -2491,12 +2498,12 @@
24912498
(rule 1 (lower (has_type $I8X16 (popcnt _ src)))
24922499
(if-let true (has_ssse3))
24932500
(let ((low_mask XmmMem (emit_u128_le_const 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f))
2494-
(low_nibbles Xmm (sse_and $I8X16 src low_mask))
2501+
(low_nibbles Xmm (x64_pand src low_mask))
24952502
;; Note that this is a 16x8 shift, but that's OK; we mask
24962503
;; off anything that traverses from one byte to the next
24972504
;; with the low_mask below.
24982505
(shifted_src Xmm (x64_psrlw src (xmi_imm 4)))
2499-
(high_nibbles Xmm (sse_and $I8X16 shifted_src low_mask))
2506+
(high_nibbles Xmm (x64_pand shifted_src low_mask))
25002507
(lookup Xmm (x64_xmm_load_const $I8X16
25012508
(emit_u128_le_const 0x04030302_03020201_03020201_02010100)))
25022509
(bit_counts_low Xmm (x64_pshufb lookup low_nibbles))
Lines changed: 173 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,173 @@
1+
test compile precise-output
2+
target x86_64
3+
4+
function %f32_band(i64, f32) -> f32 {
5+
block0(v0: i64, v1: f32):
6+
v2 = load.f32 aligned v0
7+
v3 = band v1, v2
8+
return v3
9+
}
10+
11+
; VCode:
12+
; pushq %rbp
13+
; movq %rsp, %rbp
14+
; block0:
15+
; movss (%rdi), %xmm4
16+
; andps %xmm4, %xmm0
17+
; movq %rbp, %rsp
18+
; popq %rbp
19+
; retq
20+
;
21+
; Disassembled:
22+
; block0: ; offset 0x0
23+
; pushq %rbp
24+
; movq %rsp, %rbp
25+
; block1: ; offset 0x4
26+
; movss (%rdi), %xmm4 ; trap: heap_oob
27+
; andps %xmm4, %xmm0
28+
; movq %rbp, %rsp
29+
; popq %rbp
30+
; retq
31+
32+
33+
function %f64_band(i64, f64) -> f64 {
34+
block0(v0: i64, v1: f64):
35+
v2 = load.f64 aligned v0
36+
v3 = band v1, v2
37+
return v3
38+
}
39+
40+
; VCode:
41+
; pushq %rbp
42+
; movq %rsp, %rbp
43+
; block0:
44+
; movsd (%rdi), %xmm4
45+
; andpd %xmm4, %xmm0
46+
; movq %rbp, %rsp
47+
; popq %rbp
48+
; retq
49+
;
50+
; Disassembled:
51+
; block0: ; offset 0x0
52+
; pushq %rbp
53+
; movq %rsp, %rbp
54+
; block1: ; offset 0x4
55+
; movsd (%rdi), %xmm4 ; trap: heap_oob
56+
; andpd %xmm4, %xmm0
57+
; movq %rbp, %rsp
58+
; popq %rbp
59+
; retq
60+
61+
62+
function %f32_bor(i64, f32) -> f32 {
63+
block0(v0: i64, v1: f32):
64+
v2 = load.f32 aligned v0
65+
v3 = bor v1, v2
66+
return v3
67+
}
68+
69+
; VCode:
70+
; pushq %rbp
71+
; movq %rsp, %rbp
72+
; block0:
73+
; movss (%rdi), %xmm4
74+
; orps %xmm4, %xmm0
75+
; movq %rbp, %rsp
76+
; popq %rbp
77+
; retq
78+
;
79+
; Disassembled:
80+
; block0: ; offset 0x0
81+
; pushq %rbp
82+
; movq %rsp, %rbp
83+
; block1: ; offset 0x4
84+
; movss (%rdi), %xmm4 ; trap: heap_oob
85+
; orps %xmm4, %xmm0
86+
; movq %rbp, %rsp
87+
; popq %rbp
88+
; retq
89+
90+
function %f64_bor(i64, f64) -> f64 {
91+
block0(v0: i64, v1: f64):
92+
v2 = load.f64 aligned v0
93+
v3 = bor v1, v2
94+
return v3
95+
}
96+
97+
; VCode:
98+
; pushq %rbp
99+
; movq %rsp, %rbp
100+
; block0:
101+
; movsd (%rdi), %xmm4
102+
; orpd %xmm4, %xmm0
103+
; movq %rbp, %rsp
104+
; popq %rbp
105+
; retq
106+
;
107+
; Disassembled:
108+
; block0: ; offset 0x0
109+
; pushq %rbp
110+
; movq %rsp, %rbp
111+
; block1: ; offset 0x4
112+
; movsd (%rdi), %xmm4 ; trap: heap_oob
113+
; orpd %xmm4, %xmm0
114+
; movq %rbp, %rsp
115+
; popq %rbp
116+
; retq
117+
118+
function %f32_bxor(i64, f32) -> f32 {
119+
block0(v0: i64, v1: f32):
120+
v2 = load.f32 aligned v0
121+
v3 = bxor v1, v2
122+
return v3
123+
}
124+
125+
; VCode:
126+
; pushq %rbp
127+
; movq %rsp, %rbp
128+
; block0:
129+
; movss (%rdi), %xmm4
130+
; xorps %xmm4, %xmm0
131+
; movq %rbp, %rsp
132+
; popq %rbp
133+
; retq
134+
;
135+
; Disassembled:
136+
; block0: ; offset 0x0
137+
; pushq %rbp
138+
; movq %rsp, %rbp
139+
; block1: ; offset 0x4
140+
; movss (%rdi), %xmm4 ; trap: heap_oob
141+
; xorps %xmm4, %xmm0
142+
; movq %rbp, %rsp
143+
; popq %rbp
144+
; retq
145+
146+
function %f64_bxor(i64, f64) -> f64 {
147+
block0(v0: i64, v1: f64):
148+
v2 = load.f64 aligned v0
149+
v3 = bxor v1, v2
150+
return v3
151+
}
152+
153+
; VCode:
154+
; pushq %rbp
155+
; movq %rsp, %rbp
156+
; block0:
157+
; movsd (%rdi), %xmm4
158+
; xorpd %xmm4, %xmm0
159+
; movq %rbp, %rsp
160+
; popq %rbp
161+
; retq
162+
;
163+
; Disassembled:
164+
; block0: ; offset 0x0
165+
; pushq %rbp
166+
; movq %rsp, %rbp
167+
; block1: ; offset 0x4
168+
; movsd (%rdi), %xmm4 ; trap: heap_oob
169+
; xorpd %xmm4, %xmm0
170+
; movq %rbp, %rsp
171+
; popq %rbp
172+
; retq
173+

0 commit comments

Comments (0)