Skip to content

Commit 6c35bdb

Browse files
[SLP] Normalize copyable operand order to group loads for better vectorization
When building operands for entries with copyable elements, non-copyable lanes may have inconsistent operand order (e.g., some lanes have load,add while others have add,load for commutative ops). This prevents VLOperands::reorder() from grouping consecutive loads on one side, degrading downstream vectorization. Normalize in two steps during buildOperands: 1) Majority voting: swap lanes that are the exact inverse of the majority operand-type pattern. 2) Load preference: if the majority pattern has loads at OpIdx 1 (strict majority), swap to put loads at OpIdx 0, enabling vector load + copyable patterns. Reviewers: hiraditya, RKSimon Pull Request: #189181
1 parent 500e913 commit 6c35bdb

File tree

2 files changed

+38
-33
lines changed

2 files changed

+38
-33
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 26 additions & 7 deletions
```diff
@@ -12085,18 +12085,37 @@ class InstructionsCompatibilityAnalysis {
           }
         }
       }
-      // For commutative ops, swap lanes whose operand types are the
-      // exact inverse of the majority pattern, making the non-copyable
-      // lanes consistent.
-      if (BestCount > 0) {
-        for (auto [Idx, V] : enumerate(VL)) {
-          if (S.isCopyableElement(V) || isa<PoisonValue>(V))
-            continue;
+      // For commutative ops, normalize non-copyable lanes in two steps:
+      // 1) Swap lanes whose operand types are the exact inverse of the
+      //    majority pattern, making the non-copyable lanes consistent.
+      // 2) Independently, if a strict majority of non-copyable lanes
+      //    have loads at OpIdx 1, swap those lanes to put loads at
+      //    OpIdx 0 for better downstream vectorization.
+      unsigned LAt0 = 0, LAt1 = 0, TotalNC = 0;
+      for (auto [Idx, V] : enumerate(VL)) {
+        if (S.isCopyableElement(V) || isa<PoisonValue>(V))
+          continue;
+        // Step 1: swap exact-inverse lanes.
+        if (BestCount > 0) {
           unsigned ID0 = Operands[0][Idx]->getValueID();
           unsigned ID1 = Operands[1][Idx]->getValueID();
           if (ID0 == MajID1 && ID1 == MajID0)
             std::swap(Operands[0][Idx], Operands[1][Idx]);
         }
+        ++TotalNC;
+        LAt0 += isa<LoadInst>(Operands[0][Idx]);
+        LAt1 += isa<LoadInst>(Operands[1][Idx]);
+      }
+      // Step 2: if most non-copyable lanes have loads at OpIdx 1,
+      // swap those lanes to put loads at OpIdx 0.
+      if (TotalNC > 1 && LAt1 > LAt0 && LAt1 * 2 > TotalNC) {
+        for (auto [Idx, V] : enumerate(VL)) {
+          if (S.isCopyableElement(V) || isa<PoisonValue>(V))
+            continue;
+          if (!isa<LoadInst>(Operands[0][Idx]) &&
+              isa<LoadInst>(Operands[1][Idx]))
+            std::swap(Operands[0][Idx], Operands[1][Idx]);
+        }
       }
     } else {
       buildOriginalOperands(S, VL, Operands);
```

llvm/test/Transforms/SLPVectorizer/X86/copyable_reorder.ll

Lines changed: 12 additions & 26 deletions
```diff
@@ -177,21 +177,14 @@ entry:
 define void @test_add_udiv_commuted(ptr %arr1, ptr %arr2, i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: @test_add_udiv_commuted(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[GEP1_2:%.*]] = getelementptr i32, ptr [[ARR1:%.*]], i32 2
-; CHECK-NEXT: [[GEP1_3:%.*]] = getelementptr i32, ptr [[ARR1]], i32 3
-; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[GEP1_2]], align 4
-; CHECK-NEXT: [[V3:%.*]] = load i32, ptr [[GEP1_3]], align 4
-; CHECK-NEXT: [[Y2:%.*]] = add nsw i32 [[A2:%.*]], 42
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARR1]], align 4
-; CHECK-NEXT: [[RES2:%.*]] = udiv i32 [[V2]], [[Y2]]
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A0:%.*]], i32 0
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARR1:%.*]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, i32 [[A2:%.*]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP4]], <i32 1, i32 1, i32 42, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, i32 [[A0:%.*]], i32 0
 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1
 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[A3:%.*]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[RES2]], i32 2
-; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP4]], <i32 1146, i32 146, i32 0, i32 0>
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, i32 [[V3]], i32 3
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> <i32 1146, i32 146, i32 0, i32 0>, [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = udiv <4 x i32> [[TMP0]], [[TMP6]]
 ; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP5]], [[TMP8]]
 ; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr [[ARR2:%.*]], align 4
 ; CHECK-NEXT: ret void
@@ -348,21 +341,14 @@ entry:
 define void @test_add_udiv_sub_commuted(ptr %arr1, ptr %arr2, i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: @test_add_udiv_sub_commuted(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[GEP1_2:%.*]] = getelementptr i32, ptr [[ARR1:%.*]], i32 2
-; CHECK-NEXT: [[GEP1_3:%.*]] = getelementptr i32, ptr [[ARR1]], i32 3
-; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[GEP1_2]], align 4
-; CHECK-NEXT: [[V3:%.*]] = load i32, ptr [[GEP1_3]], align 4
-; CHECK-NEXT: [[Y2:%.*]] = sub i32 [[A2:%.*]], 42
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARR1]], align 4
-; CHECK-NEXT: [[RES2:%.*]] = udiv i32 [[V2]], [[Y2]]
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A0:%.*]], i32 0
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARR1:%.*]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> <i32 1, i32 1, i32 poison, i32 1>, i32 [[A2:%.*]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP4]], <i32 0, i32 0, i32 42, i32 0>
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, i32 [[A0:%.*]], i32 0
 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1
 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[A3:%.*]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[RES2]], i32 2
-; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP4]], <i32 1146, i32 146, i32 0, i32 0>
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, i32 [[V3]], i32 3
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> <i32 1146, i32 146, i32 0, i32 0>, [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = udiv <4 x i32> [[TMP0]], [[TMP6]]
 ; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP5]], [[TMP8]]
 ; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr [[ARR2:%.*]], align 4
 ; CHECK-NEXT: ret void
```

0 commit comments

Comments
 (0)