Skip to content

Commit 1d6c9b8

Browse files
[SLP][REVEC] Honor slot type when computing NumberOfParts
The getNumberOfParts() helper split VecTy without accounting for the fact that, in REVEC mode, each slot is itself a FixedVectorType; as a result, the computed NumParts could place a part boundary in the middle of a slot. Fix this by adding an explicit ScalarTy argument, requiring that the per-part element count (Sz / NumParts) be a multiple of getNumElements(ScalarTy), and using ScalarTy (rather than VecTy's element type) in the hasFullVectorsOrPowerOf2 check. For non-REVEC callers, ScalarSz == 1, so behavior is unchanged. Fixes #192963. Reviewers: (list not captured). Pull Request: #193085
1 parent a80dd15 commit 1d6c9b8

3 files changed

Lines changed: 142 additions & 91 deletions

File tree

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 62 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -2021,13 +2021,15 @@ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
20212021
/// registers, returns 1.
20222022
static unsigned
20232023
getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
2024+
Type *ScalarTy,
20242025
const unsigned Limit = std::numeric_limits<unsigned>::max()) {
20252026
unsigned NumParts = TTI.getNumberOfParts(VecTy);
20262027
if (NumParts == 0 || NumParts >= Limit)
20272028
return 1;
20282029
unsigned Sz = getNumElements(VecTy);
2029-
if (NumParts >= Sz || Sz % NumParts != 0 ||
2030-
!hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
2030+
unsigned ScalarSz = getNumElements(ScalarTy);
2031+
if (NumParts >= Sz || Sz % NumParts != 0 || (Sz / NumParts) % ScalarSz != 0 ||
2032+
!hasFullVectorsOrPowerOf2(TTI, ScalarTy, Sz / NumParts))
20312033
return 1;
20322034
return NumParts;
20332035
}
@@ -3902,8 +3904,10 @@ class slpvectorizer::BoUpSLP {
39023904
SmallPtrSetImpl<Value *> &CheckedExtracts);
39033905

39043906
/// Estimates spill/reload cost from vector register pressure for \p E at the
3905-
/// point of emitting its vector result type \p FinalVecTy.
3906-
InstructionCost getVectorSpillReloadCost(const TreeEntry *E,
3907+
/// point of emitting its vector result type \p FinalVecTy. \p ScalarTy is the
3908+
/// scalar/slot type used to widen into \p VecTy/\p FinalVecTy and may itself
3909+
/// be a FixedVectorType in ReVec mode or an adjusted type due to MinBWs.
3910+
InstructionCost getVectorSpillReloadCost(const TreeEntry *E, Type *ScalarTy,
39073911
VectorType *VecTy,
39083912
VectorType *FinalVecTy,
39093913
TTI::TargetCostKind CostKind) const;
@@ -6594,7 +6598,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
65946598
if (!isValidElementType(ScalarTy))
65956599
return std::nullopt;
65966600
auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6597-
unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
6601+
unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, ScalarTy, NumScalars);
65986602
SmallVector<int> ExtractMask;
65996603
SmallVector<int> Mask;
66006604
SmallVector<SmallVector<const TreeEntry *>> Entries;
@@ -8159,8 +8163,10 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
81598163
}
81608164
}
81618165
if (Sz == 2 && TE.getVectorFactor() == 4 &&
8162-
::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
8163-
2 * TE.getVectorFactor())) == 1)
8166+
::getNumberOfParts(*TTI,
8167+
getWidenedType(getValueType(TE.Scalars.front()),
8168+
2 * TE.getVectorFactor()),
8169+
getValueType(TE.Scalars.front())) == 1)
81648170
return std::nullopt;
81658171
if (TE.ReuseShuffleIndices.size() % Sz != 0)
81668172
return std::nullopt;
@@ -14429,7 +14435,8 @@ void BoUpSLP::transformNodes() {
1442914435
bool IsTwoRegisterSplat = true;
1443014436
if (IsSplat && VF == 2) {
1443114437
unsigned NumRegs2VF = ::getNumberOfParts(
14432-
*TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
14438+
*TTI, getWidenedType(getValueType(Slice.front()), 2 * VF),
14439+
getValueType(Slice.front()));
1443314440
IsTwoRegisterSplat = NumRegs2VF == 2;
1443414441
}
1443514442
if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
@@ -15563,7 +15570,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
1556315570
}
1556415571
assert(!CommonMask.empty() && "Expected non-empty common mask.");
1556515572
auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
15566-
unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
15573+
unsigned NumParts =
15574+
::getNumberOfParts(TTI, MaskVecTy, ScalarTy, Mask.size());
1556715575
unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
1556815576
const auto *It = find_if(Mask, not_equal_to(PoisonMaskElem));
1556915577
unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
@@ -15577,7 +15585,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
1557715585
}
1557815586
assert(!CommonMask.empty() && "Expected non-empty common mask.");
1557915587
auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
15580-
unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
15588+
unsigned NumParts =
15589+
::getNumberOfParts(TTI, MaskVecTy, ScalarTy, Mask.size());
1558115590
unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
1558215591
const auto *It = find_if(Mask, not_equal_to(PoisonMaskElem));
1558315592
unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
@@ -15892,8 +15901,8 @@ unsigned BoUpSLP::getScaleToLoopIterations(const TreeEntry &TE, Value *Scalar,
1589215901
}
1589315902

1589415903
InstructionCost
15895-
BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, VectorType *VecTy,
15896-
VectorType *FinalVecTy,
15904+
BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, Type *ScalarTy,
15905+
VectorType *VecTy, VectorType *FinalVecTy,
1589715906
TTI::TargetCostKind CostKind) const {
1589815907
InstructionCost SpillsReloads = 0;
1589915908

@@ -15919,7 +15928,8 @@ BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, VectorType *VecTy,
1591915928
PressureByClass[RegClass] += Parts;
1592015929
};
1592115930

15922-
auto GetEntryVecTy = [&](const TreeEntry *TE) -> VectorType * {
15931+
auto GetEntryVecTy =
15932+
[&](const TreeEntry *TE) -> std::pair<Type *, VectorType *> {
1592315933
Type *ScalarTy = getValueType(TE->Scalars.front());
1592415934
auto BWIt = MinBWs.find(TE);
1592515935
if (BWIt != MinBWs.end()) {
@@ -15928,7 +15938,8 @@ BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, VectorType *VecTy,
1592815938
if (VTy)
1592915939
ScalarTy = getWidenedType(ScalarTy, VTy->getNumElements());
1593015940
}
15931-
return getWidenedType(ScalarTy, TE->getVectorFactor());
15941+
return std::make_pair(ScalarTy,
15942+
getWidenedType(ScalarTy, TE->getVectorFactor()));
1593215943
};
1593315944

1593415945
if (E->State == TreeEntry::SplitVectorize) {
@@ -15937,8 +15948,8 @@ BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, VectorType *VecTy,
1593715948

1593815949
if (!CountedOpEntries.insert(OpTE).second)
1593915950
continue;
15940-
auto *OpVecTy = GetEntryVecTy(OpTE);
15941-
const unsigned Parts = ::getNumberOfParts(*TTI, OpVecTy);
15951+
auto [ScalarTy, OpVecTy] = GetEntryVecTy(OpTE);
15952+
const unsigned Parts = ::getNumberOfParts(*TTI, OpVecTy, ScalarTy);
1594215953
if (Parts == 0)
1594315954
continue;
1594415955
const unsigned RC =
@@ -15951,8 +15962,8 @@ BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, VectorType *VecTy,
1595115962
SmallDenseMap<unsigned, unsigned> MaxOpPressureByClass;
1595215963
for (unsigned Idx : seq<unsigned>(E->getNumOperands())) {
1595315964
const TreeEntry *OpTE = getOperandEntry(E, Idx);
15954-
auto *OpVecTy = GetEntryVecTy(OpTE);
15955-
const unsigned Parts = ::getNumberOfParts(*TTI, OpVecTy);
15965+
auto [ScalarTy, OpVecTy] = GetEntryVecTy(OpTE);
15966+
const unsigned Parts = ::getNumberOfParts(*TTI, OpVecTy, ScalarTy);
1595615967
if (Parts == 0)
1595715968
continue;
1595815969
const unsigned RC =
@@ -15978,7 +15989,7 @@ BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, VectorType *VecTy,
1597815989
if (!CountedOpEntries.insert(OpTE).second)
1597915990
continue;
1598015991
auto *OpVecTy = getWidenedType(Op->getType(), Ops.size());
15981-
const unsigned Parts = ::getNumberOfParts(*TTI, OpVecTy);
15992+
const unsigned Parts = ::getNumberOfParts(*TTI, OpVecTy, Op->getType());
1598215993
if (Parts == 0)
1598315994
continue;
1598415995
const unsigned RC =
@@ -15988,13 +15999,14 @@ BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, VectorType *VecTy,
1598815999
}
1598916000

1599016001
if (E->getOpcode() != Instruction::Load) {
15991-
const unsigned ResParts = ::getNumberOfParts(*TTI, VecTy);
16002+
const unsigned ResParts = ::getNumberOfParts(*TTI, VecTy, ScalarTy);
1599216003
if (ResParts != 0) {
1599316004
const unsigned RC = TTI->getRegisterClassForType(/*Vector=*/true, VecTy);
1599416005
AddPartsToClass(RC, ResParts);
1599516006
}
1599616007
if (VecTy != FinalVecTy) {
15997-
const unsigned FinalResParts = ::getNumberOfParts(*TTI, FinalVecTy);
16008+
const unsigned FinalResParts =
16009+
::getNumberOfParts(*TTI, FinalVecTy, ScalarTy);
1599816010
if (FinalResParts != 0) {
1599916011
const unsigned RC =
1600016012
TTI->getRegisterClassForType(/*Vector=*/true, FinalVecTy);
@@ -16052,7 +16064,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
1605216064
auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
1605316065

1605416066
const InstructionCost SpillsReloads =
16055-
getVectorSpillReloadCost(E, VecTy, FinalVecTy, CostKind);
16067+
getVectorSpillReloadCost(E, ScalarTy, VecTy, FinalVecTy, CostKind);
1605616068
if (E->isGather() || TransformedToGatherNodes.contains(E)) {
1605716069
if (allConstant(VL))
1605816070
return 0;
@@ -16342,7 +16354,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
1634216354
unsigned const NumElts = SrcVecTy->getNumElements();
1634316355
unsigned const NumScalars = VL.size();
1634416356

16345-
unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
16357+
unsigned NumOfParts =
16358+
::getNumberOfParts(*TTI, SrcVecTy, VL0->getOperand(1)->getType());
1634616359

1634716360
SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
1634816361
unsigned OffsetBeg = *getElementIndex(VL.front());
@@ -21133,7 +21146,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
2113321146
SmallVector<SmallVector<const TreeEntry *>> Entries;
2113421147
Type *OrigScalarTy = GatheredScalars.front()->getType();
2113521148
auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
21136-
unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
21149+
unsigned NumParts =
21150+
::getNumberOfParts(*TTI, VecTy, ScalarTy, GatheredScalars.size());
2113721151
if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
2113821152
// Check for gathered extracts.
2113921153
bool Resized = false;
@@ -21166,8 +21180,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
2116621180
Resized = true;
2116721181
GatheredScalars.append(VF - GatheredScalars.size(),
2116821182
PoisonValue::get(OrigScalarTy));
21169-
NumParts =
21170-
::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
21183+
NumParts = ::getNumberOfParts(
21184+
*TTI, getWidenedType(OrigScalarTy, VF), OrigScalarTy, VF);
2117121185
}
2117221186
}
2117321187
}
@@ -21395,9 +21409,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
2139521409
}
2139621410
}
2139721411
if (!GatherShuffles.empty()) {
21398-
unsigned SliceSize =
21399-
getPartNumElems(E->Scalars.size(),
21400-
::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
21412+
unsigned SliceSize = getPartNumElems(
21413+
E->Scalars.size(),
21414+
::getNumberOfParts(*TTI, VecTy, ScalarTy, E->Scalars.size()));
2140121415
SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
2140221416
for (const auto [I, TEs] : enumerate(Entries)) {
2140321417
if (TEs.empty()) {
@@ -23631,10 +23645,13 @@ void BoUpSLP::optimizeGatherSequence() {
2363123645
// Check if the last undefs actually change the final number of used vector
2363223646
// registers.
2363323647
return SM1.size() - LastUndefsCnt > 1 &&
23634-
::getNumberOfParts(*TTI, SI1->getType()) ==
23648+
::getNumberOfParts(*TTI, SI1->getType(),
23649+
SI1->getType()->getElementType()) ==
2363523650
::getNumberOfParts(
23636-
*TTI, getWidenedType(SI1->getType()->getElementType(),
23637-
SM1.size() - LastUndefsCnt));
23651+
*TTI,
23652+
getWidenedType(SI1->getType()->getElementType(),
23653+
SM1.size() - LastUndefsCnt),
23654+
SI1->getType()->getElementType());
2363823655
};
2363923656
// Perform O(N^2) search over the gather/shuffle sequences and merge identical
2364023657
// instructions. TODO: We can further optimize this scan if we split the
@@ -24948,12 +24965,14 @@ bool BoUpSLP::collectValuesToDemote(
2494824965
const unsigned VF = E.Scalars.size();
2494924966
Type *OrigScalarTy = E.Scalars.front()->getType();
2495024967
if (UniqueBases.size() <= 2 ||
24951-
::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
24968+
::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF),
24969+
OrigScalarTy) >=
2495224970
::getNumberOfParts(
2495324971
*TTI,
2495424972
getWidenedType(
2495524973
IntegerType::get(OrigScalarTy->getContext(), BitWidth),
24956-
VF))) {
24974+
VF),
24975+
IntegerType::get(OrigScalarTy->getContext(), BitWidth))) {
2495724976
ToDemote.push_back(E.Idx);
2495824977
return true;
2495924978
}
@@ -25395,7 +25414,6 @@ void BoUpSLP::computeMinimumValueSizes() {
2539525414

2539625415
unsigned VF = E.getVectorFactor();
2539725416
Type *ScalarTy = E.Scalars.front()->getType();
25398-
unsigned ScalarTyNumElements = getNumElements(ScalarTy);
2539925417
auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
2540025418
if (!TreeRootIT)
2540125419
return 0u;
@@ -25404,8 +25422,8 @@ void BoUpSLP::computeMinimumValueSizes() {
2540425422
[&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
2540525423
return 0u;
2540625424

25407-
unsigned NumParts = ::getNumberOfParts(
25408-
*TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
25425+
unsigned NumParts =
25426+
::getNumberOfParts(*TTI, getWidenedType(ScalarTy, VF), ScalarTy);
2540925427

2541025428
// The maximum bit width required to represent all the values that can be
2541125429
// demoted without loss of precision. It would be safe to truncate the roots
@@ -25479,9 +25497,11 @@ void BoUpSLP::computeMinimumValueSizes() {
2547925497
if (NumParts > 1 &&
2548025498
NumParts ==
2548125499
::getNumberOfParts(
25482-
*TTI, getWidenedType(IntegerType::get(F->getContext(),
25483-
bit_ceil(MaxBitWidth)),
25484-
VF)))
25500+
*TTI,
25501+
getWidenedType(
25502+
IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)),
25503+
VF),
25504+
IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth))))
2548525505
return 0u;
2548625506

2548725507
unsigned Opcode = E.getOpcode();
@@ -27818,14 +27838,14 @@ class HorizontalReduction {
2781827838
ReduxWidth =
2781927839
getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
2782027840
VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
27821-
NumParts = ::getNumberOfParts(TTI, Tp);
27841+
NumParts = ::getNumberOfParts(TTI, Tp, ScalarTy);
2782227842
NumRegs =
2782327843
TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
2782427844
while (NumParts > NumRegs) {
2782527845
assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
2782627846
ReduxWidth = bit_floor(ReduxWidth - 1);
2782727847
VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
27828-
NumParts = ::getNumberOfParts(TTI, Tp);
27848+
NumParts = ::getNumberOfParts(TTI, Tp, ScalarTy);
2782927849
NumRegs =
2783027850
TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
2783127851
}

0 commit comments

Comments
 (0)