@@ -2021,13 +2021,15 @@ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
20212021/// registers, returns 1.
20222022static unsigned
20232023getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
2024+ Type *ScalarTy,
20242025 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
20252026 unsigned NumParts = TTI.getNumberOfParts(VecTy);
20262027 if (NumParts == 0 || NumParts >= Limit)
20272028 return 1;
20282029 unsigned Sz = getNumElements(VecTy);
2029- if (NumParts >= Sz || Sz % NumParts != 0 ||
2030- !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
2030+ unsigned ScalarSz = getNumElements(ScalarTy);
2031+ if (NumParts >= Sz || Sz % NumParts != 0 || (Sz / NumParts) % ScalarSz != 0 ||
2032+ !hasFullVectorsOrPowerOf2(TTI, ScalarTy, Sz / NumParts))
20312033 return 1;
20322034 return NumParts;
20332035}
@@ -3902,8 +3904,10 @@ class slpvectorizer::BoUpSLP {
39023904 SmallPtrSetImpl<Value *> &CheckedExtracts);
39033905
39043906 /// Estimates spill/reload cost from vector register pressure for \p E at the
3905- /// point of emitting its vector result type \p FinalVecTy.
3906- InstructionCost getVectorSpillReloadCost(const TreeEntry *E,
3907+ /// point of emitting its vector result type \p FinalVecTy. \p ScalarTy is the
3908+ /// scalar/slot type used to widen into \p VecTy/\p FinalVecTy and may itself
3909+ /// be a FixedVectorType in ReVec mode or an adjusted type due to MinBWs.
3910+ InstructionCost getVectorSpillReloadCost(const TreeEntry *E, Type *ScalarTy,
39073911 VectorType *VecTy,
39083912 VectorType *FinalVecTy,
39093913 TTI::TargetCostKind CostKind) const;
@@ -6594,7 +6598,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
65946598 if (!isValidElementType(ScalarTy))
65956599 return std::nullopt;
65966600 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6597- unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
6601+ unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, ScalarTy, NumScalars);
65986602 SmallVector<int> ExtractMask;
65996603 SmallVector<int> Mask;
66006604 SmallVector<SmallVector<const TreeEntry *>> Entries;
@@ -8159,8 +8163,10 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
81598163 }
81608164 }
81618165 if (Sz == 2 && TE.getVectorFactor() == 4 &&
8162- ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
8163- 2 * TE.getVectorFactor())) == 1)
8166+ ::getNumberOfParts(*TTI,
8167+ getWidenedType(getValueType(TE.Scalars.front()),
8168+ 2 * TE.getVectorFactor()),
8169+ getValueType(TE.Scalars.front())) == 1)
81648170 return std::nullopt;
81658171 if (TE.ReuseShuffleIndices.size() % Sz != 0)
81668172 return std::nullopt;
@@ -14429,7 +14435,8 @@ void BoUpSLP::transformNodes() {
1442914435 bool IsTwoRegisterSplat = true;
1443014436 if (IsSplat && VF == 2) {
1443114437 unsigned NumRegs2VF = ::getNumberOfParts(
14432- *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
14438+ *TTI, getWidenedType(getValueType(Slice.front()), 2 * VF),
14439+ getValueType(Slice.front()));
1443314440 IsTwoRegisterSplat = NumRegs2VF == 2;
1443414441 }
1443514442 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
@@ -15563,7 +15570,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
1556315570 }
1556415571 assert(!CommonMask.empty() && "Expected non-empty common mask.");
1556515572 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
15566- unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
15573+ unsigned NumParts =
15574+ ::getNumberOfParts(TTI, MaskVecTy, ScalarTy, Mask.size());
1556715575 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
1556815576 const auto *It = find_if(Mask, not_equal_to(PoisonMaskElem));
1556915577 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
@@ -15577,7 +15585,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
1557715585 }
1557815586 assert(!CommonMask.empty() && "Expected non-empty common mask.");
1557915587 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
15580- unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
15588+ unsigned NumParts =
15589+ ::getNumberOfParts(TTI, MaskVecTy, ScalarTy, Mask.size());
1558115590 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
1558215591 const auto *It = find_if(Mask, not_equal_to(PoisonMaskElem));
1558315592 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
@@ -15892,8 +15901,8 @@ unsigned BoUpSLP::getScaleToLoopIterations(const TreeEntry &TE, Value *Scalar,
1589215901}
1589315902
1589415903InstructionCost
15895-  BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, VectorType *VecTy,
15896- VectorType *FinalVecTy,
15904+  BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, Type *ScalarTy,
15905+                                  VectorType *VecTy, VectorType *FinalVecTy,
1589715906 TTI::TargetCostKind CostKind) const {
1589815907 InstructionCost SpillsReloads = 0;
1589915908
@@ -15919,7 +15928,8 @@ BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, VectorType *VecTy,
1591915928 PressureByClass[RegClass] += Parts;
1592015929 };
1592115930
15922- auto GetEntryVecTy = [&](const TreeEntry *TE) -> VectorType * {
15931+ auto GetEntryVecTy =
15932+ [&](const TreeEntry *TE) -> std::pair<Type *, VectorType *> {
1592315933 Type *ScalarTy = getValueType(TE->Scalars.front());
1592415934 auto BWIt = MinBWs.find(TE);
1592515935 if (BWIt != MinBWs.end()) {
@@ -15928,7 +15938,8 @@ BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, VectorType *VecTy,
1592815938 if (VTy)
1592915939 ScalarTy = getWidenedType(ScalarTy, VTy->getNumElements());
1593015940 }
15931- return getWidenedType(ScalarTy, TE->getVectorFactor());
15941+ return std::make_pair(ScalarTy,
15942+ getWidenedType(ScalarTy, TE->getVectorFactor()));
1593215943 };
1593315944
1593415945 if (E->State == TreeEntry::SplitVectorize) {
@@ -15937,8 +15948,8 @@ BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, VectorType *VecTy,
1593715948
1593815949 if (!CountedOpEntries.insert(OpTE).second)
1593915950 continue;
15940-      auto *OpVecTy = GetEntryVecTy(OpTE);
15941- const unsigned Parts = ::getNumberOfParts(*TTI, OpVecTy);
15951+ auto [ScalarTy, OpVecTy] = GetEntryVecTy(OpTE);
15952+      const unsigned Parts = ::getNumberOfParts(*TTI, OpVecTy, ScalarTy);
1594215953 if (Parts == 0)
1594315954 continue;
1594415955 const unsigned RC =
@@ -15951,8 +15962,8 @@ BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, VectorType *VecTy,
1595115962 SmallDenseMap<unsigned, unsigned> MaxOpPressureByClass;
1595215963 for (unsigned Idx : seq<unsigned>(E->getNumOperands())) {
1595315964 const TreeEntry *OpTE = getOperandEntry(E, Idx);
15954-    auto *OpVecTy = GetEntryVecTy(OpTE);
15955- const unsigned Parts = ::getNumberOfParts(*TTI, OpVecTy);
15965+ auto [ScalarTy, OpVecTy] = GetEntryVecTy(OpTE);
15966+    const unsigned Parts = ::getNumberOfParts(*TTI, OpVecTy, ScalarTy);
1595615967 if (Parts == 0)
1595715968 continue;
1595815969 const unsigned RC =
@@ -15978,7 +15989,7 @@ BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, VectorType *VecTy,
1597815989 if (!CountedOpEntries.insert(OpTE).second)
1597915990 continue;
1598015991 auto *OpVecTy = getWidenedType(Op->getType(), Ops.size());
15981- const unsigned Parts = ::getNumberOfParts(*TTI, OpVecTy);
15992+      const unsigned Parts = ::getNumberOfParts(*TTI, OpVecTy, Op->getType());
1598215993 if (Parts == 0)
1598315994 continue;
1598415995 const unsigned RC =
@@ -15988,13 +15999,14 @@ BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, VectorType *VecTy,
1598815999 }
1598916000
1599016001 if (E->getOpcode() != Instruction::Load) {
15991- const unsigned ResParts = ::getNumberOfParts(*TTI, VecTy);
16002+    const unsigned ResParts = ::getNumberOfParts(*TTI, VecTy, ScalarTy);
1599216003 if (ResParts != 0) {
1599316004 const unsigned RC = TTI->getRegisterClassForType(/*Vector=*/true, VecTy);
1599416005 AddPartsToClass(RC, ResParts);
1599516006 }
1599616007 if (VecTy != FinalVecTy) {
15997- const unsigned FinalResParts = ::getNumberOfParts(*TTI, FinalVecTy);
16008+ const unsigned FinalResParts =
16009+ ::getNumberOfParts(*TTI, FinalVecTy, ScalarTy);
1599816010 if (FinalResParts != 0) {
1599916011 const unsigned RC =
1600016012 TTI->getRegisterClassForType(/*Vector=*/true, FinalVecTy);
@@ -16052,7 +16064,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
1605216064 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
1605316065
1605416066 const InstructionCost SpillsReloads =
16055- getVectorSpillReloadCost(E, VecTy, FinalVecTy, CostKind);
16067+ getVectorSpillReloadCost(E, ScalarTy, VecTy, FinalVecTy, CostKind);
1605616068 if (E->isGather() || TransformedToGatherNodes.contains(E)) {
1605716069 if (allConstant(VL))
1605816070 return 0;
@@ -16342,7 +16354,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
1634216354 unsigned const NumElts = SrcVecTy->getNumElements();
1634316355 unsigned const NumScalars = VL.size();
1634416356
16345- unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
16357+ unsigned NumOfParts =
16358+ ::getNumberOfParts(*TTI, SrcVecTy, VL0->getOperand(1)->getType());
1634616359
1634716360 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
1634816361 unsigned OffsetBeg = *getElementIndex(VL.front());
@@ -21133,7 +21146,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
2113321146 SmallVector<SmallVector<const TreeEntry *>> Entries;
2113421147 Type *OrigScalarTy = GatheredScalars.front()->getType();
2113521148 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
21136- unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
21149+ unsigned NumParts =
21150+ ::getNumberOfParts(*TTI, VecTy, ScalarTy, GatheredScalars.size());
2113721151 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
2113821152 // Check for gathered extracts.
2113921153 bool Resized = false;
@@ -21166,8 +21180,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
2116621180 Resized = true;
2116721181 GatheredScalars.append(VF - GatheredScalars.size(),
2116821182 PoisonValue::get(OrigScalarTy));
21169- NumParts =
21170-              ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
21183+ NumParts = ::getNumberOfParts(
21184+              *TTI, getWidenedType(OrigScalarTy, VF), OrigScalarTy, VF);
2117121185 }
2117221186 }
2117321187 }
@@ -21395,9 +21409,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
2139521409 }
2139621410 }
2139721411 if (!GatherShuffles.empty()) {
21398- unsigned SliceSize =
21399-            getPartNumElems(E->Scalars.size(),
21400- ::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
21412+ unsigned SliceSize = getPartNumElems(
21413+ E->Scalars.size(),
21414+          ::getNumberOfParts(*TTI, VecTy, ScalarTy, E->Scalars.size()));
2140121415 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
2140221416 for (const auto [I, TEs] : enumerate(Entries)) {
2140321417 if (TEs.empty()) {
@@ -23631,10 +23645,13 @@ void BoUpSLP::optimizeGatherSequence() {
2363123645 // Check if the last undefs actually change the final number of used vector
2363223646 // registers.
2363323647 return SM1.size() - LastUndefsCnt > 1 &&
23634- ::getNumberOfParts(*TTI, SI1->getType()) ==
23648+ ::getNumberOfParts(*TTI, SI1->getType(),
23649+ SI1->getType()->getElementType()) ==
2363523650 ::getNumberOfParts(
23636- *TTI, getWidenedType(SI1->getType()->getElementType(),
23637- SM1.size() - LastUndefsCnt));
23651+ *TTI,
23652+ getWidenedType(SI1->getType()->getElementType(),
23653+ SM1.size() - LastUndefsCnt),
23654+ SI1->getType()->getElementType());
2363823655 };
2363923656 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
2364023657 // instructions. TODO: We can further optimize this scan if we split the
@@ -24948,12 +24965,14 @@ bool BoUpSLP::collectValuesToDemote(
2494824965 const unsigned VF = E.Scalars.size();
2494924966 Type *OrigScalarTy = E.Scalars.front()->getType();
2495024967 if (UniqueBases.size() <= 2 ||
24951- ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
24968+ ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF),
24969+ OrigScalarTy) >=
2495224970 ::getNumberOfParts(
2495324971 *TTI,
2495424972 getWidenedType(
2495524973 IntegerType::get(OrigScalarTy->getContext(), BitWidth),
24956- VF))) {
24974+ VF),
24975+ IntegerType::get(OrigScalarTy->getContext(), BitWidth))) {
2495724976 ToDemote.push_back(E.Idx);
2495824977 return true;
2495924978 }
@@ -25395,7 +25414,6 @@ void BoUpSLP::computeMinimumValueSizes() {
2539525414
2539625415 unsigned VF = E.getVectorFactor();
2539725416 Type *ScalarTy = E.Scalars.front()->getType();
25398- unsigned ScalarTyNumElements = getNumElements(ScalarTy);
2539925417 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
2540025418 if (!TreeRootIT)
2540125419 return 0u;
@@ -25404,8 +25422,8 @@ void BoUpSLP::computeMinimumValueSizes() {
2540425422 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
2540525423 return 0u;
2540625424
25407- unsigned NumParts = ::getNumberOfParts(
25408-        *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
25425+ unsigned NumParts =
25426+        ::getNumberOfParts(*TTI, getWidenedType(ScalarTy, VF), ScalarTy);
2540925427
2541025428 // The maximum bit width required to represent all the values that can be
2541125429 // demoted without loss of precision. It would be safe to truncate the roots
@@ -25479,9 +25497,11 @@ void BoUpSLP::computeMinimumValueSizes() {
2547925497 if (NumParts > 1 &&
2548025498 NumParts ==
2548125499 ::getNumberOfParts(
25482- *TTI, getWidenedType(IntegerType::get(F->getContext(),
25483- bit_ceil(MaxBitWidth)),
25484- VF)))
25500+ *TTI,
25501+ getWidenedType(
25502+ IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)),
25503+ VF),
25504+ IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth))))
2548525505 return 0u;
2548625506
2548725507 unsigned Opcode = E.getOpcode();
@@ -27818,14 +27838,14 @@ class HorizontalReduction {
2781827838 ReduxWidth =
2781927839 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
2782027840 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
27821- NumParts = ::getNumberOfParts(TTI, Tp);
27841+      NumParts = ::getNumberOfParts(TTI, Tp, ScalarTy);
2782227842 NumRegs =
2782327843 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
2782427844 while (NumParts > NumRegs) {
2782527845 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
2782627846 ReduxWidth = bit_floor(ReduxWidth - 1);
2782727847 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
27828- NumParts = ::getNumberOfParts(TTI, Tp);
27848+        NumParts = ::getNumberOfParts(TTI, Tp, ScalarTy);
2782927849 NumRegs =
2783027850 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
2783127851 }
0 commit comments