[llvm] d7a40a4 - Revert "[SLP]Add final resize to ShuffleCostEstimator::finalize member function and basic add member functions."

Tue Apr 18 10:41:48 PDT 2023

Author: Alexey Bataev
Date: 2023-04-18T10:41:00-07:00
New Revision: d7a40a447f1ed0294c6bc8fe82b6b2460e31de06

URL: https://github.com/llvm/llvm-project/commit/d7a40a447f1ed0294c6bc8fe82b6b2460e31de06
DIFF: https://github.com/llvm/llvm-project/commit/d7a40a447f1ed0294c6bc8fe82b6b2460e31de06.diff

LOG: Revert "[SLP]Add final resize to ShuffleCostEstimator::finalize member function and basic add member functions."

This reverts commit cd341f3f4878137d1c9e7a05c4c3a7bd8ff216dc to fix
a crash revealed by buildbot https://lab.llvm.org/buildbot#builders/124/builds/7108.

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 163e3581ea7be..a1fabec2a8d7c 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -946,14 +946,9 @@ static bool isSimple(Instruction *I) {
 }
 
 /// Shuffles \p Mask in accordance with the given \p SubMask.
-/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
-/// one but two input vectors.
-static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
-                    bool ExtendingManyInputs = false) {
+static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
   if (SubMask.empty())
     return;
-  assert((!ExtendingManyInputs || SubMask.size() > Mask.size()) &&
-         "SubMask with many inputs support must be larger than the mask.");
   if (Mask.empty()) {
     Mask.append(SubMask.begin(), SubMask.end());
     return;
@@ -961,9 +956,8 @@ static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
   SmallVector<int> NewMask(SubMask.size(), UndefMaskElem);
   int TermValue = std::min(Mask.size(), SubMask.size());
   for (int I = 0, E = SubMask.size(); I < E; ++I) {
-    if ((!ExtendingManyInputs &&
-         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)) ||
-        SubMask[I] == UndefMaskElem)
+    if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
+        Mask[SubMask[I]] >= TermValue)
       continue;
     NewMask[I] = Mask[SubMask[I]];
   }
@@ -6794,8 +6788,6 @@ class BaseShuffleAnalysis {
 /// analysis/transformations.
 class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
   bool IsFinalized = false;
-  SmallVector<int> CommonMask;
-  SmallVector<Value *, 2> InVectors;
   const TargetTransformInfo &TTI;
   InstructionCost Cost = 0;
   ArrayRef<Value *> VectorizedVals;
@@ -7017,53 +7009,19 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
                                    VecTy, std::nullopt, CostKind, 0, EEVTy);
       }
     }
-    InVectors.assign(1, VecBase);
     return VecBase;
   }
-  void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef<int> Mask) {
-    CommonMask.assign(Mask.begin(), Mask.end());
-    InVectors.assign(
-        2, Constant::getNullValue(FixedVectorType::get(
-               E1->Scalars.front()->getType(),
-               std::max(E1->getVectorFactor(), E2->getVectorFactor()))));
-  }
-  void add(const TreeEntry *E1, ArrayRef<int> Mask) {
-    CommonMask.assign(Mask.begin(), Mask.end());
-    InVectors.assign(
-        1, Constant::getNullValue(FixedVectorType::get(
-               E1->Scalars.front()->getType(), E1->getVectorFactor())));
-  }
   void gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
     Cost += getBuildVectorCost(VL, Root);
-    if (!Root) {
-      assert(InVectors.empty() && "Unexpected input vectors for buildvector.");
-      InVectors.assign(1, Constant::getNullValue(FixedVectorType::get(
-                              VL.front()->getType(), VL.size())));
-    }
   }
   /// Finalize emission of the shuffles.
-  InstructionCost finalize(ArrayRef<int> ExtMask) {
+  InstructionCost finalize() {
     IsFinalized = true;
-    ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
-    if (CommonMask.empty())
-      return Cost;
-    int Limit = CommonMask.size() * 2;
-    if (all_of(CommonMask, [=](int Idx) { return Idx < Limit; }) &&
-        ShuffleVectorInst::isIdentityMask(CommonMask))
-      return Cost;
-    return Cost +
-           TTI.getShuffleCost(InVectors.size() == 2 ? TTI::SK_PermuteTwoSrc
-                                                    : TTI::SK_PermuteSingleSrc,
-                              FixedVectorType::get(
-                                  cast<VectorType>(InVectors.front()->getType())
-                                      ->getElementType(),
-                                  CommonMask.size()),
-                              CommonMask);
+    return Cost;
   }
 
   ~ShuffleCostEstimator() {
-    assert((IsFinalized || CommonMask.empty()) &&
-           "Shuffle construction must be finalized.");
+      assert(IsFinalized && "Shuffle construction must be finalized.");
   }
 };
 
@@ -7151,30 +7109,35 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
         if (Mask[I] != UndefMaskElem)
           GatheredScalars[I] = PoisonValue::get(ScalarTy);
       }
-      LLVM_DEBUG(
-          int Limit = Mask.size() * 2;
-          if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
-              all_of(Mask, [=](int Idx) { return Idx < Limit; }) &&
-              ShuffleVectorInst::isIdentityMask(Mask)) {
-            // Perfect match in the graph, will reuse the previously
-            // vectorized node. Cost is 0.
-            dbgs() << "SLP: perfect diamond match for gather bundle "
-                      "that starts with "
-                   << *VL.front() << ".\n";
-          } else {
-            dbgs() << "SLP: shuffled " << Entries.size()
-                   << " entries for bundle that starts with " << *VL.front()
-                   << ".\n";
-          });
-      if (Entries.size() == 1)
-        Estimator.add(Entries.front(), Mask);
-      else
-        Estimator.add(Entries.front(), Entries.back(), Mask);
+      InstructionCost GatherCost = 0;
+      int Limit = Mask.size() * 2;
+      if (all_of(Mask, [=](int Idx) { return Idx < Limit; }) &&
+          ShuffleVectorInst::isIdentityMask(Mask)) {
+        // Perfect match in the graph, will reuse the previously vectorized
+        // node. Cost is 0.
+        LLVM_DEBUG(
+            dbgs()
+            << "SLP: perfect diamond match for gather bundle that starts with "
+            << *VL.front() << ".\n");
+        if (NeedToShuffleReuses)
+          GatherCost =
+              TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                  FinalVecTy, E->ReuseShuffleIndices);
+      } else {
+        LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
+                          << " entries for bundle that starts with "
+                          << *VL.front() << ".\n");
+        // Detected that instead of gather we can emit a shuffle of single/two
+        // previously vectorized nodes. Add the cost of the permutation rather
+        // than gather.
+        ::addMask(Mask, E->ReuseShuffleIndices);
+        GatherCost = TTI->getShuffleCost(*GatherShuffle, FinalVecTy, Mask);
+      }
       Estimator.gather(
           GatheredScalars,
           Constant::getNullValue(FixedVectorType::get(
               GatheredScalars.front()->getType(), GatheredScalars.size())));
-      return Estimator.finalize(E->ReuseShuffleIndices);
+      return GatherCost + Estimator.finalize();
     }
     if (ExtractShuffle && all_of(GatheredScalars, PoisonValue::classof)) {
       // Check that gather of extractelements can be represented as just a
@@ -7184,15 +7147,17 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
       // single input vector or of 2 input vectors.
       InstructionCost Cost =
           computeExtractCost(VL, VecTy, *ExtractShuffle, ExtractMask, *TTI);
-      return Cost + Estimator.finalize(E->ReuseShuffleIndices);
-    }
-    Estimator.gather(
-        GatheredScalars,
-        (ExtractShuffle || GatherShuffle)
-            ? Constant::getNullValue(FixedVectorType::get(
-                  GatheredScalars.front()->getType(), GatheredScalars.size()))
-            : nullptr);
-    return Estimator.finalize(E->ReuseShuffleIndices);
+      if (NeedToShuffleReuses)
+        Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                    FinalVecTy, E->ReuseShuffleIndices);
+      return Cost + Estimator.finalize();
+    }
+    InstructionCost ReuseShuffleCost = 0;
+    if (NeedToShuffleReuses)
+      ReuseShuffleCost = TTI->getShuffleCost(
+          TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);
+    Estimator.gather(GatheredScalars);
+    return ReuseShuffleCost + Estimator.finalize();
   }
   InstructionCost CommonCost = 0;
   SmallVector<int> Mask;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
index f26d14c44db87..c5b6ac647aa32 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -93,36 +93,20 @@ define i1 @logical_or_fcmp(<4 x float> %x) {
 }
 
 define i1 @logical_and_icmp_
diff _preds(<4 x i32> %x) {
-; SSE-LABEL: @logical_and_icmp_
diff _preds(
-; SSE-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; SSE-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; SSE-NEXT:    [[C0:%.*]] = icmp ult i32 [[X0]], 0
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> <i32 poison, i32 0>, <2 x i32> <i32 0, i32 3>
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 poison>, <2 x i32> <i32 2, i32 1>
-; SSE-NEXT:    [[TMP4:%.*]] = icmp slt <2 x i32> [[TMP2]], [[TMP3]]
-; SSE-NEXT:    [[C3:%.*]] = icmp slt i32 [[X3]], 0
-; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
-; SSE-NEXT:    [[S1:%.*]] = select i1 [[C0]], i1 [[TMP5]], i1 false
-; SSE-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
-; SSE-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[TMP6]], i1 false
-; SSE-NEXT:    [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
-; SSE-NEXT:    ret i1 [[S3]]
-;
-; AVX-LABEL: @logical_and_icmp_
diff _preds(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 3, i32 1, i32 7>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
-; AVX-NEXT:    [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
-; AVX-NEXT:    [[S1:%.*]] = select i1 [[TMP6]], i1 [[TMP7]], i1 false
-; AVX-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
-; AVX-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[TMP8]], i1 false
-; AVX-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
-; AVX-NEXT:    [[S3:%.*]] = select i1 [[S2]], i1 [[TMP9]], i1 false
-; AVX-NEXT:    ret i1 [[S3]]
+; CHECK-LABEL: @logical_and_icmp_
diff _preds(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 3, i32 1, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
+; CHECK-NEXT:    [[S1:%.*]] = select i1 [[TMP6]], i1 [[TMP7]], i1 false
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
+; CHECK-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[TMP8]], i1 false
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
+; CHECK-NEXT:    [[S3:%.*]] = select i1 [[S2]], i1 [[TMP9]], i1 false
+; CHECK-NEXT:    ret i1 [[S3]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1