[llvm] cd341f3 - [SLP]Add final resize to ShuffleCostEstimator::finalize member function and basic add member functions.

Tue Apr 18 05:55:08 PDT 2023

Author: Alexey Bataev
Date: 2023-04-18T05:51:23-07:00
New Revision: cd341f3f4878137d1c9e7a05c4c3a7bd8ff216dc

URL: https://github.com/llvm/llvm-project/commit/cd341f3f4878137d1c9e7a05c4c3a7bd8ff216dc
DIFF: https://github.com/llvm/llvm-project/commit/cd341f3f4878137d1c9e7a05c4c3a7bd8ff216dc.diff

LOG: [SLP]Add final resize to ShuffleCostEstimator::finalize member function and basic add member functions.

Implemented the reshuffling in finalize member function + add basic
support for add member functions, used during vector build.

Part of D110978

Differential Revision: https://reviews.llvm.org/D148279

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a1fabec2a8d7c..163e3581ea7be 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -946,9 +946,14 @@ static bool isSimple(Instruction *I) {
 }
 
 /// Shuffles \p Mask in accordance with the given \p SubMask.
-static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
+/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
+/// one but two input vectors.
+static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
+                    bool ExtendingManyInputs = false) {
   if (SubMask.empty())
     return;
+  assert((!ExtendingManyInputs || SubMask.size() > Mask.size()) &&
+         "SubMask with many inputs support must be larger than the mask.");
   if (Mask.empty()) {
     Mask.append(SubMask.begin(), SubMask.end());
     return;
@@ -956,8 +961,9 @@ static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
   SmallVector<int> NewMask(SubMask.size(), UndefMaskElem);
   int TermValue = std::min(Mask.size(), SubMask.size());
   for (int I = 0, E = SubMask.size(); I < E; ++I) {
-    if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
-        Mask[SubMask[I]] >= TermValue)
+    if ((!ExtendingManyInputs &&
+         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)) ||
+        SubMask[I] == UndefMaskElem)
       continue;
     NewMask[I] = Mask[SubMask[I]];
   }
@@ -6788,6 +6794,8 @@ class BaseShuffleAnalysis {
 /// analysis/transformations.
 class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
   bool IsFinalized = false;
+  SmallVector<int> CommonMask;
+  SmallVector<Value *, 2> InVectors;
   const TargetTransformInfo &TTI;
   InstructionCost Cost = 0;
   ArrayRef<Value *> VectorizedVals;
@@ -7009,19 +7017,53 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
                                    VecTy, std::nullopt, CostKind, 0, EEVTy);
       }
     }
+    InVectors.assign(1, VecBase);
     return VecBase;
   }
+  void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef<int> Mask) {
+    CommonMask.assign(Mask.begin(), Mask.end());
+    InVectors.assign(
+        2, Constant::getNullValue(FixedVectorType::get(
+               E1->Scalars.front()->getType(),
+               std::max(E1->getVectorFactor(), E2->getVectorFactor()))));
+  }
+  void add(const TreeEntry *E1, ArrayRef<int> Mask) {
+    CommonMask.assign(Mask.begin(), Mask.end());
+    InVectors.assign(
+        1, Constant::getNullValue(FixedVectorType::get(
+               E1->Scalars.front()->getType(), E1->getVectorFactor())));
+  }
   void gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
     Cost += getBuildVectorCost(VL, Root);
+    if (!Root) {
+      assert(InVectors.empty() && "Unexpected input vectors for buildvector.");
+      InVectors.assign(1, Constant::getNullValue(FixedVectorType::get(
+                              VL.front()->getType(), VL.size())));
+    }
   }
   /// Finalize emission of the shuffles.
-  InstructionCost finalize() {
+  InstructionCost finalize(ArrayRef<int> ExtMask) {
     IsFinalized = true;
-    return Cost;
+    ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
+    if (CommonMask.empty())
+      return Cost;
+    int Limit = CommonMask.size() * 2;
+    if (all_of(CommonMask, [=](int Idx) { return Idx < Limit; }) &&
+        ShuffleVectorInst::isIdentityMask(CommonMask))
+      return Cost;
+    return Cost +
+           TTI.getShuffleCost(InVectors.size() == 2 ? TTI::SK_PermuteTwoSrc
+                                                    : TTI::SK_PermuteSingleSrc,
+                              FixedVectorType::get(
+                                  cast<VectorType>(InVectors.front()->getType())
+                                      ->getElementType(),
+                                  CommonMask.size()),
+                              CommonMask);
   }
 
   ~ShuffleCostEstimator() {
-      assert(IsFinalized && "Shuffle construction must be finalized.");
+    assert((IsFinalized || CommonMask.empty()) &&
+           "Shuffle construction must be finalized.");
   }
 };
 
@@ -7109,35 +7151,30 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
         if (Mask[I] != UndefMaskElem)
           GatheredScalars[I] = PoisonValue::get(ScalarTy);
       }
-      InstructionCost GatherCost = 0;
-      int Limit = Mask.size() * 2;
-      if (all_of(Mask, [=](int Idx) { return Idx < Limit; }) &&
-          ShuffleVectorInst::isIdentityMask(Mask)) {
-        // Perfect match in the graph, will reuse the previously vectorized
-        // node. Cost is 0.
-        LLVM_DEBUG(
-            dbgs()
-            << "SLP: perfect diamond match for gather bundle that starts with "
-            << *VL.front() << ".\n");
-        if (NeedToShuffleReuses)
-          GatherCost =
-              TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
-                                  FinalVecTy, E->ReuseShuffleIndices);
-      } else {
-        LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
-                          << " entries for bundle that starts with "
-                          << *VL.front() << ".\n");
-        // Detected that instead of gather we can emit a shuffle of single/two
-        // previously vectorized nodes. Add the cost of the permutation rather
-        // than gather.
-        ::addMask(Mask, E->ReuseShuffleIndices);
-        GatherCost = TTI->getShuffleCost(*GatherShuffle, FinalVecTy, Mask);
-      }
+      LLVM_DEBUG(
+          int Limit = Mask.size() * 2;
+          if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
+              all_of(Mask, [=](int Idx) { return Idx < Limit; }) &&
+              ShuffleVectorInst::isIdentityMask(Mask)) {
+            // Perfect match in the graph, will reuse the previously
+            // vectorized node. Cost is 0.
+            dbgs() << "SLP: perfect diamond match for gather bundle "
+                      "that starts with "
+                   << *VL.front() << ".\n";
+          } else {
+            dbgs() << "SLP: shuffled " << Entries.size()
+                   << " entries for bundle that starts with " << *VL.front()
+                   << ".\n";
+          });
+      if (Entries.size() == 1)
+        Estimator.add(Entries.front(), Mask);
+      else
+        Estimator.add(Entries.front(), Entries.back(), Mask);
       Estimator.gather(
           GatheredScalars,
           Constant::getNullValue(FixedVectorType::get(
               GatheredScalars.front()->getType(), GatheredScalars.size())));
-      return GatherCost + Estimator.finalize();
+      return Estimator.finalize(E->ReuseShuffleIndices);
     }
     if (ExtractShuffle && all_of(GatheredScalars, PoisonValue::classof)) {
       // Check that gather of extractelements can be represented as just a
@@ -7147,17 +7184,15 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
       // single input vector or of 2 input vectors.
       InstructionCost Cost =
           computeExtractCost(VL, VecTy, *ExtractShuffle, ExtractMask, *TTI);
-      if (NeedToShuffleReuses)
-        Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
-                                    FinalVecTy, E->ReuseShuffleIndices);
-      return Cost + Estimator.finalize();
-    }
-    InstructionCost ReuseShuffleCost = 0;
-    if (NeedToShuffleReuses)
-      ReuseShuffleCost = TTI->getShuffleCost(
-          TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);
-    Estimator.gather(GatheredScalars);
-    return ReuseShuffleCost + Estimator.finalize();
+      return Cost + Estimator.finalize(E->ReuseShuffleIndices);
+    }
+    Estimator.gather(
+        GatheredScalars,
+        (ExtractShuffle || GatherShuffle)
+            ? Constant::getNullValue(FixedVectorType::get(
+                  GatheredScalars.front()->getType(), GatheredScalars.size()))
+            : nullptr);
+    return Estimator.finalize(E->ReuseShuffleIndices);
   }
   InstructionCost CommonCost = 0;
   SmallVector<int> Mask;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
index c5b6ac647aa32..f26d14c44db87 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -93,20 +93,36 @@ define i1 @logical_or_fcmp(<4 x float> %x) {
 }
 
 define i1 @logical_and_icmp_
diff _preds(<4 x i32> %x) {
-; CHECK-LABEL: @logical_and_icmp_
diff _preds(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 3, i32 1, i32 7>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
-; CHECK-NEXT:    [[S1:%.*]] = select i1 [[TMP6]], i1 [[TMP7]], i1 false
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
-; CHECK-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[TMP8]], i1 false
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
-; CHECK-NEXT:    [[S3:%.*]] = select i1 [[S2]], i1 [[TMP9]], i1 false
-; CHECK-NEXT:    ret i1 [[S3]]
+; SSE-LABEL: @logical_and_icmp_
diff _preds(
+; SSE-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
+; SSE-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; SSE-NEXT:    [[C0:%.*]] = icmp ult i32 [[X0]], 0
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> <i32 poison, i32 0>, <2 x i32> <i32 0, i32 3>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 poison>, <2 x i32> <i32 2, i32 1>
+; SSE-NEXT:    [[TMP4:%.*]] = icmp slt <2 x i32> [[TMP2]], [[TMP3]]
+; SSE-NEXT:    [[C3:%.*]] = icmp slt i32 [[X3]], 0
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; SSE-NEXT:    [[S1:%.*]] = select i1 [[C0]], i1 [[TMP5]], i1 false
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; SSE-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[TMP6]], i1 false
+; SSE-NEXT:    [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
+; SSE-NEXT:    ret i1 [[S3]]
+;
+; AVX-LABEL: @logical_and_icmp_
diff _preds(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 3, i32 1, i32 7>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
+; AVX-NEXT:    [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
+; AVX-NEXT:    [[S1:%.*]] = select i1 [[TMP6]], i1 [[TMP7]], i1 false
+; AVX-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
+; AVX-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[TMP8]], i1 false
+; AVX-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
+; AVX-NEXT:    [[S3:%.*]] = select i1 [[S2]], i1 [[TMP9]], i1 false
+; AVX-NEXT:    ret i1 [[S3]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1