[llvm] e4b9640 - [SLP]Improve bottom-to-top reordering.

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 28 06:52:07 PST 2022


Author: Alexey Bataev
Date: 2022-02-28T06:48:46-08:00
New Revision: e4b9640867150723b33f81c6479682fc955b55aa

URL: https://github.com/llvm/llvm-project/commit/e4b9640867150723b33f81c6479682fc955b55aa
DIFF: https://github.com/llvm/llvm-project/commit/e4b9640867150723b33f81c6479682fc955b55aa.diff

LOG: [SLP]Improve bottom-to-top reordering.

Currently the bottom-to-top reordering analysis counts the orders of
the operands and then adds natural-order counts for the operand users.
This is very conservative, since the user nodes themselves may require
reordering. This patch improves the bottom-to-top analysis by checking
whether the user nodes require or allow reordering. If a user node
must be reordered, has reused scalars, is an alternate-op
vectorization node, is a non-ordered gather node, or may allow
reordering because of its reordered operands, it is considered a node
that allows reordering and is not counted as a node with the natural
order.

Differential Revision: https://reviews.llvm.org/D120492
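
For illustration, here is a minimal standalone sketch of the "allows
reordering" predicate described above. This is not the LLVM code: the
struct models only the fields the check reads, the names merely mirror
SLPVectorizer.cpp, and the root-node special case for IgnoreReorder is
omitted.

    #include <optional>
    #include <vector>

    struct Node {
      enum class State { Vectorize, NeedToGather } St = State::Vectorize;
      std::vector<unsigned> ReorderIndices;  // non-empty => must be reordered
      std::vector<int> ReuseShuffleIndices;  // non-empty => reused scalars
      bool AltShuffle = false;               // alternate-op vectorization node
      // For gather nodes: the order recorded for the node, if any.
      std::optional<std::vector<unsigned>> GatherOrder;
    };

    // A node allows reordering if it must be reordered anyway, reuses
    // scalars, is an alternate-op node, or is a gather node whose order is
    // either not recorded or non-identity.
    static bool allowsReordering(const Node &N) {
      if (!N.ReorderIndices.empty() || !N.ReuseShuffleIndices.empty() ||
          (N.St == Node::State::Vectorize && N.AltShuffle))
        return true;
      if (N.St == Node::State::NeedToGather)
        return N.GatherOrder ? !N.GatherOrder->empty() : true;
      return false;
    }

    int main() {
      Node G;                              // gather node, no recorded order
      G.St = Node::State::NeedToGather;
      return allowsReordering(G) ? 0 : 1;  // exits 0: reordering is allowed
    }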

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
    llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 387d12be3e016..1eab725ac64e6 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1904,6 +1904,41 @@ class BoUpSLP {
   ~BoUpSLP();
 
 private:
+  /// Check if the operands on the edges \p Edges of \p UserTE allow
+  /// reordering (i.e. the operands can be reordered because they have only one
+  /// user and are reorderable).
+  /// \param ReorderableGathers List of all gather nodes that require
+  /// reordering (e.g., gathers of extractelements or partially vectorizable loads).
+  /// \param GatherOps List of gather operand nodes for \p UserTE that require
+  /// reordering, subset of \p ReorderableGathers.
+  bool
+  canReorderOperands(TreeEntry *UserTE,
+                     SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
+                     ArrayRef<TreeEntry *> ReorderableGathers,
+                     SmallVectorImpl<TreeEntry *> &GatherOps);
+
+  /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
+  /// if any. If it is not vectorized (gather node), returns nullptr.
+  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
+    ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
+    TreeEntry *TE = nullptr;
+    const auto *It = find_if(VL, [this, &TE](Value *V) {
+      TE = getTreeEntry(V);
+      return TE;
+    });
+    if (It != VL.end() && TE->isSame(VL))
+      return TE;
+    return nullptr;
+  }
+
+  /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
+  /// if any. If it is not vectorized (gather node), returns nullptr.
+  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
+                                        unsigned OpIdx) const {
+    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
+        const_cast<TreeEntry *>(UserTE), OpIdx);
+  }
+
   /// Checks if all users of \p I are the part of the vectorization tree.
   bool areAllUsersVectorized(Instruction *I,
                              ArrayRef<Value *> VectorizedVals) const;
@@ -3383,6 +3418,44 @@ void BoUpSLP::reorderTopToBottom() {
   }
 }
 
+bool BoUpSLP::canReorderOperands(
+    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
+    ArrayRef<TreeEntry *> ReorderableGathers,
+    SmallVectorImpl<TreeEntry *> &GatherOps) {
+  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
+    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
+          return OpData.first == I &&
+                 OpData.second->State == TreeEntry::Vectorize;
+        }))
+      continue;
+    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
+      // Do not reorder if the operand node is also used by other user nodes.
+      if (any_of(TE->UserTreeIndices,
+                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
+        return false;
+      // Add the node to the list of the ordered nodes with the identity
+      // order.
+      Edges.emplace_back(I, TE);
+      continue;
+    }
+    ArrayRef<Value *> VL = UserTE->getOperand(I);
+    TreeEntry *Gather = nullptr;
+    if (count_if(ReorderableGathers, [VL, &Gather](TreeEntry *TE) {
+          assert(TE->State != TreeEntry::Vectorize &&
+                 "Only non-vectorized nodes are expected.");
+          if (TE->isSame(VL)) {
+            Gather = TE;
+            return true;
+          }
+          return false;
+        }) > 1)
+      return false;
+    if (Gather)
+      GatherOps.push_back(Gather);
+  }
+  return true;
+}
+
 void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
   SetVector<TreeEntry *> OrderedEntries;
   DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
@@ -3403,42 +3476,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
     }
   });
 
-  // Checks if the operands of the users are reordarable and have only single
-  // use.
-  auto &&CheckOperands =
-      [this, &NonVectorized](const auto &Data,
-                             SmallVectorImpl<TreeEntry *> &GatherOps) {
-        for (unsigned I = 0, E = Data.first->getNumOperands(); I < E; ++I) {
-          if (any_of(Data.second,
-                     [I](const std::pair<unsigned, TreeEntry *> &OpData) {
-                       return OpData.first == I &&
-                              OpData.second->State == TreeEntry::Vectorize;
-                     }))
-            continue;
-          ArrayRef<Value *> VL = Data.first->getOperand(I);
-          const TreeEntry *TE = nullptr;
-          const auto *It = find_if(VL, [this, &TE](Value *V) {
-            TE = getTreeEntry(V);
-            return TE;
-          });
-          if (It != VL.end() && TE->isSame(VL))
-            return false;
-          TreeEntry *Gather = nullptr;
-          if (count_if(NonVectorized, [VL, &Gather](TreeEntry *TE) {
-                assert(TE->State != TreeEntry::Vectorize &&
-                       "Only non-vectorized nodes are expected.");
-                if (TE->isSame(VL)) {
-                  Gather = TE;
-                  return true;
-                }
-                return false;
-              }) > 1)
-            return false;
-          if (Gather)
-            GatherOps.push_back(Gather);
-        }
-        return true;
-      };
   // 1. Propagate order to the graph nodes, which use only reordered nodes.
   // I.e., if the node has operands, that are reordered, try to make at least
   // one operand order in the natural order and reorder others + reorder the
@@ -3475,10 +3512,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
     // Erase filtered entries.
     for_each(Filtered,
              [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); });
-    for (const auto &Data : Users) {
+    for (auto &Data : Users) {
       // Check that operands are used only in the User node.
       SmallVector<TreeEntry *> GatherOps;
-      if (!CheckOperands(Data, GatherOps)) {
+      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
+                              GatherOps)) {
         for_each(Data.second,
                  [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
                    OrderedEntries.remove(Op.second);
@@ -3494,6 +3532,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       // the same node my be considered several times, though might be not
       // profitable.
       SmallPtrSet<const TreeEntry *, 4> VisitedOps;
+      SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
       for (const auto &Op : Data.second) {
         TreeEntry *OpTE = Op.second;
         if (!VisitedOps.insert(OpTE).second)
@@ -3506,6 +3545,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
             return GathersToOrders.find(OpTE)->second;
           return OpTE->ReorderIndices;
         }();
+        unsigned NumOps = count_if(
+            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
+              return P.second == OpTE;
+            });
         // Stores actually store the mask, not the order, need to invert.
         if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
             OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -3517,14 +3560,52 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
             return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
           });
           fixupOrderingIndices(CurrentOrder);
-          ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
+          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
+              NumOps;
         } else {
-          ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
+          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
+        }
+        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
+        const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
+                                            const TreeEntry *TE) {
+          if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
+              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
+              (IgnoreReorder && TE->Idx == 0))
+            return true;
+          if (TE->State == TreeEntry::NeedToGather) {
+            auto It = GathersToOrders.find(TE);
+            if (It != GathersToOrders.end())
+              return !It->second.empty();
+            return true;
+          }
+          return false;
+        };
+        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
+          TreeEntry *UserTE = EI.UserTE;
+          if (!VisitedUsers.insert(UserTE).second)
+            continue;
+          // The user node may be reordered if it requires reordering, has
+          // reused scalars, is an alternate-op vectorization node, or its
+          // operand nodes require reordering.
+          if (AllowsReordering(UserTE))
+            continue;
+          // Check if the users allow reordering.
+          // Currently inspect just 1 level of operands to avoid increasing
+          // compile time.
+          // Reordering is profitable only if strictly more than half of the
+          // operands allow it; otherwise vote for the natural order.
+          ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
+          if (static_cast<unsigned>(count_if(
+                  Ops, [UserTE, &AllowsReordering](
+                           const std::pair<unsigned, TreeEntry *> &Op) {
+                    return AllowsReordering(Op.second) &&
+                           all_of(Op.second->UserTreeIndices,
+                                  [UserTE](const EdgeInfo &EI) {
+                                    return EI.UserTE == UserTE;
+                                  });
+                  })) <= Ops.size() / 2)
+            ++Res.first->second;
         }
-        OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
-            OpTE->UserTreeIndices.size();
-        assert(OrdersUses[{}] > 0 && "Counter cannot be less than 0.");
-        --OrdersUses[{}];
       }
       // If no orders - skip current nodes and jump to the next one, if any.
       if (OrdersUses.empty()) {
@@ -3565,7 +3646,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
         OrderedEntries.remove(TE);
         if (!VisitedOps.insert(TE).second)
           continue;
-        if (!TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) {
+        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
           // Just reorder reuses indices.
           reorderReuses(TE->ReuseShuffleIndices, Mask);
           continue;

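The counting change above replaces the old blanket natural-order bump
(one vote per user edge) with a weighted vote. Below is a minimal
sketch of the new scheme, assuming a single user node with two operand
edges; the variable names are invented for illustration and do not
come from the patch.

    #include <map>
    #include <vector>

    using OrdersType = std::vector<unsigned>;  // empty = natural order

    int main() {
      std::map<OrdersType, unsigned> OrdersUses;

      // An operand order referenced by NumOps edges of the user node now
      // votes with weight NumOps instead of 1.
      OrdersType Order{1, 0};
      unsigned NumOps = 2;
      OrdersUses[Order] += NumOps;

      // The natural order gains one vote per user node, and only when that
      // user neither allows reordering itself nor has more than half of
      // its operands reorderable and single-use.
      bool UserAllowsReordering = false;
      unsigned ReorderableSingleUseOps = 0, TotalOps = 2;
      if (!UserAllowsReordering && ReorderableSingleUseOps <= TotalOps / 2)
        ++OrdersUses[OrdersType()];

      // {1, 0} wins 2:1, so the subtree is reordered and the extra
      // shuffles in the tests below disappear.
      return OrdersUses[Order] > OrdersUses[OrdersType()] ? 0 : 1;
    }
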
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
index 2ac4b7d83c41a..cd8ae3540f7da 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
@@ -18,19 +18,16 @@ define void @test(i32* %0, i32* %1, i32* %2) {
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 3
 ; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i32>, <4 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[TMP18]], <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP21]], align 4
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP22]], <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP23:%.*]] = sub <4 x i32> <i32 0, i32 0, i32 undef, i32 0>, [[TMP20]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
-; CHECK-NEXT:    [[TMP24:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
-; CHECK-NEXT:    [[TMP25:%.*]] = add <4 x i32> [[TMP24]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP26:%.*]] = add <4 x i32> [[TMP25]], <i32 1, i32 0, i32 0, i32 0>
-; CHECK-NEXT:    [[TMP27:%.*]] = sub <4 x i32> [[TMP25]], <i32 1, i32 0, i32 0, i32 0>
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> [[TMP27]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP24:%.*]] = sub <4 x i32> [[TMP23]], [[TMP22]]
+; CHECK-NEXT:    [[TMP25:%.*]] = add <4 x i32> [[TMP24]], [[TMP18]]
+; CHECK-NEXT:    [[TMP26:%.*]] = add <4 x i32> [[TMP25]], <i32 0, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP27:%.*]] = sub <4 x i32> [[TMP25]], <i32 0, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> [[TMP27]], <4 x i32> <i32 2, i32 0, i32 1, i32 7>
 ; CHECK-NEXT:    [[TMP29:%.*]] = add <4 x i32> [[TMP28]], zeroinitializer
 ; CHECK-NEXT:    [[TMP30:%.*]] = sub <4 x i32> [[TMP28]], zeroinitializer
 ; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <4 x i32> [[TMP29]], <4 x i32> [[TMP30]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll b/llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll
index 810e166c40c07..9bbe2d5d673c7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll
@@ -7,10 +7,9 @@ define i32 @test(i32* %isec) {
 ; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[ISEC:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX10]] to <2 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 8
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP4]], i32 1
 ; CHECK-NEXT:    br i1 false, label [[BLOCK1:%.*]], label [[BLOCK3:%.*]]
 ; CHECK:       block1:
@@ -18,10 +17,10 @@ define i32 @test(i32* %isec) {
 ; CHECK:       block2:
 ; CHECK-NEXT:    br label [[BLOCK3]]
 ; CHECK:       block3:
-; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x i32> [ [[SHUFFLE]], [[BLOCK1]] ], [ [[SHUFFLE]], [[BLOCK2]] ], [ [[TMP5]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x i32> [ [[TMP1]], [[BLOCK1]] ], [ [[TMP1]], [[BLOCK2]] ], [ [[TMP5]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i32 [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    ret i32 [[TMP9]]
 ;
 entry:


        

