<div dir="ltr">Following up here:<div><br></div><div>After discussing with Alexey on IRC I've temporarily reverted this. Bogdan was seeing infinite loops in compilation and is going to follow up with a backtrace and a test case later if the backtrace isn't enough.</div><div><br></div><div>Reverted thusly:</div><div>echristo@athyra ~/s/llvm-project (master)> git push<br>To github.com:llvm/llvm-project.git<br>   b168bbfae42..ecfd8161bf4  master -> master<br></div><div><br></div><div>Thanks a ton Alexey, we'll get back to you asap.</div><div><br></div><div>-eric</div><div><br></div></div><br><div class="gmail_quote"><div dir="ltr" class="gmail_attr">On Fri, Sep 18, 2020 at 9:38 AM Alexey Bataev via llvm-commits <<a href="mailto:llvm-commits@lists.llvm.org">llvm-commits@lists.llvm.org</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex"><br>

Author: Alexey Bataev<br>

Date: 2020-09-18T09:34:59-04:00<br>

New Revision: 455ca0ebb69210046928fedffe292420a30f89ad<br>

<br>

URL: <a href="https://github.com/llvm/llvm-project/commit/455ca0ebb69210046928fedffe292420a30f89ad" rel="noreferrer" target="_blank">https://github.com/llvm/llvm-project/commit/455ca0ebb69210046928fedffe292420a30f89ad</a><br>

DIFF: <a href="https://github.com/llvm/llvm-project/commit/455ca0ebb69210046928fedffe292420a30f89ad.diff" rel="noreferrer" target="_blank">https://github.com/llvm/llvm-project/commit/455ca0ebb69210046928fedffe292420a30f89ad.diff</a><br>

<br>

LOG: [SLP] Allow reordering of vectorization trees with reused instructions.<br>

<br>

If some leaves have the same instructions to be vectorized, we may<br>

incorrectly evaluate the best order for the root node (it is built for the<br>

vector of instructions without repeated instructions and, thus, has fewer<br>

elements than the root node). In this case we simply cannot reorder<br>

the tree, and we may also calculate the wrong number of nodes that require the<br>

same reordering.<br>

For example, if the root node is \<a+b, a+c, a+d, f+e\>, then the leaves<br>

are \<a, a, a, f\> and \<b, c, d, e\>. When we try to vectorize the first<br>

leaf, it will be shrunk to \<a, f\>. If instructions in this leaf should<br>

be reordered, the best order will be \<1, 0\>. We need to extend this<br>

order for the root node. For the root node this order should look like<br>

\<3, 0, 1, 2\>. This patch allows extension of the orders of the nodes<br>

with the reused instructions.<br>

<br>

Reviewed By: RKSimon<br>

<br>

Differential Revision: <a href="https://reviews.llvm.org/D45263" rel="noreferrer" target="_blank">https://reviews.llvm.org/D45263</a><br>

<br>

Added: <br>

<br>

<br>

Modified: <br>

    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp<br>

    llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll<br>

    llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll<br>

    llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll<br>

<br>

Removed: <br>

<br>

<br>

<br>

################################################################################<br>

diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp<br>

index c487301177c1..e4cad01e958a 100644<br>

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp<br>

+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp<br>

@@ -523,6 +523,15 @@ static bool isSimple(Instruction *I) {<br>

<br>

 namespace llvm {<br>

<br>

+static void inversePermutation(ArrayRef<unsigned> Indices,<br>

+                               SmallVectorImpl<int> &Mask) {<br>

+  Mask.clear();<br>

+  const unsigned E = Indices.size();<br>

+  Mask.resize(E, E + 1);<br>

+  for (unsigned I = 0; I < E; ++I)<br>

+    Mask[Indices[I]] = I;<br>

+}<br>

+<br>

 namespace slpvectorizer {<br>

<br>

 /// Bottom Up SLP Vectorizer.<br>

@@ -537,6 +546,7 @@ class BoUpSLP {<br>

   using StoreList = SmallVector<StoreInst *, 8>;<br>

   using ExtraValueToDebugLocsMap =<br>

       MapVector<Value *, SmallVector<Instruction *, 2>>;<br>

+  using OrdersType = SmallVector<unsigned, 4>;<br>

<br>

   BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,<br>

           TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,<br>

@@ -614,6 +624,14 @@ class BoUpSLP {<br>

<br>

   /// \returns The best order of instructions for vectorization.<br>

   Optional<ArrayRef<unsigned>> bestOrder() const {<br>

+    assert(llvm::all_of(<br>

+               NumOpsWantToKeepOrder,<br>

+               [this](const decltype(NumOpsWantToKeepOrder)::value_type &D) {<br>

+                 return D.getFirst().size() ==<br>

+                        VectorizableTree[0]->Scalars.size();<br>

+               }) &&<br>

+           "All orders must have the same size as number of instructions in "<br>

+           "tree node.");<br>

     auto I = std::max_element(<br>

         NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),<br>

         [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,<br>

@@ -627,6 +645,79 @@ class BoUpSLP {<br>

     return makeArrayRef(I->getFirst());<br>

   }<br>

<br>

+  /// Builds the correct order for root instructions.<br>

+  /// If some leaves have the same instructions to be vectorized, we may<br>

+  /// incorrectly evaluate the best order for the root node (it is built for the<br>

+  /// vector of instructions without repeated instructions and, thus, has less<br>

+  /// elements than the root node). This function builds the correct order for<br>

+  /// the root node.<br>

+  /// For example, if the root node is \<a+b, a+c, a+d, f+e\>, then the leaves<br>

+  /// are \<a, a, a, f\> and \<b, c, d, e\>. When we try to vectorize the first<br>

+  /// leaf, it will be shrink to \<a, b\>. If instructions in this leaf should<br>

+  /// be reordered, the best order will be \<1, 0\>. We need to extend this<br>

+  /// order for the root node. For the root node this order should look like<br>

+  /// \<3, 0, 1, 2\>. This function extends the order for the reused<br>

+  /// instructions.<br>

+  void findRootOrder(OrdersType &Order) {<br>

+    // If the leaf has the same number of instructions to vectorize as the root<br>

+    // - order must be set already.<br>

+    unsigned RootSize = VectorizableTree[0]->Scalars.size();<br>

+    if (Order.size() == RootSize)<br>

+      return;<br>

+    SmallVector<unsigned, 4> RealOrder(Order.size());<br>

+    std::swap(Order, RealOrder);<br>

+    SmallVector<int, 4> Mask;<br>

+    inversePermutation(RealOrder, Mask);<br>

+    for (int I = 0, E = Mask.size(); I < E; ++I)<br>

+      Order[I] = Mask[I];<br>

+    // The leaf has less number of instructions - need to find the true order of<br>

+    // the root.<br>

+    // Scan the nodes starting from the leaf back to the root.<br>

+    const TreeEntry *PNode = VectorizableTree.back().get();<br>

+    while (PNode) {<br>

+      const TreeEntry &Node = *PNode;<br>

+      PNode = Node.UserTreeIndices.back().UserTE;<br>

+      if (Node.ReuseShuffleIndices.empty())<br>

+        continue;<br>

+      // Build the order for the parent node.<br>

+      OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize);<br>

+      SmallVector<unsigned, 4> OrderCounter(Order.size(), 0);<br>

+      // The algorithm of the order extension is:<br>

+      // 1. Calculate the number of the same instructions for the order.<br>

+      // 2. Calculate the index of the new order: total number of instructions<br>

+      // with order less than the order of the current instruction + reuse<br>

+      // number of the current instruction.<br>

+      // 3. The new order is just the index of the instruction in the original<br>

+      // vector of the instructions.<br>

+      for (unsigned I : Node.ReuseShuffleIndices)<br>

+        ++OrderCounter[Order[I]];<br>

+      SmallVector<unsigned, 4> CurrentCounter(Order.size(), 0);<br>

+      for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E; ++I) {<br>

+        unsigned ReusedIdx = Node.ReuseShuffleIndices[I];<br>

+        unsigned OrderIdx = Order[ReusedIdx];<br>

+        unsigned NewIdx = 0;<br>

+        for (unsigned J = 0; J < OrderIdx; ++J)<br>

+          NewIdx += OrderCounter[J];<br>

+        NewIdx += CurrentCounter[OrderIdx];<br>

+        ++CurrentCounter[OrderIdx];<br>

+        assert(NewOrder[NewIdx] == RootSize &&<br>

+               "The order index should not be written already.");<br>

+        NewOrder[NewIdx] = I;<br>

+      }<br>

+      std::swap(Order, NewOrder);<br>

+      // If the size of the order is the same as number of instructions in the<br>

+      // root node, no need to extend it more.<br>

+      if (Order.size() == RootSize)<br>

+        break;<br>

+    }<br>

+    assert((!PNode || Order.size() == RootSize) &&<br>

+           "Root node is expected or the size of the order must be the same as "<br>

+           "the number of elements in the root node.");<br>

+    assert(llvm::all_of(Order,<br>

+                        [RootSize](unsigned Val) { return Val != RootSize; }) &&<br>

+           "All indices must be initialized");<br>

+  }<br>

+<br>

   /// \return The vector element size in bits to use when vectorizing the<br>

   /// expression tree ending at \p V. If V is a store, the size is the width of<br>

   /// the stored value. Otherwise, the size is the width of the largest loaded<br>

@@ -1467,7 +1558,7 @@ class BoUpSLP {<br>

     SmallVector<int, 4> ReuseShuffleIndices;<br>

<br>

     /// Does this entry require reordering?<br>

-    ArrayRef<unsigned> ReorderIndices;<br>

+    SmallVector<unsigned, 4> ReorderIndices;<br>

<br>

     /// Points back to the VectorizableTree.<br>

     ///<br>

@@ -1660,7 +1751,7 @@ class BoUpSLP {<br>

     Last->State = Vectorized ? TreeEntry::Vectorize : TreeEntry::NeedToGather;<br>

     Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),<br>

                                      ReuseShuffleIndices.end());<br>

-    Last->ReorderIndices = ReorderIndices;<br>

+    Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());<br>

     Last->setOperations(S);<br>

     if (Vectorized) {<br>

       for (int i = 0, e = VL.size(); i != e; ++i) {<br>

@@ -2197,7 +2288,6 @@ class BoUpSLP {<br>

   /// List of users to ignore during scheduling and that don't need extracting.<br>

   ArrayRef<Value *> UserIgnoreList;<br>

<br>

-  using OrdersType = SmallVector<unsigned, 4>;<br>

   /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of<br>

   /// sorted SmallVectors of unsigned.<br>

   struct OrdersTypeDenseMapInfo {<br>

@@ -2659,12 +2749,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,<br>

         });<br>

         // Insert new order with initial value 0, if it does not exist,<br>

         // otherwise return the iterator to the existing one.<br>

-        auto StoredCurrentOrderAndNum =<br>

-            NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;<br>

-        ++StoredCurrentOrderAndNum->getSecond();<br>

         newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,<br>

-                     ReuseShuffleIndicies,<br>

-                     StoredCurrentOrderAndNum->getFirst());<br>

+                     ReuseShuffleIndicies, CurrentOrder);<br>

+        findRootOrder(CurrentOrder);<br>

+        ++NumOpsWantToKeepOrder[CurrentOrder];<br>

         // This is a special case, as it does not gather, but at the same time<br>

         // we are not extending buildTree_rec() towards the operands.<br>

         ValueList Op0;<br>

@@ -2741,13 +2829,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,<br>

             LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");<br>

           } else {<br>

             // Need to reorder.<br>

-            auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;<br>

-            ++I->getSecond();<br>

             TreeEntry *TE =<br>

                 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,<br>

-                             ReuseShuffleIndicies, I->getFirst());<br>

+                             ReuseShuffleIndicies, CurrentOrder);<br>

             TE->setOperandsInOrder();<br>

             LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");<br>

+            findRootOrder(CurrentOrder);<br>

+            ++NumOpsWantToKeepOrder[CurrentOrder];<br>

           }<br>

           return;<br>

         }<br>

@@ -3003,15 +3091,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,<br>

             buildTree_rec(Operands, Depth + 1, {TE, 0});<br>

             LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");<br>

           } else {<br>

-            // Need to reorder.<br>

-            auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;<br>

-            ++(I->getSecond());<br>

             TreeEntry *TE =<br>

                 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,<br>

-                             ReuseShuffleIndicies, I->getFirst());<br>

+                             ReuseShuffleIndicies, CurrentOrder);<br>

             TE->setOperandsInOrder();<br>

             buildTree_rec(Operands, Depth + 1, {TE, 0});<br>

             LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");<br>

+            findRootOrder(CurrentOrder);<br>

+            ++NumOpsWantToKeepOrder[CurrentOrder];<br>

           }<br>

           return;<br>

         }<br>

@@ -4141,15 +4228,6 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {<br>

   return V;<br>

 }<br>

<br>

-static void inversePermutation(ArrayRef<unsigned> Indices,<br>

-                               SmallVectorImpl<int> &Mask) {<br>

-  Mask.clear();<br>

-  const unsigned E = Indices.size();<br>

-  Mask.resize(E);<br>

-  for (unsigned I = 0; I < E; ++I)<br>

-    Mask[Indices[I]] = I;<br>

-}<br>

-<br>

 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {<br>

   IRBuilder<>::InsertPointGuard Guard(Builder);<br>

<br>

@@ -6873,8 +6951,10 @@ class HorizontalReduction {<br>

       ArrayRef<Value *> VL = makeArrayRef(&ReducedVals[i], ReduxWidth);<br>

       V.buildTree(VL, ExternallyUsedValues, IgnoreList);<br>

       Optional<ArrayRef<unsigned>> Order = V.bestOrder();<br>

-      // TODO: Handle orders of size less than number of elements in the vector.<br>

-      if (Order && Order->size() == VL.size()) {<br>

+      if (Order) {<br>

+        assert(Order->size() == VL.size() &&<br>

+               "Order size must be the same as number of vectorized "<br>

+               "instructions.");<br>

         // TODO: reorder tree nodes without tree rebuilding.<br>

         SmallVector<Value *, 4> ReorderedOps(VL.size());<br>

         llvm::transform(*Order, ReorderedOps.begin(),<br>

<br>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll<br>

index 8b12b9272c7e..a84b1f7e4fcd 100644<br>

--- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll<br>

+++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll<br>

@@ -11,7 +11,7 @@<br>

 @h = common dso_local global float 0.000000e+00, align 4<br>

<br>

 define dso_local void @j() local_unnamed_addr {<br>

-; CHECK-LABEL: define {{[^@]+}}@j(<br>

+; CHECK-LABEL: @j(<br>

 ; CHECK-NEXT:  entry:<br>

 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** @b, align 8<br>

 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4<br>

@@ -19,42 +19,39 @@ define dso_local void @j() local_unnamed_addr {<br>

 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5<br>

 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[ARRAYIDX]] to <2 x i32>*<br>

 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4<br>

-; CHECK-NEXT:    [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <2 x i32> <i32 1, i32 0><br>

 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 13<br>

 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[ARRAYIDX1]] to <2 x i32>*<br>

 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4<br>

-; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <2 x i32> <i32 1, i32 0><br>

-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[REORDER_SHUFFLE]], [[REORDER_SHUFFLE1]]<br>

+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]]<br>

 ; CHECK-NEXT:    [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x float><br>

 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], <float 1.000000e+01, float 1.000000e+01><br>

-; CHECK-NEXT:    [[TMP8:%.*]] = fsub <2 x float> <float 0.000000e+00, float 1.000000e+00>, [[TMP7]]<br>

-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1><br>

-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1<br>

+; CHECK-NEXT:    [[TMP8:%.*]] = fsub <2 x float> <float 1.000000e+00, float 0.000000e+00>, [[TMP7]]<br>

+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1><br>

+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0<br>

 ; CHECK-NEXT:    store float [[TMP9]], float* @g, align 4<br>

-; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x float> [[SHUFFLE]], <float -1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 1.000000e+00><br>

-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 2<br>

+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x float> [[SHUFFLE]], <float -1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 1.000000e+00><br>

+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 3<br>

 ; CHECK-NEXT:    store float [[TMP11]], float* @c, align 4<br>

-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 0<br>

+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 2<br>

 ; CHECK-NEXT:    store float [[TMP12]], float* @d, align 4<br>

-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]], i32 3<br>

+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]], i32 1<br>

 ; CHECK-NEXT:    store float [[TMP13]], float* @e, align 4<br>

-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]], i32 1<br>

+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]], i32 0<br>

 ; CHECK-NEXT:    store float [[TMP14]], float* @f, align 4<br>

 ; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 14<br>

 ; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 15<br>

 ; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* @a, align 4<br>

 ; CHECK-NEXT:    [[CONV19:%.*]] = sitofp i32 [[TMP15]] to float<br>

-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> undef, float [[CONV19]], i32 0<br>

-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float -1.000000e+00, i32 1<br>

-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0<br>

-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP18]], i32 2<br>

-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float -1.000000e+00, i32 3<br>

-; CHECK-NEXT:    [[TMP21:%.*]] = fsub <4 x float> [[TMP10]], [[TMP20]]<br>

-; CHECK-NEXT:    [[TMP22:%.*]] = fadd <4 x float> [[TMP10]], [[TMP20]]<br>

-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x float> [[TMP21]], <4 x float> [[TMP22]], <4 x i32> <i32 0, i32 5, i32 2, i32 7><br>

-; CHECK-NEXT:    [[TMP24:%.*]] = fptosi <4 x float> [[TMP23]] to <4 x i32><br>

-; CHECK-NEXT:    [[TMP25:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>*<br>

-; CHECK-NEXT:    store <4 x i32> [[TMP24]], <4 x i32>* [[TMP25]], align 4<br>

+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> <float -1.000000e+00, float -1.000000e+00, float undef, float undef>, float [[CONV19]], i32 2<br>

+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 2<br>

+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP17]], i32 3<br>

+; CHECK-NEXT:    [[TMP19:%.*]] = fadd <4 x float> [[TMP10]], [[TMP18]]<br>

+; CHECK-NEXT:    [[TMP20:%.*]] = fsub <4 x float> [[TMP10]], [[TMP18]]<br>

+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 6, i32 7><br>

+; CHECK-NEXT:    [[TMP22:%.*]] = fptosi <4 x float> [[TMP21]] to <4 x i32><br>

+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP22]], <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1><br>

+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>*<br>

+; CHECK-NEXT:    store <4 x i32> [[REORDER_SHUFFLE]], <4 x i32>* [[TMP23]], align 4<br>

 ; CHECK-NEXT:    ret void<br>

 ;<br>

 entry:<br>

<br>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll<br>

index 384e540efb79..9ed21a1c3f8c 100644<br>

--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll<br>

+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll<br>

@@ -14,11 +14,10 @@ define void @hoge() {<br>

 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i16> undef, i16 [[T]], i32 0<br>

 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> [[TMP0]], i16 undef, i32 1<br>

 ; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i32><br>

-; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <2 x i32> <i32 1, i32 0><br>

-; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <2 x i32> <i32 63, i32 undef>, [[REORDER_SHUFFLE]]<br>

+; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <2 x i32> <i32 undef, i32 63>, [[TMP2]]<br>

 ; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i32> [[TMP3]], undef<br>

-; CHECK-NEXT:    [[SHUFFLE5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 1><br>

-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE5]], <i32 undef, i32 15, i32 31, i32 47><br>

+; CHECK-NEXT:    [[SHUFFLE5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 1><br>

+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE5]], <i32 15, i32 31, i32 47, i32 undef><br>

 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]])<br>

 ; CHECK-NEXT:    [[T19:%.*]] = select i1 undef, i32 [[TMP6]], i32 undef<br>

 ; CHECK-NEXT:    [[T20:%.*]] = icmp sgt i32 [[T19]], 63<br>

<br>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll<br>

index b7cff2dac5d4..02e7c5b37f3e 100644<br>

--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll<br>

+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll<br>

@@ -7,16 +7,15 @@ define i32 @foo(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4<br>

 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 1<br>

 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARR]] to <2 x i32>*<br>

 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4<br>

-; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> <i32 1, i32 0><br>

-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[REORDER_SHUFFLE]], <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1><br>

-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0<br>

-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1<br>

-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2<br>

-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3<br>

-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4<br>

-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5<br>

-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6<br>

-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7<br>

+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1><br>

+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A7:%.*]], i32 0<br>

+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A8:%.*]], i32 1<br>

+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A1:%.*]], i32 2<br>

+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A2:%.*]], i32 3<br>

+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A3:%.*]], i32 4<br>

+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A4:%.*]], i32 5<br>

+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A5:%.*]], i32 6<br>

+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A6:%.*]], i32 7<br>

 ; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]<br>

 ; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])<br>

 ; CHECK-NEXT:    ret i32 [[TMP11]]<br>

@@ -58,16 +57,15 @@ define i32 @foo1(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a<br>

 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 3<br>

 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>*<br>

 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4<br>

-; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0><br>

-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 0, i32 0, i32 3, i32 1, i32 0><br>

-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0<br>

-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1<br>

-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2<br>

-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3<br>

-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4<br>

-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5<br>

+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 3><br>

+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A6:%.*]], i32 0<br>

+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1<br>

+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A4:%.*]], i32 2<br>

+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A5:%.*]], i32 3<br>

+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A8:%.*]], i32 4<br>

+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A2:%.*]], i32 5<br>

 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6<br>

-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7<br>

+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7<br>

 ; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]<br>

 ; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])<br>

 ; CHECK-NEXT:    ret i32 [[TMP11]]<br>

@@ -113,16 +111,15 @@ define i32 @foo2(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a<br>

 ; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 1<br>

 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>*<br>

 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4<br>

-; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1><br>

-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 2, i32 3, i32 2, i32 1, i32 3><br>

-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0<br>

-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1<br>

-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2<br>

-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3<br>

-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4<br>

-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5<br>

-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6<br>

-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7<br>

+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3><br>

+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A4:%.*]], i32 0<br>

+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A6:%.*]], i32 1<br>

+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A5:%.*]], i32 2<br>

+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A8:%.*]], i32 3<br>

+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A2:%.*]], i32 4<br>

+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A7:%.*]], i32 5<br>

+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A1:%.*]], i32 6<br>

+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7<br>

 ; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]<br>

 ; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])<br>

 ; CHECK-NEXT:    ret i32 [[TMP11]]<br>

<br>

<br>

<br>

_______________________________________________<br>

llvm-commits mailing list<br>

<a href="mailto:llvm-commits@lists.llvm.org" target="_blank">llvm-commits@lists.llvm.org</a><br>

<a href="https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits" rel="noreferrer" target="_blank">https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits</a><br>

</blockquote></div>