[llvm] 455ca0e - [SLP] Allow reordering of vectorization trees with reused instructions.
Eric Christopher via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 18 13:47:38 PDT 2020
Following up here:
After discussing with Alexey on IRC, I've temporarily reverted this. Bogdan
was seeing infinite loops in compilation and is going to follow up with a
backtrace, and later with a test case if the backtrace isn't enough.
Reverted thusly:
echristo at athyra ~/s/llvm-project (master)> git push
To github.com:llvm/llvm-project.git
b168bbfae42..ecfd8161bf4 master -> master
Thanks a ton Alexey, we'll get back to you asap.
-eric
On Fri, Sep 18, 2020 at 9:38 AM Alexey Bataev via llvm-commits <
llvm-commits at lists.llvm.org> wrote:
>
> Author: Alexey Bataev
> Date: 2020-09-18T09:34:59-04:00
> New Revision: 455ca0ebb69210046928fedffe292420a30f89ad
>
> URL:
> https://github.com/llvm/llvm-project/commit/455ca0ebb69210046928fedffe292420a30f89ad
> DIFF:
> https://github.com/llvm/llvm-project/commit/455ca0ebb69210046928fedffe292420a30f89ad.diff
>
> LOG: [SLP] Allow reordering of vectorization trees with reused
> instructions.
>
> If some leaves have the same instructions to be vectorized, we may
> incorrectly evaluate the best order for the root node (it is built for the
> vector of instructions without repeated instructions and, thus, has fewer
> elements than the root node). In this case we simply cannot reorder
> the tree, and we may also calculate the wrong number of nodes that require
> the same reordering.
> For example, if the root node is \<a+b, a+c, a+d, f+e\>, then the leaves
> are \<a, a, a, f\> and \<b, c, d, e\>. When we try to vectorize the first
> leaf, it will be shrunk to \<a, f\>. If the instructions in this leaf should
> be reordered, the best order will be \<1, 0\>. We need to extend this
> order for the root node. For the root node this order should look like
> \<3, 0, 1, 2\>. This patch allows extension of the orders of the nodes
> with the reused instructions.
>
> Reviewed By: RKSimon
>
> Differential Revision: https://reviews.llvm.org/D45263
>
> Added:
>
>
> Modified:
> llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
> llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll
> llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
> llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
>
> Removed:
>
>
>
>
> ################################################################################
> diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
> b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
> index c487301177c1..e4cad01e958a 100644
> --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
> +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
> @@ -523,6 +523,15 @@ static bool isSimple(Instruction *I) {
>
> namespace llvm {
>
> +static void inversePermutation(ArrayRef<unsigned> Indices,
> + SmallVectorImpl<int> &Mask) {
> + Mask.clear();
> + const unsigned E = Indices.size();
> + Mask.resize(E, E + 1);
> + for (unsigned I = 0; I < E; ++I)
> + Mask[Indices[I]] = I;
> +}
> +
> namespace slpvectorizer {
>
> /// Bottom Up SLP Vectorizer.
> @@ -537,6 +546,7 @@ class BoUpSLP {
> using StoreList = SmallVector<StoreInst *, 8>;
> using ExtraValueToDebugLocsMap =
> MapVector<Value *, SmallVector<Instruction *, 2>>;
> + using OrdersType = SmallVector<unsigned, 4>;
>
> BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
> TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
> @@ -614,6 +624,14 @@ class BoUpSLP {
>
> /// \returns The best order of instructions for vectorization.
> Optional<ArrayRef<unsigned>> bestOrder() const {
> + assert(llvm::all_of(
> + NumOpsWantToKeepOrder,
> + [this](const decltype(NumOpsWantToKeepOrder)::value_type
> &D) {
> + return D.getFirst().size() ==
> + VectorizableTree[0]->Scalars.size();
> + }) &&
> + "All orders must have the same size as number of instructions
> in "
> + "tree node.");
> auto I = std::max_element(
> NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
> [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
> @@ -627,6 +645,79 @@ class BoUpSLP {
> return makeArrayRef(I->getFirst());
> }
>
> + /// Builds the correct order for root instructions.
> + /// If some leaves have the same instructions to be vectorized, we may
> + /// incorrectly evaluate the best order for the root node (it is built
> for the
> + /// vector of instructions without repeated instructions and, thus, has
> less
> + /// elements than the root node). This function builds the correct
> order for
> + /// the root node.
> + /// For example, if the root node is \<a+b, a+c, a+d, f+e\>, then the
> leaves
> + /// are \<a, a, a, f\> and \<b, c, d, e\>. When we try to vectorize the
> first
> + /// leaf, it will be shrink to \<a, b\>. If instructions in this leaf
> should
> + /// be reordered, the best order will be \<1, 0\>. We need to extend
> this
> + /// order for the root node. For the root node this order should look
> like
> + /// \<3, 0, 1, 2\>. This function extends the order for the reused
> + /// instructions.
> + void findRootOrder(OrdersType &Order) {
> + // If the leaf has the same number of instructions to vectorize as
> the root
> + // - order must be set already.
> + unsigned RootSize = VectorizableTree[0]->Scalars.size();
> + if (Order.size() == RootSize)
> + return;
> + SmallVector<unsigned, 4> RealOrder(Order.size());
> + std::swap(Order, RealOrder);
> + SmallVector<int, 4> Mask;
> + inversePermutation(RealOrder, Mask);
> + for (int I = 0, E = Mask.size(); I < E; ++I)
> + Order[I] = Mask[I];
> + // The leaf has less number of instructions - need to find the true
> order of
> + // the root.
> + // Scan the nodes starting from the leaf back to the root.
> + const TreeEntry *PNode = VectorizableTree.back().get();
> + while (PNode) {
> + const TreeEntry &Node = *PNode;
> + PNode = Node.UserTreeIndices.back().UserTE;
> + if (Node.ReuseShuffleIndices.empty())
> + continue;
> + // Build the order for the parent node.
> + OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize);
> + SmallVector<unsigned, 4> OrderCounter(Order.size(), 0);
> + // The algorithm of the order extension is:
> + // 1. Calculate the number of the same instructions for the order.
> + // 2. Calculate the index of the new order: total number of
> instructions
> + // with order less than the order of the current instruction + reuse
> + // number of the current instruction.
> + // 3. The new order is just the index of the instruction in the
> original
> + // vector of the instructions.
> + for (unsigned I : Node.ReuseShuffleIndices)
> + ++OrderCounter[Order[I]];
> + SmallVector<unsigned, 4> CurrentCounter(Order.size(), 0);
> + for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E;
> ++I) {
> + unsigned ReusedIdx = Node.ReuseShuffleIndices[I];
> + unsigned OrderIdx = Order[ReusedIdx];
> + unsigned NewIdx = 0;
> + for (unsigned J = 0; J < OrderIdx; ++J)
> + NewIdx += OrderCounter[J];
> + NewIdx += CurrentCounter[OrderIdx];
> + ++CurrentCounter[OrderIdx];
> + assert(NewOrder[NewIdx] == RootSize &&
> + "The order index should not be written already.");
> + NewOrder[NewIdx] = I;
> + }
> + std::swap(Order, NewOrder);
> + // If the size of the order is the same as number of instructions
> in the
> + // root node, no need to extend it more.
> + if (Order.size() == RootSize)
> + break;
> + }
> + assert((!PNode || Order.size() == RootSize) &&
> + "Root node is expected or the size of the order must be the
> same as "
> + "the number of elements in the root node.");
> + assert(llvm::all_of(Order,
> + [RootSize](unsigned Val) { return Val !=
> RootSize; }) &&
> + "All indices must be initialized");
> + }
> +
> /// \return The vector element size in bits to use when vectorizing the
> /// expression tree ending at \p V. If V is a store, the size is the
> width of
> /// the stored value. Otherwise, the size is the width of the largest
> loaded
> @@ -1467,7 +1558,7 @@ class BoUpSLP {
> SmallVector<int, 4> ReuseShuffleIndices;
>
> /// Does this entry require reordering?
> - ArrayRef<unsigned> ReorderIndices;
> + SmallVector<unsigned, 4> ReorderIndices;
>
> /// Points back to the VectorizableTree.
> ///
> @@ -1660,7 +1751,7 @@ class BoUpSLP {
> Last->State = Vectorized ? TreeEntry::Vectorize :
> TreeEntry::NeedToGather;
> Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
> ReuseShuffleIndices.end());
> - Last->ReorderIndices = ReorderIndices;
> + Last->ReorderIndices.append(ReorderIndices.begin(),
> ReorderIndices.end());
> Last->setOperations(S);
> if (Vectorized) {
> for (int i = 0, e = VL.size(); i != e; ++i) {
> @@ -2197,7 +2288,6 @@ class BoUpSLP {
> /// List of users to ignore during scheduling and that don't need
> extracting.
> ArrayRef<Value *> UserIgnoreList;
>
> - using OrdersType = SmallVector<unsigned, 4>;
> /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
> /// sorted SmallVectors of unsigned.
> struct OrdersTypeDenseMapInfo {
> @@ -2659,12 +2749,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL,
> unsigned Depth,
> });
> // Insert new order with initial value 0, if it does not exist,
> // otherwise return the iterator to the existing one.
> - auto StoredCurrentOrderAndNum =
> - NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
> - ++StoredCurrentOrderAndNum->getSecond();
> newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
> - ReuseShuffleIndicies,
> - StoredCurrentOrderAndNum->getFirst());
> + ReuseShuffleIndicies, CurrentOrder);
> + findRootOrder(CurrentOrder);
> + ++NumOpsWantToKeepOrder[CurrentOrder];
> // This is a special case, as it does not gather, but at the same
> time
> // we are not extending buildTree_rec() towards the operands.
> ValueList Op0;
> @@ -2741,13 +2829,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL,
> unsigned Depth,
> LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
> } else {
> // Need to reorder.
> - auto I =
> NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
> - ++I->getSecond();
> TreeEntry *TE =
> newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
> - ReuseShuffleIndicies, I->getFirst());
> + ReuseShuffleIndicies, CurrentOrder);
> TE->setOperandsInOrder();
> LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled
> loads.\n");
> + findRootOrder(CurrentOrder);
> + ++NumOpsWantToKeepOrder[CurrentOrder];
> }
> return;
> }
> @@ -3003,15 +3091,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL,
> unsigned Depth,
> buildTree_rec(Operands, Depth + 1, {TE, 0});
> LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
> } else {
> - // Need to reorder.
> - auto I =
> NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
> - ++(I->getSecond());
> TreeEntry *TE =
> newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
> - ReuseShuffleIndicies, I->getFirst());
> + ReuseShuffleIndicies, CurrentOrder);
> TE->setOperandsInOrder();
> buildTree_rec(Operands, Depth + 1, {TE, 0});
> LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled
> stores.\n");
> + findRootOrder(CurrentOrder);
> + ++NumOpsWantToKeepOrder[CurrentOrder];
> }
> return;
> }
> @@ -4141,15 +4228,6 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL)
> {
> return V;
> }
>
> -static void inversePermutation(ArrayRef<unsigned> Indices,
> - SmallVectorImpl<int> &Mask) {
> - Mask.clear();
> - const unsigned E = Indices.size();
> - Mask.resize(E);
> - for (unsigned I = 0; I < E; ++I)
> - Mask[Indices[I]] = I;
> -}
> -
> Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
> IRBuilder<>::InsertPointGuard Guard(Builder);
>
> @@ -6873,8 +6951,10 @@ class HorizontalReduction {
> ArrayRef<Value *> VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
> V.buildTree(VL, ExternallyUsedValues, IgnoreList);
> Optional<ArrayRef<unsigned>> Order = V.bestOrder();
> - // TODO: Handle orders of size less than number of elements in the
> vector.
> - if (Order && Order->size() == VL.size()) {
> + if (Order) {
> + assert(Order->size() == VL.size() &&
> + "Order size must be the same as number of vectorized "
> + "instructions.");
> // TODO: reorder tree nodes without tree rebuilding.
> SmallVector<Value *, 4> ReorderedOps(VL.size());
> llvm::transform(*Order, ReorderedOps.begin(),
>
> diff --git
> a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll
> b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll
> index 8b12b9272c7e..a84b1f7e4fcd 100644
> --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll
> +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll
> @@ -11,7 +11,7 @@
> @h = common dso_local global float 0.000000e+00, align 4
>
> define dso_local void @j() local_unnamed_addr {
> -; CHECK-LABEL: define {{[^@]+}}@j(
> +; CHECK-LABEL: @j(
> ; CHECK-NEXT: entry:
> ; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** @b, align 8
> ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32*
> [[TMP0]], i64 4
> @@ -19,42 +19,39 @@ define dso_local void @j() local_unnamed_addr {
> ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32*
> [[TMP0]], i64 5
> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[ARRAYIDX]] to <2 x i32>*
> ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]],
> align 4
> -; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x i32>
> [[TMP2]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32*
> [[TMP0]], i64 13
> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[ARRAYIDX1]] to <2 x i32>*
> ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]],
> align 4
> -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32>
> [[TMP4]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
> -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[REORDER_SHUFFLE]],
> [[REORDER_SHUFFLE1]]
> +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]]
> ; CHECK-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x float>
> ; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], <float
> 1.000000e+01, float 1.000000e+01>
> -; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x float> <float 0.000000e+00,
> float 1.000000e+00>, [[TMP7]]
> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP8]], <2
> x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
> -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SHUFFLE]],
> i32 1
> +; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x float> <float 1.000000e+00,
> float 0.000000e+00>, [[TMP7]]
> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP8]], <2
> x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SHUFFLE]],
> i32 0
> ; CHECK-NEXT: store float [[TMP9]], float* @g, align 4
> -; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[SHUFFLE]], <float
> -1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 1.000000e+00>
> -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]],
> i32 2
> +; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[SHUFFLE]], <float
> -1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 1.000000e+00>
> +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]],
> i32 3
> ; CHECK-NEXT: store float [[TMP11]], float* @c, align 4
> -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]],
> i32 0
> +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]],
> i32 2
> ; CHECK-NEXT: store float [[TMP12]], float* @d, align 4
> -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]],
> i32 3
> +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]],
> i32 1
> ; CHECK-NEXT: store float [[TMP13]], float* @e, align 4
> -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]],
> i32 1
> +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]],
> i32 0
> ; CHECK-NEXT: store float [[TMP14]], float* @f, align 4
> ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32*
> [[TMP0]], i64 14
> ; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, i32*
> [[TMP0]], i64 15
> ; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* @a, align 4
> ; CHECK-NEXT: [[CONV19:%.*]] = sitofp i32 [[TMP15]] to float
> -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> undef, float
> [[CONV19]], i32 0
> -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]],
> float -1.000000e+00, i32 1
> -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[SHUFFLE]],
> i32 0
> -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP17]],
> float [[TMP18]], i32 2
> -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]],
> float -1.000000e+00, i32 3
> -; CHECK-NEXT: [[TMP21:%.*]] = fsub <4 x float> [[TMP10]], [[TMP20]]
> -; CHECK-NEXT: [[TMP22:%.*]] = fadd <4 x float> [[TMP10]], [[TMP20]]
> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x float> [[TMP21]], <4
> x float> [[TMP22]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
> -; CHECK-NEXT: [[TMP24:%.*]] = fptosi <4 x float> [[TMP23]] to <4 x i32>
> -; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>*
> -; CHECK-NEXT: store <4 x i32> [[TMP24]], <4 x i32>* [[TMP25]], align 4
> +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> <float
> -1.000000e+00, float -1.000000e+00, float undef, float undef>, float
> [[CONV19]], i32 2
> +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[SHUFFLE]],
> i32 2
> +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP16]],
> float [[TMP17]], i32 3
> +; CHECK-NEXT: [[TMP19:%.*]] = fadd <4 x float> [[TMP10]], [[TMP18]]
> +; CHECK-NEXT: [[TMP20:%.*]] = fsub <4 x float> [[TMP10]], [[TMP18]]
> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP19]], <4
> x float> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
> +; CHECK-NEXT: [[TMP22:%.*]] = fptosi <4 x float> [[TMP21]] to <4 x i32>
> +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32>
> [[TMP22]], <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>*
> +; CHECK-NEXT: store <4 x i32> [[REORDER_SHUFFLE]], <4 x i32>*
> [[TMP23]], align 4
> ; CHECK-NEXT: ret void
> ;
> entry:
>
> diff --git
> a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
> b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
> index 384e540efb79..9ed21a1c3f8c 100644
> --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
> +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
> @@ -14,11 +14,10 @@ define void @hoge() {
> ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i16> undef, i16 [[T]],
> i32 0
> ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i16> [[TMP0]], i16
> undef, i32 1
> ; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i32>
> -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32>
> [[TMP2]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
> -; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <2 x i32> <i32 63, i32 undef>,
> [[REORDER_SHUFFLE]]
> +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <2 x i32> <i32 undef, i32 63>,
> [[TMP2]]
> ; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP3]], undef
> -; CHECK-NEXT: [[SHUFFLE5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2
> x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
> -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE5]], <i32 undef,
> i32 15, i32 31, i32 47>
> +; CHECK-NEXT: [[SHUFFLE5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2
> x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
> +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE5]], <i32 15, i32
> 31, i32 47, i32 undef>
> ; CHECK-NEXT: [[TMP6:%.*]] = call i32
> @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]])
> ; CHECK-NEXT: [[T19:%.*]] = select i1 undef, i32 [[TMP6]], i32 undef
> ; CHECK-NEXT: [[T20:%.*]] = icmp sgt i32 [[T19]], 63
>
> diff --git
> a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
> b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
> index b7cff2dac5d4..02e7c5b37f3e 100644
> --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
> +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
> @@ -7,16 +7,15 @@ define i32 @foo(i32* nocapture readonly %arr, i32 %a1,
> i32 %a2, i32 %a3, i32 %a4
> ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32*
> [[ARR:%.*]], i64 1
> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <2 x i32>*
> ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]],
> align 4
> -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32>
> [[TMP1]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32>
> [[REORDER_SHUFFLE]], <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32
> 0, i32 0, i32 0, i32 1, i32 1>
> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32
> [[A1:%.*]], i32 0
> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32
> [[A2:%.*]], i32 1
> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32
> [[A3:%.*]], i32 2
> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32
> [[A4:%.*]], i32 3
> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32
> [[A5:%.*]], i32 4
> -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32
> [[A6:%.*]], i32 5
> -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32
> [[A7:%.*]], i32 6
> -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32
> [[A8:%.*]], i32 7
> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x
> i32> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32
> 1>
> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32
> [[A7:%.*]], i32 0
> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32
> [[A8:%.*]], i32 1
> +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32
> [[A1:%.*]], i32 2
> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32
> [[A2:%.*]], i32 3
> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32
> [[A3:%.*]], i32 4
> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32
> [[A4:%.*]], i32 5
> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32
> [[A5:%.*]], i32 6
> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32
> [[A6:%.*]], i32 7
> ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
> ; CHECK-NEXT: [[TMP11:%.*]] = call i32
> @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
> ; CHECK-NEXT: ret i32 [[TMP11]]
> @@ -58,16 +57,15 @@ define i32 @foo1(i32* nocapture readonly %arr, i32
> %a1, i32 %a2, i32 %a3, i32 %a
> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32*
> [[ARR]], i64 3
> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>*
> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]],
> align 4
> -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32>
> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32>
> [[REORDER_SHUFFLE]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32
> 0, i32 0, i32 3, i32 1, i32 0>
> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32
> [[A1:%.*]], i32 0
> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32
> [[A2:%.*]], i32 1
> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32
> [[A3:%.*]], i32 2
> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32
> [[A4:%.*]], i32 3
> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32
> [[A5:%.*]], i32 4
> -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32
> [[A6:%.*]], i32 5
> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x
> i32> undef, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32
> 3>
> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32
> [[A6:%.*]], i32 0
> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32
> [[A1:%.*]], i32 1
> +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32
> [[A4:%.*]], i32 2
> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32
> [[A5:%.*]], i32 3
> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32
> [[A8:%.*]], i32 4
> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32
> [[A2:%.*]], i32 5
> ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32
> [[A7:%.*]], i32 6
> -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32
> [[A8:%.*]], i32 7
> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32
> [[A3:%.*]], i32 7
> ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
> ; CHECK-NEXT: [[TMP11:%.*]] = call i32
> @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
> ; CHECK-NEXT: ret i32 [[TMP11]]
> @@ -113,16 +111,15 @@ define i32 @foo2(i32* nocapture readonly %arr, i32
> %a1, i32 %a2, i32 %a3, i32 %a
> ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32*
> [[ARR]], i64 1
> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>*
> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]],
> align 4
> -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32>
> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32>
> [[REORDER_SHUFFLE]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32
> 2, i32 3, i32 2, i32 1, i32 3>
> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32
> [[A1:%.*]], i32 0
> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32
> [[A2:%.*]], i32 1
> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32
> [[A3:%.*]], i32 2
> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32
> [[A4:%.*]], i32 3
> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32
> [[A5:%.*]], i32 4
> -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32
> [[A6:%.*]], i32 5
> -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32
> [[A7:%.*]], i32 6
> -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32
> [[A8:%.*]], i32 7
> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x
> i32> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32
> 3>
> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32
> [[A4:%.*]], i32 0
> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32
> [[A6:%.*]], i32 1
> +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32
> [[A5:%.*]], i32 2
> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32
> [[A8:%.*]], i32 3
> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32
> [[A2:%.*]], i32 4
> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32
> [[A7:%.*]], i32 5
> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32
> [[A1:%.*]], i32 6
> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32
> [[A3:%.*]], i32 7
> ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
> ; CHECK-NEXT: [[TMP11:%.*]] = call i32
> @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
> ; CHECK-NEXT: ret i32 [[TMP11]]
>
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20200918/475fb1dd/attachment-0001.html>
More information about the llvm-commits
mailing list