[llvm] 8220415 - [SLP] Make getSameOpcode support different instructions if they have same semantics. (#112181)

via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 12 20:06:14 PST 2024


Author: Han-Kuan Chen
Date: 2024-12-13T12:06:10+08:00
New Revision: 82204154b7bd1f8c487c94c7ef00399d776b29f0

URL: https://github.com/llvm/llvm-project/commit/82204154b7bd1f8c487c94c7ef00399d776b29f0
DIFF: https://github.com/llvm/llvm-project/commit/82204154b7bd1f8c487c94c7ef00399d776b29f0.diff

LOG: [SLP] Make getSameOpcode support different instructions if they have same semantics. (#112181)

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
    llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
    llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
    llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
    llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
    llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
    llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll
    llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll
    llvm/test/Transforms/SLPVectorizer/X86/multi-extracts-bv-combined.ll
    llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
    llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
    llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll
    llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0e11e8704db2f2..f1391ae93c8630 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -842,8 +842,123 @@ class InstructionsState {
   static InstructionsState invalid() { return {nullptr, nullptr}; }
 };
 
+struct InterchangeableInstruction {
+  unsigned Opcode;
+  SmallVector<Value *> Ops;
+  template <class... ArgTypes>
+  InterchangeableInstruction(unsigned Opcode, ArgTypes &&...Args)
+      : Opcode(Opcode), Ops{std::forward<decltype(Args)>(Args)...} {}
+};
+
+bool operator<(const InterchangeableInstruction &LHS,
+               const InterchangeableInstruction &RHS) {
+  return LHS.Opcode < RHS.Opcode;
+}
+
 } // end anonymous namespace
 
+/// \returns a sorted list of interchangeable instructions by instruction opcode
+/// that \p I can be converted to.
+/// e.g.,
+/// x << y -> x * (2^y)
+/// x << 1 -> x *   2
+/// x << 0 -> x *   1   -> x - 0 -> x + 0 -> x & 11...1 -> x | 0
+///           x *   0                     -> x & 0
+///           x *  -1   -> 0 - x
+/// TODO: support more patterns
+static SmallVector<InterchangeableInstruction>
+getInterchangeableInstruction(Instruction *I) {
+  // PII = Possible Interchangeable Instruction
+  SmallVector<InterchangeableInstruction> PII;
+  unsigned Opcode = I->getOpcode();
+  PII.emplace_back(Opcode, I->operands());
+  if (!is_contained({Instruction::Shl, Instruction::Mul, Instruction::Sub,
+                     Instruction::Add},
+                    Opcode))
+    return PII;
+  Constant *C;
+  if (match(I, m_BinOp(m_Value(), m_Constant(C)))) {
+    ConstantInt *V = nullptr;
+    if (auto *CI = dyn_cast<ConstantInt>(C)) {
+      V = CI;
+    } else if (auto *CDV = dyn_cast<ConstantDataVector>(C)) {
+      if (auto *CI = dyn_cast_if_present<ConstantInt>(CDV->getSplatValue()))
+        V = CI;
+    }
+    if (!V)
+      return PII;
+    Value *Op0 = I->getOperand(0);
+    Type *Op1Ty = I->getOperand(1)->getType();
+    const APInt &Op1Int = V->getValue();
+    Constant *Zero =
+        ConstantInt::get(Op1Ty, APInt::getZero(Op1Int.getBitWidth()));
+    Constant *UnsignedMax =
+        ConstantInt::get(Op1Ty, APInt::getMaxValue(Op1Int.getBitWidth()));
+    switch (Opcode) {
+    case Instruction::Shl: {
+      PII.emplace_back(Instruction::Mul, Op0,
+                       ConstantInt::get(Op1Ty, 1 << Op1Int.getZExtValue()));
+      if (Op1Int.isZero()) {
+        PII.emplace_back(Instruction::Sub, Op0, Zero);
+        PII.emplace_back(Instruction::Add, Op0, Zero);
+        PII.emplace_back(Instruction::And, Op0, UnsignedMax);
+        PII.emplace_back(Instruction::Or, Op0, Zero);
+      }
+      break;
+    }
+    case Instruction::Mul: {
+      if (Op1Int.isOne()) {
+        PII.emplace_back(Instruction::Sub, Op0, Zero);
+        PII.emplace_back(Instruction::Add, Op0, Zero);
+        PII.emplace_back(Instruction::And, Op0, UnsignedMax);
+        PII.emplace_back(Instruction::Or, Op0, Zero);
+      } else if (Op1Int.isZero()) {
+        PII.emplace_back(Instruction::And, Op0, Zero);
+      } else if (Op1Int.isAllOnes()) {
+        PII.emplace_back(Instruction::Sub, Zero, Op0);
+      }
+      break;
+    }
+    case Instruction::Sub:
+      if (Op1Int.isZero()) {
+        PII.emplace_back(Instruction::Add, Op0, Zero);
+        PII.emplace_back(Instruction::And, Op0, UnsignedMax);
+        PII.emplace_back(Instruction::Or, Op0, Zero);
+      }
+      break;
+    case Instruction::Add:
+      if (Op1Int.isZero()) {
+        PII.emplace_back(Instruction::And, Op0, UnsignedMax);
+        PII.emplace_back(Instruction::Or, Op0, Zero);
+      }
+      break;
+    }
+  }
+  // std::set_intersection requires a sorted range.
+  sort(PII);
+  return PII;
+}
+
+/// \returns the Op and operands which \p I convert to.
+static std::pair<Value *, SmallVector<Value *>>
+getInterchangeableInstruction(Instruction *I, Instruction *MainOp,
+                              Instruction *AltOp) {
+  SmallVector<InterchangeableInstruction> IIList =
+      getInterchangeableInstruction(I);
+  const auto *Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
+    return II.Opcode == MainOp->getOpcode();
+  });
+  if (Iter == IIList.end()) {
+    Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
+      return II.Opcode == AltOp->getOpcode();
+    });
+    assert(Iter != IIList.end() &&
+           "Cannot find an interchangeable instruction.");
+    return std::make_pair(AltOp, Iter->Ops);
+  }
+  return std::make_pair(MainOp, Iter->Ops);
+}
+
 /// \returns true if \p Opcode is allowed as part of the main/alternate
 /// instruction for SLP vectorization.
 ///
@@ -957,6 +1072,22 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
       return InstructionsState::invalid();
   }
   bool AnyPoison = InstCnt != VL.size();
+  // Currently, this is only used for binary ops.
+  // TODO: support all instructions
+  SmallVector<InterchangeableInstruction> InterchangeableOpcode =
+      getInterchangeableInstruction(cast<Instruction>(V));
+  SmallVector<InterchangeableInstruction> AlternateInterchangeableOpcode;
+  auto UpdateInterchangeableOpcode =
+      [](SmallVector<InterchangeableInstruction> &LHS,
+         ArrayRef<InterchangeableInstruction> RHS) {
+        SmallVector<InterchangeableInstruction> NewInterchangeableOpcode;
+        std::set_intersection(LHS.begin(), LHS.end(), RHS.begin(), RHS.end(),
+                              std::back_inserter(NewInterchangeableOpcode));
+        if (NewInterchangeableOpcode.empty())
+          return false;
+        LHS.swap(NewInterchangeableOpcode);
+        return true;
+      };
   for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
     auto *I = dyn_cast<Instruction>(VL[Cnt]);
     if (!I)
@@ -969,14 +1100,32 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
       return InstructionsState::invalid();
     unsigned InstOpcode = I->getOpcode();
     if (IsBinOp && isa<BinaryOperator>(I)) {
-      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+      SmallVector<InterchangeableInstruction> ThisInterchangeableOpcode(
+          getInterchangeableInstruction(I));
+      if (UpdateInterchangeableOpcode(InterchangeableOpcode,
+                                      ThisInterchangeableOpcode))
         continue;
-      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
-          isValidForAlternation(Opcode)) {
-        AltOpcode = InstOpcode;
-        AltIndex = Cnt;
+      if (AlternateInterchangeableOpcode.empty()) {
+        InterchangeableOpcode.erase(
+            remove_if(InterchangeableOpcode,
+                      [](const InterchangeableInstruction &I) {
+                        return !isValidForAlternation(I.Opcode);
+                      }),
+            InterchangeableOpcode.end());
+        ThisInterchangeableOpcode.erase(
+            remove_if(ThisInterchangeableOpcode,
+                      [](const InterchangeableInstruction &I) {
+                        return !isValidForAlternation(I.Opcode);
+                      }),
+            ThisInterchangeableOpcode.end());
+        if (InterchangeableOpcode.empty() || ThisInterchangeableOpcode.empty())
+          return InstructionsState::invalid();
+        AlternateInterchangeableOpcode.swap(ThisInterchangeableOpcode);
         continue;
       }
+      if (UpdateInterchangeableOpcode(AlternateInterchangeableOpcode,
+                                      ThisInterchangeableOpcode))
+        continue;
     } else if (IsCastOp && isa<CastInst>(I)) {
       Value *Op0 = IBase->getOperand(0);
       Type *Ty0 = Op0->getType();
@@ -1077,6 +1226,24 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
     return InstructionsState::invalid();
   }
 
+  if (IsBinOp) {
+    auto FindOp = [&](ArrayRef<InterchangeableInstruction> CandidateOp) {
+      for (Value *V : VL) {
+        if (isa<PoisonValue>(V))
+          continue;
+        for (const InterchangeableInstruction &I : CandidateOp)
+          if (cast<Instruction>(V)->getOpcode() == I.Opcode)
+            return cast<Instruction>(V);
+      }
+      llvm_unreachable(
+          "Cannot find the candidate instruction for InstructionsState.");
+    };
+    Instruction *MainOp = FindOp(InterchangeableOpcode);
+    Instruction *AltOp = AlternateInterchangeableOpcode.empty()
+                             ? MainOp
+                             : FindOp(AlternateInterchangeableOpcode);
+    return InstructionsState(MainOp, AltOp);
+  }
   return InstructionsState(cast<Instruction>(V),
                            cast<Instruction>(VL[AltIndex]));
 }
@@ -2407,42 +2574,46 @@ class BoUpSLP {
     }
 
     /// Go through the instructions in VL and append their operands.
-    void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *VL0) {
+    void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *MainOp,
+                            Instruction *AltOp) {
       assert(!VL.empty() && "Bad VL");
       assert((empty() || VL.size() == getNumLanes()) &&
              "Expected same number of lanes");
       // IntrinsicInst::isCommutative returns true if swapping the first "two"
       // arguments to the intrinsic produces the same result.
       constexpr unsigned IntrinsicNumOperands = 2;
-      unsigned NumOperands = VL0->getNumOperands();
-      ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
+      unsigned NumOperands = MainOp->getNumOperands();
+      ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
       OpsVec.resize(NumOperands);
       unsigned NumLanes = VL.size();
-      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+      for (unsigned OpIdx : seq<unsigned>(NumOperands))
         OpsVec[OpIdx].resize(NumLanes);
-        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
-          assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
-                 "Expected instruction or poison value");
-          // Our tree has just 3 nodes: the root and two operands.
-          // It is therefore trivial to get the APO. We only need to check the
-          // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
-          // RHS operand. The LHS operand of both add and sub is never attached
-          // to an inversese operation in the linearized form, therefore its APO
-          // is false. The RHS is true only if VL[Lane] is an inverse operation.
-
-          // Since operand reordering is performed on groups of commutative
-          // operations or alternating sequences (e.g., +, -), we can safely
-          // tell the inverse operations by checking commutativity.
-          if (isa<PoisonValue>(VL[Lane])) {
+      for (auto [Lane, V] : enumerate(VL)) {
+        assert((isa<Instruction>(V) || isa<PoisonValue>(V)) &&
+               "Expected instruction or poison value");
+        if (isa<PoisonValue>(V)) {
+          for (unsigned OpIdx : seq<unsigned>(NumOperands))
             OpsVec[OpIdx][Lane] = {
-                PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true,
+                PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true,
                 false};
-            continue;
-          }
-          bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
+          continue;
+        }
+        auto [SelectedOp, Ops] =
+            getInterchangeableInstruction(cast<Instruction>(V), MainOp, AltOp);
+        // Our tree has just 3 nodes: the root and two operands.
+        // It is therefore trivial to get the APO. We only need to check the
+        // opcode of V and whether the operand at OpIdx is the LHS or RHS
+        // operand. The LHS operand of both add and sub is never attached to an
+        // inversese operation in the linearized form, therefore its APO is
+        // false. The RHS is true only if V is an inverse operation.
+
+        // Since operand reordering is performed on groups of commutative
+        // operations or alternating sequences (e.g., +, -), we can safely
+        // tell the inverse operations by checking commutativity.
+        bool IsInverseOperation = !isCommutative(cast<Instruction>(SelectedOp));
+        for (unsigned OpIdx : seq<unsigned>(NumOperands)) {
           bool APO = (OpIdx == 0) ? false : IsInverseOperation;
-          OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
-                                 APO, false};
+          OpsVec[OpIdx][Lane] = {Ops[OpIdx], APO, false};
         }
       }
     }
@@ -2549,11 +2720,12 @@ class BoUpSLP {
 
   public:
     /// Initialize with all the operands of the instruction vector \p RootVL.
-    VLOperands(ArrayRef<Value *> RootVL, Instruction *VL0, const BoUpSLP &R)
+    VLOperands(ArrayRef<Value *> RootVL, Instruction *MainOp,
+               Instruction *AltOp, const BoUpSLP &R)
         : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
-          L(R.LI->getLoopFor((VL0->getParent()))) {
+          L(R.LI->getLoopFor(MainOp->getParent())) {
       // Append all the operands of RootVL.
-      appendOperandsOfVL(RootVL, VL0);
+      appendOperandsOfVL(RootVL, MainOp, AltOp);
     }
 
     /// \Returns a value vector with the operands across all lanes for the
@@ -3345,7 +3517,7 @@ class BoUpSLP {
 
     /// Set this bundle's operand from Scalars.
     void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
-      VLOperands Ops(Scalars, MainOp, R);
+      VLOperands Ops(Scalars, MainOp, AltOp, R);
       if (RequireReorder)
         Ops.reorder();
       for (unsigned I : seq<unsigned>(MainOp->getNumOperands()))
@@ -8561,7 +8733,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
 
       ValueList Left, Right;
-      VLOperands Ops(VL, VL0, *this);
+      VLOperands Ops(VL, VL0, S.getAltOp(), *this);
       if (cast<CmpInst>(VL0)->isCommutative()) {
         // Commutative predicate - collect + sort operands of the instructions
         // so that each side is more likely to have the same opcode.
@@ -15619,7 +15791,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       Value *V = Builder.CreateBinOp(
           static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
           RHS);
-      propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
+      propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
       if (auto *I = dyn_cast<Instruction>(V)) {
         V = ::propagateMetadata(I, E->Scalars);
         // Drop nuw flags for abs(sub(commutative), true).

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
index feb4ad865f3147..c65df26fa0d5cc 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
@@ -314,10 +314,10 @@ define void @store_try_reorder(ptr %dst) {
 ;
 ; POW2-ONLY-LABEL: @store_try_reorder(
 ; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[ADD:%.*]] = add i32 0, 0
-; POW2-ONLY-NEXT:    store i32 [[ADD]], ptr [[DST:%.*]], align 4
-; POW2-ONLY-NEXT:    [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
-; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
+; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
+; POW2-ONLY-NEXT:    [[ADD216:%.*]] = sub i32 0, 0
+; POW2-ONLY-NEXT:    [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT:    store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4
 ; POW2-ONLY-NEXT:    ret void
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
index fd3d4ab80b29cc..74d7f1c91f3bff 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
@@ -7,19 +7,18 @@ define void @test(ptr %a, i64 %0) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[TMP0]], i32 0
 ; CHECK-NEXT:    br label %[[BB:.*]]
 ; CHECK:       [[BB]]:
-; CHECK-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 0>
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr double, <2 x ptr> [[TMP2]], <2 x i64> [[TMP5]]
-; CHECK-NEXT:    [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> splat (i1 true), <2 x double> poison)
-; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[A]], align 8
-; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x double>, ptr [[A]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = fsub <2 x double> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> splat (i1 true), <2 x double> poison)
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x double>, ptr [[A]], align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = fsub <2 x double> [[TMP7]], [[TMP10]]
-; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v2f64.p0.i64(<2 x double> [[TMP11]], ptr align 8 [[ARRAYIDX17_I28_1]], i64 -8, <2 x i1> splat (i1 true), i32 2)
+; CHECK-NEXT:    [[TMP12:%.*]] = fsub <2 x double> [[TMP9]], [[TMP11]]
+; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v2f64.p0.i64(<2 x double> [[TMP12]], ptr align 8 [[TMP8]], i64 -8, <2 x i1> splat (i1 true), i32 2)
 ; CHECK-NEXT:    br label %[[BB]]
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
index 7ab5e4d6cb787e..89b87a3a45d12c 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
@@ -324,10 +324,10 @@ define void @store_try_reorder(ptr %dst) {
 ;
 ; POW2-ONLY-LABEL: @store_try_reorder(
 ; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[ADD:%.*]] = add i32 0, 0
-; POW2-ONLY-NEXT:    store i32 [[ADD]], ptr [[DST:%.*]], align 4
-; POW2-ONLY-NEXT:    [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
-; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
+; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
+; POW2-ONLY-NEXT:    [[ADD216:%.*]] = sub i32 0, 0
+; POW2-ONLY-NEXT:    [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT:    store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4
 ; POW2-ONLY-NEXT:    ret void
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
index f46a5d84a86cc9..e3b4898e852126 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
@@ -10,10 +10,8 @@ define i32 @foo(ptr nocapture %A, i32 %n) {
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 (...) @bar()
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10>
-; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], splat (i32 9)
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 8, i32 10>
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP2]], splat (i32 9)
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    ret i32 undef
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
index 889f5a95c81d69..7af0c64f187480 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
@@ -4,22 +4,17 @@
 define void @test(ptr %0, ptr %1, ptr %2) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 4
-; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr [[TMP1:%.*]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = sub <4 x i32> <i32 0, i32 0, i32 undef, i32 0>, [[TMP8]]
-; CHECK-NEXT:    [[TMP12:%.*]] = sub <4 x i32> [[TMP11]], [[TMP10]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]]
-; CHECK-NEXT:    [[TMP14:%.*]] = add <4 x i32> [[TMP13]], <i32 0, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP15:%.*]] = sub <4 x i32> [[TMP13]], <i32 0, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> <i32 2, i32 0, i32 1, i32 7>
-; CHECK-NEXT:    [[TMP17:%.*]] = add <4 x i32> [[TMP16]], zeroinitializer
-; CHECK-NEXT:    [[TMP18:%.*]] = sub <4 x i32> [[TMP16]], zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> [[TMP18]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP20:%.*]] = add <4 x i32> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = sub <4 x i32> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> [[TMP21]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; CHECK-NEXT:    store <4 x i32> [[TMP22]], ptr [[TMP2:%.*]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[TMP1:%.*]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = sub <4 x i32> <i32 0, i32 0, i32 undef, i32 0>, [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], [[TMP5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add <4 x i32> <i32 0, i32 0, i32 1, i32 0>, [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add <4 x i32> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+; CHECK-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP2:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %4 = load i32, ptr %1, align 4

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
index c976525b6720eb..d474a5f2cecae0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
@@ -4,21 +4,17 @@
 define i64 @foo(i32 %tmp7) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, i32 [[TMP5:%.*]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP24:%.*]] = sub i32 undef, 0
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 undef, i32 0>, i32 [[TMP24]], i32 4
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 0, i32 5
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 undef, i32 poison, i32 undef>, i32 [[TMP24]], i32 6
-; CHECK-NEXT:    [[TMP12:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP3]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP12]], <8 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 poison, i32 2, i32 3, i32 poison, i32 14, i32 poison>
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <8 x i32> [[TMP1]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 5, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP8:%.*]] = add <8 x i32> zeroinitializer, [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = xor <8 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison>, i32 [[TMP7:%.*]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = sub <8 x i32> [[TMP0]], <i32 0, i32 0, i32 poison, i32 0, i32 0, i32 poison, i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 0>, <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 14, i32 poison, i32 poison, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 0, i32 5
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <8 x i32> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = sub nsw <8 x i32> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 5, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP7:%.*]] = add <8 x i32> zeroinitializer, [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <8 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP8]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP9]], 0
 ; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[OP_RDX]] to i64
 ; CHECK-NEXT:    ret i64 [[TMP64]]
 ;
@@ -29,7 +25,7 @@ bb:
   %tmp4 = xor i32 %tmp3, 0
   %tmp6 = sub i32 0, 0
   %tmp8 = sub i32 %tmp7, 0
-  %tmp9 = sub nsw i32 0, undef
+  %tmp9 = sub nsw i32 0, poison
   %tmp10 = add nsw i32 0, %tmp6
   %tmp11 = sub nsw i32 0, %tmp8
   %tmp12 = add i32 0, %tmp10
@@ -44,10 +40,10 @@ bb:
   %tmp21 = add i32 %tmp20, %tmp17
   %tmp22 = sub i32 0, 0
   %tmp23 = add i32 0, 0
-  %tmp24 = sub i32 undef, 0
-  %tmp25 = add nsw i32 %tmp23, undef
+  %tmp24 = sub i32 poison, 0
+  %tmp25 = add nsw i32 %tmp23, poison
   %tmp26 = add nsw i32 %tmp24, %tmp22
-  %tmp27 = sub nsw i32 undef, %tmp24
+  %tmp27 = sub nsw i32 poison, %tmp24
   %tmp28 = add i32 0, %tmp25
   %tmp29 = xor i32 %tmp28, 0
   %tmp30 = add i32 0, %tmp26
@@ -58,7 +54,7 @@ bb:
   %tmp35 = add i32 %tmp34, %tmp29
   %tmp36 = add i32 %tmp35, 0
   %tmp37 = add i32 %tmp36, %tmp33
-  %tmp38 = sub nsw i32 0, undef
+  %tmp38 = sub nsw i32 0, poison
   %tmp39 = add i32 0, %tmp38
   %tmp40 = xor i32 %tmp39, 0
   %tmp41 = add i32 0, %tmp37

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll
index 02c3173adc654f..fc62b0b38fd535 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll
@@ -9,12 +9,10 @@ define i32 @foo(ptr nocapture %A, i32 %n, i32 %m) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10>
-; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], splat (i32 9)
-; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 8, i32 10>
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[TMP2]], splat (i32 9)
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
 ; CHECK-NEXT:    [[EXTERNALUSE1:%.*]] = add nsw i32 [[TMP6]], [[M:%.*]]
 ; CHECK-NEXT:    [[EXTERNALUSE2:%.*]] = mul nsw i32 [[TMP6]], [[M]]
 ; CHECK-NEXT:    [[ADD10:%.*]] = add nsw i32 [[EXTERNALUSE1]], [[EXTERNALUSE2]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll
index 2a5bfa73907704..daab4b6ea4c957 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll
@@ -8,10 +8,8 @@ define i32 @test() {
 ; CHECK-NEXT:    [[TMP10:%.*]] = or i8 [[A_PROMOTED]], 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i8> poison, i8 [[A_PROMOTED]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i8> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[TMP5]], <i16 0, i16 -1, i16 0, i16 0>
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP6]])
 ; CHECK-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP7]] to i32

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-extracts-bv-combined.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-extracts-bv-combined.ll
index e6a166c27ac494..94f2c79faa8c93 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-extracts-bv-combined.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-extracts-bv-combined.ll
@@ -9,9 +9,7 @@ define i32 @foo() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 undef, i32 1, i32 0>, i32 [[D]], i32 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <8 x i32> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    store <8 x i32> [[TMP4]], ptr getelementptr inbounds ([64 x i32], ptr null, i64 0, i64 15), align 4
+; CHECK-NEXT:    store <8 x i32> [[TMP2]], ptr getelementptr inbounds ([64 x i32], ptr null, i64 0, i64 15), align 4
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
index 6e2a43ac5f9f10..15dd6756cd7dbb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
@@ -242,13 +242,18 @@ exit:
 }
 
 define void @store_try_reorder(ptr %dst) {
-; CHECK-LABEL: @store_try_reorder(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 0, 0
-; CHECK-NEXT:    store i32 [[ADD]], ptr [[DST:%.*]], align 4
-; CHECK-NEXT:    [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
-; CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @store_try_reorder(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    store <3 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @store_try_reorder(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
+; POW2-ONLY-NEXT:    [[ADD216:%.*]] = sub i32 0, 0
+; POW2-ONLY-NEXT:    [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT:    store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4
+; POW2-ONLY-NEXT:    ret void
 ;
 entry:
   %add = add i32 0, 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
index c250029519590f..e4eff0f72b3565 100644
--- a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
@@ -1,18 +1,29 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=X86 %}
+; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=AARCH64 %}
 
 define <2 x i32> @test(i32 %arg) {
-; CHECK-LABEL: define <2 x i32> @test(
-; CHECK-SAME: i32 [[ARG:%.*]]) {
-; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[ARG]], 0
-; CHECK-NEXT:    [[MUL:%.*]] = mul i32 0, 1
-; CHECK-NEXT:    [[MUL1:%.*]] = mul i32 [[OR]], [[MUL]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 0, [[MUL1]]
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[OR]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[MUL]], i32 1
-; CHECK-NEXT:    ret <2 x i32> [[TMP1]]
+; X86-LABEL: define <2 x i32> @test(
+; X86-SAME: i32 [[ARG:%.*]]) {
+; X86-NEXT:  bb:
+; X86-NEXT:    [[OR:%.*]] = or i32 [[ARG]], 0
+; X86-NEXT:    [[MUL:%.*]] = mul i32 0, 1
+; X86-NEXT:    [[MUL1:%.*]] = mul i32 [[OR]], [[MUL]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ugt i32 0, [[MUL1]]
+; X86-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[OR]], i32 0
+; X86-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[MUL]], i32 1
+; X86-NEXT:    ret <2 x i32> [[TMP1]]
+;
+; AARCH64-LABEL: define <2 x i32> @test(
+; AARCH64-SAME: i32 [[ARG:%.*]]) {
+; AARCH64-NEXT:  bb:
+; AARCH64-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[ARG]], i32 0
+; AARCH64-NEXT:    [[TMP1:%.*]] = or <2 x i32> [[TMP0]], zeroinitializer
+; AARCH64-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; AARCH64-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; AARCH64-NEXT:    [[MUL1:%.*]] = mul i32 [[TMP2]], [[TMP3]]
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp ugt i32 0, [[MUL1]]
+; AARCH64-NEXT:    ret <2 x i32> [[TMP1]]
 ;
 bb:
   %or = or i32 %arg, 0
@@ -23,4 +34,3 @@ bb:
   %1 = insertelement <2 x i32> %0, i32 %mul, i32 1
   ret <2 x i32> %1
 }
-

diff  --git a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll
index 61a84a67c9ff19..6f9768af38caf6 100644
--- a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll
+++ b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll
@@ -5,15 +5,13 @@ define void @func(i32 %0) {
 ; CHECK-LABEL: define void @func(
 ; CHECK-SAME: i32 [[TMP0:%.*]]) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> <i32 0, i32 poison, i32 0, i32 0>, i32 [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = shl i32 [[TMP0]], 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <32 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP6]] to i64
 ; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP9]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <32 x i32> [[TMP11]], <32 x i32> <i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, <32 x i32> <i32 poison, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <32 x i32> [[TMP12]], i32 0, i32 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> [[TMP13]], <8 x i32> zeroinitializer, i64 16)
@@ -24,61 +22,61 @@ define void @func(i32 %0) {
 ; CHECK-NEXT:    [[TMP19:%.*]] = sext <32 x i32> [[TMP18]] to <32 x i64>
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp slt <32 x i64> [[TMP19]], zeroinitializer
 ; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <32 x i1> [[TMP20]], i32 31
-; CHECK-NEXT:    [[TMP22:%.*]] = and i1 false, [[TMP21]]
+; CHECK-NEXT:    [[TMP76:%.*]] = and i1 false, [[TMP21]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <32 x i1> [[TMP20]], i32 30
-; CHECK-NEXT:    [[TMP24:%.*]] = and i1 false, [[TMP23]]
+; CHECK-NEXT:    [[TMP22:%.*]] = and i1 false, [[TMP23]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <32 x i1> [[TMP20]], i32 29
-; CHECK-NEXT:    [[TMP26:%.*]] = and i1 false, [[TMP25]]
+; CHECK-NEXT:    [[TMP24:%.*]] = and i1 false, [[TMP25]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <32 x i1> [[TMP20]], i32 28
-; CHECK-NEXT:    [[TMP28:%.*]] = and i1 false, [[TMP27]]
+; CHECK-NEXT:    [[TMP26:%.*]] = and i1 false, [[TMP27]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <32 x i1> [[TMP20]], i32 27
-; CHECK-NEXT:    [[TMP30:%.*]] = and i1 false, [[TMP29]]
+; CHECK-NEXT:    [[TMP28:%.*]] = and i1 false, [[TMP29]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <32 x i1> [[TMP20]], i32 26
-; CHECK-NEXT:    [[TMP32:%.*]] = and i1 false, [[TMP31]]
+; CHECK-NEXT:    [[TMP30:%.*]] = and i1 false, [[TMP31]]
 ; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <32 x i1> [[TMP20]], i32 25
-; CHECK-NEXT:    [[TMP34:%.*]] = and i1 false, [[TMP33]]
+; CHECK-NEXT:    [[TMP32:%.*]] = and i1 false, [[TMP33]]
 ; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <32 x i1> [[TMP20]], i32 24
-; CHECK-NEXT:    [[TMP36:%.*]] = and i1 false, [[TMP35]]
+; CHECK-NEXT:    [[TMP34:%.*]] = and i1 false, [[TMP35]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <32 x i1> [[TMP20]], i32 23
-; CHECK-NEXT:    [[TMP38:%.*]] = and i1 false, [[TMP37]]
+; CHECK-NEXT:    [[TMP36:%.*]] = and i1 false, [[TMP37]]
 ; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <32 x i1> [[TMP20]], i32 22
-; CHECK-NEXT:    [[TMP40:%.*]] = and i1 false, [[TMP39]]
+; CHECK-NEXT:    [[TMP38:%.*]] = and i1 false, [[TMP39]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <32 x i1> [[TMP20]], i32 21
-; CHECK-NEXT:    [[TMP42:%.*]] = and i1 false, [[TMP41]]
+; CHECK-NEXT:    [[TMP40:%.*]] = and i1 false, [[TMP41]]
 ; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <32 x i1> [[TMP20]], i32 20
-; CHECK-NEXT:    [[TMP44:%.*]] = and i1 false, [[TMP43]]
+; CHECK-NEXT:    [[TMP42:%.*]] = and i1 false, [[TMP43]]
 ; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <32 x i1> [[TMP20]], i32 19
-; CHECK-NEXT:    [[TMP46:%.*]] = and i1 false, [[TMP45]]
+; CHECK-NEXT:    [[TMP44:%.*]] = and i1 false, [[TMP45]]
 ; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <32 x i1> [[TMP20]], i32 18
-; CHECK-NEXT:    [[TMP48:%.*]] = and i1 false, [[TMP47]]
+; CHECK-NEXT:    [[TMP46:%.*]] = and i1 false, [[TMP47]]
 ; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <32 x i1> [[TMP20]], i32 17
-; CHECK-NEXT:    [[TMP50:%.*]] = and i1 false, [[TMP49]]
+; CHECK-NEXT:    [[TMP48:%.*]] = and i1 false, [[TMP49]]
 ; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <32 x i1> [[TMP20]], i32 16
-; CHECK-NEXT:    [[TMP52:%.*]] = and i1 false, [[TMP51]]
+; CHECK-NEXT:    [[TMP50:%.*]] = and i1 false, [[TMP51]]
 ; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <32 x i1> [[TMP20]], i32 15
-; CHECK-NEXT:    [[TMP54:%.*]] = and i1 false, [[TMP53]]
+; CHECK-NEXT:    [[TMP52:%.*]] = and i1 false, [[TMP53]]
 ; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <32 x i1> [[TMP20]], i32 14
-; CHECK-NEXT:    [[TMP56:%.*]] = and i1 false, [[TMP55]]
+; CHECK-NEXT:    [[TMP54:%.*]] = and i1 false, [[TMP55]]
 ; CHECK-NEXT:    [[TMP57:%.*]] = extractelement <32 x i1> [[TMP20]], i32 13
-; CHECK-NEXT:    [[TMP58:%.*]] = and i1 false, [[TMP57]]
+; CHECK-NEXT:    [[TMP56:%.*]] = and i1 false, [[TMP57]]
 ; CHECK-NEXT:    [[TMP59:%.*]] = extractelement <32 x i1> [[TMP20]], i32 12
-; CHECK-NEXT:    [[TMP60:%.*]] = and i1 false, [[TMP59]]
+; CHECK-NEXT:    [[TMP58:%.*]] = and i1 false, [[TMP59]]
 ; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <32 x i1> [[TMP20]], i32 11
-; CHECK-NEXT:    [[TMP62:%.*]] = and i1 false, [[TMP61]]
+; CHECK-NEXT:    [[TMP60:%.*]] = and i1 false, [[TMP61]]
 ; CHECK-NEXT:    [[TMP63:%.*]] = extractelement <32 x i1> [[TMP20]], i32 10
-; CHECK-NEXT:    [[TMP64:%.*]] = and i1 false, [[TMP63]]
+; CHECK-NEXT:    [[TMP62:%.*]] = and i1 false, [[TMP63]]
 ; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <32 x i1> [[TMP20]], i32 9
-; CHECK-NEXT:    [[TMP66:%.*]] = and i1 false, [[TMP65]]
+; CHECK-NEXT:    [[TMP64:%.*]] = and i1 false, [[TMP65]]
 ; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <32 x i1> [[TMP20]], i32 8
-; CHECK-NEXT:    [[TMP68:%.*]] = and i1 false, [[TMP67]]
+; CHECK-NEXT:    [[TMP66:%.*]] = and i1 false, [[TMP67]]
 ; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <32 x i1> [[TMP20]], i32 7
-; CHECK-NEXT:    [[TMP70:%.*]] = and i1 false, [[TMP69]]
+; CHECK-NEXT:    [[TMP68:%.*]] = and i1 false, [[TMP69]]
 ; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <32 x i1> [[TMP20]], i32 6
-; CHECK-NEXT:    [[TMP72:%.*]] = and i1 false, [[TMP71]]
+; CHECK-NEXT:    [[TMP70:%.*]] = and i1 false, [[TMP71]]
 ; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <32 x i1> [[TMP20]], i32 5
-; CHECK-NEXT:    [[TMP74:%.*]] = and i1 false, [[TMP73]]
+; CHECK-NEXT:    [[TMP72:%.*]] = and i1 false, [[TMP73]]
 ; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <32 x i1> [[TMP20]], i32 4
-; CHECK-NEXT:    [[TMP76:%.*]] = and i1 false, [[TMP75]]
+; CHECK-NEXT:    [[TMP74:%.*]] = and i1 false, [[TMP75]]
 ; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <32 x i32> [[TMP18]], i32 0
 ; CHECK-NEXT:    [[TMP78:%.*]] = zext i32 [[TMP77]] to i64
 ; CHECK-NEXT:    [[TMP79:%.*]] = getelementptr float, ptr addrspace(1) null, i64 [[TMP78]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll b/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll
index 732b50396a460d..1e3255f2187af0 100644
--- a/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll
@@ -12,9 +12,7 @@ define i32 @test() {
 ; CHECK-NEXT:    br i1 false, label [[BB4:%.*]], label [[BB3]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> <i32 0, i32 poison>, <2 x i32> <i32 2, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i32> zeroinitializer, [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or <2 x i32> zeroinitializer, [[TMP2]]
-; CHECK-NEXT:    [[TMP5]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP5]] = or <2 x i32> zeroinitializer, [[TMP2]]
 ; CHECK-NEXT:    br label [[BB1]]
 ; CHECK:       bb4:
 ; CHECK-NEXT:    [[TMP6:%.*]] = phi <8 x i32> [ [[TMP1]], [[BB1]] ]


        


More information about the llvm-commits mailing list