[llvm] [SLP] Make getSameOpcode support different instructions if they have same semantics. (PR #112181)

Han-Kuan Chen via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 11 01:09:28 PST 2024


https://github.com/HanKuanChen updated https://github.com/llvm/llvm-project/pull/112181

>From ad591acf5eb8609692e6700cc0e5c66e49cf7035 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Thu, 3 Oct 2024 06:39:59 -0700
Subject: [PATCH 1/8] [SLP] Make getSameOpcode support different instructions
 if they have same semantics.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 237 +++++++++++++++---
 .../SLPVectorizer/AArch64/vec3-base.ll        |   8 +-
 ...reversed-strided-node-with-external-ptr.ll |   7 +-
 .../SLPVectorizer/RISCV/vec3-base.ll          |   8 +-
 .../SLPVectorizer/X86/barriercall.ll          |   4 +-
 .../X86/bottom-to-top-reorder.ll              |  27 +-
 .../X86/extract-scalar-from-undef.ll          |  27 +-
 .../SLPVectorizer/X86/extractcost.ll          |   4 +-
 .../X86/minbitwidth-drop-wrapping-flags.ll    |   4 +-
 .../X86/multi-extracts-bv-combined.ll         |   4 +-
 .../Transforms/SLPVectorizer/X86/vec3-base.ll |  19 +-
 .../alternate-opcode-sindle-bv.ll             |  36 ++-
 .../resized-alt-shuffle-after-minbw.ll        |  62 +++--
 .../SLPVectorizer/shuffle-mask-resized.ll     |   4 +-
 14 files changed, 313 insertions(+), 138 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ba70ab1e5e14b9..99cb81f13a250c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -832,8 +832,107 @@ struct InstructionsState {
       : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
 };
 
+struct InterchangeableInstruction {
+  unsigned Opcode;
+  SmallVector<Value *> Ops;
+  template <class... ArgTypes>
+  InterchangeableInstruction(unsigned Opcode, ArgTypes &&...Args)
+      : Opcode(Opcode), Ops{std::forward<decltype(Args)>(Args)...} {}
+};
+
+bool operator<(const InterchangeableInstruction &LHS,
+               const InterchangeableInstruction &RHS) {
+  return LHS.Opcode < RHS.Opcode;
+}
+
 } // end anonymous namespace
 
+/// \returns a sorted list of interchangeable instructions by instruction opcode
+/// that \p I can be converted to.
+/// e.g.,
+/// x << y -> x * (2^y)
+/// x << 1 -> x *   2
+/// x << 0 -> x *   1   -> x - 0 -> x + 0 -> x & 11...1 -> x | 0
+///           x *   0                     -> x & 0
+///           x *  -1   -> 0 - x
+/// TODO: support more patterns
+static SmallVector<InterchangeableInstruction>
+getInterchangeableInstruction(Instruction *I) {
+  // PII = Possible Interchangeable Instruction
+  SmallVector<InterchangeableInstruction> PII;
+  unsigned Opcode = I->getOpcode();
+  PII.emplace_back(Opcode, I->operands());
+  if (!is_contained({Instruction::Shl, Instruction::Mul, Instruction::Sub,
+                     Instruction::Add},
+                    Opcode))
+    return PII;
+  Constant *C;
+  if (match(I, m_BinOp(m_Value(), m_Constant(C)))) {
+    ConstantInt *V = nullptr;
+    if (auto *CI = dyn_cast<ConstantInt>(C)) {
+      V = CI;
+    } else if (auto *CDV = dyn_cast<ConstantDataVector>(C)) {
+      if (auto *CI = dyn_cast_if_present<ConstantInt>(CDV->getSplatValue()))
+        V = CI;
+    }
+    if (!V)
+      return PII;
+    Value *Op0 = I->getOperand(0);
+    Type *Op1Ty = I->getOperand(1)->getType();
+    const APInt &Op1Int = V->getValue();
+    Constant *Zero =
+        ConstantInt::get(Op1Ty, APInt::getZero(Op1Int.getBitWidth()));
+    Constant *UnsignedMax =
+        ConstantInt::get(Op1Ty, APInt::getMaxValue(Op1Int.getBitWidth()));
+    switch (Opcode) {
+    case Instruction::Shl: {
+      PII.emplace_back(Instruction::Mul, Op0,
+                       ConstantInt::get(Op1Ty, 1 << Op1Int.getZExtValue()));
+      if (Op1Int.isZero()) {
+        PII.emplace_back(Instruction::Sub, Op0, Zero);
+        PII.emplace_back(Instruction::Add, Op0, Zero);
+        PII.emplace_back(Instruction::And, Op0, UnsignedMax);
+        PII.emplace_back(Instruction::Or, Op0, Zero);
+      }
+      break;
+    }
+    case Instruction::Mul: {
+      switch (Op1Int.getSExtValue()) {
+      case 1:
+        PII.emplace_back(Instruction::Sub, Op0, Zero);
+        PII.emplace_back(Instruction::Add, Op0, Zero);
+        PII.emplace_back(Instruction::And, Op0, UnsignedMax);
+        PII.emplace_back(Instruction::Or, Op0, Zero);
+        break;
+      case 0:
+        PII.emplace_back(Instruction::And, Op0, Zero);
+        break;
+      case -1:
+        PII.emplace_back(Instruction::Sub, Zero, Op0);
+        break;
+      }
+      break;
+    }
+    case Instruction::Sub:
+      if (Op1Int.isZero()) {
+        PII.emplace_back(Instruction::Add, Op0, Zero);
+        PII.emplace_back(Instruction::And, Op0, UnsignedMax);
+        PII.emplace_back(Instruction::Or, Op0, Zero);
+      }
+      break;
+    case Instruction::Add:
+      if (Op1Int.isZero()) {
+        PII.emplace_back(Instruction::And, Op0, UnsignedMax);
+        PII.emplace_back(Instruction::Or, Op0, Zero);
+      }
+      break;
+    }
+  }
+  // std::set_intersection requires a sorted range.
+  sort(PII);
+  return PII;
+}
+
 /// \returns true if \p Opcode is allowed as part of the main/alternate
 /// instruction for SLP vectorization.
 ///
@@ -938,18 +1037,54 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
     if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
       return InstructionsState(VL[BaseIndex], nullptr, nullptr);
   }
+  // Currently, this is only used for binary ops.
+  // TODO: support all instructions
+  SmallVector<InterchangeableInstruction> InterchangeableOpcode =
+      getInterchangeableInstruction(cast<Instruction>(VL[BaseIndex]));
+  SmallVector<InterchangeableInstruction> AlternateInterchangeableOpcode;
+  auto UpdateInterchangeableOpcode =
+      [](SmallVector<InterchangeableInstruction> &LHS,
+         ArrayRef<InterchangeableInstruction> RHS) {
+        SmallVector<InterchangeableInstruction> NewInterchangeableOpcode;
+        std::set_intersection(LHS.begin(), LHS.end(), RHS.begin(), RHS.end(),
+                              std::back_inserter(NewInterchangeableOpcode));
+        if (NewInterchangeableOpcode.empty())
+          return false;
+        LHS = std::move(NewInterchangeableOpcode);
+        return true;
+      };
   for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
     auto *I = cast<Instruction>(VL[Cnt]);
     unsigned InstOpcode = I->getOpcode();
     if (IsBinOp && isa<BinaryOperator>(I)) {
-      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+      SmallVector<InterchangeableInstruction> ThisInterchangeableOpcode(
+          getInterchangeableInstruction(I));
+      if (UpdateInterchangeableOpcode(InterchangeableOpcode,
+                                      ThisInterchangeableOpcode))
         continue;
-      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
-          isValidForAlternation(Opcode)) {
-        AltOpcode = InstOpcode;
-        AltIndex = Cnt;
+      if (AlternateInterchangeableOpcode.empty()) {
+        InterchangeableOpcode.erase(
+            std::remove_if(InterchangeableOpcode.begin(),
+                           InterchangeableOpcode.end(),
+                           [](const InterchangeableInstruction &I) {
+                             return !isValidForAlternation(I.Opcode);
+                           }),
+            InterchangeableOpcode.end());
+        ThisInterchangeableOpcode.erase(
+            std::remove_if(ThisInterchangeableOpcode.begin(),
+                           ThisInterchangeableOpcode.end(),
+                           [](const InterchangeableInstruction &I) {
+                             return !isValidForAlternation(I.Opcode);
+                           }),
+            ThisInterchangeableOpcode.end());
+        if (InterchangeableOpcode.empty() || ThisInterchangeableOpcode.empty())
+          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+        AlternateInterchangeableOpcode = std::move(ThisInterchangeableOpcode);
         continue;
       }
+      if (UpdateInterchangeableOpcode(AlternateInterchangeableOpcode,
+                                      ThisInterchangeableOpcode))
+        continue;
     } else if (IsCastOp && isa<CastInst>(I)) {
       Value *Op0 = IBase->getOperand(0);
       Type *Ty0 = Op0->getType();
@@ -1043,6 +1178,21 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
     return InstructionsState(VL[BaseIndex], nullptr, nullptr);
   }
 
+  if (IsBinOp) {
+    auto FindOp = [&](ArrayRef<InterchangeableInstruction> CandidateOp) {
+      for (Value *V : VL)
+        for (const InterchangeableInstruction &I : CandidateOp)
+          if (cast<Instruction>(V)->getOpcode() == I.Opcode)
+            return cast<Instruction>(V);
+      llvm_unreachable(
+          "Cannot find the candidate instruction for InstructionsState.");
+    };
+    Instruction *MainOp = FindOp(InterchangeableOpcode);
+    Instruction *AltOp = AlternateInterchangeableOpcode.empty()
+                             ? MainOp
+                             : FindOp(AlternateInterchangeableOpcode);
+    return InstructionsState(VL[BaseIndex], MainOp, AltOp);
+  }
   return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                            cast<Instruction>(VL[AltIndex]));
 }
@@ -2335,24 +2485,41 @@ class BoUpSLP {
                                  : cast<Instruction>(VL[0])->getNumOperands();
       OpsVec.resize(NumOperands);
       unsigned NumLanes = VL.size();
-      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+      InstructionsState S = getSameOpcode(VL, TLI);
+      for (unsigned OpIdx : seq<unsigned>(NumOperands))
         OpsVec[OpIdx].resize(NumLanes);
-        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
-          assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
-          // Our tree has just 3 nodes: the root and two operands.
-          // It is therefore trivial to get the APO. We only need to check the
-          // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
-          // RHS operand. The LHS operand of both add and sub is never attached
-          // to an inversese operation in the linearized form, therefore its APO
-          // is false. The RHS is true only if VL[Lane] is an inverse operation.
-
-          // Since operand reordering is performed on groups of commutative
-          // operations or alternating sequences (e.g., +, -), we can safely
-          // tell the inverse operations by checking commutativity.
-          bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
+      for (auto [I, V] : enumerate(VL)) {
+        assert(isa<Instruction>(V) && "Expected instruction");
+        SmallVector<InterchangeableInstruction> IIList =
+            getInterchangeableInstruction(cast<Instruction>(V));
+        Value *SelectedOp;
+        auto Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
+          return II.Opcode == S.MainOp->getOpcode();
+        });
+        if (Iter == IIList.end()) {
+          Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
+            return II.Opcode == S.AltOp->getOpcode();
+          });
+          SelectedOp = S.AltOp;
+        } else {
+          SelectedOp = S.MainOp;
+        }
+        assert(Iter != IIList.end() &&
+               "Cannot find an interchangeable instruction.");
+        // Our tree has just 3 nodes: the root and two operands.
+        // It is therefore trivial to get the APO. We only need to check the
+        // opcode of V and whether the operand at OpIdx is the LHS or RHS
+        // operand. The LHS operand of both add and sub is never attached to an
+        // inversese operation in the linearized form, therefore its APO is
+        // false. The RHS is true only if V is an inverse operation.
+
+        // Since operand reordering is performed on groups of commutative
+        // operations or alternating sequences (e.g., +, -), we can safely
+        // tell the inverse operations by checking commutativity.
+        bool IsInverseOperation = !isCommutative(cast<Instruction>(SelectedOp));
+        for (unsigned OpIdx : seq<unsigned>(NumOperands)) {
           bool APO = (OpIdx == 0) ? false : IsInverseOperation;
-          OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
-                                 APO, false};
+          OpsVec[OpIdx][I] = {Iter->Ops[OpIdx], APO, false};
         }
       }
     }
@@ -3252,15 +3419,25 @@ class BoUpSLP {
       auto *I0 = cast<Instruction>(Scalars[0]);
       Operands.resize(I0->getNumOperands());
       unsigned NumLanes = Scalars.size();
-      for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
-           OpIdx != NumOperands; ++OpIdx) {
+      unsigned NumOperands = I0->getNumOperands();
+      for (unsigned OpIdx : seq<unsigned>(NumOperands))
         Operands[OpIdx].resize(NumLanes);
-        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
-          auto *I = cast<Instruction>(Scalars[Lane]);
-          assert(I->getNumOperands() == NumOperands &&
-                 "Expected same number of operands");
-          Operands[OpIdx][Lane] = I->getOperand(OpIdx);
-        }
+      for (auto [I, V] : enumerate(Scalars)) {
+        SmallVector<InterchangeableInstruction> IIList =
+            getInterchangeableInstruction(cast<Instruction>(V));
+        auto Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
+          return II.Opcode == MainOp->getOpcode();
+        });
+        if (Iter == IIList.end())
+          Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
+            return II.Opcode == AltOp->getOpcode();
+          });
+        assert(Iter != IIList.end() &&
+               "Cannot find an interchangeable instruction.");
+        assert(Iter->Ops.size() == NumOperands &&
+               "Expected same number of operands");
+        for (auto [J, Op] : enumerate(Iter->Ops))
+          Operands[J][I] = Op;
       }
     }
 
@@ -14935,7 +15112,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       Value *V = Builder.CreateBinOp(
           static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
           RHS);
-      propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
+      propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
       if (auto *I = dyn_cast<Instruction>(V)) {
         V = propagateMetadata(I, E->Scalars);
         // Drop nuw flags for abs(sub(commutative), true).
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
index c18811a35c1eeb..c7c999bb572851 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
@@ -314,10 +314,10 @@ define void @store_try_reorder(ptr %dst) {
 ;
 ; POW2-ONLY-LABEL: @store_try_reorder(
 ; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[ADD:%.*]] = add i32 0, 0
-; POW2-ONLY-NEXT:    store i32 [[ADD]], ptr [[DST:%.*]], align 4
-; POW2-ONLY-NEXT:    [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
-; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
+; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
+; POW2-ONLY-NEXT:    [[ADD216:%.*]] = sub i32 0, 0
+; POW2-ONLY-NEXT:    [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT:    store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4
 ; POW2-ONLY-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
index 9c1da08c64b7b7..7bc03e7c7755b4 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
@@ -7,13 +7,12 @@ define void @test(ptr %a, i64 %0) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[TMP0]], i32 0
 ; CHECK-NEXT:    br label %[[BB:.*]]
 ; CHECK:       [[BB]]:
-; CHECK-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 0>
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr double, <2 x ptr> [[TMP2]], <2 x i64> [[TMP5]]
-; CHECK-NEXT:    [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[ARRAYIDX17_I28_1:%.*]] = extractelement <2 x ptr> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> poison)
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[A]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x double>, ptr [[A]], align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
index 308d0e27f1ea89..e158c2a3ed87ea 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
@@ -324,10 +324,10 @@ define void @store_try_reorder(ptr %dst) {
 ;
 ; POW2-ONLY-LABEL: @store_try_reorder(
 ; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[ADD:%.*]] = add i32 0, 0
-; POW2-ONLY-NEXT:    store i32 [[ADD]], ptr [[DST:%.*]], align 4
-; POW2-ONLY-NEXT:    [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
-; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
+; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
+; POW2-ONLY-NEXT:    [[ADD216:%.*]] = sub i32 0, 0
+; POW2-ONLY-NEXT:    [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT:    store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4
 ; POW2-ONLY-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
index d388fd17925a16..59a6e5f4d0c6c2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
@@ -10,9 +10,7 @@ define i32 @foo(ptr nocapture %A, i32 %n) {
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 (...) @bar()
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10>
-; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 8, i32 10>
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], <i32 9, i32 9, i32 9, i32 9>
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    ret i32 undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
index 889f5a95c81d69..7af0c64f187480 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
@@ -4,22 +4,17 @@
 define void @test(ptr %0, ptr %1, ptr %2) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 4
-; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr [[TMP1:%.*]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = sub <4 x i32> <i32 0, i32 0, i32 undef, i32 0>, [[TMP8]]
-; CHECK-NEXT:    [[TMP12:%.*]] = sub <4 x i32> [[TMP11]], [[TMP10]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]]
-; CHECK-NEXT:    [[TMP14:%.*]] = add <4 x i32> [[TMP13]], <i32 0, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP15:%.*]] = sub <4 x i32> [[TMP13]], <i32 0, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> <i32 2, i32 0, i32 1, i32 7>
-; CHECK-NEXT:    [[TMP17:%.*]] = add <4 x i32> [[TMP16]], zeroinitializer
-; CHECK-NEXT:    [[TMP18:%.*]] = sub <4 x i32> [[TMP16]], zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> [[TMP18]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP20:%.*]] = add <4 x i32> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = sub <4 x i32> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> [[TMP21]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; CHECK-NEXT:    store <4 x i32> [[TMP22]], ptr [[TMP2:%.*]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[TMP1:%.*]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = sub <4 x i32> <i32 0, i32 0, i32 undef, i32 0>, [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], [[TMP5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add <4 x i32> <i32 0, i32 0, i32 1, i32 0>, [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add <4 x i32> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+; CHECK-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP2:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %4 = load i32, ptr %1, align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
index 6ff03acf85cdfd..06f4b6e4521dea 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
@@ -4,19 +4,20 @@
 define i64 @foo(i32 %tmp7) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7:%.*]], 0
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 undef, i32 0>, i32 undef, i32 4
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 0, i32 5
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 undef, i32 poison, i32 undef>, i32 [[TMP8]], i32 3
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <8 x i32> <i32 poison, i32 poison, i32 2, i32 3, i32 8, i32 5, i32 10, i32 7>
-; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP3]], <2 x i32> zeroinitializer, i64 0)
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <8 x i32> [[TMP1]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 5, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP8:%.*]] = add <8 x i32> zeroinitializer, [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = xor <8 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[TMP7:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = sub <2 x i32> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 undef, i32 0>, <2 x i32> <i32 undef, i32 0>, i64 4)
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> <i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 undef, i32 poison, i32 undef>, <8 x i32> <i32 poison, i32 poison, i32 10, i32 0, i32 1, i32 13, i32 poison, i32 15>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 undef, i32 6
+; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> zeroinitializer, i64 0)
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <8 x i32> [[TMP2]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 5, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = xor <8 x i32> [[TMP10]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP11]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP12]], 0
 ; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[OP_RDX]] to i64
 ; CHECK-NEXT:    ret i64 [[TMP64]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll
index 1374e9873e1c53..ac4603c9c88de3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll
@@ -9,9 +9,7 @@ define i32 @foo(ptr nocapture %A, i32 %n, i32 %m) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10>
-; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 8, i32 10>
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], <i32 9, i32 9, i32 9, i32 9>
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll
index 2a5bfa73907704..daab4b6ea4c957 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll
@@ -8,10 +8,8 @@ define i32 @test() {
 ; CHECK-NEXT:    [[TMP10:%.*]] = or i8 [[A_PROMOTED]], 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i8> poison, i8 [[A_PROMOTED]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i8> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[TMP5]], <i16 0, i16 -1, i16 0, i16 0>
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP6]])
 ; CHECK-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP7]] to i32
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-extracts-bv-combined.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-extracts-bv-combined.ll
index e6a166c27ac494..94f2c79faa8c93 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-extracts-bv-combined.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-extracts-bv-combined.ll
@@ -9,9 +9,7 @@ define i32 @foo() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 undef, i32 1, i32 0>, i32 [[D]], i32 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <8 x i32> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    store <8 x i32> [[TMP4]], ptr getelementptr inbounds ([64 x i32], ptr null, i64 0, i64 15), align 4
+; CHECK-NEXT:    store <8 x i32> [[TMP2]], ptr getelementptr inbounds ([64 x i32], ptr null, i64 0, i64 15), align 4
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
index 96d4b84e036918..83391a96c5e34f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
@@ -242,13 +242,18 @@ exit:
 }
 
 define void @store_try_reorder(ptr %dst) {
-; CHECK-LABEL: @store_try_reorder(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 0, 0
-; CHECK-NEXT:    store i32 [[ADD]], ptr [[DST:%.*]], align 4
-; CHECK-NEXT:    [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
-; CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @store_try_reorder(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    store <3 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @store_try_reorder(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
+; POW2-ONLY-NEXT:    [[ADD216:%.*]] = sub i32 0, 0
+; POW2-ONLY-NEXT:    [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT:    store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4
+; POW2-ONLY-NEXT:    ret void
 ;
 entry:
   %add = add i32 0, 0
diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
index c250029519590f..e4eff0f72b3565 100644
--- a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
@@ -1,18 +1,29 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=X86 %}
+; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=AARCH64 %}
 
 define <2 x i32> @test(i32 %arg) {
-; CHECK-LABEL: define <2 x i32> @test(
-; CHECK-SAME: i32 [[ARG:%.*]]) {
-; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[ARG]], 0
-; CHECK-NEXT:    [[MUL:%.*]] = mul i32 0, 1
-; CHECK-NEXT:    [[MUL1:%.*]] = mul i32 [[OR]], [[MUL]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 0, [[MUL1]]
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[OR]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[MUL]], i32 1
-; CHECK-NEXT:    ret <2 x i32> [[TMP1]]
+; X86-LABEL: define <2 x i32> @test(
+; X86-SAME: i32 [[ARG:%.*]]) {
+; X86-NEXT:  bb:
+; X86-NEXT:    [[OR:%.*]] = or i32 [[ARG]], 0
+; X86-NEXT:    [[MUL:%.*]] = mul i32 0, 1
+; X86-NEXT:    [[MUL1:%.*]] = mul i32 [[OR]], [[MUL]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ugt i32 0, [[MUL1]]
+; X86-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[OR]], i32 0
+; X86-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[MUL]], i32 1
+; X86-NEXT:    ret <2 x i32> [[TMP1]]
+;
+; AARCH64-LABEL: define <2 x i32> @test(
+; AARCH64-SAME: i32 [[ARG:%.*]]) {
+; AARCH64-NEXT:  bb:
+; AARCH64-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[ARG]], i32 0
+; AARCH64-NEXT:    [[TMP1:%.*]] = or <2 x i32> [[TMP0]], zeroinitializer
+; AARCH64-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; AARCH64-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; AARCH64-NEXT:    [[MUL1:%.*]] = mul i32 [[TMP2]], [[TMP3]]
+; AARCH64-NEXT:    [[CMP:%.*]] = icmp ugt i32 0, [[MUL1]]
+; AARCH64-NEXT:    ret <2 x i32> [[TMP1]]
 ;
 bb:
   %or = or i32 %arg, 0
@@ -23,4 +34,3 @@ bb:
   %1 = insertelement <2 x i32> %0, i32 %mul, i32 1
   ret <2 x i32> %1
 }
-
diff --git a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll
index 56281424c7114a..bcca8ba53016dc 100644
--- a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll
+++ b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll
@@ -5,15 +5,13 @@ define void @func(i32 %0) {
 ; CHECK-LABEL: define void @func(
 ; CHECK-SAME: i32 [[TMP0:%.*]]) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> <i32 0, i32 poison, i32 0, i32 0>, i32 [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = shl i32 [[TMP0]], 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <32 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP6]] to i64
 ; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP9]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <32 x i32> [[TMP11]], <32 x i32> <i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, <32 x i32> <i32 poison, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <32 x i32> [[TMP12]], i32 0, i32 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> [[TMP13]], <8 x i32> zeroinitializer, i64 16)
@@ -24,61 +22,61 @@ define void @func(i32 %0) {
 ; CHECK-NEXT:    [[TMP19:%.*]] = sext <32 x i32> [[TMP18]] to <32 x i64>
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp slt <32 x i64> [[TMP19]], zeroinitializer
 ; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <32 x i1> [[TMP20]], i32 31
-; CHECK-NEXT:    [[TMP22:%.*]] = and i1 false, [[TMP21]]
+; CHECK-NEXT:    [[TMP76:%.*]] = and i1 false, [[TMP21]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <32 x i1> [[TMP20]], i32 30
-; CHECK-NEXT:    [[TMP24:%.*]] = and i1 false, [[TMP23]]
+; CHECK-NEXT:    [[TMP22:%.*]] = and i1 false, [[TMP23]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <32 x i1> [[TMP20]], i32 29
-; CHECK-NEXT:    [[TMP26:%.*]] = and i1 false, [[TMP25]]
+; CHECK-NEXT:    [[TMP24:%.*]] = and i1 false, [[TMP25]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <32 x i1> [[TMP20]], i32 28
-; CHECK-NEXT:    [[TMP28:%.*]] = and i1 false, [[TMP27]]
+; CHECK-NEXT:    [[TMP26:%.*]] = and i1 false, [[TMP27]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <32 x i1> [[TMP20]], i32 27
-; CHECK-NEXT:    [[TMP30:%.*]] = and i1 false, [[TMP29]]
+; CHECK-NEXT:    [[TMP28:%.*]] = and i1 false, [[TMP29]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <32 x i1> [[TMP20]], i32 26
-; CHECK-NEXT:    [[TMP32:%.*]] = and i1 false, [[TMP31]]
+; CHECK-NEXT:    [[TMP30:%.*]] = and i1 false, [[TMP31]]
 ; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <32 x i1> [[TMP20]], i32 25
-; CHECK-NEXT:    [[TMP34:%.*]] = and i1 false, [[TMP33]]
+; CHECK-NEXT:    [[TMP32:%.*]] = and i1 false, [[TMP33]]
 ; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <32 x i1> [[TMP20]], i32 24
-; CHECK-NEXT:    [[TMP36:%.*]] = and i1 false, [[TMP35]]
+; CHECK-NEXT:    [[TMP34:%.*]] = and i1 false, [[TMP35]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <32 x i1> [[TMP20]], i32 23
-; CHECK-NEXT:    [[TMP38:%.*]] = and i1 false, [[TMP37]]
+; CHECK-NEXT:    [[TMP36:%.*]] = and i1 false, [[TMP37]]
 ; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <32 x i1> [[TMP20]], i32 22
-; CHECK-NEXT:    [[TMP40:%.*]] = and i1 false, [[TMP39]]
+; CHECK-NEXT:    [[TMP38:%.*]] = and i1 false, [[TMP39]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <32 x i1> [[TMP20]], i32 21
-; CHECK-NEXT:    [[TMP42:%.*]] = and i1 false, [[TMP41]]
+; CHECK-NEXT:    [[TMP40:%.*]] = and i1 false, [[TMP41]]
 ; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <32 x i1> [[TMP20]], i32 20
-; CHECK-NEXT:    [[TMP44:%.*]] = and i1 false, [[TMP43]]
+; CHECK-NEXT:    [[TMP42:%.*]] = and i1 false, [[TMP43]]
 ; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <32 x i1> [[TMP20]], i32 19
-; CHECK-NEXT:    [[TMP46:%.*]] = and i1 false, [[TMP45]]
+; CHECK-NEXT:    [[TMP44:%.*]] = and i1 false, [[TMP45]]
 ; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <32 x i1> [[TMP20]], i32 18
-; CHECK-NEXT:    [[TMP48:%.*]] = and i1 false, [[TMP47]]
+; CHECK-NEXT:    [[TMP46:%.*]] = and i1 false, [[TMP47]]
 ; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <32 x i1> [[TMP20]], i32 17
-; CHECK-NEXT:    [[TMP50:%.*]] = and i1 false, [[TMP49]]
+; CHECK-NEXT:    [[TMP48:%.*]] = and i1 false, [[TMP49]]
 ; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <32 x i1> [[TMP20]], i32 16
-; CHECK-NEXT:    [[TMP52:%.*]] = and i1 false, [[TMP51]]
+; CHECK-NEXT:    [[TMP50:%.*]] = and i1 false, [[TMP51]]
 ; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <32 x i1> [[TMP20]], i32 15
-; CHECK-NEXT:    [[TMP54:%.*]] = and i1 false, [[TMP53]]
+; CHECK-NEXT:    [[TMP52:%.*]] = and i1 false, [[TMP53]]
 ; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <32 x i1> [[TMP20]], i32 14
-; CHECK-NEXT:    [[TMP56:%.*]] = and i1 false, [[TMP55]]
+; CHECK-NEXT:    [[TMP54:%.*]] = and i1 false, [[TMP55]]
 ; CHECK-NEXT:    [[TMP57:%.*]] = extractelement <32 x i1> [[TMP20]], i32 13
-; CHECK-NEXT:    [[TMP58:%.*]] = and i1 false, [[TMP57]]
+; CHECK-NEXT:    [[TMP56:%.*]] = and i1 false, [[TMP57]]
 ; CHECK-NEXT:    [[TMP59:%.*]] = extractelement <32 x i1> [[TMP20]], i32 12
-; CHECK-NEXT:    [[TMP60:%.*]] = and i1 false, [[TMP59]]
+; CHECK-NEXT:    [[TMP58:%.*]] = and i1 false, [[TMP59]]
 ; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <32 x i1> [[TMP20]], i32 11
-; CHECK-NEXT:    [[TMP62:%.*]] = and i1 false, [[TMP61]]
+; CHECK-NEXT:    [[TMP60:%.*]] = and i1 false, [[TMP61]]
 ; CHECK-NEXT:    [[TMP63:%.*]] = extractelement <32 x i1> [[TMP20]], i32 10
-; CHECK-NEXT:    [[TMP64:%.*]] = and i1 false, [[TMP63]]
+; CHECK-NEXT:    [[TMP62:%.*]] = and i1 false, [[TMP63]]
 ; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <32 x i1> [[TMP20]], i32 9
-; CHECK-NEXT:    [[TMP66:%.*]] = and i1 false, [[TMP65]]
+; CHECK-NEXT:    [[TMP64:%.*]] = and i1 false, [[TMP65]]
 ; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <32 x i1> [[TMP20]], i32 8
-; CHECK-NEXT:    [[TMP68:%.*]] = and i1 false, [[TMP67]]
+; CHECK-NEXT:    [[TMP66:%.*]] = and i1 false, [[TMP67]]
 ; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <32 x i1> [[TMP20]], i32 7
-; CHECK-NEXT:    [[TMP70:%.*]] = and i1 false, [[TMP69]]
+; CHECK-NEXT:    [[TMP68:%.*]] = and i1 false, [[TMP69]]
 ; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <32 x i1> [[TMP20]], i32 6
-; CHECK-NEXT:    [[TMP72:%.*]] = and i1 false, [[TMP71]]
+; CHECK-NEXT:    [[TMP70:%.*]] = and i1 false, [[TMP71]]
 ; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <32 x i1> [[TMP20]], i32 5
-; CHECK-NEXT:    [[TMP74:%.*]] = and i1 false, [[TMP73]]
+; CHECK-NEXT:    [[TMP72:%.*]] = and i1 false, [[TMP73]]
 ; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <32 x i1> [[TMP20]], i32 4
-; CHECK-NEXT:    [[TMP76:%.*]] = and i1 false, [[TMP75]]
+; CHECK-NEXT:    [[TMP74:%.*]] = and i1 false, [[TMP75]]
 ; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <32 x i32> [[TMP18]], i32 0
 ; CHECK-NEXT:    [[TMP78:%.*]] = sext i32 [[TMP77]] to i64
 ; CHECK-NEXT:    [[TMP79:%.*]] = getelementptr float, ptr addrspace(1) null, i64 [[TMP78]]
diff --git a/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll b/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll
index 732b50396a460d..1e3255f2187af0 100644
--- a/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll
@@ -12,9 +12,7 @@ define i32 @test() {
 ; CHECK-NEXT:    br i1 false, label [[BB4:%.*]], label [[BB3]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> <i32 0, i32 poison>, <2 x i32> <i32 2, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i32> zeroinitializer, [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or <2 x i32> zeroinitializer, [[TMP2]]
-; CHECK-NEXT:    [[TMP5]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP5]] = or <2 x i32> zeroinitializer, [[TMP2]]
 ; CHECK-NEXT:    br label [[BB1]]
 ; CHECK:       bb4:
 ; CHECK-NEXT:    [[TMP6:%.*]] = phi <8 x i32> [ [[TMP1]], [[BB1]] ]

>From f3935909ec19ac27e432da8eb8bcbad72fe5d752 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Mon, 21 Oct 2024 00:35:40 -0700
Subject: [PATCH 2/8] getSExtValue may use too many bits

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 99cb81f13a250c..d743a33057d15a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -897,19 +897,15 @@ getInterchangeableInstruction(Instruction *I) {
       break;
     }
     case Instruction::Mul: {
-      switch (Op1Int.getSExtValue()) {
-      case 1:
+      if (Op1Int.isOne()) {
         PII.emplace_back(Instruction::Sub, Op0, Zero);
         PII.emplace_back(Instruction::Add, Op0, Zero);
         PII.emplace_back(Instruction::And, Op0, UnsignedMax);
         PII.emplace_back(Instruction::Or, Op0, Zero);
-        break;
-      case 0:
+      } else if (Op1Int.isZero()) {
         PII.emplace_back(Instruction::And, Op0, Zero);
-        break;
-      case -1:
+      } else if (Op1Int.isAllOnes()) {
         PII.emplace_back(Instruction::Sub, Zero, Op0);
-        break;
       }
       break;
     }

>From 83ed351211e4fa9b02b32b736793c8f212af26f5 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Mon, 21 Oct 2024 03:41:04 -0700
Subject: [PATCH 3/8] apply comment

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 22 +++++++++----------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d743a33057d15a..a55d9a3dea6191 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1046,7 +1046,7 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                               std::back_inserter(NewInterchangeableOpcode));
         if (NewInterchangeableOpcode.empty())
           return false;
-        LHS = std::move(NewInterchangeableOpcode);
+        LHS.swap(NewInterchangeableOpcode);
         return true;
       };
   for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
@@ -1060,22 +1060,20 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
         continue;
       if (AlternateInterchangeableOpcode.empty()) {
         InterchangeableOpcode.erase(
-            std::remove_if(InterchangeableOpcode.begin(),
-                           InterchangeableOpcode.end(),
-                           [](const InterchangeableInstruction &I) {
-                             return !isValidForAlternation(I.Opcode);
-                           }),
+            remove_if(InterchangeableOpcode,
+                      [](const InterchangeableInstruction &I) {
+                        return !isValidForAlternation(I.Opcode);
+                      }),
             InterchangeableOpcode.end());
         ThisInterchangeableOpcode.erase(
-            std::remove_if(ThisInterchangeableOpcode.begin(),
-                           ThisInterchangeableOpcode.end(),
-                           [](const InterchangeableInstruction &I) {
-                             return !isValidForAlternation(I.Opcode);
-                           }),
+            remove_if(ThisInterchangeableOpcode,
+                      [](const InterchangeableInstruction &I) {
+                        return !isValidForAlternation(I.Opcode);
+                      }),
             ThisInterchangeableOpcode.end());
         if (InterchangeableOpcode.empty() || ThisInterchangeableOpcode.empty())
           return InstructionsState(VL[BaseIndex], nullptr, nullptr);
-        AlternateInterchangeableOpcode = std::move(ThisInterchangeableOpcode);
+        AlternateInterchangeableOpcode.swap(ThisInterchangeableOpcode);
         continue;
       }
       if (UpdateInterchangeableOpcode(AlternateInterchangeableOpcode,

>From 9672f6d3e36c5135f7387e58ee21aa82a6f384b6 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Tue, 22 Oct 2024 23:40:04 -0700
Subject: [PATCH 4/8] reduce repeated code

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 61 +++++++++----------
 1 file changed, 30 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a55d9a3dea6191..b82b15a81f997a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -929,6 +929,29 @@ getInterchangeableInstruction(Instruction *I) {
   return PII;
 }
 
+/// \returns the Op and operands which \p I convert to.
+static std::pair<Value *, SmallVector<Value *>>
+getInterchangeableInstruction(Instruction *I, Instruction *MainOp,
+                              Instruction *AltOp) {
+  SmallVector<InterchangeableInstruction> IIList =
+      getInterchangeableInstruction(I);
+  auto Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
+    return II.Opcode == MainOp->getOpcode();
+  });
+  Value *SelectedOp;
+  if (Iter == IIList.end()) {
+    Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
+      return II.Opcode == AltOp->getOpcode();
+    });
+    assert(Iter != IIList.end() &&
+           "Cannot find an interchangeable instruction.");
+    SelectedOp = AltOp;
+  } else {
+    SelectedOp = MainOp;
+  }
+  return std::make_pair(SelectedOp, Iter->Ops);
+}
+
 /// \returns true if \p Opcode is allowed as part of the main/alternate
 /// instruction for SLP vectorization.
 ///
@@ -2484,22 +2507,8 @@ class BoUpSLP {
         OpsVec[OpIdx].resize(NumLanes);
       for (auto [I, V] : enumerate(VL)) {
         assert(isa<Instruction>(V) && "Expected instruction");
-        SmallVector<InterchangeableInstruction> IIList =
-            getInterchangeableInstruction(cast<Instruction>(V));
-        Value *SelectedOp;
-        auto Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
-          return II.Opcode == S.MainOp->getOpcode();
-        });
-        if (Iter == IIList.end()) {
-          Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
-            return II.Opcode == S.AltOp->getOpcode();
-          });
-          SelectedOp = S.AltOp;
-        } else {
-          SelectedOp = S.MainOp;
-        }
-        assert(Iter != IIList.end() &&
-               "Cannot find an interchangeable instruction.");
+        auto [SelectedOp, Ops] = getInterchangeableInstruction(
+            cast<Instruction>(V), S.MainOp, S.AltOp);
         // Our tree has just 3 nodes: the root and two operands.
         // It is therefore trivial to get the APO. We only need to check the
         // opcode of V and whether the operand at OpIdx is the LHS or RHS
@@ -2513,7 +2522,7 @@ class BoUpSLP {
         bool IsInverseOperation = !isCommutative(cast<Instruction>(SelectedOp));
         for (unsigned OpIdx : seq<unsigned>(NumOperands)) {
           bool APO = (OpIdx == 0) ? false : IsInverseOperation;
-          OpsVec[OpIdx][I] = {Iter->Ops[OpIdx], APO, false};
+          OpsVec[OpIdx][I] = {Ops[OpIdx], APO, false};
         }
       }
     }
@@ -3417,20 +3426,10 @@ class BoUpSLP {
       for (unsigned OpIdx : seq<unsigned>(NumOperands))
         Operands[OpIdx].resize(NumLanes);
       for (auto [I, V] : enumerate(Scalars)) {
-        SmallVector<InterchangeableInstruction> IIList =
-            getInterchangeableInstruction(cast<Instruction>(V));
-        auto Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
-          return II.Opcode == MainOp->getOpcode();
-        });
-        if (Iter == IIList.end())
-          Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
-            return II.Opcode == AltOp->getOpcode();
-          });
-        assert(Iter != IIList.end() &&
-               "Cannot find an interchangeable instruction.");
-        assert(Iter->Ops.size() == NumOperands &&
-               "Expected same number of operands");
-        for (auto [J, Op] : enumerate(Iter->Ops))
+        auto [SelectedOp, Ops] =
+            getInterchangeableInstruction(cast<Instruction>(V), MainOp, AltOp);
+        assert(Ops.size() == NumOperands && "Expected same number of operands");
+        for (auto [J, Op] : enumerate(Ops))
           Operands[J][I] = Op;
       }
     }

>From 10977812eef36e31162664d76ff0bbeb113539a2 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Fri, 6 Dec 2024 01:35:35 -0800
Subject: [PATCH 5/8] fix conflict

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 73 ++++---------------
 ...reversed-strided-node-with-external-ptr.ll | 17 ++---
 .../SLPVectorizer/X86/barriercall.ll          | 11 +--
 .../X86/extract-scalar-from-undef.ll          | 43 +++--------
 .../SLPVectorizer/X86/extractcost.ll          | 15 +---
 5 files changed, 40 insertions(+), 119 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index bfa4ac5a22a225..fe660b1f81714c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1072,11 +1072,11 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
     if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
       return InstructionsState::invalid();
   }
-<<<<<<< HEAD
+  bool AnyPoison = InstCnt != VL.size();
   // Currently, this is only used for binary ops.
   // TODO: support all instructions
   SmallVector<InterchangeableInstruction> InterchangeableOpcode =
-      getInterchangeableInstruction(cast<Instruction>(VL[BaseIndex]));
+      getInterchangeableInstruction(cast<Instruction>(V));
   SmallVector<InterchangeableInstruction> AlternateInterchangeableOpcode;
   auto UpdateInterchangeableOpcode =
       [](SmallVector<InterchangeableInstruction> &LHS,
@@ -1089,9 +1089,6 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
         LHS.swap(NewInterchangeableOpcode);
         return true;
       };
-=======
-  bool AnyPoison = InstCnt != VL.size();
->>>>>>> upstream/main
   for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
     auto *I = dyn_cast<Instruction>(VL[Cnt]);
     if (!I)
@@ -1123,7 +1120,7 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                       }),
             ThisInterchangeableOpcode.end());
         if (InterchangeableOpcode.empty() || ThisInterchangeableOpcode.empty())
-          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+          return InstructionsState::invalid();
         AlternateInterchangeableOpcode.swap(ThisInterchangeableOpcode);
         continue;
       }
@@ -1230,7 +1227,6 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
     return InstructionsState::invalid();
   }
 
-<<<<<<< HEAD
   if (IsBinOp) {
     auto FindOp = [&](ArrayRef<InterchangeableInstruction> CandidateOp) {
       for (Value *V : VL)
@@ -1244,12 +1240,9 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
     Instruction *AltOp = AlternateInterchangeableOpcode.empty()
                              ? MainOp
                              : FindOp(AlternateInterchangeableOpcode);
-    return InstructionsState(VL[BaseIndex], MainOp, AltOp);
+    return InstructionsState(MainOp, AltOp);
   }
-  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
-=======
   return InstructionsState(cast<Instruction>(V),
->>>>>>> upstream/main
                            cast<Instruction>(VL[AltIndex]));
 }
 
@@ -2593,11 +2586,18 @@ class BoUpSLP {
       InstructionsState S = getSameOpcode(VL, TLI);
       for (unsigned OpIdx : seq<unsigned>(NumOperands))
         OpsVec[OpIdx].resize(NumLanes);
-<<<<<<< HEAD
-      for (auto [I, V] : enumerate(VL)) {
-        assert(isa<Instruction>(V) && "Expected instruction");
+      for (auto [Lane, V] : enumerate(VL)) {
+        assert((isa<Instruction>(V) || isa<PoisonValue>(V)) &&
+               "Expected instruction or poison value");
+        if (isa<PoisonValue>(V)) {
+          for (unsigned OpIdx : seq<unsigned>(NumOperands))
+            OpsVec[OpIdx][Lane] = {
+                PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true,
+                false};
+          continue;
+        }
         auto [SelectedOp, Ops] = getInterchangeableInstruction(
-            cast<Instruction>(V), S.MainOp, S.AltOp);
+            cast<Instruction>(V), S.getMainOp(), S.getAltOp());
         // Our tree has just 3 nodes: the root and two operands.
         // It is therefore trivial to get the APO. We only need to check the
         // opcode of V and whether the operand at OpIdx is the LHS or RHS
@@ -2610,30 +2610,8 @@ class BoUpSLP {
         // tell the inverse operations by checking commutativity.
         bool IsInverseOperation = !isCommutative(cast<Instruction>(SelectedOp));
         for (unsigned OpIdx : seq<unsigned>(NumOperands)) {
-=======
-        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
-          assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
-                 "Expected instruction or poison value");
-          // Our tree has just 3 nodes: the root and two operands.
-          // It is therefore trivial to get the APO. We only need to check the
-          // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
-          // RHS operand. The LHS operand of both add and sub is never attached
-          // to an inversese operation in the linearized form, therefore its APO
-          // is false. The RHS is true only if VL[Lane] is an inverse operation.
-
-          // Since operand reordering is performed on groups of commutative
-          // operations or alternating sequences (e.g., +, -), we can safely
-          // tell the inverse operations by checking commutativity.
-          if (isa<PoisonValue>(VL[Lane])) {
-            OpsVec[OpIdx][Lane] = {
-                PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true,
-                false};
-            continue;
-          }
-          bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
->>>>>>> upstream/main
           bool APO = (OpIdx == 0) ? false : IsInverseOperation;
-          OpsVec[OpIdx][I] = {Ops[OpIdx], APO, false};
+          OpsVec[OpIdx][Lane] = {Ops[OpIdx], APO, false};
         }
       }
     }
@@ -3534,24 +3512,6 @@ class BoUpSLP {
       copy(OpVL, Operands[OpIdx].begin());
     }
 
-<<<<<<< HEAD
-    /// Set the operands of this bundle in their original order.
-    void setOperandsInOrder() {
-      assert(Operands.empty() && "Already initialized?");
-      auto *I0 = cast<Instruction>(Scalars[0]);
-      Operands.resize(I0->getNumOperands());
-      unsigned NumLanes = Scalars.size();
-      unsigned NumOperands = I0->getNumOperands();
-      for (unsigned OpIdx : seq<unsigned>(NumOperands))
-        Operands[OpIdx].resize(NumLanes);
-      for (auto [I, V] : enumerate(Scalars)) {
-        auto [SelectedOp, Ops] =
-            getInterchangeableInstruction(cast<Instruction>(V), MainOp, AltOp);
-        assert(Ops.size() == NumOperands && "Expected same number of operands");
-        for (auto [J, Op] : enumerate(Ops))
-          Operands[J][I] = Op;
-      }
-=======
     /// Set this bundle's operand from Scalars.
     void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
       VLOperands Ops(Scalars, MainOp, R);
@@ -3559,7 +3519,6 @@ class BoUpSLP {
         Ops.reorder();
       for (unsigned I : seq<unsigned>(MainOp->getNumOperands()))
         setOperand(I, Ops.getVL(I));
->>>>>>> upstream/main
     }
 
     /// Reorders operands of the node to the given mask \p Mask.
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
index 8b768ec4ca83af..74d7f1c91f3bff 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
@@ -12,18 +12,13 @@ define void @test(ptr %a, i64 %0) {
 ; CHECK:       [[BB]]:
 ; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 0>
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr double, <2 x ptr> [[TMP2]], <2 x i64> [[TMP5]]
-<<<<<<< HEAD
-; CHECK-NEXT:    [[ARRAYIDX17_I28_1:%.*]] = extractelement <2 x ptr> [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> poison)
-=======
-; CHECK-NEXT:    [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> splat (i1 true), <2 x double> poison)
->>>>>>> upstream/main
-; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[A]], align 8
-; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x double>, ptr [[A]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = fsub <2 x double> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> splat (i1 true), <2 x double> poison)
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x double>, ptr [[A]], align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = fsub <2 x double> [[TMP7]], [[TMP10]]
-; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v2f64.p0.i64(<2 x double> [[TMP11]], ptr align 8 [[ARRAYIDX17_I28_1]], i64 -8, <2 x i1> splat (i1 true), i32 2)
+; CHECK-NEXT:    [[TMP12:%.*]] = fsub <2 x double> [[TMP9]], [[TMP11]]
+; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v2f64.p0.i64(<2 x double> [[TMP12]], ptr align 8 [[TMP8]], i64 -8, <2 x i1> splat (i1 true), i32 2)
 ; CHECK-NEXT:    br label %[[BB]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
index fd52e01aa37beb..e3b4898e852126 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
@@ -10,15 +10,8 @@ define i32 @foo(ptr nocapture %A, i32 %n) {
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 (...) @bar()
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
-<<<<<<< HEAD
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 8, i32 10>
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], <i32 9, i32 9, i32 9, i32 9>
-=======
-; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10>
-; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], splat (i32 9)
->>>>>>> upstream/main
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 8, i32 10>
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP2]], splat (i32 9)
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
index 151185c6899e94..119882f6ced1f4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
@@ -4,38 +4,19 @@
 define i64 @foo(i32 %tmp7) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  bb:
-<<<<<<< HEAD
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[TMP7:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = sub <2 x i32> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, i32 [[TMP7:%.*]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = sub <4 x i32> [[TMP0]], zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 undef, i32 0>, <2 x i32> <i32 undef, i32 0>, i64 4)
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> <i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 undef, i32 poison, i32 undef>, <8 x i32> <i32 poison, i32 poison, i32 10, i32 0, i32 1, i32 13, i32 poison, i32 15>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 undef, i32 6
-; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> zeroinitializer, i64 0)
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <8 x i32> [[TMP2]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 5, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> zeroinitializer, [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = xor <8 x i32> [[TMP10]], zeroinitializer
-; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP11]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP12]], 0
-=======
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, i32 [[TMP5:%.*]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP24:%.*]] = sub i32 undef, 0
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 undef, i32 0>, i32 [[TMP24]], i32 4
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 0, i32 5
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 undef, i32 poison, i32 undef>, i32 [[TMP24]], i32 6
-; CHECK-NEXT:    [[TMP12:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP3]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP12]], <8 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 poison, i32 2, i32 3, i32 poison, i32 14, i32 poison>
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <8 x i32> [[TMP1]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 5, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP8:%.*]] = add <8 x i32> zeroinitializer, [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = xor <8 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP10]], 0
->>>>>>> upstream/main
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 undef, i32 poison, i32 undef>, i32 undef, i32 6
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP1]], i64 0)
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 poison, i32 2, i32 3, i32 poison, i32 14, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <8 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 5, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP9:%.*]] = add <8 x i32> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <8 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP10]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP11]], 0
 ; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[OP_RDX]] to i64
 ; CHECK-NEXT:    ret i64 [[TMP64]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll
index a22259369ce4f8..fc62b0b38fd535 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll
@@ -9,17 +9,10 @@ define i32 @foo(ptr nocapture %A, i32 %n, i32 %m) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
-<<<<<<< HEAD
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 8, i32 10>
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], <i32 9, i32 9, i32 9, i32 9>
-=======
-; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10>
-; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], splat (i32 9)
->>>>>>> upstream/main
-; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 8, i32 10>
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[TMP2]], splat (i32 9)
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
 ; CHECK-NEXT:    [[EXTERNALUSE1:%.*]] = add nsw i32 [[TMP6]], [[M:%.*]]
 ; CHECK-NEXT:    [[EXTERNALUSE2:%.*]] = mul nsw i32 [[TMP6]], [[M]]
 ; CHECK-NEXT:    [[ADD10:%.*]] = add nsw i32 [[EXTERNALUSE1]], [[EXTERNALUSE2]]

>From 999f45a44b48ea788b4d85e35556366fa71e4fe9 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Fri, 6 Dec 2024 01:43:54 -0800
Subject: [PATCH 6/8] fix VL may contain PoisonValue

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index fe660b1f81714c..c46dd65f229681 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1229,10 +1229,13 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
 
   if (IsBinOp) {
     auto FindOp = [&](ArrayRef<InterchangeableInstruction> CandidateOp) {
-      for (Value *V : VL)
+      for (Value *V : VL) {
+        if (!isa<Instruction>(V))
+          continue;
         for (const InterchangeableInstruction &I : CandidateOp)
           if (cast<Instruction>(V)->getOpcode() == I.Opcode)
             return cast<Instruction>(V);
+      }
       llvm_unreachable(
           "Cannot find the candidate instruction for InstructionsState.");
     };

>From ee74f11629e10ee5b2c9f212b3df62e2f6b70add Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Wed, 11 Dec 2024 01:06:32 -0800
Subject: [PATCH 7/8] appendOperandsOfVL does not have to call getSameOpcode.
 Instead, we pass MainOp and AltOp.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 25 ++++++++++---------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c46dd65f229681..e9e0bf1641de9d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2575,18 +2575,18 @@ class BoUpSLP {
     }
 
     /// Go through the instructions in VL and append their operands.
-    void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *VL0) {
+    void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *MainOp,
+                            Instruction *AltOp) {
       assert(!VL.empty() && "Bad VL");
       assert((empty() || VL.size() == getNumLanes()) &&
              "Expected same number of lanes");
       // IntrinsicInst::isCommutative returns true if swapping the first "two"
       // arguments to the intrinsic produces the same result.
       constexpr unsigned IntrinsicNumOperands = 2;
-      unsigned NumOperands = VL0->getNumOperands();
-      ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
+      unsigned NumOperands = MainOp->getNumOperands();
+      ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
       OpsVec.resize(NumOperands);
       unsigned NumLanes = VL.size();
-      InstructionsState S = getSameOpcode(VL, TLI);
       for (unsigned OpIdx : seq<unsigned>(NumOperands))
         OpsVec[OpIdx].resize(NumLanes);
       for (auto [Lane, V] : enumerate(VL)) {
@@ -2595,12 +2595,12 @@ class BoUpSLP {
         if (isa<PoisonValue>(V)) {
           for (unsigned OpIdx : seq<unsigned>(NumOperands))
             OpsVec[OpIdx][Lane] = {
-                PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true,
+                PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true,
                 false};
           continue;
         }
-        auto [SelectedOp, Ops] = getInterchangeableInstruction(
-            cast<Instruction>(V), S.getMainOp(), S.getAltOp());
+        auto [SelectedOp, Ops] =
+            getInterchangeableInstruction(cast<Instruction>(V), MainOp, AltOp);
         // Our tree has just 3 nodes: the root and two operands.
         // It is therefore trivial to get the APO. We only need to check the
         // opcode of V and whether the operand at OpIdx is the LHS or RHS
@@ -2721,11 +2721,12 @@ class BoUpSLP {
 
   public:
     /// Initialize with all the operands of the instruction vector \p RootVL.
-    VLOperands(ArrayRef<Value *> RootVL, Instruction *VL0, const BoUpSLP &R)
+    VLOperands(ArrayRef<Value *> RootVL, Instruction *MainOp,
+               Instruction *AltOp, const BoUpSLP &R)
         : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
-          L(R.LI->getLoopFor((VL0->getParent()))) {
+          L(R.LI->getLoopFor(MainOp->getParent())) {
       // Append all the operands of RootVL.
-      appendOperandsOfVL(RootVL, VL0);
+      appendOperandsOfVL(RootVL, MainOp, AltOp);
     }
 
     /// \Returns a value vector with the operands across all lanes for the
@@ -3517,7 +3518,7 @@ class BoUpSLP {
 
     /// Set this bundle's operand from Scalars.
     void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
-      VLOperands Ops(Scalars, MainOp, R);
+      VLOperands Ops(Scalars, MainOp, AltOp, R);
       if (RequireReorder)
         Ops.reorder();
       for (unsigned I : seq<unsigned>(MainOp->getNumOperands()))
@@ -8733,7 +8734,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
 
       ValueList Left, Right;
-      VLOperands Ops(VL, VL0, *this);
+      VLOperands Ops(VL, VL0, S.getAltOp(), *this);
       if (cast<CmpInst>(VL0)->isCommutative()) {
         // Commutative predicate - collect + sort operands of the instructions
         // so that each side is more likely to have the same opcode.

>From ddf7ab48ccdb29103fa1bbbc139121ddeb5f4394 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Wed, 11 Dec 2024 01:08:46 -0800
Subject: [PATCH 8/8] refactor getInterchangeableInstruction

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e9e0bf1641de9d..a20bf01446d3cb 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -946,18 +946,15 @@ getInterchangeableInstruction(Instruction *I, Instruction *MainOp,
   auto Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
     return II.Opcode == MainOp->getOpcode();
   });
-  Value *SelectedOp;
   if (Iter == IIList.end()) {
     Iter = find_if(IIList, [&](const InterchangeableInstruction &II) {
       return II.Opcode == AltOp->getOpcode();
     });
     assert(Iter != IIList.end() &&
            "Cannot find an interchangeable instruction.");
-    SelectedOp = AltOp;
-  } else {
-    SelectedOp = MainOp;
+    return std::make_pair(AltOp, Iter->Ops);
   }
-  return std::make_pair(SelectedOp, Iter->Ops);
+  return std::make_pair(MainOp, Iter->Ops);
 }
 
 /// \returns true if \p Opcode is allowed as part of the main/alternate



More information about the llvm-commits mailing list