[llvm] 708eb1b - [SLP]Add shuffling of extractelements to avoid extra costs/data movement.

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 20 06:16:34 PST 2023


Author: Alexey Bataev
Date: 2023-02-20T06:14:42-08:00
New Revision: 708eb1b96d9a36f9c0182b7d53c492059778fa35

URL: https://github.com/llvm/llvm-project/commit/708eb1b96d9a36f9c0182b7d53c492059778fa35
DIFF: https://github.com/llvm/llvm-project/commit/708eb1b96d9a36f9c0182b7d53c492059778fa35.diff

LOG: [SLP]Add shuffling of extractelements to avoid extra costs/data movement.

If the scalar must be extracted and then used in the gather node,
instead we can emit shuffle instruction to avoid those extra
extractelements and vector-to-scalar and back data movement.

Part of D110978

Differential Revision: https://reviews.llvm.org/D141940

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
    llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
    llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
    llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
    llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 53de2e0158968..3ff40b5c9247e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -540,6 +540,114 @@ static std::optional<unsigned> getExtractIndex(Instruction *E) {
   return *EI->idx_begin();
 }
 
+/// Tries to find extractelement instructions with constant indices from fixed
+/// vector type and gather such instructions into a bunch, which highly likely
+/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
+/// successful, the matched scalars are replaced by poison values in \p VL for
+/// future analysis.
+static std::optional<TTI::ShuffleKind>
+tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
+                           SmallVectorImpl<int> &Mask) {
+  // Scan list of gathered scalars for extractelements that can be represented
+  // as shuffles.
+  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
+  SmallVector<int> UndefVectorExtracts;
+  for (int I = 0, E = VL.size(); I < E; ++I) {
+    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
+    if (!EI)
+      continue;
+    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
+    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
+      continue;
+    std::optional<unsigned> Idx = getExtractIndex(EI);
+    // Undefined index.
+    if (!Idx) {
+      UndefVectorExtracts.push_back(I);
+      continue;
+    }
+    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
+    ExtractMask.reset(*Idx);
+    if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
+      UndefVectorExtracts.push_back(I);
+      continue;
+    }
+    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
+  }
+  // Sort the vector operands by the maximum number of uses in extractelements.
+  MapVector<unsigned, SmallVector<Value *>> VFToVector;
+  for (const auto &Data : VectorOpToIdx)
+    VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
+        .push_back(Data.first);
+  for (auto &Data : VFToVector) {
+    stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
+      return VectorOpToIdx.find(V1)->second.size() >
+             VectorOpToIdx.find(V2)->second.size();
+    });
+  }
+  // Find the best pair of the vectors with the same number of elements or a
+  // single vector.
+  const int UndefSz = UndefVectorExtracts.size();
+  unsigned SingleMax = 0;
+  Value *SingleVec = nullptr;
+  unsigned PairMax = 0;
+  std::pair<Value *, Value *> PairVec(nullptr, nullptr);
+  for (auto &Data : VFToVector) {
+    Value *V1 = Data.second.front();
+    if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
+      SingleMax = VectorOpToIdx[V1].size() + UndefSz;
+      SingleVec = V1;
+    }
+    Value *V2 = nullptr;
+    if (Data.second.size() > 1)
+      V2 = *std::next(Data.second.begin());
+    if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
+                            UndefSz) {
+      PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
+      PairVec = std::make_pair(V1, V2);
+    }
+  }
+  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
+    return std::nullopt;
+  // Check if better to perform a shuffle of 2 vectors or just of a single
+  // vector.
+  SmallVector<Value *> SavedVL(VL.begin(), VL.end());
+  SmallVector<Value *> GatheredExtracts(
+      VL.size(), PoisonValue::get(VL.front()->getType()));
+  if (SingleMax >= PairMax && SingleMax) {
+    for (int Idx : VectorOpToIdx[SingleVec])
+      std::swap(GatheredExtracts[Idx], VL[Idx]);
+  } else {
+    for (Value *V : {PairVec.first, PairVec.second})
+      for (int Idx : VectorOpToIdx[V])
+        std::swap(GatheredExtracts[Idx], VL[Idx]);
+  }
+  // Add extracts from undefs too.
+  for (int Idx : UndefVectorExtracts)
+    std::swap(GatheredExtracts[Idx], VL[Idx]);
+  // Check that gather of extractelements can be represented as just a
+  // shuffle of a single/two vectors the scalars are extracted from.
+  std::optional<TTI::ShuffleKind> Res =
+      isFixedVectorShuffle(GatheredExtracts, Mask);
+  if (!Res) {
+    // TODO: try to check other subsets if possible.
+    // Restore the original VL if attempt was not successful.
+    VL.swap(SavedVL);
+    return std::nullopt;
+  }
+  // Restore unused scalars from mask, if some of the extractelements were not
+  // selected for shuffle.
+  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
+    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
+    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
+        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
+        is_contained(UndefVectorExtracts, I))
+      continue;
+    if (Mask[I] == UndefMaskElem)
+      std::swap(VL[I], GatheredExtracts[I]);
+  }
+  return Res;
+}
+
 namespace {
 
 /// Main data required for vectorization of instructions.
@@ -6622,15 +6730,19 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
   bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
   // FIXME: it tries to fix a problem with MSVC buildbots.
   TargetTransformInfo *TTI = this->TTI;
-  auto AdjustExtractsCost = [=](InstructionCost &Cost) {
+  auto AdjustExtractsCost = [=](InstructionCost &Cost,
+                                ArrayRef<int> Mask) -> Value * {
+    if (Mask.empty())
+      return nullptr;
+    Value *VecBase = nullptr;
     // If the resulting type is scalarized, do not adjust the cost.
     unsigned VecNumParts = TTI->getNumberOfParts(VecTy);
     if (VecNumParts == VecTy->getNumElements())
-      return;
+      return nullptr;
     DenseMap<Value *, int> ExtractVectorsTys;
     SmallPtrSet<Value *, 4> CheckedExtracts;
-    for (auto *V : VL) {
-      if (isa<UndefValue>(V))
+    for (auto [I, V] : enumerate(VL)) {
+      if (isa<UndefValue>(V) || (!Mask.empty() && Mask[I] == UndefMaskElem))
         continue;
       // If all users of instruction are going to be vectorized and this
       // instruction itself is not going to be vectorized, consider this
@@ -6644,6 +6756,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
           (VE && VE != E))
         continue;
       auto *EE = cast<ExtractElementInst>(V);
+      VecBase = EE->getVectorOperand();
       std::optional<unsigned> EEIdx = getExtractIndex(EE);
       if (!EEIdx)
         continue;
@@ -6683,6 +6796,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
       if (TTI->getNumberOfParts(EEVTy) > VecNumParts) {
         unsigned Idx = (Data.second / NumElts) * NumElts;
         unsigned EENumElts = EEVTy->getNumElements();
+        if (Idx % NumElts == 0)
+          continue;
         if (Idx + NumElts <= EENumElts) {
           Cost +=
               TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
@@ -6702,12 +6817,16 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
                                     VecTy, std::nullopt, CostKind, 0, EEVTy);
       }
     }
+    return VecBase;
   };
   if (E->State == TreeEntry::NeedToGather) {
     if (allConstant(VL))
       return 0;
     if (isa<InsertElementInst>(VL[0]))
       return InstructionCost::getInvalid();
+    unsigned VF = E->getVectorFactor();
+    SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
+                                          E->ReuseShuffleIndices.end());
     SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
     // Build a mask out of the reorder indices and reorder scalars per this
     // mask.
@@ -6716,24 +6835,51 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
     if (!ReorderMask.empty())
       reorderScalars(GatheredScalars, ReorderMask);
     SmallVector<int> Mask;
+    SmallVector<int> ExtractMask;
+    std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
     std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
     SmallVector<const TreeEntry *> Entries;
+    Type *ScalarTy = GatheredScalars.front()->getType();
+    // Check for gathered extracts.
+    ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
+    SmallVector<Value *> IgnoredVals;
+    if (UserIgnoreList)
+      IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
+
+    InstructionCost Cost = 0;
+    bool Resized = false;
+    if (Value *VecBase = AdjustExtractsCost(Cost, ExtractMask))
+      if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
+        if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
+          Resized = true;
+          GatheredScalars.append(VF - GatheredScalars.size(),
+                                 PoisonValue::get(ScalarTy));
+        }
+
     // Do not try to look for reshuffled loads for gathered loads (they will be
     // handled later), for vectorized scalars, and cases, which are definitely
     // not profitable (splats and small gather nodes.)
-    if (E->getOpcode() != Instruction::Load || E->isAltShuffle() ||
+    if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
+        E->isAltShuffle() ||
         all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
         isSplat(E->Scalars) ||
         (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2))
       GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
     if (GatherShuffle) {
+      assert((Entries.size() == 1 || Entries.size() == 2) &&
+             "Expected shuffle of 1 or 2 entries.");
+      if (!Resized) {
+        unsigned VF1 = Entries.front()->getVectorFactor();
+        unsigned VF2 = Entries.back()->getVectorFactor();
+        if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
+          GatheredScalars.append(VF - GatheredScalars.size(),
+                                 PoisonValue::get(ScalarTy));
+      }
       // Remove shuffled elements from list of gathers.
       for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
         if (Mask[I] != UndefMaskElem)
           GatheredScalars[I] = PoisonValue::get(ScalarTy);
       }
-      assert((Entries.size() == 1 || Entries.size() == 2) &&
-             "Expected shuffle of 1 or 2 entries.");
       InstructionCost GatherCost = 0;
       int Limit = Mask.size() * 2;
       if (all_of(Mask, [=](int Idx) { return Idx < Limit; }) &&
@@ -6762,29 +6908,19 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
         GatherCost += getGatherCost(GatheredScalars);
       return GatherCost;
     }
-    if ((E->getOpcode() == Instruction::ExtractElement ||
-         all_of(E->Scalars,
-                [](Value *V) {
-                  return isa<ExtractElementInst, UndefValue>(V);
-                })) &&
-        allSameType(VL)) {
+    if (ExtractShuffle && all_of(GatheredScalars, PoisonValue::classof)) {
       // Check that gather of extractelements can be represented as just a
       // shuffle of a single/two vectors the scalars are extracted from.
-      SmallVector<int> Mask;
-      std::optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
-          isFixedVectorShuffle(VL, Mask);
-      if (ShuffleKind) {
-        // Found the bunch of extractelement instructions that must be gathered
-        // into a vector and can be represented as a permutation elements in a
-        // single input vector or of 2 input vectors.
-        InstructionCost Cost =
-            computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
-        AdjustExtractsCost(Cost);
-        if (NeedToShuffleReuses)
-          Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
-                                      FinalVecTy, E->ReuseShuffleIndices);
-        return Cost;
-      }
+      // Found the bunch of extractelement instructions that must be gathered
+      // into a vector and can be represented as a permutation elements in a
+      // single input vector or of 2 input vectors.
+      InstructionCost Cost =
+          computeExtractCost(VL, VecTy, *ExtractShuffle, ExtractMask, *TTI);
+      AdjustExtractsCost(Cost, ExtractMask);
+      if (NeedToShuffleReuses)
+        Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                    FinalVecTy, E->ReuseShuffleIndices);
+      return Cost;
     }
     if (isSplat(VL)) {
       // Found the broadcasting of the single scalar, calculate the cost as the
@@ -9192,6 +9328,56 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
     }
     if (E->getMainOp())
       setInsertPointAfterBundle(E);
+    unsigned VF = E->getVectorFactor();
+    auto AdjustExtracts = [&](const TreeEntry *E, ArrayRef<int> Mask) {
+      Value *VecBase = nullptr;
+      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
+        int Idx = Mask[I];
+        if (Idx == UndefMaskElem)
+          continue;
+        auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+        VecBase = EI->getVectorOperand();
+        // If all users are vectorized - can delete the extractelement
+        // itself.
+        if (any_of(EI->users(),
+                   [&](User *U) { return !ScalarToTreeEntry.count(U); }))
+          continue;
+        eraseInstruction(EI);
+      }
+      return VecBase;
+    };
+    auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
+      if (V1->getType() != V2->getType()) {
+        unsigned VF1 = cast<FixedVectorType>(V1->getType())->getNumElements();
+        unsigned VF2 = cast<FixedVectorType>(V2->getType())->getNumElements();
+        unsigned VF = std::max(VF1, VF2);
+        SmallVector<int> ExtMask(VF, UndefMaskElem);
+        std::iota(ExtMask.begin(),
+                  std::next(ExtMask.begin(), std::min(VF1, VF2)), 0);
+        if (VF1 < VF2) {
+          V1 = Builder.CreateShuffleVector(V1, ExtMask);
+          if (auto *I = dyn_cast<Instruction>(V1)) {
+            GatherShuffleExtractSeq.insert(I);
+            CSEBlocks.insert(I->getParent());
+          }
+        } else {
+          V2 = Builder.CreateShuffleVector(V2, ExtMask);
+          if (auto *I = dyn_cast<Instruction>(V2)) {
+            GatherShuffleExtractSeq.insert(I);
+            CSEBlocks.insert(I->getParent());
+          }
+        }
+      }
+      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
+      if (auto *I = dyn_cast<Instruction>(Vec)) {
+        GatherShuffleExtractSeq.insert(I);
+        CSEBlocks.insert(I->getParent());
+      }
+      return Vec;
+    };
+
+    SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
+                                          E->ReuseShuffleIndices.end());
     SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
     // Build a mask out of the reorder indices and reorder scalars per this
     // mask.
@@ -9199,19 +9385,94 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
     inversePermutation(E->ReorderIndices, ReorderMask);
     if (!ReorderMask.empty())
       reorderScalars(GatheredScalars, ReorderMask);
-    Value *Vec;
+    Value *Vec = nullptr;
     SmallVector<int> Mask;
+    SmallVector<int> ExtractMask;
+    std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
+    std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
     SmallVector<const TreeEntry *> Entries;
-    std::optional<TargetTransformInfo::ShuffleKind> Shuffle =
-        isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
-    if (Shuffle) {
+    Type *ScalarTy = GatheredScalars.front()->getType();
+    // Check for gathered extracts.
+    ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
+    SmallVector<Value *> IgnoredVals;
+    if (UserIgnoreList)
+      IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
+    bool Resized = false;
+    if (Value *VecBase = AdjustExtracts(E, ExtractMask))
+      if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
+        if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
+          Resized = true;
+          GatheredScalars.append(VF - GatheredScalars.size(),
+                                 PoisonValue::get(ScalarTy));
+        }
+    // Gather extracts after we check for full matched gathers only.
+    if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
+        E->isAltShuffle() ||
+        all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
+        isSplat(E->Scalars) ||
+        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2))
+      GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
+    if (GatherShuffle) {
       assert((Entries.size() == 1 || Entries.size() == 2) &&
              "Expected shuffle of 1 or 2 entries.");
-      Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue,
-                                        Entries.back()->VectorizedValue, Mask);
-      if (auto *I = dyn_cast<Instruction>(Vec)) {
-        GatherShuffleExtractSeq.insert(I);
-        CSEBlocks.insert(I->getParent());
+      if (!Resized) {
+        unsigned VF1 = Entries.front()->getVectorFactor();
+        unsigned VF2 = Entries.back()->getVectorFactor();
+        if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
+          GatheredScalars.append(VF - GatheredScalars.size(),
+                                 PoisonValue::get(ScalarTy));
+      }
+      // Remove shuffled elements from list of gathers.
+      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
+        if (Mask[I] != UndefMaskElem)
+          GatheredScalars[I] = PoisonValue::get(ScalarTy);
+      }
+    }
+    if ((ExtractShuffle || GatherShuffle) &&
+        all_of(GatheredScalars, PoisonValue::classof)) {
+      Value *Vec1 = nullptr;
+      if (ExtractShuffle) {
+        // Gather of extractelements can be represented as just a shuffle of
+        // a single/two vectors the scalars are extracted from.
+        // Find input vectors.
+        Value *Vec2 = nullptr;
+        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
+          if (ExtractMask[I] == UndefMaskElem ||
+              (!Mask.empty() && Mask[I] != UndefMaskElem)) {
+            ExtractMask[I] = UndefMaskElem;
+            continue;
+          }
+          if (isa<UndefValue>(E->Scalars[I]))
+            continue;
+          auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+          if (!Vec1) {
+            Vec1 = EI->getVectorOperand();
+          } else if (Vec1 != EI->getVectorOperand()) {
+            assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
+                   "Expected only 1 or 2 vectors shuffle.");
+            Vec2 = EI->getVectorOperand();
+          }
+        }
+        if (Vec2)
+          Vec1 = CreateShuffle(Vec1, Vec2, ExtractMask);
+        else if (Vec1)
+          Vec1 = CreateShuffle(Vec1, Vec1, ExtractMask);
+      }
+      if (GatherShuffle) {
+        Vec = CreateShuffle(Entries.front()->VectorizedValue,
+                            Entries.back()->VectorizedValue, Mask);
+        if (Vec1) {
+          // Build final mask.
+          for (auto [I, Idx] : enumerate(Mask)) {
+            if (ExtractMask[I] != UndefMaskElem)
+              Idx = I;
+            else if (Idx != UndefMaskElem)
+              Idx = I + VF;
+          }
+          Vec = CreateShuffle(Vec1, Vec, Mask);
+        }
+      } else {
+        Vec = Vec1;
       }
     } else {
       Vec = gather(E->Scalars);

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
index 02d1cdfb03d28..ca415b377b214 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
@@ -110,18 +110,15 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) {
 
 define i8 @k(<4 x i8> %x) {
 ; CHECK-LABEL: @k(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i64 0
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i64 3
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[X]], i64 1
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i8> [[X]], i64 2
-; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
-; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT:    [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
-; CHECK-NEXT:    [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret i8 [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> <i32 3, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    ret i8 [[TMP8]]
 ;
   %x0 = extractelement <4 x i8> %x, i32 0
   %x3 = extractelement <4 x i8> %x, i32 3
@@ -141,18 +138,15 @@ define i8 @k_bb(<4 x i8> %x) {
 ; CHECK-LABEL: @k_bb(
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i64 0
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i64 3
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[X]], i64 1
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i8> [[X]], i64 2
-; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
-; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT:    [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
-; CHECK-NEXT:    [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret i8 [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> <i32 3, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    ret i8 [[TMP8]]
 ;
   %x0 = extractelement <4 x i8> %x, i32 0
   br label %bb1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
index 12392f364eb9d..731be77a76511 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
@@ -110,18 +110,15 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) {
 
 define i8 @k(<4 x i8> %x) {
 ; CHECK-LABEL: @k(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i64 0
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i64 3
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[X]], i64 1
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i8> [[X]], i64 2
-; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
-; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT:    [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
-; CHECK-NEXT:    [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret i8 [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> <i32 3, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    ret i8 [[TMP8]]
 ;
   %x0 = extractelement <4 x i8> %x, i32 0
   %x3 = extractelement <4 x i8> %x, i32 3
@@ -141,18 +138,15 @@ define i8 @k_bb(<4 x i8> %x) {
 ; CHECK-LABEL: @k_bb(
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i64 0
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i64 3
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[X]], i64 1
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i8> [[X]], i64 2
-; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
-; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT:    [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
-; CHECK-NEXT:    [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret i8 [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> <i32 3, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    ret i8 [[TMP8]]
 ;
   %x0 = extractelement <4 x i8> %x, i32 0
   br label %bb1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
index 2d6ea13b1dd6f..8f592783d8d76 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
@@ -18,21 +18,21 @@ define void @splat(i8 %a, i8 %b, i8 %c) {
 ; SSE-LABEL: @splat(
 ; SSE-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0
 ; SSE-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[B:%.*]], i32 1
-; SSE-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-; SSE-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0
-; SSE-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> zeroinitializer
-; SSE-NEXT:    [[TMP4:%.*]] = xor <16 x i8> [[SHUFFLE]], [[SHUFFLE1]]
-; SSE-NEXT:    store <16 x i8> [[TMP4]], ptr @cle, align 16
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+; SSE-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> zeroinitializer
+; SSE-NEXT:    [[TMP6:%.*]] = xor <16 x i8> [[TMP3]], [[TMP5]]
+; SSE-NEXT:    store <16 x i8> [[TMP6]], ptr @cle, align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @splat(
 ; AVX-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0
 ; AVX-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[B:%.*]], i32 1
-; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-; AVX-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0
-; AVX-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> zeroinitializer
-; AVX-NEXT:    [[TMP4:%.*]] = xor <16 x i8> [[SHUFFLE]], [[SHUFFLE1]]
-; AVX-NEXT:    store <16 x i8> [[TMP4]], ptr @cle, align 16
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX-NEXT:    [[TMP6:%.*]] = xor <16 x i8> [[TMP3]], [[TMP5]]
+; AVX-NEXT:    store <16 x i8> [[TMP6]], ptr @cle, align 16
 ; AVX-NEXT:    ret void
 ;
   %1 = xor i8 %c, %a
@@ -91,15 +91,15 @@ define void @same_opcode_on_one_side(i32 %a, i32 %b, i32 %c) {
 ;
 ; AVX-LABEL: @same_opcode_on_one_side(
 ; AVX-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
-; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
-; AVX-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0
-; AVX-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; AVX-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
-; AVX-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i32 1
-; AVX-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[C]], i32 2
-; AVX-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
-; AVX-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[SHUFFLE2]]
-; AVX-NEXT:    store <4 x i32> [[TMP6]], ptr @cle32, align 16
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; AVX-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> zeroinitializer
+; AVX-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[B:%.*]], i32 1
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[C]], i32 2
+; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
+; AVX-NEXT:    [[TMP9:%.*]] = xor <4 x i32> [[TMP5]], [[TMP8]]
+; AVX-NEXT:    store <4 x i32> [[TMP9]], ptr @cle32, align 16
 ; AVX-NEXT:    ret void
 ;
   %add1 = add i32 %c, %a

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
index 7effd1b11a63e..e266ce02962b1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
@@ -146,16 +146,14 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
 ; MINTREESIZE-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
 ; MINTREESIZE-NEXT:    [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0
 ; MINTREESIZE-NEXT:    [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1
-; MINTREESIZE-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[Q0]], i32 0
-; MINTREESIZE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[Q1]], i32 1
+; MINTREESIZE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> [[RD]], <2 x i32> <i32 0, i32 1>
 ; MINTREESIZE-NEXT:    [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2
 ; MINTREESIZE-NEXT:    [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3
-; MINTREESIZE-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q2]], i32 0
-; MINTREESIZE-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q3]], i32 1
+; MINTREESIZE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> [[RD]], <2 x i32> <i32 2, i32 3>
 ; MINTREESIZE-NEXT:    [[Q4:%.*]] = fadd float [[Q0]], [[Q1]]
 ; MINTREESIZE-NEXT:    [[Q5:%.*]] = fadd float [[Q2]], [[Q3]]
-; MINTREESIZE-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
-; MINTREESIZE-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1
+; MINTREESIZE-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
+; MINTREESIZE-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
 ; MINTREESIZE-NEXT:    [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
 ; MINTREESIZE-NEXT:    call void @llvm.assume(i1 [[QI]])

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
index 925b9a31dd5db..4be025a7324a7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -180,16 +180,14 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
 ; MINTREESIZE-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
 ; MINTREESIZE-NEXT:    [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0
 ; MINTREESIZE-NEXT:    [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1
-; MINTREESIZE-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[Q0]], i32 0
-; MINTREESIZE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[Q1]], i32 1
+; MINTREESIZE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> [[RD]], <2 x i32> <i32 0, i32 1>
 ; MINTREESIZE-NEXT:    [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2
 ; MINTREESIZE-NEXT:    [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3
-; MINTREESIZE-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q2]], i32 0
-; MINTREESIZE-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q3]], i32 1
+; MINTREESIZE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> [[RD]], <2 x i32> <i32 2, i32 3>
 ; MINTREESIZE-NEXT:    [[Q4:%.*]] = fadd float [[Q0]], [[Q1]]
 ; MINTREESIZE-NEXT:    [[Q5:%.*]] = fadd float [[Q2]], [[Q3]]
-; MINTREESIZE-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
-; MINTREESIZE-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1
+; MINTREESIZE-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
+; MINTREESIZE-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
 ; MINTREESIZE-NEXT:    [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
 ; MINTREESIZE-NEXT:    call void @llvm.assume(i1 [[QI]])

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
index 9da58994c7580..6c7d5e6324ca6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
@@ -18,41 +18,11 @@
 define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) {
 ; SSE2-LABEL: @reduce_and4(
 ; SSE2-NEXT:  entry:
-; SSE2-NEXT:    [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
-; SSE2-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x i32> [[V1]], i64 1
-; SSE2-NEXT:    [[VECEXT2:%.*]] = extractelement <4 x i32> [[V1]], i64 2
-; SSE2-NEXT:    [[VECEXT4:%.*]] = extractelement <4 x i32> [[V1]], i64 3
-; SSE2-NEXT:    [[VECEXT7:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0
-; SSE2-NEXT:    [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1
-; SSE2-NEXT:    [[VECEXT10:%.*]] = extractelement <4 x i32> [[V2]], i64 2
-; SSE2-NEXT:    [[VECEXT12:%.*]] = extractelement <4 x i32> [[V2]], i64 3
-; SSE2-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT8]], i32 0
-; SSE2-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[VECEXT7]], i32 1
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[VECEXT10]], i32 2
-; SSE2-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[VECEXT12]], i32 3
-; SSE2-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[VECEXT1]], i32 4
-; SSE2-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[VECEXT]], i32 5
-; SSE2-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[VECEXT2]], i32 6
-; SSE2-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[VECEXT4]], i32 7
-; SSE2-NEXT:    [[VECEXT15:%.*]] = extractelement <4 x i32> [[V3:%.*]], i64 0
-; SSE2-NEXT:    [[VECEXT16:%.*]] = extractelement <4 x i32> [[V3]], i64 1
-; SSE2-NEXT:    [[VECEXT18:%.*]] = extractelement <4 x i32> [[V3]], i64 2
-; SSE2-NEXT:    [[VECEXT20:%.*]] = extractelement <4 x i32> [[V3]], i64 3
-; SSE2-NEXT:    [[VECEXT23:%.*]] = extractelement <4 x i32> [[V4:%.*]], i64 0
-; SSE2-NEXT:    [[VECEXT24:%.*]] = extractelement <4 x i32> [[V4]], i64 1
-; SSE2-NEXT:    [[VECEXT26:%.*]] = extractelement <4 x i32> [[V4]], i64 2
-; SSE2-NEXT:    [[VECEXT28:%.*]] = extractelement <4 x i32> [[V4]], i64 3
-; SSE2-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT24]], i32 0
-; SSE2-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[VECEXT23]], i32 1
-; SSE2-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[VECEXT26]], i32 2
-; SSE2-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[VECEXT28]], i32 3
-; SSE2-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[VECEXT16]], i32 4
-; SSE2-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[VECEXT15]], i32 5
-; SSE2-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[VECEXT18]], i32 6
-; SSE2-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[VECEXT20]], i32 7
-; SSE2-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP15]])
-; SSE2-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP7]])
-; SSE2-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP16]], [[TMP17]]
+; SSE2-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
+; SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
+; SSE2-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP3]]
 ; SSE2-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
 ; SSE2-NEXT:    ret i32 [[OP_RDX1]]
 ;
@@ -70,41 +40,11 @@ define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <
 ;
 ; AVX-LABEL: @reduce_and4(
 ; AVX-NEXT:  entry:
-; AVX-NEXT:    [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
-; AVX-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x i32> [[V1]], i64 1
-; AVX-NEXT:    [[VECEXT2:%.*]] = extractelement <4 x i32> [[V1]], i64 2
-; AVX-NEXT:    [[VECEXT4:%.*]] = extractelement <4 x i32> [[V1]], i64 3
-; AVX-NEXT:    [[VECEXT7:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0
-; AVX-NEXT:    [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1
-; AVX-NEXT:    [[VECEXT10:%.*]] = extractelement <4 x i32> [[V2]], i64 2
-; AVX-NEXT:    [[VECEXT12:%.*]] = extractelement <4 x i32> [[V2]], i64 3
-; AVX-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT8]], i32 0
-; AVX-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[VECEXT7]], i32 1
-; AVX-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[VECEXT10]], i32 2
-; AVX-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[VECEXT12]], i32 3
-; AVX-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[VECEXT1]], i32 4
-; AVX-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[VECEXT]], i32 5
-; AVX-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[VECEXT2]], i32 6
-; AVX-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[VECEXT4]], i32 7
-; AVX-NEXT:    [[VECEXT15:%.*]] = extractelement <4 x i32> [[V3:%.*]], i64 0
-; AVX-NEXT:    [[VECEXT16:%.*]] = extractelement <4 x i32> [[V3]], i64 1
-; AVX-NEXT:    [[VECEXT18:%.*]] = extractelement <4 x i32> [[V3]], i64 2
-; AVX-NEXT:    [[VECEXT20:%.*]] = extractelement <4 x i32> [[V3]], i64 3
-; AVX-NEXT:    [[VECEXT23:%.*]] = extractelement <4 x i32> [[V4:%.*]], i64 0
-; AVX-NEXT:    [[VECEXT24:%.*]] = extractelement <4 x i32> [[V4]], i64 1
-; AVX-NEXT:    [[VECEXT26:%.*]] = extractelement <4 x i32> [[V4]], i64 2
-; AVX-NEXT:    [[VECEXT28:%.*]] = extractelement <4 x i32> [[V4]], i64 3
-; AVX-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT24]], i32 0
-; AVX-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[VECEXT23]], i32 1
-; AVX-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[VECEXT26]], i32 2
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[VECEXT28]], i32 3
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[VECEXT16]], i32 4
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[VECEXT15]], i32 5
-; AVX-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[VECEXT18]], i32 6
-; AVX-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[VECEXT20]], i32 7
-; AVX-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP15]])
-; AVX-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP7]])
-; AVX-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP16]], [[TMP17]]
+; AVX-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; AVX-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
+; AVX-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
+; AVX-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP3]]
 ; AVX-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
 ; AVX-NEXT:    ret i32 [[OP_RDX1]]
 ;
@@ -154,41 +94,11 @@ entry:
 
 define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) {
 ; SSE2-LABEL: @reduce_and4_transpose(
-; SSE2-NEXT:    [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
-; SSE2-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0
-; SSE2-NEXT:    [[VECEXT2:%.*]] = extractelement <4 x i32> [[V3:%.*]], i64 0
-; SSE2-NEXT:    [[VECEXT4:%.*]] = extractelement <4 x i32> [[V4:%.*]], i64 0
-; SSE2-NEXT:    [[VECEXT7:%.*]] = extractelement <4 x i32> [[V1]], i64 1
-; SSE2-NEXT:    [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1
-; SSE2-NEXT:    [[VECEXT10:%.*]] = extractelement <4 x i32> [[V3]], i64 1
-; SSE2-NEXT:    [[VECEXT12:%.*]] = extractelement <4 x i32> [[V4]], i64 1
-; SSE2-NEXT:    [[VECEXT15:%.*]] = extractelement <4 x i32> [[V1]], i64 2
-; SSE2-NEXT:    [[VECEXT16:%.*]] = extractelement <4 x i32> [[V2]], i64 2
-; SSE2-NEXT:    [[VECEXT18:%.*]] = extractelement <4 x i32> [[V3]], i64 2
-; SSE2-NEXT:    [[VECEXT20:%.*]] = extractelement <4 x i32> [[V4]], i64 2
-; SSE2-NEXT:    [[VECEXT23:%.*]] = extractelement <4 x i32> [[V1]], i64 3
-; SSE2-NEXT:    [[VECEXT24:%.*]] = extractelement <4 x i32> [[V2]], i64 3
-; SSE2-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT24]], i32 0
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[VECEXT16]], i32 1
-; SSE2-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[VECEXT8]], i32 2
-; SSE2-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[VECEXT1]], i32 3
-; SSE2-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[VECEXT23]], i32 4
-; SSE2-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[VECEXT15]], i32 5
-; SSE2-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[VECEXT7]], i32 6
-; SSE2-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[VECEXT]], i32 7
-; SSE2-NEXT:    [[VECEXT26:%.*]] = extractelement <4 x i32> [[V3]], i64 3
-; SSE2-NEXT:    [[VECEXT28:%.*]] = extractelement <4 x i32> [[V4]], i64 3
-; SSE2-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT28]], i32 0
-; SSE2-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[VECEXT20]], i32 1
-; SSE2-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[VECEXT12]], i32 2
-; SSE2-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[VECEXT4]], i32 3
-; SSE2-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[VECEXT26]], i32 4
-; SSE2-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[VECEXT18]], i32 5
-; SSE2-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[VECEXT10]], i32 6
-; SSE2-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[VECEXT2]], i32 7
-; SSE2-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP16]])
-; SSE2-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP8]])
-; SSE2-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP17]], [[TMP18]]
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
+; SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
+; SSE2-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP3]], [[TMP4]]
 ; SSE2-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
 ; SSE2-NEXT:    ret i32 [[OP_RDX1]]
 ;
@@ -204,41 +114,11 @@ define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i
 ; SSE42-NEXT:    ret i32 [[OP_RDX3]]
 ;
 ; AVX-LABEL: @reduce_and4_transpose(
-; AVX-NEXT:    [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
-; AVX-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0
-; AVX-NEXT:    [[VECEXT2:%.*]] = extractelement <4 x i32> [[V3:%.*]], i64 0
-; AVX-NEXT:    [[VECEXT4:%.*]] = extractelement <4 x i32> [[V4:%.*]], i64 0
-; AVX-NEXT:    [[VECEXT7:%.*]] = extractelement <4 x i32> [[V1]], i64 1
-; AVX-NEXT:    [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1
-; AVX-NEXT:    [[VECEXT10:%.*]] = extractelement <4 x i32> [[V3]], i64 1
-; AVX-NEXT:    [[VECEXT12:%.*]] = extractelement <4 x i32> [[V4]], i64 1
-; AVX-NEXT:    [[VECEXT15:%.*]] = extractelement <4 x i32> [[V1]], i64 2
-; AVX-NEXT:    [[VECEXT16:%.*]] = extractelement <4 x i32> [[V2]], i64 2
-; AVX-NEXT:    [[VECEXT18:%.*]] = extractelement <4 x i32> [[V3]], i64 2
-; AVX-NEXT:    [[VECEXT20:%.*]] = extractelement <4 x i32> [[V4]], i64 2
-; AVX-NEXT:    [[VECEXT23:%.*]] = extractelement <4 x i32> [[V1]], i64 3
-; AVX-NEXT:    [[VECEXT24:%.*]] = extractelement <4 x i32> [[V2]], i64 3
-; AVX-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT24]], i32 0
-; AVX-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[VECEXT16]], i32 1
-; AVX-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[VECEXT8]], i32 2
-; AVX-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[VECEXT1]], i32 3
-; AVX-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[VECEXT23]], i32 4
-; AVX-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[VECEXT15]], i32 5
-; AVX-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[VECEXT7]], i32 6
-; AVX-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[VECEXT]], i32 7
-; AVX-NEXT:    [[VECEXT26:%.*]] = extractelement <4 x i32> [[V3]], i64 3
-; AVX-NEXT:    [[VECEXT28:%.*]] = extractelement <4 x i32> [[V4]], i64 3
-; AVX-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT28]], i32 0
-; AVX-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[VECEXT20]], i32 1
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[VECEXT12]], i32 2
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[VECEXT4]], i32 3
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[VECEXT26]], i32 4
-; AVX-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[VECEXT18]], i32 5
-; AVX-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[VECEXT10]], i32 6
-; AVX-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[VECEXT2]], i32 7
-; AVX-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP16]])
-; AVX-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP8]])
-; AVX-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP17]], [[TMP18]]
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; AVX-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
+; AVX-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
+; AVX-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP3]], [[TMP4]]
 ; AVX-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
 ; AVX-NEXT:    ret i32 [[OP_RDX1]]
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll
index e8ca9aeb8147c..a8b216552cb2b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll
@@ -18,7 +18,7 @@ define void @fextr(ptr %ptr) {
 ; YAML-NEXT: Function:        fextr
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'Stores SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '-20'
+; YAML-NEXT:   - Cost:            '-19'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '4'
 


        


More information about the llvm-commits mailing list