[llvm] b10ecfa - [SLP]Represent externally used values as original scalars, if profitable.

via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 12 07:15:07 PDT 2024


Author: Alexey Bataev
Date: 2024-08-12T10:15:02-04:00
New Revision: b10ecfa914dd1bc2013584917d0505ba5f15f75c

URL: https://github.com/llvm/llvm-project/commit/b10ecfa914dd1bc2013584917d0505ba5f15f75c
DIFF: https://github.com/llvm/llvm-project/commit/b10ecfa914dd1bc2013584917d0505ba5f15f75c.diff

LOG: [SLP]Represent externally used values as original scalars, if profitable.

Currently SLP vectorizer tries to keep only GEPs as scalar, if they are
vectorized but used externally. Same approach can be used for all scalar
values. This patch tries to keep original scalars if all its operands
remain scalar or externally used, the cost of the original scalar is
lower than the cost of the extractelement instruction, or if the number
of externally used scalars in the same entry is power of 2. Last
criterion allows better revectorization for multiply used scalars.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/100904

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/AArch64/external-non-inst-use.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll
    llvm/test/Transforms/SLPVectorizer/RISCV/mixed-extracts-types.ll
    llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll
    llvm/test/Transforms/SLPVectorizer/RISCV/strided-unsupported-type.ll
    llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
    llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
    llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
    llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
    llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
    llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll
    llvm/test/Transforms/SLPVectorizer/X86/cse_extractelement.ll
    llvm/test/Transforms/SLPVectorizer/X86/diamond.ll
    llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll
    llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll
    llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll
    llvm/test/Transforms/SLPVectorizer/X86/extract-vectorized-operand.ll
    llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
    llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll
    llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
    llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll
    llvm/test/Transforms/SLPVectorizer/X86/extracts-with-undefs.ll
    llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll
    llvm/test/Transforms/SLPVectorizer/X86/gep-with-extractelement-many-users.ll
    llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
    llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
    llvm/test/Transforms/SLPVectorizer/X86/insertelement-uses-vectorized-index.ll
    llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
    llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll
    llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
    llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll
    llvm/test/Transforms/SLPVectorizer/X86/pr27163.ll
    llvm/test/Transforms/SLPVectorizer/X86/reduced-value-replace-extractelement.ll
    llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll
    llvm/test/Transforms/SLPVectorizer/X86/reorder-node.ll
    llvm/test/Transforms/SLPVectorizer/X86/reordering-single-phi.ll
    llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll
    llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll
    llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll
    llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f5be6bbe4a2b67..c05e4b822d2c4e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1253,7 +1253,7 @@ class BoUpSLP {
     NonScheduledFirst.clear();
     EntryToLastInstruction.clear();
     ExternalUses.clear();
-    ExternalUsesAsGEPs.clear();
+    ExternalUsesAsOriginalScalar.clear();
     for (auto &Iter : BlocksSchedules) {
       BlockScheduling *BS = Iter.second.get();
       BS->clear();
@@ -3468,7 +3468,7 @@ class BoUpSLP {
 
   /// A list of GEPs which can be reaplced by scalar GEPs instead of
   /// extractelement instructions.
-  SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
+  SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
 
   /// Values used only by @llvm.assume calls.
   SmallPtrSet<const Value *, 32> EphValues;
@@ -10663,6 +10663,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
   SmallDenseSet<Value *, 4> UsedInserts;
   DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
   std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
+  DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
   for (ExternalUser &EU : ExternalUses) {
     // We only add extract cost once for the same scalar.
     if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
@@ -10771,52 +10772,90 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
         }
       }
     }
-    // Leave the GEPs as is, they are free in most cases and better to keep them
-    // as GEPs.
+
     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-    if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
+    // If we plan to rewrite the tree in a smaller type, we will need to sign
+    // extend the extracted value back to the original type. Here, we account
+    // for the extract and the added cost of the sign extend if needed.
+    InstructionCost ExtraCost = TTI::TCC_Free;
+    auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
+    const TreeEntry *Entry = getTreeEntry(EU.Scalar);
+    auto It = MinBWs.find(Entry);
+    if (It != MinBWs.end()) {
+      auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
+      unsigned Extend =
+          It->second.second ? Instruction::SExt : Instruction::ZExt;
+      VecTy = getWidenedType(MinTy, BundleWidth);
+      ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
+                                                VecTy, EU.Lane);
+    } else {
+      ExtraCost = TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
+                                          CostKind, EU.Lane);
+    }
+    // Leave the scalar instructions as is if they are cheaper than extracts.
+    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
+        Entry->getOpcode() == Instruction::Load) {
       if (!ValueToExtUses) {
         ValueToExtUses.emplace();
         for_each(enumerate(ExternalUses), [&](const auto &P) {
+          // Ignore phis in loops.
+          if (auto *Phi = dyn_cast_if_present<PHINode>(P.value().User)) {
+            auto *I = cast<Instruction>(P.value().Scalar);
+            const Loop *L = LI->getLoopFor(Phi->getParent());
+            if (L && (Phi->getParent() == I->getParent() ||
+                      L == LI->getLoopFor(I->getParent())))
+              return;
+          }
+
           ValueToExtUses->try_emplace(P.value().Scalar, P.index());
         });
       }
-      // Can use original GEP, if no operands vectorized or they are marked as
-      // externally used already.
-      bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
-        if (!getTreeEntry(V))
-          return true;
-        auto It = ValueToExtUses->find(V);
-        if (It != ValueToExtUses->end()) {
-          // Replace all uses to avoid compiler crash.
-          ExternalUses[It->second].User = nullptr;
+      // Can use original instruction, if no operands vectorized or they are
+      // marked as externally used already.
+      auto *Inst = cast<Instruction>(EU.Scalar);
+      bool CanBeUsedAsScalar = all_of(Inst->operands(), [&](Value *V) {
+        if (!getTreeEntry(V)) {
+          // Some extractelements might be not vectorized, but
+          // transformed into shuffle and removed from the function,
+          // consider it here.
+          if (auto *EE = dyn_cast<ExtractElementInst>(V))
+            return !EE->hasOneUse() || !MustGather.contains(EE);
           return true;
         }
-        return false;
+        return ValueToExtUses->contains(V);
       });
-      if (CanBeUsedAsGEP) {
-        ExtractCost += TTI->getInstructionCost(GEP, CostKind);
-        ExternalUsesAsGEPs.insert(EU.Scalar);
-        continue;
+      if (CanBeUsedAsScalar) {
+        InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
+        bool KeepScalar = ScalarCost <= ExtraCost;
+        if (KeepScalar && ScalarCost != TTI::TCC_Free &&
+            ExtraCost - ScalarCost <= TTI::TCC_Basic) {
+          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
+            return ValueToExtUses->contains(V);
+          });
+          auto It = ExtractsCount.find(Entry);
+          if (It != ExtractsCount.end())
+            ScalarUsesCount -= It->getSecond().size();
+          // Keep original scalar if number of externally used instructions in
+          // the same entry is not power of 2. It may help to do some extra
+          // vectorization for now.
+          KeepScalar = ScalarUsesCount <= 1 || !isPowerOf2_32(ScalarUsesCount);
+        }
+        if (KeepScalar) {
+          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
+          for_each(Inst->operands(), [&](Value *V) {
+            auto It = ValueToExtUses->find(V);
+            if (It != ValueToExtUses->end()) {
+              // Replace all uses to avoid compiler crash.
+              ExternalUses[It->second].User = nullptr;
+            }
+          });
+          ExtraCost = ScalarCost;
+          ExtractsCount[Entry].insert(Inst);
+        }
       }
     }
 
-    // If we plan to rewrite the tree in a smaller type, we will need to sign
-    // extend the extracted value back to the original type. Here, we account
-    // for the extract and the added cost of the sign extend if needed.
-    auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
-    auto It = MinBWs.find(getTreeEntry(EU.Scalar));
-    if (It != MinBWs.end()) {
-      auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
-      unsigned Extend =
-          It->second.second ? Instruction::SExt : Instruction::ZExt;
-      VecTy = getWidenedType(MinTy, BundleWidth);
-      ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
-                                                   VecTy, EU.Lane);
-    } else {
-      ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
-                                             CostKind, EU.Lane);
-    }
+    ExtractCost += ExtraCost;
   }
   // Add reduced value cost, if resized.
   if (!VectorizedVals.empty()) {
@@ -14067,8 +14106,7 @@ Value *BoUpSLP::vectorizeTree(
   DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
   // Maps extract Scalar to the corresponding extractelement instruction in the
   // basic block. Only one extractelement per block should be emitted.
-  DenseMap<Value *,
-           DenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
+  DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
       ScalarToEEs;
   SmallDenseSet<Value *, 4> UsedInserts;
   DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
@@ -14098,30 +14136,41 @@ Value *BoUpSLP::vectorizeTree(
       if (Scalar->getType() != Vec->getType()) {
         Value *Ex = nullptr;
         Value *ExV = nullptr;
-        auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
-        bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
+        auto *Inst = dyn_cast<Instruction>(Scalar);
+        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
         auto It = ScalarToEEs.find(Scalar);
         if (It != ScalarToEEs.end()) {
           // No need to emit many extracts, just move the only one in the
           // current block.
-          auto EEIt = It->second.find(Builder.GetInsertBlock());
+          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
+                                                  : Builder.GetInsertBlock());
           if (EEIt != It->second.end()) {
-            Instruction *I = EEIt->second.first;
-            if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
+            Value *PrevV = EEIt->second.first;
+            if (auto *I = dyn_cast<Instruction>(PrevV);
+                I && !ReplaceInst &&
+                Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
                 Builder.GetInsertPoint()->comesBefore(I)) {
               I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                             Builder.GetInsertPoint());
-              if (auto *CI = EEIt->second.second)
+              if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
                 CI->moveAfter(I);
             }
-            Ex = I;
+            Ex = PrevV;
             ExV = EEIt->second.second ? EEIt->second.second : Ex;
           }
         }
         if (!Ex) {
           // "Reuse" the existing extract to improve final codegen.
-          if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
-              ES && isa<Instruction>(Vec)) {
+          if (ReplaceInst) {
+            // Leave the instruction as is, if it cheaper extracts and all
+            // operands are scalar.
+            auto *CloneInst = Inst->clone();
+            CloneInst->insertBefore(Inst);
+            if (Inst->hasName())
+              CloneInst->takeName(Inst);
+            Ex = CloneInst;
+          } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
+                     ES && isa<Instruction>(Vec)) {
             Value *V = ES->getVectorOperand();
             auto *IVec = cast<Instruction>(Vec);
             if (const TreeEntry *ETE = getTreeEntry(V))
@@ -14132,18 +14181,6 @@ Value *BoUpSLP::vectorizeTree(
               Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
             else
               Ex = Builder.CreateExtractElement(Vec, Lane);
-          } else if (ReplaceGEP) {
-            // Leave the GEPs as is, they are free in most cases and better to
-            // keep them as GEPs.
-            auto *CloneGEP = GEP->clone();
-            if (isa<Instruction>(Vec))
-              CloneGEP->insertBefore(*Builder.GetInsertBlock(),
-                                     Builder.GetInsertPoint());
-            else
-              CloneGEP->insertBefore(GEP);
-            if (GEP->hasName())
-              CloneGEP->takeName(GEP);
-            Ex = CloneGEP;
           } else if (auto *VecTy =
                          dyn_cast<FixedVectorType>(Scalar->getType())) {
             assert(SLPReVec && "FixedVectorType is not expected.");
@@ -14164,14 +14201,15 @@ Value *BoUpSLP::vectorizeTree(
           if (Scalar->getType() != Ex->getType())
             ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
                                         MinBWs.find(E)->second.second);
-          if (auto *I = dyn_cast<Instruction>(Ex))
-            ScalarToEEs[Scalar].try_emplace(
-                Builder.GetInsertBlock(),
-                std::make_pair(I, cast<Instruction>(ExV)));
+          auto *I = dyn_cast<Instruction>(Ex);
+          ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
+                                            : &F->getEntryBlock(),
+                                          std::make_pair(Ex, ExV));
         }
         // The then branch of the previous if may produce constants, since 0
         // operand might be a constant.
-        if (auto *ExI = dyn_cast<Instruction>(Ex)) {
+        if (auto *ExI = dyn_cast<Instruction>(Ex);
+            ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
           GatherShuffleExtractSeq.insert(ExI);
           CSEBlocks.insert(ExI->getParent());
         }
@@ -14192,9 +14230,10 @@ Value *BoUpSLP::vectorizeTree(
         continue;
       assert((ExternallyUsedValues.count(Scalar) ||
               Scalar->hasNUsesOrMore(UsesLimit) ||
+              ExternalUsesAsOriginalScalar.contains(Scalar) ||
               any_of(Scalar->users(),
                      [&](llvm::User *U) {
-                       if (ExternalUsesAsGEPs.contains(U))
+                       if (ExternalUsesAsOriginalScalar.contains(U))
                          return true;
                        TreeEntry *UseEntry = getTreeEntry(U);
                        return UseEntry &&

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/external-non-inst-use.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/external-non-inst-use.ll
index d4e3fb3e248537..0d6eb7b5e08aae 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/external-non-inst-use.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/external-non-inst-use.ll
@@ -4,8 +4,9 @@
 define i16 @foo(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store i32 0, ptr [[P1:%.*]], align 1
-; CHECK-NEXT:    [[CONST_MAT:%.*]] = or i32 0, 0
+; CHECK-NEXT:    [[CONST:%.*]] = bitcast i32 0 to i32
+; CHECK-NEXT:    store i32 [[CONST]], ptr [[P1:%.*]], align 1
+; CHECK-NEXT:    [[CONST_MAT:%.*]] = or i32 [[CONST]], 0
 ; CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr [[P2:%.*]], align 1
 ; CHECK-NEXT:    ret i16 0
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
index 70cdd08548b2df..8f6d5d8f2d7ec8 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
@@ -1236,20 +1236,20 @@ define void @crash_no_tracked_instructions(ptr %arg, ptr %arg.2, ptr %arg.3, i1
 ; CHECK:       bb22:
 ; CHECK-NEXT:    [[T23:%.*]] = fmul float [[T20]], 9.900000e+01
 ; CHECK-NEXT:    [[T25:%.*]] = getelementptr inbounds float, ptr [[T19]], i64 2
+; CHECK-NEXT:    [[T26:%.*]] = fmul float [[T23]], 1.000000e+01
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[T23]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], <float 9.900000e+01, float 1.000000e+01>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; CHECK-NEXT:    store float [[TMP4]], ptr [[T25]], align 4
+; CHECK-NEXT:    store float [[T26]], ptr [[T25]], align 4
 ; CHECK-NEXT:    [[T27:%.*]] = load float, ptr [[ARG_2:%.*]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x float> [[TMP3]], <float 2.000000e+01, float 2.000000e+01>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x float> [[TMP3]], <float 2.000000e+01, float 2.000000e+01>
 ; CHECK-NEXT:    br label [[BB30]]
 ; CHECK:       bb30:
-; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x float> [ [[TMP5]], [[BB22]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi <2 x float> [ [[TMP4]], [[BB22]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[BB36:%.*]]
 ; CHECK:       bb36:
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], <float 3.000000e+00, float 3.000000e+00>
-; CHECK-NEXT:    store <2 x float> [[TMP7]], ptr [[ARG_3]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x float> [[TMP5]], <float 3.000000e+00, float 3.000000e+00>
+; CHECK-NEXT:    store <2 x float> [[TMP6]], ptr [[ARG_3]], align 4
 ; CHECK-NEXT:    br label [[BB41:%.*]]
 ; CHECK:       bb41:
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
index f85f658fed4d5c..d89d6286703605 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
@@ -29,151 +29,215 @@ define i64 @straight(ptr nocapture noundef readonly %p, i32 noundef %st) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2
 ; CHECK-NEXT:    [[ADD_PTR_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i16>, ptr [[ADD_PTR_6]], align 2
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <64 x i16> [[TMP8]], <64 x i16> [[TMP9]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <64 x i16> [[TMP10]], <64 x i16> [[TMP11]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <64 x i16> [[TMP12]], <64 x i16> [[TMP13]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <64 x i16> [[TMP14]], <64 x i16> [[TMP15]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <64 x i16> [[TMP16]], <64 x i16> [[TMP17]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <64 x i16> [[TMP18]], <64 x i16> [[TMP19]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <64 x i16> [[TMP20]], <64 x i16> [[TMP21]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71>
-; CHECK-NEXT:    [[TMP23:%.*]] = zext <64 x i16> [[TMP22]] to <64 x i32>
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <64 x i32> [[TMP23]], i32 0
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <64 x i32> [[TMP23]], i32 1
-; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i32 [[TMP24]], [[TMP25]]
-; CHECK-NEXT:    [[TMP26:%.*]] = mul nuw nsw <64 x i32> [[TMP23]], [[TMP23]]
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <64 x i32> [[TMP23]], i32 2
-; CHECK-NEXT:    [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[TMP27]]
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <64 x i32> [[TMP23]], i32 3
-; CHECK-NEXT:    [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[TMP28]]
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <64 x i32> [[TMP23]], i32 4
-; CHECK-NEXT:    [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[TMP29]]
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <64 x i32> [[TMP23]], i32 5
-; CHECK-NEXT:    [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[TMP30]]
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <64 x i32> [[TMP23]], i32 6
-; CHECK-NEXT:    [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[TMP31]]
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <64 x i32> [[TMP23]], i32 7
-; CHECK-NEXT:    [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[TMP32]]
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <64 x i32> [[TMP23]], i32 8
-; CHECK-NEXT:    [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[TMP33]]
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <64 x i32> [[TMP23]], i32 9
-; CHECK-NEXT:    [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[TMP34]]
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <64 x i32> [[TMP23]], i32 10
-; CHECK-NEXT:    [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[TMP35]]
-; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <64 x i32> [[TMP23]], i32 11
-; CHECK-NEXT:    [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[TMP36]]
-; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <64 x i32> [[TMP23]], i32 12
-; CHECK-NEXT:    [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[TMP37]]
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <64 x i32> [[TMP23]], i32 13
-; CHECK-NEXT:    [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[TMP38]]
-; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <64 x i32> [[TMP23]], i32 14
-; CHECK-NEXT:    [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[TMP39]]
-; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <64 x i32> [[TMP23]], i32 15
-; CHECK-NEXT:    [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[TMP40]]
-; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <64 x i32> [[TMP23]], i32 16
-; CHECK-NEXT:    [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[TMP41]]
-; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <64 x i32> [[TMP23]], i32 17
-; CHECK-NEXT:    [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[TMP42]]
-; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <64 x i32> [[TMP23]], i32 18
-; CHECK-NEXT:    [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[TMP43]]
-; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <64 x i32> [[TMP23]], i32 19
-; CHECK-NEXT:    [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[TMP44]]
-; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <64 x i32> [[TMP23]], i32 20
-; CHECK-NEXT:    [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[TMP45]]
-; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <64 x i32> [[TMP23]], i32 21
-; CHECK-NEXT:    [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[TMP46]]
-; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <64 x i32> [[TMP23]], i32 22
-; CHECK-NEXT:    [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[TMP47]]
-; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <64 x i32> [[TMP23]], i32 23
-; CHECK-NEXT:    [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[TMP48]]
-; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <64 x i32> [[TMP23]], i32 24
-; CHECK-NEXT:    [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[TMP49]]
-; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <64 x i32> [[TMP23]], i32 25
-; CHECK-NEXT:    [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[TMP50]]
-; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <64 x i32> [[TMP23]], i32 26
-; CHECK-NEXT:    [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[TMP51]]
-; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <64 x i32> [[TMP23]], i32 27
-; CHECK-NEXT:    [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[TMP52]]
-; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <64 x i32> [[TMP23]], i32 28
-; CHECK-NEXT:    [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[TMP53]]
-; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <64 x i32> [[TMP23]], i32 29
-; CHECK-NEXT:    [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[TMP54]]
-; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <64 x i32> [[TMP23]], i32 30
-; CHECK-NEXT:    [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[TMP55]]
-; CHECK-NEXT:    [[TMP56:%.*]] = extractelement <64 x i32> [[TMP23]], i32 31
-; CHECK-NEXT:    [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[TMP56]]
-; CHECK-NEXT:    [[TMP57:%.*]] = extractelement <64 x i32> [[TMP23]], i32 32
-; CHECK-NEXT:    [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[TMP57]]
-; CHECK-NEXT:    [[TMP58:%.*]] = extractelement <64 x i32> [[TMP23]], i32 33
-; CHECK-NEXT:    [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[TMP58]]
-; CHECK-NEXT:    [[TMP59:%.*]] = extractelement <64 x i32> [[TMP23]], i32 34
-; CHECK-NEXT:    [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[TMP59]]
-; CHECK-NEXT:    [[TMP60:%.*]] = extractelement <64 x i32> [[TMP23]], i32 35
-; CHECK-NEXT:    [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[TMP60]]
-; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <64 x i32> [[TMP23]], i32 36
-; CHECK-NEXT:    [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[TMP61]]
-; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <64 x i32> [[TMP23]], i32 37
-; CHECK-NEXT:    [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[TMP62]]
-; CHECK-NEXT:    [[TMP63:%.*]] = extractelement <64 x i32> [[TMP23]], i32 38
-; CHECK-NEXT:    [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[TMP63]]
-; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <64 x i32> [[TMP23]], i32 39
-; CHECK-NEXT:    [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[TMP64]]
-; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <64 x i32> [[TMP23]], i32 40
-; CHECK-NEXT:    [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[TMP65]]
-; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <64 x i32> [[TMP23]], i32 41
-; CHECK-NEXT:    [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[TMP66]]
-; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <64 x i32> [[TMP23]], i32 42
-; CHECK-NEXT:    [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[TMP67]]
-; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <64 x i32> [[TMP23]], i32 43
-; CHECK-NEXT:    [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[TMP68]]
-; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <64 x i32> [[TMP23]], i32 44
-; CHECK-NEXT:    [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[TMP69]]
-; CHECK-NEXT:    [[TMP70:%.*]] = extractelement <64 x i32> [[TMP23]], i32 45
-; CHECK-NEXT:    [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[TMP70]]
-; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <64 x i32> [[TMP23]], i32 46
-; CHECK-NEXT:    [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[TMP71]]
-; CHECK-NEXT:    [[TMP72:%.*]] = extractelement <64 x i32> [[TMP23]], i32 47
-; CHECK-NEXT:    [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[TMP72]]
-; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <64 x i32> [[TMP23]], i32 48
-; CHECK-NEXT:    [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[TMP73]]
-; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <64 x i32> [[TMP23]], i32 49
-; CHECK-NEXT:    [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[TMP74]]
-; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <64 x i32> [[TMP23]], i32 50
-; CHECK-NEXT:    [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[TMP75]]
-; CHECK-NEXT:    [[TMP76:%.*]] = extractelement <64 x i32> [[TMP23]], i32 51
-; CHECK-NEXT:    [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[TMP76]]
-; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <64 x i32> [[TMP23]], i32 52
-; CHECK-NEXT:    [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[TMP77]]
-; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <64 x i32> [[TMP23]], i32 53
-; CHECK-NEXT:    [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[TMP78]]
-; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <64 x i32> [[TMP23]], i32 54
-; CHECK-NEXT:    [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[TMP79]]
-; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <64 x i32> [[TMP23]], i32 55
-; CHECK-NEXT:    [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[TMP80]]
-; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <64 x i32> [[TMP23]], i32 56
-; CHECK-NEXT:    [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[TMP81]]
-; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <64 x i32> [[TMP23]], i32 57
-; CHECK-NEXT:    [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[TMP82]]
-; CHECK-NEXT:    [[TMP83:%.*]] = extractelement <64 x i32> [[TMP23]], i32 58
-; CHECK-NEXT:    [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[TMP83]]
-; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <64 x i32> [[TMP23]], i32 59
-; CHECK-NEXT:    [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[TMP84]]
-; CHECK-NEXT:    [[TMP85:%.*]] = extractelement <64 x i32> [[TMP23]], i32 60
-; CHECK-NEXT:    [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[TMP85]]
-; CHECK-NEXT:    [[TMP86:%.*]] = extractelement <64 x i32> [[TMP23]], i32 61
-; CHECK-NEXT:    [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[TMP86]]
-; CHECK-NEXT:    [[TMP87:%.*]] = extractelement <64 x i32> [[TMP23]], i32 62
-; CHECK-NEXT:    [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[TMP87]]
-; CHECK-NEXT:    [[TMP88:%.*]] = extractelement <64 x i32> [[TMP23]], i32 63
-; CHECK-NEXT:    [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[TMP88]]
-; CHECK-NEXT:    [[TMP89:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP26]])
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7
+; CHECK-NEXT:    [[CONV_7_7:%.*]] = zext i16 [[TMP8]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <64 x i16> [[TMP9]], <64 x i16> [[TMP10]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <64 x i16> [[TMP11]], <64 x i16> [[TMP12]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <64 x i16> [[TMP13]], <64 x i16> [[TMP14]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <64 x i16> [[TMP15]], <64 x i16> [[TMP16]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <64 x i16> [[TMP17]], <64 x i16> [[TMP18]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <64 x i16> [[TMP19]], <64 x i16> [[TMP20]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <64 x i16> [[TMP21]], <64 x i16> [[TMP22]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71>
+; CHECK-NEXT:    [[TMP24:%.*]] = zext <64 x i16> [[TMP23]] to <64 x i32>
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6
+; CHECK-NEXT:    [[CONV_6_7:%.*]] = zext i16 [[TMP25]] to i32
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5
+; CHECK-NEXT:    [[CONV_5_7:%.*]] = zext i16 [[TMP26]] to i32
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4
+; CHECK-NEXT:    [[CONV_4_7:%.*]] = zext i16 [[TMP27]] to i32
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3
+; CHECK-NEXT:    [[CONV_3_7:%.*]] = zext i16 [[TMP28]] to i32
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2
+; CHECK-NEXT:    [[CONV_2_7:%.*]] = zext i16 [[TMP29]] to i32
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1
+; CHECK-NEXT:    [[CONV_1_7:%.*]] = zext i16 [[TMP30]] to i32
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0
+; CHECK-NEXT:    [[CONV_764:%.*]] = zext i16 [[TMP31]] to i32
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7
+; CHECK-NEXT:    [[CONV_7_6:%.*]] = zext i16 [[TMP32]] to i32
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6
+; CHECK-NEXT:    [[CONV_6_6:%.*]] = zext i16 [[TMP33]] to i32
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5
+; CHECK-NEXT:    [[CONV_5_6:%.*]] = zext i16 [[TMP34]] to i32
+; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4
+; CHECK-NEXT:    [[CONV_4_6:%.*]] = zext i16 [[TMP35]] to i32
+; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 3
+; CHECK-NEXT:    [[CONV_3_6:%.*]] = zext i16 [[TMP36]] to i32
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2
+; CHECK-NEXT:    [[CONV_2_6:%.*]] = zext i16 [[TMP37]] to i32
+; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1
+; CHECK-NEXT:    [[CONV_1_6:%.*]] = zext i16 [[TMP38]] to i32
+; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0
+; CHECK-NEXT:    [[CONV_660:%.*]] = zext i16 [[TMP39]] to i32
+; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7
+; CHECK-NEXT:    [[CONV_7_5:%.*]] = zext i16 [[TMP40]] to i32
+; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6
+; CHECK-NEXT:    [[CONV_6_5:%.*]] = zext i16 [[TMP41]] to i32
+; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5
+; CHECK-NEXT:    [[CONV_5_5:%.*]] = zext i16 [[TMP42]] to i32
+; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4
+; CHECK-NEXT:    [[CONV_4_5:%.*]] = zext i16 [[TMP43]] to i32
+; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3
+; CHECK-NEXT:    [[CONV_3_5:%.*]] = zext i16 [[TMP44]] to i32
+; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <8 x i16> [[TMP5]], i32 2
+; CHECK-NEXT:    [[CONV_2_5:%.*]] = zext i16 [[TMP45]] to i32
+; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1
+; CHECK-NEXT:    [[CONV_1_5:%.*]] = zext i16 [[TMP46]] to i32
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0
+; CHECK-NEXT:    [[CONV_556:%.*]] = zext i16 [[TMP47]] to i32
+; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7
+; CHECK-NEXT:    [[CONV_7_4:%.*]] = zext i16 [[TMP48]] to i32
+; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6
+; CHECK-NEXT:    [[CONV_6_4:%.*]] = zext i16 [[TMP49]] to i32
+; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5
+; CHECK-NEXT:    [[CONV_5_4:%.*]] = zext i16 [[TMP50]] to i32
+; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4
+; CHECK-NEXT:    [[CONV_4_4:%.*]] = zext i16 [[TMP51]] to i32
+; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3
+; CHECK-NEXT:    [[CONV_3_4:%.*]] = zext i16 [[TMP52]] to i32
+; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2
+; CHECK-NEXT:    [[CONV_2_4:%.*]] = zext i16 [[TMP53]] to i32
+; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1
+; CHECK-NEXT:    [[CONV_1_4:%.*]] = zext i16 [[TMP54]] to i32
+; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0
+; CHECK-NEXT:    [[CONV_452:%.*]] = zext i16 [[TMP55]] to i32
+; CHECK-NEXT:    [[TMP56:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
+; CHECK-NEXT:    [[CONV_7_3:%.*]] = zext i16 [[TMP56]] to i32
+; CHECK-NEXT:    [[TMP57:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
+; CHECK-NEXT:    [[CONV_6_3:%.*]] = zext i16 [[TMP57]] to i32
+; CHECK-NEXT:    [[TMP58:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
+; CHECK-NEXT:    [[CONV_5_3:%.*]] = zext i16 [[TMP58]] to i32
+; CHECK-NEXT:    [[TMP59:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
+; CHECK-NEXT:    [[CONV_4_3:%.*]] = zext i16 [[TMP59]] to i32
+; CHECK-NEXT:    [[TMP60:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+; CHECK-NEXT:    [[CONV_3_3:%.*]] = zext i16 [[TMP60]] to i32
+; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
+; CHECK-NEXT:    [[CONV_2_3:%.*]] = zext i16 [[TMP61]] to i32
+; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; CHECK-NEXT:    [[CONV_1_3:%.*]] = zext i16 [[TMP62]] to i32
+; CHECK-NEXT:    [[TMP63:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT:    [[CONV_348:%.*]] = zext i16 [[TMP63]] to i32
+; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+; CHECK-NEXT:    [[CONV_7_2:%.*]] = zext i16 [[TMP64]] to i32
+; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6
+; CHECK-NEXT:    [[CONV_6_2:%.*]] = zext i16 [[TMP65]] to i32
+; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5
+; CHECK-NEXT:    [[CONV_5_2:%.*]] = zext i16 [[TMP66]] to i32
+; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4
+; CHECK-NEXT:    [[CONV_4_2:%.*]] = zext i16 [[TMP67]] to i32
+; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3
+; CHECK-NEXT:    [[CONV_3_2:%.*]] = zext i16 [[TMP68]] to i32
+; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2
+; CHECK-NEXT:    [[CONV_2_2:%.*]] = zext i16 [[TMP69]] to i32
+; CHECK-NEXT:    [[TMP70:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1
+; CHECK-NEXT:    [[CONV_1_2:%.*]] = zext i16 [[TMP70]] to i32
+; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0
+; CHECK-NEXT:    [[CONV_244:%.*]] = zext i16 [[TMP71]] to i32
+; CHECK-NEXT:    [[TMP72:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+; CHECK-NEXT:    [[CONV_7_1:%.*]] = zext i16 [[TMP72]] to i32
+; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6
+; CHECK-NEXT:    [[CONV_6_1:%.*]] = zext i16 [[TMP73]] to i32
+; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5
+; CHECK-NEXT:    [[CONV_5_1:%.*]] = zext i16 [[TMP74]] to i32
+; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4
+; CHECK-NEXT:    [[CONV_4_1:%.*]] = zext i16 [[TMP75]] to i32
+; CHECK-NEXT:    [[TMP76:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3
+; CHECK-NEXT:    [[CONV_3_1:%.*]] = zext i16 [[TMP76]] to i32
+; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2
+; CHECK-NEXT:    [[CONV_2_1:%.*]] = zext i16 [[TMP77]] to i32
+; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1
+; CHECK-NEXT:    [[CONV_1_1:%.*]] = zext i16 [[TMP78]] to i32
+; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0
+; CHECK-NEXT:    [[CONV_140:%.*]] = zext i16 [[TMP79]] to i32
+; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <8 x i16> [[TMP0]], i32 7
+; CHECK-NEXT:    [[CONV_7:%.*]] = zext i16 [[TMP80]] to i32
+; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <8 x i16> [[TMP0]], i32 6
+; CHECK-NEXT:    [[CONV_6:%.*]] = zext i16 [[TMP81]] to i32
+; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <8 x i16> [[TMP0]], i32 5
+; CHECK-NEXT:    [[CONV_5:%.*]] = zext i16 [[TMP82]] to i32
+; CHECK-NEXT:    [[TMP83:%.*]] = extractelement <8 x i16> [[TMP0]], i32 4
+; CHECK-NEXT:    [[CONV_4:%.*]] = zext i16 [[TMP83]] to i32
+; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <8 x i16> [[TMP0]], i32 3
+; CHECK-NEXT:    [[CONV_3:%.*]] = zext i16 [[TMP84]] to i32
+; CHECK-NEXT:    [[TMP85:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
+; CHECK-NEXT:    [[CONV_2:%.*]] = zext i16 [[TMP85]] to i32
+; CHECK-NEXT:    [[TMP86:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
+; CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP86]] to i32
+; CHECK-NEXT:    [[TMP87:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
+; CHECK-NEXT:    [[CONV_1:%.*]] = zext i16 [[TMP87]] to i32
+; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i32 [[CONV]], [[CONV_1]]
+; CHECK-NEXT:    [[TMP88:%.*]] = mul nuw nsw <64 x i32> [[TMP24]], [[TMP24]]
+; CHECK-NEXT:    [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[CONV_2]]
+; CHECK-NEXT:    [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[CONV_3]]
+; CHECK-NEXT:    [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[CONV_4]]
+; CHECK-NEXT:    [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[CONV_5]]
+; CHECK-NEXT:    [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[CONV_6]]
+; CHECK-NEXT:    [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[CONV_7]]
+; CHECK-NEXT:    [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[CONV_140]]
+; CHECK-NEXT:    [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[CONV_1_1]]
+; CHECK-NEXT:    [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[CONV_2_1]]
+; CHECK-NEXT:    [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[CONV_3_1]]
+; CHECK-NEXT:    [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[CONV_4_1]]
+; CHECK-NEXT:    [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[CONV_5_1]]
+; CHECK-NEXT:    [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[CONV_6_1]]
+; CHECK-NEXT:    [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[CONV_7_1]]
+; CHECK-NEXT:    [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[CONV_244]]
+; CHECK-NEXT:    [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[CONV_1_2]]
+; CHECK-NEXT:    [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[CONV_2_2]]
+; CHECK-NEXT:    [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[CONV_3_2]]
+; CHECK-NEXT:    [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[CONV_4_2]]
+; CHECK-NEXT:    [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[CONV_5_2]]
+; CHECK-NEXT:    [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[CONV_6_2]]
+; CHECK-NEXT:    [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[CONV_7_2]]
+; CHECK-NEXT:    [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[CONV_348]]
+; CHECK-NEXT:    [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[CONV_1_3]]
+; CHECK-NEXT:    [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[CONV_2_3]]
+; CHECK-NEXT:    [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[CONV_3_3]]
+; CHECK-NEXT:    [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[CONV_4_3]]
+; CHECK-NEXT:    [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[CONV_5_3]]
+; CHECK-NEXT:    [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[CONV_6_3]]
+; CHECK-NEXT:    [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[CONV_7_3]]
+; CHECK-NEXT:    [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[CONV_452]]
+; CHECK-NEXT:    [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[CONV_1_4]]
+; CHECK-NEXT:    [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[CONV_2_4]]
+; CHECK-NEXT:    [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[CONV_3_4]]
+; CHECK-NEXT:    [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[CONV_4_4]]
+; CHECK-NEXT:    [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[CONV_5_4]]
+; CHECK-NEXT:    [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[CONV_6_4]]
+; CHECK-NEXT:    [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[CONV_7_4]]
+; CHECK-NEXT:    [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[CONV_556]]
+; CHECK-NEXT:    [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[CONV_1_5]]
+; CHECK-NEXT:    [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[CONV_2_5]]
+; CHECK-NEXT:    [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[CONV_3_5]]
+; CHECK-NEXT:    [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[CONV_4_5]]
+; CHECK-NEXT:    [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[CONV_5_5]]
+; CHECK-NEXT:    [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[CONV_6_5]]
+; CHECK-NEXT:    [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[CONV_7_5]]
+; CHECK-NEXT:    [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[CONV_660]]
+; CHECK-NEXT:    [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[CONV_1_6]]
+; CHECK-NEXT:    [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[CONV_2_6]]
+; CHECK-NEXT:    [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[CONV_3_6]]
+; CHECK-NEXT:    [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[CONV_4_6]]
+; CHECK-NEXT:    [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[CONV_5_6]]
+; CHECK-NEXT:    [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[CONV_6_6]]
+; CHECK-NEXT:    [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[CONV_7_6]]
+; CHECK-NEXT:    [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[CONV_764]]
+; CHECK-NEXT:    [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[CONV_1_7]]
+; CHECK-NEXT:    [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[CONV_2_7]]
+; CHECK-NEXT:    [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[CONV_3_7]]
+; CHECK-NEXT:    [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[CONV_4_7]]
+; CHECK-NEXT:    [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[CONV_5_7]]
+; CHECK-NEXT:    [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[CONV_6_7]]
+; CHECK-NEXT:    [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[CONV_7_7]]
+; CHECK-NEXT:    [[TMP89:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP88]])
 ; CHECK-NEXT:    [[CONV15:%.*]] = zext i32 [[ADD_7_7]] to i64
 ; CHECK-NEXT:    [[CONV16:%.*]] = zext i32 [[TMP89]] to i64
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i64 [[CONV16]], 32

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
index c6209fd71063a0..6f6b66255a4340 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
@@ -6,7 +6,8 @@
 define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, ptr %arg5, i1 %arg6, i1 %arg7, i1 %arg8) {
 ; CHECK-LABEL: @zot(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float 0.000000e+00, float poison, float poison, float poison>, float [[ARG:%.*]], i32 1
+; CHECK-NEXT:    [[VAL9:%.*]] = fmul fast float 0.000000e+00, [[ARG:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float 0.000000e+00, float poison, float poison, float poison>, float [[ARG]], i32 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]]
@@ -24,18 +25,17 @@ define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float
 ; CHECK-NEXT:    br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]]
 ; CHECK:       bb25:
 ; CHECK-NEXT:    [[TMP10:%.*]] = phi <4 x float> [ [[TMP7]], [[BB18]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
 ; CHECK-NEXT:    br label [[BB30:%.*]]
 ; CHECK:       bb30:
 ; CHECK-NEXT:    [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
-; CHECK-NEXT:    [[VAL32:%.*]] = phi float [ [[TMP11]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
-; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1
-; CHECK-NEXT:    [[TMP13:%.*]] = uitofp <4 x i8> [[TMP12]] to <4 x float>
-; CHECK-NEXT:    [[TMP14:%.*]] = fsub fast <4 x float> [[TMP13]], [[TMP3]]
-; CHECK-NEXT:    [[TMP15:%.*]] = fmul fast <4 x float> [[TMP14]], [[TMP10]]
-; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP15]])
+; CHECK-NEXT:    [[VAL32:%.*]] = phi float [ [[VAL9]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = uitofp <4 x i8> [[TMP11]] to <4 x float>
+; CHECK-NEXT:    [[TMP13:%.*]] = fsub fast <4 x float> [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul fast <4 x float> [[TMP13]], [[TMP10]]
+; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP14]])
 ; CHECK-NEXT:    [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]])
-; CHECK-NEXT:    [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[TMP16]])
+; CHECK-NEXT:    [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[TMP15]])
 ; CHECK-NEXT:    call void @ham(float [[VAL55]], float [[VAL56]])
 ; CHECK-NEXT:    br i1 [[ARG8:%.*]], label [[BB30]], label [[BB57]]
 ; CHECK:       bb57:

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
index 0783a28f56d850..e39cd8aaa111b1 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
@@ -7,9 +7,11 @@ define void @p(double %0) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison>, double [[TMP0]], i32 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[MUL16_150_1_I:%.*]] = fmul double 0.000000e+00, 0.000000e+00
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x double> zeroinitializer, [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <2 x i32> <i32 1, i32 7>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP14]], double [[MUL16_150_1_I]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> zeroinitializer, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul <4 x double> [[TMP4]], zeroinitializer

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
index 0b26c53ca45030..03f67ecb3e695a 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
@@ -7,19 +7,21 @@ define void @slp_not_profitable_with_fast_fmf(ptr %A, ptr %B) {
 ; CHECK-LABEL: @slp_not_profitable_with_fast_fmf(
 ; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[MUL_0:%.*]] = fmul fast float [[B_1]], [[A_0]]
 ; CHECK-NEXT:    [[B_0:%.*]] = load float, ptr [[B]], align 4
 ; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2
 ; CHECK-NEXT:    [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
-; CHECK-NEXT:    [[MUL_1:%.*]] = fmul fast float [[B_2]], [[B_0]]
-; CHECK-NEXT:    [[SUB:%.*]] = fsub fast float [[MUL_0]], [[MUL_1]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = fmul fast float [[B_0]], [[B_1]]
-; CHECK-NEXT:    [[MUL_3:%.*]] = fmul fast float [[B_2]], [[A_0]]
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[MUL_3]], [[MUL_2]]
-; CHECK-NEXT:    store float [[SUB]], ptr [[A]], align 4
-; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
-; CHECK-NEXT:    store float [[ADD]], ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <2 x float> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x float> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x float> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x float> [[TMP11]], ptr [[A]], align 4
 ; CHECK-NEXT:    store float [[B_2]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -46,19 +48,21 @@ define void @slp_not_profitable_with_reassoc_fmf(ptr %A, ptr %B) {
 ; CHECK-LABEL: @slp_not_profitable_with_reassoc_fmf(
 ; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[MUL_0:%.*]] = fmul reassoc float [[B_1]], [[A_0]]
 ; CHECK-NEXT:    [[B_0:%.*]] = load float, ptr [[B]], align 4
 ; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2
 ; CHECK-NEXT:    [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
-; CHECK-NEXT:    [[MUL_1:%.*]] = fmul float [[B_2]], [[B_0]]
-; CHECK-NEXT:    [[SUB:%.*]] = fsub reassoc float [[MUL_0]], [[MUL_1]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = fmul float [[B_0]], [[B_1]]
-; CHECK-NEXT:    [[MUL_3:%.*]] = fmul reassoc float [[B_2]], [[A_0]]
-; CHECK-NEXT:    [[ADD:%.*]] = fadd reassoc float [[MUL_3]], [[MUL_2]]
-; CHECK-NEXT:    store float [[SUB]], ptr [[A]], align 4
-; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
-; CHECK-NEXT:    store float [[ADD]], ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul reassoc <2 x float> [[TMP1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fsub reassoc <2 x float> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd reassoc <2 x float> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x float> [[TMP11]], ptr [[A]], align 4
 ; CHECK-NEXT:    store float [[B_2]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -86,19 +90,21 @@ define void @slp_profitable_missing_fmf_on_fadd_fsub(ptr %A, ptr %B) {
 ; CHECK-LABEL: @slp_profitable_missing_fmf_on_fadd_fsub(
 ; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[MUL_0:%.*]] = fmul fast float [[B_1]], [[A_0]]
 ; CHECK-NEXT:    [[B_0:%.*]] = load float, ptr [[B]], align 4
 ; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2
 ; CHECK-NEXT:    [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
-; CHECK-NEXT:    [[MUL_1:%.*]] = fmul fast float [[B_2]], [[B_0]]
-; CHECK-NEXT:    [[SUB:%.*]] = fsub float [[MUL_0]], [[MUL_1]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = fmul fast float [[B_0]], [[B_1]]
-; CHECK-NEXT:    [[MUL_3:%.*]] = fmul fast float [[B_2]], [[A_0]]
-; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[MUL_3]], [[MUL_2]]
-; CHECK-NEXT:    store float [[SUB]], ptr [[A]], align 4
-; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
-; CHECK-NEXT:    store float [[ADD]], ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <2 x float> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fsub <2 x float> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <2 x float> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x float> [[TMP11]], ptr [[A]], align 4
 ; CHECK-NEXT:    store float [[B_2]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -126,19 +132,21 @@ define void @slp_profitable_missing_fmf_on_fmul_fadd_fsub(ptr %A, ptr %B) {
 ; CHECK-LABEL: @slp_profitable_missing_fmf_on_fmul_fadd_fsub(
 ; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[MUL_0:%.*]] = fmul float [[B_1]], [[A_0]]
 ; CHECK-NEXT:    [[B_0:%.*]] = load float, ptr [[B]], align 4
 ; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2
 ; CHECK-NEXT:    [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
-; CHECK-NEXT:    [[MUL_1:%.*]] = fmul float [[B_2]], [[B_0]]
-; CHECK-NEXT:    [[SUB:%.*]] = fsub float [[MUL_0]], [[MUL_1]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = fmul float [[B_0]], [[B_1]]
-; CHECK-NEXT:    [[MUL_3:%.*]] = fmul float [[B_2]], [[A_0]]
-; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[MUL_3]], [[MUL_2]]
-; CHECK-NEXT:    store float [[SUB]], ptr [[A]], align 4
-; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
-; CHECK-NEXT:    store float [[ADD]], ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x float> [[TMP1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fsub <2 x float> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <2 x float> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x float> [[TMP11]], ptr [[A]], align 4
 ; CHECK-NEXT:    store float [[B_2]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -166,19 +174,21 @@ define void @slp_profitable_missing_fmf_nnans_only(ptr %A, ptr %B) {
 ; CHECK-LABEL: @slp_profitable_missing_fmf_nnans_only(
 ; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[MUL_0:%.*]] = fmul nnan float [[B_1]], [[A_0]]
 ; CHECK-NEXT:    [[B_0:%.*]] = load float, ptr [[B]], align 4
 ; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2
 ; CHECK-NEXT:    [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
-; CHECK-NEXT:    [[MUL_1:%.*]] = fmul nnan float [[B_2]], [[B_0]]
-; CHECK-NEXT:    [[SUB:%.*]] = fsub nnan float [[MUL_0]], [[MUL_1]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = fmul nnan float [[B_0]], [[B_1]]
-; CHECK-NEXT:    [[MUL_3:%.*]] = fmul nnan float [[B_2]], [[A_0]]
-; CHECK-NEXT:    [[ADD:%.*]] = fadd nnan float [[MUL_3]], [[MUL_2]]
-; CHECK-NEXT:    store float [[SUB]], ptr [[A]], align 4
-; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
-; CHECK-NEXT:    store float [[ADD]], ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul nnan <2 x float> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul nnan <2 x float> [[TMP1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fsub nnan <2 x float> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd nnan <2 x float> [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x float> [[TMP11]], ptr [[A]], align 4
 ; CHECK-NEXT:    store float [[B_2]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
index cd4aa9a73dba21..1bd63b79b0f5ca 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
@@ -124,18 +124,18 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_1(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i64 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[V0]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[V1]], i64 1
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP0_1]], i64 0
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]]
-; CHECK-NEXT:    ret <4 x i32> [[TMP11]]
+; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i64 1
+; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0_1]], i64 0
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP9]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
   %v0.1 = extractelement <2 x i32> %v0, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
index 1e0245812d8d7d..f99f6ecd333823 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -124,18 +124,18 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_1(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i64 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[V0]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[V1]], i64 1
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP0_1]], i64 0
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]]
-; CHECK-NEXT:    ret <4 x i32> [[TMP11]]
+; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i64 1
+; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0_1]], i64 0
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP9]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
   %v0.1 = extractelement <2 x i32> %v0, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll
index 55504985d9a6fe..e2d1a29ee22de4 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll
@@ -12,13 +12,13 @@ define void @noop_extracts_first_2_lanes(ptr %ptr.1, ptr %ptr.2) {
 ; CHECK-LABEL: @noop_extracts_first_2_lanes(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[V_1:%.*]] = load <2 x double>, ptr [[PTR_1:%.*]], align 8
+; CHECK-NEXT:    [[V1_LANE_0:%.*]] = extractelement <2 x double> [[V_1]], i32 0
+; CHECK-NEXT:    [[V1_LANE_1:%.*]] = extractelement <2 x double> [[V_1]], i32 1
 ; CHECK-NEXT:    [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
 ; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <2 x double> [[V_1]], [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[V_1]], i32 0
-; CHECK-NEXT:    call void @use(double [[TMP2]])
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[V_1]], i32 1
-; CHECK-NEXT:    call void @use(double [[TMP3]])
+; CHECK-NEXT:    call void @use(double [[V1_LANE_0]])
+; CHECK-NEXT:    call void @use(double [[V1_LANE_1]])
 ; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr [[PTR_1]], align 8
 ; CHECK-NEXT:    ret void
 ;
@@ -127,14 +127,14 @@ define void @extract_reverse_order(ptr %ptr.1, ptr %ptr.2) {
 ; CHECK-LABEL: @extract_reverse_order(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[V_1:%.*]] = load <2 x double>, ptr [[PTR_1:%.*]], align 8
+; CHECK-NEXT:    [[V1_LANE_0:%.*]] = extractelement <2 x double> [[V_1]], i32 0
+; CHECK-NEXT:    [[V1_LANE_1:%.*]] = extractelement <2 x double> [[V_1]], i32 1
 ; CHECK-NEXT:    [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
 ; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> <i32 2, i32 2>
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <2 x double> [[V_1]], [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[V_1]], i32 0
-; CHECK-NEXT:    call void @use(double [[TMP3]])
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[V_1]], i32 1
-; CHECK-NEXT:    call void @use(double [[TMP4]])
+; CHECK-NEXT:    call void @use(double [[V1_LANE_0]])
+; CHECK-NEXT:    call void @use(double [[V1_LANE_1]])
 ; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[PTR_1]], align 8
 ; CHECK-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/RISCV/mixed-extracts-types.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/mixed-extracts-types.ll
index 0d5c644b9cc0fc..125fe69820d5c1 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/mixed-extracts-types.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/mixed-extracts-types.ll
@@ -9,10 +9,10 @@ define i32 @test() {
 ; CHECK-NEXT:    [[CONV5:%.*]] = sext i8 [[VECTOR_RECUR_EXTRACT]] to i32
 ; CHECK-NEXT:    store i32 [[CONV5]], ptr getelementptr ([0 x i32], ptr null, i64 0, i64 -14), align 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i8>, ptr getelementptr ([9 x i8], ptr null, i64 -2, i64 5), align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr getelementptr ([9 x i8], ptr null, i64 -2, i64 5), align 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <2 x i8> [[TMP0]], zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i16>
 ; CHECK-NEXT:    store <2 x i16> [[TMP2]], ptr getelementptr ([0 x i16], ptr null, i64 0, i64 -14), align 2
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP0]], i32 0
 ; CHECK-NEXT:    [[CONV5_1:%.*]] = sext i8 [[TMP3]] to i32
 ; CHECK-NEXT:    store i32 [[CONV5_1]], ptr getelementptr ([0 x i32], ptr null, i64 0, i64 -13), align 4
 ; CHECK-NEXT:    ret i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll
index c72d6cc75d827a..93f5b5e46d2c33 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll
@@ -7,6 +7,7 @@ define void @test() {
 ; CHECK-LABEL: define void @test
 ; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SUB4_I_I65_US:%.*]] = or i64 0, 1
 ; CHECK-NEXT:    br label [[BODY:%.*]]
 ; CHECK:       body:
 ; CHECK-NEXT:    [[ADD_I_I62_US:%.*]] = shl i64 0, 0
@@ -17,8 +18,7 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
 ; CHECK-NEXT:    [[CMP_I_I_I_I67_US:%.*]] = icmp slt i32 [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
-; CHECK-NEXT:    [[SPEC_SELECT_I_I68_US:%.*]] = select i1 false, i64 [[TMP6]], i64 0
+; CHECK-NEXT:    [[SPEC_SELECT_I_I68_US:%.*]] = select i1 false, i64 [[SUB4_I_I65_US]], i64 0
 ; CHECK-NEXT:    br label [[BODY]]
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-unsupported-type.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-unsupported-type.ll
index 4fd22639d63712..c0e1ab56c110bf 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-unsupported-type.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-unsupported-type.ll
@@ -5,10 +5,10 @@ define void @loads() {
 ; CHECK-LABEL: define void @loads(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x fp128>, ptr null, align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = fcmp une <2 x fp128> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x fp128>, ptr null, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp une <2 x fp128> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    call void null(i32 0, ptr null, i32 0)
-; CHECK-NEXT:    [[TMP2:%.*]] = fcmp une <2 x fp128> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp une <2 x fp128> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
index 473b37167409ec..14685fcca51077 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
@@ -50,15 +50,10 @@ define void @i64_simplifiedi_reversed(ptr noalias %st, ptr noalias %ld) {
 define void @i64_simplifiedi_extract(ptr noalias %st, ptr noalias %ld) {
 ; CHECK-LABEL: @i64_simplifiedi_extract(
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[LD:%.*]], i64 1
-; CHECK-NEXT:    [[T0:%.*]] = load i64, ptr [[LD]], align 8
 ; CHECK-NEXT:    [[T1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, ptr [[ST:%.*]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr [[ST]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, ptr [[ST]], i64 3
-; CHECK-NEXT:    store i64 [[T0]], ptr [[ST]], align 8
-; CHECK-NEXT:    store i64 [[T0]], ptr [[ARRAYIDX3]], align 8
-; CHECK-NEXT:    store i64 [[T0]], ptr [[ARRAYIDX4]], align 8
-; CHECK-NEXT:    store i64 [[T1]], ptr [[ARRAYIDX5]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[LD]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    store <4 x i64> [[TMP2]], ptr [[ST:%.*]], align 8
 ; CHECK-NEXT:    store i64 [[T1]], ptr [[LD]], align 8
 ; CHECK-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
index 5ec7aac8a79352..e9aa434dec03d7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
@@ -9,10 +9,11 @@ define void @mainTest(ptr %ptr) #0  {
 ; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP:%.*]], label [[BAIL_OUT:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[DUMMY_PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[OP_RDX3:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i32> [[TMP1]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP3]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
index 5c261d69cd53e8..143e09374a891b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
@@ -7,32 +7,32 @@ define void @Test(i32) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP9:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[LOCAL_8_43_US:%.*]] = phi i32 [ [[VAL_43:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[OP_RDX]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = and <2 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US]]
+; CHECK-NEXT:    [[VAL_43]] = add i32 [[LOCAL_8_43_US]], 14910
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0
+; CHECK-NEXT:    [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[VAL_43]], i32 1
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
 ; FORCE_REDUCTION-LABEL: @Test(
 ; FORCE_REDUCTION-NEXT:  entry:
 ; FORCE_REDUCTION-NEXT:    br label [[LOOP:%.*]]
 ; FORCE_REDUCTION:       loop:
-; FORCE_REDUCTION-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP9:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
+; FORCE_REDUCTION-NEXT:    [[LOCAL_8_43_US:%.*]] = phi i32 [ [[VAL_43:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
+; FORCE_REDUCTION-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ]
 ; FORCE_REDUCTION-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; FORCE_REDUCTION-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
 ; FORCE_REDUCTION-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
 ; FORCE_REDUCTION-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]]
-; FORCE_REDUCTION-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[OP_RDX]], i32 0
-; FORCE_REDUCTION-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
-; FORCE_REDUCTION-NEXT:    [[TMP7:%.*]] = and <2 x i32> [[TMP5]], [[TMP6]]
-; FORCE_REDUCTION-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[TMP5]], [[TMP6]]
-; FORCE_REDUCTION-NEXT:    [[TMP9]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], <2 x i32> <i32 0, i32 3>
+; FORCE_REDUCTION-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US]]
+; FORCE_REDUCTION-NEXT:    [[VAL_43]] = add i32 [[LOCAL_8_43_US]], 14910
+; FORCE_REDUCTION-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0
+; FORCE_REDUCTION-NEXT:    [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[VAL_43]], i32 1
 ; FORCE_REDUCTION-NEXT:    br label [[LOOP]]
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
index 2ea7f191947b43..194c7021f60f59 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
@@ -7,17 +7,17 @@ define void @mainTest(i32 %param, ptr %vals, i32 %len) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 31>, i32 [[PARAM:%.*]], i32 0
 ; CHECK-NEXT:    br label [[BCI_15:%.*]]
 ; CHECK:       bci_15:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i32> [[SHUFFLE]], <i32 -1, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0
-; CHECK-NEXT:    store atomic i32 [[TMP4]], ptr [[VALS:%.*]] unordered, align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP3]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP5]], [[TMP2]]
-; CHECK-NEXT:    [[V44:%.*]] = add i32 [[TMP2]], 16
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX]], i32 0
-; CHECK-NEXT:    [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[V44]], i32 1
+; CHECK-NEXT:    [[LOCAL_0_:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[BCI_15]] ], [ [[PARAM]], [[BCI_15_PREHEADER:%.*]] ]
+; CHECK-NEXT:    [[LOCAL_4_:%.*]] = phi i32 [ [[V44:%.*]], [[BCI_15]] ], [ 31, [[BCI_15_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i32> [[TMP2]], <i32 -1, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    store atomic i32 [[LOCAL_0_]], ptr [[VALS:%.*]] unordered, align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP3]])
+; CHECK-NEXT:    [[OP_RDX]] = and i32 [[TMP4]], [[LOCAL_4_]]
+; CHECK-NEXT:    [[V44]] = add i32 [[LOCAL_4_]], 16
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX]], i32 0
+; CHECK-NEXT:    [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[V44]], i32 1
 ; CHECK-NEXT:    br i1 true, label [[BCI_15]], label [[LOOPEXIT:%.*]]
 ; CHECK:       loopexit:
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
index 22cba328b180ad..ae6e6723706cd9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
@@ -80,11 +80,11 @@ declare i32 @printf(ptr nocapture, ...)
 
 define float @merge_anyof_v4f32_wrong_first(<4 x float> %x) {
 ; CHECK-LABEL: @merge_anyof_v4f32_wrong_first(
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
-; CHECK-NEXT:    [[CMP3WRONG:%.*]] = fcmp olt float [[TMP1]], 4.200000e+01
-; CHECK-NEXT:    [[TMP2:%.*]] = fcmp ogt <4 x float> [[X]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = or i1 [[TMP3]], [[CMP3WRONG]]
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
+; CHECK-NEXT:    [[CMP3WRONG:%.*]] = fcmp olt float [[X3]], 4.200000e+01
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ogt <4 x float> [[X]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = or i1 [[TMP2]], [[CMP3WRONG]]
 ; CHECK-NEXT:    [[R:%.*]] = select i1 [[OP_RDX]], float -1.000000e+00, float 1.000000e+00
 ; CHECK-NEXT:    ret float [[R]]
 ;
@@ -107,11 +107,11 @@ define float @merge_anyof_v4f32_wrong_first(<4 x float> %x) {
 
 define float @merge_anyof_v4f32_wrong_last(<4 x float> %x) {
 ; CHECK-LABEL: @merge_anyof_v4f32_wrong_last(
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
-; CHECK-NEXT:    [[CMP3WRONG:%.*]] = fcmp olt float [[TMP1]], 4.200000e+01
-; CHECK-NEXT:    [[TMP2:%.*]] = fcmp ogt <4 x float> [[X]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = or i1 [[TMP3]], [[CMP3WRONG]]
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
+; CHECK-NEXT:    [[CMP3WRONG:%.*]] = fcmp olt float [[X3]], 4.200000e+01
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ogt <4 x float> [[X]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = or i1 [[TMP2]], [[CMP3WRONG]]
 ; CHECK-NEXT:    [[R:%.*]] = select i1 [[OP_RDX]], float -1.000000e+00, float 1.000000e+00
 ; CHECK-NEXT:    ret float [[R]]
 ;
@@ -134,11 +134,11 @@ define float @merge_anyof_v4f32_wrong_last(<4 x float> %x) {
 
 define i32 @merge_anyof_v4i32_wrong_middle(<4 x i32> %x) {
 ; CHECK-LABEL: @merge_anyof_v4i32_wrong_middle(
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3
-; CHECK-NEXT:    [[CMP3WRONG:%.*]] = icmp slt i32 [[TMP1]], 42
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = or i1 [[TMP3]], [[CMP3WRONG]]
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3
+; CHECK-NEXT:    [[CMP3WRONG:%.*]] = icmp slt i32 [[X3]], 42
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[X]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = or i1 [[TMP2]], [[CMP3WRONG]]
 ; CHECK-NEXT:    [[R:%.*]] = select i1 [[OP_RDX]], i32 -1, i32 1
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
@@ -164,12 +164,12 @@ define i32 @merge_anyof_v4i32_wrong_middle(<4 x i32> %x) {
 
 define i32 @merge_anyof_v4i32_wrong_middle_better_rdx(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @merge_anyof_v4i32_wrong_middle_better_rdx(
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 3
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3
-; CHECK-NEXT:    [[CMP3WRONG:%.*]] = icmp slt i32 [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = or i1 [[TMP4]], [[CMP3WRONG]]
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3
+; CHECK-NEXT:    [[Y3:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 3
+; CHECK-NEXT:    [[CMP3WRONG:%.*]] = icmp slt i32 [[X3]], [[Y3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = or i1 [[TMP2]], [[CMP3WRONG]]
 ; CHECK-NEXT:    [[R:%.*]] = select i1 [[OP_RDX]], i32 -1, i32 1
 ; CHECK-NEXT:    ret i32 [[R]]
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll
index 5a0deddb9247c0..e3a860a4c6f062 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll
@@ -20,18 +20,23 @@ define void @bar() {
 ; CHECK-LABEL: @bar(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[I:%.*]] = getelementptr inbounds [[TMP0:%.*]], ptr undef, i64 0, i32 1, i32 0
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds [[TMP0]], ptr undef, i64 0, i32 1, i32 1
 ; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds [[TMP0]], ptr undef, i64 0, i32 1, i32 0
+; CHECK-NEXT:    [[I3:%.*]] = getelementptr inbounds [[TMP0]], ptr undef, i64 0, i32 1, i32 1
 ; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds [[TMP0]], ptr undef, i64 0, i32 1, i32 0
 ; CHECK-NEXT:    br label [[BB6:%.*]]
 ; CHECK:       bb6:
-; CHECK-NEXT:    [[TMP0]] = phi <2 x double> [ <double 1.800000e+01, double 2.800000e+01>, [[BB:%.*]] ], [ [[TMP3:%.*]], [[BB17:%.*]] ], [ [[TMP3]], [[BB16:%.*]] ], [ [[TMP3]], [[BB16]] ]
-; CHECK-NEXT:    store <2 x double> [[TMP0]], ptr [[I]], align 8
-; CHECK-NEXT:    [[TMP3]] = load <2 x double>, ptr [[I2]], align 8
+; CHECK-NEXT:    [[I7:%.*]] = phi double [ 2.800000e+01, [[BB:%.*]] ], [ [[I10:%.*]], [[BB17:%.*]] ], [ [[I10]], [[BB16:%.*]] ], [ [[I10]], [[BB16]] ]
+; CHECK-NEXT:    [[I8:%.*]] = phi double [ 1.800000e+01, [[BB]] ], [ [[TMP1:%.*]], [[BB17]] ], [ [[TMP1]], [[BB16]] ], [ [[TMP1]], [[BB16]] ]
+; CHECK-NEXT:    store double [[I8]], ptr [[I]], align 8
+; CHECK-NEXT:    store double [[I7]], ptr [[I1]], align 8
+; CHECK-NEXT:    [[I10]] = load double, ptr [[I3]], align 8
+; CHECK-NEXT:    [[TMP0]] = load <2 x double>, ptr [[I2]], align 8
 ; CHECK-NEXT:    br i1 undef, label [[BB11:%.*]], label [[BB12:%.*]]
 ; CHECK:       bb11:
 ; CHECK-NEXT:    ret void
 ; CHECK:       bb12:
-; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[I4]], align 8
+; CHECK-NEXT:    store <2 x double> [[TMP0]], ptr [[I4]], align 8
 ; CHECK-NEXT:    br i1 undef, label [[BB13:%.*]], label [[BB14:%.*]]
 ; CHECK:       bb13:
 ; CHECK-NEXT:    br label [[BB14]]
@@ -40,9 +45,10 @@ define void @bar() {
 ; CHECK:       bb15:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       bb16:
+; CHECK-NEXT:    [[TMP1]] = extractelement <2 x double> [[TMP0]], i32 0
 ; CHECK-NEXT:    switch i32 undef, label [[BB17]] [
-; CHECK-NEXT:    i32 32, label [[BB6]]
-; CHECK-NEXT:    i32 103, label [[BB6]]
+; CHECK-NEXT:      i32 32, label [[BB6]]
+; CHECK-NEXT:      i32 103, label [[BB6]]
 ; CHECK-NEXT:    ]
 ; CHECK:       bb17:
 ; CHECK-NEXT:    br i1 undef, label [[BB6]], label [[BB18:%.*]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/cse_extractelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse_extractelement.ll
index bcee81f901987d..73f9b42ee72b5f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cse_extractelement.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cse_extractelement.ll
@@ -7,17 +7,17 @@ define void @test(ptr %ptr, ptr noalias %s)  {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr [[PTR:%.*]], null
 ; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP:%.*]], label [[BAIL_OUT:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 4
-; CHECK-NEXT:    store <4 x i32> [[TMP1]], ptr [[S:%.*]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[PTR]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[PTR]], align 4
+; CHECK-NEXT:    store <4 x i32> [[TMP0]], ptr [[S:%.*]], align 4
 ; CHECK-NEXT:    br label [[LOOP1:%.*]]
 ; CHECK:       loop1:
-; CHECK-NEXT:    store i32 [[TMP3]], ptr [[S]], align 4
+; CHECK-NEXT:    store i32 [[TMP1]], ptr [[S]], align 4
 ; CHECK-NEXT:    br i1 true, label [[LOOP1]], label [[CONT:%.*]]
 ; CHECK:       cont:
 ; CHECK-NEXT:    br i1 true, label [[LOOP]], label [[BAIL_OUT]]
 ; CHECK:       bail_out:
-; CHECK-NEXT:    [[DUMMY_PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[TMP3]], [[CONT]] ]
+; CHECK-NEXT:    [[DUMMY_PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[TMP1]], [[CONT]] ]
 ; CHECK-NEXT:    store i32 [[DUMMY_PHI]], ptr [[S]], align 4
 ; CHECK-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll
index b2bcdb178b21be..1b34d31eb623d2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll
@@ -60,12 +60,12 @@ define i32 @extr_user(ptr noalias nocapture %B, ptr noalias nocapture %A, i32 %n
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[MUL238:%.*]] = add i32 [[M:%.*]], [[N:%.*]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP0]], [[TMP2]]
-; CHECK-NEXT:    store <4 x i32> [[TMP3]], ptr [[B:%.*]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
-; CHECK-NEXT:    ret i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i32> [[TMP0]], [[TMP3]]
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
 entry:
   %0 = load i32, ptr %A, align 4
@@ -95,13 +95,14 @@ define i32 @extr_user1(ptr noalias nocapture %B, ptr noalias nocapture %A, i32 %
 ; CHECK-LABEL: @extr_user1(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[MUL238:%.*]] = add i32 [[M:%.*]], [[N:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP0]], [[TMP2]]
-; CHECK-NEXT:    store <4 x i32> [[TMP3]], ptr [[B:%.*]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
-; CHECK-NEXT:    ret i32 [[TMP4]]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i32> [[TMP0]], [[TMP3]]
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
 entry:
   %0 = load i32, ptr %A, align 4

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll
index 84f7e219f50666..f58379b46dc19e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll
@@ -12,6 +12,7 @@ define i8 @test() {
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[TMP0]] to i32
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr @c, align 2
 ; CHECK-NEXT:    [[CONV1:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP13:%.*]] = or i32 [[CONV]], 32769
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[CONV]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 32769>
@@ -20,7 +21,6 @@ define i8 @test() {
 ; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <8 x i32> [[TMP4]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP8]])
 ; CHECK-NEXT:    [[CONV4_30:%.*]] = trunc i32 [[TMP11]] to i8
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP4]], i32 7
 ; CHECK-NEXT:    [[XOR_31:%.*]] = and i32 [[TMP13]], -2
 ; CHECK-NEXT:    store i32 [[XOR_31]], ptr @d, align 4
 ; CHECK-NEXT:    ret i8 [[CONV4_30]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll
index eb7498fea6f793..3b03ca13ea65d0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll
@@ -6,31 +6,30 @@ define i1 @test(float %0, double %1) {
 ; CHECK-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> <double 0.000000e+00, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 4, i32 poison, i32 2, i32 7>
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x double> [[TMP8]], double [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP7]], <2 x i32> <i32 1, i32 5>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x double> [[TMP10]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP11]], <4 x i32> <i32 2, i32 0, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP13:%.*]] = fmul <4 x double> [[TMP9]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP15]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x double> [[TMP14]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP17]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <8 x double> [[TMP18]], <8 x double> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP21:%.*]] = fsub <8 x double> [[TMP16]], [[TMP20]]
-; CHECK-NEXT:    [[TMP22:%.*]] = fmul <8 x double> [[TMP16]], [[TMP20]]
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <8 x double> [[TMP21]], <8 x double> [[TMP22]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP24:%.*]] = fptrunc <8 x double> [[TMP23]] to <8 x float>
-; CHECK-NEXT:    [[TMP25:%.*]] = fmul <8 x float> [[TMP24]], zeroinitializer
-; CHECK-NEXT:    [[TMP26:%.*]] = fcmp oeq <8 x float> [[TMP25]], zeroinitializer
-; CHECK-NEXT:    [[TMP27:%.*]] = freeze <8 x i1> [[TMP26]]
-; CHECK-NEXT:    [[TMP28:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP27]])
-; CHECK-NEXT:    ret i1 [[TMP28]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fpext float 0.000000e+00 to double
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x double> zeroinitializer, [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP8]], <4 x i32> <i32 2, i32 0, i32 5, i32 5>
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x double> <double 0.000000e+00, double poison, double poison, double 0.000000e+00>, double [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x double> [[TMP10]], double [[TMP5]], i32 2
+; CHECK-NEXT:    [[TMP12:%.*]] = fmul <4 x double> [[TMP9]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP14]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP16]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <8 x double> [[TMP17]], <8 x double> [[TMP18]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP20:%.*]] = fsub <8 x double> [[TMP15]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = fmul <8 x double> [[TMP15]], [[TMP19]]
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <8 x double> [[TMP20]], <8 x double> [[TMP21]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP23:%.*]] = fptrunc <8 x double> [[TMP22]] to <8 x float>
+; CHECK-NEXT:    [[TMP24:%.*]] = fmul <8 x float> [[TMP23]], zeroinitializer
+; CHECK-NEXT:    [[TMP25:%.*]] = fcmp oeq <8 x float> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP26:%.*]] = freeze <8 x i1> [[TMP25]]
+; CHECK-NEXT:    [[TMP27:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP26]])
+; CHECK-NEXT:    ret i1 [[TMP27]]
 ;
   %3 = fpext float %0 to double
   %4 = fpext float 0.000000e+00 to double

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll
index 1b54a604cd6f3a..f90456297d7cb9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll
@@ -6,8 +6,9 @@ define void @test() {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi <8 x i32> [ poison, [[BB10:%.*]] ], [ zeroinitializer, [[BB:%.*]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <2 x i32> <i32 7, i32 7>
+; CHECK-NEXT:    [[PHI7:%.*]] = phi i32 [ 0, [[BB10:%.*]] ], [ 0, [[BB:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <8 x i32> [ poison, [[BB10]] ], [ zeroinitializer, [[BB]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> <i32 poison, i32 undef>, i32 [[PHI7]], i32 0
 ; CHECK-NEXT:    switch i32 0, label [[BB16:%.*]] [
 ; CHECK-NEXT:      i32 0, label [[BB14:%.*]]
 ; CHECK-NEXT:      i32 1, label [[BB11:%.*]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-vectorized-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-vectorized-operand.ll
index f1a5709d07f021..7a860719505f05 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract-vectorized-operand.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-vectorized-operand.ll
@@ -4,19 +4,21 @@
 define void @test() {
 ; CHECK-LABEL: define void @test() {
 ; CHECK-NEXT:  [[BB:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x ptr addrspace(1)> zeroinitializer, <2 x ptr addrspace(1)> zeroinitializer, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x ptr addrspace(1)> [[TMP0]], i32 0
 ; CHECK-NEXT:    br label %[[BB43:.*]]
 ; CHECK:       [[BB20:.*]]:
 ; CHECK-NEXT:    br label %[[BB105:.*]]
 ; CHECK:       [[BB43]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x ptr addrspace(1)> [ [[TMP1:%.*]], %[[BB51:.*]] ], [ zeroinitializer, %[[BB]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x ptr addrspace(1)> [ [[TMP3:%.*]], %[[BB51:.*]] ], [ zeroinitializer, %[[BB]] ]
 ; CHECK-NEXT:    br i1 false, label %[[BB105]], label %[[BB51]]
 ; CHECK:       [[BB51]]:
-; CHECK-NEXT:    [[TMP1]] = phi <2 x ptr addrspace(1)> [ poison, %[[BB54:.*]] ], [ zeroinitializer, %[[BB43]] ]
+; CHECK-NEXT:    [[TMP3]] = phi <2 x ptr addrspace(1)> [ poison, %[[BB54:.*]] ], [ zeroinitializer, %[[BB43]] ]
 ; CHECK-NEXT:    br label %[[BB43]]
 ; CHECK:       [[BB54]]:
 ; CHECK-NEXT:    br label %[[BB51]]
 ; CHECK:       [[BB105]]:
-; CHECK-NEXT:    [[PHI106:%.*]] = phi ptr addrspace(1) [ null, %[[BB20]] ], [ null, %[[BB43]] ]
+; CHECK-NEXT:    [[PHI106:%.*]] = phi ptr addrspace(1) [ [[TMP1]], %[[BB20]] ], [ null, %[[BB43]] ]
 ; CHECK-NEXT:    ret void
 ;
 bb:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
index 0eb18239ae3fb6..6033e8def34367 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
@@ -89,10 +89,10 @@ define void @externally_used_ptrs() {
 ; CHECK-LABEL: @externally_used_ptrs(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr @a, align 8
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> <i64 56, i64 11>
-; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11
 ; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[ADD_PTR]], align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll
index ba406c8f20bb08..73b73735da0210 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll
@@ -9,13 +9,14 @@ define void @test(double %i) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> zeroinitializer, [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> <double 0.000000e+00, double poison>, double [[I]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> zeroinitializer, [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; CHECK-NEXT:    [[I75:%.*]] = fsub double 0.000000e+00, [[I]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP0]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP3]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP5]], <4 x i32> <i32 poison, i32 0, i32 2, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> [[TMP7]], <8 x i32> <i32 poison, i32 poison, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x double> [[TMP8]], <8 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison, double 0.000000e+00, double poison, double poison, double poison>, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 12, i32 5, i32 6, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP4]], i32 7
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> [[TMP7]], <8 x i32> <i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 5, i32 6, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x double> [[TMP28]], <8 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison, double 0.000000e+00, double poison, double poison, double poison>, <8 x i32> <i32 8, i32 9, i32 2, i32 poison, i32 12, i32 5, i32 6, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[I75]], i32 3
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 3>
 ; CHECK-NEXT:    [[TMP11:%.*]] = fmul <8 x double> zeroinitializer, [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = fadd <8 x double> zeroinitializer, [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = fadd <8 x double> [[TMP12]], zeroinitializer

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
index f2b1c78ce0aac9..aba45fe6bd5199 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
@@ -9,10 +9,11 @@ define void @foo(double %i) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> zeroinitializer, [[TMP0]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[I]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> zeroinitializer, [[TMP3]]
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP22]], <4 x i32> <i32 poison, i32 0, i32 5, i32 1>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> [[TMP5]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> <double 0.000000e+00, double poison, double poison, double poison, double 0.000000e+00, double poison, double poison, double poison>, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 12, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[I82:%.*]] = fsub double 0.000000e+00, poison
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <8 x i32> <i32 poison, i32 0, i32 poison, i32 1, i32 poison, i32 0, i32 poison, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x double> [[TMP8]], <8 x double> <double 0.000000e+00, double poison, double poison, double poison, double 0.000000e+00, double poison, double poison, double poison>, <8 x i32> <i32 8, i32 1, i32 poison, i32 3, i32 12, i32 5, i32 poison, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[I82]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 2, i32 7>
 ; CHECK-NEXT:    [[TMP12:%.*]] = fmul <8 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, [[TMP7]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = fadd <8 x double> zeroinitializer, [[TMP12]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = fadd <8 x double> [[TMP13]], zeroinitializer
@@ -26,6 +27,7 @@ define void @foo(double %i) {
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[TMP18]], i32 1
 ; CHECK-NEXT:    [[I118:%.*]] = fadd double [[TMP19]], [[TMP20]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = fmul <4 x double> zeroinitializer, [[TMP1]]
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x double> <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison>, <4 x double> [[TMP22]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
 ; CHECK-NEXT:    [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = fadd <4 x double> [[TMP24]], zeroinitializer

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll
index 0222e0aaeea3ed..783eca2221357a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll
@@ -4,11 +4,12 @@
 define double @test() {
 ; CHECK-LABEL: define double @test() {
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 6), align 16
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 5), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 9), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 8), align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison>, double [[TMP3]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP2]], i32 3
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call reassoc nsz double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP5]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul double [[TMP6]], 0.000000e+00

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/extracts-with-undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/extracts-with-undefs.ll
index abf277fb8ba348..b6de2d4fbcb11f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extracts-with-undefs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extracts-with-undefs.ll
@@ -7,7 +7,7 @@ define void @test() {
 ; CHECK-NEXT:    br label [[BODY:%.*]]
 ; CHECK:       body:
 ; CHECK-NEXT:    [[PHI1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0.000000e+00, [[BODY]] ]
-; CHECK-NEXT:    [[PHI2:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ 0.000000e+00, [[BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ zeroinitializer, [[BODY]] ]
 ; CHECK-NEXT:    [[MUL_I478_I:%.*]] = fmul fast double [[PHI1]], 0.000000e+00
 ; CHECK-NEXT:    [[MUL7_I485_I:%.*]] = fmul fast double undef, 0.000000e+00
 ; CHECK-NEXT:    [[ADD8_I_I:%.*]] = fadd fast double [[MUL_I478_I]], [[MUL7_I485_I]]
@@ -16,16 +16,15 @@ define void @test() {
 ; CHECK:       exit:
 ; CHECK-NEXT:    br i1 false, label [[IF_THEN135_I:%.*]], label [[IF_END209_I:%.*]]
 ; CHECK:       if.then135.i:
-; CHECK-NEXT:    [[CMP145_I:%.*]] = fcmp fast olt double [[PHI1]], 0.000000e+00
-; CHECK-NEXT:    [[CMP152_I:%.*]] = fcmp fast olt double [[PHI2]], 0.000000e+00
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i1> <i1 poison, i1 false>, i1 [[CMP152_I]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = select <2 x i1> [[TMP0]], <2 x double> zeroinitializer, <2 x double> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <2 x double> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x double> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp fast olt <2 x double> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i1> <i1 poison, i1 false>, <2 x i1> [[TMP1]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x double> zeroinitializer, <2 x double> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <2 x double> zeroinitializer, [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <2 x double> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    br label [[IF_END209_I]]
 ; CHECK:       if.end209.i:
-; CHECK-NEXT:    [[TMP5:%.*]] = phi <2 x double> [ [[TMP4]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = phi <2 x double> [ [[TMP6]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ]
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll b/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll
index f197b2480d61ca..fa33621de5ae76 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll
@@ -25,6 +25,7 @@ define i64 @foo() {
 ;
 ; FORCED-LABEL: define i64 @foo() {
 ; FORCED-NEXT:  bb:
+; FORCED-NEXT:    [[TMP8:%.*]] = add i64 0, 0
 ; FORCED-NEXT:    br label [[BB3:%.*]]
 ; FORCED:       bb1:
 ; FORCED-NEXT:    [[TMP0:%.*]] = phi <2 x i64> [ [[TMP5:%.*]], [[BB3]] ]
@@ -38,7 +39,6 @@ define i64 @foo() {
 ; FORCED-NEXT:    [[TMP5]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> <i32 0, i32 3>
 ; FORCED-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> <i64 poison, i64 0>, <2 x i32> <i32 0, i32 3>
 ; FORCED-NEXT:    [[TMP7]] = add <2 x i64> [[TMP6]], [[TMP2]]
-; FORCED-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1
 ; FORCED-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr i64, ptr addrspace(1) null, i64 [[TMP8]]
 ; FORCED-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
 ; FORCED-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[TMP9]], 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/gep-with-extractelement-many-users.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep-with-extractelement-many-users.ll
index 37d166953c3336..cea95c11024976 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gep-with-extractelement-many-users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gep-with-extractelement-many-users.ll
@@ -4,9 +4,7 @@
 define void @test() {
 ; CHECK-LABEL: define void @test() {
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x ptr> zeroinitializer, i32 0
-; CHECK-NEXT:    [[GETELEMENTPTR6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 872
-; CHECK-NEXT:    store double 0.000000e+00, ptr [[GETELEMENTPTR6]], align 8
+; CHECK-NEXT:    store double 0.000000e+00, ptr inttoptr (i64 872 to ptr), align 8
 ; CHECK-NEXT:    br label [[BB9:%.*]]
 ; CHECK:       bb9:
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x ptr> [ getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 32, i64 872>), [[BB:%.*]] ]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
index e94dd2119270ce..e0d7c12f70c2ef 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
@@ -14,6 +14,7 @@ define dso_local i32 @g() local_unnamed_addr {
 ; CHECK-NEXT:    [[A_020:%.*]] = phi ptr [ [[A_020_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x ptr> [ [[TMP14:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP2]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[A_020]], i64 1
@@ -32,7 +33,6 @@ define dso_local i32 @g() local_unnamed_addr {
 ; CHECK-NEXT:    br label [[WHILE_BODY_BACKEDGE]]
 ; CHECK:       sw.bb6:
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[A_020]], i64 2
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[INCDEC_PTR]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> <i64 2, i64 2>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
index 7ff4a1a231c222..69561785182154 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -968,11 +968,11 @@ define i32 @wobble(i32 %arg, i32 %bar) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[BAR:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = sext <4 x i1> [[TMP6]] to <4 x i32>
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[X4:%.*]] = xor i32 [[ARG]], [[BAR]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP7]], [[X4]]
 ; CHECK-NEXT:    [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[ARG]]
 ; CHECK-NEXT:    ret i32 [[OP_RDX1]]
 ;
@@ -983,11 +983,11 @@ define i32 @wobble(i32 %arg, i32 %bar) {
 ; THRESHOLD-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[BAR:%.*]], i32 0
 ; THRESHOLD-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; THRESHOLD-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[TMP3]]
-; THRESHOLD-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
-; THRESHOLD-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i32> [[TMP4]], zeroinitializer
-; THRESHOLD-NEXT:    [[TMP7:%.*]] = sext <4 x i1> [[TMP6]] to <4 x i32>
-; THRESHOLD-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
-; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP8]], [[TMP5]]
+; THRESHOLD-NEXT:    [[X4:%.*]] = xor i32 [[ARG]], [[BAR]]
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i32> [[TMP4]], zeroinitializer
+; THRESHOLD-NEXT:    [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
+; THRESHOLD-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
+; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP7]], [[X4]]
 ; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[ARG]]
 ; THRESHOLD-NEXT:    ret i32 [[OP_RDX1]]
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/insertelement-uses-vectorized-index.ll b/llvm/test/Transforms/SLPVectorizer/X86/insertelement-uses-vectorized-index.ll
index 799d0a055d5c41..78b3f8b1012845 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insertelement-uses-vectorized-index.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insertelement-uses-vectorized-index.ll
@@ -7,6 +7,7 @@ define void @test(ptr %0) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x ptr> <ptr null, ptr poison>, ptr [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint <2 x ptr> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr null to i64
 ; CHECK-NEXT:    [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
 ; CHECK-NEXT:    switch i32 0, label %[[NEWFUNCROOT994:.*]] [
 ; CHECK-NEXT:      i32 1, label %[[NEWFUNCROOT994]]
@@ -17,7 +18,6 @@ define void @test(ptr %0) {
 ; CHECK-NEXT:    ret void
 ; CHECK:       [[NEWFUNCROOT994]]:
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[TMP5]], i64 [[TMP6]]
 ; CHECK-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
index 2a9e40156420a9..5a28581913b8c7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -201,8 +201,9 @@ entry:
 define void @lookahead_external_uses(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr %Ext1, ptr %Ext2) {
 ; CHECK-LABEL: @lookahead_external_uses(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 1
 ; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2
-; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 2
+; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 2
 ; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1
 ; CHECK-NEXT:    [[B0:%.*]] = load double, ptr [[B]], align 8
 ; CHECK-NEXT:    [[C0:%.*]] = load double, ptr [[C:%.*]], align 8
@@ -210,6 +211,7 @@ define void @lookahead_external_uses(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr
 ; CHECK-NEXT:    [[B2:%.*]] = load double, ptr [[IDXB2]], align 8
 ; CHECK-NEXT:    [[A2:%.*]] = load double, ptr [[IDXA2]], align 8
 ; CHECK-NEXT:    [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
+; CHECK-NEXT:    [[A1:%.*]] = load double, ptr [[IDXA1]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1
@@ -221,8 +223,7 @@ define void @lookahead_external_uses(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr
 ; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]]
 ; CHECK-NEXT:    store <2 x double> [[TMP9]], ptr [[S:%.*]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
-; CHECK-NEXT:    store double [[TMP10]], ptr [[EXT1:%.*]], align 8
+; CHECK-NEXT:    store double [[A1]], ptr [[EXT1:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -279,8 +280,9 @@ entry:
 define void @lookahead_limit_users_budget(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr %Ext1, ptr %Ext2, ptr %Ext3, ptr %Ext4, ptr %Ext5) {
 ; CHECK-LABEL: @lookahead_limit_users_budget(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 1
 ; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2
-; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 2
+; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 2
 ; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1
 ; CHECK-NEXT:    [[B0:%.*]] = load double, ptr [[B]], align 8
 ; CHECK-NEXT:    [[C0:%.*]] = load double, ptr [[C:%.*]], align 8
@@ -288,6 +290,7 @@ define void @lookahead_limit_users_budget(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S
 ; CHECK-NEXT:    [[B2:%.*]] = load double, ptr [[IDXB2]], align 8
 ; CHECK-NEXT:    [[A2:%.*]] = load double, ptr [[IDXA2]], align 8
 ; CHECK-NEXT:    [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
+; CHECK-NEXT:    [[A1:%.*]] = load double, ptr [[IDXA1]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1
@@ -299,10 +302,9 @@ define void @lookahead_limit_users_budget(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S
 ; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]]
 ; CHECK-NEXT:    store <2 x double> [[TMP9]], ptr [[S:%.*]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
-; CHECK-NEXT:    store double [[TMP10]], ptr [[EXT1:%.*]], align 8
-; CHECK-NEXT:    store double [[TMP10]], ptr [[EXT2:%.*]], align 8
-; CHECK-NEXT:    store double [[TMP10]], ptr [[EXT3:%.*]], align 8
+; CHECK-NEXT:    store double [[A1]], ptr [[EXT1:%.*]], align 8
+; CHECK-NEXT:    store double [[A1]], ptr [[EXT2:%.*]], align 8
+; CHECK-NEXT:    store double [[A1]], ptr [[EXT3:%.*]], align 8
 ; CHECK-NEXT:    store double [[B1]], ptr [[EXT4:%.*]], align 8
 ; CHECK-NEXT:    store double [[B1]], ptr [[EXT5:%.*]], align 8
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll
index a8d481a3e28a5c..2a5bfa73907704 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll
@@ -5,21 +5,16 @@ define i32 @test() {
 ; CHECK-LABEL: define i32 @test() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A_PROMOTED:%.*]] = load i8, ptr null, align 1
-; CHECK-NEXT:    [[DEC_4:%.*]] = add i8 [[A_PROMOTED]], 0
-; CHECK-NEXT:    [[CONV_I_4:%.*]] = zext i8 [[DEC_4]] to i32
-; CHECK-NEXT:    [[SUB_I_4:%.*]] = add nuw nsw i32 [[CONV_I_4]], 0
-; CHECK-NEXT:    [[DEC_5:%.*]] = add i8 [[A_PROMOTED]], 0
-; CHECK-NEXT:    [[CONV_I_5:%.*]] = zext i8 [[DEC_5]] to i32
-; CHECK-NEXT:    [[SUB_I_5:%.*]] = add nuw nsw i32 [[CONV_I_5]], 65535
-; CHECK-NEXT:    [[TMP0:%.*]] = or i32 [[SUB_I_4]], [[SUB_I_5]]
-; CHECK-NEXT:    [[DEC_6:%.*]] = or i8 [[A_PROMOTED]], 0
-; CHECK-NEXT:    [[CONV_I_6:%.*]] = zext i8 [[DEC_6]] to i32
-; CHECK-NEXT:    [[SUB_I_6:%.*]] = add nuw nsw i32 [[CONV_I_6]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = or i32 [[TMP0]], [[SUB_I_6]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = or i8 [[A_PROMOTED]], 0
-; CHECK-NEXT:    [[CONV_I_7:%.*]] = zext i8 [[TMP10]] to i32
-; CHECK-NEXT:    [[SUB_I_7:%.*]] = add nuw nsw i32 [[CONV_I_7]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = or i32 [[TMP1]], [[SUB_I_7]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i8> poison, i8 [[A_PROMOTED]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[TMP5]], <i16 0, i16 -1, i16 0, i16 0>
+; CHECK-NEXT:    [[TMP7:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP7]] to i32
 ; CHECK-NEXT:    [[TMP9:%.*]] = and i32 [[TMP8]], 65535
 ; CHECK-NEXT:    store i8 [[TMP10]], ptr null, align 1
 ; CHECK-NEXT:    [[CALL3:%.*]] = tail call i32 (ptr, ...) null(ptr null, i32 [[TMP9]])

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
index 9df2b9a8e8f3ec..61938d01e57acc 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
@@ -345,7 +345,7 @@ define void @good_load_order() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr @a, align 16
 ; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
 ; CHECK:       for.body3:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP13:%.*]], [[FOR_BODY3]] ]
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
@@ -355,17 +355,17 @@ define void @good_load_order() {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 4
 ; CHECK-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP1]], i64 0
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul <4 x float> [[TMP7]], [[TMP9]]
-; CHECK-NEXT:    store <4 x float> [[TMP10]], ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX31]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
-; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP11]]
-; CHECK-NEXT:    [[TMP12]] = load float, ptr [[ARRAYIDX41]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP7]], i64 3
-; CHECK-NEXT:    [[MUL45:%.*]] = fmul float [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP12]]
+; CHECK-NEXT:    [[TMP13]] = load float, ptr [[ARRAYIDX41]], align 4
+; CHECK-NEXT:    [[MUL45:%.*]] = fmul float [[TMP13]], [[TMP7]]
 ; CHECK-NEXT:    store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
 ; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995
@@ -380,7 +380,7 @@ define void @good_load_order() {
 ; SSE2-NEXT:    [[TMP0:%.*]] = load float, ptr @a, align 16
 ; SSE2-NEXT:    br label [[FOR_BODY3:%.*]]
 ; SSE2:       for.body3:
-; SSE2-NEXT:    [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ]
+; SSE2-NEXT:    [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP13:%.*]], [[FOR_BODY3]] ]
 ; SSE2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
 ; SSE2-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 ; SSE2-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
@@ -390,17 +390,17 @@ define void @good_load_order() {
 ; SSE2-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 ; SSE2-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 4
 ; SSE2-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]]
-; SSE2-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
-; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
-; SSE2-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP1]], i64 0
-; SSE2-NEXT:    [[TMP10:%.*]] = fmul <4 x float> [[TMP7]], [[TMP9]]
-; SSE2-NEXT:    store <4 x float> [[TMP10]], ptr [[ARRAYIDX5]], align 4
+; SSE2-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX31]], align 4
+; SSE2-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
+; SSE2-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
+; SSE2-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP1]], i64 0
+; SSE2-NEXT:    [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]]
+; SSE2-NEXT:    store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4
 ; SSE2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
-; SSE2-NEXT:    [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; SSE2-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP11]]
-; SSE2-NEXT:    [[TMP12]] = load float, ptr [[ARRAYIDX41]], align 4
-; SSE2-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP7]], i64 3
-; SSE2-NEXT:    [[MUL45:%.*]] = fmul float [[TMP12]], [[TMP13]]
+; SSE2-NEXT:    [[TMP12:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; SSE2-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP12]]
+; SSE2-NEXT:    [[TMP13]] = load float, ptr [[ARRAYIDX41]], align 4
+; SSE2-NEXT:    [[MUL45:%.*]] = fmul float [[TMP13]], [[TMP7]]
 ; SSE2-NEXT:    store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
 ; SSE2-NEXT:    [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; SSE2-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll
index efb11d2756c3cf..e1c794a6fd2794 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll
@@ -10,15 +10,15 @@
 define void @f(i1 %x) #0 {
 ; CHECK-LABEL: @f(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A1:%.*]] = load i64, ptr getelementptr inbounds ([[STRUCT_A:%.*]], ptr @a, i32 0, i32 0, i32 1), align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr @a, align 8
 ; CHECK-NEXT:    br i1 [[X:%.*]], label [[WHILE_BODY_LR_PH:%.*]], label [[WHILE_END:%.*]]
 ; CHECK:       while.body.lr.ph:
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
-; CHECK-NEXT:    [[ICMP_A1:%.*]] = icmp eq i64 [[TMP1]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @b, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i1> poison, i1 [[ICMP_A1]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i1> [[TMP3]], <2 x i1> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = select <2 x i1> [[SHUFFLE]], <2 x i64> [[TMP2]], <2 x i64> [[TMP0]]
+; CHECK-NEXT:    [[ICMP_A1:%.*]] = icmp eq i64 [[A1]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @b, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i1> poison, i1 [[ICMP_A1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i1> [[TMP2]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = select <2 x i1> [[TMP3]], <2 x i64> [[TMP1]], <2 x i64> [[TMP0]]
 ; CHECK-NEXT:    br label [[WHILE_END]]
 ; CHECK:       while.end:
 ; CHECK-NEXT:    [[TMP5:%.*]] = phi <2 x i64> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP4]], [[WHILE_BODY_LR_PH]] ]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/pr27163.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr27163.ll
index ea6989b8bbabb8..9979bb9170d485 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr27163.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr27163.ll
@@ -8,19 +8,19 @@ target triple = "x86_64-pc-windows-msvc18.0.0"
 define void @test1(ptr %p) personality ptr @__CxxFrameHandler3 {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:  invoke.cont:
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
-; CHECK-NEXT:    store <2 x i64> [[TMP1]], ptr [[P]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, ptr [[P]], align 8
+; CHECK-NEXT:    store <2 x i64> [[TMP0]], ptr [[P]], align 8
 ; CHECK-NEXT:    invoke void @throw()
-; CHECK-NEXT:    to label [[UNREACHABLE:%.*]] unwind label [[CATCH_DISPATCH:%.*]]
+; CHECK-NEXT:            to label [[UNREACHABLE:%.*]] unwind label [[CATCH_DISPATCH:%.*]]
 ; CHECK:       catch.dispatch:
 ; CHECK-NEXT:    [[CS:%.*]] = catchswitch within none [label %invoke.cont1] unwind label [[EHCLEANUP:%.*]]
 ; CHECK:       invoke.cont1:
 ; CHECK-NEXT:    [[CATCH:%.*]] = catchpad within [[CS]] [ptr null, i32 64, ptr null]
 ; CHECK-NEXT:    invoke void @throw() [ "funclet"(token [[CATCH]]) ]
-; CHECK-NEXT:    to label [[UNREACHABLE]] unwind label [[EHCLEANUP]]
+; CHECK-NEXT:            to label [[UNREACHABLE]] unwind label [[EHCLEANUP]]
 ; CHECK:       ehcleanup:
-; CHECK-NEXT:    [[PHI:%.*]] = phi i64 [ [[TMP2]], [[CATCH_DISPATCH]] ], [ 9, [[INVOKE_CONT1:%.*]] ]
+; CHECK-NEXT:    [[PHI:%.*]] = phi i64 [ [[LOAD1]], [[CATCH_DISPATCH]] ], [ 9, [[INVOKE_CONT1:%.*]] ]
 ; CHECK-NEXT:    [[CLEANUP:%.*]] = cleanuppad within none []
 ; CHECK-NEXT:    call void @release(i64 [[PHI]]) [ "funclet"(token [[CLEANUP]]) ]
 ; CHECK-NEXT:    cleanupret from [[CLEANUP]] unwind to caller

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-value-replace-extractelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-value-replace-extractelement.ll
index edf8756fd06df0..5cbf78435233bb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-value-replace-extractelement.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-value-replace-extractelement.ll
@@ -4,6 +4,7 @@
 define void @test() {
 ; CHECK-LABEL: define void @test() {
 ; CHECK-NEXT:  [[BB:.*]]:
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i64 0 to i32
 ; CHECK-NEXT:    br label %[[BB1:.*]]
 ; CHECK:       [[BB1]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP4:%.*]], %[[BB1]] ]
@@ -11,7 +12,7 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> zeroinitializer)
 ; CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP2]] to i32
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = mul i32 [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = mul i32 [[OP_RDX]], 0
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = mul i32 [[OP_RDX]], [[TRUNC]]
 ; CHECK-NEXT:    [[TMP4]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[OP_RDX1]], i32 1
 ; CHECK-NEXT:    br label %[[BB1]]
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll
index 254525c942356a..6d6dd502415e5d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll
@@ -6,10 +6,11 @@ define void @test() {
 ; CHECK-NEXT:    br i1 false, label [[PH:%.*]], label [[EXIT:%.*]]
 ; CHECK:       ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> zeroinitializer)
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = and i8 0, [[TMP0]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = and i8 0, [[TMP0]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = and i8 [[OP_RDX]], 0
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[PHI:%.*]] = phi i8 [ [[OP_RDX2]], [[PH]] ], [ 0, [[BB:%.*]] ]
+; CHECK-NEXT:    [[PHI:%.*]] = phi i8 [ [[OP_RDX1]], [[PH]] ], [ 0, [[BB:%.*]] ]
 ; CHECK-NEXT:    ret void
 ;
 bb:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-node.ll
index 1940e1bc8d18ac..3a456798d7818d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-node.ll
@@ -7,8 +7,9 @@ define void @test(ptr noalias %arg, ptr noalias %arg1, ptr %arg2) {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[TMP_I_I:%.*]] = getelementptr i8, ptr [[ARG1]], i64 24
 ; CHECK-NEXT:    [[TMP_I_I4:%.*]] = getelementptr i8, ptr [[ARG]], i64 24
+; CHECK-NEXT:    [[TMP_I_I13:%.*]] = getelementptr i8, ptr [[ARG1]], i64 28
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[TMP_I_I13]], align 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[TMP_I_I]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; CHECK-NEXT:    store float [[TMP1]], ptr [[ARG2]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = fcmp olt <4 x float> [[TMP0]], zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reordering-single-phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/reordering-single-phi.ll
index d88135df5c96a2..bc1eaaac5d1bbc 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reordering-single-phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reordering-single-phi.ll
@@ -15,6 +15,7 @@ define void @test() {
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
 ; CHECK-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[ARRAYIDX31]], align 4
 ; CHECK-NEXT:    [[TMP14:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP14]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP0]], i32 0
@@ -23,7 +24,6 @@ define void @test() {
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
 ; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV_NEXT]]
 ; CHECK-NEXT:    [[TMP16]] = load float, ptr [[ARRAYIDX41]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP14]], i32 3
 ; CHECK-NEXT:    [[MUL45:%.*]] = fmul fast float [[TMP16]], [[TMP6]]
 ; CHECK-NEXT:    store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i64 [[INDVARS_IV]], 31990

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll
index 9df7aa1c727c87..8fa84699a267cd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll
@@ -5,23 +5,22 @@ define void @test(i32 %0, ptr %p) {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: i32 [[TMP0:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> <i32 0, i32 poison, i32 0, i32 0>, i32 [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], <i32 1, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[PH:%.*]]
 ; CHECK:       ph:
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> <i32 0, i32 0, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 [[TMP0]], i32 2
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[TMP5:%.*]] = phi <4 x i32> [ [[TMP2]], [[ENTRY:%.*]] ], [ zeroinitializer, [[PH]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = phi <4 x i32> [ [[TMP2]], [[ENTRY]] ], [ [[TMP4]], [[PH]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = phi <4 x i32> [ [[TMP2]], [[ENTRY]] ], [ zeroinitializer, [[PH]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP6]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = or i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP9:%.*]] = phi <8 x i32> [ [[TMP8]], [[ENTRY:%.*]] ], [ [[TMP6]], [[PH]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = phi <4 x i32> [ [[TMP5]], [[ENTRY]] ], [ zeroinitializer, [[PH]] ]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP9]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP7]])
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = or i32 [[OP_RDX]], [[TMP10]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = or i32 [[OP_RDX1]], [[TMP3]]
+; CHECK-NEXT:    [[OP_RDX5:%.*]] = or i32 [[TMP10]], [[TMP3]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = or i32 [[OP_RDX5]], [[OP_RDX]]
 ; CHECK-NEXT:    store i32 [[OP_RDX2]], ptr [[P]], align 4
 ; CHECK-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll b/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll
index f1be11d0d0fc51..8bcf650d41d931 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll
@@ -5,6 +5,7 @@ define void @test(i32 %arg) {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: i32 [[ARG:%.*]]) {
 ; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[ARG]] to i64
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[ARG]], i32 0
 ; CHECK-NEXT:    br label [[BB2:%.*]]
 ; CHECK:       bb2:
@@ -14,8 +15,6 @@ define void @test(i32 %arg) {
 ; CHECK-NEXT:      i32 1, label [[BB4:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
 ; CHECK-NEXT:    switch i32 0, label [[BB10]] [
 ; CHECK-NEXT:      i32 18, label [[BB7:%.*]]
 ; CHECK-NEXT:      i32 1, label [[BB7]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll
index e146a0a365a845..55e155840f8588 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll
@@ -6,10 +6,11 @@
 define i16 @D134605() {
 ; CHECK-LABEL: @D134605(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr poison, align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3
-; CHECK-NEXT:    [[REASS_ADD:%.*]] = add i16 poison, [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP0]])
+; CHECK-NEXT:    [[ARRAYIDX81:%.*]] = getelementptr inbounds [32 x i16], ptr poison, i16 0, i16 3
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX81]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr poison, align 1
+; CHECK-NEXT:    [[REASS_ADD:%.*]] = add i16 poison, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i16 [[TMP2]], 2
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = add i16 [[TMP3]], poison
 ; CHECK-NEXT:    [[REASS_MUL24:%.*]] = shl i16 [[OP_RDX]], 2

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
index 51ce40b7a178bd..d9496a3e3e343a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
@@ -5,15 +5,17 @@ define dso_local void @rftbsub(ptr %a) local_unnamed_addr #0 {
 ; CHECK-LABEL: @rftbsub(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = or disjoint i64 2, 1
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[SUB22:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[ADD16:%.*]] = fadd double [[TMP2]], undef
+; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[ARRAYIDX12]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 8
+; CHECK-NEXT:    [[ADD16:%.*]] = fadd double [[TMP1]], undef
 ; CHECK-NEXT:    [[MUL18:%.*]] = fmul double undef, [[ADD16]]
 ; CHECK-NEXT:    [[ADD19:%.*]] = fadd double undef, [[MUL18]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[ADD19]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[SUB22]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    store <2 x double> [[TMP5]], ptr [[ARRAYIDX6]], align 8
 ; CHECK-NEXT:    unreachable
 ;


        


More information about the llvm-commits mailing list