[llvm] 8f54861 - Revert "[SLP]Use source vector type as the original vector type instead of"

David Spickett via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 22 00:53:36 PDT 2023


Author: David Spickett
Date: 2023-09-22T07:52:16Z
New Revision: 8f548610a61a449316cf2a8c212cb6b64dd9acc3

URL: https://github.com/llvm/llvm-project/commit/8f548610a61a449316cf2a8c212cb6b64dd9acc3
DIFF: https://github.com/llvm/llvm-project/commit/8f548610a61a449316cf2a8c212cb6b64dd9acc3.diff

LOG: Revert "[SLP]Use source vector type as the original vector type instead of"

This reverts commit 9a99944df068b29b905cd8ba9a2132cc6382b6fb.

Due to test suite failures on all our SVE buildbots e.g.:
https://lab.llvm.org/buildbot/#/builders/184/builds/7375

clang: ../llvm/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp:3565:
InstructionCost llvm::AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind,
VectorType *, ArrayRef<int>, TTI::TargetCostKind, int, VectorType *,
ArrayRef<const Value *>): Assertion `Mask.size() == TpNumElts && "Expected Mask and Tp size to match!"' failed.

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
    llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll
    llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
    llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
    llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
    llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
    llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 031b195189d8af6..dc8130954979678 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7073,10 +7073,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
   /// extracted values from \p VL.
   InstructionCost computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                                      TTI::ShuffleKind ShuffleKind) {
-    auto *VecTy = cast<FixedVectorType>(
-        cast<ExtractElementInst>(*find_if(VL, [](Value *V) {
-          return isa<ExtractElementInst>(V);
-        }))->getVectorOperandType());
+    auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
     unsigned NumOfParts = TTI.getNumberOfParts(VecTy);
 
     if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc ||

diff  --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
index 290560151b79a32..9fd20da8e4bbfdc 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
@@ -291,19 +291,14 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ;
 ; GFX8-LABEL: @uadd_sat_v4i16(
 ; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 2
-; GFX8-NEXT:    [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3
-; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 2
-; GFX8-NEXT:    [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3
-; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
 ; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX8-NEXT:    [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]])
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
-; GFX8-NEXT:    [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
-; GFX8-NEXT:    ret <4 x i16> [[INS_3]]
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
+; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT:    ret <4 x i16> [[INS_31]]
 ;
 bb:
   %arg0.0 = extractelement <4 x i16> %arg0, i64 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
index e62749c4c71f104..34bc2b338df5c90 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
@@ -291,19 +291,14 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ;
 ; GFX8-LABEL: @uadd_sat_v4i16(
 ; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 2
-; GFX8-NEXT:    [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3
-; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 2
-; GFX8-NEXT:    [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3
-; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
 ; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX8-NEXT:    [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]])
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
-; GFX8-NEXT:    [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
-; GFX8-NEXT:    ret <4 x i16> [[INS_3]]
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
+; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT:    ret <4 x i16> [[INS_31]]
 ;
 bb:
   %arg0.0 = extractelement <4 x i16> %arg0, i64 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll
index 0a020c855cc22d7..e474bab2ad965ae 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll
@@ -4,15 +4,10 @@
 define <2 x i16> @uadd_sat_v9i16_combine_vi16(<9 x i16> %arg0, <9 x i16> %arg1) {
 ; CHECK-LABEL: @uadd_sat_v9i16_combine_vi16(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[ARG0_1:%.*]] = extractelement <9 x i16> undef, i64 7
-; CHECK-NEXT:    [[ARG0_2:%.*]] = extractelement <9 x i16> [[ARG0:%.*]], i64 8
-; CHECK-NEXT:    [[ARG1_1:%.*]] = extractelement <9 x i16> [[ARG1:%.*]], i64 7
-; CHECK-NEXT:    [[ARG1_2:%.*]] = extractelement <9 x i16> [[ARG1]], i64 8
-; CHECK-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; CHECK-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; CHECK-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_1]], i64 0
-; CHECK-NEXT:    [[INS_2:%.*]] = insertelement <2 x i16> [[INS_1]], i16 [[ADD_2]], i64 1
-; CHECK-NEXT:    ret <2 x i16> [[INS_2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <9 x i16> [[ARG0:%.*]], <9 x i16> poison, <2 x i32> <i32 poison, i32 8>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <9 x i16> [[ARG1:%.*]], <9 x i16> poison, <2 x i32> <i32 7, i32 8>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; CHECK-NEXT:    ret <2 x i16> [[TMP2]]
 ;
 bb:
   %arg0.1 = extractelement <9 x i16> undef, i64 7

diff  --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
index 46980b33e4018ac..3b63c1e35610fe2 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
@@ -4,23 +4,20 @@
 define <4 x half> @phis(i1 %cmp1, <4 x half> %in1, <4 x half> %in2)  {
 ; CHECK-LABEL: @phis(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x half> [[IN1:%.*]], i64 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x half> [[IN1]], i64 3
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x half> [[IN1:%.*]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[BB0:%.*]]
 ; CHECK:       bb0:
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x half> [[IN2:%.*]], i64 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x half> [[IN2]], i64 3
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x half> [[IN2:%.*]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    br label [[BB1]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[C2:%.*]] = phi half [ [[A2]], [[ENTRY:%.*]] ], [ [[B2]], [[BB0]] ]
-; CHECK-NEXT:    [[C3:%.*]] = phi half [ [[A3]], [[ENTRY]] ], [ [[B3]], [[BB0]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[BB0]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x half> [[TMP2]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[O2:%.*]] = insertelement <4 x half> [[TMP3]], half [[C2]], i64 2
-; CHECK-NEXT:    [[O3:%.*]] = insertelement <4 x half> [[O2]], half [[C3]], i64 3
-; CHECK-NEXT:    ret <4 x half> [[O3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2]], [[BB0]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi <2 x half> [ [[TMP1]], [[ENTRY]] ], [ [[TMP3]], [[BB0]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x half> [[TMP4]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x half> [[TMP5]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x half> [[TMP4]], <2 x half> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x half> [[TMP8]]
 ;
 entry:
   %a0 = extractelement <4 x half> %in1, i64 0
@@ -52,23 +49,20 @@ bb1:
 define <4 x half> @phis_reverse(i1 %cmp1, <4 x half> %in1, <4 x half> %in2)  {
 ; CHECK-LABEL: @phis_reverse(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x half> [[IN1:%.*]], i64 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x half> [[IN1]], i64 3
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x half> [[IN1:%.*]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[BB0:%.*]]
 ; CHECK:       bb0:
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x half> [[IN2:%.*]], i64 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x half> [[IN2]], i64 3
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x half> [[IN2:%.*]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    br label [[BB1]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[C3:%.*]] = phi half [ [[A3]], [[ENTRY:%.*]] ], [ [[B3]], [[BB0]] ]
-; CHECK-NEXT:    [[C2:%.*]] = phi half [ [[A2]], [[ENTRY]] ], [ [[B2]], [[BB0]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[BB0]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x half> [[TMP2]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[O2:%.*]] = insertelement <4 x half> [[TMP3]], half [[C2]], i64 2
-; CHECK-NEXT:    [[O3:%.*]] = insertelement <4 x half> [[O2]], half [[C3]], i64 3
-; CHECK-NEXT:    ret <4 x half> [[O3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2]], [[BB0]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi <2 x half> [ [[TMP1]], [[ENTRY]] ], [ [[TMP3]], [[BB0]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x half> [[TMP5]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x half> [[TMP4]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    ret <4 x half> [[TMP8]]
 ;
 entry:
   %a0 = extractelement <4 x half> %in1, i64 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
index 059e4c38b519bdd..9608608a1809821 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
@@ -155,13 +155,11 @@ define <4 x float> @exp_4x(ptr %a) {
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 ; DEFAULT-LABEL: define <4 x float> @exp_4x
 ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -173,13 +171,11 @@ define <4 x float> @exp_4x(ptr %a) {
 ; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
 ; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
-; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
-; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
+; DEFAULT-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, ptr %a, align 16
@@ -212,13 +208,11 @@ define <4 x float> @int_exp_4x(ptr %a) {
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]])
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]])
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]])
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 ; DEFAULT-LABEL: define <4 x float> @int_exp_4x
 ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -230,13 +224,11 @@ define <4 x float> @int_exp_4x(ptr %a) {
 ; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]])
 ; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]])
-; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]])
-; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
+; DEFAULT-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, ptr %a, align 16
@@ -269,13 +261,11 @@ define <4 x float> @log_4x(ptr %a) {
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 ; DEFAULT-LABEL: define <4 x float> @log_4x
 ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -287,13 +277,11 @@ define <4 x float> @log_4x(ptr %a) {
 ; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
 ; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
-; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
-; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
+; DEFAULT-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, ptr %a, align 16
@@ -326,13 +314,11 @@ define <4 x float> @int_log_4x(ptr %a) {
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]])
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]])
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]])
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 ; DEFAULT-LABEL: define <4 x float> @int_log_4x
 ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -344,13 +330,11 @@ define <4 x float> @int_log_4x(ptr %a) {
 ; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]])
 ; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]])
-; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]])
-; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
+; DEFAULT-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, ptr %a, align 16
@@ -383,13 +367,11 @@ define <4 x float> @sin_4x(ptr %a) {
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 ; DEFAULT-LABEL: define <4 x float> @sin_4x
 ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -401,13 +383,11 @@ define <4 x float> @sin_4x(ptr %a) {
 ; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
 ; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
-; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
-; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
+; DEFAULT-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, ptr %a, align 16
@@ -440,13 +420,11 @@ define <4 x float> @int_sin_4x(ptr %a) {
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 ; DEFAULT-LABEL: define <4 x float> @int_sin_4x
 ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -458,13 +436,11 @@ define <4 x float> @int_sin_4x(ptr %a) {
 ; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
 ; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
-; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
-; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
+; DEFAULT-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, ptr %a, align 16

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
index 6c21cc1cfc5be8c..da6f4e70c7c6f07 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
@@ -2,9 +2,9 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SSE
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SLM
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX
 
 define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SSE-LABEL: @ceil_floor(
@@ -51,47 +51,24 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ;
 ; AVX-LABEL: @ceil_floor(
 ; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; AVX-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i64 1
-; AVX-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i64 2
 ; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; AVX-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX-NEXT:    [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]])
-; AVX-NEXT:    [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]])
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; AVX-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
 ; AVX-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]])
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; AVX-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
 ; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; AVX-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i64 1
-; AVX-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i64 2
-; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i64 3
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
+; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX-NEXT:    ret <8 x float> [[R71]]
-;
-; AVX2-LABEL: @ceil_floor(
-; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; AVX2-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
-; AVX2-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; AVX2-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
-; AVX2-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX2-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
-; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX2-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
-; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
-; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX2-NEXT:    ret <8 x float> [[R71]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
index 435c677c3afbc57..3b84e70d847ffb2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
@@ -2,9 +2,9 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SSE
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SLM
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX
 
 define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SSE-LABEL: @ceil_floor(
@@ -51,47 +51,24 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ;
 ; AVX-LABEL: @ceil_floor(
 ; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; AVX-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i64 1
-; AVX-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i64 2
 ; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; AVX-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX-NEXT:    [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]])
-; AVX-NEXT:    [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]])
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; AVX-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
 ; AVX-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]])
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; AVX-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
 ; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i64 0
-; AVX-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i64 1
-; AVX-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i64 2
-; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i64 3
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
+; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX-NEXT:    ret <8 x float> [[R71]]
-;
-; AVX2-LABEL: @ceil_floor(
-; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; AVX2-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
-; AVX2-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; AVX2-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
-; AVX2-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX2-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
-; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX2-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
-; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i64 0
-; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
-; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; AVX2-NEXT:    ret <8 x float> [[R71]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
index cf9b974a724cf3d..5fcd339f333e860 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
@@ -167,42 +167,24 @@ define void @test_v4f32_v2f32_store(<4 x float> %f, ptr %p){
 
 define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i64 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
-; SSE-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i64 0
-; SSE-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i64 1
-; SSE-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i64 2
-; SSE-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
-; SSE-NEXT:    [[R1:%.*]] = fadd double [[B0]], [[B1]]
-; SSE-NEXT:    [[R2:%.*]] = fadd double [[A2]], [[A3]]
-; SSE-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
-; SSE-NEXT:    [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i64 0
-; SSE-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i64 1
-; SSE-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i64 2
-; SSE-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i64 3
-; SSE-NEXT:    ret <4 x double> [[R03]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
+; SSE-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    ret <4 x double> [[R031]]
 ;
 ; SLM-LABEL: @test_v4f64(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i64 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
-; SLM-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i64 0
-; SLM-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i64 1
-; SLM-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i64 2
-; SLM-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SLM-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
-; SLM-NEXT:    [[R1:%.*]] = fadd double [[B0]], [[B1]]
-; SLM-NEXT:    [[R2:%.*]] = fadd double [[A2]], [[A3]]
-; SLM-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
-; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i64 0
-; SLM-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i64 1
-; SLM-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i64 2
-; SLM-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i64 3
-; SLM-NEXT:    ret <4 x double> [[R03]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
+; SLM-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    ret <4 x double> [[R031]]
 ;
 ; AVX-LABEL: @test_v4f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -231,29 +213,16 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 
 ; PR50392
 define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b) {
-; SSE-LABEL: @test_v4f64_partial_swizzle(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; SSE-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 0, i32 3>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i64 0
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSE-NEXT:    [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
-; SSE-NEXT:    ret <4 x double> [[R031]]
-;
-; SLM-LABEL: @test_v4f64_partial_swizzle(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; SLM-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 0, i32 3>
-; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i64 0
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SLM-NEXT:    [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
-; SLM-NEXT:    ret <4 x double> [[R031]]
+; CHECK-LABEL: @test_v4f64_partial_swizzle(
+; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
+; CHECK-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
+; CHECK-NEXT:    ret <4 x double> [[R03]]
 ;
   %a0 = extractelement <4 x double> %a, i64 0
   %a1 = extractelement <4 x double> %a, i64 1
@@ -278,23 +247,13 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-NEXT:    ret <8 x float> [[TMP3]]
 ;
 ; SLM-LABEL: @test_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 0, i32 3>
-; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 0, i32 3>
-; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x float> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
-; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
-; SLM-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[TMP7]], [[TMP8]]
-; SLM-NEXT:    [[TMP10:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
-; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
-; SLM-NEXT:    [[TMP12:%.*]] = fadd <2 x float> [[TMP10]], [[TMP11]]
-; SLM-NEXT:    [[R033:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R052:%.*]] = shufflevector <8 x float> [[R033]], <8 x float> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R071:%.*]] = shufflevector <8 x float> [[R052]], <8 x float> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SLM-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
+; SLM-NEXT:    [[R071:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    ret <8 x float> [[R071]]
 ;
 ; AVX-LABEL: @test_v8f32(
@@ -408,48 +367,14 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
 
 define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: @test_v16i16(
-; SSE-NEXT:    [[B0:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 0
-; SSE-NEXT:    [[B1:%.*]] = extractelement <16 x i16> [[B]], i64 1
-; SSE-NEXT:    [[B2:%.*]] = extractelement <16 x i16> [[B]], i64 2
-; SSE-NEXT:    [[B3:%.*]] = extractelement <16 x i16> [[B]], i64 3
-; SSE-NEXT:    [[B4:%.*]] = extractelement <16 x i16> [[B]], i64 4
-; SSE-NEXT:    [[B5:%.*]] = extractelement <16 x i16> [[B]], i64 5
-; SSE-NEXT:    [[B6:%.*]] = extractelement <16 x i16> [[B]], i64 6
-; SSE-NEXT:    [[B7:%.*]] = extractelement <16 x i16> [[B]], i64 7
-; SSE-NEXT:    [[B8:%.*]] = extractelement <16 x i16> [[B]], i64 8
-; SSE-NEXT:    [[B9:%.*]] = extractelement <16 x i16> [[B]], i64 9
-; SSE-NEXT:    [[B10:%.*]] = extractelement <16 x i16> [[B]], i64 10
-; SSE-NEXT:    [[B11:%.*]] = extractelement <16 x i16> [[B]], i64 11
-; SSE-NEXT:    [[B12:%.*]] = extractelement <16 x i16> [[B]], i64 12
-; SSE-NEXT:    [[B13:%.*]] = extractelement <16 x i16> [[B]], i64 13
-; SSE-NEXT:    [[B14:%.*]] = extractelement <16 x i16> [[B]], i64 14
-; SSE-NEXT:    [[B15:%.*]] = extractelement <16 x i16> [[B]], i64 15
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; SSE-NEXT:    [[TMP3:%.*]] = add <4 x i16> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[R4:%.*]] = add i16 [[B0]], [[B1]]
-; SSE-NEXT:    [[R5:%.*]] = add i16 [[B2]], [[B3]]
-; SSE-NEXT:    [[R6:%.*]] = add i16 [[B4]], [[B5]]
-; SSE-NEXT:    [[R7:%.*]] = add i16 [[B6]], [[B7]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 9, i32 11, i32 13, i32 15>
-; SSE-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R12:%.*]] = add i16 [[B8]], [[B9]]
-; SSE-NEXT:    [[R13:%.*]] = add i16 [[B10]], [[B11]]
-; SSE-NEXT:    [[R14:%.*]] = add i16 [[B12]], [[B13]]
-; SSE-NEXT:    [[R15:%.*]] = add i16 [[B14]], [[B15]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[RV4:%.*]] = insertelement <16 x i16> [[TMP7]], i16 [[R4]], i64 4
-; SSE-NEXT:    [[RV5:%.*]] = insertelement <16 x i16> [[RV4]], i16 [[R5]], i64 5
-; SSE-NEXT:    [[RV6:%.*]] = insertelement <16 x i16> [[RV5]], i16 [[R6]], i64 6
-; SSE-NEXT:    [[RV7:%.*]] = insertelement <16 x i16> [[RV6]], i16 [[R7]], i64 7
-; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[RV111:%.*]] = shufflevector <16 x i16> [[RV7]], <16 x i16> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[RV12:%.*]] = insertelement <16 x i16> [[RV111]], i16 [[R12]], i64 12
-; SSE-NEXT:    [[RV13:%.*]] = insertelement <16 x i16> [[RV12]], i16 [[R13]], i64 13
-; SSE-NEXT:    [[RV14:%.*]] = insertelement <16 x i16> [[RV13]], i16 [[R14]], i64 14
-; SSE-NEXT:    [[RV15:%.*]] = insertelement <16 x i16> [[RV14]], i16 [[R15]], i64 15
-; SSE-NEXT:    ret <16 x i16> [[RV15]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; SSE-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; SSE-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]]
+; SSE-NEXT:    [[RV151:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    ret <16 x i16> [[RV151]]
 ;
 ; SLM-LABEL: @test_v16i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
index ee60ffe39cc16c7..c3e03973cdac32d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
@@ -167,42 +167,24 @@ define void @test_v4f32_v2f32_store(<4 x float> %f, ptr %p){
 
 define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i64 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
-; SSE-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i64 0
-; SSE-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i64 1
-; SSE-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i64 2
-; SSE-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
-; SSE-NEXT:    [[R1:%.*]] = fadd double [[B0]], [[B1]]
-; SSE-NEXT:    [[R2:%.*]] = fadd double [[A2]], [[A3]]
-; SSE-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
-; SSE-NEXT:    [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i64 0
-; SSE-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i64 1
-; SSE-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i64 2
-; SSE-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i64 3
-; SSE-NEXT:    ret <4 x double> [[R03]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
+; SSE-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    ret <4 x double> [[R031]]
 ;
 ; SLM-LABEL: @test_v4f64(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i64 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
-; SLM-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i64 0
-; SLM-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i64 1
-; SLM-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i64 2
-; SLM-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SLM-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
-; SLM-NEXT:    [[R1:%.*]] = fadd double [[B0]], [[B1]]
-; SLM-NEXT:    [[R2:%.*]] = fadd double [[A2]], [[A3]]
-; SLM-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
-; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i64 0
-; SLM-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i64 1
-; SLM-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i64 2
-; SLM-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i64 3
-; SLM-NEXT:    ret <4 x double> [[R03]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
+; SLM-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    ret <4 x double> [[R031]]
 ;
 ; AVX-LABEL: @test_v4f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -231,29 +213,16 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 
 ; PR50392
 define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b) {
-; SSE-LABEL: @test_v4f64_partial_swizzle(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; SSE-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 0, i32 3>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i64 0
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SSE-NEXT:    [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
-; SSE-NEXT:    ret <4 x double> [[R031]]
-;
-; SLM-LABEL: @test_v4f64_partial_swizzle(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; SLM-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 0, i32 3>
-; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i64 0
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SLM-NEXT:    [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
-; SLM-NEXT:    ret <4 x double> [[R031]]
+; CHECK-LABEL: @test_v4f64_partial_swizzle(
+; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> undef, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
+; CHECK-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
+; CHECK-NEXT:    ret <4 x double> [[R03]]
 ;
   %a0 = extractelement <4 x double> %a, i64 0
   %a1 = extractelement <4 x double> %a, i64 1
@@ -278,23 +247,13 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-NEXT:    ret <8 x float> [[TMP3]]
 ;
 ; SLM-LABEL: @test_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 0, i32 3>
-; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 0, i32 3>
-; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x float> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
-; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
-; SLM-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[TMP7]], [[TMP8]]
-; SLM-NEXT:    [[TMP10:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
-; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
-; SLM-NEXT:    [[TMP12:%.*]] = fadd <2 x float> [[TMP10]], [[TMP11]]
-; SLM-NEXT:    [[R033:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R052:%.*]] = shufflevector <8 x float> [[R033]], <8 x float> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R071:%.*]] = shufflevector <8 x float> [[R052]], <8 x float> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SLM-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
+; SLM-NEXT:    [[R071:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    ret <8 x float> [[R071]]
 ;
 ; AVX-LABEL: @test_v8f32(
@@ -408,48 +367,14 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
 
 define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: @test_v16i16(
-; SSE-NEXT:    [[B0:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 0
-; SSE-NEXT:    [[B1:%.*]] = extractelement <16 x i16> [[B]], i64 1
-; SSE-NEXT:    [[B2:%.*]] = extractelement <16 x i16> [[B]], i64 2
-; SSE-NEXT:    [[B3:%.*]] = extractelement <16 x i16> [[B]], i64 3
-; SSE-NEXT:    [[B4:%.*]] = extractelement <16 x i16> [[B]], i64 4
-; SSE-NEXT:    [[B5:%.*]] = extractelement <16 x i16> [[B]], i64 5
-; SSE-NEXT:    [[B6:%.*]] = extractelement <16 x i16> [[B]], i64 6
-; SSE-NEXT:    [[B7:%.*]] = extractelement <16 x i16> [[B]], i64 7
-; SSE-NEXT:    [[B8:%.*]] = extractelement <16 x i16> [[B]], i64 8
-; SSE-NEXT:    [[B9:%.*]] = extractelement <16 x i16> [[B]], i64 9
-; SSE-NEXT:    [[B10:%.*]] = extractelement <16 x i16> [[B]], i64 10
-; SSE-NEXT:    [[B11:%.*]] = extractelement <16 x i16> [[B]], i64 11
-; SSE-NEXT:    [[B12:%.*]] = extractelement <16 x i16> [[B]], i64 12
-; SSE-NEXT:    [[B13:%.*]] = extractelement <16 x i16> [[B]], i64 13
-; SSE-NEXT:    [[B14:%.*]] = extractelement <16 x i16> [[B]], i64 14
-; SSE-NEXT:    [[B15:%.*]] = extractelement <16 x i16> [[B]], i64 15
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; SSE-NEXT:    [[TMP3:%.*]] = add <4 x i16> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[R4:%.*]] = add i16 [[B0]], [[B1]]
-; SSE-NEXT:    [[R5:%.*]] = add i16 [[B2]], [[B3]]
-; SSE-NEXT:    [[R6:%.*]] = add i16 [[B4]], [[B5]]
-; SSE-NEXT:    [[R7:%.*]] = add i16 [[B6]], [[B7]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 9, i32 11, i32 13, i32 15>
-; SSE-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R12:%.*]] = add i16 [[B8]], [[B9]]
-; SSE-NEXT:    [[R13:%.*]] = add i16 [[B10]], [[B11]]
-; SSE-NEXT:    [[R14:%.*]] = add i16 [[B12]], [[B13]]
-; SSE-NEXT:    [[R15:%.*]] = add i16 [[B14]], [[B15]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[RV4:%.*]] = insertelement <16 x i16> [[TMP7]], i16 [[R4]], i64 4
-; SSE-NEXT:    [[RV5:%.*]] = insertelement <16 x i16> [[RV4]], i16 [[R5]], i64 5
-; SSE-NEXT:    [[RV6:%.*]] = insertelement <16 x i16> [[RV5]], i16 [[R6]], i64 6
-; SSE-NEXT:    [[RV7:%.*]] = insertelement <16 x i16> [[RV6]], i16 [[R7]], i64 7
-; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[RV111:%.*]] = shufflevector <16 x i16> [[RV7]], <16 x i16> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[RV12:%.*]] = insertelement <16 x i16> [[RV111]], i16 [[R12]], i64 12
-; SSE-NEXT:    [[RV13:%.*]] = insertelement <16 x i16> [[RV12]], i16 [[R13]], i64 13
-; SSE-NEXT:    [[RV14:%.*]] = insertelement <16 x i16> [[RV13]], i16 [[R14]], i64 14
-; SSE-NEXT:    [[RV15:%.*]] = insertelement <16 x i16> [[RV14]], i16 [[R15]], i64 15
-; SSE-NEXT:    ret <16 x i16> [[RV15]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; SSE-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; SSE-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]]
+; SSE-NEXT:    [[RV151:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    ret <16 x i16> [[RV151]]
 ;
 ; SLM-LABEL: @test_v16i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
index fa305da9730febb..4822d52086e2708 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
@@ -146,42 +146,24 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) {
 
 define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i64 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
-; SSE-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i64 0
-; SSE-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i64 1
-; SSE-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i64 2
-; SSE-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE-NEXT:    [[R0:%.*]] = fsub double [[A0]], [[A1]]
-; SSE-NEXT:    [[R1:%.*]] = fsub double [[B0]], [[B1]]
-; SSE-NEXT:    [[R2:%.*]] = fsub double [[A2]], [[A3]]
-; SSE-NEXT:    [[R3:%.*]] = fsub double [[B2]], [[B3]]
-; SSE-NEXT:    [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i64 0
-; SSE-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i64 1
-; SSE-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i64 2
-; SSE-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i64 3
-; SSE-NEXT:    ret <4 x double> [[R03]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
+; SSE-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    ret <4 x double> [[R031]]
 ;
 ; SLM-LABEL: @test_v4f64(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i64 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
-; SLM-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i64 0
-; SLM-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i64 1
-; SLM-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i64 2
-; SLM-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SLM-NEXT:    [[R0:%.*]] = fsub double [[A0]], [[A1]]
-; SLM-NEXT:    [[R1:%.*]] = fsub double [[B0]], [[B1]]
-; SLM-NEXT:    [[R2:%.*]] = fsub double [[A2]], [[A3]]
-; SLM-NEXT:    [[R3:%.*]] = fsub double [[B2]], [[B3]]
-; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i64 0
-; SLM-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i64 1
-; SLM-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i64 2
-; SLM-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i64 3
-; SLM-NEXT:    ret <4 x double> [[R03]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SLM-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
+; SLM-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    ret <4 x double> [[R031]]
 ;
 ; AVX-LABEL: @test_v4f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -216,23 +198,13 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-NEXT:    ret <8 x float> [[TMP3]]
 ;
 ; SLM-LABEL: @test_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <2 x i32> <i32 0, i32 2>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 3>
-; SLM-NEXT:    [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 0, i32 2>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 1, i32 3>
-; SLM-NEXT:    [[TMP6:%.*]] = fsub <2 x float> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
-; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
-; SLM-NEXT:    [[TMP9:%.*]] = fsub <2 x float> [[TMP7]], [[TMP8]]
-; SLM-NEXT:    [[TMP10:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
-; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
-; SLM-NEXT:    [[TMP12:%.*]] = fsub <2 x float> [[TMP10]], [[TMP11]]
-; SLM-NEXT:    [[R033:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R052:%.*]] = shufflevector <8 x float> [[R033]], <8 x float> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R071:%.*]] = shufflevector <8 x float> [[R052]], <8 x float> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SLM-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP4]], [[TMP5]]
+; SLM-NEXT:    [[R071:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    ret <8 x float> [[R071]]
 ;
 ; AVX-LABEL: @test_v8f32(
@@ -346,48 +318,14 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
 
 define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: @test_v16i16(
-; SSE-NEXT:    [[B0:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 0
-; SSE-NEXT:    [[B1:%.*]] = extractelement <16 x i16> [[B]], i64 1
-; SSE-NEXT:    [[B2:%.*]] = extractelement <16 x i16> [[B]], i64 2
-; SSE-NEXT:    [[B3:%.*]] = extractelement <16 x i16> [[B]], i64 3
-; SSE-NEXT:    [[B4:%.*]] = extractelement <16 x i16> [[B]], i64 4
-; SSE-NEXT:    [[B5:%.*]] = extractelement <16 x i16> [[B]], i64 5
-; SSE-NEXT:    [[B6:%.*]] = extractelement <16 x i16> [[B]], i64 6
-; SSE-NEXT:    [[B7:%.*]] = extractelement <16 x i16> [[B]], i64 7
-; SSE-NEXT:    [[B8:%.*]] = extractelement <16 x i16> [[B]], i64 8
-; SSE-NEXT:    [[B9:%.*]] = extractelement <16 x i16> [[B]], i64 9
-; SSE-NEXT:    [[B10:%.*]] = extractelement <16 x i16> [[B]], i64 10
-; SSE-NEXT:    [[B11:%.*]] = extractelement <16 x i16> [[B]], i64 11
-; SSE-NEXT:    [[B12:%.*]] = extractelement <16 x i16> [[B]], i64 12
-; SSE-NEXT:    [[B13:%.*]] = extractelement <16 x i16> [[B]], i64 13
-; SSE-NEXT:    [[B14:%.*]] = extractelement <16 x i16> [[B]], i64 14
-; SSE-NEXT:    [[B15:%.*]] = extractelement <16 x i16> [[B]], i64 15
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; SSE-NEXT:    [[TMP3:%.*]] = sub <4 x i16> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[R4:%.*]] = sub i16 [[B0]], [[B1]]
-; SSE-NEXT:    [[R5:%.*]] = sub i16 [[B2]], [[B3]]
-; SSE-NEXT:    [[R6:%.*]] = sub i16 [[B4]], [[B5]]
-; SSE-NEXT:    [[R7:%.*]] = sub i16 [[B6]], [[B7]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 9, i32 11, i32 13, i32 15>
-; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R12:%.*]] = sub i16 [[B8]], [[B9]]
-; SSE-NEXT:    [[R13:%.*]] = sub i16 [[B10]], [[B11]]
-; SSE-NEXT:    [[R14:%.*]] = sub i16 [[B12]], [[B13]]
-; SSE-NEXT:    [[R15:%.*]] = sub i16 [[B14]], [[B15]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[RV4:%.*]] = insertelement <16 x i16> [[TMP7]], i16 [[R4]], i64 4
-; SSE-NEXT:    [[RV5:%.*]] = insertelement <16 x i16> [[RV4]], i16 [[R5]], i64 5
-; SSE-NEXT:    [[RV6:%.*]] = insertelement <16 x i16> [[RV5]], i16 [[R6]], i64 6
-; SSE-NEXT:    [[RV7:%.*]] = insertelement <16 x i16> [[RV6]], i16 [[R7]], i64 7
-; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[RV111:%.*]] = shufflevector <16 x i16> [[RV7]], <16 x i16> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[RV12:%.*]] = insertelement <16 x i16> [[RV111]], i16 [[R12]], i64 12
-; SSE-NEXT:    [[RV13:%.*]] = insertelement <16 x i16> [[RV12]], i16 [[R13]], i64 13
-; SSE-NEXT:    [[RV14:%.*]] = insertelement <16 x i16> [[RV13]], i16 [[R14]], i64 14
-; SSE-NEXT:    [[RV15:%.*]] = insertelement <16 x i16> [[RV14]], i16 [[R15]], i64 15
-; SSE-NEXT:    ret <16 x i16> [[RV15]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; SSE-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; SSE-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]]
+; SSE-NEXT:    [[RV151:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    ret <16 x i16> [[RV151]]
 ;
 ; SLM-LABEL: @test_v16i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
index 27c2fb7b66b2e1e..38be89c0901ec8d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
@@ -146,42 +146,24 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) {
 
 define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i64 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
-; SSE-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i64 0
-; SSE-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i64 1
-; SSE-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i64 2
-; SSE-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE-NEXT:    [[R0:%.*]] = fsub double [[A0]], [[A1]]
-; SSE-NEXT:    [[R1:%.*]] = fsub double [[B0]], [[B1]]
-; SSE-NEXT:    [[R2:%.*]] = fsub double [[A2]], [[A3]]
-; SSE-NEXT:    [[R3:%.*]] = fsub double [[B2]], [[B3]]
-; SSE-NEXT:    [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i64 0
-; SSE-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i64 1
-; SSE-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i64 2
-; SSE-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i64 3
-; SSE-NEXT:    ret <4 x double> [[R03]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
+; SSE-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    ret <4 x double> [[R031]]
 ;
 ; SLM-LABEL: @test_v4f64(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i64 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
-; SLM-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i64 0
-; SLM-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i64 1
-; SLM-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i64 2
-; SLM-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SLM-NEXT:    [[R0:%.*]] = fsub double [[A0]], [[A1]]
-; SLM-NEXT:    [[R1:%.*]] = fsub double [[B0]], [[B1]]
-; SLM-NEXT:    [[R2:%.*]] = fsub double [[A2]], [[A3]]
-; SLM-NEXT:    [[R3:%.*]] = fsub double [[B2]], [[B3]]
-; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i64 0
-; SLM-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i64 1
-; SLM-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i64 2
-; SLM-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i64 3
-; SLM-NEXT:    ret <4 x double> [[R03]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SLM-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
+; SLM-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    ret <4 x double> [[R031]]
 ;
 ; AVX-LABEL: @test_v4f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -216,23 +198,13 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-NEXT:    ret <8 x float> [[TMP3]]
 ;
 ; SLM-LABEL: @test_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <2 x i32> <i32 0, i32 2>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 3>
-; SLM-NEXT:    [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 0, i32 2>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 1, i32 3>
-; SLM-NEXT:    [[TMP6:%.*]] = fsub <2 x float> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
-; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
-; SLM-NEXT:    [[TMP9:%.*]] = fsub <2 x float> [[TMP7]], [[TMP8]]
-; SLM-NEXT:    [[TMP10:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
-; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
-; SLM-NEXT:    [[TMP12:%.*]] = fsub <2 x float> [[TMP10]], [[TMP11]]
-; SLM-NEXT:    [[R033:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R052:%.*]] = shufflevector <8 x float> [[R033]], <8 x float> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R071:%.*]] = shufflevector <8 x float> [[R052]], <8 x float> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SLM-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP4]], [[TMP5]]
+; SLM-NEXT:    [[R071:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    ret <8 x float> [[R071]]
 ;
 ; AVX-LABEL: @test_v8f32(
@@ -346,48 +318,14 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
 
 define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: @test_v16i16(
-; SSE-NEXT:    [[B0:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 0
-; SSE-NEXT:    [[B1:%.*]] = extractelement <16 x i16> [[B]], i64 1
-; SSE-NEXT:    [[B2:%.*]] = extractelement <16 x i16> [[B]], i64 2
-; SSE-NEXT:    [[B3:%.*]] = extractelement <16 x i16> [[B]], i64 3
-; SSE-NEXT:    [[B4:%.*]] = extractelement <16 x i16> [[B]], i64 4
-; SSE-NEXT:    [[B5:%.*]] = extractelement <16 x i16> [[B]], i64 5
-; SSE-NEXT:    [[B6:%.*]] = extractelement <16 x i16> [[B]], i64 6
-; SSE-NEXT:    [[B7:%.*]] = extractelement <16 x i16> [[B]], i64 7
-; SSE-NEXT:    [[B8:%.*]] = extractelement <16 x i16> [[B]], i64 8
-; SSE-NEXT:    [[B9:%.*]] = extractelement <16 x i16> [[B]], i64 9
-; SSE-NEXT:    [[B10:%.*]] = extractelement <16 x i16> [[B]], i64 10
-; SSE-NEXT:    [[B11:%.*]] = extractelement <16 x i16> [[B]], i64 11
-; SSE-NEXT:    [[B12:%.*]] = extractelement <16 x i16> [[B]], i64 12
-; SSE-NEXT:    [[B13:%.*]] = extractelement <16 x i16> [[B]], i64 13
-; SSE-NEXT:    [[B14:%.*]] = extractelement <16 x i16> [[B]], i64 14
-; SSE-NEXT:    [[B15:%.*]] = extractelement <16 x i16> [[B]], i64 15
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; SSE-NEXT:    [[TMP3:%.*]] = sub <4 x i16> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[R4:%.*]] = sub i16 [[B0]], [[B1]]
-; SSE-NEXT:    [[R5:%.*]] = sub i16 [[B2]], [[B3]]
-; SSE-NEXT:    [[R6:%.*]] = sub i16 [[B4]], [[B5]]
-; SSE-NEXT:    [[R7:%.*]] = sub i16 [[B6]], [[B7]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 9, i32 11, i32 13, i32 15>
-; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R12:%.*]] = sub i16 [[B8]], [[B9]]
-; SSE-NEXT:    [[R13:%.*]] = sub i16 [[B10]], [[B11]]
-; SSE-NEXT:    [[R14:%.*]] = sub i16 [[B12]], [[B13]]
-; SSE-NEXT:    [[R15:%.*]] = sub i16 [[B14]], [[B15]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[RV4:%.*]] = insertelement <16 x i16> [[TMP7]], i16 [[R4]], i64 4
-; SSE-NEXT:    [[RV5:%.*]] = insertelement <16 x i16> [[RV4]], i16 [[R5]], i64 5
-; SSE-NEXT:    [[RV6:%.*]] = insertelement <16 x i16> [[RV5]], i16 [[R6]], i64 6
-; SSE-NEXT:    [[RV7:%.*]] = insertelement <16 x i16> [[RV6]], i16 [[R7]], i64 7
-; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[RV111:%.*]] = shufflevector <16 x i16> [[RV7]], <16 x i16> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[RV12:%.*]] = insertelement <16 x i16> [[RV111]], i16 [[R12]], i64 12
-; SSE-NEXT:    [[RV13:%.*]] = insertelement <16 x i16> [[RV12]], i16 [[R13]], i64 13
-; SSE-NEXT:    [[RV14:%.*]] = insertelement <16 x i16> [[RV13]], i16 [[R14]], i64 14
-; SSE-NEXT:    [[RV15:%.*]] = insertelement <16 x i16> [[RV14]], i16 [[R15]], i64 15
-; SSE-NEXT:    ret <16 x i16> [[RV15]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; SSE-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; SSE-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]]
+; SSE-NEXT:    [[RV151:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    ret <16 x i16> [[RV151]]
 ;
 ; SLM-LABEL: @test_v16i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
index 7cbb8261d126bb7..6c7d5e6324ca6ec 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
@@ -28,13 +28,15 @@ define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <
 ;
 ; SSE42-LABEL: @reduce_and4(
 ; SSE42-NEXT:  entry:
-; SSE42-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
-; SSE42-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
-; SSE42-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
-; SSE42-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
-; SSE42-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP3]]
-; SSE42-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
-; SSE42-NEXT:    ret i32 [[OP_RDX1]]
+; SSE42-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V4:%.*]])
+; SSE42-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V3:%.*]])
+; SSE42-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP0]], [[TMP1]]
+; SSE42-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V2:%.*]])
+; SSE42-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP2]]
+; SSE42-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V1:%.*]])
+; SSE42-NEXT:    [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP3]]
+; SSE42-NEXT:    [[OP_RDX3:%.*]] = and i32 [[OP_RDX2]], [[ACC:%.*]]
+; SSE42-NEXT:    ret i32 [[OP_RDX3]]
 ;
 ; AVX-LABEL: @reduce_and4(
 ; AVX-NEXT:  entry:
@@ -101,13 +103,15 @@ define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i
 ; SSE2-NEXT:    ret i32 [[OP_RDX1]]
 ;
 ; SSE42-LABEL: @reduce_and4_transpose(
-; SSE42-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; SSE42-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; SSE42-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
-; SSE42-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
-; SSE42-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP3]], [[TMP4]]
-; SSE42-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
-; SSE42-NEXT:    ret i32 [[OP_RDX1]]
+; SSE42-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V4:%.*]])
+; SSE42-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V3:%.*]])
+; SSE42-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP2]]
+; SSE42-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V2:%.*]])
+; SSE42-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP3]]
+; SSE42-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V1:%.*]])
+; SSE42-NEXT:    [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP4]]
+; SSE42-NEXT:    [[OP_RDX3:%.*]] = and i32 [[OP_RDX2]], [[ACC:%.*]]
+; SSE42-NEXT:    ret i32 [[OP_RDX3]]
 ;
 ; AVX-LABEL: @reduce_and4_transpose(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>


        


More information about the llvm-commits mailing list