[llvm] X86: Remove LowerToHorizontalOp and modified test case (PR #148477)

via llvm-commits llvm-commits at lists.llvm.org
Sun Jul 13 08:19:55 PDT 2025


https://github.com/houngkoungting created https://github.com/llvm/llvm-project/pull/148477

Fixes #143000.
Remove LowerToHorizontalOp and adjust the affected test cases; all tests pass after the change.

@RKSimon 

>From c4b205c93a6403316bcf94a27a7a44b5e8861bcd Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Sun, 13 Jul 2025 21:53:59 +0800
Subject: [PATCH] X86: Remove LowerToHorizontalOp and modified test case

---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 118 ----
 .../PhaseOrdering}/X86/haddsub-2.ll           | 195 +++++-
 .../PhaseOrdering}/X86/haddsub-shuf.ll        | 285 ++++++++-
 .../PhaseOrdering}/X86/haddsub-undef.ll       | 407 +++++++++++-
 .../PhaseOrdering}/X86/haddsub.ll             | 590 +++++++++++++++++-
 .../PhaseOrdering}/X86/phaddsub-undef.ll      |  78 ++-
 6 files changed, 1502 insertions(+), 171 deletions(-)
 rename llvm/test/{CodeGen => Transforms/PhaseOrdering}/X86/haddsub-2.ll (81%)
 rename llvm/test/{CodeGen => Transforms/PhaseOrdering}/X86/haddsub-shuf.ll (73%)
 rename llvm/test/{CodeGen => Transforms/PhaseOrdering}/X86/haddsub-undef.ll (57%)
 rename llvm/test/{CodeGen => Transforms/PhaseOrdering}/X86/haddsub.ll (64%)
 rename llvm/test/{CodeGen => Transforms/PhaseOrdering}/X86/phaddsub-undef.ll (53%)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f8f29b9f2cdc7..677ecf8801e2d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8569,122 +8569,6 @@ static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
   return DAG.getNode(HOpcode, DL, VT, V0, V1);
 }
 
-/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
-static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
-                                   const X86Subtarget &Subtarget,
-                                   SelectionDAG &DAG) {
-  // We need at least 2 non-undef elements to make this worthwhile by default.
-  unsigned NumNonUndefs =
-      count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
-  if (NumNonUndefs < 2)
-    return SDValue();
-
-  // There are 4 sets of horizontal math operations distinguished by type:
-  // int/FP at 128-bit/256-bit. Each type was introduced with a different
-  // subtarget feature. Try to match those "native" patterns first.
-  MVT VT = BV->getSimpleValueType(0);
-  if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
-      ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
-      ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
-      ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
-    unsigned HOpcode;
-    SDValue V0, V1;
-    if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
-      return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
-  }
-
-  // Try harder to match 256-bit ops by using extract/concat.
-  if (!Subtarget.hasAVX() || !VT.is256BitVector())
-    return SDValue();
-
-  // Count the number of UNDEF operands in the build_vector in input.
-  unsigned NumElts = VT.getVectorNumElements();
-  unsigned Half = NumElts / 2;
-  unsigned NumUndefsLO = 0;
-  unsigned NumUndefsHI = 0;
-  for (unsigned i = 0, e = Half; i != e; ++i)
-    if (BV->getOperand(i)->isUndef())
-      NumUndefsLO++;
-
-  for (unsigned i = Half, e = NumElts; i != e; ++i)
-    if (BV->getOperand(i)->isUndef())
-      NumUndefsHI++;
-
-  SDValue InVec0, InVec1;
-  if (VT == MVT::v8i32 || VT == MVT::v16i16) {
-    SDValue InVec2, InVec3;
-    unsigned X86Opcode;
-    bool CanFold = true;
-
-    if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
-        isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
-                              InVec3) &&
-        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
-        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
-      X86Opcode = X86ISD::HADD;
-    else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
-                                   InVec1) &&
-             isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
-                                   InVec3) &&
-             ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
-             ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
-      X86Opcode = X86ISD::HSUB;
-    else
-      CanFold = false;
-
-    if (CanFold) {
-      // Do not try to expand this build_vector into a pair of horizontal
-      // add/sub if we can emit a pair of scalar add/sub.
-      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
-        return SDValue();
-
-      // Convert this build_vector into a pair of horizontal binops followed by
-      // a concat vector. We must adjust the outputs from the partial horizontal
-      // matching calls above to account for undefined vector halves.
-      SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
-      SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
-      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
-      bool isUndefLO = NumUndefsLO == Half;
-      bool isUndefHI = NumUndefsHI == Half;
-      return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
-                                   isUndefHI);
-    }
-  }
-
-  if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
-      VT == MVT::v16i16) {
-    unsigned X86Opcode;
-    if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
-                              InVec1))
-      X86Opcode = X86ISD::HADD;
-    else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
-                                   InVec1))
-      X86Opcode = X86ISD::HSUB;
-    else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
-                                   InVec1))
-      X86Opcode = X86ISD::FHADD;
-    else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
-                                   InVec1))
-      X86Opcode = X86ISD::FHSUB;
-    else
-      return SDValue();
-
-    // Don't try to expand this build_vector into a pair of horizontal add/sub
-    // if we can simply emit a pair of scalar add/sub.
-    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
-      return SDValue();
-
-    // Convert this build_vector into two horizontal add/sub followed by
-    // a concat vector.
-    bool isUndefLO = NumUndefsLO == Half;
-    bool isUndefHI = NumUndefsHI == Half;
-    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
-                                 isUndefLO, isUndefHI);
-  }
-
-  return SDValue();
-}
-
 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG);
 
@@ -9270,8 +9154,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 
   if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
     return AddSub;
-  if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
-    return HorizontalOp;
   if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
     return Broadcast;
   if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/Transforms/PhaseOrdering/X86/haddsub-2.ll
similarity index 81%
rename from llvm/test/CodeGen/X86/haddsub-2.ll
rename to llvm/test/Transforms/PhaseOrdering/X86/haddsub-2.ll
index bca446fa8fb56..4eb5bdba9edb6 100644
--- a/llvm/test/CodeGen/X86/haddsub-2.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/haddsub-2.ll
@@ -1,38 +1,39 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3,+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
 define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
-; SSE-LABEL: hadd_ps_test1:
-; SSE:       # %bb.0:
-; SSE-NEXT:    haddps %xmm1, %xmm0
-; SSE-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hadd_ps_test1(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
 ;
-; AVX-LABEL: hadd_ps_test1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+
   %vecext = extractelement <4 x float> %A, i32 0
   %vecext1 = extractelement <4 x float> %A, i32 1
   %add = fadd float %vecext, %vecext1
   %vecinit = insertelement <4 x float> undef, float %add, i32 0
+
   %vecext2 = extractelement <4 x float> %A, i32 2
   %vecext3 = extractelement <4 x float> %A, i32 3
   %add4 = fadd float %vecext2, %vecext3
   %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
+
   %vecext6 = extractelement <4 x float> %B, i32 0
   %vecext7 = extractelement <4 x float> %B, i32 1
   %add8 = fadd float %vecext6, %vecext7
   %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
+
   %vecext10 = extractelement <4 x float> %B, i32 2
   %vecext11 = extractelement <4 x float> %B, i32 3
   %add12 = fadd float %vecext10, %vecext11
   %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
+
   ret <4 x float> %vecinit13
 }
 
+
 define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
 ; SSE-LABEL: hadd_ps_test2:
 ; SSE:       # %bb.0:
@@ -43,6 +44,13 @@ define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hadd_ps_test2(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
   %vecext = extractelement <4 x float> %A, i32 2
   %vecext1 = extractelement <4 x float> %A, i32 3
   %add = fadd float %vecext, %vecext1
@@ -72,6 +80,13 @@ define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hsub_ps_test1(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
   %vecext = extractelement <4 x float> %A, i32 0
   %vecext1 = extractelement <4 x float> %A, i32 1
   %sub = fsub float %vecext, %vecext1
@@ -101,6 +116,13 @@ define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hsub_ps_test2(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
   %vecext = extractelement <4 x float> %A, i32 2
   %vecext1 = extractelement <4 x float> %A, i32 3
   %sub = fsub float %vecext, %vecext1
@@ -159,6 +181,13 @@ define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x i32> @phadd_d_test1(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+;
   %vecext = extractelement <4 x i32> %A, i32 0
   %vecext1 = extractelement <4 x i32> %A, i32 1
   %add = add i32 %vecext, %vecext1
@@ -217,6 +246,13 @@ define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x i32> @phadd_d_test2(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+;
   %vecext = extractelement <4 x i32> %A, i32 2
   %vecext1 = extractelement <4 x i32> %A, i32 3
   %add = add i32 %vecext, %vecext1
@@ -275,6 +311,13 @@ define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x i32> @phsub_d_test1(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+;
   %vecext = extractelement <4 x i32> %A, i32 0
   %vecext1 = extractelement <4 x i32> %A, i32 1
   %sub = sub i32 %vecext, %vecext1
@@ -333,6 +376,13 @@ define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x i32> @phsub_d_test2(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+;
   %vecext = extractelement <4 x i32> %A, i32 2
   %vecext1 = extractelement <4 x i32> %A, i32 3
   %sub = sub i32 %vecext, %vecext1
@@ -362,6 +412,13 @@ define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @hadd_pd_test1(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[VECINIT2:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[VECINIT2]]
+;
   %vecext = extractelement <2 x double> %A, i32 0
   %vecext1 = extractelement <2 x double> %A, i32 1
   %add = fadd double %vecext, %vecext1
@@ -383,6 +440,13 @@ define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @hadd_pd_test2(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[VECINIT2:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[VECINIT2]]
+;
   %vecext = extractelement <2 x double> %A, i32 1
   %vecext1 = extractelement <2 x double> %A, i32 0
   %add = fadd double %vecext, %vecext1
@@ -404,6 +468,13 @@ define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @hsub_pd_test1(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[VECINIT2:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[VECINIT2]]
+;
   %vecext = extractelement <2 x double> %A, i32 0
   %vecext1 = extractelement <2 x double> %A, i32 1
   %sub = fsub double %vecext, %vecext1
@@ -425,6 +496,13 @@ define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @hsub_pd_test2(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[VECINIT2:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[VECINIT2]]
+;
   %vecext = extractelement <2 x double> %B, i32 0
   %vecext1 = extractelement <2 x double> %B, i32 1
   %sub = fsub double %vecext, %vecext1
@@ -456,6 +534,13 @@ define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
 ; AVX2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @avx_vhadd_pd_test(
+; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[TMP3]]
+;
   %vecext = extractelement <4 x double> %A, i32 0
   %vecext1 = extractelement <4 x double> %A, i32 1
   %add = fadd double %vecext, %vecext1
@@ -495,6 +580,13 @@ define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
 ; AVX2-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @avx_vhsub_pd_test(
+; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[TMP3]]
+;
   %vecext = extractelement <4 x double> %A, i32 0
   %vecext1 = extractelement <4 x double> %A, i32 1
   %sub = fsub double %vecext, %vecext1
@@ -590,6 +682,13 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
 ; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @avx2_vphadd_d_test(
+; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+;
   %vecext = extractelement <8 x i32> %A, i32 0
   %vecext1 = extractelement <8 x i32> %A, i32 1
   %add = add i32 %vecext, %vecext1
@@ -745,6 +844,13 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <16 x i16> @avx2_vphadd_w_test(
+; CHECK-SAME: <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
+;
   %vecext = extractelement <16 x i16> %a, i32 0
   %vecext1 = extractelement <16 x i16> %a, i32 1
   %add = add i16 %vecext, %vecext1
@@ -863,6 +969,13 @@ define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
 ; AVX-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
 ; AVX-NEXT:    vpinsrd $3, %esi, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x i32> @not_a_hsub_1(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+;
   %vecext = extractelement <4 x i32> %A, i32 0
   %vecext1 = extractelement <4 x i32> %A, i32 1
   %sub = sub i32 %vecext, %vecext1
@@ -920,6 +1033,13 @@ define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @not_a_hsub_2(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
   %vecext = extractelement <4 x float> %A, i32 2
   %vecext1 = extractelement <4 x float> %A, i32 3
   %sub = fsub float %vecext, %vecext1
@@ -960,6 +1080,13 @@ define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
 ; AVX-NEXT:    vsubsd %xmm0, %xmm2, %xmm0
 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @not_a_hsub_3(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[VECINIT2:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[VECINIT2]]
+;
   %vecext = extractelement <2 x double> %B, i32 0
   %vecext1 = extractelement <2 x double> %B, i32 1
   %sub = fsub double %vecext, %vecext1
@@ -985,6 +1112,13 @@ define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @avx_vhadd_ps(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
+;
   %vecext = extractelement <8 x float> %a, i32 0
   %vecext1 = extractelement <8 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -1031,6 +1165,13 @@ define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @avx_vhsub_ps(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
+;
   %vecext = extractelement <8 x float> %a, i32 0
   %vecext1 = extractelement <8 x float> %a, i32 1
   %sub = fsub float %vecext, %vecext1
@@ -1077,6 +1218,13 @@ define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @avx_hadd_pd(
+; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[TMP3]]
+;
   %vecext = extractelement <4 x double> %a, i32 0
   %vecext1 = extractelement <4 x double> %a, i32 1
   %add = fadd double %vecext, %vecext1
@@ -1107,6 +1255,13 @@ define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @avx_hsub_pd(
+; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[TMP3]]
+;
   %vecext = extractelement <4 x double> %a, i32 0
   %vecext1 = extractelement <4 x double> %a, i32 1
   %sub = fsub double %vecext, %vecext1
@@ -1202,6 +1357,13 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @avx2_hadd_d(
+; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+;
   %vecext = extractelement <8 x i32> %a, i32 0
   %vecext1 = extractelement <8 x i32> %a, i32 1
   %add = add i32 %vecext, %vecext1
@@ -1355,6 +1517,13 @@ define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <16 x i16> @avx2_hadd_w(
+; CHECK-SAME: <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
+;
   %vecext = extractelement <16 x i16> %a, i32 0
   %vecext1 = extractelement <16 x i16> %a, i32 1
   %add = add i16 %vecext, %vecext1
diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/Transforms/PhaseOrdering/X86/haddsub-shuf.ll
similarity index 73%
rename from llvm/test/CodeGen/X86/haddsub-shuf.ll
rename to llvm/test/Transforms/PhaseOrdering/X86/haddsub-shuf.ll
index 364ad953a11d4..f425550c1c6df 100644
--- a/llvm/test/CodeGen/X86/haddsub-shuf.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/haddsub-shuf.ll
@@ -1,15 +1,5 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3           | FileCheck %s --check-prefixes=SSE,SSE_SLOW,SSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE_FAST,SSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefixes=SSE,SSE_SLOW,SSSE3,SSSE3_SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE_FAST,SSSE3,SSSE3_FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_FAST
-
-; The next 8 tests check for matching the horizontal op and eliminating the shuffle.
-; PR34111 - https://bugs.llvm.org/show_bug.cgi?id=34111
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
 define <4 x float> @hadd_v4f32(<4 x float> %a) {
 ; SSE-LABEL: hadd_v4f32:
@@ -21,6 +11,13 @@ define <4 x float> @hadd_v4f32(<4 x float> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hadd_v4f32(
+; CHECK-SAME: <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
+; CHECK-NEXT:    [[SHUF:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[SHUF]]
+;
   %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
   %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
   %hop = fadd <2 x float> %a02, %a13
@@ -65,6 +62,13 @@ define <8 x float> @hadd_v8f32a(<8 x float> %a) {
 ; AVX2-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @hadd_v8f32a(
+; CHECK-SAME: <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 0, i32 2, i32 poison, i32 poison, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 1, i32 3, i32 poison, i32 poison, i32 5, i32 7>
+; CHECK-NEXT:    [[SHUF:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x float> [[SHUF]]
+;
   %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   %hop = fadd <4 x float> %a0, %a1
@@ -83,6 +87,13 @@ define <8 x float> @hadd_v8f32b(<8 x float> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @hadd_v8f32b(
+; CHECK-SAME: <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 0, i32 2, i32 4, i32 6, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 1, i32 3, i32 1, i32 3, i32 5, i32 7, i32 5, i32 7>
+; CHECK-NEXT:    [[SHUF:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x float> [[SHUF]]
+;
   %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
   %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
   %hop = fadd <8 x float> %a0, %a1
@@ -100,6 +111,13 @@ define <4 x float> @hsub_v4f32(<4 x float> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hsub_v4f32(
+; CHECK-SAME: <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
+; CHECK-NEXT:    [[SHUF:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[SHUF]]
+;
   %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
   %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
   %hop = fsub <2 x float> %a02, %a13
@@ -144,6 +162,13 @@ define <8 x float> @hsub_v8f32a(<8 x float> %a) {
 ; AVX2-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @hsub_v8f32a(
+; CHECK-SAME: <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 0, i32 2, i32 poison, i32 poison, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 1, i32 3, i32 poison, i32 poison, i32 5, i32 7>
+; CHECK-NEXT:    [[SHUF:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x float> [[SHUF]]
+;
   %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   %hop = fsub <4 x float> %a0, %a1
@@ -162,6 +187,13 @@ define <8 x float> @hsub_v8f32b(<8 x float> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @hsub_v8f32b(
+; CHECK-SAME: <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 0, i32 2, i32 4, i32 6, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 1, i32 3, i32 1, i32 3, i32 5, i32 7, i32 5, i32 7>
+; CHECK-NEXT:    [[SHUF:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x float> [[SHUF]]
+;
   %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
   %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
   %hop = fsub <8 x float> %a0, %a1
@@ -206,6 +238,13 @@ define <2 x double> @hadd_v2f64(<2 x double> %a) {
 ; AVX2_FAST:       # %bb.0:
 ; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX2_FAST-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @hadd_v2f64(
+; CHECK-SAME: <2 x double> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[SHUF:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[SHUF]]
+;
   %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
   %hop = fadd <2 x double> %a0, %a1
@@ -250,6 +289,13 @@ define <2 x double> @hadd_v2f64_scalar_splat(<2 x double> %a) {
 ; AVX2_FAST:       # %bb.0:
 ; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX2_FAST-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @hadd_v2f64_scalar_splat(
+; CHECK-SAME: <2 x double> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[SHUF:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[SHUF]]
+;
   %a0 = extractelement <2 x double> %a, i32 0
   %a1 = extractelement <2 x double> %a, i32 1
   %hop = fadd double %a0, %a1
@@ -281,6 +327,13 @@ define <4 x double> @hadd_v4f64_scalar_splat(<4 x double> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @hadd_v4f64_scalar_splat(
+; CHECK-SAME: <4 x double> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[SHUF:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[SHUF]]
+;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
   %hop0 = fadd double %a0, %a1
@@ -335,6 +388,13 @@ define <4 x double> @hadd_v4f64_scalar_broadcast(<4 x double> %a) {
 ; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX2_FAST-NEXT:    vbroadcastsd %xmm0, %ymm0
 ; AVX2_FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @hadd_v4f64_scalar_broadcast(
+; CHECK-SAME: <4 x double> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[SHUF:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[SHUF]]
+;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
   %hop0 = fadd double %a0, %a1
@@ -370,6 +430,13 @@ define <4 x double> @hadd_v4f64(<4 x double> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @hadd_v4f64(
+; CHECK-SAME: <4 x double> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[SHUF:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[SHUF]]
+;
   %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
   %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
   %hop = fadd <4 x double> %a0, %a1
@@ -414,6 +481,12 @@ define <2 x double> @hsub_v2f64(<2 x double> %a) {
 ; AVX2_FAST:       # %bb.0:
 ; AVX2_FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
 ; AVX2_FAST-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @hsub_v2f64(
+; CHECK-SAME: <2 x double> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[SHUF:%.*]] = fsub <2 x double> [[TMP1]], [[A]]
+; CHECK-NEXT:    ret <2 x double> [[SHUF]]
+;
   %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
   %hop = fsub <2 x double> %a0, %a1
@@ -444,6 +517,13 @@ define <4 x double> @hsub_v4f64(<4 x double> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @hsub_v4f64(
+; CHECK-SAME: <4 x double> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[SHUF:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[SHUF]]
+;
   %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
   %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
   %hop = fsub <4 x double> %a0, %a1
@@ -468,6 +548,13 @@ define <4 x i32> @hadd_v4i32(<4 x i32> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x i32> @hadd_v4i32(
+; CHECK-SAME: <4 x i32> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 3>
+; CHECK-NEXT:    [[SHUF:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[SHUF]]
+;
   %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
   %hop = add <4 x i32> %a02, %a13
@@ -524,6 +611,13 @@ define <8 x i32> @hadd_v8i32a(<8 x i32> %a) {
 ; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @hadd_v8i32a(
+; CHECK-SAME: <8 x i32> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 0, i32 2, i32 poison, i32 poison, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 1, i32 3, i32 poison, i32 poison, i32 5, i32 7>
+; CHECK-NEXT:    [[SHUF:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i32> [[SHUF]]
+;
   %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   %hop = add <4 x i32> %a0, %a1
@@ -560,6 +654,13 @@ define <8 x i32> @hadd_v8i32b(<8 x i32> %a) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @hadd_v8i32b(
+; CHECK-SAME: <8 x i32> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 0, i32 2, i32 0, i32 2, i32 4, i32 6, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 1, i32 3, i32 1, i32 3, i32 5, i32 7, i32 5, i32 7>
+; CHECK-NEXT:    [[SHUF:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i32> [[SHUF]]
+;
   %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
   %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
   %hop = add <8 x i32> %a0, %a1
@@ -584,6 +685,13 @@ define <4 x i32> @hsub_v4i32(<4 x i32> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x i32> @hsub_v4i32(
+; CHECK-SAME: <4 x i32> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 poison, i32 2, i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 poison, i32 3, i32 1, i32 poison>
+; CHECK-NEXT:    [[SHUF:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[SHUF]]
+;
   %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
   %hop = sub <4 x i32> %a02, %a13
@@ -640,6 +748,13 @@ define <8 x i32> @hsub_v8i32a(<8 x i32> %a) {
 ; AVX2-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @hsub_v8i32a(
+; CHECK-SAME: <8 x i32> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 0, i32 2, i32 poison, i32 poison, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 1, i32 3, i32 poison, i32 poison, i32 5, i32 7>
+; CHECK-NEXT:    [[SHUF:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i32> [[SHUF]]
+;
   %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   %hop = sub <4 x i32> %a0, %a1
@@ -676,6 +791,13 @@ define <8 x i32> @hsub_v8i32b(<8 x i32> %a) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vphsubd %ymm0, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @hsub_v8i32b(
+; CHECK-SAME: <8 x i32> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 0, i32 2, i32 0, i32 2, i32 4, i32 6, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 1, i32 3, i32 1, i32 3, i32 5, i32 7, i32 5, i32 7>
+; CHECK-NEXT:    [[SHUF:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i32> [[SHUF]]
+;
   %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
   %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
   %hop = sub <8 x i32> %a0, %a1
@@ -705,6 +827,13 @@ define <8 x i16> @hadd_v8i16(<8 x i16> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x i16> @hadd_v8i16(
+; CHECK-SAME: <8 x i16> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[SHUF:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i16> [[SHUF]]
+;
   %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
   %hop = add <8 x i16> %a0246, %a1357
@@ -768,6 +897,13 @@ define <16 x i16> @hadd_v16i16a(<16 x i16> %a) {
 ; AVX2-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <16 x i16> @hadd_v16i16a(
+; CHECK-SAME: <16 x i16> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[SHUF:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <16 x i16> [[SHUF]]
+;
   %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
   %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
   %hop = add <8 x i16> %a0, %a1
@@ -820,6 +956,13 @@ define <16 x i16> @hadd_v16i16b(<16 x i16> %a) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vphaddw %ymm0, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <16 x i16> @hadd_v16i16b(
+; CHECK-SAME: <16 x i16> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[SHUF:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <16 x i16> [[SHUF]]
+;
   %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
   %hop = add <16 x i16> %a0, %a1
@@ -845,6 +988,13 @@ define <8 x i16> @hsub_v8i16(<8 x i16> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x i16> @hsub_v8i16(
+; CHECK-SAME: <8 x i16> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 4, i32 poison, i32 poison, i32 2, i32 poison, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 5, i32 poison, i32 poison, i32 3, i32 poison, i32 7>
+; CHECK-NEXT:    [[SHUF:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i16> [[SHUF]]
+;
   %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
   %hop = sub <8 x i16> %a0246, %a1357
@@ -908,6 +1058,13 @@ define <16 x i16> @hsub_v16i16a(<16 x i16> %a) {
 ; AVX2-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <16 x i16> @hsub_v16i16a(
+; CHECK-SAME: <16 x i16> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[SHUF:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <16 x i16> [[SHUF]]
+;
   %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
   %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
   %hop = sub <8 x i16> %a0, %a1
@@ -960,6 +1117,13 @@ define <16 x i16> @hsub_v16i16b(<16 x i16> %a) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vphsubw %ymm0, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <16 x i16> @hsub_v16i16b(
+; CHECK-SAME: <16 x i16> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[SHUF:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <16 x i16> [[SHUF]]
+;
   %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
   %hop = sub <16 x i16> %a0, %a1
@@ -985,6 +1149,12 @@ define <4 x float> @broadcast_haddps_v4f32(<4 x float> %a0) {
 ; AVX2-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @broadcast_haddps_v4f32(
+; CHECK-SAME: <4 x float> [[A0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> [[A0]], <4 x float> [[A0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
+;
   %1 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a0)
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
   ret <4 x float> %2
@@ -1002,6 +1172,13 @@ define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @PR34724_1(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 2, i32 4, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 6>
+; CHECK-NEXT:    [[VECINIT13:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[VECINIT13]]
+;
   %t0 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 2, i32 4>
   %t1 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 3, i32 5>
   %t2 = fadd <2 x float> %t0, %t1
@@ -1022,6 +1199,13 @@ define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @PR34724_2(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 2, i32 4, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 6>
+; CHECK-NEXT:    [[VECINIT13:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[VECINIT13]]
+;
   %t0 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 4, i32 undef, i32 undef>
   %t1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 5, i32 undef, i32 undef>
   %t2 = fadd <4 x float> %t0, %t1
@@ -1051,6 +1235,13 @@ define <4 x float> @hadd_4f32_v8f32_shuffle(<8 x float> %a0) {
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hadd_4f32_v8f32_shuffle(
+; CHECK-SAME: <8 x float> [[A0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HADD0:%.*]] = shufflevector <8 x float> [[A0]], <8 x float> poison, <4 x i32> <i32 2, i32 2, i32 6, i32 6>
+; CHECK-NEXT:    [[HADD1:%.*]] = shufflevector <8 x float> [[A0]], <8 x float> poison, <4 x i32> <i32 3, i32 3, i32 7, i32 7>
+; CHECK-NEXT:    [[HADD:%.*]] = fadd <4 x float> [[HADD0]], [[HADD1]]
+; CHECK-NEXT:    ret <4 x float> [[HADD]]
+;
   %shuf256 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
   %lo = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %hi = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1074,6 +1265,13 @@ define <4 x float> @hsub_4f32_v8f32_shuffle(<8 x float> %a0) {
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hsub_4f32_v8f32_shuffle(
+; CHECK-SAME: <8 x float> [[A0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HSUB0:%.*]] = shufflevector <8 x float> [[A0]], <8 x float> poison, <4 x i32> <i32 2, i32 2, i32 6, i32 6>
+; CHECK-NEXT:    [[HSUB1:%.*]] = shufflevector <8 x float> [[A0]], <8 x float> poison, <4 x i32> <i32 3, i32 3, i32 7, i32 7>
+; CHECK-NEXT:    [[HSUB:%.*]] = fadd <4 x float> [[HSUB0]], [[HSUB1]]
+; CHECK-NEXT:    ret <4 x float> [[HSUB]]
+;
   %shuf256 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
   %lo = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %hi = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1113,6 +1311,13 @@ define <4 x i32> @hadd_4i32_v8i32_shuffle(<8 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <4 x i32> @hadd_4i32_v8i32_shuffle(
+; CHECK-SAME: <8 x i32> [[A0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HADD0:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> poison, <4 x i32> <i32 2, i32 2, i32 6, i32 6>
+; CHECK-NEXT:    [[HADD1:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> poison, <4 x i32> <i32 3, i32 3, i32 7, i32 7>
+; CHECK-NEXT:    [[HADD:%.*]] = add <4 x i32> [[HADD0]], [[HADD1]]
+; CHECK-NEXT:    ret <4 x i32> [[HADD]]
+;
   %shuf256 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
   %lo = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %hi = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1152,6 +1357,13 @@ define <4 x i32> @hsub_4i32_v8i32_shuffle(<8 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <4 x i32> @hsub_4i32_v8i32_shuffle(
+; CHECK-SAME: <8 x i32> [[A0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HSUB0:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> poison, <4 x i32> <i32 2, i32 2, i32 6, i32 6>
+; CHECK-NEXT:    [[HSUB1:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> poison, <4 x i32> <i32 3, i32 3, i32 7, i32 7>
+; CHECK-NEXT:    [[HSUB:%.*]] = add <4 x i32> [[HSUB0]], [[HSUB1]]
+; CHECK-NEXT:    ret <4 x i32> [[HSUB]]
+;
   %shuf256 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
   %lo = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %hi = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1185,6 +1397,13 @@ define <4 x double> @hadd_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1)
 ; AVX2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @hadd_4f64_v4f64_shuffle(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HADD0:%.*]] = shufflevector <4 x double> [[A0]], <4 x double> [[A1]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[HADD1:%.*]] = shufflevector <4 x double> [[A0]], <4 x double> [[A1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[HADD:%.*]] = fadd <4 x double> [[HADD0]], [[HADD1]]
+; CHECK-NEXT:    ret <4 x double> [[HADD]]
+;
   %shuf0 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   %shuf1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %hadd0 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -1213,6 +1432,13 @@ define <4 x double> @hsub_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1)
 ; AVX2-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @hsub_4f64_v4f64_shuffle(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HADD0:%.*]] = shufflevector <4 x double> [[A0]], <4 x double> [[A1]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[HADD1:%.*]] = shufflevector <4 x double> [[A0]], <4 x double> [[A1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[HADD:%.*]] = fsub <4 x double> [[HADD0]], [[HADD1]]
+; CHECK-NEXT:    ret <4 x double> [[HADD]]
+;
   %shuf0 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   %shuf1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %hadd0 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -1241,6 +1467,13 @@ define <8 x float> @hadd_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) {
 ; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @hadd_8f32_v8f32_shuffle(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HADD0:%.*]] = shufflevector <8 x float> [[A0]], <8 x float> [[A1]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[HADD1:%.*]] = shufflevector <8 x float> [[A0]], <8 x float> [[A1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[HADD:%.*]] = fadd <8 x float> [[HADD0]], [[HADD1]]
+; CHECK-NEXT:    ret <8 x float> [[HADD]]
+;
   %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   %shuf1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %hadd0 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
@@ -1269,6 +1502,13 @@ define <8 x float> @hsub_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) {
 ; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @hsub_8f32_v8f32_shuffle(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HSUB0:%.*]] = shufflevector <8 x float> [[A0]], <8 x float> [[A1]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[HSUB1:%.*]] = shufflevector <8 x float> [[A0]], <8 x float> [[A1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[HSUB:%.*]] = fadd <8 x float> [[HSUB0]], [[HSUB1]]
+; CHECK-NEXT:    ret <8 x float> [[HSUB]]
+;
   %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   %shuf1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %hsub0 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
@@ -1312,6 +1552,13 @@ define <8 x i32> @hadd_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) {
 ; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @hadd_8i32_v8i32_shuffle(
+; CHECK-SAME: <8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HADD0:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> [[A1]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[HADD1:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> [[A1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[HADD:%.*]] = add <8 x i32> [[HADD0]], [[HADD1]]
+; CHECK-NEXT:    ret <8 x i32> [[HADD]]
+;
   %shuf0 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   %shuf1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %hadd0 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
@@ -1356,6 +1603,13 @@ define <8 x i32> @hsub_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) {
 ; AVX2-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @hsub_8i32_v8i32_shuffle(
+; CHECK-SAME: <8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HADD0:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> [[A1]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[HADD1:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> [[A1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[HADD:%.*]] = sub <8 x i32> [[HADD0]], [[HADD1]]
+; CHECK-NEXT:    ret <8 x i32> [[HADD]]
+;
   %shuf0 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   %shuf1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %hadd0 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
@@ -1413,6 +1667,13 @@ define <16 x i16> @hadd_16i16_16i16_shuffle(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <16 x i16> @hadd_16i16_16i16_shuffle(
+; CHECK-SAME: <16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HADD0:%.*]] = shufflevector <16 x i16> [[A0]], <16 x i16> [[A1]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:    [[HADD1:%.*]] = shufflevector <16 x i16> [[A0]], <16 x i16> [[A1]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:    [[HADD:%.*]] = add <16 x i16> [[HADD0]], [[HADD1]]
+; CHECK-NEXT:    ret <16 x i16> [[HADD]]
+;
   %shuf0 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
   %shuf1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %hadd0 = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/Transforms/PhaseOrdering/X86/haddsub-undef.ll
similarity index 57%
rename from llvm/test/CodeGen/X86/haddsub-undef.ll
rename to llvm/test/Transforms/PhaseOrdering/X86/haddsub-undef.ll
index 94fa81742ba71..678b0a10717ac 100644
--- a/llvm/test/CodeGen/X86/haddsub-undef.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/haddsub-undef.ll
@@ -1,12 +1,5 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3               | FileCheck %s --check-prefixes=SSE,SSE-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops     | FileCheck %s --check-prefixes=SSE,SSE-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx                | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops      | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f            | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST
-
-; Verify that we correctly fold horizontal binop even in the presence of UNDEFs.
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
 define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) {
 ; SSE-LABEL: test1_undef:
@@ -18,6 +11,19 @@ define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @test1_undef(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x float> [[TMP1]], float undef, i64 2
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[A]], [[SHIFT1]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <4 x float> [[VECINIT3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 6, i32 2, i32 poison>
+; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
+; CHECK-NEXT:    [[VECINIT13:%.*]] = shufflevector <4 x float> [[VECINIT5]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[VECINIT13]]
+;
   %vecext = extractelement <4 x float> %a, i32 0
   %vecext1 = extractelement <4 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -43,6 +49,19 @@ define <4 x float> @test2_undef(<4 x float> %a, <4 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @test2_undef(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x float> [[TMP1]], float undef, i64 1
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
+; CHECK-NEXT:    [[VECINIT9:%.*]] = shufflevector <4 x float> [[VECINIT3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
+; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
+; CHECK-NEXT:    [[VECINIT13:%.*]] = shufflevector <4 x float> [[VECINIT9]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[VECINIT13]]
+;
   %vecext = extractelement <4 x float> %a, i32 0
   %vecext1 = extractelement <4 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -68,6 +87,19 @@ define <4 x float> @test3_undef(<4 x float> %a, <4 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @test3_undef(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x float> [[TMP1]], float undef, i64 3
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[A]], [[SHIFT1]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <4 x float> [[VECINIT3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 6, i32 poison, i32 3>
+; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[B]], [[SHIFT2]]
+; CHECK-NEXT:    [[VECINIT9:%.*]] = shufflevector <4 x float> [[VECINIT5]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[VECINIT9]]
+;
   %vecext = extractelement <4 x float> %a, i32 0
   %vecext1 = extractelement <4 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -105,6 +137,13 @@ define <4 x float> @test4_undef(<4 x float> %a, <4 x float> %b) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @test4_undef(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> <float poison, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[VECINIT]]
+;
   %vecext = extractelement <4 x float> %a, i32 0
   %vecext1 = extractelement <4 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -135,6 +174,13 @@ define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @test5_undef(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <2 x double> [[TMP1]], double undef, i64 1
+; CHECK-NEXT:    ret <2 x double> [[VECINIT1]]
+;
   %vecext = extractelement <2 x double> %a, i32 0
   %vecext1 = extractelement <2 x double> %a, i32 1
   %add = fadd double %vecext, %vecext1
@@ -152,6 +198,16 @@ define <4 x float> @test6_undef(<4 x float> %a, <4 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @test6_undef(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> <float poison, float poison, float undef, float undef>, <4 x i32> <i32 0, i32 poison, i32 6, i32 7>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[A]], [[SHIFT1]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <4 x float> [[VECINIT]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[VECINIT5]]
+;
   %vecext = extractelement <4 x float> %a, i32 0
   %vecext1 = extractelement <4 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -173,6 +229,16 @@ define <4 x float> @test7_undef(<4 x float> %a, <4 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @test7_undef(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[B]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> <float undef, float undef, float poison, float poison>, <4 x i32> <i32 4, i32 5, i32 0, i32 poison>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[SHIFT1]], [[B]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <4 x float> [[VECINIT]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[VECINIT5]]
+;
   %vecext = extractelement <4 x float> %b, i32 0
   %vecext1 = extractelement <4 x float> %b, i32 1
   %add = fadd float %vecext, %vecext1
@@ -218,6 +284,16 @@ define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @test8_undef(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> <float poison, float undef, float poison, float undef>, <4 x i32> <i32 0, i32 5, i32 poison, i32 7>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[A]], [[SHIFT1]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <4 x float> [[VECINIT]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[VECINIT5]]
+;
   %vecext = extractelement <4 x float> %a, i32 0
   %vecext1 = extractelement <4 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -239,6 +315,16 @@ define <4 x float> @test9_undef(<4 x float> %a, <4 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @test9_undef(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> <float poison, float undef, float undef, float poison>, <4 x i32> <i32 0, i32 5, i32 6, i32 poison>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[SHIFT1]], [[B]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <4 x float> [[VECINIT]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[VECINIT5]]
+;
   %vecext = extractelement <4 x float> %a, i32 0
   %vecext1 = extractelement <4 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -260,6 +346,16 @@ define <8 x float> @test10_undef(<8 x float> %a, <8 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @test10_undef(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> <float poison, float undef, float undef, float poison, float undef, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 9, i32 10, i32 poison, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <8 x float> [[SHIFT1]], [[B]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <8 x float> [[VECINIT]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[VECINIT5]]
+;
   %vecext = extractelement <8 x float> %a, i32 0
   %vecext1 = extractelement <8 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -292,6 +388,16 @@ define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @test11_undef(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> <float poison, float undef, float undef, float undef, float undef, float undef, float poison, float undef>, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 poison, i32 15>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <8 x float> [[B]], [[SHIFT1]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <8 x float> [[VECINIT]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 12, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[VECINIT5]]
+;
   %vecext = extractelement <8 x float> %a, i32 0
   %vecext1 = extractelement <8 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -313,6 +419,16 @@ define <8 x float> @test12_undef(<8 x float> %a, <8 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @test12_undef(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> <float poison, float poison, float undef, float undef, float undef, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 poison, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <8 x float> [[A]], [[SHIFT1]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <8 x float> [[VECINIT]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 10, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[VECINIT5]]
+;
   %vecext = extractelement <8 x float> %a, i32 0
   %vecext1 = extractelement <8 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -335,6 +451,14 @@ define <8 x float> @test13_undef(<8 x float> %a, <8 x float> %b) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @test13_undef(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[TMP4]]
+;
   %vecext = extractelement <8 x float> %a, i32 0
   %vecext1 = extractelement <8 x float> %a, i32 1
   %add1 = fadd float %vecext, %vecext1
@@ -389,6 +513,14 @@ define <16 x float> @test13_v16f32_undef(<16 x float> %a, <16 x float> %b) {
 ; AVX512-SLOW-NEXT:    vaddss %xmm0, %xmm2, %xmm0
 ; AVX512-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; AVX512-SLOW-NEXT:    retq
+; CHECK-LABEL: define <16 x float> @test13_v16f32_undef(
+; CHECK-SAME: <16 x float> [[A:%.*]], <16 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    ret <16 x float> [[TMP4]]
+;
   %vecext = extractelement <16 x float> %a, i32 0
   %vecext1 = extractelement <16 x float> %a, i32 1
   %add1 = fadd float %vecext, %vecext1
@@ -429,6 +561,12 @@ define <2 x double> @add_pd_003(<2 x double> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @add_pd_003(
+; CHECK-SAME: <2 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[L:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[ADD:%.*]] = fadd <2 x double> [[X]], [[L]]
+; CHECK-NEXT:    ret <2 x double> [[ADD]]
+;
   %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
   %add = fadd <2 x double> %l, %x
   ret <2 x double> %add
@@ -459,6 +597,12 @@ define <2 x double> @add_pd_003_2(<2 x double> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @add_pd_003_2(
+; CHECK-SAME: <2 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[L:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[ADD:%.*]] = fadd <2 x double> [[X]], [[L]]
+; CHECK-NEXT:    ret <2 x double> [[ADD]]
+;
   %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 0>
   %add = fadd <2 x double> %l, %x
   ret <2 x double> %add
@@ -481,6 +625,12 @@ define <2 x double> @add_pd_010(<2 x double> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @add_pd_010(
+; CHECK-SAME: <2 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = fadd <2 x double> [[TMP1]], [[X]]
+; CHECK-NEXT:    ret <2 x double> [[SHUFFLE2]]
+;
   %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
   %add = fadd <2 x double> %l, %x
   %shuffle2 = shufflevector <2 x double> %add, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
@@ -497,6 +647,13 @@ define <4 x float> @add_ps_007(<4 x float> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @add_ps_007(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[L:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 2>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
+; CHECK-NEXT:    [[ADD:%.*]] = fadd <4 x float> [[L]], [[R]]
+; CHECK-NEXT:    ret <4 x float> [[ADD]]
+;
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
   %add = fadd <4 x float> %l, %r
@@ -530,6 +687,13 @@ define <4 x float> @add_ps_030(<4 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @add_ps_030(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 3, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[SHUFFLE2]]
+;
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
   %add = fadd <4 x float> %l, %r
@@ -547,6 +711,13 @@ define <4 x float> @add_ps_007_2(<4 x float> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @add_ps_007_2(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[L:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 poison>
+; CHECK-NEXT:    [[ADD:%.*]] = fadd <4 x float> [[L]], [[R]]
+; CHECK-NEXT:    ret <4 x float> [[ADD]]
+;
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
   %add = fadd <4 x float> %l, %r
@@ -575,6 +746,12 @@ define <4 x float> @add_ps_008(<4 x float> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @add_ps_008(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[L:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[ADD:%.*]] = fadd <4 x float> [[X]], [[L]]
+; CHECK-NEXT:    ret <4 x float> [[ADD]]
+;
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
   %add = fadd <4 x float> %l, %x
   ret <4 x float> %add
@@ -593,6 +770,13 @@ define <4 x float> @add_ps_016(<4 x float> %0, <4 x float> %1) {
 ; AVX-NEXT:    vhaddps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,3,3]
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @add_ps_016(
+; CHECK-SAME: <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP0]], <4 x i32> <i32 2, i32 0, i32 6, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP0]], <4 x i32> <i32 3, i32 1, i32 7, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret <4 x float> [[TMP5]]
+;
   %3 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> <i32 0, i32 6>
   %4 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> <i32 1, i32 7>
   %5 = fadd <2 x float> %3, %4
@@ -630,6 +814,13 @@ define <4 x float> @add_ps_017(<4 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @add_ps_017(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[SHUFFLE2]]
+;
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
   %add = fadd <4 x float> %l, %x
   %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
@@ -660,6 +851,12 @@ define <4 x float> @add_ps_018(<4 x float> %x) {
 ; AVX512-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0
 ; AVX512-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @add_ps_018(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 poison, i32 poison>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = fadd <4 x float> [[TMP1]], [[X]]
+; CHECK-NEXT:    ret <4 x float> [[SHUFFLE2]]
+;
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
   %add = fadd <4 x float> %l, %r
@@ -704,6 +901,13 @@ define <4 x double> @add_pd_011(<4 x double> %0, <4 x double> %1) {
 ; AVX512-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
 ; AVX512-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @add_pd_011(
+; CHECK-SAME: <4 x double> [[TMP0:%.*]], <4 x double> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 poison, i32 4, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <4 x i32> <i32 1, i32 poison, i32 5, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x double> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret <4 x double> [[TMP5]]
+;
   %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 undef, i32 4, i32 undef>
   %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 1, i32 undef, i32 5, i32 undef>
   %5 = fadd <4 x double> %3, %4
@@ -722,6 +926,18 @@ define <4 x float> @v8f32_inputs_v4f32_output_0101(<8 x float> %a, <8 x float> %
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @v8f32_inputs_v4f32_output_0101(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[R0:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> <float poison, float undef, float poison, float undef>, <4 x i32> <i32 0, i32 5, i32 poison, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[R0]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
   %b0 = extractelement <8 x float> %b, i32 0
@@ -744,6 +960,17 @@ define <4 x float> @v8f32_input0_v4f32_output_0123(<8 x float> %a, <4 x float> %
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @v8f32_input0_v4f32_output_0123(
+; CHECK-SAME: <8 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT1]], [[B]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[R0:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> <float poison, float undef, float undef, float poison>, <4 x i32> <i32 0, i32 5, i32 6, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[R0]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
   %b2 = extractelement <4 x float> %b, i32 2
@@ -766,6 +993,17 @@ define <4 x float> @v8f32_input1_v4f32_output_2301(<4 x float> %a, <8 x float> %
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @v8f32_input1_v4f32_output_2301(
+; CHECK-SAME: <4 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[R1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> <float undef, float poison, float poison, float undef>, <4 x i32> <i32 4, i32 2, i32 poison, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[R1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a2 = extractelement <4 x float> %a, i32 2
   %a3 = extractelement <4 x float> %a, i32 3
   %b0 = extractelement <8 x float> %b, i32 0
@@ -788,6 +1026,18 @@ define <4 x float> @v8f32_inputs_v4f32_output_2323(<8 x float> %a, <8 x float> %
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @v8f32_inputs_v4f32_output_2323(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 2, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[R1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> <float undef, float poison, float undef, float poison>, <4 x i32> <i32 4, i32 2, i32 6, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[R1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a2 = extractelement <8 x float> %a, i32 2
   %a3 = extractelement <8 x float> %a, i32 3
   %b2 = extractelement <8 x float> %b, i32 2
@@ -822,6 +1072,18 @@ define <4 x float> @v16f32_inputs_v4f32_output_0123(<16 x float> %a, <16 x float
 ; AVX512-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @v16f32_inputs_v4f32_output_0123(
+; CHECK-SAME: <16 x float> [[A:%.*]], <16 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[R0:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> <float poison, float undef, float undef, float poison>, <4 x i32> <i32 0, i32 5, i32 6, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[R0]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a0 = extractelement <16 x float> %a, i32 0
   %a1 = extractelement <16 x float> %a, i32 1
   %b2 = extractelement <16 x float> %b, i32 2
@@ -853,6 +1115,18 @@ define <8 x float> @v16f32_inputs_v8f32_output_4567(<16 x float> %a, <16 x float
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @v16f32_inputs_v8f32_output_4567(
+; CHECK-SAME: <16 x float> [[A:%.*]], <16 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[R4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> <float undef, float undef, float undef, float undef, float poison, float undef, float undef, float poison>, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 13, i32 14, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 6>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <8 x float> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[R4]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
   %a4 = extractelement <16 x float> %a, i32 4
   %a5 = extractelement <16 x float> %a, i32 5
   %b6 = extractelement <16 x float> %b, i32 6
@@ -874,6 +1148,16 @@ define <8 x float> @PR40243(<8 x float> %a, <8 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @PR40243(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <8 x float> [[SHIFT1]], [[B]]
+; CHECK-NEXT:    [[R4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> <float undef, float undef, float undef, float undef, float poison, float undef, float undef, float poison>, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 13, i32 14, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[R4]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
   %a4 = extractelement <8 x float> %a, i32 4
   %a5 = extractelement <8 x float> %a, i32 5
   %add4 = fadd float %a4, %a5
@@ -921,6 +1205,13 @@ define <4 x double> @PR44694(<4 x double> %0, <4 x double> %1) {
 ; AVX512-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @PR44694(
+; CHECK-SAME: <4 x double> [[TMP0:%.*]], <4 x double> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x double> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret <4 x double> [[TMP5]]
+;
   %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
   %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
   %5 = fadd <4 x double> %3, %4
@@ -952,6 +1243,13 @@ define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @PR45747_1(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[SHUFFLE]]
+;
   %t0 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
   %t1 = fadd <4 x float> %t0, %a
   %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
@@ -985,6 +1283,13 @@ define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind {
 ; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm0
 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @PR45747_2(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[SHUFFLE]]
+;
   %t0 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
   %t1 = fadd <4 x float> %t0, %b
   %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
@@ -1001,6 +1306,13 @@ define <4 x float> @PR34724_add_v4f32_u123(<4 x float> %0, <4 x float> %1) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @PR34724_add_v4f32_u123(
+; CHECK-SAME: <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> <i32 poison, i32 2, i32 4, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> <i32 poison, i32 3, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret <4 x float> [[TMP5]]
+;
   %3 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 2, i32 4>
   %4 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 3, i32 5>
   %5 = fadd <2 x float> %3, %4
@@ -1040,6 +1352,13 @@ define <4 x float> @PR34724_add_v4f32_0u23(<4 x float> %0, <4 x float> %1) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @PR34724_add_v4f32_0u23(
+; CHECK-SAME: <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 poison, i32 4, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> <i32 1, i32 poison, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret <4 x float> [[TMP5]]
+;
   %3 = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   %4 = fadd <4 x float> %3, %0
   %5 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -1061,6 +1380,13 @@ define <4 x float> @PR34724_add_v4f32_01u3(<4 x float> %0, <4 x float> %1) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @PR34724_add_v4f32_01u3(
+; CHECK-SAME: <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 2, i32 poison, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> <i32 1, i32 3, i32 poison, i32 6>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret <4 x float> [[TMP5]]
+;
   %3 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 0, i32 2>
   %4 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 1, i32 3>
   %5 = fadd <2 x float> %3, %4
@@ -1081,6 +1407,13 @@ define <4 x float> @PR34724_add_v4f32_012u(<4 x float> %0, <4 x float> %1) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @PR34724_add_v4f32_012u(
+; CHECK-SAME: <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret <4 x float> [[TMP5]]
+;
   %3 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 0, i32 2>
   %4 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 1, i32 3>
   %5 = fadd <2 x float> %3, %4
@@ -1129,6 +1462,20 @@ define <4 x double> @PR34724_add_v4f64_u123(<4 x double> %0, <4 x double> %1) {
 ; AVX-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX-FAST-NEXT:    vhaddpd %ymm0, %ymm1, %ymm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @PR34724_add_v4f64_u123(
+; CHECK-SAME: <4 x double> [[TMP0:%.*]], <4 x double> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <2 x i32> <i32 2, i32 4>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <2 x i32> <i32 3, i32 5>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> <double undef, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 0, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 5, i32 poison>
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x double> [[SHIFT]], [[TMP1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x double> [[TMP11]]
+;
   %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 2, i32 4>
   %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 3, i32 5>
   %5 = fadd <2 x double> %3, %4
@@ -1176,6 +1523,20 @@ define <4 x double> @PR34724_add_v4f64_0u23(<4 x double> %0, <4 x double> %1) {
 ; AVX-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX-FAST-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @PR34724_add_v4f64_0u23(
+; CHECK-SAME: <4 x double> [[TMP0:%.*]], <4 x double> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <2 x i32> <i32 1, i32 5>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double undef, i64 1
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 5, i32 poison>
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x double> [[SHIFT]], [[TMP1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x double> [[TMP11]]
+;
   %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 0, i32 4>
   %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 1, i32 5>
   %5 = fadd <2 x double> %3, %4
@@ -1230,6 +1591,20 @@ define <4 x double> @PR34724_add_v4f64_01u3(<4 x double> %0, <4 x double> %1) {
 ; AVX512-FAST-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
 ; AVX512-FAST-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
 ; AVX512-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @PR34724_add_v4f64_01u3(
+; CHECK-SAME: <4 x double> [[TMP0:%.*]], <4 x double> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double undef, i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 0, i32 5, i32 2, i32 poison>
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x double> [[SHIFT]], [[TMP1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x double> [[TMP11]]
+;
   %3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 2>
   %4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 1, i32 3>
   %5 = fadd <2 x double> %3, %4
@@ -1276,6 +1651,20 @@ define <4 x double> @PR34724_add_v4f64_012u(<4 x double> %0, <4 x double> %1) {
 ; AVX-FAST-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
 ; AVX-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @PR34724_add_v4f64_012u(
+; CHECK-SAME: <4 x double> [[TMP0:%.*]], <4 x double> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double undef, i64 3
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 0, i32 5, i32 poison, i32 3>
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x double> [[TMP1]], [[SHIFT]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT:    ret <4 x double> [[TMP11]]
+;
   %3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 2>
   %4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 1, i32 3>
   %5 = fadd <2 x double> %3, %4
diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/haddsub.ll
similarity index 64%
rename from llvm/test/CodeGen/X86/haddsub.ll
rename to llvm/test/Transforms/PhaseOrdering/X86/haddsub.ll
index a0778195b5c73..91289087689ef 100644
--- a/llvm/test/CodeGen/X86/haddsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/haddsub.ll
@@ -1,12 +1,5 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3               | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops     | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx                | FileCheck %s --check-prefixes=AVX,AVX-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops      | FileCheck %s --check-prefixes=AVX,AVX-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2               | FileCheck %s --check-prefixes=AVX,AVX-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops     | FileCheck %s --check-prefixes=AVX,AVX-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f            | FileCheck %s --check-prefixes=AVX,AVX-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX-FAST
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
 define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
 ; SSE3-LABEL: haddpd1:
@@ -18,6 +11,13 @@ define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @haddpd1(
+; CHECK-SAME: <2 x double> [[X:%.*]], <2 x double> [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x double> [[X]], <2 x double> [[Y]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <2 x double> [[X]], <2 x double> [[Y]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = fadd <2 x double> [[A]], [[B]]
+; CHECK-NEXT:    ret <2 x double> [[R]]
+;
   %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
   %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
   %r = fadd <2 x double> %a, %b
@@ -34,6 +34,13 @@ define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @haddpd2(
+; CHECK-SAME: <2 x double> [[X:%.*]], <2 x double> [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x double> [[X]], <2 x double> [[Y]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <2 x double> [[X]], <2 x double> [[Y]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = fadd <2 x double> [[A]], [[B]]
+; CHECK-NEXT:    ret <2 x double> [[R]]
+;
   %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 2>
   %b = shufflevector <2 x double> %y, <2 x double> %x, <2 x i32> <i32 2, i32 1>
   %r = fadd <2 x double> %a, %b
@@ -63,6 +70,13 @@ define <2 x double> @haddpd3(<2 x double> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @haddpd3(
+; CHECK-SAME: <2 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = fadd <2 x double> [[A]], [[B]]
+; CHECK-NEXT:    ret <2 x double> [[R]]
+;
   %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
   %r = fadd <2 x double> %a, %b
@@ -79,6 +93,13 @@ define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @haddps1(
+; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = fadd <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   %r = fadd <4 x float> %a, %b
@@ -95,6 +116,13 @@ define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @haddps2(
+; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[X]], <4 x i32> <i32 4, i32 7, i32 0, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = fadd <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
   %b = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
   %r = fadd <4 x float> %a, %b
@@ -111,6 +139,13 @@ define <4 x float> @haddps3(<4 x float> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @haddps3(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> <float undef, float poison, float undef, float poison>, <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[X]], <4 x float> <float poison, float undef, float poison, float undef>, <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = fadd <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
   %r = fadd <4 x float> %a, %b
@@ -127,6 +162,13 @@ define <4 x float> @haddps4(<4 x float> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @haddps4(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = fadd <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
   %r = fadd <4 x float> %a, %b
@@ -143,6 +185,13 @@ define <4 x float> @haddps5(<4 x float> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @haddps5(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = fadd <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
   %r = fadd <4 x float> %a, %b
@@ -171,6 +220,13 @@ define <4 x float> @haddps6(<4 x float> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @haddps6(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = fadd <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   %r = fadd <4 x float> %a, %b
@@ -187,6 +243,13 @@ define <4 x float> @haddps7(<4 x float> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @haddps7(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = fadd <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
   %r = fadd <4 x float> %a, %b
@@ -203,6 +266,13 @@ define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @hsubpd1(
+; CHECK-SAME: <2 x double> [[X:%.*]], <2 x double> [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x double> [[X]], <2 x double> [[Y]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <2 x double> [[X]], <2 x double> [[Y]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = fsub <2 x double> [[A]], [[B]]
+; CHECK-NEXT:    ret <2 x double> [[R]]
+;
   %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
   %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
   %r = fsub <2 x double> %a, %b
@@ -232,6 +302,13 @@ define <2 x double> @hsubpd2(<2 x double> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @hsubpd2(
+; CHECK-SAME: <2 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = fsub <2 x double> [[A]], [[B]]
+; CHECK-NEXT:    ret <2 x double> [[R]]
+;
   %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
   %r = fsub <2 x double> %a, %b
@@ -248,6 +325,13 @@ define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hsubps1(
+; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = fsub <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   %r = fsub <4 x float> %a, %b
@@ -264,6 +348,13 @@ define <4 x float> @hsubps2(<4 x float> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hsubps2(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> <float undef, float poison, float undef, float poison>, <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[X]], <4 x float> <float poison, float undef, float poison, float undef>, <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = fsub <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
   %r = fsub <4 x float> %a, %b
@@ -280,6 +371,13 @@ define <4 x float> @hsubps3(<4 x float> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hsubps3(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = fsub <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
   %r = fsub <4 x float> %a, %b
@@ -308,6 +406,13 @@ define <4 x float> @hsubps4(<4 x float> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hsubps4(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = fsub <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   %r = fsub <4 x float> %a, %b
@@ -325,6 +430,13 @@ define <8 x float> @vhaddps1(<8 x float> %x, <8 x float> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @vhaddps1(
+; CHECK-SAME: <8 x float> [[X:%.*]], <8 x float> [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x float> [[X]], <8 x float> [[Y]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <8 x float> [[X]], <8 x float> [[Y]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[R:%.*]] = fadd <8 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
   %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
   %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
   %r = fadd <8 x float> %a, %b
@@ -342,6 +454,13 @@ define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @vhaddps2(
+; CHECK-SAME: <8 x float> [[X:%.*]], <8 x float> [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x float> [[X]], <8 x float> [[Y]], <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <8 x float> [[Y]], <8 x float> [[X]], <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = fadd <8 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
   %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
   %b = shufflevector <8 x float> %y, <8 x float> %x, <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7>
   %r = fadd <8 x float> %a, %b
@@ -359,6 +478,13 @@ define <8 x float> @vhaddps3(<8 x float> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @vhaddps3(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x float> [[X]], <8 x float> <float undef, float poison, float undef, float poison, float poison, float poison, float undef, float poison>, <8 x i32> <i32 poison, i32 2, i32 8, i32 10, i32 4, i32 6, i32 poison, i32 14>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <8 x float> [[X]], <8 x float> <float poison, float undef, float poison, float poison, float poison, float undef, float poison, float undef>, <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[R:%.*]] = fadd <8 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
   %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
   %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
   %r = fadd <8 x float> %a, %b
@@ -376,6 +502,13 @@ define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @vhsubps1(
+; CHECK-SAME: <8 x float> [[X:%.*]], <8 x float> [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x float> [[X]], <8 x float> [[Y]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <8 x float> [[X]], <8 x float> [[Y]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[R:%.*]] = fsub <8 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
   %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
   %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
   %r = fsub <8 x float> %a, %b
@@ -393,6 +526,13 @@ define <8 x float> @vhsubps3(<8 x float> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @vhsubps3(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x float> [[X]], <8 x float> <float undef, float poison, float undef, float poison, float poison, float poison, float undef, float poison>, <8 x i32> <i32 poison, i32 2, i32 8, i32 10, i32 4, i32 6, i32 poison, i32 14>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <8 x float> [[X]], <8 x float> <float poison, float undef, float poison, float poison, float poison, float undef, float poison, float undef>, <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[R:%.*]] = fsub <8 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
   %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
   %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
   %r = fsub <8 x float> %a, %b
@@ -410,6 +550,13 @@ define <4 x double> @vhaddpd1(<4 x double> %x, <4 x double> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @vhaddpd1(
+; CHECK-SAME: <4 x double> [[X:%.*]], <4 x double> [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x double> [[X]], <4 x double> [[Y]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x double> [[X]], <4 x double> [[Y]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = fadd <4 x double> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x double> [[R]]
+;
   %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %r = fadd <4 x double> %a, %b
@@ -427,6 +574,13 @@ define <4 x double> @vhsubpd1(<4 x double> %x, <4 x double> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @vhsubpd1(
+; CHECK-SAME: <4 x double> [[X:%.*]], <4 x double> [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x double> [[X]], <4 x double> [[Y]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x double> [[X]], <4 x double> [[Y]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = fsub <4 x double> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x double> [[R]]
+;
   %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %r = fsub <4 x double> %a, %b
@@ -443,6 +597,13 @@ define <2 x float> @haddps_v2f32(<4 x float> %v0) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <2 x float> @haddps_v2f32(
+; CHECK-SAME: <4 x float> [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[V0]], <4 x float> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[V0]], <4 x float> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x float> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    ret <2 x float> [[TMP6]]
+;
   %v0.0 = extractelement <4 x float> %v0, i32 0
   %v0.1 = extractelement <4 x float> %v0, i32 1
   %v0.2 = extractelement <4 x float> %v0, i32 2
@@ -478,6 +639,13 @@ define float @extract_extract01_v4f32_fadd_f32(<4 x float> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v4f32_fadd_f32(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 0
   %x1 = extractelement <4 x float> %x, i32 1
   %x01 = fadd float %x0, %x1
@@ -511,6 +679,13 @@ define float @extract_extract23_v4f32_fadd_f32(<4 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract23_v4f32_fadd_f32(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x float> [[TMP1]], i64 2
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 2
   %x1 = extractelement <4 x float> %x, i32 3
   %x01 = fadd float %x0, %x1
@@ -539,6 +714,13 @@ define float @extract_extract01_v4f32_fadd_f32_commute(<4 x float> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v4f32_fadd_f32_commute(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 0
   %x1 = extractelement <4 x float> %x, i32 1
   %x01 = fadd float %x1, %x0
@@ -572,6 +754,13 @@ define float @extract_extract23_v4f32_fadd_f32_commute(<4 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract23_v4f32_fadd_f32_commute(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x float> [[TMP1]], i64 2
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 2
   %x1 = extractelement <4 x float> %x, i32 3
   %x01 = fadd float %x1, %x0
@@ -601,6 +790,13 @@ define double @extract_extract01_v2f64_fadd_f64(<2 x double> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v2f64_fadd_f64(
+; CHECK-SAME: <2 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <2 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <2 x double> %x, i32 0
   %x1 = extractelement <2 x double> %x, i32 1
   %x01 = fadd double %x0, %x1
@@ -630,6 +826,13 @@ define double @extract_extract01_v2f64_fadd_f64_commute(<2 x double> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v2f64_fadd_f64_commute(
+; CHECK-SAME: <2 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <2 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <2 x double> %x, i32 0
   %x1 = extractelement <2 x double> %x, i32 1
   %x01 = fadd double %x1, %x0
@@ -658,6 +861,13 @@ define float @extract_extract01_v4f32_fsub_f32(<4 x float> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v4f32_fsub_f32(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 0
   %x1 = extractelement <4 x float> %x, i32 1
   %x01 = fsub float %x0, %x1
@@ -692,6 +902,13 @@ define float @extract_extract23_v4f32_fsub_f32(<4 x float> %x) {
 ; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract23_v4f32_fsub_f32(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x float> [[TMP1]], i64 2
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 2
   %x1 = extractelement <4 x float> %x, i32 3
   %x01 = fsub float %x0, %x1
@@ -711,6 +928,13 @@ define float @extract_extract01_v4f32_fsub_f32_commute(<4 x float> %x) {
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v4f32_fsub_f32_commute(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 0
   %x1 = extractelement <4 x float> %x, i32 1
   %x01 = fsub float %x1, %x0
@@ -732,6 +956,13 @@ define float @extract_extract23_v4f32_fsub_f32_commute(<4 x float> %x) {
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract23_v4f32_fsub_f32_commute(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x float> [[TMP1]], i64 2
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 2
   %x1 = extractelement <4 x float> %x, i32 3
   %x01 = fsub float %x1, %x0
@@ -761,6 +992,13 @@ define double @extract_extract01_v2f64_fsub_f64(<2 x double> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v2f64_fsub_f64(
+; CHECK-SAME: <2 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <2 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <2 x double> %x, i32 0
   %x1 = extractelement <2 x double> %x, i32 1
   %x01 = fsub double %x0, %x1
@@ -781,6 +1019,13 @@ define double @extract_extract01_v2f64_fsub_f64_commute(<2 x double> %x) {
 ; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v2f64_fsub_f64_commute(
+; CHECK-SAME: <2 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <2 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <2 x double> %x, i32 0
   %x1 = extractelement <2 x double> %x, i32 1
   %x01 = fsub double %x1, %x0
@@ -813,6 +1058,13 @@ define float @extract_extract01_v8f32_fadd_f32(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v8f32_fadd_f32(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <8 x float> %x, i32 0
   %x1 = extractelement <8 x float> %x, i32 1
   %x01 = fadd float %x0, %x1
@@ -848,6 +1100,13 @@ define float @extract_extract23_v8f32_fadd_f32(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract23_v8f32_fadd_f32(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x float> [[TMP1]], i64 2
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <8 x float> %x, i32 2
   %x1 = extractelement <8 x float> %x, i32 3
   %x01 = fadd float %x0, %x1
@@ -885,6 +1144,13 @@ define float @extract_extract67_v8f32_fadd_f32(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract67_v8f32_fadd_f32(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x float> [[TMP1]], i64 6
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <8 x float> %x, i32 6
   %x1 = extractelement <8 x float> %x, i32 7
   %x01 = fadd float %x0, %x1
@@ -915,6 +1181,13 @@ define float @extract_extract01_v8f32_fadd_f32_commute(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v8f32_fadd_f32_commute(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <8 x float> %x, i32 0
   %x1 = extractelement <8 x float> %x, i32 1
   %x01 = fadd float %x1, %x0
@@ -950,6 +1223,13 @@ define float @extract_extract23_v8f32_fadd_f32_commute(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract23_v8f32_fadd_f32_commute(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x float> [[TMP1]], i64 2
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <8 x float> %x, i32 2
   %x1 = extractelement <8 x float> %x, i32 3
   %x01 = fadd float %x1, %x0
@@ -987,6 +1267,13 @@ define float @extract_extract67_v8f32_fadd_f32_commute(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract67_v8f32_fadd_f32_commute(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x float> [[TMP1]], i64 6
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <8 x float> %x, i32 6
   %x1 = extractelement <8 x float> %x, i32 7
   %x01 = fadd float %x1, %x0
@@ -1018,6 +1305,13 @@ define double @extract_extract01_v4f64_fadd_f64(<4 x double> %x) {
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v4f64_fadd_f64(
+; CHECK-SAME: <4 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[X]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <4 x double> %x, i32 0
   %x1 = extractelement <4 x double> %x, i32 1
   %x01 = fadd double %x0, %x1
@@ -1052,6 +1346,13 @@ define double @extract_extract23_v4f64_fadd_f64(<4 x double> %x) {
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract23_v4f64_fadd_f64(
+; CHECK-SAME: <4 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[X]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x double> [[TMP1]], i64 2
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <4 x double> %x, i32 2
   %x1 = extractelement <4 x double> %x, i32 3
   %x01 = fadd double %x0, %x1
@@ -1083,6 +1384,13 @@ define double @extract_extract01_v4f64_fadd_f64_commute(<4 x double> %x) {
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v4f64_fadd_f64_commute(
+; CHECK-SAME: <4 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[X]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <4 x double> %x, i32 0
   %x1 = extractelement <4 x double> %x, i32 1
   %x01 = fadd double %x1, %x0
@@ -1117,6 +1425,13 @@ define double @extract_extract23_v4f64_fadd_f64_commute(<4 x double> %x) {
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract23_v4f64_fadd_f64_commute(
+; CHECK-SAME: <4 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[X]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x double> [[TMP1]], i64 2
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <4 x double> %x, i32 2
   %x1 = extractelement <4 x double> %x, i32 3
   %x01 = fadd double %x1, %x0
@@ -1147,6 +1462,13 @@ define float @extract_extract01_v8f32_fsub_f32(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v8f32_fsub_f32(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <8 x float> %x, i32 0
   %x1 = extractelement <8 x float> %x, i32 1
   %x01 = fsub float %x0, %x1
@@ -1183,6 +1505,13 @@ define float @extract_extract23_v8f32_fsub_f32(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract23_v8f32_fsub_f32(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x float> [[TMP1]], i64 2
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <8 x float> %x, i32 2
   %x1 = extractelement <8 x float> %x, i32 3
   %x01 = fsub float %x0, %x1
@@ -1217,6 +1546,13 @@ define float @extract_extract45_v8f32_fsub_f32(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract45_v8f32_fsub_f32(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x float> [[TMP1]], i64 4
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <8 x float> %x, i32 4
   %x1 = extractelement <8 x float> %x, i32 5
   %x01 = fsub float %x0, %x1
@@ -1239,6 +1575,13 @@ define float @extract_extract01_v8f32_fsub_f32_commute(<8 x float> %x) {
 ; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v8f32_fsub_f32_commute(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <8 x float> %x, i32 0
   %x1 = extractelement <8 x float> %x, i32 1
   %x01 = fsub float %x1, %x0
@@ -1270,6 +1613,13 @@ define double @extract_extract01_v4f64_fsub_f64(<4 x double> %x) {
 ; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v4f64_fsub_f64(
+; CHECK-SAME: <4 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[X]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <4 x double> %x, i32 0
   %x1 = extractelement <4 x double> %x, i32 1
   %x01 = fsub double %x0, %x1
@@ -1293,6 +1643,13 @@ define double @extract_extract01_v4f64_fsub_f64_commute(<4 x double> %x) {
 ; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v4f64_fsub_f64_commute(
+; CHECK-SAME: <4 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[X]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <4 x double> %x, i32 0
   %x1 = extractelement <4 x double> %x, i32 1
   %x01 = fsub double %x1, %x0
@@ -1325,6 +1682,13 @@ define float @extract_extract01_v16f32_fadd_f32(<16 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v16f32_fadd_f32(
+; CHECK-SAME: <16 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x float> [[X]], <16 x float> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <16 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <16 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <16 x float> %x, i32 0
   %x1 = extractelement <16 x float> %x, i32 1
   %x01 = fadd float %x0, %x1
@@ -1355,6 +1719,13 @@ define float @extract_extract01_v16f32_fadd_f32_commute(<16 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v16f32_fadd_f32_commute(
+; CHECK-SAME: <16 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x float> [[X]], <16 x float> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <16 x float> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <16 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <16 x float> %x, i32 0
   %x1 = extractelement <16 x float> %x, i32 1
   %x01 = fadd float %x1, %x0
@@ -1386,6 +1757,13 @@ define double @extract_extract01_v8f64_fadd_f64(<8 x double> %x) {
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v8f64_fadd_f64(
+; CHECK-SAME: <8 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x double> [[X]], <8 x double> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x double> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <8 x double> %x, i32 0
   %x1 = extractelement <8 x double> %x, i32 1
   %x01 = fadd double %x0, %x1
@@ -1417,6 +1795,13 @@ define double @extract_extract01_v8f64_fadd_f64_commute(<8 x double> %x) {
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v8f64_fadd_f64_commute(
+; CHECK-SAME: <8 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x double> [[X]], <8 x double> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x double> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <8 x double> %x, i32 0
   %x1 = extractelement <8 x double> %x, i32 1
   %x01 = fadd double %x1, %x0
@@ -1447,6 +1832,13 @@ define float @extract_extract01_v16f32_fsub_f32(<16 x float> %x) {
 ; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v16f32_fsub_f32(
+; CHECK-SAME: <16 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x float> [[X]], <16 x float> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <16 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <16 x float> %x, i32 0
   %x1 = extractelement <16 x float> %x, i32 1
   %x01 = fsub float %x0, %x1
@@ -1467,6 +1859,13 @@ define float @extract_extract01_v16f32_fsub_f32_commute(<16 x float> %x) {
 ; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v16f32_fsub_f32_commute(
+; CHECK-SAME: <16 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x float> [[X]], <16 x float> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <16 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <16 x float> %x, i32 0
   %x1 = extractelement <16 x float> %x, i32 1
   %x01 = fsub float %x1, %x0
@@ -1498,6 +1897,13 @@ define double @extract_extract01_v8f64_fsub_f64(<8 x double> %x) {
 ; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v8f64_fsub_f64(
+; CHECK-SAME: <8 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x double> [[X]], <8 x double> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <8 x double> %x, i32 0
   %x1 = extractelement <8 x double> %x, i32 1
   %x01 = fsub double %x0, %x1
@@ -1519,6 +1925,13 @@ define double @extract_extract01_v8f64_fsub_f64_commute(<8 x double> %x) {
 ; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v8f64_fsub_f64_commute(
+; CHECK-SAME: <8 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x double> [[X]], <8 x double> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <8 x double> %x, i32 0
   %x1 = extractelement <8 x double> %x, i32 1
   %x01 = fsub double %x1, %x0
@@ -1553,6 +1966,14 @@ define float @extract_extract01_v4f32_fadd_f32_uses1(<4 x float> %x, ptr %p) {
 ; AVX-FAST-NEXT:    vmovss %xmm0, (%rdi)
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v4f32_fadd_f32_uses1(
+; CHECK-SAME: <4 x float> [[X:%.*]], ptr writeonly captures(none) initializes((0, 4)) [[P:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x float> [[X]], i64 0
+; CHECK-NEXT:    store float [[X0]], ptr [[P]], align 4
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x float> [[X]], i64 1
+; CHECK-NEXT:    [[X01:%.*]] = fadd float [[X0]], [[X1]]
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 0
   store float %x0, ptr %p
   %x1 = extractelement <4 x float> %x, i32 1
@@ -1587,6 +2008,14 @@ define float @extract_extract01_v4f32_fadd_f32_uses2(<4 x float> %x, ptr %p) {
 ; AVX-FAST-NEXT:    vextractps $1, %xmm0, (%rdi)
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v4f32_fadd_f32_uses2(
+; CHECK-SAME: <4 x float> [[X:%.*]], ptr writeonly captures(none) initializes((0, 4)) [[P:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x float> [[X]], i64 0
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x float> [[X]], i64 1
+; CHECK-NEXT:    store float [[X1]], ptr [[P]], align 4
+; CHECK-NEXT:    [[X01:%.*]] = fadd float [[X0]], [[X1]]
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 0
   %x1 = extractelement <4 x float> %x, i32 1
   store float %x1, ptr %p
@@ -1610,6 +2039,15 @@ define float @extract_extract01_v4f32_fadd_f32_uses3(<4 x float> %x, ptr %p1, pt
 ; AVX-NEXT:    vmovss %xmm1, (%rsi)
 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v4f32_fadd_f32_uses3(
+; CHECK-SAME: <4 x float> [[X:%.*]], ptr writeonly captures(none) initializes((0, 4)) [[P1:%.*]], ptr writeonly captures(none) initializes((0, 4)) [[P2:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x float> [[X]], i64 0
+; CHECK-NEXT:    store float [[X0]], ptr [[P1]], align 4
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x float> [[X]], i64 1
+; CHECK-NEXT:    store float [[X1]], ptr [[P2]], align 4
+; CHECK-NEXT:    [[X01:%.*]] = fadd float [[X0]], [[X1]]
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 0
   store float %x0, ptr %p1
   %x1 = extractelement <4 x float> %x, i32 1
@@ -1665,6 +2103,11 @@ define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
 ; AVX-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @fadd_reduce_v8f32(
+; CHECK-SAME: float [[A0:%.*]], <8 x float> [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[R:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v8f32(float [[A0]], <8 x float> [[A1]])
+; CHECK-NEXT:    ret float [[R]]
+;
   %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
   ret float %r
 }
@@ -1704,6 +2147,11 @@ define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {
 ; AVX-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @fadd_reduce_v4f64(
+; CHECK-SAME: double [[A0:%.*]], <4 x double> [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[R:%.*]] = tail call fast double @llvm.vector.reduce.fadd.v4f64(double [[A0]], <4 x double> [[A1]])
+; CHECK-NEXT:    ret double [[R]]
+;
   %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
   ret double %r
 }
@@ -1760,6 +2208,19 @@ define float @PR39936_v8f32(<8 x float>) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @PR39936_v8f32(
+; CHECK-SAME: <8 x float> [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <8 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <8 x float> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x float> [[TMP9]], i64 0
+; CHECK-NEXT:    ret float [[TMP10]]
+;
   %2 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
   %3 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
   %4 = fadd <8 x float> %2, %3
@@ -1804,6 +2265,15 @@ define float @hadd32_4(<4 x float> %x225) {
 ; AVX-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @hadd32_4(
+; CHECK-SAME: <4 x float> [[X225:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[X226:%.*]] = shufflevector <4 x float> [[X225]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X227:%.*]] = fadd <4 x float> [[X225]], [[X226]]
+; CHECK-NEXT:    [[X228:%.*]] = shufflevector <4 x float> [[X227]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X229:%.*]] = fadd <4 x float> [[X227]], [[X228]]
+; CHECK-NEXT:    [[X230:%.*]] = extractelement <4 x float> [[X229]], i64 0
+; CHECK-NEXT:    ret float [[X230]]
+;
   %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
   %x227 = fadd <4 x float> %x225, %x226
   %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -1846,6 +2316,15 @@ define float @hadd32_8(<8 x float> %x225) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @hadd32_8(
+; CHECK-SAME: <8 x float> [[X225:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[X226:%.*]] = shufflevector <8 x float> [[X225]], <8 x float> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X227:%.*]] = fadd <8 x float> [[X225]], [[X226]]
+; CHECK-NEXT:    [[X228:%.*]] = shufflevector <8 x float> [[X227]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X229:%.*]] = fadd <8 x float> [[X227]], [[X228]]
+; CHECK-NEXT:    [[X230:%.*]] = extractelement <8 x float> [[X229]], i64 0
+; CHECK-NEXT:    ret float [[X230]]
+;
   %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %x227 = fadd <8 x float> %x225, %x226
   %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1888,6 +2367,15 @@ define float @hadd32_16(<16 x float> %x225) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @hadd32_16(
+; CHECK-SAME: <16 x float> [[X225:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[X226:%.*]] = shufflevector <16 x float> [[X225]], <16 x float> poison, <16 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X227:%.*]] = fadd <16 x float> [[X225]], [[X226]]
+; CHECK-NEXT:    [[X228:%.*]] = shufflevector <16 x float> [[X227]], <16 x float> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X229:%.*]] = fadd <16 x float> [[X227]], [[X228]]
+; CHECK-NEXT:    [[X230:%.*]] = extractelement <16 x float> [[X229]], i64 0
+; CHECK-NEXT:    ret float [[X230]]
+;
   %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %x227 = fadd <16 x float> %x225, %x226
   %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1911,6 +2399,15 @@ define float @hadd32_4_optsize(<4 x float> %x225) optsize {
 ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @hadd32_4_optsize(
+; CHECK-SAME: <4 x float> [[X225:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    [[X226:%.*]] = shufflevector <4 x float> [[X225]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X227:%.*]] = fadd <4 x float> [[X225]], [[X226]]
+; CHECK-NEXT:    [[X228:%.*]] = shufflevector <4 x float> [[X227]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X229:%.*]] = fadd <4 x float> [[X227]], [[X228]]
+; CHECK-NEXT:    [[X230:%.*]] = extractelement <4 x float> [[X229]], i64 0
+; CHECK-NEXT:    ret float [[X230]]
+;
   %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
   %x227 = fadd <4 x float> %x225, %x226
   %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -1935,6 +2432,15 @@ define float @hadd32_8_optsize(<8 x float> %x225) optsize {
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @hadd32_8_optsize(
+; CHECK-SAME: <8 x float> [[X225:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[X226:%.*]] = shufflevector <8 x float> [[X225]], <8 x float> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X227:%.*]] = fadd <8 x float> [[X225]], [[X226]]
+; CHECK-NEXT:    [[X228:%.*]] = shufflevector <8 x float> [[X227]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X229:%.*]] = fadd <8 x float> [[X227]], [[X228]]
+; CHECK-NEXT:    [[X230:%.*]] = extractelement <8 x float> [[X229]], i64 0
+; CHECK-NEXT:    ret float [[X230]]
+;
   %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %x227 = fadd <8 x float> %x225, %x226
   %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1959,6 +2465,15 @@ define float @hadd32_16_optsize(<16 x float> %x225) optsize {
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @hadd32_16_optsize(
+; CHECK-SAME: <16 x float> [[X225:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[X226:%.*]] = shufflevector <16 x float> [[X225]], <16 x float> poison, <16 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X227:%.*]] = fadd <16 x float> [[X225]], [[X226]]
+; CHECK-NEXT:    [[X228:%.*]] = shufflevector <16 x float> [[X227]], <16 x float> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X229:%.*]] = fadd <16 x float> [[X227]], [[X228]]
+; CHECK-NEXT:    [[X230:%.*]] = extractelement <16 x float> [[X229]], i64 0
+; CHECK-NEXT:    ret float [[X230]]
+;
   %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %x227 = fadd <16 x float> %x225, %x226
   %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1982,6 +2497,15 @@ define float @hadd32_4_pgso(<4 x float> %x225) !prof !14 {
 ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @hadd32_4_pgso(
+; CHECK-SAME: <4 x float> [[X225:%.*]]) local_unnamed_addr #[[ATTR0]] !prof [[PROF14:![0-9]+]] {
+; CHECK-NEXT:    [[X226:%.*]] = shufflevector <4 x float> [[X225]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X227:%.*]] = fadd <4 x float> [[X225]], [[X226]]
+; CHECK-NEXT:    [[X228:%.*]] = shufflevector <4 x float> [[X227]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X229:%.*]] = fadd <4 x float> [[X227]], [[X228]]
+; CHECK-NEXT:    [[X230:%.*]] = extractelement <4 x float> [[X229]], i64 0
+; CHECK-NEXT:    ret float [[X230]]
+;
   %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
   %x227 = fadd <4 x float> %x225, %x226
   %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -2006,6 +2530,15 @@ define float @hadd32_8_pgso(<8 x float> %x225) !prof !14 {
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @hadd32_8_pgso(
+; CHECK-SAME: <8 x float> [[X225:%.*]]) local_unnamed_addr #[[ATTR0]] !prof [[PROF14]] {
+; CHECK-NEXT:    [[X226:%.*]] = shufflevector <8 x float> [[X225]], <8 x float> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X227:%.*]] = fadd <8 x float> [[X225]], [[X226]]
+; CHECK-NEXT:    [[X228:%.*]] = shufflevector <8 x float> [[X227]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X229:%.*]] = fadd <8 x float> [[X227]], [[X228]]
+; CHECK-NEXT:    [[X230:%.*]] = extractelement <8 x float> [[X229]], i64 0
+; CHECK-NEXT:    ret float [[X230]]
+;
   %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %x227 = fadd <8 x float> %x225, %x226
   %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -2030,6 +2563,15 @@ define float @hadd32_16_pgso(<16 x float> %x225) !prof !14 {
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @hadd32_16_pgso(
+; CHECK-SAME: <16 x float> [[X225:%.*]]) local_unnamed_addr #[[ATTR0]] !prof [[PROF14]] {
+; CHECK-NEXT:    [[X226:%.*]] = shufflevector <16 x float> [[X225]], <16 x float> poison, <16 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X227:%.*]] = fadd <16 x float> [[X225]], [[X226]]
+; CHECK-NEXT:    [[X228:%.*]] = shufflevector <16 x float> [[X227]], <16 x float> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X229:%.*]] = fadd <16 x float> [[X227]], [[X228]]
+; CHECK-NEXT:    [[X230:%.*]] = extractelement <16 x float> [[X229]], i64 0
+; CHECK-NEXT:    ret float [[X230]]
+;
   %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %x227 = fadd <16 x float> %x225, %x226
   %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -2071,6 +2613,15 @@ define float @partial_reduction_fadd_v8f32(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @partial_reduction_fadd_v8f32(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[X23:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X0213:%.*]] = fadd <8 x float> [[X]], [[X23]]
+; CHECK-NEXT:    [[X13:%.*]] = shufflevector <8 x float> [[X0213]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X0123:%.*]] = fadd reassoc nsz <8 x float> [[X13]], [[X0213]]
+; CHECK-NEXT:    [[R:%.*]] = extractelement <8 x float> [[X0123]], i64 0
+; CHECK-NEXT:    ret float [[R]]
+;
   %x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %x0213 = fadd <8 x float> %x, %x23
   %x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -2116,6 +2667,15 @@ define float @partial_reduction_fadd_v8f32_wrong_flags(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @partial_reduction_fadd_v8f32_wrong_flags(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[X23:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X0213:%.*]] = fadd fast <8 x float> [[X23]], [[X]]
+; CHECK-NEXT:    [[X13:%.*]] = shufflevector <8 x float> [[X0213]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X0123:%.*]] = fadd nnan ninf <8 x float> [[X0213]], [[X13]]
+; CHECK-NEXT:    [[R:%.*]] = extractelement <8 x float> [[X0123]], i64 0
+; CHECK-NEXT:    ret float [[R]]
+;
   %x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %x0213 = fadd fast <8 x float> %x, %x23
   %x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -2157,6 +2717,15 @@ define float @partial_reduction_fadd_v16f32(<16 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @partial_reduction_fadd_v16f32(
+; CHECK-SAME: <16 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[X23:%.*]] = shufflevector <16 x float> [[X]], <16 x float> poison, <16 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X0213:%.*]] = fadd <16 x float> [[X]], [[X23]]
+; CHECK-NEXT:    [[X13:%.*]] = shufflevector <16 x float> [[X0213]], <16 x float> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X0123:%.*]] = fadd reassoc nsz <16 x float> [[X13]], [[X0213]]
+; CHECK-NEXT:    [[R:%.*]] = extractelement <16 x float> [[X0123]], i64 0
+; CHECK-NEXT:    ret float [[R]]
+;
   %x23 = shufflevector <16 x float> %x, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %x0213 = fadd <16 x float> %x, %x23
   %x13 = shufflevector <16 x float> %x0213, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -2181,3 +2750,6 @@ define float @partial_reduction_fadd_v16f32(<16 x float> %x) {
 !12 = !{i32 999000, i64 100, i32 1}
 !13 = !{i32 999999, i64 1, i32 2}
 !14 = !{!"function_entry_count", i64 0}
+;.
+; CHECK: [[PROF14]] = !{!"function_entry_count", i64 0}
+;.
diff --git a/llvm/test/CodeGen/X86/phaddsub-undef.ll b/llvm/test/Transforms/PhaseOrdering/X86/phaddsub-undef.ll
similarity index 53%
rename from llvm/test/CodeGen/X86/phaddsub-undef.ll
rename to llvm/test/Transforms/PhaseOrdering/X86/phaddsub-undef.ll
index 8aa40939994fd..3cfd1b797209c 100644
--- a/llvm/test/CodeGen/X86/phaddsub-undef.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/phaddsub-undef.ll
@@ -1,14 +1,6 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3              | FileCheck %s --check-prefixes=SSE,SSE-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops    | FileCheck %s --check-prefixes=SSE,SSE-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx                | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops      | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2               | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops     | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl           | FileCheck %s --check-prefixes=AVX,AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl,fast-hops | FileCheck %s --check-prefixes=AVX,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
-; Verify that we correctly fold horizontal binop even in the presence of UNDEFs.
 
 define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: test14_undef:
@@ -20,6 +12,16 @@ define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @test14_undef(
+; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> <i32 poison, i32 undef, i32 undef, i32 poison, i32 undef, i32 undef, i32 undef, i32 undef>, <8 x i32> <i32 0, i32 9, i32 10, i32 poison, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i32> [[SHIFT1]], [[B]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <8 x i32> [[VECINIT]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[VECINIT5]]
+;
   %vecext = extractelement <8 x i32> %a, i32 0
   %vecext1 = extractelement <8 x i32> %a, i32 1
   %add = add i32 %vecext, %vecext1
@@ -87,6 +89,16 @@ define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @test15_undef(
+; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> <i32 poison, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 poison, i32 undef>, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 poison, i32 15>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i32> [[B]], [[SHIFT1]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <8 x i32> [[VECINIT]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 12, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[VECINIT5]]
+;
   %vecext = extractelement <8 x i32> %a, i32 0
   %vecext1 = extractelement <8 x i32> %a, i32 1
   %add = add i32 %vecext, %vecext1
@@ -121,6 +133,16 @@ define <8 x i32> @PR40243_alt(<8 x i32> %a, <8 x i32> %b) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @PR40243_alt(
+; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i32> [[SHIFT1]], [[B]]
+; CHECK-NEXT:    [[R4:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 poison, i32 undef, i32 undef, i32 poison>, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 13, i32 14, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i32> [[R4]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[R]]
+;
   %a4 = extractelement <8 x i32> %a, i32 4
   %a5 = extractelement <8 x i32> %a, i32 5
   %add4 = add i32 %a4, %a5
@@ -142,6 +164,16 @@ define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @test16_undef(
+; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> <i32 poison, i32 poison, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <8 x i32> <i32 0, i32 poison, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i32> [[A]], [[SHIFT1]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <8 x i32> [[VECINIT]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 10, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[VECINIT5]]
+;
   %vecext = extractelement <8 x i32> %a, i32 0
   %vecext1 = extractelement <8 x i32> %a, i32 1
   %add = add i32 %vecext, %vecext1
@@ -163,6 +195,16 @@ define <16 x i32> @test16_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <16 x i32> @test16_v16i32_undef(
+; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <16 x i32> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> <i32 poison, i32 poison, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <16 x i32> <i32 0, i32 poison, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <16 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <16 x i32> [[A]], [[SHIFT1]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <16 x i32> [[VECINIT]], <16 x i32> [[TMP2]], <16 x i32> <i32 0, i32 18, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <16 x i32> [[VECINIT5]]
+;
   %vecext = extractelement <16 x i32> %a, i32 0
   %vecext1 = extractelement <16 x i32> %a, i32 1
   %add = add i32 %vecext, %vecext1
@@ -197,6 +239,14 @@ define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @test17_undef(
+; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[TMP4]]
+;
   %vecext = extractelement <8 x i32> %a, i32 0
   %vecext1 = extractelement <8 x i32> %a, i32 1
   %add1 = add i32 %vecext, %vecext1
@@ -239,6 +289,14 @@ define <16 x i32> @test17_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
+; CHECK-LABEL: define <16 x i32> @test17_v16i32_undef(
+; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    ret <16 x i32> [[TMP4]]
+;
   %vecext = extractelement <16 x i32> %a, i32 0
   %vecext1 = extractelement <16 x i32> %a, i32 1
   %add1 = add i32 %vecext, %vecext1



More information about the llvm-commits mailing list