[llvm] caacf86 - [DAG] Fold freeze(shuffle(x,y,m)) -> shuffle(freeze(x),freeze(y),m) (#90952)

via llvm-commits <llvm-commits@lists.llvm.org>
Sat May 4 04:03:13 PDT 2024


Author: Simon Pilgrim
Date: 2024-05-04T12:03:10+01:00
New Revision: caacf8685ac49526103b748b6b439dea84c30274

URL: https://github.com/llvm/llvm-project/commit/caacf8685ac49526103b748b6b439dea84c30274
DIFF: https://github.com/llvm/llvm-project/commit/caacf8685ac49526103b748b6b439dea84c30274.diff

LOG: [DAG] Fold freeze(shuffle(x,y,m)) -> shuffle(freeze(x),freeze(y),m) (#90952)

If the shuffle mask contains no undef elements, then we can move the freeze through a shuffle node.

This requires special-case handling to create a new ShuffleVectorSDNode.

Includes VECTOR_SHUFFLE support for isGuaranteedNotToBeUndefOrPoison / canCreateUndefOrPoison.
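
For illustration, a minimal IR-level sketch of the equivalence (function names are hypothetical; the fold itself operates on SelectionDAG nodes):

  define <4 x i32> @src(<4 x i32> %x, <4 x i32> %y) {
    ; The mask <0,5,2,7> has no undef (-1) elements, so every result lane
    ; is taken from exactly one source lane.
    %s = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
    %f = freeze <4 x i32> %s
    ret <4 x i32> %f
  }

  define <4 x i32> @tgt(<4 x i32> %x, <4 x i32> %y) {
    ; Equivalent form: freeze the operands first, then shuffle them.
    %fx = freeze <4 x i32> %x
    %fy = freeze <4 x i32> %y
    %s = shufflevector <4 x i32> %fx, <4 x i32> %fy, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
    ret <4 x i32> %s
  }

An undef (-1) mask element would block the fold: the original freeze fixes such a lane to an arbitrary concrete value, while the transformed shuffle would still yield undef for it.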

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
    llvm/test/CodeGen/SystemZ/pr60413.ll
    llvm/test/CodeGen/X86/freeze-binary.ll
    llvm/test/CodeGen/X86/sdiv_fix_sat.ll
    llvm/test/CodeGen/X86/setcc-non-simple-type.ll
    llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index fc6bbc119d3c1b..fe932ca68c1288 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15462,9 +15462,11 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
       N0->getNumValues() != 1 || !N0->hasOneUse())
     return SDValue();
 
-  bool AllowMultipleMaybePoisonOperands = N0.getOpcode() == ISD::BUILD_VECTOR ||
-                                          N0.getOpcode() == ISD::BUILD_PAIR ||
-                                          N0.getOpcode() == ISD::CONCAT_VECTORS;
+  bool AllowMultipleMaybePoisonOperands =
+      N0.getOpcode() == ISD::BUILD_VECTOR ||
+      N0.getOpcode() == ISD::BUILD_PAIR ||
+      N0.getOpcode() == ISD::VECTOR_SHUFFLE ||
+      N0.getOpcode() == ISD::CONCAT_VECTORS;
 
   // Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all
   // ones" or "constant" into something that depends on FrozenUndef. We can
@@ -15537,8 +15539,16 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
     if (Op.getOpcode() == ISD::UNDEF)
       Op = DAG.getFreeze(Op);
   }
-  // NOTE: this strips poison generating flags.
-  SDValue R = DAG.getNode(N0.getOpcode(), SDLoc(N0), N0->getVTList(), Ops);
+
+  SDValue R;
+  if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N0)) {
+    // Special case handling for ShuffleVectorSDNode nodes.
+    R = DAG.getVectorShuffle(N0.getValueType(), SDLoc(N0), Ops[0], Ops[1],
+                             SVN->getMask());
+  } else {
+    // NOTE: this strips poison generating flags.
+    R = DAG.getNode(N0.getOpcode(), SDLoc(N0), N0->getVTList(), Ops);
+  }
   assert(DAG.isGuaranteedNotToBeUndefOrPoison(R, /*PoisonOnly*/ false) &&
          "Can't create node that may be undef/poison!");
   return R;

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 25a728bf9ba3e7..eef5acd032345e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5088,6 +5088,24 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op,
     }
     return true;
 
+  case ISD::VECTOR_SHUFFLE: {
+    APInt DemandedLHS, DemandedRHS;
+    auto *SVN = cast<ShuffleVectorSDNode>(Op);
+    if (!getShuffleDemandedElts(DemandedElts.getBitWidth(), SVN->getMask(),
+                                DemandedElts, DemandedLHS, DemandedRHS,
+                                /*AllowUndefElts=*/false))
+      return false;
+    if (!DemandedLHS.isZero() &&
+        !isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), DemandedLHS,
+                                          PoisonOnly, Depth + 1))
+      return false;
+    if (!DemandedRHS.isZero() &&
+        !isGuaranteedNotToBeUndefOrPoison(Op.getOperand(1), DemandedRHS,
+                                          PoisonOnly, Depth + 1))
+      return false;
+    return true;
+  }
+
     // TODO: Search for noundef attributes from library functions.
 
     // TODO: Pointers dereferenced by ISD::LOAD/STORE ops are noundef.
@@ -5225,6 +5243,15 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
     return KnownIdx.getMaxValue().uge(VecVT.getVectorMinNumElements());
   }
 
+  case ISD::VECTOR_SHUFFLE: {
+    // Check for any demanded shuffle element that is undef.
+    auto *SVN = cast<ShuffleVectorSDNode>(Op);
+    for (auto [Idx, Elt] : enumerate(SVN->getMask()))
+      if (Elt < 0 && DemandedElts[Idx])
+        return true;
+    return false;
+  }
+
   default:
     // Allow the target to implement this method for its nodes.
     if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::INTRINSIC_WO_CHAIN ||

diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll
index 5a629567d07069..8a6a30318ae583 100644
--- a/llvm/test/CodeGen/SystemZ/pr60413.ll
+++ b/llvm/test/CodeGen/SystemZ/pr60413.ll
@@ -13,114 +13,110 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
 define dso_local void @m() local_unnamed_addr #1 {
 ; CHECK-LABEL: m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    stmg %r12, %r15, 96(%r15)
+; CHECK-NEXT:    stmg %r13, %r15, 104(%r15)
 ; CHECK-NEXT:    aghi %r15, -168
-; CHECK-NEXT:    llhrl %r2, f+4
-; CHECK-NEXT:    sll %r2, 8
-; CHECK-NEXT:    larl %r1, f
-; CHECK-NEXT:    ic %r2, 6(%r1)
-; CHECK-NEXT:    larl %r1, e
-; CHECK-NEXT:    lb %r0, 3(%r1)
-; CHECK-NEXT:    clfi %r2, 128
+; CHECK-NEXT:    lhrl %r1, f+4
+; CHECK-NEXT:    sll %r1, 8
+; CHECK-NEXT:    larl %r2, f
+; CHECK-NEXT:    ic %r1, 6(%r2)
+; CHECK-NEXT:    larl %r2, e
+; CHECK-NEXT:    lb %r0, 3(%r2)
+; CHECK-NEXT:    vlvgp %v0, %r0, %r1
+; CHECK-NEXT:    vlvgp %v1, %r1, %r0
+; CHECK-NEXT:    vlvgf %v1, %r1, 0
+; CHECK-NEXT:    vlvgf %v1, %r1, 2
+; CHECK-NEXT:    vlvgp %v2, %r1, %r1
+; CHECK-NEXT:    # kill: def $r1l killed $r1l killed $r1d
+; CHECK-NEXT:    nilh %r1, 255
+; CHECK-NEXT:    chi %r1, 128
 ; CHECK-NEXT:    ipm %r1
 ; CHECK-NEXT:    risbg %r1, %r1, 63, 191, 36
-; CHECK-NEXT:    vlvgp %v1, %r2, %r0
-; CHECK-NEXT:    vlvgf %v1, %r2, 0
-; CHECK-NEXT:    vlvgf %v1, %r2, 2
-; CHECK-NEXT:    vlvgp %v0, %r0, %r2
-; CHECK-NEXT:    vlvgp %v2, %r2, %r2
-; CHECK-NEXT:    # kill: def $r2l killed $r2l killed $r2d
-; CHECK-NEXT:    nilh %r2, 255
-; CHECK-NEXT:    chi %r2, 128
-; CHECK-NEXT:    ipm %r2
-; CHECK-NEXT:    risbg %r2, %r2, 63, 191, 36
 ; CHECK-NEXT:    vlvgf %v0, %r0, 0
 ; CHECK-NEXT:    vlvgf %v0, %r0, 2
-; CHECK-NEXT:    vrepf %v2, %v2, 1
 ; CHECK-NEXT:    vgbm %v3, 30583
 ; CHECK-NEXT:    vn %v0, %v0, %v3
 ; CHECK-NEXT:    vn %v1, %v1, %v3
+; CHECK-NEXT:    vrepf %v2, %v2, 1
 ; CHECK-NEXT:    vn %v2, %v2, %v3
 ; CHECK-NEXT:    vrepif %v3, 127
 ; CHECK-NEXT:    vchlf %v1, %v1, %v3
-; CHECK-NEXT:    vlgvf %r12, %v1, 0
+; CHECK-NEXT:    vlgvf %r13, %v1, 0
 ; CHECK-NEXT:    vchlf %v2, %v2, %v3
-; CHECK-NEXT:    vlgvf %r4, %v2, 1
-; CHECK-NEXT:    nilf %r4, 1
-; CHECK-NEXT:    vlgvf %r5, %v2, 0
-; CHECK-NEXT:    risbg %r3, %r5, 48, 176, 15
-; CHECK-NEXT:    rosbg %r3, %r4, 32, 49, 14
-; CHECK-NEXT:    vlgvf %r14, %v2, 2
+; CHECK-NEXT:    vlgvf %r3, %v2, 1
+; CHECK-NEXT:    nilf %r3, 1
+; CHECK-NEXT:    vlgvf %r4, %v2, 0
+; CHECK-NEXT:    risbg %r2, %r4, 48, 176, 15
+; CHECK-NEXT:    rosbg %r2, %r3, 32, 49, 14
+; CHECK-NEXT:    vlgvf %r5, %v2, 2
+; CHECK-NEXT:    nilf %r5, 1
+; CHECK-NEXT:    rosbg %r2, %r5, 32, 50, 13
+; CHECK-NEXT:    vlgvf %r14, %v2, 3
 ; CHECK-NEXT:    nilf %r14, 1
-; CHECK-NEXT:    rosbg %r3, %r14, 32, 50, 13
-; CHECK-NEXT:    vlgvf %r13, %v2, 3
-; CHECK-NEXT:    nilf %r13, 1
-; CHECK-NEXT:    rosbg %r3, %r13, 32, 51, 12
-; CHECK-NEXT:    rosbg %r3, %r12, 52, 52, 11
-; CHECK-NEXT:    vlgvf %r12, %v1, 1
-; CHECK-NEXT:    rosbg %r3, %r12, 53, 53, 10
-; CHECK-NEXT:    vlgvf %r12, %v1, 2
-; CHECK-NEXT:    rosbg %r3, %r12, 54, 54, 9
-; CHECK-NEXT:    vlgvf %r12, %v1, 3
-; CHECK-NEXT:    rosbg %r3, %r12, 55, 55, 8
+; CHECK-NEXT:    rosbg %r2, %r14, 32, 51, 12
+; CHECK-NEXT:    rosbg %r2, %r13, 52, 52, 11
+; CHECK-NEXT:    vlgvf %r13, %v1, 1
+; CHECK-NEXT:    rosbg %r2, %r13, 53, 53, 10
+; CHECK-NEXT:    vlgvf %r13, %v1, 2
+; CHECK-NEXT:    rosbg %r2, %r13, 54, 54, 9
+; CHECK-NEXT:    vlgvf %r13, %v1, 3
+; CHECK-NEXT:    rosbg %r2, %r13, 55, 55, 8
 ; CHECK-NEXT:    vchlf %v0, %v0, %v3
-; CHECK-NEXT:    vlgvf %r12, %v0, 0
-; CHECK-NEXT:    rosbg %r3, %r12, 56, 56, 7
-; CHECK-NEXT:    vlgvf %r12, %v0, 1
-; CHECK-NEXT:    rosbg %r3, %r12, 57, 57, 6
-; CHECK-NEXT:    vlgvf %r12, %v0, 2
-; CHECK-NEXT:    rosbg %r3, %r12, 58, 58, 5
-; CHECK-NEXT:    vlgvf %r12, %v0, 3
-; CHECK-NEXT:    rosbg %r3, %r12, 59, 59, 4
-; CHECK-NEXT:    nilf %r5, 1
-; CHECK-NEXT:    rosbg %r3, %r5, 32, 60, 3
-; CHECK-NEXT:    rosbg %r3, %r4, 32, 61, 2
-; CHECK-NEXT:    rosbg %r3, %r14, 32, 62, 1
-; CHECK-NEXT:    or %r3, %r13
-; CHECK-NEXT:    vlgvb %r5, %v0, 1
-; CHECK-NEXT:    vlgvb %r4, %v0, 0
-; CHECK-NEXT:    risbg %r4, %r4, 48, 176, 15
-; CHECK-NEXT:    rosbg %r4, %r5, 49, 49, 14
-; CHECK-NEXT:    vlgvb %r5, %v0, 2
-; CHECK-NEXT:    rosbg %r4, %r5, 50, 50, 13
-; CHECK-NEXT:    vlgvb %r5, %v0, 3
-; CHECK-NEXT:    rosbg %r4, %r5, 51, 51, 12
-; CHECK-NEXT:    vlgvb %r5, %v0, 4
-; CHECK-NEXT:    rosbg %r4, %r5, 52, 52, 11
-; CHECK-NEXT:    vlgvb %r5, %v0, 5
-; CHECK-NEXT:    rosbg %r4, %r5, 53, 53, 10
-; CHECK-NEXT:    vlgvb %r5, %v0, 6
-; CHECK-NEXT:    rosbg %r4, %r5, 54, 54, 9
-; CHECK-NEXT:    vlgvb %r5, %v0, 7
-; CHECK-NEXT:    rosbg %r4, %r5, 55, 55, 8
-; CHECK-NEXT:    vlgvb %r5, %v0, 8
-; CHECK-NEXT:    rosbg %r4, %r5, 56, 56, 7
-; CHECK-NEXT:    vlgvb %r5, %v0, 9
-; CHECK-NEXT:    rosbg %r4, %r5, 57, 57, 6
-; CHECK-NEXT:    vlgvb %r5, %v0, 10
-; CHECK-NEXT:    rosbg %r4, %r5, 58, 58, 5
-; CHECK-NEXT:    vlgvb %r5, %v0, 11
-; CHECK-NEXT:    rosbg %r4, %r5, 59, 59, 4
-; CHECK-NEXT:    vlgvb %r5, %v0, 12
-; CHECK-NEXT:    rosbg %r4, %r5, 60, 60, 3
-; CHECK-NEXT:    vlgvb %r5, %v0, 13
-; CHECK-NEXT:    rosbg %r4, %r5, 61, 61, 2
-; CHECK-NEXT:    vlgvb %r5, %v0, 14
-; CHECK-NEXT:    rosbg %r4, %r5, 62, 62, 1
-; CHECK-NEXT:    vlgvb %r5, %v0, 15
-; CHECK-NEXT:    rosbg %r4, %r5, 63, 63, 0
-; CHECK-NEXT:    xilf %r4, 4294967295
-; CHECK-NEXT:    or %r4, %r3
-; CHECK-NEXT:    tmll %r4, 65535
-; CHECK-NEXT:    ipm %r3
-; CHECK-NEXT:    afi %r3, -268435456
-; CHECK-NEXT:    srl %r3, 31
+; CHECK-NEXT:    vlgvf %r13, %v0, 0
+; CHECK-NEXT:    rosbg %r2, %r13, 56, 56, 7
+; CHECK-NEXT:    vlgvf %r13, %v0, 1
+; CHECK-NEXT:    rosbg %r2, %r13, 57, 57, 6
+; CHECK-NEXT:    vlgvf %r13, %v0, 2
+; CHECK-NEXT:    rosbg %r2, %r13, 58, 58, 5
+; CHECK-NEXT:    vlgvf %r13, %v0, 3
+; CHECK-NEXT:    rosbg %r2, %r13, 59, 59, 4
+; CHECK-NEXT:    nilf %r4, 1
+; CHECK-NEXT:    rosbg %r2, %r4, 32, 60, 3
+; CHECK-NEXT:    rosbg %r2, %r3, 32, 61, 2
+; CHECK-NEXT:    rosbg %r2, %r5, 32, 62, 1
+; CHECK-NEXT:    or %r2, %r14
+; CHECK-NEXT:    vlgvb %r4, %v0, 1
+; CHECK-NEXT:    vlgvb %r3, %v0, 0
+; CHECK-NEXT:    risbg %r3, %r3, 48, 176, 15
+; CHECK-NEXT:    rosbg %r3, %r4, 49, 49, 14
+; CHECK-NEXT:    vlgvb %r4, %v0, 2
+; CHECK-NEXT:    rosbg %r3, %r4, 50, 50, 13
+; CHECK-NEXT:    vlgvb %r4, %v0, 3
+; CHECK-NEXT:    rosbg %r3, %r4, 51, 51, 12
+; CHECK-NEXT:    vlgvb %r4, %v0, 4
+; CHECK-NEXT:    rosbg %r3, %r4, 52, 52, 11
+; CHECK-NEXT:    vlgvb %r4, %v0, 5
+; CHECK-NEXT:    rosbg %r3, %r4, 53, 53, 10
+; CHECK-NEXT:    vlgvb %r4, %v0, 6
+; CHECK-NEXT:    rosbg %r3, %r4, 54, 54, 9
+; CHECK-NEXT:    vlgvb %r4, %v0, 7
+; CHECK-NEXT:    rosbg %r3, %r4, 55, 55, 8
+; CHECK-NEXT:    vlgvb %r4, %v0, 8
+; CHECK-NEXT:    rosbg %r3, %r4, 56, 56, 7
+; CHECK-NEXT:    vlgvb %r4, %v0, 9
+; CHECK-NEXT:    rosbg %r3, %r4, 57, 57, 6
+; CHECK-NEXT:    vlgvb %r4, %v0, 10
+; CHECK-NEXT:    rosbg %r3, %r4, 58, 58, 5
+; CHECK-NEXT:    vlgvb %r4, %v0, 11
+; CHECK-NEXT:    rosbg %r3, %r4, 59, 59, 4
+; CHECK-NEXT:    vlgvb %r4, %v0, 12
+; CHECK-NEXT:    rosbg %r3, %r4, 60, 60, 3
+; CHECK-NEXT:    vlgvb %r4, %v0, 13
+; CHECK-NEXT:    rosbg %r3, %r4, 61, 61, 2
+; CHECK-NEXT:    vlgvb %r4, %v0, 14
+; CHECK-NEXT:    rosbg %r3, %r4, 62, 62, 1
+; CHECK-NEXT:    vlgvb %r4, %v0, 15
+; CHECK-NEXT:    rosbg %r3, %r4, 63, 63, 0
+; CHECK-NEXT:    xilf %r3, 4294967295
+; CHECK-NEXT:    or %r3, %r2
+; CHECK-NEXT:    tmll %r3, 65535
+; CHECK-NEXT:    ipm %r2
+; CHECK-NEXT:    afi %r2, -268435456
+; CHECK-NEXT:    srl %r2, 31
 ; CHECK-NEXT:    nr %r2, %r1
-; CHECK-NEXT:    nr %r2, %r3
 ; CHECK-NEXT:    nr %r2, %r0
 ; CHECK-NEXT:    larl %r1, g
 ; CHECK-NEXT:    stc %r2, 0(%r1)
-; CHECK-NEXT:    lmg %r12, %r15, 264(%r15)
+; CHECK-NEXT:    lmg %r13, %r15, 272(%r15)
 ; CHECK-NEXT:    br %r14
 entry:
   %n = alloca i32, align 4

diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index c79da37988e40b..dbc027495297b1 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -546,8 +546,8 @@ define <8 x i16> @freeze_ashr_vec(<8 x i16> %a0) nounwind {
 define <4 x i32> @freeze_ashr_vec_outofrange(<4 x i32> %a0) nounwind {
 ; X86-LABEL: freeze_ashr_vec_outofrange:
 ; X86:       # %bb.0:
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
 ; X86-NEXT:    psrad $1, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
 ; X86-NEXT:    psrad $2, %xmm0
 ; X86-NEXT:    retl
 ;
@@ -660,8 +660,8 @@ define <8 x i16> @freeze_lshr_vec(<8 x i16> %a0) nounwind {
 define <4 x i32> @freeze_lshr_vec_outofrange(<4 x i32> %a0) nounwind {
 ; X86-LABEL: freeze_lshr_vec_outofrange:
 ; X86:       # %bb.0:
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
 ; X86-NEXT:    psrld $1, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
 ; X86-NEXT:    psrld $2, %xmm0
 ; X86-NEXT:    retl
 ;

diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index a1cabb433d879b..e7727a0ab6178c 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -563,20 +563,18 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-NEXT:    subq $120, %rsp
 ; X64-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT:    pxor %xmm3, %xmm3
-; X64-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
+; X64-NEXT:    pxor %xmm2, %xmm2
+; X64-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X64-NEXT:    psrlq $31, %xmm2
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; X64-NEXT:    psrad $31, %xmm2
-; X64-NEXT:    psrlq $31, %xmm3
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT:    movq %xmm0, %rbp
-; X64-NEXT:    movq %rbp, %r14
-; X64-NEXT:    sarq $63, %r14
-; X64-NEXT:    shldq $31, %rbp, %r14
-; X64-NEXT:    movq %rbp, %r15
-; X64-NEXT:    shlq $31, %r15
+; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X64-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT:    movq %xmm3, %rbx
+; X64-NEXT:    movq %rbx, %r13
+; X64-NEXT:    sarq $63, %r13
+; X64-NEXT:    shldq $31, %rbx, %r13
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; X64-NEXT:    pxor %xmm0, %xmm0
 ; X64-NEXT:    pcmpgtd %xmm1, %xmm0
@@ -584,113 +582,112 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    movq %xmm1, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    sarq $63, %rbx
-; X64-NEXT:    movq %r15, %rdi
-; X64-NEXT:    movq %r14, %rsi
-; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    sarq $63, %r15
+; X64-NEXT:    movq %rbx, %r12
+; X64-NEXT:    shlq $31, %r12
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    movq %r13, %rsi
+; X64-NEXT:    movq %r15, %rcx
 ; X64-NEXT:    callq __divti3@PLT
-; X64-NEXT:    movq %rax, %r13
+; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r12
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    subq $1, %r13
-; X64-NEXT:    sbbq $0, %r12
-; X64-NEXT:    movq %r15, %rdi
-; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    subq $1, %rbp
+; X64-NEXT:    sbbq $0, %r14
+; X64-NEXT:    shrq $63, %rbx
+; X64-NEXT:    xorl %r15d, %ebx
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    movq %r13, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    movq %r15, %rcx
 ; X64-NEXT:    callq __modti3@PLT
 ; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    setne %al
-; X64-NEXT:    shrq $63, %rbp
-; X64-NEXT:    xorl %ebp, %ebx
 ; X64-NEXT:    testb %bl, %al
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
 ; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    movl $4294967295, %edx # imm = 0xFFFFFFFF
-; X64-NEXT:    cmpq %rdx, %r13
-; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    cmpq %rdx, %rbp
+; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    sbbq $0, %rax
-; X64-NEXT:    cmovgeq %rdx, %r13
-; X64-NEXT:    cmovgeq %rcx, %r12
+; X64-NEXT:    cmovgeq %rcx, %r14
+; X64-NEXT:    cmovgeq %rdx, %rbp
 ; X64-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT:    cmpq %r13, %rcx
+; X64-NEXT:    cmpq %rbp, %rcx
 ; X64-NEXT:    movq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; X64-NEXT:    movq $-1, %rax
-; X64-NEXT:    sbbq %r12, %rax
-; X64-NEXT:    cmovgeq %rcx, %r13
-; X64-NEXT:    movq %r13, %xmm0
+; X64-NEXT:    sbbq %r14, %rax
+; X64-NEXT:    cmovgeq %rcx, %rbp
+; X64-NEXT:    movq %rbp, %xmm0
 ; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; X64-NEXT:    # xmm0 = mem[2,3,2,3]
-; X64-NEXT:    movq %xmm0, %rbp
-; X64-NEXT:    movq %rbp, %r14
-; X64-NEXT:    sarq $63, %r14
-; X64-NEXT:    shldq $31, %rbp, %r14
-; X64-NEXT:    movq %rbp, %r15
-; X64-NEXT:    shlq $31, %r15
+; X64-NEXT:    movq %xmm0, %rbx
+; X64-NEXT:    movq %rbx, %r13
+; X64-NEXT:    sarq $63, %r13
+; X64-NEXT:    shldq $31, %rbx, %r13
 ; X64-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; X64-NEXT:    # xmm0 = mem[2,3,2,3]
 ; X64-NEXT:    movq %xmm0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    sarq $63, %rbx
-; X64-NEXT:    movq %r15, %rdi
-; X64-NEXT:    movq %r14, %rsi
-; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    sarq $63, %r15
+; X64-NEXT:    movq %rbx, %r12
+; X64-NEXT:    shlq $31, %r12
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    movq %r13, %rsi
+; X64-NEXT:    movq %r15, %rcx
 ; X64-NEXT:    callq __divti3@PLT
-; X64-NEXT:    movq %rax, %r13
+; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r12
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    subq $1, %r13
-; X64-NEXT:    sbbq $0, %r12
-; X64-NEXT:    movq %r15, %rdi
-; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    subq $1, %rbp
+; X64-NEXT:    sbbq $0, %r14
+; X64-NEXT:    shrq $63, %rbx
+; X64-NEXT:    xorl %r15d, %ebx
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    movq %r13, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    movq %r15, %rcx
 ; X64-NEXT:    callq __modti3@PLT
 ; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    setne %al
-; X64-NEXT:    shrq $63, %rbp
-; X64-NEXT:    xorl %ebp, %ebx
 ; X64-NEXT:    testb %bl, %al
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
 ; X64-NEXT:    movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT:    cmpq %rcx, %r13
-; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    cmpq %rcx, %rbp
+; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    sbbq $0, %rax
-; X64-NEXT:    cmovgeq %rcx, %r13
 ; X64-NEXT:    movl $0, %eax
-; X64-NEXT:    cmovgeq %rax, %r12
+; X64-NEXT:    cmovgeq %rax, %r14
+; X64-NEXT:    cmovgeq %rcx, %rbp
 ; X64-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT:    cmpq %r13, %rcx
+; X64-NEXT:    cmpq %rbp, %rcx
 ; X64-NEXT:    movq $-1, %rax
-; X64-NEXT:    sbbq %r12, %rax
-; X64-NEXT:    cmovgeq %rcx, %r13
-; X64-NEXT:    movq %r13, %xmm0
+; X64-NEXT:    sbbq %r14, %rax
+; X64-NEXT:    cmovgeq %rcx, %rbp
+; X64-NEXT:    movq %rbp, %xmm0
 ; X64-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; X64-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; X64-NEXT:    psrlq $1, %xmm1
 ; X64-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    pxor %xmm0, %xmm0
-; X64-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; X64-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; X64-NEXT:    psrad $31, %xmm1
+; X64-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X64-NEXT:    psrlq $31, %xmm0
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT:    psrad $31, %xmm1
 ; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT:    movq %xmm0, %rbp
-; X64-NEXT:    movq %rbp, %r14
-; X64-NEXT:    sarq $63, %r14
-; X64-NEXT:    shldq $31, %rbp, %r14
-; X64-NEXT:    movq %rbp, %r15
-; X64-NEXT:    shlq $31, %r15
+; X64-NEXT:    movq %xmm0, %rbx
+; X64-NEXT:    movq %rbx, %r13
+; X64-NEXT:    sarq $63, %r13
+; X64-NEXT:    shldq $31, %rbx, %r13
 ; X64-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X64-NEXT:    pxor %xmm1, %xmm1
 ; X64-NEXT:    pcmpgtd %xmm0, %xmm1
@@ -698,92 +695,94 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    movq %xmm0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    sarq $63, %rbx
-; X64-NEXT:    movq %r15, %rdi
-; X64-NEXT:    movq %r14, %rsi
-; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    sarq $63, %r15
+; X64-NEXT:    movq %rbx, %r12
+; X64-NEXT:    shlq $31, %r12
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    movq %r13, %rsi
+; X64-NEXT:    movq %r15, %rcx
 ; X64-NEXT:    callq __divti3@PLT
-; X64-NEXT:    movq %rax, %r13
+; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r12
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    subq $1, %r13
-; X64-NEXT:    sbbq $0, %r12
-; X64-NEXT:    movq %r15, %rdi
-; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    subq $1, %rbp
+; X64-NEXT:    sbbq $0, %r14
+; X64-NEXT:    shrq $63, %rbx
+; X64-NEXT:    xorl %r15d, %ebx
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    movq %r13, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    movq %r15, %rcx
 ; X64-NEXT:    callq __modti3@PLT
 ; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    setne %al
-; X64-NEXT:    shrq $63, %rbp
-; X64-NEXT:    xorl %ebp, %ebx
 ; X64-NEXT:    testb %bl, %al
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
 ; X64-NEXT:    movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT:    cmpq %rcx, %r13
-; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    cmpq %rcx, %rbp
+; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    sbbq $0, %rax
-; X64-NEXT:    cmovgeq %rcx, %r13
 ; X64-NEXT:    movl $0, %eax
-; X64-NEXT:    cmovgeq %rax, %r12
+; X64-NEXT:    cmovgeq %rax, %r14
+; X64-NEXT:    cmovgeq %rcx, %rbp
 ; X64-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT:    cmpq %r13, %rcx
+; X64-NEXT:    cmpq %rbp, %rcx
 ; X64-NEXT:    movq $-1, %rax
-; X64-NEXT:    sbbq %r12, %rax
-; X64-NEXT:    cmovgeq %rcx, %r13
-; X64-NEXT:    movq %r13, %xmm0
+; X64-NEXT:    sbbq %r14, %rax
+; X64-NEXT:    cmovgeq %rcx, %rbp
+; X64-NEXT:    movq %rbp, %xmm0
 ; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; X64-NEXT:    # xmm0 = mem[2,3,2,3]
-; X64-NEXT:    movq %xmm0, %rbp
-; X64-NEXT:    movq %rbp, %r14
-; X64-NEXT:    sarq $63, %r14
-; X64-NEXT:    shldq $31, %rbp, %r14
-; X64-NEXT:    movq %rbp, %r15
-; X64-NEXT:    shlq $31, %r15
+; X64-NEXT:    movq %xmm0, %rbx
+; X64-NEXT:    movq %rbx, %r13
+; X64-NEXT:    sarq $63, %r13
+; X64-NEXT:    shldq $31, %rbx, %r13
 ; X64-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; X64-NEXT:    # xmm0 = mem[2,3,2,3]
 ; X64-NEXT:    movq %xmm0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    sarq $63, %rbx
-; X64-NEXT:    movq %r15, %rdi
-; X64-NEXT:    movq %r14, %rsi
-; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    sarq $63, %r15
+; X64-NEXT:    movq %rbx, %r12
+; X64-NEXT:    shlq $31, %r12
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    movq %r13, %rsi
+; X64-NEXT:    movq %r15, %rcx
 ; X64-NEXT:    callq __divti3@PLT
-; X64-NEXT:    movq %rax, %r13
+; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r12
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    subq $1, %r13
-; X64-NEXT:    sbbq $0, %r12
-; X64-NEXT:    movq %r15, %rdi
-; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    subq $1, %rbp
+; X64-NEXT:    sbbq $0, %r14
+; X64-NEXT:    shrq $63, %rbx
+; X64-NEXT:    xorl %r15d, %ebx
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    movq %r13, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    movq %r15, %rcx
 ; X64-NEXT:    callq __modti3@PLT
 ; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    setne %al
-; X64-NEXT:    shrq $63, %rbp
-; X64-NEXT:    xorl %ebp, %ebx
 ; X64-NEXT:    testb %bl, %al
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
 ; X64-NEXT:    movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT:    cmpq %rcx, %r13
-; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    cmpq %rcx, %rbp
+; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    sbbq $0, %rax
-; X64-NEXT:    cmovgeq %rcx, %r13
 ; X64-NEXT:    movl $0, %eax
-; X64-NEXT:    cmovgeq %rax, %r12
+; X64-NEXT:    cmovgeq %rax, %r14
+; X64-NEXT:    cmovgeq %rcx, %rbp
 ; X64-NEXT:    movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
-; X64-NEXT:    cmpq %r13, %rax
-; X64-NEXT:    sbbq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT:    cmovgeq %rax, %r13
-; X64-NEXT:    movq %r13, %xmm1
+; X64-NEXT:    cmpq %rbp, %rax
+; X64-NEXT:    sbbq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT:    cmovgeq %rax, %rbp
+; X64-NEXT:    movq %rbp, %xmm1
 ; X64-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; X64-NEXT:    psrlq $1, %xmm0

diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
index a80d8d8cd01b85..76cf2423a254fe 100644
--- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
+++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
@@ -121,12 +121,10 @@ define void @failing(ptr %0, ptr %1) nounwind {
 ; CHECK-AVX2-NEXT:    # => This Inner Loop Header: Depth=2
 ; CHECK-AVX2-NEXT:    vmovdqu 1024(%rdx,%rsi), %xmm5
 ; CHECK-AVX2-NEXT:    vmovdqu 1040(%rdx,%rsi), %xmm6
-; CHECK-AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm5[0],xmm6[0]
-; CHECK-AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; CHECK-AVX2-NEXT:    vmovq %xmm5, %rdi
-; CHECK-AVX2-NEXT:    vpextrq $1, %xmm5, %r8
-; CHECK-AVX2-NEXT:    vmovq %xmm7, %r9
-; CHECK-AVX2-NEXT:    vpextrq $1, %xmm7, %r10
+; CHECK-AVX2-NEXT:    vpextrq $1, %xmm5, %rdi
+; CHECK-AVX2-NEXT:    vpextrq $1, %xmm6, %r8
+; CHECK-AVX2-NEXT:    vmovq %xmm5, %r9
+; CHECK-AVX2-NEXT:    vmovq %xmm6, %r10
 ; CHECK-AVX2-NEXT:    negq %r10
 ; CHECK-AVX2-NEXT:    movq %rcx, %r10
 ; CHECK-AVX2-NEXT:    sbbq %r8, %r10

diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index f7a27a5b914466..9ae1f270e88337 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -65,7 +65,6 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
 ; X64-NO-BMI2-NEXT:    movzwl (%rdi), %eax
-; X64-NO-BMI2-NEXT:    movzwl %ax, %eax
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrl %cl, %eax
@@ -75,7 +74,6 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X64-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
 ; X64-BMI2-NEXT:    movzwl (%rdi), %eax
-; X64-BMI2-NEXT:    movzwl %ax, %eax
 ; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    shrxl %esi, %eax, %eax
 ; X64-BMI2-NEXT:    movb %al, (%rdx)
@@ -83,15 +81,14 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2:       # %bb.0:
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT:    movzwl (%edx), %edx
-; X86-NO-BMI2-NEXT:    movzwl %dx, %edx
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NEXT:    movzwl (%eax), %eax
 ; X86-NO-BMI2-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NEXT:    movb %dl, (%eax)
+; X86-NO-BMI2-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NEXT:    movb %al, (%edx)
 ; X86-NO-BMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
@@ -100,7 +97,6 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movzwl (%edx), %edx
-; X86-BMI2-NEXT:    movzwl %dx, %edx
 ; X86-BMI2-NEXT:    shll $3, %ecx
 ; X86-BMI2-NEXT:    shrxl %ecx, %edx, %ecx
 ; X86-BMI2-NEXT:    movb %cl, (%eax)
@@ -123,7 +119,6 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
 ; X64-NO-BMI2-NEXT:    movzwl (%rdi), %eax
-; X64-NO-BMI2-NEXT:    movzwl %ax, %eax
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrl %cl, %eax
@@ -133,7 +128,6 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X64-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
 ; X64-BMI2-NEXT:    movzwl (%rdi), %eax
-; X64-BMI2-NEXT:    movzwl %ax, %eax
 ; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    shrxl %esi, %eax, %eax
 ; X64-BMI2-NEXT:    movw %ax, (%rdx)
@@ -145,7 +139,6 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NEXT:    movzwl (%edx), %edx
-; X86-NO-BMI2-NEXT:    movzwl %dx, %edx
 ; X86-NO-BMI2-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NEXT:    shrl %cl, %edx
@@ -158,7 +151,6 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movzwl (%edx), %edx
-; X86-BMI2-NEXT:    movzwl %dx, %edx
 ; X86-BMI2-NEXT:    shll $3, %ecx
 ; X86-BMI2-NEXT:    shrxl %ecx, %edx, %ecx
 ; X86-BMI2-NEXT:    movw %cx, (%eax)
@@ -179,9 +171,8 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
@@ -189,9 +180,8 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X64-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-BMI2-NEXT:    shll $3, %esi
-; X64-BMI2-NEXT:    movq %xmm0, %rax
+; X64-BMI2-NEXT:    movl (%rdi), %eax
 ; X64-BMI2-NEXT:    shrxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    movb %al, (%rdx)
 ; X64-BMI2-NEXT:    retq
@@ -199,99 +189,49 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
-; X86-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
+; X86-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movl (%edx), %edx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorl %esi, %esi
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    testb $32, %cl
+; X86-SHLD-NEXT:    cmovnel %esi, %edx
+; X86-SHLD-NEXT:    movb %dl, (%eax)
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %bl, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %bl, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %init = load <4 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <8 x i8> %intermediate.sroa.0.0.vec.expand, <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
@@ -308,9 +248,8 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NEXT:    movw %ax, (%rdx)
@@ -318,107 +257,58 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X64-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-BMI2-NEXT:    shll $3, %esi
-; X64-BMI2-NEXT:    movq %xmm0, %rax
+; X64-BMI2-NEXT:    movl (%rdi), %eax
 ; X64-BMI2-NEXT:    shrxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    movw %ax, (%rdx)
 ; X64-BMI2-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movw %si, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movw %si, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
-; X86-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movw %si, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
+; X86-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movl (%edx), %edx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorl %esi, %esi
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    testb $32, %cl
+; X86-SHLD-NEXT:    cmovnel %esi, %edx
+; X86-SHLD-NEXT:    movw %dx, (%eax)
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %si, (%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %si, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %init = load <4 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <8 x i8> %intermediate.sroa.0.0.vec.expand, <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
@@ -434,9 +324,8 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
@@ -444,107 +333,58 @@ define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X64-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-BMI2-NEXT:    shll $3, %esi
-; X64-BMI2-NEXT:    movq %xmm0, %rax
+; X64-BMI2-NEXT:    movl (%rdi), %eax
 ; X64-BMI2-NEXT:    shrxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    movl %eax, (%rdx)
 ; X64-BMI2-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
-; X86-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
+; X86-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movl (%edx), %edx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorl %esi, %esi
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    testb $32, %cl
+; X86-SHLD-NEXT:    cmovnel %esi, %edx
+; X86-SHLD-NEXT:    movl %edx, (%eax)
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %init = load <4 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <8 x i8> %intermediate.sroa.0.0.vec.expand, <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
@@ -560,88 +400,51 @@ define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb %al, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movb %sil, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb %sil, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+; X64-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-SHLD:       # %bb.0:
+; X64-SHLD-NEXT:    movq %rsi, %rcx
+; X64-SHLD-NEXT:    movq (%rdi), %rax
+; X64-SHLD-NEXT:    shll $3, %ecx
+; X64-SHLD-NEXT:    xorl %esi, %esi
+; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT:    testb $64, %cl
+; X64-SHLD-NEXT:    cmovneq %rsi, %rax
+; X64-SHLD-NEXT:    movb %al, (%rdx)
+; X64-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb %al, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %sil, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
 ; X86-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $32, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -670,88 +473,51 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movw %ax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movw %si, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movw %si, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+; X64-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-SHLD:       # %bb.0:
+; X64-SHLD-NEXT:    movq %rsi, %rcx
+; X64-SHLD-NEXT:    movq (%rdi), %rax
+; X64-SHLD-NEXT:    shll $3, %ecx
+; X64-SHLD-NEXT:    xorl %esi, %esi
+; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT:    testb $64, %cl
+; X64-SHLD-NEXT:    cmovneq %rsi, %rax
+; X64-SHLD-NEXT:    movw %ax, (%rdx)
+; X64-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movw %ax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %si, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
 ; X86-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $32, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -779,88 +545,51 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+; X64-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-SHLD:       # %bb.0:
+; X64-SHLD-NEXT:    movq %rsi, %rcx
+; X64-SHLD-NEXT:    movq (%rdi), %rax
+; X64-SHLD-NEXT:    shll $3, %ecx
+; X64-SHLD-NEXT:    xorl %esi, %esi
+; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT:    testb $64, %cl
+; X64-SHLD-NEXT:    cmovneq %rsi, %rax
+; X64-SHLD-NEXT:    movl %eax, (%rdx)
+; X64-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
 ; X86-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $32, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -888,88 +617,51 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+; X64-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-SHLD:       # %bb.0:
+; X64-SHLD-NEXT:    movq %rsi, %rcx
+; X64-SHLD-NEXT:    movq (%rdi), %rax
+; X64-SHLD-NEXT:    shll $3, %ecx
+; X64-SHLD-NEXT:    xorl %esi, %esi
+; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT:    testb $64, %cl
+; X64-SHLD-NEXT:    cmovneq %rsi, %rax
+; X64-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
 ; X86-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $32, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -1941,7 +1633,9 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; ALL: {{.*}}
+; X64-HAVE-BMI2-HAVE-SHLD: {{.*}}
+; X64-NO-BMI2-HAVE-SHLD: {{.*}}
 ; X64-NO-SHLD: {{.*}}
-; X64-SHLD: {{.*}}
+; X86-HAVE-BMI2-HAVE-SHLD: {{.*}}
+; X86-NO-BMI2-HAVE-SHLD: {{.*}}
 ; X86-NO-SHLD: {{.*}}
-; X86-SHLD: {{.*}}