[llvm] [DAG] Fold freeze(shuffle(x,y,m)) -> shuffle(freeze(x),freeze(y),m) (PR #90952)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Fri May 3 02:27:08 PDT 2024


https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/90952

If the shuffle mask contains no undef elements, then we can move the freeze through a shuffle node.

This requires special case handling to create a new ShuffleVectorSDNode.

>From 160f34fdd55129c8eaa27269c6d7652f82c6075d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 3 May 2024 10:25:36 +0100
Subject: [PATCH] [DAG] Fold freeze(shuffle(x,y,m)) ->
 shuffle(freeze(x),freeze(y),m)

If the shuffle mask contains no undef elements, then we can move the freeze through a shuffle node.

This requires special case handling to create a new ShuffleVectorSDNode.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  54 +-
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |   9 +
 llvm/test/CodeGen/SystemZ/pr60413.ll          | 176 ++---
 llvm/test/CodeGen/X86/freeze-binary.ll        |   4 +-
 .../test/CodeGen/X86/setcc-non-simple-type.ll |  10 +-
 ...ad-of-small-alloca-with-zero-upper-half.ll | 734 +++++-------------
 6 files changed, 353 insertions(+), 634 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c0bbea16a64262..d8c2059fdff47e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15458,6 +15458,23 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
       N0->getNumValues() != 1 || !N0->hasOneUse())
     return SDValue();
 
+  auto FreezeOperand = [&](SDValue MaybePoisonOperand) {
+    // Don't replace every single UNDEF everywhere with frozen UNDEF, though.
+    if (MaybePoisonOperand.getOpcode() == ISD::UNDEF)
+      return;
+    // First, freeze each offending operand.
+    SDValue FrozenMaybePoisonOperand = DAG.getFreeze(MaybePoisonOperand);
+    // Then, change all other uses of unfrozen operand to use frozen operand.
+    DAG.ReplaceAllUsesOfValueWith(MaybePoisonOperand, FrozenMaybePoisonOperand);
+    if (FrozenMaybePoisonOperand.getOpcode() == ISD::FREEZE &&
+        FrozenMaybePoisonOperand.getOperand(0) == FrozenMaybePoisonOperand) {
+      // But, that also updated the use in the freeze we just created, thus
+      // creating a cycle in a DAG. Let's undo that by mutating the freeze.
+      DAG.UpdateNodeOperands(FrozenMaybePoisonOperand.getNode(),
+                             MaybePoisonOperand);
+    }
+  };
+
   bool AllowMultipleMaybePoisonOperands = N0.getOpcode() == ISD::BUILD_VECTOR ||
                                           N0.getOpcode() == ISD::BUILD_PAIR ||
                                           N0.getOpcode() == ISD::CONCAT_VECTORS;
@@ -15482,6 +15499,24 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
     }
   }
 
+  // Special case handling for ShuffleVectorSDNode nodes.
+  // Fold freeze(shuffle(x,y,m)) -> shuffle(freeze(x),freeze(y),m)
+  // iff shuffle mask doesn't contain any undef elements.
+  if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N0)) {
+    FreezeOperand(SVN->getOperand(0));
+    FreezeOperand(SVN->getOperand(1));
+    // This node has been merged with another.
+    if (N->getOpcode() == ISD::DELETED_NODE)
+      return SDValue(N, 0);
+    // NOTE: Get the operands again in case they've been updated.
+    SDValue Op0 = SVN->getOperand(0);
+    SDValue Op1 = SVN->getOperand(1);
+    Op0 = Op0.isUndef() ? DAG.getFreeze(Op0) : Op0;
+    Op1 = Op1.isUndef() ? DAG.getFreeze(Op1) : Op1;
+    return DAG.getVectorShuffle(N0.getValueType(), SDLoc(N), Op0, Op1,
+                                SVN->getMask());
+  }
+
   SmallSetVector<SDValue, 8> MaybePoisonOperands;
   for (SDValue Op : N0->ops()) {
     if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly*/ false,
@@ -15500,22 +15535,9 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
   // it could create undef or poison due to it's poison-generating flags.
   // So not finding any maybe-poison operands is fine.
 
-  for (SDValue MaybePoisonOperand : MaybePoisonOperands) {
-    // Don't replace every single UNDEF everywhere with frozen UNDEF, though.
-    if (MaybePoisonOperand.getOpcode() == ISD::UNDEF)
-      continue;
-    // First, freeze each offending operand.
-    SDValue FrozenMaybePoisonOperand = DAG.getFreeze(MaybePoisonOperand);
-    // Then, change all other uses of unfrozen operand to use frozen operand.
-    DAG.ReplaceAllUsesOfValueWith(MaybePoisonOperand, FrozenMaybePoisonOperand);
-    if (FrozenMaybePoisonOperand.getOpcode() == ISD::FREEZE &&
-        FrozenMaybePoisonOperand.getOperand(0) == FrozenMaybePoisonOperand) {
-      // But, that also updated the use in the freeze we just created, thus
-      // creating a cycle in a DAG. Let's undo that by mutating the freeze.
-      DAG.UpdateNodeOperands(FrozenMaybePoisonOperand.getNode(),
-                             MaybePoisonOperand);
-    }
-  }
+  // Freeze all uses of the different operands.
+  for (SDValue MaybePoisonOperand : MaybePoisonOperands)
+    FreezeOperand(MaybePoisonOperand);
 
   // This node has been merged with another.
   if (N->getOpcode() == ISD::DELETED_NODE)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 25a728bf9ba3e7..fef4b7dbf558ec 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5225,6 +5225,15 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
     return KnownIdx.getMaxValue().uge(VecVT.getVectorMinNumElements());
   }
 
+  case ISD::VECTOR_SHUFFLE: {
+    // Check for any demanded shuffle element that is undef.
+    auto *SVN = cast<ShuffleVectorSDNode>(Op);
+    for (auto [Idx, Elt] : enumerate(SVN->getMask()))
+      if (Elt < 0 && DemandedElts[Idx])
+        return true;
+    return false;
+  }
+
   default:
     // Allow the target to implement this method for its nodes.
     if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::INTRINSIC_WO_CHAIN ||
diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll
index 5a629567d07069..8a6a30318ae583 100644
--- a/llvm/test/CodeGen/SystemZ/pr60413.ll
+++ b/llvm/test/CodeGen/SystemZ/pr60413.ll
@@ -13,114 +13,110 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
 define dso_local void @m() local_unnamed_addr #1 {
 ; CHECK-LABEL: m:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    stmg %r12, %r15, 96(%r15)
+; CHECK-NEXT:    stmg %r13, %r15, 104(%r15)
 ; CHECK-NEXT:    aghi %r15, -168
-; CHECK-NEXT:    llhrl %r2, f+4
-; CHECK-NEXT:    sll %r2, 8
-; CHECK-NEXT:    larl %r1, f
-; CHECK-NEXT:    ic %r2, 6(%r1)
-; CHECK-NEXT:    larl %r1, e
-; CHECK-NEXT:    lb %r0, 3(%r1)
-; CHECK-NEXT:    clfi %r2, 128
+; CHECK-NEXT:    lhrl %r1, f+4
+; CHECK-NEXT:    sll %r1, 8
+; CHECK-NEXT:    larl %r2, f
+; CHECK-NEXT:    ic %r1, 6(%r2)
+; CHECK-NEXT:    larl %r2, e
+; CHECK-NEXT:    lb %r0, 3(%r2)
+; CHECK-NEXT:    vlvgp %v0, %r0, %r1
+; CHECK-NEXT:    vlvgp %v1, %r1, %r0
+; CHECK-NEXT:    vlvgf %v1, %r1, 0
+; CHECK-NEXT:    vlvgf %v1, %r1, 2
+; CHECK-NEXT:    vlvgp %v2, %r1, %r1
+; CHECK-NEXT:    # kill: def $r1l killed $r1l killed $r1d
+; CHECK-NEXT:    nilh %r1, 255
+; CHECK-NEXT:    chi %r1, 128
 ; CHECK-NEXT:    ipm %r1
 ; CHECK-NEXT:    risbg %r1, %r1, 63, 191, 36
-; CHECK-NEXT:    vlvgp %v1, %r2, %r0
-; CHECK-NEXT:    vlvgf %v1, %r2, 0
-; CHECK-NEXT:    vlvgf %v1, %r2, 2
-; CHECK-NEXT:    vlvgp %v0, %r0, %r2
-; CHECK-NEXT:    vlvgp %v2, %r2, %r2
-; CHECK-NEXT:    # kill: def $r2l killed $r2l killed $r2d
-; CHECK-NEXT:    nilh %r2, 255
-; CHECK-NEXT:    chi %r2, 128
-; CHECK-NEXT:    ipm %r2
-; CHECK-NEXT:    risbg %r2, %r2, 63, 191, 36
 ; CHECK-NEXT:    vlvgf %v0, %r0, 0
 ; CHECK-NEXT:    vlvgf %v0, %r0, 2
-; CHECK-NEXT:    vrepf %v2, %v2, 1
 ; CHECK-NEXT:    vgbm %v3, 30583
 ; CHECK-NEXT:    vn %v0, %v0, %v3
 ; CHECK-NEXT:    vn %v1, %v1, %v3
+; CHECK-NEXT:    vrepf %v2, %v2, 1
 ; CHECK-NEXT:    vn %v2, %v2, %v3
 ; CHECK-NEXT:    vrepif %v3, 127
 ; CHECK-NEXT:    vchlf %v1, %v1, %v3
-; CHECK-NEXT:    vlgvf %r12, %v1, 0
+; CHECK-NEXT:    vlgvf %r13, %v1, 0
 ; CHECK-NEXT:    vchlf %v2, %v2, %v3
-; CHECK-NEXT:    vlgvf %r4, %v2, 1
-; CHECK-NEXT:    nilf %r4, 1
-; CHECK-NEXT:    vlgvf %r5, %v2, 0
-; CHECK-NEXT:    risbg %r3, %r5, 48, 176, 15
-; CHECK-NEXT:    rosbg %r3, %r4, 32, 49, 14
-; CHECK-NEXT:    vlgvf %r14, %v2, 2
+; CHECK-NEXT:    vlgvf %r3, %v2, 1
+; CHECK-NEXT:    nilf %r3, 1
+; CHECK-NEXT:    vlgvf %r4, %v2, 0
+; CHECK-NEXT:    risbg %r2, %r4, 48, 176, 15
+; CHECK-NEXT:    rosbg %r2, %r3, 32, 49, 14
+; CHECK-NEXT:    vlgvf %r5, %v2, 2
+; CHECK-NEXT:    nilf %r5, 1
+; CHECK-NEXT:    rosbg %r2, %r5, 32, 50, 13
+; CHECK-NEXT:    vlgvf %r14, %v2, 3
 ; CHECK-NEXT:    nilf %r14, 1
-; CHECK-NEXT:    rosbg %r3, %r14, 32, 50, 13
-; CHECK-NEXT:    vlgvf %r13, %v2, 3
-; CHECK-NEXT:    nilf %r13, 1
-; CHECK-NEXT:    rosbg %r3, %r13, 32, 51, 12
-; CHECK-NEXT:    rosbg %r3, %r12, 52, 52, 11
-; CHECK-NEXT:    vlgvf %r12, %v1, 1
-; CHECK-NEXT:    rosbg %r3, %r12, 53, 53, 10
-; CHECK-NEXT:    vlgvf %r12, %v1, 2
-; CHECK-NEXT:    rosbg %r3, %r12, 54, 54, 9
-; CHECK-NEXT:    vlgvf %r12, %v1, 3
-; CHECK-NEXT:    rosbg %r3, %r12, 55, 55, 8
+; CHECK-NEXT:    rosbg %r2, %r14, 32, 51, 12
+; CHECK-NEXT:    rosbg %r2, %r13, 52, 52, 11
+; CHECK-NEXT:    vlgvf %r13, %v1, 1
+; CHECK-NEXT:    rosbg %r2, %r13, 53, 53, 10
+; CHECK-NEXT:    vlgvf %r13, %v1, 2
+; CHECK-NEXT:    rosbg %r2, %r13, 54, 54, 9
+; CHECK-NEXT:    vlgvf %r13, %v1, 3
+; CHECK-NEXT:    rosbg %r2, %r13, 55, 55, 8
 ; CHECK-NEXT:    vchlf %v0, %v0, %v3
-; CHECK-NEXT:    vlgvf %r12, %v0, 0
-; CHECK-NEXT:    rosbg %r3, %r12, 56, 56, 7
-; CHECK-NEXT:    vlgvf %r12, %v0, 1
-; CHECK-NEXT:    rosbg %r3, %r12, 57, 57, 6
-; CHECK-NEXT:    vlgvf %r12, %v0, 2
-; CHECK-NEXT:    rosbg %r3, %r12, 58, 58, 5
-; CHECK-NEXT:    vlgvf %r12, %v0, 3
-; CHECK-NEXT:    rosbg %r3, %r12, 59, 59, 4
-; CHECK-NEXT:    nilf %r5, 1
-; CHECK-NEXT:    rosbg %r3, %r5, 32, 60, 3
-; CHECK-NEXT:    rosbg %r3, %r4, 32, 61, 2
-; CHECK-NEXT:    rosbg %r3, %r14, 32, 62, 1
-; CHECK-NEXT:    or %r3, %r13
-; CHECK-NEXT:    vlgvb %r5, %v0, 1
-; CHECK-NEXT:    vlgvb %r4, %v0, 0
-; CHECK-NEXT:    risbg %r4, %r4, 48, 176, 15
-; CHECK-NEXT:    rosbg %r4, %r5, 49, 49, 14
-; CHECK-NEXT:    vlgvb %r5, %v0, 2
-; CHECK-NEXT:    rosbg %r4, %r5, 50, 50, 13
-; CHECK-NEXT:    vlgvb %r5, %v0, 3
-; CHECK-NEXT:    rosbg %r4, %r5, 51, 51, 12
-; CHECK-NEXT:    vlgvb %r5, %v0, 4
-; CHECK-NEXT:    rosbg %r4, %r5, 52, 52, 11
-; CHECK-NEXT:    vlgvb %r5, %v0, 5
-; CHECK-NEXT:    rosbg %r4, %r5, 53, 53, 10
-; CHECK-NEXT:    vlgvb %r5, %v0, 6
-; CHECK-NEXT:    rosbg %r4, %r5, 54, 54, 9
-; CHECK-NEXT:    vlgvb %r5, %v0, 7
-; CHECK-NEXT:    rosbg %r4, %r5, 55, 55, 8
-; CHECK-NEXT:    vlgvb %r5, %v0, 8
-; CHECK-NEXT:    rosbg %r4, %r5, 56, 56, 7
-; CHECK-NEXT:    vlgvb %r5, %v0, 9
-; CHECK-NEXT:    rosbg %r4, %r5, 57, 57, 6
-; CHECK-NEXT:    vlgvb %r5, %v0, 10
-; CHECK-NEXT:    rosbg %r4, %r5, 58, 58, 5
-; CHECK-NEXT:    vlgvb %r5, %v0, 11
-; CHECK-NEXT:    rosbg %r4, %r5, 59, 59, 4
-; CHECK-NEXT:    vlgvb %r5, %v0, 12
-; CHECK-NEXT:    rosbg %r4, %r5, 60, 60, 3
-; CHECK-NEXT:    vlgvb %r5, %v0, 13
-; CHECK-NEXT:    rosbg %r4, %r5, 61, 61, 2
-; CHECK-NEXT:    vlgvb %r5, %v0, 14
-; CHECK-NEXT:    rosbg %r4, %r5, 62, 62, 1
-; CHECK-NEXT:    vlgvb %r5, %v0, 15
-; CHECK-NEXT:    rosbg %r4, %r5, 63, 63, 0
-; CHECK-NEXT:    xilf %r4, 4294967295
-; CHECK-NEXT:    or %r4, %r3
-; CHECK-NEXT:    tmll %r4, 65535
-; CHECK-NEXT:    ipm %r3
-; CHECK-NEXT:    afi %r3, -268435456
-; CHECK-NEXT:    srl %r3, 31
+; CHECK-NEXT:    vlgvf %r13, %v0, 0
+; CHECK-NEXT:    rosbg %r2, %r13, 56, 56, 7
+; CHECK-NEXT:    vlgvf %r13, %v0, 1
+; CHECK-NEXT:    rosbg %r2, %r13, 57, 57, 6
+; CHECK-NEXT:    vlgvf %r13, %v0, 2
+; CHECK-NEXT:    rosbg %r2, %r13, 58, 58, 5
+; CHECK-NEXT:    vlgvf %r13, %v0, 3
+; CHECK-NEXT:    rosbg %r2, %r13, 59, 59, 4
+; CHECK-NEXT:    nilf %r4, 1
+; CHECK-NEXT:    rosbg %r2, %r4, 32, 60, 3
+; CHECK-NEXT:    rosbg %r2, %r3, 32, 61, 2
+; CHECK-NEXT:    rosbg %r2, %r5, 32, 62, 1
+; CHECK-NEXT:    or %r2, %r14
+; CHECK-NEXT:    vlgvb %r4, %v0, 1
+; CHECK-NEXT:    vlgvb %r3, %v0, 0
+; CHECK-NEXT:    risbg %r3, %r3, 48, 176, 15
+; CHECK-NEXT:    rosbg %r3, %r4, 49, 49, 14
+; CHECK-NEXT:    vlgvb %r4, %v0, 2
+; CHECK-NEXT:    rosbg %r3, %r4, 50, 50, 13
+; CHECK-NEXT:    vlgvb %r4, %v0, 3
+; CHECK-NEXT:    rosbg %r3, %r4, 51, 51, 12
+; CHECK-NEXT:    vlgvb %r4, %v0, 4
+; CHECK-NEXT:    rosbg %r3, %r4, 52, 52, 11
+; CHECK-NEXT:    vlgvb %r4, %v0, 5
+; CHECK-NEXT:    rosbg %r3, %r4, 53, 53, 10
+; CHECK-NEXT:    vlgvb %r4, %v0, 6
+; CHECK-NEXT:    rosbg %r3, %r4, 54, 54, 9
+; CHECK-NEXT:    vlgvb %r4, %v0, 7
+; CHECK-NEXT:    rosbg %r3, %r4, 55, 55, 8
+; CHECK-NEXT:    vlgvb %r4, %v0, 8
+; CHECK-NEXT:    rosbg %r3, %r4, 56, 56, 7
+; CHECK-NEXT:    vlgvb %r4, %v0, 9
+; CHECK-NEXT:    rosbg %r3, %r4, 57, 57, 6
+; CHECK-NEXT:    vlgvb %r4, %v0, 10
+; CHECK-NEXT:    rosbg %r3, %r4, 58, 58, 5
+; CHECK-NEXT:    vlgvb %r4, %v0, 11
+; CHECK-NEXT:    rosbg %r3, %r4, 59, 59, 4
+; CHECK-NEXT:    vlgvb %r4, %v0, 12
+; CHECK-NEXT:    rosbg %r3, %r4, 60, 60, 3
+; CHECK-NEXT:    vlgvb %r4, %v0, 13
+; CHECK-NEXT:    rosbg %r3, %r4, 61, 61, 2
+; CHECK-NEXT:    vlgvb %r4, %v0, 14
+; CHECK-NEXT:    rosbg %r3, %r4, 62, 62, 1
+; CHECK-NEXT:    vlgvb %r4, %v0, 15
+; CHECK-NEXT:    rosbg %r3, %r4, 63, 63, 0
+; CHECK-NEXT:    xilf %r3, 4294967295
+; CHECK-NEXT:    or %r3, %r2
+; CHECK-NEXT:    tmll %r3, 65535
+; CHECK-NEXT:    ipm %r2
+; CHECK-NEXT:    afi %r2, -268435456
+; CHECK-NEXT:    srl %r2, 31
 ; CHECK-NEXT:    nr %r2, %r1
-; CHECK-NEXT:    nr %r2, %r3
 ; CHECK-NEXT:    nr %r2, %r0
 ; CHECK-NEXT:    larl %r1, g
 ; CHECK-NEXT:    stc %r2, 0(%r1)
-; CHECK-NEXT:    lmg %r12, %r15, 264(%r15)
+; CHECK-NEXT:    lmg %r13, %r15, 272(%r15)
 ; CHECK-NEXT:    br %r14
 entry:
   %n = alloca i32, align 4
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index c79da37988e40b..dbc027495297b1 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -546,8 +546,8 @@ define <8 x i16> @freeze_ashr_vec(<8 x i16> %a0) nounwind {
 define <4 x i32> @freeze_ashr_vec_outofrange(<4 x i32> %a0) nounwind {
 ; X86-LABEL: freeze_ashr_vec_outofrange:
 ; X86:       # %bb.0:
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
 ; X86-NEXT:    psrad $1, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
 ; X86-NEXT:    psrad $2, %xmm0
 ; X86-NEXT:    retl
 ;
@@ -660,8 +660,8 @@ define <8 x i16> @freeze_lshr_vec(<8 x i16> %a0) nounwind {
 define <4 x i32> @freeze_lshr_vec_outofrange(<4 x i32> %a0) nounwind {
 ; X86-LABEL: freeze_lshr_vec_outofrange:
 ; X86:       # %bb.0:
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
 ; X86-NEXT:    psrld $1, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
 ; X86-NEXT:    psrld $2, %xmm0
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
index a80d8d8cd01b85..76cf2423a254fe 100644
--- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
+++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
@@ -121,12 +121,10 @@ define void @failing(ptr %0, ptr %1) nounwind {
 ; CHECK-AVX2-NEXT:    # => This Inner Loop Header: Depth=2
 ; CHECK-AVX2-NEXT:    vmovdqu 1024(%rdx,%rsi), %xmm5
 ; CHECK-AVX2-NEXT:    vmovdqu 1040(%rdx,%rsi), %xmm6
-; CHECK-AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm5[0],xmm6[0]
-; CHECK-AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; CHECK-AVX2-NEXT:    vmovq %xmm5, %rdi
-; CHECK-AVX2-NEXT:    vpextrq $1, %xmm5, %r8
-; CHECK-AVX2-NEXT:    vmovq %xmm7, %r9
-; CHECK-AVX2-NEXT:    vpextrq $1, %xmm7, %r10
+; CHECK-AVX2-NEXT:    vpextrq $1, %xmm5, %rdi
+; CHECK-AVX2-NEXT:    vpextrq $1, %xmm6, %r8
+; CHECK-AVX2-NEXT:    vmovq %xmm5, %r9
+; CHECK-AVX2-NEXT:    vmovq %xmm6, %r10
 ; CHECK-AVX2-NEXT:    negq %r10
 ; CHECK-AVX2-NEXT:    movq %rcx, %r10
 ; CHECK-AVX2-NEXT:    sbbq %r8, %r10
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index f7a27a5b914466..9ae1f270e88337 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -65,7 +65,6 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
 ; X64-NO-BMI2-NEXT:    movzwl (%rdi), %eax
-; X64-NO-BMI2-NEXT:    movzwl %ax, %eax
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrl %cl, %eax
@@ -75,7 +74,6 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X64-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
 ; X64-BMI2-NEXT:    movzwl (%rdi), %eax
-; X64-BMI2-NEXT:    movzwl %ax, %eax
 ; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    shrxl %esi, %eax, %eax
 ; X64-BMI2-NEXT:    movb %al, (%rdx)
@@ -83,15 +81,14 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2:       # %bb.0:
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT:    movzwl (%edx), %edx
-; X86-NO-BMI2-NEXT:    movzwl %dx, %edx
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NEXT:    movzwl (%eax), %eax
 ; X86-NO-BMI2-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NEXT:    movb %dl, (%eax)
+; X86-NO-BMI2-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NEXT:    movb %al, (%edx)
 ; X86-NO-BMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
@@ -100,7 +97,6 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movzwl (%edx), %edx
-; X86-BMI2-NEXT:    movzwl %dx, %edx
 ; X86-BMI2-NEXT:    shll $3, %ecx
 ; X86-BMI2-NEXT:    shrxl %ecx, %edx, %ecx
 ; X86-BMI2-NEXT:    movb %cl, (%eax)
@@ -123,7 +119,6 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
 ; X64-NO-BMI2-NEXT:    movzwl (%rdi), %eax
-; X64-NO-BMI2-NEXT:    movzwl %ax, %eax
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrl %cl, %eax
@@ -133,7 +128,6 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X64-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
 ; X64-BMI2-NEXT:    movzwl (%rdi), %eax
-; X64-BMI2-NEXT:    movzwl %ax, %eax
 ; X64-BMI2-NEXT:    shll $3, %esi
 ; X64-BMI2-NEXT:    shrxl %esi, %eax, %eax
 ; X64-BMI2-NEXT:    movw %ax, (%rdx)
@@ -145,7 +139,6 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NEXT:    movzwl (%edx), %edx
-; X86-NO-BMI2-NEXT:    movzwl %dx, %edx
 ; X86-NO-BMI2-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NEXT:    shrl %cl, %edx
@@ -158,7 +151,6 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movzwl (%edx), %edx
-; X86-BMI2-NEXT:    movzwl %dx, %edx
 ; X86-BMI2-NEXT:    shll $3, %ecx
 ; X86-BMI2-NEXT:    shrxl %ecx, %edx, %ecx
 ; X86-BMI2-NEXT:    movw %cx, (%eax)
@@ -179,9 +171,8 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
@@ -189,9 +180,8 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X64-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-BMI2-NEXT:    shll $3, %esi
-; X64-BMI2-NEXT:    movq %xmm0, %rax
+; X64-BMI2-NEXT:    movl (%rdi), %eax
 ; X64-BMI2-NEXT:    shrxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    movb %al, (%rdx)
 ; X64-BMI2-NEXT:    retq
@@ -199,99 +189,49 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
-; X86-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
+; X86-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movl (%edx), %edx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorl %esi, %esi
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    testb $32, %cl
+; X86-SHLD-NEXT:    cmovnel %esi, %edx
+; X86-SHLD-NEXT:    movb %dl, (%eax)
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %bl, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %bl, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %init = load <4 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <8 x i8> %intermediate.sroa.0.0.vec.expand, <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
@@ -308,9 +248,8 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NEXT:    movw %ax, (%rdx)
@@ -318,107 +257,58 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X64-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-BMI2-NEXT:    shll $3, %esi
-; X64-BMI2-NEXT:    movq %xmm0, %rax
+; X64-BMI2-NEXT:    movl (%rdi), %eax
 ; X64-BMI2-NEXT:    shrxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    movw %ax, (%rdx)
 ; X64-BMI2-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movw %si, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movw %si, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
-; X86-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movw %si, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
+; X86-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movl (%edx), %edx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorl %esi, %esi
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    testb $32, %cl
+; X86-SHLD-NEXT:    cmovnel %esi, %edx
+; X86-SHLD-NEXT:    movw %dx, (%eax)
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %si, (%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %si, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %init = load <4 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <8 x i8> %intermediate.sroa.0.0.vec.expand, <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
@@ -434,9 +324,8 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2:       # %bb.0:
-; X64-NO-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
@@ -444,107 +333,58 @@ define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X64-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-BMI2-NEXT:    shll $3, %esi
-; X64-BMI2-NEXT:    movq %xmm0, %rax
+; X64-BMI2-NEXT:    movl (%rdi), %eax
 ; X64-BMI2-NEXT:    shrxq %rsi, %rax, %rax
 ; X64-BMI2-NEXT:    movl %eax, (%rdx)
 ; X64-BMI2-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
-; X86-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
+; X86-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X86-SHLD:       # %bb.0:
+; X86-SHLD-NEXT:    pushl %esi
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movl (%edx), %edx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    xorl %esi, %esi
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    testb $32, %cl
+; X86-SHLD-NEXT:    cmovnel %esi, %edx
+; X86-SHLD-NEXT:    movl %edx, (%eax)
+; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %init = load <4 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   %intermediate.sroa.0.0.vecblend = shufflevector <8 x i8> %intermediate.sroa.0.0.vec.expand, <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
@@ -560,88 +400,51 @@ define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movb %al, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movb %sil, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb %sil, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+; X64-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-SHLD:       # %bb.0:
+; X64-SHLD-NEXT:    movq %rsi, %rcx
+; X64-SHLD-NEXT:    movq (%rdi), %rax
+; X64-SHLD-NEXT:    shll $3, %ecx
+; X64-SHLD-NEXT:    xorl %esi, %esi
+; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT:    testb $64, %cl
+; X64-SHLD-NEXT:    cmovneq %rsi, %rax
+; X64-SHLD-NEXT:    movb %al, (%rdx)
+; X64-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb %al, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %sil, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
 ; X86-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $32, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -670,88 +473,51 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movw %ax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movw %si, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movw %si, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+; X64-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-SHLD:       # %bb.0:
+; X64-SHLD-NEXT:    movq %rsi, %rcx
+; X64-SHLD-NEXT:    movq (%rdi), %rax
+; X64-SHLD-NEXT:    shll $3, %ecx
+; X64-SHLD-NEXT:    xorl %esi, %esi
+; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT:    testb $64, %cl
+; X64-SHLD-NEXT:    cmovneq %rsi, %rax
+; X64-SHLD-NEXT:    movw %ax, (%rdx)
+; X64-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movw %ax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %si, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
 ; X86-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $32, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -779,88 +545,51 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+; X64-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-SHLD:       # %bb.0:
+; X64-SHLD-NEXT:    movq %rsi, %rcx
+; X64-SHLD-NEXT:    movq (%rdi), %rax
+; X64-SHLD-NEXT:    shll $3, %ecx
+; X64-SHLD-NEXT:    xorl %esi, %esi
+; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT:    testb $64, %cl
+; X64-SHLD-NEXT:    cmovneq %rsi, %rax
+; X64-SHLD-NEXT:    movl %eax, (%rdx)
+; X64-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
 ; X86-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $32, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -888,88 +617,51 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+; X64-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-SHLD:       # %bb.0:
+; X64-SHLD-NEXT:    movq %rsi, %rcx
+; X64-SHLD-NEXT:    movq (%rdi), %rax
+; X64-SHLD-NEXT:    shll $3, %ecx
+; X64-SHLD-NEXT:    xorl %esi, %esi
+; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT:    testb $64, %cl
+; X64-SHLD-NEXT:    cmovneq %rsi, %rax
+; X64-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
 ; X86-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $32, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT:    movd %xmm0, (%esp)
-; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -1941,7 +1633,9 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; ALL: {{.*}}
+; X64-HAVE-BMI2-HAVE-SHLD: {{.*}}
+; X64-NO-BMI2-HAVE-SHLD: {{.*}}
 ; X64-NO-SHLD: {{.*}}
-; X64-SHLD: {{.*}}
+; X86-HAVE-BMI2-HAVE-SHLD: {{.*}}
+; X86-NO-BMI2-HAVE-SHLD: {{.*}}
 ; X86-NO-SHLD: {{.*}}
-; X86-SHLD: {{.*}}



More information about the llvm-commits mailing list