[llvm] caacf86 - [DAG] Fold freeze(shuffle(x,y,m)) -> shuffle(freeze(x),freeze(y),m) (#90952)
via llvm-commits
llvm-commits at lists.llvm.org
Sat May 4 04:03:13 PDT 2024
Author: Simon Pilgrim
Date: 2024-05-04T12:03:10+01:00
New Revision: caacf8685ac49526103b748b6b439dea84c30274
URL: https://github.com/llvm/llvm-project/commit/caacf8685ac49526103b748b6b439dea84c30274
DIFF: https://github.com/llvm/llvm-project/commit/caacf8685ac49526103b748b6b439dea84c30274.diff
LOG: [DAG] Fold freeze(shuffle(x,y,m)) -> shuffle(freeze(x),freeze(y),m) (#90952)
If the shuffle mask contains no undef elements, we can move the freeze through the shuffle node by freezing both source operands instead.
This requires special-case handling, since a new ShuffleVectorSDNode must be created with getVectorShuffle rather than a generic getNode call.
Also adds VECTOR_SHUFFLE support to isGuaranteedNotToBeUndefOrPoison / canCreateUndefOrPoison.
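For illustration, a minimal IR-level sketch of the fold (a hypothetical example, not a test from this commit). With a fully defined mask, every result element is taken from a frozen lane of one of the two sources, so the two forms below are equivalent:

; Before: freeze is applied to the shuffle result.
define <4 x i32> @freeze_shuffle(<4 x i32> %x, <4 x i32> %y) {
  %s = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %f = freeze <4 x i32> %s
  ret <4 x i32> %f
}

; After: the shape the DAG combine produces. Both sources are frozen and the
; shuffle is rebuilt with the same (undef-free) mask via getVectorShuffle.
define <4 x i32> @freeze_shuffle_frozen_ops(<4 x i32> %x, <4 x i32> %y) {
  %fx = freeze <4 x i32> %x
  %fy = freeze <4 x i32> %y
  %s = shufflevector <4 x i32> %fx, <4 x i32> %fy, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x i32> %s
}

The combine itself operates on SelectionDAG nodes (ISD::FREEZE over ISD::VECTOR_SHUFFLE); the IR above only mirrors the node-level transform. If the mask contained a -1 (undef) element, the shuffle could still produce undef for that lane, which is why canCreateUndefOrPoison reports VECTOR_SHUFFLE as unsafe whenever a demanded element is undef.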
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
llvm/test/CodeGen/SystemZ/pr60413.ll
llvm/test/CodeGen/X86/freeze-binary.ll
llvm/test/CodeGen/X86/sdiv_fix_sat.ll
llvm/test/CodeGen/X86/setcc-non-simple-type.ll
llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index fc6bbc119d3c1b..fe932ca68c1288 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15462,9 +15462,11 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
N0->getNumValues() != 1 || !N0->hasOneUse())
return SDValue();
- bool AllowMultipleMaybePoisonOperands = N0.getOpcode() == ISD::BUILD_VECTOR ||
- N0.getOpcode() == ISD::BUILD_PAIR ||
- N0.getOpcode() == ISD::CONCAT_VECTORS;
+ bool AllowMultipleMaybePoisonOperands =
+ N0.getOpcode() == ISD::BUILD_VECTOR ||
+ N0.getOpcode() == ISD::BUILD_PAIR ||
+ N0.getOpcode() == ISD::VECTOR_SHUFFLE ||
+ N0.getOpcode() == ISD::CONCAT_VECTORS;
// Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all
// ones" or "constant" into something that depends on FrozenUndef. We can
@@ -15537,8 +15539,16 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
if (Op.getOpcode() == ISD::UNDEF)
Op = DAG.getFreeze(Op);
}
- // NOTE: this strips poison generating flags.
- SDValue R = DAG.getNode(N0.getOpcode(), SDLoc(N0), N0->getVTList(), Ops);
+
+ SDValue R;
+ if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N0)) {
+ // Special case handling for ShuffleVectorSDNode nodes.
+ R = DAG.getVectorShuffle(N0.getValueType(), SDLoc(N0), Ops[0], Ops[1],
+ SVN->getMask());
+ } else {
+ // NOTE: this strips poison generating flags.
+ R = DAG.getNode(N0.getOpcode(), SDLoc(N0), N0->getVTList(), Ops);
+ }
assert(DAG.isGuaranteedNotToBeUndefOrPoison(R, /*PoisonOnly*/ false) &&
"Can't create node that may be undef/poison!");
return R;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 25a728bf9ba3e7..eef5acd032345e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5088,6 +5088,24 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op,
}
return true;
+ case ISD::VECTOR_SHUFFLE: {
+ APInt DemandedLHS, DemandedRHS;
+ auto *SVN = cast<ShuffleVectorSDNode>(Op);
+ if (!getShuffleDemandedElts(DemandedElts.getBitWidth(), SVN->getMask(),
+ DemandedElts, DemandedLHS, DemandedRHS,
+ /*AllowUndefElts=*/false))
+ return false;
+ if (!DemandedLHS.isZero() &&
+ !isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), DemandedLHS,
+ PoisonOnly, Depth + 1))
+ return false;
+ if (!DemandedRHS.isZero() &&
+ !isGuaranteedNotToBeUndefOrPoison(Op.getOperand(1), DemandedRHS,
+ PoisonOnly, Depth + 1))
+ return false;
+ return true;
+ }
+
// TODO: Search for noundef attributes from library functions.
// TODO: Pointers dereferenced by ISD::LOAD/STORE ops are noundef.
@@ -5225,6 +5243,15 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
return KnownIdx.getMaxValue().uge(VecVT.getVectorMinNumElements());
}
+ case ISD::VECTOR_SHUFFLE: {
+ // Check for any demanded shuffle element that is undef.
+ auto *SVN = cast<ShuffleVectorSDNode>(Op);
+ for (auto [Idx, Elt] : enumerate(SVN->getMask()))
+ if (Elt < 0 && DemandedElts[Idx])
+ return true;
+ return false;
+ }
+
default:
// Allow the target to implement this method for its nodes.
if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::INTRINSIC_WO_CHAIN ||
diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll
index 5a629567d07069..8a6a30318ae583 100644
--- a/llvm/test/CodeGen/SystemZ/pr60413.ll
+++ b/llvm/test/CodeGen/SystemZ/pr60413.ll
@@ -13,114 +13,110 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
define dso_local void @m() local_unnamed_addr #1 {
; CHECK-LABEL: m:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: stmg %r12, %r15, 96(%r15)
+; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
; CHECK-NEXT: aghi %r15, -168
-; CHECK-NEXT: llhrl %r2, f+4
-; CHECK-NEXT: sll %r2, 8
-; CHECK-NEXT: larl %r1, f
-; CHECK-NEXT: ic %r2, 6(%r1)
-; CHECK-NEXT: larl %r1, e
-; CHECK-NEXT: lb %r0, 3(%r1)
-; CHECK-NEXT: clfi %r2, 128
+; CHECK-NEXT: lhrl %r1, f+4
+; CHECK-NEXT: sll %r1, 8
+; CHECK-NEXT: larl %r2, f
+; CHECK-NEXT: ic %r1, 6(%r2)
+; CHECK-NEXT: larl %r2, e
+; CHECK-NEXT: lb %r0, 3(%r2)
+; CHECK-NEXT: vlvgp %v0, %r0, %r1
+; CHECK-NEXT: vlvgp %v1, %r1, %r0
+; CHECK-NEXT: vlvgf %v1, %r1, 0
+; CHECK-NEXT: vlvgf %v1, %r1, 2
+; CHECK-NEXT: vlvgp %v2, %r1, %r1
+; CHECK-NEXT: # kill: def $r1l killed $r1l killed $r1d
+; CHECK-NEXT: nilh %r1, 255
+; CHECK-NEXT: chi %r1, 128
; CHECK-NEXT: ipm %r1
; CHECK-NEXT: risbg %r1, %r1, 63, 191, 36
-; CHECK-NEXT: vlvgp %v1, %r2, %r0
-; CHECK-NEXT: vlvgf %v1, %r2, 0
-; CHECK-NEXT: vlvgf %v1, %r2, 2
-; CHECK-NEXT: vlvgp %v0, %r0, %r2
-; CHECK-NEXT: vlvgp %v2, %r2, %r2
-; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
-; CHECK-NEXT: nilh %r2, 255
-; CHECK-NEXT: chi %r2, 128
-; CHECK-NEXT: ipm %r2
-; CHECK-NEXT: risbg %r2, %r2, 63, 191, 36
; CHECK-NEXT: vlvgf %v0, %r0, 0
; CHECK-NEXT: vlvgf %v0, %r0, 2
-; CHECK-NEXT: vrepf %v2, %v2, 1
; CHECK-NEXT: vgbm %v3, 30583
; CHECK-NEXT: vn %v0, %v0, %v3
; CHECK-NEXT: vn %v1, %v1, %v3
+; CHECK-NEXT: vrepf %v2, %v2, 1
; CHECK-NEXT: vn %v2, %v2, %v3
; CHECK-NEXT: vrepif %v3, 127
; CHECK-NEXT: vchlf %v1, %v1, %v3
-; CHECK-NEXT: vlgvf %r12, %v1, 0
+; CHECK-NEXT: vlgvf %r13, %v1, 0
; CHECK-NEXT: vchlf %v2, %v2, %v3
-; CHECK-NEXT: vlgvf %r4, %v2, 1
-; CHECK-NEXT: nilf %r4, 1
-; CHECK-NEXT: vlgvf %r5, %v2, 0
-; CHECK-NEXT: risbg %r3, %r5, 48, 176, 15
-; CHECK-NEXT: rosbg %r3, %r4, 32, 49, 14
-; CHECK-NEXT: vlgvf %r14, %v2, 2
+; CHECK-NEXT: vlgvf %r3, %v2, 1
+; CHECK-NEXT: nilf %r3, 1
+; CHECK-NEXT: vlgvf %r4, %v2, 0
+; CHECK-NEXT: risbg %r2, %r4, 48, 176, 15
+; CHECK-NEXT: rosbg %r2, %r3, 32, 49, 14
+; CHECK-NEXT: vlgvf %r5, %v2, 2
+; CHECK-NEXT: nilf %r5, 1
+; CHECK-NEXT: rosbg %r2, %r5, 32, 50, 13
+; CHECK-NEXT: vlgvf %r14, %v2, 3
; CHECK-NEXT: nilf %r14, 1
-; CHECK-NEXT: rosbg %r3, %r14, 32, 50, 13
-; CHECK-NEXT: vlgvf %r13, %v2, 3
-; CHECK-NEXT: nilf %r13, 1
-; CHECK-NEXT: rosbg %r3, %r13, 32, 51, 12
-; CHECK-NEXT: rosbg %r3, %r12, 52, 52, 11
-; CHECK-NEXT: vlgvf %r12, %v1, 1
-; CHECK-NEXT: rosbg %r3, %r12, 53, 53, 10
-; CHECK-NEXT: vlgvf %r12, %v1, 2
-; CHECK-NEXT: rosbg %r3, %r12, 54, 54, 9
-; CHECK-NEXT: vlgvf %r12, %v1, 3
-; CHECK-NEXT: rosbg %r3, %r12, 55, 55, 8
+; CHECK-NEXT: rosbg %r2, %r14, 32, 51, 12
+; CHECK-NEXT: rosbg %r2, %r13, 52, 52, 11
+; CHECK-NEXT: vlgvf %r13, %v1, 1
+; CHECK-NEXT: rosbg %r2, %r13, 53, 53, 10
+; CHECK-NEXT: vlgvf %r13, %v1, 2
+; CHECK-NEXT: rosbg %r2, %r13, 54, 54, 9
+; CHECK-NEXT: vlgvf %r13, %v1, 3
+; CHECK-NEXT: rosbg %r2, %r13, 55, 55, 8
; CHECK-NEXT: vchlf %v0, %v0, %v3
-; CHECK-NEXT: vlgvf %r12, %v0, 0
-; CHECK-NEXT: rosbg %r3, %r12, 56, 56, 7
-; CHECK-NEXT: vlgvf %r12, %v0, 1
-; CHECK-NEXT: rosbg %r3, %r12, 57, 57, 6
-; CHECK-NEXT: vlgvf %r12, %v0, 2
-; CHECK-NEXT: rosbg %r3, %r12, 58, 58, 5
-; CHECK-NEXT: vlgvf %r12, %v0, 3
-; CHECK-NEXT: rosbg %r3, %r12, 59, 59, 4
-; CHECK-NEXT: nilf %r5, 1
-; CHECK-NEXT: rosbg %r3, %r5, 32, 60, 3
-; CHECK-NEXT: rosbg %r3, %r4, 32, 61, 2
-; CHECK-NEXT: rosbg %r3, %r14, 32, 62, 1
-; CHECK-NEXT: or %r3, %r13
-; CHECK-NEXT: vlgvb %r5, %v0, 1
-; CHECK-NEXT: vlgvb %r4, %v0, 0
-; CHECK-NEXT: risbg %r4, %r4, 48, 176, 15
-; CHECK-NEXT: rosbg %r4, %r5, 49, 49, 14
-; CHECK-NEXT: vlgvb %r5, %v0, 2
-; CHECK-NEXT: rosbg %r4, %r5, 50, 50, 13
-; CHECK-NEXT: vlgvb %r5, %v0, 3
-; CHECK-NEXT: rosbg %r4, %r5, 51, 51, 12
-; CHECK-NEXT: vlgvb %r5, %v0, 4
-; CHECK-NEXT: rosbg %r4, %r5, 52, 52, 11
-; CHECK-NEXT: vlgvb %r5, %v0, 5
-; CHECK-NEXT: rosbg %r4, %r5, 53, 53, 10
-; CHECK-NEXT: vlgvb %r5, %v0, 6
-; CHECK-NEXT: rosbg %r4, %r5, 54, 54, 9
-; CHECK-NEXT: vlgvb %r5, %v0, 7
-; CHECK-NEXT: rosbg %r4, %r5, 55, 55, 8
-; CHECK-NEXT: vlgvb %r5, %v0, 8
-; CHECK-NEXT: rosbg %r4, %r5, 56, 56, 7
-; CHECK-NEXT: vlgvb %r5, %v0, 9
-; CHECK-NEXT: rosbg %r4, %r5, 57, 57, 6
-; CHECK-NEXT: vlgvb %r5, %v0, 10
-; CHECK-NEXT: rosbg %r4, %r5, 58, 58, 5
-; CHECK-NEXT: vlgvb %r5, %v0, 11
-; CHECK-NEXT: rosbg %r4, %r5, 59, 59, 4
-; CHECK-NEXT: vlgvb %r5, %v0, 12
-; CHECK-NEXT: rosbg %r4, %r5, 60, 60, 3
-; CHECK-NEXT: vlgvb %r5, %v0, 13
-; CHECK-NEXT: rosbg %r4, %r5, 61, 61, 2
-; CHECK-NEXT: vlgvb %r5, %v0, 14
-; CHECK-NEXT: rosbg %r4, %r5, 62, 62, 1
-; CHECK-NEXT: vlgvb %r5, %v0, 15
-; CHECK-NEXT: rosbg %r4, %r5, 63, 63, 0
-; CHECK-NEXT: xilf %r4, 4294967295
-; CHECK-NEXT: or %r4, %r3
-; CHECK-NEXT: tmll %r4, 65535
-; CHECK-NEXT: ipm %r3
-; CHECK-NEXT: afi %r3, -268435456
-; CHECK-NEXT: srl %r3, 31
+; CHECK-NEXT: vlgvf %r13, %v0, 0
+; CHECK-NEXT: rosbg %r2, %r13, 56, 56, 7
+; CHECK-NEXT: vlgvf %r13, %v0, 1
+; CHECK-NEXT: rosbg %r2, %r13, 57, 57, 6
+; CHECK-NEXT: vlgvf %r13, %v0, 2
+; CHECK-NEXT: rosbg %r2, %r13, 58, 58, 5
+; CHECK-NEXT: vlgvf %r13, %v0, 3
+; CHECK-NEXT: rosbg %r2, %r13, 59, 59, 4
+; CHECK-NEXT: nilf %r4, 1
+; CHECK-NEXT: rosbg %r2, %r4, 32, 60, 3
+; CHECK-NEXT: rosbg %r2, %r3, 32, 61, 2
+; CHECK-NEXT: rosbg %r2, %r5, 32, 62, 1
+; CHECK-NEXT: or %r2, %r14
+; CHECK-NEXT: vlgvb %r4, %v0, 1
+; CHECK-NEXT: vlgvb %r3, %v0, 0
+; CHECK-NEXT: risbg %r3, %r3, 48, 176, 15
+; CHECK-NEXT: rosbg %r3, %r4, 49, 49, 14
+; CHECK-NEXT: vlgvb %r4, %v0, 2
+; CHECK-NEXT: rosbg %r3, %r4, 50, 50, 13
+; CHECK-NEXT: vlgvb %r4, %v0, 3
+; CHECK-NEXT: rosbg %r3, %r4, 51, 51, 12
+; CHECK-NEXT: vlgvb %r4, %v0, 4
+; CHECK-NEXT: rosbg %r3, %r4, 52, 52, 11
+; CHECK-NEXT: vlgvb %r4, %v0, 5
+; CHECK-NEXT: rosbg %r3, %r4, 53, 53, 10
+; CHECK-NEXT: vlgvb %r4, %v0, 6
+; CHECK-NEXT: rosbg %r3, %r4, 54, 54, 9
+; CHECK-NEXT: vlgvb %r4, %v0, 7
+; CHECK-NEXT: rosbg %r3, %r4, 55, 55, 8
+; CHECK-NEXT: vlgvb %r4, %v0, 8
+; CHECK-NEXT: rosbg %r3, %r4, 56, 56, 7
+; CHECK-NEXT: vlgvb %r4, %v0, 9
+; CHECK-NEXT: rosbg %r3, %r4, 57, 57, 6
+; CHECK-NEXT: vlgvb %r4, %v0, 10
+; CHECK-NEXT: rosbg %r3, %r4, 58, 58, 5
+; CHECK-NEXT: vlgvb %r4, %v0, 11
+; CHECK-NEXT: rosbg %r3, %r4, 59, 59, 4
+; CHECK-NEXT: vlgvb %r4, %v0, 12
+; CHECK-NEXT: rosbg %r3, %r4, 60, 60, 3
+; CHECK-NEXT: vlgvb %r4, %v0, 13
+; CHECK-NEXT: rosbg %r3, %r4, 61, 61, 2
+; CHECK-NEXT: vlgvb %r4, %v0, 14
+; CHECK-NEXT: rosbg %r3, %r4, 62, 62, 1
+; CHECK-NEXT: vlgvb %r4, %v0, 15
+; CHECK-NEXT: rosbg %r3, %r4, 63, 63, 0
+; CHECK-NEXT: xilf %r3, 4294967295
+; CHECK-NEXT: or %r3, %r2
+; CHECK-NEXT: tmll %r3, 65535
+; CHECK-NEXT: ipm %r2
+; CHECK-NEXT: afi %r2, -268435456
+; CHECK-NEXT: srl %r2, 31
; CHECK-NEXT: nr %r2, %r1
-; CHECK-NEXT: nr %r2, %r3
; CHECK-NEXT: nr %r2, %r0
; CHECK-NEXT: larl %r1, g
; CHECK-NEXT: stc %r2, 0(%r1)
-; CHECK-NEXT: lmg %r12, %r15, 264(%r15)
+; CHECK-NEXT: lmg %r13, %r15, 272(%r15)
; CHECK-NEXT: br %r14
entry:
%n = alloca i32, align 4
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index c79da37988e40b..dbc027495297b1 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -546,8 +546,8 @@ define <8 x i16> @freeze_ashr_vec(<8 x i16> %a0) nounwind {
define <4 x i32> @freeze_ashr_vec_outofrange(<4 x i32> %a0) nounwind {
; X86-LABEL: freeze_ashr_vec_outofrange:
; X86: # %bb.0:
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; X86-NEXT: psrad $1, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; X86-NEXT: psrad $2, %xmm0
; X86-NEXT: retl
;
@@ -660,8 +660,8 @@ define <8 x i16> @freeze_lshr_vec(<8 x i16> %a0) nounwind {
define <4 x i32> @freeze_lshr_vec_outofrange(<4 x i32> %a0) nounwind {
; X86-LABEL: freeze_lshr_vec_outofrange:
; X86: # %bb.0:
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; X86-NEXT: psrld $1, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; X86-NEXT: psrld $2, %xmm0
; X86-NEXT: retl
;
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index a1cabb433d879b..e7727a0ab6178c 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -563,20 +563,18 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: subq $120, %rsp
; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: pxor %xmm3, %xmm3
-; X64-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
+; X64-NEXT: pxor %xmm2, %xmm2
+; X64-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X64-NEXT: psrlq $31, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X64-NEXT: psrad $31, %xmm2
-; X64-NEXT: psrlq $31, %xmm3
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: movq %xmm0, %rbp
-; X64-NEXT: movq %rbp, %r14
-; X64-NEXT: sarq $63, %r14
-; X64-NEXT: shldq $31, %rbp, %r14
-; X64-NEXT: movq %rbp, %r15
-; X64-NEXT: shlq $31, %r15
+; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X64-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: movq %xmm3, %rbx
+; X64-NEXT: movq %rbx, %r13
+; X64-NEXT: sarq $63, %r13
+; X64-NEXT: shldq $31, %rbx, %r13
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pcmpgtd %xmm1, %xmm0
@@ -584,113 +582,112 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: movq %xmm1, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: sarq $63, %rbx
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r14, %rsi
-; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: sarq $63, %r15
+; X64-NEXT: movq %rbx, %r12
+; X64-NEXT: shlq $31, %r12
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __divti3@PLT
-; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq %rax, %rbp
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %r13
-; X64-NEXT: sbbq $0, %r12
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: subq $1, %rbp
+; X64-NEXT: sbbq $0, %r14
+; X64-NEXT: shrq $63, %rbx
+; X64-NEXT: xorl %r15d, %ebx
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %r13, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __modti3@PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
-; X64-NEXT: shrq $63, %rbp
-; X64-NEXT: xorl %ebp, %ebx
; X64-NEXT: testb %bl, %al
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rdx, %r13
-; X64-NEXT: movq %r12, %rax
+; X64-NEXT: cmpq %rdx, %rbp
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: sbbq $0, %rax
-; X64-NEXT: cmovgeq %rdx, %r13
-; X64-NEXT: cmovgeq %rcx, %r12
+; X64-NEXT: cmovgeq %rcx, %r14
+; X64-NEXT: cmovgeq %rdx, %rbp
; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %r13, %rcx
+; X64-NEXT: cmpq %rbp, %rcx
; X64-NEXT: movq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; X64-NEXT: movq $-1, %rax
-; X64-NEXT: sbbq %r12, %rax
-; X64-NEXT: cmovgeq %rcx, %r13
-; X64-NEXT: movq %r13, %xmm0
+; X64-NEXT: sbbq %r14, %rax
+; X64-NEXT: cmovgeq %rcx, %rbp
+; X64-NEXT: movq %rbp, %xmm0
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
-; X64-NEXT: movq %xmm0, %rbp
-; X64-NEXT: movq %rbp, %r14
-; X64-NEXT: sarq $63, %r14
-; X64-NEXT: shldq $31, %rbp, %r14
-; X64-NEXT: movq %rbp, %r15
-; X64-NEXT: shlq $31, %r15
+; X64-NEXT: movq %xmm0, %rbx
+; X64-NEXT: movq %rbx, %r13
+; X64-NEXT: sarq $63, %r13
+; X64-NEXT: shldq $31, %rbx, %r13
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
; X64-NEXT: movq %xmm0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: sarq $63, %rbx
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r14, %rsi
-; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: sarq $63, %r15
+; X64-NEXT: movq %rbx, %r12
+; X64-NEXT: shlq $31, %r12
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __divti3@PLT
-; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq %rax, %rbp
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %r13
-; X64-NEXT: sbbq $0, %r12
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: subq $1, %rbp
+; X64-NEXT: sbbq $0, %r14
+; X64-NEXT: shrq $63, %rbx
+; X64-NEXT: xorl %r15d, %ebx
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %r13, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __modti3@PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
-; X64-NEXT: shrq $63, %rbp
-; X64-NEXT: xorl %ebp, %ebx
; X64-NEXT: testb %bl, %al
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rcx, %r13
-; X64-NEXT: movq %r12, %rax
+; X64-NEXT: cmpq %rcx, %rbp
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: sbbq $0, %rax
-; X64-NEXT: cmovgeq %rcx, %r13
; X64-NEXT: movl $0, %eax
-; X64-NEXT: cmovgeq %rax, %r12
+; X64-NEXT: cmovgeq %rax, %r14
+; X64-NEXT: cmovgeq %rcx, %rbp
; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %r13, %rcx
+; X64-NEXT: cmpq %rbp, %rcx
; X64-NEXT: movq $-1, %rax
-; X64-NEXT: sbbq %r12, %rax
-; X64-NEXT: cmovgeq %rcx, %r13
-; X64-NEXT: movq %r13, %xmm0
+; X64-NEXT: sbbq %r14, %rax
+; X64-NEXT: cmovgeq %rcx, %rbp
+; X64-NEXT: movq %rbp, %xmm0
; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-NEXT: psrlq $1, %xmm1
; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: pxor %xmm0, %xmm0
-; X64-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; X64-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; X64-NEXT: psrad $31, %xmm1
+; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: psrlq $31, %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: psrad $31, %xmm1
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: movq %xmm0, %rbp
-; X64-NEXT: movq %rbp, %r14
-; X64-NEXT: sarq $63, %r14
-; X64-NEXT: shldq $31, %rbp, %r14
-; X64-NEXT: movq %rbp, %r15
-; X64-NEXT: shlq $31, %r15
+; X64-NEXT: movq %xmm0, %rbx
+; X64-NEXT: movq %rbx, %r13
+; X64-NEXT: sarq $63, %r13
+; X64-NEXT: shldq $31, %rbx, %r13
; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-NEXT: pxor %xmm1, %xmm1
; X64-NEXT: pcmpgtd %xmm0, %xmm1
@@ -698,92 +695,94 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: movq %xmm0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: sarq $63, %rbx
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r14, %rsi
-; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: sarq $63, %r15
+; X64-NEXT: movq %rbx, %r12
+; X64-NEXT: shlq $31, %r12
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __divti3@PLT
-; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq %rax, %rbp
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %r13
-; X64-NEXT: sbbq $0, %r12
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: subq $1, %rbp
+; X64-NEXT: sbbq $0, %r14
+; X64-NEXT: shrq $63, %rbx
+; X64-NEXT: xorl %r15d, %ebx
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %r13, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __modti3@PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
-; X64-NEXT: shrq $63, %rbp
-; X64-NEXT: xorl %ebp, %ebx
; X64-NEXT: testb %bl, %al
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rcx, %r13
-; X64-NEXT: movq %r12, %rax
+; X64-NEXT: cmpq %rcx, %rbp
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: sbbq $0, %rax
-; X64-NEXT: cmovgeq %rcx, %r13
; X64-NEXT: movl $0, %eax
-; X64-NEXT: cmovgeq %rax, %r12
+; X64-NEXT: cmovgeq %rax, %r14
+; X64-NEXT: cmovgeq %rcx, %rbp
; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %r13, %rcx
+; X64-NEXT: cmpq %rbp, %rcx
; X64-NEXT: movq $-1, %rax
-; X64-NEXT: sbbq %r12, %rax
-; X64-NEXT: cmovgeq %rcx, %r13
-; X64-NEXT: movq %r13, %xmm0
+; X64-NEXT: sbbq %r14, %rax
+; X64-NEXT: cmovgeq %rcx, %rbp
+; X64-NEXT: movq %rbp, %xmm0
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
-; X64-NEXT: movq %xmm0, %rbp
-; X64-NEXT: movq %rbp, %r14
-; X64-NEXT: sarq $63, %r14
-; X64-NEXT: shldq $31, %rbp, %r14
-; X64-NEXT: movq %rbp, %r15
-; X64-NEXT: shlq $31, %r15
+; X64-NEXT: movq %xmm0, %rbx
+; X64-NEXT: movq %rbx, %r13
+; X64-NEXT: sarq $63, %r13
+; X64-NEXT: shldq $31, %rbx, %r13
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
; X64-NEXT: movq %xmm0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: sarq $63, %rbx
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r14, %rsi
-; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: sarq $63, %r15
+; X64-NEXT: movq %rbx, %r12
+; X64-NEXT: shlq $31, %r12
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __divti3@PLT
-; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq %rax, %rbp
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %r13
-; X64-NEXT: sbbq $0, %r12
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: subq $1, %rbp
+; X64-NEXT: sbbq $0, %r14
+; X64-NEXT: shrq $63, %rbx
+; X64-NEXT: xorl %r15d, %ebx
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %r13, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __modti3@PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
-; X64-NEXT: shrq $63, %rbp
-; X64-NEXT: xorl %ebp, %ebx
; X64-NEXT: testb %bl, %al
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rcx, %r13
-; X64-NEXT: movq %r12, %rax
+; X64-NEXT: cmpq %rcx, %rbp
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: sbbq $0, %rax
-; X64-NEXT: cmovgeq %rcx, %r13
; X64-NEXT: movl $0, %eax
-; X64-NEXT: cmovgeq %rax, %r12
+; X64-NEXT: cmovgeq %rax, %r14
+; X64-NEXT: cmovgeq %rcx, %rbp
; X64-NEXT: movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %r13, %rax
-; X64-NEXT: sbbq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT: cmovgeq %rax, %r13
-; X64-NEXT: movq %r13, %xmm1
+; X64-NEXT: cmpq %rbp, %rax
+; X64-NEXT: sbbq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT: cmovgeq %rax, %rbp
+; X64-NEXT: movq %rbp, %xmm1
; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: psrlq $1, %xmm0
diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
index a80d8d8cd01b85..76cf2423a254fe 100644
--- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
+++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
@@ -121,12 +121,10 @@ define void @failing(ptr %0, ptr %1) nounwind {
; CHECK-AVX2-NEXT: # => This Inner Loop Header: Depth=2
; CHECK-AVX2-NEXT: vmovdqu 1024(%rdx,%rsi), %xmm5
; CHECK-AVX2-NEXT: vmovdqu 1040(%rdx,%rsi), %xmm6
-; CHECK-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm5[0],xmm6[0]
-; CHECK-AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; CHECK-AVX2-NEXT: vmovq %xmm5, %rdi
-; CHECK-AVX2-NEXT: vpextrq $1, %xmm5, %r8
-; CHECK-AVX2-NEXT: vmovq %xmm7, %r9
-; CHECK-AVX2-NEXT: vpextrq $1, %xmm7, %r10
+; CHECK-AVX2-NEXT: vpextrq $1, %xmm5, %rdi
+; CHECK-AVX2-NEXT: vpextrq $1, %xmm6, %r8
+; CHECK-AVX2-NEXT: vmovq %xmm5, %r9
+; CHECK-AVX2-NEXT: vmovq %xmm6, %r10
; CHECK-AVX2-NEXT: negq %r10
; CHECK-AVX2-NEXT: movq %rcx, %r10
; CHECK-AVX2-NEXT: sbbq %r8, %r10
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index f7a27a5b914466..9ae1f270e88337 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -65,7 +65,6 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movzwl (%rdi), %eax
-; X64-NO-BMI2-NEXT: movzwl %ax, %eax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrl %cl, %eax
@@ -75,7 +74,6 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
; X64-BMI2-NEXT: movzwl (%rdi), %eax
-; X64-BMI2-NEXT: movzwl %ax, %eax
; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: shrxl %esi, %eax, %eax
; X64-BMI2-NEXT: movb %al, (%rdx)
@@ -83,15 +81,14 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X86-NO-BMI2: # %bb.0:
-; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT: movzwl (%edx), %edx
-; X86-NO-BMI2-NEXT: movzwl %dx, %edx
+; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NEXT: movzwl (%eax), %eax
; X86-NO-BMI2-NEXT: shll $3, %ecx
; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NEXT: shrl %cl, %edx
-; X86-NO-BMI2-NEXT: movb %dl, (%eax)
+; X86-NO-BMI2-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NEXT: movb %al, (%edx)
; X86-NO-BMI2-NEXT: retl
;
; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
@@ -100,7 +97,6 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BMI2-NEXT: movzwl (%edx), %edx
-; X86-BMI2-NEXT: movzwl %dx, %edx
; X86-BMI2-NEXT: shll $3, %ecx
; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx
; X86-BMI2-NEXT: movb %cl, (%eax)
@@ -123,7 +119,6 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movzwl (%rdi), %eax
-; X64-NO-BMI2-NEXT: movzwl %ax, %eax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrl %cl, %eax
@@ -133,7 +128,6 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
; X64-BMI2-NEXT: movzwl (%rdi), %eax
-; X64-BMI2-NEXT: movzwl %ax, %eax
; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: shrxl %esi, %eax, %eax
; X64-BMI2-NEXT: movw %ax, (%rdx)
@@ -145,7 +139,6 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT: movzwl (%edx), %edx
-; X86-NO-BMI2-NEXT: movzwl %dx, %edx
; X86-NO-BMI2-NEXT: shll $3, %ecx
; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NEXT: shrl %cl, %edx
@@ -158,7 +151,6 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BMI2-NEXT: movzwl (%edx), %edx
-; X86-BMI2-NEXT: movzwl %dx, %edx
; X86-BMI2-NEXT: shll $3, %ecx
; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx
; X86-BMI2-NEXT: movw %cx, (%eax)
@@ -179,9 +171,8 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT: movq %xmm0, %rax
+; X64-NO-BMI2-NEXT: movl (%rdi), %eax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movb %al, (%rdx)
@@ -189,9 +180,8 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X64-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-BMI2-NEXT: shll $3, %esi
-; X64-BMI2-NEXT: movq %xmm0, %rax
+; X64-BMI2-NEXT: movl (%rdi), %eax
; X64-BMI2-NEXT: shrxq %rsi, %rax, %rax
; X64-BMI2-NEXT: movb %al, (%rdx)
; X64-BMI2-NEXT: retq
@@ -199,99 +189,49 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X86-NO-BMI2-NO-SHLD: # %bb.0:
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm1, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: testb $32, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: testb $32, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: retl
;
-; X86-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movb %dl, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: retl
+; X86-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movl (%edx), %edx
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorl %esi, %esi
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT: testb $32, %cl
+; X86-SHLD-NEXT: cmovnel %esi, %edx
+; X86-SHLD-NEXT: movb %dl, (%eax)
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm1, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorl %ebx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: cmovel %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %dl, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: cmovel %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %bl, (%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
-;
-; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %bl, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%init = load <4 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <8 x i8> %intermediate.sroa.0.0.vec.expand, <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
@@ -308,9 +248,8 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT: movq %xmm0, %rax
+; X64-NO-BMI2-NEXT: movl (%rdi), %eax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movw %ax, (%rdx)
@@ -318,107 +257,58 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X64-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-BMI2-NEXT: shll $3, %esi
-; X64-BMI2-NEXT: movq %xmm0, %rax
+; X64-BMI2-NEXT: movl (%rdi), %eax
; X64-BMI2-NEXT: shrxq %rsi, %rax, %rax
; X64-BMI2-NEXT: movw %ax, (%rdx)
; X64-BMI2-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X86-NO-BMI2-NO-SHLD: # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm1, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: testb $32, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movw %si, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: testb $32, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movw %si, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: retl
;
-; X86-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: retl
+; X86-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movl (%edx), %edx
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorl %esi, %esi
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT: testb $32, %cl
+; X86-SHLD-NEXT: cmovnel %esi, %edx
+; X86-SHLD-NEXT: movw %dx, (%eax)
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm1, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: cmovel %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %dx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: testb $32, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: cmovel %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %si, (%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
-;
-; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%init = load <4 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <8 x i8> %intermediate.sroa.0.0.vec.expand, <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
@@ -434,9 +324,8 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT: movq %xmm0, %rax
+; X64-NO-BMI2-NEXT: movl (%rdi), %eax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
@@ -444,107 +333,58 @@ define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X64-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-BMI2-NEXT: shll $3, %esi
-; X64-BMI2-NEXT: movq %xmm0, %rax
+; X64-BMI2-NEXT: movl (%rdi), %eax
; X64-BMI2-NEXT: shrxq %rsi, %rax, %rax
; X64-BMI2-NEXT: movl %eax, (%rdx)
; X64-BMI2-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X86-NO-BMI2-NO-SHLD: # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm1, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: testb $32, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: testb $32, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: retl
;
-; X86-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: retl
+; X86-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movl (%edx), %edx
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorl %esi, %esi
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT: testb $32, %cl
+; X86-SHLD-NEXT: cmovnel %esi, %edx
+; X86-SHLD-NEXT: movl %edx, (%eax)
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm1, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: cmovel %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: testb $32, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: cmovel %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
-;
-; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%init = load <4 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <8 x i8> %intermediate.sroa.0.0.vec.expand, <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
@@ -560,88 +400,51 @@ define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movb %al, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: xorl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movb %sil, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movb %sil, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
+; X64-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-SHLD: # %bb.0:
+; X64-SHLD-NEXT: movq %rsi, %rcx
+; X64-SHLD-NEXT: movq (%rdi), %rax
+; X64-SHLD-NEXT: shll $3, %ecx
+; X64-SHLD-NEXT: xorl %esi, %esi
+; X64-SHLD-NEXT: shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT: testb $64, %cl
+; X64-SHLD-NEXT: cmovneq %rsi, %rax
+; X64-SHLD-NEXT: movb %al, (%rdx)
+; X64-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorl %ecx, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movb %al, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movb %sil, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
-;
; X86-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X86: # %bb.0:
; X86-NEXT: subl $32, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -670,88 +473,51 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movw %ax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: xorl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movw %si, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movw %si, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
+; X64-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-SHLD: # %bb.0:
+; X64-SHLD-NEXT: movq %rsi, %rcx
+; X64-SHLD-NEXT: movq (%rdi), %rax
+; X64-SHLD-NEXT: shll $3, %ecx
+; X64-SHLD-NEXT: xorl %esi, %esi
+; X64-SHLD-NEXT: shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT: testb $64, %cl
+; X64-SHLD-NEXT: cmovneq %rsi, %rax
+; X64-SHLD-NEXT: movw %ax, (%rdx)
+; X64-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorl %ecx, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movw %ax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movw %si, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
-;
; X86-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X86: # %bb.0:
; X86-NEXT: subl $32, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -779,88 +545,51 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: xorl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
+; X64-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-SHLD: # %bb.0:
+; X64-SHLD-NEXT: movq %rsi, %rcx
+; X64-SHLD-NEXT: movq (%rdi), %rax
+; X64-SHLD-NEXT: shll $3, %ecx
+; X64-SHLD-NEXT: xorl %esi, %esi
+; X64-SHLD-NEXT: shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT: testb $64, %cl
+; X64-SHLD-NEXT: cmovneq %rsi, %rax
+; X64-SHLD-NEXT: movl %eax, (%rdx)
+; X64-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorl %ecx, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
-;
; X86-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X86: # %bb.0:
; X86-NEXT: subl $32, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -888,88 +617,51 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: xorl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
+; X64-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-SHLD: # %bb.0:
+; X64-SHLD-NEXT: movq %rsi, %rcx
+; X64-SHLD-NEXT: movq (%rdi), %rax
+; X64-SHLD-NEXT: shll $3, %ecx
+; X64-SHLD-NEXT: xorl %esi, %esi
+; X64-SHLD-NEXT: shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT: testb $64, %cl
+; X64-SHLD-NEXT: cmovneq %rsi, %rax
+; X64-SHLD-NEXT: movq %rax, (%rdx)
+; X64-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorl %ecx, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
-;
; X86-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X86: # %bb.0:
; X86-NEXT: subl $32, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -1941,7 +1633,9 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
+; X64-HAVE-BMI2-HAVE-SHLD: {{.*}}
+; X64-NO-BMI2-HAVE-SHLD: {{.*}}
; X64-NO-SHLD: {{.*}}
-; X64-SHLD: {{.*}}
+; X86-HAVE-BMI2-HAVE-SHLD: {{.*}}
+; X86-NO-BMI2-HAVE-SHLD: {{.*}}
; X86-NO-SHLD: {{.*}}
-; X86-SHLD: {{.*}}