[llvm] [DAG] Fold freeze(shuffle(x,y,m)) -> shuffle(freeze(x),freeze(y),m) (PR #90952)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri May 3 02:27:08 PDT 2024
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/90952
If the shuffle mask contains no undef elements, then we can move the freeze through a shuffle node.
This requires special case handling to create a new ShuffleVectorSDNode.
>From 160f34fdd55129c8eaa27269c6d7652f82c6075d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 3 May 2024 10:25:36 +0100
Subject: [PATCH] [DAG] Fold freeze(shuffle(x,y,m)) ->
shuffle(freeze(x),freeze(y),m)
If the shuffle mask contains no undef elements, then we can move the freeze through a shuffle node.
This requires special case handling to create a new ShuffleVectorSDNode.
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 54 +-
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 9 +
llvm/test/CodeGen/SystemZ/pr60413.ll | 176 ++---
llvm/test/CodeGen/X86/freeze-binary.ll | 4 +-
.../test/CodeGen/X86/setcc-non-simple-type.ll | 10 +-
...ad-of-small-alloca-with-zero-upper-half.ll | 734 +++++-------------
6 files changed, 353 insertions(+), 634 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c0bbea16a64262..d8c2059fdff47e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15458,6 +15458,23 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
N0->getNumValues() != 1 || !N0->hasOneUse())
return SDValue();
+ auto FreezeOperand = [&](SDValue MaybePoisonOperand) {
+ // Don't replace every single UNDEF everywhere with frozen UNDEF, though.
+ if (MaybePoisonOperand.getOpcode() == ISD::UNDEF)
+ return;
+ // First, freeze each offending operand.
+ SDValue FrozenMaybePoisonOperand = DAG.getFreeze(MaybePoisonOperand);
+ // Then, change all other uses of unfrozen operand to use frozen operand.
+ DAG.ReplaceAllUsesOfValueWith(MaybePoisonOperand, FrozenMaybePoisonOperand);
+ if (FrozenMaybePoisonOperand.getOpcode() == ISD::FREEZE &&
+ FrozenMaybePoisonOperand.getOperand(0) == FrozenMaybePoisonOperand) {
+ // But, that also updated the use in the freeze we just created, thus
+ // creating a cycle in a DAG. Let's undo that by mutating the freeze.
+ DAG.UpdateNodeOperands(FrozenMaybePoisonOperand.getNode(),
+ MaybePoisonOperand);
+ }
+ };
+
bool AllowMultipleMaybePoisonOperands = N0.getOpcode() == ISD::BUILD_VECTOR ||
N0.getOpcode() == ISD::BUILD_PAIR ||
N0.getOpcode() == ISD::CONCAT_VECTORS;
@@ -15482,6 +15499,24 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
}
}
+ // Special case handling for ShuffleVectorSDNode nodes.
+ // Fold freeze(shuffle(x,y,m)) -> shuffle(freeze(x),freeze(y),m)
+ // iff shuffle mask doesn't contain any undef elements.
+ if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N0)) {
+ FreezeOperand(SVN->getOperand(0));
+ FreezeOperand(SVN->getOperand(1));
+ // This node has been merged with another.
+ if (N->getOpcode() == ISD::DELETED_NODE)
+ return SDValue(N, 0);
+ // NOTE: Get the operands again in case they've been updated.
+ SDValue Op0 = SVN->getOperand(0);
+ SDValue Op1 = SVN->getOperand(1);
+ Op0 = Op0.isUndef() ? DAG.getFreeze(Op0) : Op0;
+ Op1 = Op1.isUndef() ? DAG.getFreeze(Op1) : Op1;
+ return DAG.getVectorShuffle(N0.getValueType(), SDLoc(N), Op0, Op1,
+ SVN->getMask());
+ }
+
SmallSetVector<SDValue, 8> MaybePoisonOperands;
for (SDValue Op : N0->ops()) {
if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly*/ false,
@@ -15500,22 +15535,9 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
// it could create undef or poison due to it's poison-generating flags.
// So not finding any maybe-poison operands is fine.
- for (SDValue MaybePoisonOperand : MaybePoisonOperands) {
- // Don't replace every single UNDEF everywhere with frozen UNDEF, though.
- if (MaybePoisonOperand.getOpcode() == ISD::UNDEF)
- continue;
- // First, freeze each offending operand.
- SDValue FrozenMaybePoisonOperand = DAG.getFreeze(MaybePoisonOperand);
- // Then, change all other uses of unfrozen operand to use frozen operand.
- DAG.ReplaceAllUsesOfValueWith(MaybePoisonOperand, FrozenMaybePoisonOperand);
- if (FrozenMaybePoisonOperand.getOpcode() == ISD::FREEZE &&
- FrozenMaybePoisonOperand.getOperand(0) == FrozenMaybePoisonOperand) {
- // But, that also updated the use in the freeze we just created, thus
- // creating a cycle in a DAG. Let's undo that by mutating the freeze.
- DAG.UpdateNodeOperands(FrozenMaybePoisonOperand.getNode(),
- MaybePoisonOperand);
- }
- }
+ // Freeze all uses of the different operands.
+ for (SDValue MaybePoisonOperand : MaybePoisonOperands)
+ FreezeOperand(MaybePoisonOperand);
// This node has been merged with another.
if (N->getOpcode() == ISD::DELETED_NODE)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 25a728bf9ba3e7..fef4b7dbf558ec 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5225,6 +5225,15 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
return KnownIdx.getMaxValue().uge(VecVT.getVectorMinNumElements());
}
+ case ISD::VECTOR_SHUFFLE: {
+ // Check for any demanded shuffle element that is undef.
+ auto *SVN = cast<ShuffleVectorSDNode>(Op);
+ for (auto [Idx, Elt] : enumerate(SVN->getMask()))
+ if (Elt < 0 && DemandedElts[Idx])
+ return true;
+ return false;
+ }
+
default:
// Allow the target to implement this method for its nodes.
if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::INTRINSIC_WO_CHAIN ||
diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll
index 5a629567d07069..8a6a30318ae583 100644
--- a/llvm/test/CodeGen/SystemZ/pr60413.ll
+++ b/llvm/test/CodeGen/SystemZ/pr60413.ll
@@ -13,114 +13,110 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
define dso_local void @m() local_unnamed_addr #1 {
; CHECK-LABEL: m:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: stmg %r12, %r15, 96(%r15)
+; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
; CHECK-NEXT: aghi %r15, -168
-; CHECK-NEXT: llhrl %r2, f+4
-; CHECK-NEXT: sll %r2, 8
-; CHECK-NEXT: larl %r1, f
-; CHECK-NEXT: ic %r2, 6(%r1)
-; CHECK-NEXT: larl %r1, e
-; CHECK-NEXT: lb %r0, 3(%r1)
-; CHECK-NEXT: clfi %r2, 128
+; CHECK-NEXT: lhrl %r1, f+4
+; CHECK-NEXT: sll %r1, 8
+; CHECK-NEXT: larl %r2, f
+; CHECK-NEXT: ic %r1, 6(%r2)
+; CHECK-NEXT: larl %r2, e
+; CHECK-NEXT: lb %r0, 3(%r2)
+; CHECK-NEXT: vlvgp %v0, %r0, %r1
+; CHECK-NEXT: vlvgp %v1, %r1, %r0
+; CHECK-NEXT: vlvgf %v1, %r1, 0
+; CHECK-NEXT: vlvgf %v1, %r1, 2
+; CHECK-NEXT: vlvgp %v2, %r1, %r1
+; CHECK-NEXT: # kill: def $r1l killed $r1l killed $r1d
+; CHECK-NEXT: nilh %r1, 255
+; CHECK-NEXT: chi %r1, 128
; CHECK-NEXT: ipm %r1
; CHECK-NEXT: risbg %r1, %r1, 63, 191, 36
-; CHECK-NEXT: vlvgp %v1, %r2, %r0
-; CHECK-NEXT: vlvgf %v1, %r2, 0
-; CHECK-NEXT: vlvgf %v1, %r2, 2
-; CHECK-NEXT: vlvgp %v0, %r0, %r2
-; CHECK-NEXT: vlvgp %v2, %r2, %r2
-; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
-; CHECK-NEXT: nilh %r2, 255
-; CHECK-NEXT: chi %r2, 128
-; CHECK-NEXT: ipm %r2
-; CHECK-NEXT: risbg %r2, %r2, 63, 191, 36
; CHECK-NEXT: vlvgf %v0, %r0, 0
; CHECK-NEXT: vlvgf %v0, %r0, 2
-; CHECK-NEXT: vrepf %v2, %v2, 1
; CHECK-NEXT: vgbm %v3, 30583
; CHECK-NEXT: vn %v0, %v0, %v3
; CHECK-NEXT: vn %v1, %v1, %v3
+; CHECK-NEXT: vrepf %v2, %v2, 1
; CHECK-NEXT: vn %v2, %v2, %v3
; CHECK-NEXT: vrepif %v3, 127
; CHECK-NEXT: vchlf %v1, %v1, %v3
-; CHECK-NEXT: vlgvf %r12, %v1, 0
+; CHECK-NEXT: vlgvf %r13, %v1, 0
; CHECK-NEXT: vchlf %v2, %v2, %v3
-; CHECK-NEXT: vlgvf %r4, %v2, 1
-; CHECK-NEXT: nilf %r4, 1
-; CHECK-NEXT: vlgvf %r5, %v2, 0
-; CHECK-NEXT: risbg %r3, %r5, 48, 176, 15
-; CHECK-NEXT: rosbg %r3, %r4, 32, 49, 14
-; CHECK-NEXT: vlgvf %r14, %v2, 2
+; CHECK-NEXT: vlgvf %r3, %v2, 1
+; CHECK-NEXT: nilf %r3, 1
+; CHECK-NEXT: vlgvf %r4, %v2, 0
+; CHECK-NEXT: risbg %r2, %r4, 48, 176, 15
+; CHECK-NEXT: rosbg %r2, %r3, 32, 49, 14
+; CHECK-NEXT: vlgvf %r5, %v2, 2
+; CHECK-NEXT: nilf %r5, 1
+; CHECK-NEXT: rosbg %r2, %r5, 32, 50, 13
+; CHECK-NEXT: vlgvf %r14, %v2, 3
; CHECK-NEXT: nilf %r14, 1
-; CHECK-NEXT: rosbg %r3, %r14, 32, 50, 13
-; CHECK-NEXT: vlgvf %r13, %v2, 3
-; CHECK-NEXT: nilf %r13, 1
-; CHECK-NEXT: rosbg %r3, %r13, 32, 51, 12
-; CHECK-NEXT: rosbg %r3, %r12, 52, 52, 11
-; CHECK-NEXT: vlgvf %r12, %v1, 1
-; CHECK-NEXT: rosbg %r3, %r12, 53, 53, 10
-; CHECK-NEXT: vlgvf %r12, %v1, 2
-; CHECK-NEXT: rosbg %r3, %r12, 54, 54, 9
-; CHECK-NEXT: vlgvf %r12, %v1, 3
-; CHECK-NEXT: rosbg %r3, %r12, 55, 55, 8
+; CHECK-NEXT: rosbg %r2, %r14, 32, 51, 12
+; CHECK-NEXT: rosbg %r2, %r13, 52, 52, 11
+; CHECK-NEXT: vlgvf %r13, %v1, 1
+; CHECK-NEXT: rosbg %r2, %r13, 53, 53, 10
+; CHECK-NEXT: vlgvf %r13, %v1, 2
+; CHECK-NEXT: rosbg %r2, %r13, 54, 54, 9
+; CHECK-NEXT: vlgvf %r13, %v1, 3
+; CHECK-NEXT: rosbg %r2, %r13, 55, 55, 8
; CHECK-NEXT: vchlf %v0, %v0, %v3
-; CHECK-NEXT: vlgvf %r12, %v0, 0
-; CHECK-NEXT: rosbg %r3, %r12, 56, 56, 7
-; CHECK-NEXT: vlgvf %r12, %v0, 1
-; CHECK-NEXT: rosbg %r3, %r12, 57, 57, 6
-; CHECK-NEXT: vlgvf %r12, %v0, 2
-; CHECK-NEXT: rosbg %r3, %r12, 58, 58, 5
-; CHECK-NEXT: vlgvf %r12, %v0, 3
-; CHECK-NEXT: rosbg %r3, %r12, 59, 59, 4
-; CHECK-NEXT: nilf %r5, 1
-; CHECK-NEXT: rosbg %r3, %r5, 32, 60, 3
-; CHECK-NEXT: rosbg %r3, %r4, 32, 61, 2
-; CHECK-NEXT: rosbg %r3, %r14, 32, 62, 1
-; CHECK-NEXT: or %r3, %r13
-; CHECK-NEXT: vlgvb %r5, %v0, 1
-; CHECK-NEXT: vlgvb %r4, %v0, 0
-; CHECK-NEXT: risbg %r4, %r4, 48, 176, 15
-; CHECK-NEXT: rosbg %r4, %r5, 49, 49, 14
-; CHECK-NEXT: vlgvb %r5, %v0, 2
-; CHECK-NEXT: rosbg %r4, %r5, 50, 50, 13
-; CHECK-NEXT: vlgvb %r5, %v0, 3
-; CHECK-NEXT: rosbg %r4, %r5, 51, 51, 12
-; CHECK-NEXT: vlgvb %r5, %v0, 4
-; CHECK-NEXT: rosbg %r4, %r5, 52, 52, 11
-; CHECK-NEXT: vlgvb %r5, %v0, 5
-; CHECK-NEXT: rosbg %r4, %r5, 53, 53, 10
-; CHECK-NEXT: vlgvb %r5, %v0, 6
-; CHECK-NEXT: rosbg %r4, %r5, 54, 54, 9
-; CHECK-NEXT: vlgvb %r5, %v0, 7
-; CHECK-NEXT: rosbg %r4, %r5, 55, 55, 8
-; CHECK-NEXT: vlgvb %r5, %v0, 8
-; CHECK-NEXT: rosbg %r4, %r5, 56, 56, 7
-; CHECK-NEXT: vlgvb %r5, %v0, 9
-; CHECK-NEXT: rosbg %r4, %r5, 57, 57, 6
-; CHECK-NEXT: vlgvb %r5, %v0, 10
-; CHECK-NEXT: rosbg %r4, %r5, 58, 58, 5
-; CHECK-NEXT: vlgvb %r5, %v0, 11
-; CHECK-NEXT: rosbg %r4, %r5, 59, 59, 4
-; CHECK-NEXT: vlgvb %r5, %v0, 12
-; CHECK-NEXT: rosbg %r4, %r5, 60, 60, 3
-; CHECK-NEXT: vlgvb %r5, %v0, 13
-; CHECK-NEXT: rosbg %r4, %r5, 61, 61, 2
-; CHECK-NEXT: vlgvb %r5, %v0, 14
-; CHECK-NEXT: rosbg %r4, %r5, 62, 62, 1
-; CHECK-NEXT: vlgvb %r5, %v0, 15
-; CHECK-NEXT: rosbg %r4, %r5, 63, 63, 0
-; CHECK-NEXT: xilf %r4, 4294967295
-; CHECK-NEXT: or %r4, %r3
-; CHECK-NEXT: tmll %r4, 65535
-; CHECK-NEXT: ipm %r3
-; CHECK-NEXT: afi %r3, -268435456
-; CHECK-NEXT: srl %r3, 31
+; CHECK-NEXT: vlgvf %r13, %v0, 0
+; CHECK-NEXT: rosbg %r2, %r13, 56, 56, 7
+; CHECK-NEXT: vlgvf %r13, %v0, 1
+; CHECK-NEXT: rosbg %r2, %r13, 57, 57, 6
+; CHECK-NEXT: vlgvf %r13, %v0, 2
+; CHECK-NEXT: rosbg %r2, %r13, 58, 58, 5
+; CHECK-NEXT: vlgvf %r13, %v0, 3
+; CHECK-NEXT: rosbg %r2, %r13, 59, 59, 4
+; CHECK-NEXT: nilf %r4, 1
+; CHECK-NEXT: rosbg %r2, %r4, 32, 60, 3
+; CHECK-NEXT: rosbg %r2, %r3, 32, 61, 2
+; CHECK-NEXT: rosbg %r2, %r5, 32, 62, 1
+; CHECK-NEXT: or %r2, %r14
+; CHECK-NEXT: vlgvb %r4, %v0, 1
+; CHECK-NEXT: vlgvb %r3, %v0, 0
+; CHECK-NEXT: risbg %r3, %r3, 48, 176, 15
+; CHECK-NEXT: rosbg %r3, %r4, 49, 49, 14
+; CHECK-NEXT: vlgvb %r4, %v0, 2
+; CHECK-NEXT: rosbg %r3, %r4, 50, 50, 13
+; CHECK-NEXT: vlgvb %r4, %v0, 3
+; CHECK-NEXT: rosbg %r3, %r4, 51, 51, 12
+; CHECK-NEXT: vlgvb %r4, %v0, 4
+; CHECK-NEXT: rosbg %r3, %r4, 52, 52, 11
+; CHECK-NEXT: vlgvb %r4, %v0, 5
+; CHECK-NEXT: rosbg %r3, %r4, 53, 53, 10
+; CHECK-NEXT: vlgvb %r4, %v0, 6
+; CHECK-NEXT: rosbg %r3, %r4, 54, 54, 9
+; CHECK-NEXT: vlgvb %r4, %v0, 7
+; CHECK-NEXT: rosbg %r3, %r4, 55, 55, 8
+; CHECK-NEXT: vlgvb %r4, %v0, 8
+; CHECK-NEXT: rosbg %r3, %r4, 56, 56, 7
+; CHECK-NEXT: vlgvb %r4, %v0, 9
+; CHECK-NEXT: rosbg %r3, %r4, 57, 57, 6
+; CHECK-NEXT: vlgvb %r4, %v0, 10
+; CHECK-NEXT: rosbg %r3, %r4, 58, 58, 5
+; CHECK-NEXT: vlgvb %r4, %v0, 11
+; CHECK-NEXT: rosbg %r3, %r4, 59, 59, 4
+; CHECK-NEXT: vlgvb %r4, %v0, 12
+; CHECK-NEXT: rosbg %r3, %r4, 60, 60, 3
+; CHECK-NEXT: vlgvb %r4, %v0, 13
+; CHECK-NEXT: rosbg %r3, %r4, 61, 61, 2
+; CHECK-NEXT: vlgvb %r4, %v0, 14
+; CHECK-NEXT: rosbg %r3, %r4, 62, 62, 1
+; CHECK-NEXT: vlgvb %r4, %v0, 15
+; CHECK-NEXT: rosbg %r3, %r4, 63, 63, 0
+; CHECK-NEXT: xilf %r3, 4294967295
+; CHECK-NEXT: or %r3, %r2
+; CHECK-NEXT: tmll %r3, 65535
+; CHECK-NEXT: ipm %r2
+; CHECK-NEXT: afi %r2, -268435456
+; CHECK-NEXT: srl %r2, 31
; CHECK-NEXT: nr %r2, %r1
-; CHECK-NEXT: nr %r2, %r3
; CHECK-NEXT: nr %r2, %r0
; CHECK-NEXT: larl %r1, g
; CHECK-NEXT: stc %r2, 0(%r1)
-; CHECK-NEXT: lmg %r12, %r15, 264(%r15)
+; CHECK-NEXT: lmg %r13, %r15, 272(%r15)
; CHECK-NEXT: br %r14
entry:
%n = alloca i32, align 4
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index c79da37988e40b..dbc027495297b1 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -546,8 +546,8 @@ define <8 x i16> @freeze_ashr_vec(<8 x i16> %a0) nounwind {
define <4 x i32> @freeze_ashr_vec_outofrange(<4 x i32> %a0) nounwind {
; X86-LABEL: freeze_ashr_vec_outofrange:
; X86: # %bb.0:
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; X86-NEXT: psrad $1, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; X86-NEXT: psrad $2, %xmm0
; X86-NEXT: retl
;
@@ -660,8 +660,8 @@ define <8 x i16> @freeze_lshr_vec(<8 x i16> %a0) nounwind {
define <4 x i32> @freeze_lshr_vec_outofrange(<4 x i32> %a0) nounwind {
; X86-LABEL: freeze_lshr_vec_outofrange:
; X86: # %bb.0:
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; X86-NEXT: psrld $1, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; X86-NEXT: psrld $2, %xmm0
; X86-NEXT: retl
;
diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
index a80d8d8cd01b85..76cf2423a254fe 100644
--- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
+++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
@@ -121,12 +121,10 @@ define void @failing(ptr %0, ptr %1) nounwind {
; CHECK-AVX2-NEXT: # => This Inner Loop Header: Depth=2
; CHECK-AVX2-NEXT: vmovdqu 1024(%rdx,%rsi), %xmm5
; CHECK-AVX2-NEXT: vmovdqu 1040(%rdx,%rsi), %xmm6
-; CHECK-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm5[0],xmm6[0]
-; CHECK-AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; CHECK-AVX2-NEXT: vmovq %xmm5, %rdi
-; CHECK-AVX2-NEXT: vpextrq $1, %xmm5, %r8
-; CHECK-AVX2-NEXT: vmovq %xmm7, %r9
-; CHECK-AVX2-NEXT: vpextrq $1, %xmm7, %r10
+; CHECK-AVX2-NEXT: vpextrq $1, %xmm5, %rdi
+; CHECK-AVX2-NEXT: vpextrq $1, %xmm6, %r8
+; CHECK-AVX2-NEXT: vmovq %xmm5, %r9
+; CHECK-AVX2-NEXT: vmovq %xmm6, %r10
; CHECK-AVX2-NEXT: negq %r10
; CHECK-AVX2-NEXT: movq %rcx, %r10
; CHECK-AVX2-NEXT: sbbq %r8, %r10
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index f7a27a5b914466..9ae1f270e88337 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -65,7 +65,6 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movzwl (%rdi), %eax
-; X64-NO-BMI2-NEXT: movzwl %ax, %eax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrl %cl, %eax
@@ -75,7 +74,6 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
; X64-BMI2-NEXT: movzwl (%rdi), %eax
-; X64-BMI2-NEXT: movzwl %ax, %eax
; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: shrxl %esi, %eax, %eax
; X64-BMI2-NEXT: movb %al, (%rdx)
@@ -83,15 +81,14 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X86-NO-BMI2: # %bb.0:
-; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT: movzwl (%edx), %edx
-; X86-NO-BMI2-NEXT: movzwl %dx, %edx
+; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NEXT: movzwl (%eax), %eax
; X86-NO-BMI2-NEXT: shll $3, %ecx
; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NEXT: shrl %cl, %edx
-; X86-NO-BMI2-NEXT: movb %dl, (%eax)
+; X86-NO-BMI2-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NEXT: movb %al, (%edx)
; X86-NO-BMI2-NEXT: retl
;
; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
@@ -100,7 +97,6 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BMI2-NEXT: movzwl (%edx), %edx
-; X86-BMI2-NEXT: movzwl %dx, %edx
; X86-BMI2-NEXT: shll $3, %ecx
; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx
; X86-BMI2-NEXT: movb %cl, (%eax)
@@ -123,7 +119,6 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movzwl (%rdi), %eax
-; X64-NO-BMI2-NEXT: movzwl %ax, %eax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrl %cl, %eax
@@ -133,7 +128,6 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
; X64-BMI2-NEXT: movzwl (%rdi), %eax
-; X64-BMI2-NEXT: movzwl %ax, %eax
; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: shrxl %esi, %eax, %eax
; X64-BMI2-NEXT: movw %ax, (%rdx)
@@ -145,7 +139,6 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT: movzwl (%edx), %edx
-; X86-NO-BMI2-NEXT: movzwl %dx, %edx
; X86-NO-BMI2-NEXT: shll $3, %ecx
; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NEXT: shrl %cl, %edx
@@ -158,7 +151,6 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BMI2-NEXT: movzwl (%edx), %edx
-; X86-BMI2-NEXT: movzwl %dx, %edx
; X86-BMI2-NEXT: shll $3, %ecx
; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx
; X86-BMI2-NEXT: movw %cx, (%eax)
@@ -179,9 +171,8 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT: movq %xmm0, %rax
+; X64-NO-BMI2-NEXT: movl (%rdi), %eax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movb %al, (%rdx)
@@ -189,9 +180,8 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X64-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-BMI2-NEXT: shll $3, %esi
-; X64-BMI2-NEXT: movq %xmm0, %rax
+; X64-BMI2-NEXT: movl (%rdi), %eax
; X64-BMI2-NEXT: shrxq %rsi, %rax, %rax
; X64-BMI2-NEXT: movb %al, (%rdx)
; X64-BMI2-NEXT: retq
@@ -199,99 +189,49 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X86-NO-BMI2-NO-SHLD: # %bb.0:
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm1, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: testb $32, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: testb $32, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: retl
;
-; X86-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movb %dl, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: retl
+; X86-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movl (%edx), %edx
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorl %esi, %esi
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT: testb $32, %cl
+; X86-SHLD-NEXT: cmovnel %esi, %edx
+; X86-SHLD-NEXT: movb %dl, (%eax)
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm1, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorl %ebx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: cmovel %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %dl, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: cmovel %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %bl, (%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
-;
-; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %bl, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%init = load <4 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <8 x i8> %intermediate.sroa.0.0.vec.expand, <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
@@ -308,9 +248,8 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT: movq %xmm0, %rax
+; X64-NO-BMI2-NEXT: movl (%rdi), %eax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movw %ax, (%rdx)
@@ -318,107 +257,58 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X64-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-BMI2-NEXT: shll $3, %esi
-; X64-BMI2-NEXT: movq %xmm0, %rax
+; X64-BMI2-NEXT: movl (%rdi), %eax
; X64-BMI2-NEXT: shrxq %rsi, %rax, %rax
; X64-BMI2-NEXT: movw %ax, (%rdx)
; X64-BMI2-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X86-NO-BMI2-NO-SHLD: # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm1, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: testb $32, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movw %si, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: testb $32, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movw %si, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: retl
;
-; X86-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: retl
+; X86-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movl (%edx), %edx
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorl %esi, %esi
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT: testb $32, %cl
+; X86-SHLD-NEXT: cmovnel %esi, %edx
+; X86-SHLD-NEXT: movw %dx, (%eax)
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm1, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: cmovel %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %dx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: testb $32, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: cmovel %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %si, (%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
-;
-; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%init = load <4 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <8 x i8> %intermediate.sroa.0.0.vec.expand, <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
@@ -434,9 +324,8 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT: movq %xmm0, %rax
+; X64-NO-BMI2-NEXT: movl (%rdi), %eax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
@@ -444,107 +333,58 @@ define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X64-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-BMI2-NEXT: shll $3, %esi
-; X64-BMI2-NEXT: movq %xmm0, %rax
+; X64-BMI2-NEXT: movl (%rdi), %eax
; X64-BMI2-NEXT: shrxq %rsi, %rax, %rax
; X64-BMI2-NEXT: movl %eax, (%rdx)
; X64-BMI2-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X86-NO-BMI2-NO-SHLD: # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm1, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: testb $32, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: testb $32, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: retl
;
-; X86-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: retl
+; X86-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movl (%edx), %edx
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorl %esi, %esi
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT: testb $32, %cl
+; X86-SHLD-NEXT: cmovnel %esi, %edx
+; X86-SHLD-NEXT: movl %edx, (%eax)
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm1, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: cmovel %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: testb $32, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: cmovel %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
-;
-; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
-; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%init = load <4 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <8 x i8> %intermediate.sroa.0.0.vec.expand, <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
@@ -560,88 +400,51 @@ define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movb %al, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: xorl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movb %sil, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movb %sil, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
+; X64-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-SHLD: # %bb.0:
+; X64-SHLD-NEXT: movq %rsi, %rcx
+; X64-SHLD-NEXT: movq (%rdi), %rax
+; X64-SHLD-NEXT: shll $3, %ecx
+; X64-SHLD-NEXT: xorl %esi, %esi
+; X64-SHLD-NEXT: shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT: testb $64, %cl
+; X64-SHLD-NEXT: cmovneq %rsi, %rax
+; X64-SHLD-NEXT: movb %al, (%rdx)
+; X64-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorl %ecx, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movb %al, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movb %sil, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
-;
; X86-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X86: # %bb.0:
; X86-NEXT: subl $32, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -670,88 +473,51 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movw %ax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: xorl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movw %si, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movw %si, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
+; X64-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-SHLD: # %bb.0:
+; X64-SHLD-NEXT: movq %rsi, %rcx
+; X64-SHLD-NEXT: movq (%rdi), %rax
+; X64-SHLD-NEXT: shll $3, %ecx
+; X64-SHLD-NEXT: xorl %esi, %esi
+; X64-SHLD-NEXT: shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT: testb $64, %cl
+; X64-SHLD-NEXT: cmovneq %rsi, %rax
+; X64-SHLD-NEXT: movw %ax, (%rdx)
+; X64-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorl %ecx, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movw %ax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movw %si, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
-;
; X86-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X86: # %bb.0:
; X86-NEXT: subl $32, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -779,88 +545,51 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: xorl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
+; X64-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-SHLD: # %bb.0:
+; X64-SHLD-NEXT: movq %rsi, %rcx
+; X64-SHLD-NEXT: movq (%rdi), %rax
+; X64-SHLD-NEXT: shll $3, %ecx
+; X64-SHLD-NEXT: xorl %esi, %esi
+; X64-SHLD-NEXT: shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT: testb $64, %cl
+; X64-SHLD-NEXT: cmovneq %rsi, %rax
+; X64-SHLD-NEXT: movl %eax, (%rdx)
+; X64-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorl %ecx, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
-;
; X86-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X86: # %bb.0:
; X86-NEXT: subl $32, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -888,88 +617,51 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %r8, %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: xorl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
+; X64-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-SHLD: # %bb.0:
+; X64-SHLD-NEXT: movq %rsi, %rcx
+; X64-SHLD-NEXT: movq (%rdi), %rax
+; X64-SHLD-NEXT: shll $3, %ecx
+; X64-SHLD-NEXT: xorl %esi, %esi
+; X64-SHLD-NEXT: shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT: testb $64, %cl
+; X64-SHLD-NEXT: cmovneq %rsi, %rax
+; X64-SHLD-NEXT: movq %rax, (%rdx)
+; X64-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorl %ecx, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: cmoveq %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
-;
; X86-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X86: # %bb.0:
; X86-NEXT: subl $32, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT: movss %xmm0, (%esp)
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -1941,7 +1633,9 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
+; X64-HAVE-BMI2-HAVE-SHLD: {{.*}}
+; X64-NO-BMI2-HAVE-SHLD: {{.*}}
; X64-NO-SHLD: {{.*}}
-; X64-SHLD: {{.*}}
+; X86-HAVE-BMI2-HAVE-SHLD: {{.*}}
+; X86-NO-BMI2-HAVE-SHLD: {{.*}}
; X86-NO-SHLD: {{.*}}
-; X86-SHLD: {{.*}}
More information about the llvm-commits
mailing list