[llvm] [DAGCombiner] Combine (fshl A, B, S) | (fshr C, D, BW-S) --> (fshl (A|C), (B|D), S) (PR #180889)
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 10 21:47:12 PST 2026
- Previous message: [llvm] [DAGCombiner] Combine (fshl A, X, Y) | (shl X, Y) --> fshl (A|X), X, Y (PR #180888)
- Next message: [llvm] [DAGCombiner] Combine (fshl A, B, S) | (fshr C, D, BW-S) --> (fshl (A|C), (B|D), S) (PR #180889)
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
https://github.com/topperc created https://github.com/llvm/llvm-project/pull/180889
This is similar to the FSHL/FSHR handling in hoistLogicOpWithSameOpcodeHands.
Here the opcodes aren't exactly the same, but the operations are
equivalent.
Fixes regressions from #180888
Stacked on #180888 and #180887
From 335e4d96961bed343ac2ce0a557a580e29a2bd21 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Tue, 10 Feb 2026 17:44:40 -0800
Subject: [PATCH 1/3] [DAGCombiner] Combine (fshl A, X, Y) | (shl X, Y) -->
fshl (A|X), X, Y
Similar for (fshr X, B, Y) | (srl X, Y) --> fshr X, (X|B), Y
This is similar to the FSHL/FSHR handling in hoistLogicOpWithSameOpcodeHands
but here we treat a shl/shr like a fshl/fshr with 0.
We don't need X to be the same in both sides, but that's what occurred in
the pattern I was looking at.
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 34 ++++++++---
llvm/test/CodeGen/AArch64/funnel-shift.ll | 54 +++++++----------
llvm/test/CodeGen/X86/funnel-shift.ll | 60 +++++++------------
3 files changed, 69 insertions(+), 79 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b05157289892b..11f0f6d8611c7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8497,17 +8497,35 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
return V;
};
- // (fshl X, ?, Y) | (shl X, Y) --> fshl X, ?, Y
if (N0.getOpcode() == ISD::FSHL && N1.getOpcode() == ISD::SHL &&
- N0.getOperand(0) == N1.getOperand(0) &&
- peekThroughZext(N0.getOperand(2)) == peekThroughZext(N1.getOperand(1)))
- return N0;
+ peekThroughZext(N0.getOperand(2)) == peekThroughZext(N1.getOperand(1))) {
+ // (fshl X, ?, Y) | (shl X, Y) --> fshl X, ?, Y
+ if (N0.getOperand(0) == N1.getOperand(0))
+ return N0;
+ // (fshl A, X, Y) | (shl X, Y) --> fshl (A|X), X, Y
+ if (N0.getOperand(1) == N1.getOperand(0) && N0.hasOneUse() &&
+ N1.hasOneUse()) {
+ SDValue A = N0.getOperand(0);
+ SDValue X = N1.getOperand(0);
+ SDValue NewLHS = DAG.getNode(ISD::OR, DL, VT, A, X);
+ return DAG.getNode(ISD::FSHL, DL, VT, NewLHS, X, N0.getOperand(2));
+ }
+ }
- // (fshr ?, X, Y) | (srl X, Y) --> fshr ?, X, Y
if (N0.getOpcode() == ISD::FSHR && N1.getOpcode() == ISD::SRL &&
- N0.getOperand(1) == N1.getOperand(0) &&
- peekThroughZext(N0.getOperand(2)) == peekThroughZext(N1.getOperand(1)))
- return N0;
+ peekThroughZext(N0.getOperand(2)) == peekThroughZext(N1.getOperand(1))) {
+ // (fshr ?, X, Y) | (srl X, Y) --> fshr ?, X, Y
+ if (N0.getOperand(1) == N1.getOperand(0))
+ return N0;
+ // (fshr X, B, Y) | (srl X, Y) --> fshr X, (X|B), Y
+ if (N0.getOperand(0) == N1.getOperand(0) && N0.hasOneUse() &&
+ N1.hasOneUse()) {
+ SDValue X = N1.getOperand(0);
+ SDValue B = N0.getOperand(1);
+ SDValue NewRHS = DAG.getNode(ISD::OR, DL, VT, X, B);
+ return DAG.getNode(ISD::FSHR, DL, VT, X, NewRHS, N0.getOperand(2));
+ }
+ }
// Attempt to match a legalized build_pair-esque pattern:
// or(shl(aext(Hi),BW/2),zext(Lo))
diff --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll
index e0bbfc620e2f8..9dd5dff896624 100644
--- a/llvm/test/CodeGen/AArch64/funnel-shift.ll
+++ b/llvm/test/CodeGen/AArch64/funnel-shift.ll
@@ -541,14 +541,12 @@ define <4 x i32> @fshr_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) {
define i32 @or_shl_fshl(i32 %x, i32 %y, i32 %s) {
; CHECK-SD-LABEL: or_shl_fshl:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov w8, w2
-; CHECK-SD-NEXT: lsr w9, w1, #1
-; CHECK-SD-NEXT: lsl w10, w1, w2
-; CHECK-SD-NEXT: mvn w11, w2
-; CHECK-SD-NEXT: lsl w8, w0, w8
-; CHECK-SD-NEXT: lsr w9, w9, w11
-; CHECK-SD-NEXT: orr w8, w8, w10
-; CHECK-SD-NEXT: orr w0, w8, w9
+; CHECK-SD-NEXT: lsr w8, w1, #1
+; CHECK-SD-NEXT: orr w9, w0, w1
+; CHECK-SD-NEXT: mvn w10, w2
+; CHECK-SD-NEXT: lsl w9, w9, w2
+; CHECK-SD-NEXT: lsr w8, w8, w10
+; CHECK-SD-NEXT: orr w0, w9, w8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: or_shl_fshl:
@@ -586,14 +584,12 @@ define i32 @or_shl_rotl(i32 %x, i32 %y, i32 %s) {
define i32 @or_shl_fshl_commute(i32 %x, i32 %y, i32 %s) {
; CHECK-SD-LABEL: or_shl_fshl_commute:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov w8, w2
-; CHECK-SD-NEXT: lsr w9, w1, #1
-; CHECK-SD-NEXT: lsl w10, w1, w2
-; CHECK-SD-NEXT: mvn w11, w2
-; CHECK-SD-NEXT: lsl w8, w0, w8
-; CHECK-SD-NEXT: lsr w9, w9, w11
-; CHECK-SD-NEXT: orr w8, w10, w8
-; CHECK-SD-NEXT: orr w0, w8, w9
+; CHECK-SD-NEXT: lsr w8, w1, #1
+; CHECK-SD-NEXT: orr w9, w0, w1
+; CHECK-SD-NEXT: mvn w10, w2
+; CHECK-SD-NEXT: lsl w9, w9, w2
+; CHECK-SD-NEXT: lsr w8, w8, w10
+; CHECK-SD-NEXT: orr w0, w9, w8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: or_shl_fshl_commute:
@@ -631,14 +627,12 @@ define i32 @or_shl_rotl_commute(i32 %x, i32 %y, i32 %s) {
define i32 @or_lshr_fshr(i32 %x, i32 %y, i32 %s) {
; CHECK-SD-LABEL: or_lshr_fshr:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov w8, w2
-; CHECK-SD-NEXT: lsl w9, w1, #1
-; CHECK-SD-NEXT: lsr w10, w1, w2
-; CHECK-SD-NEXT: lsr w8, w0, w8
-; CHECK-SD-NEXT: mvn w11, w2
-; CHECK-SD-NEXT: lsl w9, w9, w11
-; CHECK-SD-NEXT: orr w8, w8, w10
-; CHECK-SD-NEXT: orr w0, w9, w8
+; CHECK-SD-NEXT: lsl w8, w1, #1
+; CHECK-SD-NEXT: orr w9, w1, w0
+; CHECK-SD-NEXT: mvn w10, w2
+; CHECK-SD-NEXT: lsr w9, w9, w2
+; CHECK-SD-NEXT: lsl w8, w8, w10
+; CHECK-SD-NEXT: orr w0, w8, w9
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: or_lshr_fshr:
@@ -675,13 +669,11 @@ define i32 @or_lshr_rotr(i32 %x, i32 %y, i32 %s) {
define i32 @or_lshr_fshr_commute(i32 %x, i32 %y, i32 %s) {
; CHECK-SD-LABEL: or_lshr_fshr_commute:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov w8, w2
-; CHECK-SD-NEXT: lsl w9, w1, #1
-; CHECK-SD-NEXT: lsr w10, w1, w2
-; CHECK-SD-NEXT: lsr w8, w0, w8
-; CHECK-SD-NEXT: mvn w11, w2
-; CHECK-SD-NEXT: lsl w9, w9, w11
-; CHECK-SD-NEXT: orr w8, w10, w8
+; CHECK-SD-NEXT: lsl w8, w1, #1
+; CHECK-SD-NEXT: orr w9, w1, w0
+; CHECK-SD-NEXT: mvn w10, w2
+; CHECK-SD-NEXT: lsr w9, w9, w2
+; CHECK-SD-NEXT: lsl w8, w8, w10
; CHECK-SD-NEXT: orr w0, w8, w9
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
index 78d7e7eb3c136..4ba161783fff2 100644
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -1150,25 +1150,20 @@ declare dso_local void @_Z3foov()
define i32 @or_shl_fshl(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-LABEL: or_shl_fshl:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT: movl %edx, %esi
-; X86-SSE2-NEXT: shll %cl, %esi
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: orl %edx, %eax
; X86-SSE2-NEXT: shldl %cl, %edx, %eax
-; X86-SSE2-NEXT: orl %esi, %eax
-; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: retl
;
; X64-AVX-LABEL: or_shl_fshl:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: movl %edx, %ecx
-; X64-AVX-NEXT: movl %esi, %eax
-; X64-AVX-NEXT: shll %cl, %eax
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: orl %esi, %eax
; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX-NEXT: shldl %cl, %esi, %edi
-; X64-AVX-NEXT: orl %edi, %eax
+; X64-AVX-NEXT: shldl %cl, %esi, %eax
; X64-AVX-NEXT: retq
%shy = shl i32 %y, %s
%fun = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %s)
@@ -1205,25 +1200,20 @@ define i32 @or_shl_rotl(i32 %x, i32 %y, i32 %s) nounwind {
define i32 @or_shl_fshl_commute(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-LABEL: or_shl_fshl_commute:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT: movl %edx, %esi
-; X86-SSE2-NEXT: shll %cl, %esi
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: orl %edx, %eax
; X86-SSE2-NEXT: shldl %cl, %edx, %eax
-; X86-SSE2-NEXT: orl %esi, %eax
-; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: retl
;
; X64-AVX-LABEL: or_shl_fshl_commute:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: movl %edx, %ecx
-; X64-AVX-NEXT: movl %esi, %eax
-; X64-AVX-NEXT: shll %cl, %eax
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: orl %esi, %eax
; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX-NEXT: shldl %cl, %esi, %edi
-; X64-AVX-NEXT: orl %edi, %eax
+; X64-AVX-NEXT: shldl %cl, %esi, %eax
; X64-AVX-NEXT: retq
%shy = shl i32 %y, %s
%fun = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %s)
@@ -1260,25 +1250,20 @@ define i32 @or_shl_rotl_commute(i32 %x, i32 %y, i32 %s) nounwind {
define i32 @or_lshr_fshr(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-LABEL: or_lshr_fshr:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT: movl %edx, %esi
-; X86-SSE2-NEXT: shrl %cl, %esi
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: orl %edx, %eax
; X86-SSE2-NEXT: shrdl %cl, %edx, %eax
-; X86-SSE2-NEXT: orl %esi, %eax
-; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: retl
;
; X64-AVX-LABEL: or_lshr_fshr:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: movl %edx, %ecx
-; X64-AVX-NEXT: movl %esi, %eax
-; X64-AVX-NEXT: shrl %cl, %eax
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: orl %esi, %eax
; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX-NEXT: shrdl %cl, %esi, %edi
-; X64-AVX-NEXT: orl %edi, %eax
+; X64-AVX-NEXT: shrdl %cl, %esi, %eax
; X64-AVX-NEXT: retq
%shy = lshr i32 %y, %s
%fun = call i32 @llvm.fshr.i32(i32 %y, i32 %x, i32 %s)
@@ -1315,25 +1300,20 @@ define i32 @or_lshr_rotr(i32 %x, i32 %y, i32 %s) nounwind {
define i32 @or_lshr_fshr_commute(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-LABEL: or_lshr_fshr_commute:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT: movl %edx, %esi
-; X86-SSE2-NEXT: shrl %cl, %esi
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: orl %edx, %eax
; X86-SSE2-NEXT: shrdl %cl, %edx, %eax
-; X86-SSE2-NEXT: orl %esi, %eax
-; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: retl
;
; X64-AVX-LABEL: or_lshr_fshr_commute:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: movl %edx, %ecx
-; X64-AVX-NEXT: movl %esi, %eax
-; X64-AVX-NEXT: shrl %cl, %eax
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: orl %esi, %eax
; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX-NEXT: shrdl %cl, %esi, %edi
-; X64-AVX-NEXT: orl %edi, %eax
+; X64-AVX-NEXT: shrdl %cl, %esi, %eax
; X64-AVX-NEXT: retq
%shy = lshr i32 %y, %s
%fun = call i32 @llvm.fshr.i32(i32 %y, i32 %x, i32 %s)
From fb4389ea2a5f8bc877fa0ee8d327b253cc0b5938 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Tue, 10 Feb 2026 17:07:20 -0800
Subject: [PATCH 2/3] [LegalizeTypes] Emit FSHL/FSHR from ExpandShiftByConstant
when Legal.
This avoids needing to combine the SHL/SHR/OR pattern later.
This improves code quality on RISC-V where our slx/srx instructions
clobber the destination register but we don't have an immediate form.
We can't recover the original direction from the SHL/SHR/OR pattern
and we can't commute it during the TwoAddressInstruction pass like X86
due to the shift amount being in a register.
---
.../SelectionDAG/LegalizeIntegerTypes.cpp | 54 ++-
llvm/test/CodeGen/RISCV/rv32p.ll | 7 +-
llvm/test/CodeGen/RISCV/rv64p.ll | 7 +-
llvm/test/CodeGen/X86/avgceils-scalar.ll | 40 +-
llvm/test/CodeGen/X86/avgceilu-scalar.ll | 40 +-
llvm/test/CodeGen/X86/avgfloors-scalar.ll | 12 +-
llvm/test/CodeGen/X86/div_i129_v_pow2k.ll | 52 +--
.../CodeGen/X86/expand-large-fp-optnone.ll | 10 +-
llvm/test/CodeGen/X86/fold-tied-op.ll | 98 ++---
llvm/test/CodeGen/X86/icmp-shift-opt.ll | 28 +-
llvm/test/CodeGen/X86/legalize-shl-vec.ll | 88 ++--
llvm/test/CodeGen/X86/load-local-v3i129.ll | 10 +-
llvm/test/CodeGen/X86/midpoint-int.ll | 35 +-
llvm/test/CodeGen/X86/pr32282.ll | 16 +-
llvm/test/CodeGen/X86/pr38539.ll | 14 +-
llvm/test/CodeGen/X86/pr43820.ll | 343 ++++++++--------
llvm/test/CodeGen/X86/pr49162.ll | 5 +-
llvm/test/CodeGen/X86/rotate-extract.ll | 7 +-
llvm/test/CodeGen/X86/scmp.ll | 385 +++++++++---------
llvm/test/CodeGen/X86/sdiv_fix_sat.ll | 368 +++++++++--------
llvm/test/CodeGen/X86/shift-and.ll | 16 +-
llvm/test/CodeGen/X86/shift-i512.ll | 63 +--
llvm/test/CodeGen/X86/smax.ll | 26 +-
llvm/test/CodeGen/X86/smin.ll | 26 +-
llvm/test/CodeGen/X86/udiv_fix_sat.ll | 76 ++--
llvm/test/CodeGen/X86/umax.ll | 26 +-
llvm/test/CodeGen/X86/umin.ll | 26 +-
llvm/test/CodeGen/X86/vector-sext.ll | 106 ++---
llvm/test/CodeGen/X86/vector-zext.ll | 66 ++-
29 files changed, 1019 insertions(+), 1031 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 4d08a22f25ab9..9dcb68f3420f0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3298,12 +3298,18 @@ void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, const APInt &Amt,
} else {
Lo = DAG.getNode(ISD::SHL, DL, NVT, InL,
DAG.getShiftAmountConstant(Amt, NVT, DL));
- Hi = DAG.getNode(
- ISD::OR, DL, NVT,
- DAG.getNode(ISD::SHL, DL, NVT, InH,
- DAG.getShiftAmountConstant(Amt, NVT, DL)),
- DAG.getNode(ISD::SRL, DL, NVT, InL,
- DAG.getShiftAmountConstant(-Amt + NVTBits, NVT, DL)));
+ // Use FSHL if legal so we don't need to combine it later.
+ if (TLI.isOperationLegal(ISD::FSHL, NVT)) {
+ Hi = DAG.getNode(ISD::FSHL, DL, NVT, InH, InL,
+ DAG.getShiftAmountConstant(Amt, NVT, DL));
+ } else {
+ Hi = DAG.getNode(
+ ISD::OR, DL, NVT,
+ DAG.getNode(ISD::SHL, DL, NVT, InH,
+ DAG.getShiftAmountConstant(Amt, NVT, DL)),
+ DAG.getNode(ISD::SRL, DL, NVT, InL,
+ DAG.getShiftAmountConstant(-Amt + NVTBits, NVT, DL)));
+ }
}
return;
}
@@ -3319,12 +3325,18 @@ void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, const APInt &Amt,
Lo = InH;
Hi = DAG.getConstant(0, DL, NVT);
} else {
- Lo = DAG.getNode(
- ISD::OR, DL, NVT,
- DAG.getNode(ISD::SRL, DL, NVT, InL,
- DAG.getShiftAmountConstant(Amt, NVT, DL)),
- DAG.getNode(ISD::SHL, DL, NVT, InH,
- DAG.getShiftAmountConstant(-Amt + NVTBits, NVT, DL)));
+ // Use FSHR if legal so we don't need to combine it later.
+ if (TLI.isOperationLegal(ISD::FSHR, NVT)) {
+ Lo = DAG.getNode(ISD::FSHR, DL, NVT, InH, InL,
+ DAG.getShiftAmountConstant(Amt, NVT, DL));
+ } else {
+ Lo = DAG.getNode(
+ ISD::OR, DL, NVT,
+ DAG.getNode(ISD::SRL, DL, NVT, InL,
+ DAG.getShiftAmountConstant(Amt, NVT, DL)),
+ DAG.getNode(ISD::SHL, DL, NVT, InH,
+ DAG.getShiftAmountConstant(-Amt + NVTBits, NVT, DL)));
+ }
Hi = DAG.getNode(ISD::SRL, DL, NVT, InH,
DAG.getShiftAmountConstant(Amt, NVT, DL));
}
@@ -3345,12 +3357,18 @@ void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, const APInt &Amt,
Hi = DAG.getNode(ISD::SRA, DL, NVT, InH,
DAG.getShiftAmountConstant(NVTBits - 1, NVT, DL));
} else {
- Lo = DAG.getNode(
- ISD::OR, DL, NVT,
- DAG.getNode(ISD::SRL, DL, NVT, InL,
- DAG.getShiftAmountConstant(Amt, NVT, DL)),
- DAG.getNode(ISD::SHL, DL, NVT, InH,
- DAG.getShiftAmountConstant(-Amt + NVTBits, NVT, DL)));
+ // Use FSHR if legal so we don't need to combine it later.
+ if (TLI.isOperationLegal(ISD::FSHR, NVT)) {
+ Lo = DAG.getNode(ISD::FSHR, DL, NVT, InH, InL,
+ DAG.getShiftAmountConstant(Amt, NVT, DL));
+ } else {
+ Lo = DAG.getNode(
+ ISD::OR, DL, NVT,
+ DAG.getNode(ISD::SRL, DL, NVT, InL,
+ DAG.getShiftAmountConstant(Amt, NVT, DL)),
+ DAG.getNode(ISD::SHL, DL, NVT, InH,
+ DAG.getShiftAmountConstant(-Amt + NVTBits, NVT, DL)));
+ }
Hi = DAG.getNode(ISD::SRA, DL, NVT, InH,
DAG.getShiftAmountConstant(Amt, NVT, DL));
}
diff --git a/llvm/test/CodeGen/RISCV/rv32p.ll b/llvm/test/CodeGen/RISCV/rv32p.ll
index e4d4c68109dea..651163c9ca7e7 100644
--- a/llvm/test/CodeGen/RISCV/rv32p.ll
+++ b/llvm/test/CodeGen/RISCV/rv32p.ll
@@ -349,15 +349,12 @@ define i64 @srx_i64(i64 %x, i64 %y) {
ret i64 %b
}
-; FIXME: Using srx instead of slx would avoid the mv.
define i64 @srxi_i64(i64 %x) {
; CHECK-LABEL: srxi_i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: li a3, 7
+; CHECK-NEXT: li a2, 25
+; CHECK-NEXT: srx a0, a1, a2
; CHECK-NEXT: srli a1, a1, 25
-; CHECK-NEXT: slx a2, a0, a3
-; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: ret
%a = lshr i64 %x, 25
ret i64 %a
diff --git a/llvm/test/CodeGen/RISCV/rv64p.ll b/llvm/test/CodeGen/RISCV/rv64p.ll
index 53ca8476034a1..17dea9130003c 100644
--- a/llvm/test/CodeGen/RISCV/rv64p.ll
+++ b/llvm/test/CodeGen/RISCV/rv64p.ll
@@ -365,15 +365,12 @@ define i128 @srx_i128(i128 %x, i128 %y) {
ret i128 %b
}
-; FIXME: Using srx instead of slx would avoid the mv.
define i128 @srxi_i128(i128 %x) {
; CHECK-LABEL: srxi_i128:
; CHECK: # %bb.0:
-; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: li a3, 15
+; CHECK-NEXT: li a2, 49
+; CHECK-NEXT: srx a0, a1, a2
; CHECK-NEXT: srli a1, a1, 49
-; CHECK-NEXT: slx a2, a0, a3
-; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: ret
%a = lshr i128 %x, 49
ret i128 %a
diff --git a/llvm/test/CodeGen/X86/avgceils-scalar.ll b/llvm/test/CodeGen/X86/avgceils-scalar.ll
index 91121bd4ad935..a44c746ad0eda 100644
--- a/llvm/test/CodeGen/X86/avgceils-scalar.ll
+++ b/llvm/test/CodeGen/X86/avgceils-scalar.ll
@@ -175,19 +175,19 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: xorl %ecx, %edi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: xorl %esi, %ebx
-; X86-NEXT: shrdl $1, %ebx, %edi
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: sarl %ebx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: xorl %edi, %esi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: xorl %ecx, %ebx
+; X86-NEXT: shrdl $1, %esi, %ebx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: sarl %esi
; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: subl %edi, %eax
-; X86-NEXT: sbbl %ebx, %edx
+; X86-NEXT: subl %ebx, %eax
+; X86-NEXT: sbbl %esi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -215,19 +215,19 @@ define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: xorl %ecx, %edi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: xorl %esi, %ebx
-; X86-NEXT: shrdl $1, %ebx, %edi
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: sarl %ebx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: xorl %edi, %esi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: xorl %ecx, %ebx
+; X86-NEXT: shrdl $1, %esi, %ebx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: sarl %esi
; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: subl %edi, %eax
-; X86-NEXT: sbbl %ebx, %edx
+; X86-NEXT: subl %ebx, %eax
+; X86-NEXT: sbbl %esi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/avgceilu-scalar.ll b/llvm/test/CodeGen/X86/avgceilu-scalar.ll
index 4ab4851eccd2c..987e0a0188c2d 100644
--- a/llvm/test/CodeGen/X86/avgceilu-scalar.ll
+++ b/llvm/test/CodeGen/X86/avgceilu-scalar.ll
@@ -175,19 +175,19 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: xorl %ecx, %edi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: xorl %esi, %ebx
-; X86-NEXT: shrdl $1, %ebx, %edi
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: shrl %ebx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: xorl %edi, %esi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: xorl %ecx, %ebx
+; X86-NEXT: shrdl $1, %esi, %ebx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: shrl %esi
; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: subl %edi, %eax
-; X86-NEXT: sbbl %ebx, %edx
+; X86-NEXT: subl %ebx, %eax
+; X86-NEXT: sbbl %esi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -215,19 +215,19 @@ define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: xorl %ecx, %edi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: xorl %esi, %ebx
-; X86-NEXT: shrdl $1, %ebx, %edi
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: shrl %ebx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: xorl %edi, %esi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: xorl %ecx, %ebx
+; X86-NEXT: shrdl $1, %esi, %ebx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: shrl %esi
; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: subl %edi, %eax
-; X86-NEXT: sbbl %ebx, %edx
+; X86-NEXT: subl %ebx, %eax
+; X86-NEXT: sbbl %esi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/avgfloors-scalar.ll b/llvm/test/CodeGen/X86/avgfloors-scalar.ll
index 87d72afa90939..eedc7b64ac3b8 100644
--- a/llvm/test/CodeGen/X86/avgfloors-scalar.ll
+++ b/llvm/test/CodeGen/X86/avgfloors-scalar.ll
@@ -260,10 +260,10 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: xorl %esi, %ebx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: xorl %edi, %edx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: xorl %esi, %ebx
; X86-NEXT: shrdl $1, %edx, %ebx
; X86-NEXT: andl %edi, %ecx
; X86-NEXT: sarl %edx
@@ -300,10 +300,10 @@ define i64 @test_lsb_i64(i64 %a0, i64 %a1) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: xorl %esi, %ebx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: xorl %edi, %edx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: xorl %esi, %ebx
; X86-NEXT: shrdl $1, %edx, %ebx
; X86-NEXT: andl %edi, %ecx
; X86-NEXT: sarl %edx
@@ -342,10 +342,10 @@ define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: xorl %esi, %ebx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: xorl %edi, %edx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: xorl %esi, %ebx
; X86-NEXT: shrdl $1, %edx, %ebx
; X86-NEXT: andl %edi, %ecx
; X86-NEXT: sarl %edx
diff --git a/llvm/test/CodeGen/X86/div_i129_v_pow2k.ll b/llvm/test/CodeGen/X86/div_i129_v_pow2k.ll
index 4d6d795e3beb8..d60c626fb900b 100644
--- a/llvm/test/CodeGen/X86/div_i129_v_pow2k.ll
+++ b/llvm/test/CodeGen/X86/div_i129_v_pow2k.ll
@@ -25,20 +25,20 @@ define i129 @v_sdiv_i129_v_pow2k(i129 %lhs) nounwind {
;
; X64-O0-LABEL: v_sdiv_i129_v_pow2k:
; X64-O0: # %bb.0:
-; X64-O0-NEXT: movl %edx, %eax
-; X64-O0-NEXT: andl $1, %eax
-; X64-O0-NEXT: movl %eax, %ecx
-; X64-O0-NEXT: negq %rcx
-; X64-O0-NEXT: movl %ecx, %r8d
+; X64-O0-NEXT: movq %rdi, %rax
+; X64-O0-NEXT: movl %edx, %ecx
+; X64-O0-NEXT: andl $1, %ecx
+; X64-O0-NEXT: movl %ecx, %edi
+; X64-O0-NEXT: negq %rdi
+; X64-O0-NEXT: movl %edi, %r8d
; X64-O0-NEXT: andl $1, %r8d
-; X64-O0-NEXT: # implicit-def: $rax
-; X64-O0-NEXT: movl %r8d, %eax
-; X64-O0-NEXT: shldq $32, %rcx, %rax
-; X64-O0-NEXT: addq %rax, %rdi
+; X64-O0-NEXT: # implicit-def: $rcx
+; X64-O0-NEXT: movl %r8d, %ecx
+; X64-O0-NEXT: shldq $32, %rdi, %rcx
+; X64-O0-NEXT: addq %rcx, %rax
; X64-O0-NEXT: adcq $0, %rsi
; X64-O0-NEXT: adcq $0, %rdx
-; X64-O0-NEXT: movq %rsi, %rax
-; X64-O0-NEXT: shldq $31, %rdi, %rax
+; X64-O0-NEXT: shrdq $33, %rsi, %rax
; X64-O0-NEXT: movl %edx, %ecx
; X64-O0-NEXT: andl $1, %ecx
; X64-O0-NEXT: # kill: def $rcx killed $ecx
@@ -154,20 +154,20 @@ define i129 @v_sdiv_exact_i129_v_pow2k(i129 %lhs) nounwind {
;
; X64-O0-LABEL: v_sdiv_exact_i129_v_pow2k:
; X64-O0: # %bb.0:
-; X64-O0-NEXT: movl %edx, %eax
-; X64-O0-NEXT: andl $1, %eax
-; X64-O0-NEXT: movl %eax, %ecx
-; X64-O0-NEXT: negq %rcx
-; X64-O0-NEXT: movl %ecx, %r8d
+; X64-O0-NEXT: movq %rdi, %rax
+; X64-O0-NEXT: movl %edx, %ecx
+; X64-O0-NEXT: andl $1, %ecx
+; X64-O0-NEXT: movl %ecx, %edi
+; X64-O0-NEXT: negq %rdi
+; X64-O0-NEXT: movl %edi, %r8d
; X64-O0-NEXT: andl $1, %r8d
-; X64-O0-NEXT: # implicit-def: $rax
-; X64-O0-NEXT: movl %r8d, %eax
-; X64-O0-NEXT: shldq $32, %rcx, %rax
-; X64-O0-NEXT: addq %rax, %rdi
+; X64-O0-NEXT: # implicit-def: $rcx
+; X64-O0-NEXT: movl %r8d, %ecx
+; X64-O0-NEXT: shldq $32, %rdi, %rcx
+; X64-O0-NEXT: addq %rcx, %rax
; X64-O0-NEXT: adcq $0, %rsi
; X64-O0-NEXT: adcq $0, %rdx
-; X64-O0-NEXT: movq %rsi, %rax
-; X64-O0-NEXT: shldq $31, %rdi, %rax
+; X64-O0-NEXT: shrdq $33, %rsi, %rax
; X64-O0-NEXT: movl %edx, %ecx
; X64-O0-NEXT: andl $1, %ecx
; X64-O0-NEXT: # kill: def $rcx killed $ecx
@@ -274,8 +274,8 @@ define i129 @v_udiv_i129_v_pow2k(i129 %lhs) nounwind {
;
; X64-O0-LABEL: v_udiv_i129_v_pow2k:
; X64-O0: # %bb.0:
-; X64-O0-NEXT: movq %rsi, %rax
-; X64-O0-NEXT: shldq $31, %rdi, %rax
+; X64-O0-NEXT: movq %rdi, %rax
+; X64-O0-NEXT: shrdq $33, %rsi, %rax
; X64-O0-NEXT: movl %edx, %ecx
; X64-O0-NEXT: andl $1, %ecx
; X64-O0-NEXT: movl %ecx, %edx
@@ -345,8 +345,8 @@ define i129 @v_udiv_exact_i129_v_pow2k(i129 %lhs) nounwind {
;
; X64-O0-LABEL: v_udiv_exact_i129_v_pow2k:
; X64-O0: # %bb.0:
-; X64-O0-NEXT: movq %rsi, %rax
-; X64-O0-NEXT: shldq $31, %rdi, %rax
+; X64-O0-NEXT: movq %rdi, %rax
+; X64-O0-NEXT: shrdq $33, %rsi, %rax
; X64-O0-NEXT: movl %edx, %ecx
; X64-O0-NEXT: andl $1, %ecx
; X64-O0-NEXT: movl %ecx, %edx
diff --git a/llvm/test/CodeGen/X86/expand-large-fp-optnone.ll b/llvm/test/CodeGen/X86/expand-large-fp-optnone.ll
index a635d55d2033d..16fe756bfc4e6 100644
--- a/llvm/test/CodeGen/X86/expand-large-fp-optnone.ll
+++ b/llvm/test/CodeGen/X86/expand-large-fp-optnone.ll
@@ -181,18 +181,18 @@ define double @main(i224 %0) #0 {
; CHECK-NEXT: adcq $0, %rsi
; CHECK-NEXT: adcq $0, %rdx
; CHECK-NEXT: adcq $0, %rcx
-; CHECK-NEXT: movq %rsi, %rdx
-; CHECK-NEXT: shldq $62, %rdi, %rdx
+; CHECK-NEXT: movq %rdi, %rdx
+; CHECK-NEXT: shrdq $2, %rsi, %rdx
; CHECK-NEXT: movq %rdx, %rax
; CHECK-NEXT: shrq $32, %rax
; CHECK-NEXT: btq $55, %rdi
; CHECK-NEXT: jae .LBB0_9
; CHECK-NEXT: jmp .LBB0_7
; CHECK-NEXT: .LBB0_7: # %itofp-if-then20
-; CHECK-NEXT: shldq $61, %rdi, %rsi
-; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: shrdq $3, %rsi, %rdi
+; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: shrq $32, %rax
-; CHECK-NEXT: movq %rsi, %rdx
+; CHECK-NEXT: movq %rdi, %rdx
; CHECK-NEXT: movl %r11d, %r10d
; CHECK-NEXT: jmp .LBB0_9
; CHECK-NEXT: .LBB0_8: # %itofp-if-else
diff --git a/llvm/test/CodeGen/X86/fold-tied-op.ll b/llvm/test/CodeGen/X86/fold-tied-op.ll
index d60d397c50e11..3dc083fbd673b 100644
--- a/llvm/test/CodeGen/X86/fold-tied-op.ll
+++ b/llvm/test/CodeGen/X86/fold-tied-op.ll
@@ -24,84 +24,86 @@ define i64 @fn1() #0 {
; CHECK-NEXT: .cfi_offset %esi, -20
; CHECK-NEXT: .cfi_offset %edi, -16
; CHECK-NEXT: .cfi_offset %ebx, -12
-; CHECK-NEXT: movl $-1028477379, %ecx # imm = 0xC2B2AE3D
-; CHECK-NEXT: movl $668265295, %esi # imm = 0x27D4EB4F
+; CHECK-NEXT: movl $-1028477379, %eax # imm = 0xC2B2AE3D
+; CHECK-NEXT: movl $668265295, %ebx # imm = 0x27D4EB4F
; CHECK-NEXT: movl a, %edi
; CHECK-NEXT: cmpl $0, (%edi)
; CHECK-NEXT: je .LBB0_2
; CHECK-NEXT: # %bb.1: # %if.then
-; CHECK-NEXT: movl 8(%edi), %ecx
+; CHECK-NEXT: movl 8(%edi), %esi
; CHECK-NEXT: movl 12(%edi), %edx
; CHECK-NEXT: movl %edx, %eax
-; CHECK-NEXT: shldl $1, %ecx, %eax
+; CHECK-NEXT: shldl $1, %esi, %eax
; CHECK-NEXT: orl %edx, %eax
-; CHECK-NEXT: leal (%ecx,%ecx), %edx
-; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: leal (%esi,%esi), %ecx
+; CHECK-NEXT: orl %esi, %ecx
+; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 16(%edi), %ecx
+; CHECK-NEXT: movl 20(%edi), %esi
+; CHECK-NEXT: movl %esi, %edx
+; CHECK-NEXT: shldl $2, %ecx, %edx
; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 16(%edi), %ebx
-; CHECK-NEXT: movl 20(%edi), %edx
+; CHECK-NEXT: leal (,%ecx,4), %edx
; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: shldl $2, %ebx, %edx
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT: shldl $31, %ebx, %ecx
-; CHECK-NEXT: shll $2, %ebx
-; CHECK-NEXT: orl %ecx, %ebx
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT: shrl %ecx
-; CHECK-NEXT: orl %edx, %ecx
-; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: adcl %eax, %ecx
+; CHECK-NEXT: shrdl $1, %esi, %ecx
+; CHECK-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; CHECK-NEXT: shrl %esi
+; CHECK-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: adcl %eax, %esi
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movl 24(%edi), %eax
; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl $-1028477379, %ebx # imm = 0xC2B2AE3D
-; CHECK-NEXT: imull %eax, %ebx
-; CHECK-NEXT: mull %esi
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: addl %ebx, %edx
+; CHECK-NEXT: movl $-1028477379, %ecx # imm = 0xC2B2AE3D
+; CHECK-NEXT: imull %eax, %ecx
+; CHECK-NEXT: mull %ebx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: addl %ecx, %edx
; CHECK-NEXT: movl 28(%edi), %edi
-; CHECK-NEXT: imull %edi, %esi
-; CHECK-NEXT: addl %edx, %esi
+; CHECK-NEXT: imull %edi, %ebx
+; CHECK-NEXT: addl %edx, %ebx
; CHECK-NEXT: movl $1336530590, %edx # imm = 0x4FA9D69E
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; CHECK-NEXT: movl %ebx, %eax
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: mull %edx
-; CHECK-NEXT: imull $-2056954758, %ebx, %ebx # imm = 0x85655C7A
-; CHECK-NEXT: addl %edx, %ebx
+; CHECK-NEXT: imull $-2056954758, %ecx, %ecx # imm = 0x85655C7A
+; CHECK-NEXT: addl %edx, %ecx
; CHECK-NEXT: imull $1336530590, %edi, %edx # imm = 0x4FA9D69E
-; CHECK-NEXT: addl %ebx, %edx
-; CHECK-NEXT: shrdl $3, %esi, %ecx
-; CHECK-NEXT: sarl $3, %esi
-; CHECK-NEXT: orl %edx, %esi
-; CHECK-NEXT: orl %eax, %ecx
-; CHECK-NEXT: movl $-66860409, %ebx # imm = 0xFC03CA87
-; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: mull %ebx
+; CHECK-NEXT: addl %ecx, %edx
+; CHECK-NEXT: shrdl $3, %ebx, %esi
+; CHECK-NEXT: sarl $3, %ebx
+; CHECK-NEXT: orl %edx, %ebx
+; CHECK-NEXT: orl %eax, %esi
+; CHECK-NEXT: movl $-66860409, %ecx # imm = 0xFC03CA87
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: mull %ecx
; CHECK-NEXT: movl %eax, %edi
-; CHECK-NEXT: imull $326129324, %ecx, %eax # imm = 0x137056AC
+; CHECK-NEXT: imull $326129324, %esi, %eax # imm = 0x137056AC
; CHECK-NEXT: addl %edx, %eax
-; CHECK-NEXT: imull $-66860409, %esi, %ecx # imm = 0xFC03CA87
+; CHECK-NEXT: imull $-66860409, %ebx, %ecx # imm = 0xFC03CA87
; CHECK-NEXT: addl %eax, %ecx
; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; CHECK-NEXT: movl %edi, b
; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: mull %ebx
+; CHECK-NEXT: movl $-66860409, %edx # imm = 0xFC03CA87
+; CHECK-NEXT: mull %edx
; CHECK-NEXT: imull $326129324, %edi, %esi # imm = 0x137056AC
; CHECK-NEXT: addl %edx, %esi
; CHECK-NEXT: movl %ecx, b+4
; CHECK-NEXT: imull $-66860409, %ecx, %ecx # imm = 0xFC03CA87
; CHECK-NEXT: jmp .LBB0_3
; CHECK-NEXT: .LBB0_2: # %if.else
-; CHECK-NEXT: xorl b+4, %ecx
-; CHECK-NEXT: xorl b, %esi
-; CHECK-NEXT: movl $1419758215, %edx # imm = 0x549FCA87
-; CHECK-NEXT: movl %esi, %eax
-; CHECK-NEXT: mull %edx
-; CHECK-NEXT: imull $93298681, %esi, %esi # imm = 0x58F9FF9
+; CHECK-NEXT: xorl b+4, %eax
+; CHECK-NEXT: xorl b, %ebx
+; CHECK-NEXT: movl $1419758215, %ecx # imm = 0x549FCA87
+; CHECK-NEXT: movl %eax, %edi
+; CHECK-NEXT: movl %ebx, %eax
+; CHECK-NEXT: mull %ecx
+; CHECK-NEXT: imull $93298681, %ebx, %esi # imm = 0x58F9FF9
; CHECK-NEXT: addl %edx, %esi
-; CHECK-NEXT: imull $1419758215, %ecx, %ecx # imm = 0x549FCA87
+; CHECK-NEXT: imull $1419758215, %edi, %ecx # imm = 0x549FCA87
; CHECK-NEXT: .LBB0_3: # %if.end
; CHECK-NEXT: addl %esi, %ecx
; CHECK-NEXT: addl $-1028477341, %eax # imm = 0xC2B2AE63
diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll
index 4a6c1d0ae5deb..7ba7f6212d517 100644
--- a/llvm/test/CodeGen/X86/icmp-shift-opt.ll
+++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll
@@ -86,11 +86,11 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind {
; X86-NEXT: movl 16(%ebp), %eax
; X86-NEXT: movl 12(%ebp), %ecx
; X86-NEXT: orl 20(%ebp), %ecx
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: shldl $15, %edx, %ecx
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: shldl $15, %ecx, %edx
+; X86-NEXT: orl 8(%ebp), %eax
+; X86-NEXT: shrdl $17, %ecx, %eax
+; X86-NEXT: orl %edx, %eax
; X86-NEXT: sete %al
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
@@ -117,11 +117,11 @@ define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind {
; X86-NEXT: movl 16(%ebp), %eax
; X86-NEXT: movl 12(%ebp), %ecx
; X86-NEXT: orl 20(%ebp), %ecx
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: shldl $15, %edx, %ecx
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: shldl $15, %ecx, %edx
+; X86-NEXT: orl 8(%ebp), %eax
+; X86-NEXT: shrdl $17, %ecx, %eax
+; X86-NEXT: orl %edx, %eax
; X86-NEXT: setne %al
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
@@ -145,10 +145,10 @@ define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind {
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movl 20(%ebp), %ecx
; X86-NEXT: shll $17, %ecx
-; X86-NEXT: orl 8(%ebp), %eax
+; X86-NEXT: orl 16(%ebp), %eax
; X86-NEXT: orl 12(%ebp), %eax
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: sete %al
@@ -174,10 +174,10 @@ define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind {
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movl 20(%ebp), %ecx
; X86-NEXT: shll $17, %ecx
-; X86-NEXT: orl 8(%ebp), %eax
+; X86-NEXT: orl 16(%ebp), %eax
; X86-NEXT: orl 12(%ebp), %eax
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: setne %al
diff --git a/llvm/test/CodeGen/X86/legalize-shl-vec.ll b/llvm/test/CodeGen/X86/legalize-shl-vec.ll
index 5e168a82e03e7..4cfb050958abb 100644
--- a/llvm/test/CodeGen/X86/legalize-shl-vec.ll
+++ b/llvm/test/CodeGen/X86/legalize-shl-vec.ll
@@ -75,37 +75,37 @@ define <2 x i256> @test_srl(<2 x i256> %In) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: subl $8, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %ebp, %esi
-; X86-NEXT: shldl $28, %edx, %esi
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: shldl $28, %eax, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl $28, %ebx, %edx
-; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT: shldl $28, %ecx, %ebx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: shldl $28, %edi, %esi
-; X86-NEXT: shldl $28, %eax, %edi
-; X86-NEXT: movl %eax, %edx
+; X86-NEXT: shrdl $4, %eax, %ecx
+; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shldl $28, %eax, %edx
+; X86-NEXT: shldl $28, %eax, %ebp
+; X86-NEXT: shrdl $4, %eax, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shldl $28, %eax, %esi
+; X86-NEXT: shrdl $4, %eax, %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: shrdl $4, %eax, %ecx
-; X86-NEXT: shrl $4, %ebp
+; X86-NEXT: shrdl $4, %ecx, %ebx
+; X86-NEXT: shrl $4, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ebp, 60(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: movl %ebp, 56(%eax)
-; X86-NEXT: movl (%esp), %ebp # 4-byte Reload
-; X86-NEXT: movl %ebp, 52(%eax)
-; X86-NEXT: movl %ebx, 48(%eax)
-; X86-NEXT: movl %esi, 44(%eax)
+; X86-NEXT: movl %ecx, 60(%eax)
+; X86-NEXT: movl %ebx, 56(%eax)
+; X86-NEXT: movl %esi, 52(%eax)
+; X86-NEXT: movl %edx, 48(%eax)
+; X86-NEXT: movl %ebp, 44(%eax)
; X86-NEXT: movl %edi, 40(%eax)
-; X86-NEXT: movl %edx, 36(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 36(%eax)
+; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-NEXT: movl %ecx, 32(%eax)
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: shrl $31, %ecx
@@ -157,37 +157,37 @@ define <2 x i256> @test_sra(<2 x i256> %In) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: subl $8, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %ebp, %esi
-; X86-NEXT: shldl $26, %edx, %esi
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: shldl $26, %eax, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl $26, %ebx, %edx
-; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT: shldl $26, %ecx, %ebx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: shldl $26, %edi, %esi
-; X86-NEXT: shldl $26, %eax, %edi
-; X86-NEXT: movl %eax, %edx
+; X86-NEXT: shrdl $6, %eax, %ecx
+; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shldl $26, %eax, %edx
+; X86-NEXT: shldl $26, %eax, %ebp
+; X86-NEXT: shrdl $6, %eax, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shldl $26, %eax, %esi
+; X86-NEXT: shrdl $6, %eax, %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: shrdl $6, %eax, %ecx
-; X86-NEXT: sarl $6, %ebp
+; X86-NEXT: shrdl $6, %ecx, %ebx
+; X86-NEXT: sarl $6, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ebp, 60(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: movl %ebp, 56(%eax)
-; X86-NEXT: movl (%esp), %ebp # 4-byte Reload
-; X86-NEXT: movl %ebp, 52(%eax)
-; X86-NEXT: movl %ebx, 48(%eax)
-; X86-NEXT: movl %esi, 44(%eax)
+; X86-NEXT: movl %ecx, 60(%eax)
+; X86-NEXT: movl %ebx, 56(%eax)
+; X86-NEXT: movl %esi, 52(%eax)
+; X86-NEXT: movl %edx, 48(%eax)
+; X86-NEXT: movl %ebp, 44(%eax)
; X86-NEXT: movl %edi, 40(%eax)
-; X86-NEXT: movl %edx, 36(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 36(%eax)
+; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-NEXT: movl %ecx, 32(%eax)
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: sarl $31, %ecx
diff --git a/llvm/test/CodeGen/X86/load-local-v3i129.ll b/llvm/test/CodeGen/X86/load-local-v3i129.ll
index eb5d172a3b352..af96345968630 100644
--- a/llvm/test/CodeGen/X86/load-local-v3i129.ll
+++ b/llvm/test/CodeGen/X86/load-local-v3i129.ll
@@ -7,13 +7,11 @@ define void @_start() nounwind {
; FAST-SHLD: # %bb.0: # %Entry
; FAST-SHLD-NEXT: movq -40(%rsp), %rax
; FAST-SHLD-NEXT: movq -32(%rsp), %rcx
-; FAST-SHLD-NEXT: movq %rcx, %rdx
-; FAST-SHLD-NEXT: shlq $62, %rdx
+; FAST-SHLD-NEXT: shrdq $2, %rcx, %rax
; FAST-SHLD-NEXT: shrq $2, %rcx
-; FAST-SHLD-NEXT: shldq $2, %rdx, %rcx
-; FAST-SHLD-NEXT: andq $-4, %rax
-; FAST-SHLD-NEXT: incq %rax
-; FAST-SHLD-NEXT: movq %rax, -40(%rsp)
+; FAST-SHLD-NEXT: leaq 1(,%rax,4), %rdx
+; FAST-SHLD-NEXT: movq %rdx, -40(%rsp)
+; FAST-SHLD-NEXT: shldq $2, %rax, %rcx
; FAST-SHLD-NEXT: movq %rcx, -32(%rsp)
; FAST-SHLD-NEXT: orq $-2, -56(%rsp)
; FAST-SHLD-NEXT: movq $-1, -48(%rsp)
diff --git a/llvm/test/CodeGen/X86/midpoint-int.ll b/llvm/test/CodeGen/X86/midpoint-int.ll
index c058e37e0ce11..ffbb4bf06debc 100644
--- a/llvm/test/CodeGen/X86/midpoint-int.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int.ll
@@ -317,8 +317,8 @@ define i64 @scalar_i64_signed_reg_reg(i64 %a1, i64 %a2) nounwind {
; X86-NEXT: movzbl %bl, %ebx
; X86-NEXT: jl .LBB5_2
; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %edx, %eax
; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: .LBB5_2:
; X86-NEXT: negl %ebx
; X86-NEXT: shrdl $1, %edi, %eax
@@ -370,35 +370,34 @@ define i64 @scalar_i64_unsigned_reg_reg(i64 %a1, i64 %a2) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: cmpl %eax, %ebp
-; X86-NEXT: sbbl %ecx, %esi
+; X86-NEXT: cmpl %esi, %ebp
+; X86-NEXT: sbbl %ecx, %eax
; X86-NEXT: movl $0, %ebx
; X86-NEXT: sbbl %ebx, %ebx
; X86-NEXT: movl %ebx, %edi
; X86-NEXT: orl $1, %edi
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: subl %ebp, %esi
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: subl %ebp, %eax
+; X86-NEXT: movl %ecx, %ebp
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: xorl %edx, %ebp
; X86-NEXT: xorl %edx, %eax
-; X86-NEXT: xorl %edx, %esi
-; X86-NEXT: subl %edx, %esi
-; X86-NEXT: sbbl %edx, %eax
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: shldl $31, %esi, %eax
+; X86-NEXT: subl %edx, %eax
+; X86-NEXT: sbbl %edx, %ebp
+; X86-NEXT: shrdl $1, %ebp, %eax
; X86-NEXT: imull %eax, %ebx
; X86-NEXT: mull %edi
; X86-NEXT: addl %ebx, %edx
; X86-NEXT: shrl %ebp
; X86-NEXT: imull %edi, %ebp
; X86-NEXT: addl %ebp, %edx
-; X86-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -455,8 +454,8 @@ define i64 @scalar_i64_signed_mem_reg(ptr %a1_addr, i64 %a2) nounwind {
; X86-NEXT: movzbl %bl, %ebx
; X86-NEXT: jl .LBB7_2
; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %edx, %eax
; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: .LBB7_2:
; X86-NEXT: negl %ebx
; X86-NEXT: shrdl $1, %edi, %eax
@@ -524,8 +523,8 @@ define i64 @scalar_i64_signed_reg_mem(i64 %a1, ptr %a2_addr) nounwind {
; X86-NEXT: movzbl %bl, %ebx
; X86-NEXT: jl .LBB8_2
; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %edx, %eax
; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: .LBB8_2:
; X86-NEXT: negl %ebx
; X86-NEXT: shrdl $1, %edi, %eax
@@ -595,8 +594,8 @@ define i64 @scalar_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; X86-NEXT: movzbl %bl, %ebx
; X86-NEXT: jl .LBB9_2
; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %edx, %eax
; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: .LBB9_2:
; X86-NEXT: negl %ebx
; X86-NEXT: shrdl $1, %edi, %eax
diff --git a/llvm/test/CodeGen/X86/pr32282.ll b/llvm/test/CodeGen/X86/pr32282.ll
index d2e748d582f04..a5bb7316a9673 100644
--- a/llvm/test/CodeGen/X86/pr32282.ll
+++ b/llvm/test/CodeGen/X86/pr32282.ll
@@ -13,17 +13,17 @@ define dso_local void @foo(i64 %x) nounwind {
; X86-LABEL: foo:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
-; X86-NEXT: movl d+4, %eax
+; X86-NEXT: movl d, %eax
; X86-NEXT: notl %eax
-; X86-NEXT: movl d, %ecx
+; X86-NEXT: movl d+4, %ecx
; X86-NEXT: notl %ecx
-; X86-NEXT: andl $-566231040, %ecx # imm = 0xDE400000
-; X86-NEXT: andl $701685459, %eax # imm = 0x29D2DED3
-; X86-NEXT: shrdl $21, %eax, %ecx
-; X86-NEXT: shrl $21, %eax
-; X86-NEXT: addl $7, %ecx
-; X86-NEXT: pushl %eax
+; X86-NEXT: andl $701685459, %ecx # imm = 0x29D2DED3
+; X86-NEXT: andl $-566231040, %eax # imm = 0xDE400000
+; X86-NEXT: shrdl $21, %ecx, %eax
+; X86-NEXT: shrl $21, %ecx
+; X86-NEXT: addl $7, %eax
; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %eax
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll __divdi3
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index b3cb7401e6402..eecd15dd2afe9 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -45,8 +45,8 @@ define void @f() nounwind {
; X86-NEXT: sbbl %eax, %edi
; X86-NEXT: movl %edi, %ecx
; X86-NEXT: shldl $30, %ebx, %ecx
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: shldl $30, %esi, %edx
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: shrdl $2, %ebx, %edx
; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: jne .LBB0_1
@@ -206,12 +206,12 @@ define void @f() nounwind {
; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: sbbl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: sbbl %ecx, %edi
-; X86-NEXT: shll $30, %edi
-; X86-NEXT: movl %edi, %esi
-; X86-NEXT: sarl $30, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: sbbl %ecx, %esi
+; X86-NEXT: shll $30, %esi
+; X86-NEXT: movl %esi, %edi
; X86-NEXT: sarl $31, %edi
+; X86-NEXT: sarl $30, %esi
; X86-NEXT: shrdl $1, %edi, %esi
; X86-NEXT: movl %esi, %eax
; X86-NEXT: andl $1, %eax
diff --git a/llvm/test/CodeGen/X86/pr43820.ll b/llvm/test/CodeGen/X86/pr43820.ll
index 2fb4410567be6..bf553c02fea3f 100644
--- a/llvm/test/CodeGen/X86/pr43820.ll
+++ b/llvm/test/CodeGen/X86/pr43820.ll
@@ -10,58 +10,37 @@ define i1000 @square(i1000 %A) nounwind {
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; CHECK-NEXT: bswapq %r12
-; CHECK-NEXT: movq %r12, %r10
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; CHECK-NEXT: bswapq %rbx
+; CHECK-NEXT: movq %rbx, %r10
; CHECK-NEXT: shrq $4, %r10
-; CHECK-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
-; CHECK-NEXT: andq %rsi, %r10
-; CHECK-NEXT: andq %rsi, %r12
-; CHECK-NEXT: shlq $4, %r12
-; CHECK-NEXT: orq %r10, %r12
+; CHECK-NEXT: movabsq $1085102592571150095, %r11 # imm = 0xF0F0F0F0F0F0F0F
+; CHECK-NEXT: andq %r11, %r10
+; CHECK-NEXT: andq %r11, %rbx
+; CHECK-NEXT: shlq $4, %rbx
+; CHECK-NEXT: orq %r10, %rbx
; CHECK-NEXT: movabsq $3689348814741910323, %r10 # imm = 0x3333333333333333
-; CHECK-NEXT: movq %r12, %r13
-; CHECK-NEXT: andq %r10, %r13
-; CHECK-NEXT: shrq $2, %r12
-; CHECK-NEXT: andq %r10, %r12
-; CHECK-NEXT: leaq (%r12,%r13,4), %r12
-; CHECK-NEXT: movabsq $6148914691230924800, %r13 # imm = 0x5555555555000000
-; CHECK-NEXT: movq %r12, %rbp
-; CHECK-NEXT: andq %r13, %rbp
-; CHECK-NEXT: shrq %r12
-; CHECK-NEXT: andq %r13, %r12
-; CHECK-NEXT: leaq (%r12,%rbp,2), %rax
-; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: bswapq %r14
-; CHECK-NEXT: movq %r14, %r12
-; CHECK-NEXT: shrq $4, %r12
-; CHECK-NEXT: andq %rsi, %r12
-; CHECK-NEXT: andq %rsi, %r14
-; CHECK-NEXT: shlq $4, %r14
-; CHECK-NEXT: orq %r12, %r14
-; CHECK-NEXT: movq %r14, %r12
+; CHECK-NEXT: movq %rbx, %r12
; CHECK-NEXT: andq %r10, %r12
-; CHECK-NEXT: shrq $2, %r14
-; CHECK-NEXT: andq %r10, %r14
-; CHECK-NEXT: leaq (%r14,%r12,4), %r12
-; CHECK-NEXT: movabsq $6148914691236517205, %r14 # imm = 0x5555555555555555
+; CHECK-NEXT: shrq $2, %rbx
+; CHECK-NEXT: andq %r10, %rbx
+; CHECK-NEXT: leaq (%rbx,%r12,4), %r12
+; CHECK-NEXT: movabsq $6148914691236517205, %rbx # imm = 0x5555555555555555
; CHECK-NEXT: movq %r12, %r13
-; CHECK-NEXT: andq %r14, %r13
+; CHECK-NEXT: andq %rbx, %r13
; CHECK-NEXT: shrq %r12
-; CHECK-NEXT: andq %r14, %r12
+; CHECK-NEXT: andq %rbx, %r12
; CHECK-NEXT: leaq (%r12,%r13,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: bswapq %r15
; CHECK-NEXT: movq %r15, %r12
; CHECK-NEXT: shrq $4, %r12
-; CHECK-NEXT: andq %rsi, %r12
-; CHECK-NEXT: andq %rsi, %r15
+; CHECK-NEXT: andq %r11, %r12
+; CHECK-NEXT: andq %r11, %r15
; CHECK-NEXT: shlq $4, %r15
; CHECK-NEXT: orq %r12, %r15
; CHECK-NEXT: movq %r15, %r12
@@ -69,149 +48,169 @@ define i1000 @square(i1000 %A) nounwind {
; CHECK-NEXT: shrq $2, %r15
; CHECK-NEXT: andq %r10, %r15
; CHECK-NEXT: leaq (%r15,%r12,4), %r15
+; CHECK-NEXT: movabsq $6148914691230924800, %r12 # imm = 0x5555555555000000
+; CHECK-NEXT: movq %r15, %r13
+; CHECK-NEXT: andq %r12, %r13
+; CHECK-NEXT: shrq %r15
+; CHECK-NEXT: andq %r12, %r15
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; CHECK-NEXT: leaq (%r15,%r13,2), %rax
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: bswapq %r12
+; CHECK-NEXT: movq %r12, %r15
+; CHECK-NEXT: shrq $4, %r15
+; CHECK-NEXT: andq %r11, %r15
+; CHECK-NEXT: andq %r11, %r12
+; CHECK-NEXT: shlq $4, %r12
+; CHECK-NEXT: orq %r15, %r12
+; CHECK-NEXT: movq %r12, %r15
+; CHECK-NEXT: andq %r10, %r15
+; CHECK-NEXT: shrq $2, %r12
+; CHECK-NEXT: andq %r10, %r12
+; CHECK-NEXT: leaq (%r12,%r15,4), %r15
; CHECK-NEXT: movq %r15, %r12
-; CHECK-NEXT: andq %r14, %r12
+; CHECK-NEXT: andq %rbx, %r12
; CHECK-NEXT: shrq %r15
-; CHECK-NEXT: andq %r14, %r15
+; CHECK-NEXT: andq %rbx, %r15
; CHECK-NEXT: leaq (%r15,%r12,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: bswapq %rbx
-; CHECK-NEXT: movq %rbx, %r15
+; CHECK-NEXT: bswapq %r14
+; CHECK-NEXT: movq %r14, %r15
; CHECK-NEXT: shrq $4, %r15
-; CHECK-NEXT: andq %rsi, %r15
-; CHECK-NEXT: andq %rsi, %rbx
-; CHECK-NEXT: shlq $4, %rbx
-; CHECK-NEXT: orq %r15, %rbx
-; CHECK-NEXT: movq %rbx, %r15
+; CHECK-NEXT: andq %r11, %r15
+; CHECK-NEXT: andq %r11, %r14
+; CHECK-NEXT: shlq $4, %r14
+; CHECK-NEXT: orq %r15, %r14
+; CHECK-NEXT: movq %r14, %r15
; CHECK-NEXT: andq %r10, %r15
-; CHECK-NEXT: shrq $2, %rbx
-; CHECK-NEXT: andq %r10, %rbx
-; CHECK-NEXT: leaq (%rbx,%r15,4), %rbx
-; CHECK-NEXT: movq %rbx, %r15
-; CHECK-NEXT: andq %r14, %r15
-; CHECK-NEXT: shrq %rbx
-; CHECK-NEXT: andq %r14, %rbx
-; CHECK-NEXT: leaq (%rbx,%r15,2), %rax
+; CHECK-NEXT: shrq $2, %r14
+; CHECK-NEXT: andq %r10, %r14
+; CHECK-NEXT: leaq (%r14,%r15,4), %r14
+; CHECK-NEXT: movq %r14, %r15
+; CHECK-NEXT: andq %rbx, %r15
+; CHECK-NEXT: shrq %r14
+; CHECK-NEXT: andq %rbx, %r14
+; CHECK-NEXT: leaq (%r14,%r15,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: bswapq %rdi
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: shrq $4, %rbx
-; CHECK-NEXT: andq %rsi, %rbx
-; CHECK-NEXT: andq %rsi, %rdi
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: shrq $4, %r14
+; CHECK-NEXT: andq %r11, %r14
+; CHECK-NEXT: andq %r11, %rdi
; CHECK-NEXT: shlq $4, %rdi
-; CHECK-NEXT: orq %rbx, %rdi
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: andq %r10, %rbx
+; CHECK-NEXT: orq %r14, %rdi
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: andq %r10, %r14
; CHECK-NEXT: shrq $2, %rdi
; CHECK-NEXT: andq %r10, %rdi
-; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: andq %r14, %rbx
+; CHECK-NEXT: leaq (%rdi,%r14,4), %rdi
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: andq %rbx, %r14
; CHECK-NEXT: shrq %rdi
-; CHECK-NEXT: andq %r14, %rdi
-; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax
+; CHECK-NEXT: andq %rbx, %rdi
+; CHECK-NEXT: leaq (%rdi,%r14,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; CHECK-NEXT: bswapq %rdi
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: shrq $4, %rbx
-; CHECK-NEXT: andq %rsi, %rbx
-; CHECK-NEXT: andq %rsi, %rdi
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: shrq $4, %r14
+; CHECK-NEXT: andq %r11, %r14
+; CHECK-NEXT: andq %r11, %rdi
; CHECK-NEXT: shlq $4, %rdi
-; CHECK-NEXT: orq %rbx, %rdi
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: andq %r10, %rbx
+; CHECK-NEXT: orq %r14, %rdi
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: andq %r10, %r14
; CHECK-NEXT: shrq $2, %rdi
; CHECK-NEXT: andq %r10, %rdi
-; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: andq %r14, %rbx
+; CHECK-NEXT: leaq (%rdi,%r14,4), %rdi
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: andq %rbx, %r14
; CHECK-NEXT: shrq %rdi
-; CHECK-NEXT: andq %r14, %rdi
-; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax
+; CHECK-NEXT: andq %rbx, %rdi
+; CHECK-NEXT: leaq (%rdi,%r14,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; CHECK-NEXT: bswapq %rdi
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: shrq $4, %rbx
-; CHECK-NEXT: andq %rsi, %rbx
-; CHECK-NEXT: andq %rsi, %rdi
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: shrq $4, %r14
+; CHECK-NEXT: andq %r11, %r14
+; CHECK-NEXT: andq %r11, %rdi
; CHECK-NEXT: shlq $4, %rdi
-; CHECK-NEXT: orq %rbx, %rdi
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: andq %r10, %rbx
+; CHECK-NEXT: orq %r14, %rdi
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: andq %r10, %r14
; CHECK-NEXT: shrq $2, %rdi
; CHECK-NEXT: andq %r10, %rdi
-; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: andq %r14, %rbx
+; CHECK-NEXT: leaq (%rdi,%r14,4), %rdi
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: andq %rbx, %r14
; CHECK-NEXT: shrq %rdi
-; CHECK-NEXT: andq %r14, %rdi
-; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax
+; CHECK-NEXT: andq %rbx, %rdi
+; CHECK-NEXT: leaq (%rdi,%r14,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; CHECK-NEXT: bswapq %rdi
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: shrq $4, %rbx
-; CHECK-NEXT: andq %rsi, %rbx
-; CHECK-NEXT: andq %rsi, %rdi
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: shrq $4, %r14
+; CHECK-NEXT: andq %r11, %r14
+; CHECK-NEXT: andq %r11, %rdi
; CHECK-NEXT: shlq $4, %rdi
-; CHECK-NEXT: orq %rbx, %rdi
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: andq %r10, %rbx
+; CHECK-NEXT: orq %r14, %rdi
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: andq %r10, %r14
; CHECK-NEXT: shrq $2, %rdi
; CHECK-NEXT: andq %r10, %rdi
-; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: andq %r14, %rbx
+; CHECK-NEXT: leaq (%rdi,%r14,4), %rdi
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: andq %rbx, %r14
; CHECK-NEXT: shrq %rdi
-; CHECK-NEXT: andq %r14, %rdi
-; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax
+; CHECK-NEXT: andq %rbx, %rdi
+; CHECK-NEXT: leaq (%rdi,%r14,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; CHECK-NEXT: bswapq %rdi
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: shrq $4, %rbx
-; CHECK-NEXT: andq %rsi, %rbx
-; CHECK-NEXT: andq %rsi, %rdi
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: shrq $4, %r14
+; CHECK-NEXT: andq %r11, %r14
+; CHECK-NEXT: andq %r11, %rdi
; CHECK-NEXT: shlq $4, %rdi
-; CHECK-NEXT: orq %rbx, %rdi
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: andq %r10, %rbx
+; CHECK-NEXT: orq %r14, %rdi
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: andq %r10, %r14
; CHECK-NEXT: shrq $2, %rdi
; CHECK-NEXT: andq %r10, %rdi
-; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: andq %r14, %rbx
+; CHECK-NEXT: leaq (%rdi,%r14,4), %rdi
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: andq %rbx, %r14
; CHECK-NEXT: shrq %rdi
-; CHECK-NEXT: andq %r14, %rdi
-; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax
+; CHECK-NEXT: andq %rbx, %rdi
+; CHECK-NEXT: leaq (%rdi,%r14,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; CHECK-NEXT: bswapq %rdi
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: shrq $4, %rbx
-; CHECK-NEXT: andq %rsi, %rbx
-; CHECK-NEXT: andq %rsi, %rdi
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: shrq $4, %r14
+; CHECK-NEXT: andq %r11, %r14
+; CHECK-NEXT: andq %r11, %rdi
; CHECK-NEXT: shlq $4, %rdi
-; CHECK-NEXT: orq %rbx, %rdi
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: andq %r10, %rbx
+; CHECK-NEXT: orq %r14, %rdi
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: andq %r10, %r14
; CHECK-NEXT: shrq $2, %rdi
; CHECK-NEXT: andq %r10, %rdi
-; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: andq %r14, %rbx
+; CHECK-NEXT: leaq (%rdi,%r14,4), %rdi
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: andq %rbx, %r14
; CHECK-NEXT: shrq %rdi
-; CHECK-NEXT: andq %r14, %rdi
-; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax
+; CHECK-NEXT: andq %rbx, %rdi
+; CHECK-NEXT: leaq (%rdi,%r14,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; CHECK-NEXT: bswapq %rdi
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: shrq $4, %rax
-; CHECK-NEXT: andq %rsi, %rax
-; CHECK-NEXT: andq %rsi, %rdi
+; CHECK-NEXT: andq %r11, %rax
+; CHECK-NEXT: andq %r11, %rdi
; CHECK-NEXT: shlq $4, %rdi
; CHECK-NEXT: orq %rax, %rdi
; CHECK-NEXT: movq %rdi, %rax
@@ -220,15 +219,15 @@ define i1000 @square(i1000 %A) nounwind {
; CHECK-NEXT: andq %r10, %rdi
; CHECK-NEXT: leaq (%rdi,%rax,4), %rax
; CHECK-NEXT: movq %rax, %rdi
-; CHECK-NEXT: andq %r14, %rdi
+; CHECK-NEXT: andq %rbx, %rdi
; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: andq %r14, %rax
+; CHECK-NEXT: andq %rbx, %rax
; CHECK-NEXT: leaq (%rax,%rdi,2), %rdi
; CHECK-NEXT: bswapq %r9
; CHECK-NEXT: movq %r9, %rax
; CHECK-NEXT: shrq $4, %rax
-; CHECK-NEXT: andq %rsi, %rax
-; CHECK-NEXT: andq %rsi, %r9
+; CHECK-NEXT: andq %r11, %rax
+; CHECK-NEXT: andq %r11, %r9
; CHECK-NEXT: shlq $4, %r9
; CHECK-NEXT: orq %rax, %r9
; CHECK-NEXT: movq %r9, %rax
@@ -237,16 +236,16 @@ define i1000 @square(i1000 %A) nounwind {
; CHECK-NEXT: andq %r10, %r9
; CHECK-NEXT: leaq (%r9,%rax,4), %rax
; CHECK-NEXT: movq %rax, %r9
-; CHECK-NEXT: andq %r14, %r9
+; CHECK-NEXT: andq %rbx, %r9
; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: andq %r14, %rax
+; CHECK-NEXT: andq %rbx, %rax
; CHECK-NEXT: leaq (%rax,%r9,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: bswapq %r8
; CHECK-NEXT: movq %r8, %rax
; CHECK-NEXT: shrq $4, %rax
-; CHECK-NEXT: andq %rsi, %rax
-; CHECK-NEXT: andq %rsi, %r8
+; CHECK-NEXT: andq %r11, %rax
+; CHECK-NEXT: andq %r11, %r8
; CHECK-NEXT: shlq $4, %r8
; CHECK-NEXT: orq %rax, %r8
; CHECK-NEXT: movq %r8, %rax
@@ -255,16 +254,16 @@ define i1000 @square(i1000 %A) nounwind {
; CHECK-NEXT: andq %r10, %r8
; CHECK-NEXT: leaq (%r8,%rax,4), %rax
; CHECK-NEXT: movq %rax, %r8
-; CHECK-NEXT: andq %r14, %r8
+; CHECK-NEXT: andq %rbx, %r8
; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: andq %r14, %rax
+; CHECK-NEXT: andq %rbx, %rax
; CHECK-NEXT: leaq (%rax,%r8,2), %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: bswapq %rcx
; CHECK-NEXT: movq %rcx, %rax
; CHECK-NEXT: shrq $4, %rax
-; CHECK-NEXT: andq %rsi, %rax
-; CHECK-NEXT: andq %rsi, %rcx
+; CHECK-NEXT: andq %r11, %rax
+; CHECK-NEXT: andq %r11, %rcx
; CHECK-NEXT: shlq $4, %rcx
; CHECK-NEXT: orq %rax, %rcx
; CHECK-NEXT: movq %rcx, %rax
@@ -273,15 +272,15 @@ define i1000 @square(i1000 %A) nounwind {
; CHECK-NEXT: andq %r10, %rcx
; CHECK-NEXT: leaq (%rcx,%rax,4), %rax
; CHECK-NEXT: movq %rax, %rcx
-; CHECK-NEXT: andq %r14, %rcx
+; CHECK-NEXT: andq %rbx, %rcx
; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: andq %r14, %rax
-; CHECK-NEXT: leaq (%rax,%rcx,2), %rbx
+; CHECK-NEXT: andq %rbx, %rax
+; CHECK-NEXT: leaq (%rax,%rcx,2), %r14
; CHECK-NEXT: bswapq %rdx
; CHECK-NEXT: movq %rdx, %rax
; CHECK-NEXT: shrq $4, %rax
-; CHECK-NEXT: andq %rsi, %rax
-; CHECK-NEXT: andq %rsi, %rdx
+; CHECK-NEXT: andq %r11, %rax
+; CHECK-NEXT: andq %r11, %rdx
; CHECK-NEXT: shlq $4, %rdx
; CHECK-NEXT: orq %rax, %rdx
; CHECK-NEXT: movq %rdx, %rax
@@ -290,30 +289,29 @@ define i1000 @square(i1000 %A) nounwind {
; CHECK-NEXT: andq %r10, %rdx
; CHECK-NEXT: leaq (%rdx,%rax,4), %rax
; CHECK-NEXT: movq %rax, %rdx
-; CHECK-NEXT: andq %r14, %rdx
+; CHECK-NEXT: andq %rbx, %rdx
; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: andq %r14, %rax
+; CHECK-NEXT: andq %rbx, %rax
; CHECK-NEXT: leaq (%rax,%rdx,2), %rdx
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; CHECK-NEXT: bswapq %rcx
-; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: bswapq %rsi
+; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: shrq $4, %rax
-; CHECK-NEXT: andq %rsi, %rax
-; CHECK-NEXT: andq %rsi, %rcx
-; CHECK-NEXT: shlq $4, %rcx
-; CHECK-NEXT: orq %rax, %rcx
-; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: andq %r11, %rax
+; CHECK-NEXT: andq %r11, %rsi
+; CHECK-NEXT: shlq $4, %rsi
+; CHECK-NEXT: orq %rax, %rsi
+; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: andq %r10, %rax
-; CHECK-NEXT: shrq $2, %rcx
-; CHECK-NEXT: andq %r10, %rcx
-; CHECK-NEXT: leaq (%rcx,%rax,4), %rax
+; CHECK-NEXT: shrq $2, %rsi
+; CHECK-NEXT: andq %r10, %rsi
+; CHECK-NEXT: leaq (%rsi,%rax,4), %rax
; CHECK-NEXT: movq %rax, %rsi
-; CHECK-NEXT: andq %r14, %rsi
+; CHECK-NEXT: andq %rbx, %rsi
; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: andq %r14, %rax
+; CHECK-NEXT: andq %rbx, %rax
; CHECK-NEXT: leaq (%rax,%rsi,2), %rsi
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
; CHECK-NEXT: shrdq $24, %rax, %r10
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; CHECK-NEXT: shrdq $24, %rcx, %rax
@@ -327,30 +325,31 @@ define i1000 @square(i1000 %A) nounwind {
; CHECK-NEXT: shrdq $24, %r12, %r13
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
; CHECK-NEXT: shrdq $24, %r15, %r12
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; CHECK-NEXT: shrdq $24, %r14, %r15
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; CHECK-NEXT: shrdq $24, %rbx, %r15
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; CHECK-NEXT: shrdq $24, %r11, %r14
+; CHECK-NEXT: shrdq $24, %r11, %rbx
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
; CHECK-NEXT: shrdq $24, %r9, %r11
; CHECK-NEXT: movq %rdi, %r8
; CHECK-NEXT: shrdq $24, %rdi, %r9
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; CHECK-NEXT: shrdq $24, %rdi, %r8
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; CHECK-NEXT: shrdq $24, %rcx, %rdi
-; CHECK-NEXT: shrdq $24, %rbx, %rcx
-; CHECK-NEXT: shrdq $24, %rdx, %rbx
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: shrdq $24, %rax, %rdi
+; CHECK-NEXT: shrdq $24, %r14, %rax
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: shrdq $24, %rdx, %r14
; CHECK-NEXT: shrdq $24, %rsi, %rdx
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT: movq %rdx, 112(%rax)
-; CHECK-NEXT: movq %rbx, 104(%rax)
+; CHECK-NEXT: movq %r14, 104(%rax)
; CHECK-NEXT: movq %rcx, 96(%rax)
; CHECK-NEXT: movq %rdi, 88(%rax)
; CHECK-NEXT: movq %r8, 80(%rax)
; CHECK-NEXT: movq %r9, 72(%rax)
; CHECK-NEXT: movq %r11, 64(%rax)
-; CHECK-NEXT: movq %r14, 56(%rax)
+; CHECK-NEXT: movq %rbx, 56(%rax)
; CHECK-NEXT: movq %r15, 48(%rax)
; CHECK-NEXT: movq %r12, 40(%rax)
; CHECK-NEXT: movq %r13, 32(%rax)
diff --git a/llvm/test/CodeGen/X86/pr49162.ll b/llvm/test/CodeGen/X86/pr49162.ll
index 0e65e121531bf..db8cec61acd6b 100644
--- a/llvm/test/CodeGen/X86/pr49162.ll
+++ b/llvm/test/CodeGen/X86/pr49162.ll
@@ -17,10 +17,7 @@ define ptr @PR49162(ptr %base, ptr %ptr160) {
;
; X64-LABEL: PR49162:
; X64: # %bb.0:
-; X64-NEXT: movl 8(%rsi), %eax
-; X64-NEXT: shll $16, %eax
-; X64-NEXT: cltq
-; X64-NEXT: sarq $16, %rax
+; X64-NEXT: movswq 8(%rsi), %rax
; X64-NEXT: leaq (%rdi,%rax,4), %rax
; X64-NEXT: retq
%load160 = load i160, ptr %ptr160, align 4
diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll
index 26e68861cf45c..b5332068d7edd 100644
--- a/llvm/test/CodeGen/X86/rotate-extract.ll
+++ b/llvm/test/CodeGen/X86/rotate-extract.ll
@@ -13,12 +13,11 @@ define i64 @rolq_extract_shl(i64 %i) nounwind {
; X86-LABEL: rolq_extract_shl:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: shldl $3, %edx, %ecx
-; X86-NEXT: shll $3, %eax
; X86-NEXT: shll $3, %edx
-; X86-NEXT: shrdl $25, %edx, %eax
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shldl $7, %ecx, %eax
; X86-NEXT: shrdl $25, %ecx, %edx
; X86-NEXT: retl
;
diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll
index d03d0727b15df..393e05bfd0cc6 100644
--- a/llvm/test/CodeGen/X86/scmp.ll
+++ b/llvm/test/CodeGen/X86/scmp.ll
@@ -2611,34 +2611,35 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; SSE2-NEXT: andq %r13, %rbp
; SSE2-NEXT: shldq $62, %rdi, %r13
; SSE2-NEXT: movq %r13, 88(%rax)
-; SSE2-NEXT: movq %r10, %r13
-; SSE2-NEXT: shldq $20, %r9, %r13
+; SSE2-NEXT: movq %r9, %r13
+; SSE2-NEXT: shrdq $44, %r10, %r13
; SSE2-NEXT: movq %r13, 64(%rax)
-; SSE2-NEXT: movq %r11, %r13
-; SSE2-NEXT: shldq $31, %r8, %r13
+; SSE2-NEXT: movq %r8, %r13
+; SSE2-NEXT: shrdq $33, %r11, %r13
; SSE2-NEXT: movq %r13, 48(%rax)
-; SSE2-NEXT: movq %rcx, %r13
-; SSE2-NEXT: shldq $42, %rbx, %r13
+; SSE2-NEXT: movq %rbx, %r13
+; SSE2-NEXT: shrdq $22, %rcx, %r13
; SSE2-NEXT: movq %r13, 32(%rax)
-; SSE2-NEXT: movabsq $9007199254738944, %r13 # imm = 0x1FFFFFFFFFF800
-; SSE2-NEXT: andq %r12, %r13
-; SSE2-NEXT: shldq $53, %rdx, %r12
-; SSE2-NEXT: movq %r12, 16(%rax)
-; SSE2-NEXT: movq %rbp, %r12
-; SSE2-NEXT: shrq $48, %r12
-; SSE2-NEXT: movb %r12b, 102(%rax)
+; SSE2-NEXT: movq %rdx, %r13
+; SSE2-NEXT: shrdq $11, %r12, %r13
+; SSE2-NEXT: movq %r13, 16(%rax)
+; SSE2-NEXT: movq %rbp, %r13
+; SSE2-NEXT: shrq $48, %r13
+; SSE2-NEXT: movb %r13b, 102(%rax)
; SSE2-NEXT: shrq $32, %rbp
; SSE2-NEXT: movw %bp, 100(%rax)
-; SSE2-NEXT: movabsq $9007199254740991, %r12 # imm = 0x1FFFFFFFFFFFFF
-; SSE2-NEXT: andq %r12, %r15
+; SSE2-NEXT: movabsq $9007199254740991, %r13 # imm = 0x1FFFFFFFFFFFFF
+; SSE2-NEXT: andq %r13, %r15
; SSE2-NEXT: shldq $9, %r14, %r15
; SSE2-NEXT: shlq $62, %rdi
; SSE2-NEXT: orq %r15, %rdi
; SSE2-NEXT: movq %rdi, 80(%rax)
+; SSE2-NEXT: movabsq $9007199254738944, %rdi # imm = 0x1FFFFFFFFFF800
+; SSE2-NEXT: andq %r12, %rdi
; SSE2-NEXT: shlq $42, %rbx
-; SSE2-NEXT: shrq $11, %r13
-; SSE2-NEXT: orq %rbx, %r13
-; SSE2-NEXT: movq %r13, 24(%rax)
+; SSE2-NEXT: shrq $11, %rdi
+; SSE2-NEXT: orq %rbx, %rdi
+; SSE2-NEXT: movq %rdi, 24(%rax)
; SSE2-NEXT: shlq $9, %r14
; SSE2-NEXT: andl $511, %r10d # imm = 0x1FF
; SSE2-NEXT: orq %r14, %r10
@@ -2655,7 +2656,7 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rcx
-; SSE2-NEXT: andq %r12, %rcx
+; SSE2-NEXT: andq %r13, %rcx
; SSE2-NEXT: shlq $53, %rdx
; SSE2-NEXT: orq %rcx, %rdx
; SSE2-NEXT: movq %rdx, 8(%rax)
@@ -2704,8 +2705,8 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; SSE4-NEXT: setg %r15b
; SSE4-NEXT: subb %sil, %r15b
; SSE4-NEXT: movsbq %r15b, %rsi
-; SSE4-NEXT: movq %rsi, %r15
-; SSE4-NEXT: sarq $63, %r15
+; SSE4-NEXT: movq %rsi, %r12
+; SSE4-NEXT: sarq $63, %r12
; SSE4-NEXT: addb %bpl, %bpl
; SSE4-NEXT: sarb %bpl
; SSE4-NEXT: addb %dl, %dl
@@ -2714,9 +2715,9 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; SSE4-NEXT: setl %dl
; SSE4-NEXT: setg %bpl
; SSE4-NEXT: subb %dl, %bpl
-; SSE4-NEXT: movsbq %bpl, %r12
-; SSE4-NEXT: movq %r12, %r13
-; SSE4-NEXT: sarq $63, %r13
+; SSE4-NEXT: movsbq %bpl, %r13
+; SSE4-NEXT: movq %r13, %r15
+; SSE4-NEXT: sarq $63, %r15
; SSE4-NEXT: addb %bl, %bl
; SSE4-NEXT: sarb %bl
; SSE4-NEXT: addb %cl, %cl
@@ -2760,45 +2761,46 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; SSE4-NEXT: setg %r11b
; SSE4-NEXT: subb %dil, %r11b
; SSE4-NEXT: movsbq %r11b, %rdi
-; SSE4-NEXT: movq %rdi, %rbp
-; SSE4-NEXT: sarq $63, %rbp
-; SSE4-NEXT: movl %ebp, 96(%rax)
-; SSE4-NEXT: movabsq $2251799813685247, %r11 # imm = 0x7FFFFFFFFFFFF
-; SSE4-NEXT: andq %rbp, %r11
-; SSE4-NEXT: shldq $62, %rdi, %rbp
-; SSE4-NEXT: movq %rbp, 88(%rax)
-; SSE4-NEXT: movq %r10, %rbp
-; SSE4-NEXT: shldq $20, %r9, %rbp
-; SSE4-NEXT: movq %rbp, 64(%rax)
-; SSE4-NEXT: movq %r8, %rbp
-; SSE4-NEXT: shldq $31, %rdx, %rbp
-; SSE4-NEXT: movq %rbp, 48(%rax)
-; SSE4-NEXT: movq %rcx, %rbp
-; SSE4-NEXT: shldq $42, %rbx, %rbp
-; SSE4-NEXT: movq %rbp, 32(%rax)
-; SSE4-NEXT: movabsq $9007199254738944, %rbp # imm = 0x1FFFFFFFFFF800
-; SSE4-NEXT: andq %r13, %rbp
-; SSE4-NEXT: shldq $53, %r12, %r13
-; SSE4-NEXT: movq %r13, 16(%rax)
-; SSE4-NEXT: movq %r11, %r13
-; SSE4-NEXT: shrq $48, %r13
-; SSE4-NEXT: movb %r13b, 102(%rax)
-; SSE4-NEXT: shrq $32, %r11
-; SSE4-NEXT: movw %r11w, 100(%rax)
+; SSE4-NEXT: movq %rdi, %r11
+; SSE4-NEXT: sarq $63, %r11
+; SSE4-NEXT: movl %r11d, 96(%rax)
+; SSE4-NEXT: movabsq $2251799813685247, %rbp # imm = 0x7FFFFFFFFFFFF
+; SSE4-NEXT: andq %r11, %rbp
+; SSE4-NEXT: shldq $62, %rdi, %r11
+; SSE4-NEXT: movq %r11, 88(%rax)
+; SSE4-NEXT: movq %r9, %r11
+; SSE4-NEXT: shrdq $44, %r10, %r11
+; SSE4-NEXT: movq %r11, 64(%rax)
+; SSE4-NEXT: movq %rdx, %r11
+; SSE4-NEXT: shrdq $33, %r8, %r11
+; SSE4-NEXT: movq %r11, 48(%rax)
+; SSE4-NEXT: movq %rbx, %r11
+; SSE4-NEXT: shrdq $22, %rcx, %r11
+; SSE4-NEXT: movq %r11, 32(%rax)
+; SSE4-NEXT: movq %r13, %r11
+; SSE4-NEXT: shrdq $11, %r15, %r11
+; SSE4-NEXT: movq %r11, 16(%rax)
+; SSE4-NEXT: movq %rbp, %r11
+; SSE4-NEXT: shrq $48, %r11
+; SSE4-NEXT: movb %r11b, 102(%rax)
+; SSE4-NEXT: shrq $32, %rbp
+; SSE4-NEXT: movw %bp, 100(%rax)
; SSE4-NEXT: movabsq $9007199254740991, %r11 # imm = 0x1FFFFFFFFFFFFF
-; SSE4-NEXT: andq %r11, %r15
-; SSE4-NEXT: shldq $9, %rsi, %r15
+; SSE4-NEXT: andq %r11, %r12
+; SSE4-NEXT: shldq $9, %rsi, %r12
; SSE4-NEXT: shlq $62, %rdi
-; SSE4-NEXT: orq %r15, %rdi
+; SSE4-NEXT: orq %r12, %rdi
; SSE4-NEXT: movq %rdi, 80(%rax)
; SSE4-NEXT: andq %r11, %r14
-; SSE4-NEXT: shlq $53, %r12
-; SSE4-NEXT: orq %r14, %r12
-; SSE4-NEXT: movq %r12, 8(%rax)
+; SSE4-NEXT: shlq $53, %r13
+; SSE4-NEXT: orq %r14, %r13
+; SSE4-NEXT: movq %r13, 8(%rax)
; SSE4-NEXT: shlq $42, %rbx
-; SSE4-NEXT: shrq $11, %rbp
-; SSE4-NEXT: orq %rbx, %rbp
-; SSE4-NEXT: movq %rbp, 24(%rax)
+; SSE4-NEXT: movabsq $9007199254738944, %rdi # imm = 0x1FFFFFFFFFF800
+; SSE4-NEXT: andq %r15, %rdi
+; SSE4-NEXT: shrq $11, %rdi
+; SSE4-NEXT: orq %rbx, %rdi
+; SSE4-NEXT: movq %rdi, 24(%rax)
; SSE4-NEXT: shlq $9, %rsi
; SSE4-NEXT: andl $511, %r10d # imm = 0x1FF
; SSE4-NEXT: orq %rsi, %r10
@@ -2912,39 +2914,40 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; AVX-NEXT: setg %r11b
; AVX-NEXT: subb %dil, %r11b
; AVX-NEXT: movsbq %r11b, %rdi
-; AVX-NEXT: movq %rdi, %rbp
-; AVX-NEXT: sarq $63, %rbp
-; AVX-NEXT: movl %ebp, 96(%rax)
-; AVX-NEXT: movb $51, %r11b
-; AVX-NEXT: bzhiq %r11, %rbp, %r11
-; AVX-NEXT: shldq $62, %rdi, %rbp
-; AVX-NEXT: movq %rbp, 88(%rax)
-; AVX-NEXT: movq %r10, %rbp
-; AVX-NEXT: shldq $20, %r9, %rbp
-; AVX-NEXT: movq %rbp, 64(%rax)
-; AVX-NEXT: movq %r8, %rbp
-; AVX-NEXT: shldq $31, %rdx, %rbp
-; AVX-NEXT: movq %rbp, 48(%rax)
-; AVX-NEXT: movq %rcx, %rbp
-; AVX-NEXT: shldq $42, %rbx, %rbp
-; AVX-NEXT: movq %rbp, 32(%rax)
-; AVX-NEXT: movb $42, %bpl
-; AVX-NEXT: bzhiq %rbp, %r13, %rbp
-; AVX-NEXT: shldq $53, %r15, %r13
-; AVX-NEXT: movq %r13, 16(%rax)
-; AVX-NEXT: movq %r11, %r13
-; AVX-NEXT: shrq $48, %r13
-; AVX-NEXT: movb %r13b, 102(%rax)
-; AVX-NEXT: shrq $32, %r11
-; AVX-NEXT: movw %r11w, 100(%rax)
+; AVX-NEXT: movq %rdi, %r11
+; AVX-NEXT: sarq $63, %r11
+; AVX-NEXT: movl %r11d, 96(%rax)
+; AVX-NEXT: movb $51, %bpl
+; AVX-NEXT: bzhiq %rbp, %r11, %rbp
+; AVX-NEXT: shldq $62, %rdi, %r11
+; AVX-NEXT: movq %r11, 88(%rax)
+; AVX-NEXT: movq %r9, %r11
+; AVX-NEXT: shrdq $44, %r10, %r11
+; AVX-NEXT: movq %r11, 64(%rax)
+; AVX-NEXT: movq %rdx, %r11
+; AVX-NEXT: shrdq $33, %r8, %r11
+; AVX-NEXT: movq %r11, 48(%rax)
+; AVX-NEXT: movq %rbx, %r11
+; AVX-NEXT: shrdq $22, %rcx, %r11
+; AVX-NEXT: movq %r11, 32(%rax)
+; AVX-NEXT: movq %r15, %r11
+; AVX-NEXT: shrdq $11, %r13, %r11
+; AVX-NEXT: movq %r11, 16(%rax)
+; AVX-NEXT: movq %rbp, %r11
+; AVX-NEXT: shrq $48, %r11
+; AVX-NEXT: movb %r11b, 102(%rax)
+; AVX-NEXT: shrq $32, %rbp
+; AVX-NEXT: movw %bp, 100(%rax)
; AVX-NEXT: movb $53, %r11b
; AVX-NEXT: bzhiq %r11, %r12, %r12
; AVX-NEXT: shldq $9, %rsi, %r12
; AVX-NEXT: shlq $62, %rdi
; AVX-NEXT: orq %r12, %rdi
; AVX-NEXT: movq %rdi, 80(%rax)
+; AVX-NEXT: movb $42, %dil
+; AVX-NEXT: bzhiq %rdi, %r13, %rdi
; AVX-NEXT: shlq $42, %rbx
-; AVX-NEXT: orq %rbp, %rbx
+; AVX-NEXT: orq %rdi, %rbx
; AVX-NEXT: movq %rbx, 24(%rax)
; AVX-NEXT: bzhiq %r11, %r14, %rdi
; AVX-NEXT: shlq $53, %r15
@@ -3039,10 +3042,12 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; X86-NEXT: setl %cl
; X86-NEXT: setg %ch
; X86-NEXT: subb %cl, %ch
-; X86-NEXT: movsbl %ch, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sarl $31, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movsbl %ch, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: andl $2097151, %ecx # imm = 0x1FFFFF
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: cmpb %al, %ah
@@ -3050,8 +3055,8 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; X86-NEXT: setg %cl
; X86-NEXT: subb %al, %cl
; X86-NEXT: movsbl %cl, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %ecx, (%edi)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: sarl $31, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl $2097151, %eax # imm = 0x1FFFFF
@@ -3068,9 +3073,9 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; X86-NEXT: setl %al
; X86-NEXT: setg %dl
; X86-NEXT: subb %al, %dl
-; X86-NEXT: movsbl %dl, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sarl $31, %esi
+; X86-NEXT: movsbl %dl, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sarl $31, %edi
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
; X86-NEXT: setl %al
@@ -3084,81 +3089,89 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; X86-NEXT: setl %dl
; X86-NEXT: setg %dh
; X86-NEXT: subb %dl, %dh
-; X86-NEXT: movsbl %dh, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sarl $31, %ebx
-; X86-NEXT: movl %ebx, 96(%edi)
-; X86-NEXT: movl %ebx, 92(%edi)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, 80(%edi)
-; X86-NEXT: movl %eax, 68(%edi)
-; X86-NEXT: movl %eax, 64(%edi)
-; X86-NEXT: movl %esi, 52(%edi)
-; X86-NEXT: movl %esi, 48(%edi)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, 36(%edi)
-; X86-NEXT: movl %ebp, 24(%edi)
-; X86-NEXT: movl %ebp, 20(%edi)
-; X86-NEXT: movl %ecx, 8(%edi)
-; X86-NEXT: movl %ecx, 4(%edi)
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movw %cx, 100(%edi)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $30, %edx, %ecx
-; X86-NEXT: movl %ecx, 88(%edi)
+; X86-NEXT: movsbl %dh, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %edx, 96(%esi)
+; X86-NEXT: movl %edx, 92(%esi)
+; X86-NEXT: movl %ebx, 80(%esi)
+; X86-NEXT: movl %eax, 68(%esi)
+; X86-NEXT: movl %eax, 64(%esi)
+; X86-NEXT: movl %edi, 52(%esi)
+; X86-NEXT: movl %edi, 48(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, 36(%esi)
+; X86-NEXT: movl %ebp, 24(%esi)
+; X86-NEXT: movl %ebp, 20(%esi)
+; X86-NEXT: movl %ecx, 8(%esi)
+; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movw %dx, 100(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shrdl $2, %edx, %ecx
+; X86-NEXT: movl %ecx, 88(%esi)
+; X86-NEXT: movl %esi, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: shldl $9, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $9, %edx, %ecx
-; X86-NEXT: movl %ecx, 76(%edi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $9, %esi, %ecx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %ecx, 76(%edx)
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: shldl $20, %edx, %ecx
-; X86-NEXT: movl %ecx, 60(%edi)
+; X86-NEXT: movl %ecx, 60(%esi)
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $31, %esi, %ecx
+; X86-NEXT: movl %ecx, 44(%ebx)
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $31, %edx, %ecx
-; X86-NEXT: movl %ecx, 44(%edi)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $10, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $10, %edx, %ecx
-; X86-NEXT: movl %ecx, 32(%edi)
-; X86-NEXT: movl %ebp, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $21, %ebx, %ecx
-; X86-NEXT: movl %ecx, 16(%edi)
+; X86-NEXT: shrdl $22, %ebx, %ecx
+; X86-NEXT: movl %ecx, 32(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: shrdl $11, %ebp, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %ecx, 16(%edx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: shll $9, %ecx
; X86-NEXT: andl $511, %eax # imm = 0x1FF
; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: movl %eax, 72(%edi)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %eax, 72(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shll $20, %eax
-; X86-NEXT: andl $1048575, %esi # imm = 0xFFFFF
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl %esi, 56(%edi)
-; X86-NEXT: shll $10, %edx
+; X86-NEXT: andl $1048575, %edi # imm = 0xFFFFF
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl %edi, 56(%ecx)
+; X86-NEXT: shll $10, %esi
; X86-NEXT: andl $1023, %ebp # imm = 0x3FF
-; X86-NEXT: orl %edx, %ebp
-; X86-NEXT: movl %ebp, 28(%edi)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl %esi, %ebp
+; X86-NEXT: movl %ebp, 28(%ecx)
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: shll $21, %eax
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, 12(%edi)
+; X86-NEXT: movl %eax, 12(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: andl $7, %eax
-; X86-NEXT: movb %al, 102(%edi)
+; X86-NEXT: movb %al, 102(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shll $30, %eax
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, 84(%edi)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shll $31, %eax
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, 40(%edi)
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %eax, 84(%ecx)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $10, %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shll $31, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl %ecx, 40(%eax)
; X86-NEXT: addl $52, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -3267,34 +3280,35 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; SETZUCC-NEXT: andq %r13, %rbp
; SETZUCC-NEXT: shldq $62, %rdi, %r13
; SETZUCC-NEXT: movq %r13, 88(%rax)
-; SETZUCC-NEXT: movq %r10, %r13
-; SETZUCC-NEXT: shldq $20, %r9, %r13
+; SETZUCC-NEXT: movq %r9, %r13
+; SETZUCC-NEXT: shrdq $44, %r10, %r13
; SETZUCC-NEXT: movq %r13, 64(%rax)
-; SETZUCC-NEXT: movq %r11, %r13
-; SETZUCC-NEXT: shldq $31, %r8, %r13
+; SETZUCC-NEXT: movq %r8, %r13
+; SETZUCC-NEXT: shrdq $33, %r11, %r13
; SETZUCC-NEXT: movq %r13, 48(%rax)
-; SETZUCC-NEXT: movq %rcx, %r13
-; SETZUCC-NEXT: shldq $42, %rbx, %r13
+; SETZUCC-NEXT: movq %rbx, %r13
+; SETZUCC-NEXT: shrdq $22, %rcx, %r13
; SETZUCC-NEXT: movq %r13, 32(%rax)
-; SETZUCC-NEXT: movabsq $9007199254738944, %r13 # imm = 0x1FFFFFFFFFF800
-; SETZUCC-NEXT: andq %r12, %r13
-; SETZUCC-NEXT: shldq $53, %rdx, %r12
-; SETZUCC-NEXT: movq %r12, 16(%rax)
-; SETZUCC-NEXT: movq %rbp, %r12
-; SETZUCC-NEXT: shrq $48, %r12
-; SETZUCC-NEXT: movb %r12b, 102(%rax)
+; SETZUCC-NEXT: movq %rdx, %r13
+; SETZUCC-NEXT: shrdq $11, %r12, %r13
+; SETZUCC-NEXT: movq %r13, 16(%rax)
+; SETZUCC-NEXT: movq %rbp, %r13
+; SETZUCC-NEXT: shrq $48, %r13
+; SETZUCC-NEXT: movb %r13b, 102(%rax)
; SETZUCC-NEXT: shrq $32, %rbp
; SETZUCC-NEXT: movw %bp, 100(%rax)
-; SETZUCC-NEXT: movabsq $9007199254740991, %r12 # imm = 0x1FFFFFFFFFFFFF
-; SETZUCC-NEXT: andq %r12, %r15
+; SETZUCC-NEXT: movabsq $9007199254740991, %r13 # imm = 0x1FFFFFFFFFFFFF
+; SETZUCC-NEXT: andq %r13, %r15
; SETZUCC-NEXT: shldq $9, %r14, %r15
; SETZUCC-NEXT: shlq $62, %rdi
; SETZUCC-NEXT: orq %r15, %rdi
; SETZUCC-NEXT: movq %rdi, 80(%rax)
+; SETZUCC-NEXT: movabsq $9007199254738944, %rdi # imm = 0x1FFFFFFFFFF800
+; SETZUCC-NEXT: andq %r12, %rdi
; SETZUCC-NEXT: shlq $42, %rbx
-; SETZUCC-NEXT: shrq $11, %r13
-; SETZUCC-NEXT: orq %rbx, %r13
-; SETZUCC-NEXT: movq %r13, 24(%rax)
+; SETZUCC-NEXT: shrq $11, %rdi
+; SETZUCC-NEXT: orq %rbx, %rdi
+; SETZUCC-NEXT: movq %rdi, 24(%rax)
; SETZUCC-NEXT: shlq $9, %r14
; SETZUCC-NEXT: andl $511, %r10d # imm = 0x1FF
; SETZUCC-NEXT: orq %r14, %r10
@@ -3311,7 +3325,7 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; SETZUCC-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SETZUCC-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SETZUCC-NEXT: movq %xmm0, %rcx
-; SETZUCC-NEXT: andq %r12, %rcx
+; SETZUCC-NEXT: andq %r13, %rcx
; SETZUCC-NEXT: shlq $53, %rdx
; SETZUCC-NEXT: orq %rcx, %rdx
; SETZUCC-NEXT: movq %rdx, 8(%rax)
@@ -3424,34 +3438,35 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; NO-SETZUCC-NEXT: andq %r13, %rbp
; NO-SETZUCC-NEXT: shldq $62, %rdi, %r13
; NO-SETZUCC-NEXT: movq %r13, 88(%rax)
-; NO-SETZUCC-NEXT: movq %r10, %r13
-; NO-SETZUCC-NEXT: shldq $20, %r9, %r13
+; NO-SETZUCC-NEXT: movq %r9, %r13
+; NO-SETZUCC-NEXT: shrdq $44, %r10, %r13
; NO-SETZUCC-NEXT: movq %r13, 64(%rax)
-; NO-SETZUCC-NEXT: movq %r11, %r13
-; NO-SETZUCC-NEXT: shldq $31, %r8, %r13
+; NO-SETZUCC-NEXT: movq %r8, %r13
+; NO-SETZUCC-NEXT: shrdq $33, %r11, %r13
; NO-SETZUCC-NEXT: movq %r13, 48(%rax)
-; NO-SETZUCC-NEXT: movq %rcx, %r13
-; NO-SETZUCC-NEXT: shldq $42, %rbx, %r13
+; NO-SETZUCC-NEXT: movq %rbx, %r13
+; NO-SETZUCC-NEXT: shrdq $22, %rcx, %r13
; NO-SETZUCC-NEXT: movq %r13, 32(%rax)
-; NO-SETZUCC-NEXT: movabsq $9007199254738944, %r13 # imm = 0x1FFFFFFFFFF800
-; NO-SETZUCC-NEXT: andq %r12, %r13
-; NO-SETZUCC-NEXT: shldq $53, %rdx, %r12
-; NO-SETZUCC-NEXT: movq %r12, 16(%rax)
-; NO-SETZUCC-NEXT: movq %rbp, %r12
-; NO-SETZUCC-NEXT: shrq $48, %r12
-; NO-SETZUCC-NEXT: movb %r12b, 102(%rax)
+; NO-SETZUCC-NEXT: movq %rdx, %r13
+; NO-SETZUCC-NEXT: shrdq $11, %r12, %r13
+; NO-SETZUCC-NEXT: movq %r13, 16(%rax)
+; NO-SETZUCC-NEXT: movq %rbp, %r13
+; NO-SETZUCC-NEXT: shrq $48, %r13
+; NO-SETZUCC-NEXT: movb %r13b, 102(%rax)
; NO-SETZUCC-NEXT: shrq $32, %rbp
; NO-SETZUCC-NEXT: movw %bp, 100(%rax)
-; NO-SETZUCC-NEXT: movabsq $9007199254740991, %r12 # imm = 0x1FFFFFFFFFFFFF
-; NO-SETZUCC-NEXT: andq %r12, %r15
+; NO-SETZUCC-NEXT: movabsq $9007199254740991, %r13 # imm = 0x1FFFFFFFFFFFFF
+; NO-SETZUCC-NEXT: andq %r13, %r15
; NO-SETZUCC-NEXT: shldq $9, %r14, %r15
; NO-SETZUCC-NEXT: shlq $62, %rdi
; NO-SETZUCC-NEXT: orq %r15, %rdi
; NO-SETZUCC-NEXT: movq %rdi, 80(%rax)
+; NO-SETZUCC-NEXT: movabsq $9007199254738944, %rdi # imm = 0x1FFFFFFFFFF800
+; NO-SETZUCC-NEXT: andq %r12, %rdi
; NO-SETZUCC-NEXT: shlq $42, %rbx
-; NO-SETZUCC-NEXT: shrq $11, %r13
-; NO-SETZUCC-NEXT: orq %rbx, %r13
-; NO-SETZUCC-NEXT: movq %r13, 24(%rax)
+; NO-SETZUCC-NEXT: shrq $11, %rdi
+; NO-SETZUCC-NEXT: orq %rbx, %rdi
+; NO-SETZUCC-NEXT: movq %rdi, 24(%rax)
; NO-SETZUCC-NEXT: shlq $9, %r14
; NO-SETZUCC-NEXT: andl $511, %r10d # imm = 0x1FF
; NO-SETZUCC-NEXT: orq %r14, %r10
@@ -3468,7 +3483,7 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; NO-SETZUCC-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; NO-SETZUCC-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; NO-SETZUCC-NEXT: movq %xmm0, %rcx
-; NO-SETZUCC-NEXT: andq %r12, %rcx
+; NO-SETZUCC-NEXT: andq %r13, %rcx
; NO-SETZUCC-NEXT: shlq $53, %rdx
; NO-SETZUCC-NEXT: orq %rcx, %rdx
; NO-SETZUCC-NEXT: movq %rdx, 8(%rax)
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index 7df490f984928..d929f48fa7146 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -558,7 +558,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: pushq %r13
; X64-NEXT: pushq %r12
; X64-NEXT: pushq %rbx
-; X64-NEXT: subq $120, %rsp
+; X64-NEXT: subq $104, %rsp
; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: pxor %xmm2, %xmm2
@@ -568,108 +568,106 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X64-NEXT: psrad $31, %xmm2
; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X64-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: movq %xmm3, %rbx
-; X64-NEXT: movq %rbx, %r13
-; X64-NEXT: sarq $63, %r13
-; X64-NEXT: shldq $31, %rbx, %r13
+; X64-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill
+; X64-NEXT: movq %xmm3, %r15
+; X64-NEXT: movq %r15, %rbx
+; X64-NEXT: sarq $63, %rbx
+; X64-NEXT: shldq $31, %r15, %rbx
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pcmpgtd %xmm1, %xmm0
; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: movq %xmm1, %rdx
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: sarq $63, %r15
-; X64-NEXT: movq %rbx, %r12
-; X64-NEXT: shlq $31, %r12
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %xmm1, %r12
+; X64-NEXT: movq %r12, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: shlq $31, %r15
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %r12, %rdx
+; X64-NEXT: movq %r14, %rcx
; X64-NEXT: callq __divti3 at PLT
-; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: movq %rax, %r13
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %rbp
-; X64-NEXT: sbbq $0, %r14
-; X64-NEXT: shrq $63, %rbx
-; X64-NEXT: xorl %r15d, %ebx
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: subq $1, %r13
+; X64-NEXT: sbbq $0, %rbp
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %r12, %rdx
+; X64-NEXT: movq %r14, %rcx
; X64-NEXT: callq __modti3 at PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
-; X64-NEXT: testb %bl, %al
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT: shrq $63, %rbx
+; X64-NEXT: xorl %ebx, %r14d
+; X64-NEXT: testb %r14b, %al
; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rdx, %rbp
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: cmpq %rdx, %r13
+; X64-NEXT: movq %rbp, %rax
; X64-NEXT: sbbq $0, %rax
-; X64-NEXT: cmovgeq %rcx, %r14
-; X64-NEXT: cmovgeq %rdx, %rbp
+; X64-NEXT: cmovgeq %rcx, %rbp
+; X64-NEXT: cmovgeq %rdx, %r13
; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %rbp, %rcx
+; X64-NEXT: cmpq %r13, %rcx
; X64-NEXT: movq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; X64-NEXT: movq $-1, %rax
-; X64-NEXT: sbbq %r14, %rax
-; X64-NEXT: cmovgeq %rcx, %rbp
-; X64-NEXT: movq %rbp, %xmm0
+; X64-NEXT: sbbq %rbp, %rax
+; X64-NEXT: cmovgeq %rcx, %r13
+; X64-NEXT: movq %r13, %xmm0
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
-; X64-NEXT: movq %xmm0, %rbx
-; X64-NEXT: movq %rbx, %r13
-; X64-NEXT: sarq $63, %r13
-; X64-NEXT: shldq $31, %rbx, %r13
+; X64-NEXT: movq %xmm0, %r15
+; X64-NEXT: movq %r15, %rbx
+; X64-NEXT: sarq $63, %rbx
+; X64-NEXT: shldq $31, %r15, %rbx
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
-; X64-NEXT: movq %xmm0, %rdx
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: sarq $63, %r15
-; X64-NEXT: movq %rbx, %r12
-; X64-NEXT: shlq $31, %r12
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %xmm0, %r12
+; X64-NEXT: movq %r12, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: shlq $31, %r15
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %r12, %rdx
+; X64-NEXT: movq %r14, %rcx
; X64-NEXT: callq __divti3 at PLT
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill
+; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %rbp
-; X64-NEXT: sbbq $0, %r14
-; X64-NEXT: shrq $63, %rbx
-; X64-NEXT: xorl %r15d, %ebx
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: subq $1, %r13
+; X64-NEXT: sbbq $0, %rbp
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %r12, %rdx
+; X64-NEXT: movq %r14, %rcx
; X64-NEXT: callq __modti3 at PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
-; X64-NEXT: testb %bl, %al
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT: shrq $63, %rbx
+; X64-NEXT: xorl %ebx, %r14d
+; X64-NEXT: testb %r14b, %al
; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT: cmoveq (%rsp), %r13 # 8-byte Folded Reload
; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rcx, %rbp
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: cmpq %rcx, %r13
+; X64-NEXT: movq %rbp, %rax
; X64-NEXT: sbbq $0, %rax
; X64-NEXT: movl $0, %eax
-; X64-NEXT: cmovgeq %rax, %r14
-; X64-NEXT: cmovgeq %rcx, %rbp
+; X64-NEXT: cmovgeq %rax, %rbp
+; X64-NEXT: cmovgeq %rcx, %r13
; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %rbp, %rcx
+; X64-NEXT: cmpq %r13, %rcx
; X64-NEXT: movq $-1, %rax
-; X64-NEXT: sbbq %r14, %rax
-; X64-NEXT: cmovgeq %rcx, %rbp
-; X64-NEXT: movq %rbp, %xmm0
+; X64-NEXT: sbbq %rbp, %rax
+; X64-NEXT: cmovgeq %rcx, %r13
+; X64-NEXT: movq %r13, %xmm0
; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-NEXT: psrlq $1, %xmm1
@@ -682,111 +680,109 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: psrad $31, %xmm1
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: movq %xmm0, %rbx
-; X64-NEXT: movq %rbx, %r13
-; X64-NEXT: sarq $63, %r13
-; X64-NEXT: shldq $31, %rbx, %r13
+; X64-NEXT: movq %xmm0, %r15
+; X64-NEXT: movq %r15, %rbx
+; X64-NEXT: sarq $63, %rbx
+; X64-NEXT: shldq $31, %r15, %rbx
; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-NEXT: pxor %xmm1, %xmm1
; X64-NEXT: pcmpgtd %xmm0, %xmm1
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: movq %xmm0, %rdx
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: sarq $63, %r15
-; X64-NEXT: movq %rbx, %r12
-; X64-NEXT: shlq $31, %r12
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %xmm0, %r12
+; X64-NEXT: movq %r12, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: shlq $31, %r15
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %r12, %rdx
+; X64-NEXT: movq %r14, %rcx
; X64-NEXT: callq __divti3 at PLT
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill
+; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %rbp
-; X64-NEXT: sbbq $0, %r14
-; X64-NEXT: shrq $63, %rbx
-; X64-NEXT: xorl %r15d, %ebx
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: subq $1, %r13
+; X64-NEXT: sbbq $0, %rbp
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %r12, %rdx
+; X64-NEXT: movq %r14, %rcx
; X64-NEXT: callq __modti3 at PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
-; X64-NEXT: testb %bl, %al
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT: shrq $63, %rbx
+; X64-NEXT: xorl %ebx, %r14d
+; X64-NEXT: testb %r14b, %al
; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT: cmoveq (%rsp), %r13 # 8-byte Folded Reload
; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rcx, %rbp
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: cmpq %rcx, %r13
+; X64-NEXT: movq %rbp, %rax
; X64-NEXT: sbbq $0, %rax
; X64-NEXT: movl $0, %eax
-; X64-NEXT: cmovgeq %rax, %r14
-; X64-NEXT: cmovgeq %rcx, %rbp
+; X64-NEXT: cmovgeq %rax, %rbp
+; X64-NEXT: cmovgeq %rcx, %r13
; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %rbp, %rcx
+; X64-NEXT: cmpq %r13, %rcx
; X64-NEXT: movq $-1, %rax
-; X64-NEXT: sbbq %r14, %rax
-; X64-NEXT: cmovgeq %rcx, %rbp
-; X64-NEXT: movq %rbp, %xmm0
-; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: sbbq %rbp, %rax
+; X64-NEXT: cmovgeq %rcx, %r13
+; X64-NEXT: movq %r13, %xmm0
+; X64-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
-; X64-NEXT: movq %xmm0, %rbx
-; X64-NEXT: movq %rbx, %r13
-; X64-NEXT: sarq $63, %r13
-; X64-NEXT: shldq $31, %rbx, %r13
+; X64-NEXT: movq %xmm0, %r15
+; X64-NEXT: movq %r15, %rbx
+; X64-NEXT: sarq $63, %rbx
+; X64-NEXT: shldq $31, %r15, %rbx
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
-; X64-NEXT: movq %xmm0, %rdx
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: sarq $63, %r15
-; X64-NEXT: movq %rbx, %r12
-; X64-NEXT: shlq $31, %r12
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %xmm0, %r12
+; X64-NEXT: movq %r12, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: shlq $31, %r15
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %r12, %rdx
+; X64-NEXT: movq %r14, %rcx
; X64-NEXT: callq __divti3 at PLT
-; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: movq %rax, %r13
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %rbp
-; X64-NEXT: sbbq $0, %r14
-; X64-NEXT: shrq $63, %rbx
-; X64-NEXT: xorl %r15d, %ebx
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: subq $1, %r13
+; X64-NEXT: sbbq $0, %rbp
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %r12, %rdx
+; X64-NEXT: movq %r14, %rcx
; X64-NEXT: callq __modti3 at PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
-; X64-NEXT: testb %bl, %al
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT: shrq $63, %rbx
+; X64-NEXT: xorl %ebx, %r14d
+; X64-NEXT: testb %r14b, %al
; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rcx, %rbp
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: cmpq %rcx, %r13
+; X64-NEXT: movq %rbp, %rax
; X64-NEXT: sbbq $0, %rax
; X64-NEXT: movl $0, %eax
-; X64-NEXT: cmovgeq %rax, %r14
-; X64-NEXT: cmovgeq %rcx, %rbp
-; X64-NEXT: movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %rbp, %rax
-; X64-NEXT: sbbq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; X64-NEXT: cmovgeq %rax, %rbp
-; X64-NEXT: movq %rbp, %xmm1
-; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT: cmovgeq %rcx, %r13
+; X64-NEXT: movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
+; X64-NEXT: cmpq %r13, %rax
+; X64-NEXT: sbbq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT: cmovgeq %rax, %r13
+; X64-NEXT: movq %r13, %xmm1
+; X64-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: psrlq $1, %xmm0
; X64-NEXT: shufps $136, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = xmm0[0,2],mem[0,2]
-; X64-NEXT: addq $120, %rsp
+; X64-NEXT: addq $104, %rsp
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r12
; X64-NEXT: popq %r13
@@ -955,10 +951,10 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: subl $1, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl $0, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl $0, %ecx
@@ -979,12 +975,12 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: testb %bl, %bh
; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: cmpl $-1, %eax
-; X86-NEXT: movl %edi, %esi
+; X86-NEXT: cmpl $-1, %edi
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: sbbl $0, %esi
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: sbbl $0, %esi
@@ -992,29 +988,29 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: sbbl $0, %esi
; X86-NEXT: cmovgel %ebx, %edx
; X86-NEXT: cmovgel %ebx, %ecx
-; X86-NEXT: cmovgel %ebx, %edi
+; X86-NEXT: cmovgel %ebx, %eax
; X86-NEXT: movl $-1, %esi
-; X86-NEXT: cmovgel %esi, %eax
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: cmovgel %esi, %edi
+; X86-NEXT: movl %edi, %esi
; X86-NEXT: negl %esi
; X86-NEXT: movl $-1, %esi
-; X86-NEXT: sbbl %edi, %esi
+; X86-NEXT: sbbl %eax, %esi
; X86-NEXT: movl $-1, %esi
; X86-NEXT: sbbl %ecx, %esi
; X86-NEXT: movl $-1, %ecx
; X86-NEXT: sbbl %edx, %ecx
-; X86-NEXT: cmovgel %ebx, %eax
-; X86-NEXT: movl $-1, %edx
-; X86-NEXT: cmovgel %edx, %edi
-; X86-NEXT: shldl $31, %eax, %edi
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: cmovgel %ecx, %eax
+; X86-NEXT: cmovgel %ebx, %edi
+; X86-NEXT: shrdl $1, %eax, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: subl $1, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl $0, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl $0, %edi
@@ -1035,11 +1031,11 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: testb %bh, %cl
; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: cmpl $-1, %eax
-; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: cmpl $-1, %esi
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: sbbl $0, %ecx
; X86-NEXT: movl %edi, %ecx
; X86-NEXT: sbbl $0, %ecx
@@ -1048,29 +1044,29 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: movl $0, %ecx
; X86-NEXT: cmovgel %ecx, %edx
; X86-NEXT: cmovgel %ecx, %edi
-; X86-NEXT: cmovgel %ecx, %esi
+; X86-NEXT: cmovgel %ecx, %eax
; X86-NEXT: movl $-1, %ebx
-; X86-NEXT: cmovgel %ebx, %eax
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: cmovgel %ebx, %esi
+; X86-NEXT: movl %esi, %ecx
; X86-NEXT: negl %ecx
; X86-NEXT: movl $-1, %ecx
-; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: movl $-1, %ecx
; X86-NEXT: sbbl %edi, %ecx
; X86-NEXT: movl $-1, %ecx
; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: cmovgel %ebx, %eax
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: cmovgel %ecx, %eax
-; X86-NEXT: cmovgel %ebx, %esi
-; X86-NEXT: shldl $31, %eax, %esi
+; X86-NEXT: cmovgel %ecx, %esi
+; X86-NEXT: shrdl $1, %eax, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: subl $1, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl $0, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl $0, %edi
@@ -1091,11 +1087,11 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: testb %bh, %cl
; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: cmpl $-1, %eax
-; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: cmpl $-1, %ebx
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: sbbl $0, %ecx
; X86-NEXT: movl %edi, %ecx
; X86-NEXT: sbbl $0, %ecx
@@ -1104,21 +1100,21 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: movl $0, %ecx
; X86-NEXT: cmovgel %ecx, %edx
; X86-NEXT: cmovgel %ecx, %edi
-; X86-NEXT: cmovgel %ecx, %ebx
+; X86-NEXT: cmovgel %ecx, %eax
; X86-NEXT: movl $-1, %esi
-; X86-NEXT: cmovgel %esi, %eax
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: cmovgel %esi, %ebx
+; X86-NEXT: movl %ebx, %ecx
; X86-NEXT: negl %ecx
; X86-NEXT: movl $-1, %ecx
-; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: movl $-1, %ecx
; X86-NEXT: sbbl %edi, %ecx
; X86-NEXT: movl $-1, %ecx
; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: cmovgel %esi, %eax
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: cmovgel %ecx, %eax
-; X86-NEXT: cmovgel %esi, %ebx
-; X86-NEXT: shldl $31, %eax, %ebx
+; X86-NEXT: cmovgel %ecx, %ebx
+; X86-NEXT: shrdl $1, %eax, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1174,12 +1170,12 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: sbbl %ecx, %eax
; X86-NEXT: movl $-1, %eax
; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: cmovgel %edx, %esi
; X86-NEXT: movl $0, %eax
; X86-NEXT: cmovgel %eax, %ebx
-; X86-NEXT: cmovgel %edx, %esi
-; X86-NEXT: shldl $31, %ebx, %esi
+; X86-NEXT: shrdl $1, %esi, %ebx
; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %esi, 12(%eax)
+; X86-NEXT: movl %ebx, 12(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl %ecx, 8(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
diff --git a/llvm/test/CodeGen/X86/shift-and.ll b/llvm/test/CodeGen/X86/shift-and.ll
index f2627df3a98d8..a467720fbbe61 100644
--- a/llvm/test/CodeGen/X86/shift-and.ll
+++ b/llvm/test/CodeGen/X86/shift-and.ll
@@ -168,19 +168,21 @@ define void @t5ptr(i64 %t, ptr %ptr) nounwind {
define i64 @t6(i64 %key, ptr nocapture %val) nounwind {
; X86-LABEL: t6:
; X86: # %bb.0:
+; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: shrdl $3, %esi, %ecx
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%edx), %eax
-; X86-NEXT: movl 4(%edx), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: shrdl $3, %edi, %esi
+; X86-NEXT: shrl $3, %edi
+; X86-NEXT: movl (%ecx), %eax
+; X86-NEXT: movl 4(%ecx), %edx
; X86-NEXT: addl $-1, %eax
; X86-NEXT: adcl $-1, %edx
-; X86-NEXT: andl %ecx, %eax
-; X86-NEXT: andl %esi, %edx
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: andl %edi, %edx
; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: t6:
diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll
index 016e3a239180a..05e16b3dea562 100644
--- a/llvm/test/CodeGen/X86/shift-i512.ll
+++ b/llvm/test/CodeGen/X86/shift-i512.ll
@@ -1802,55 +1802,58 @@ define i512 @shl_i512_200(i512 %a0) nounwind {
; SSE-LABEL: shl_i512_200:
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: movq %rsi, %rdi
+; SSE-NEXT: shrdq $56, %rdx, %rdi
+; SSE-NEXT: shrdq $56, %rcx, %rdx
+; SSE-NEXT: shrdq $56, %r8, %rcx
; SSE-NEXT: shldq $8, %r8, %r9
-; SSE-NEXT: shldq $8, %rcx, %r8
-; SSE-NEXT: shldq $8, %rdx, %rcx
-; SSE-NEXT: shldq $8, %rsi, %rdx
; SSE-NEXT: shlq $8, %rsi
-; SSE-NEXT: movq %r9, 56(%rdi)
-; SSE-NEXT: movq %r8, 48(%rdi)
-; SSE-NEXT: movq %rcx, 40(%rdi)
-; SSE-NEXT: movq %rdx, 32(%rdi)
-; SSE-NEXT: movq %rsi, 24(%rdi)
+; SSE-NEXT: movq %r9, 56(%rax)
+; SSE-NEXT: movq %rcx, 48(%rax)
+; SSE-NEXT: movq %rdx, 40(%rax)
+; SSE-NEXT: movq %rdi, 32(%rax)
+; SSE-NEXT: movq %rsi, 24(%rax)
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, (%rdi)
-; SSE-NEXT: movq $0, 16(%rdi)
+; SSE-NEXT: movaps %xmm0, (%rax)
+; SSE-NEXT: movq $0, 16(%rax)
; SSE-NEXT: retq
;
; AVX2-LABEL: shl_i512_200:
; AVX2: # %bb.0:
; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: movq %rsi, %rdi
+; AVX2-NEXT: shrdq $56, %rdx, %rdi
+; AVX2-NEXT: shrdq $56, %rcx, %rdx
+; AVX2-NEXT: shrdq $56, %r8, %rcx
; AVX2-NEXT: shldq $8, %r8, %r9
-; AVX2-NEXT: shldq $8, %rcx, %r8
-; AVX2-NEXT: shldq $8, %rdx, %rcx
-; AVX2-NEXT: shldq $8, %rsi, %rdx
; AVX2-NEXT: shlq $8, %rsi
-; AVX2-NEXT: movq %r9, 56(%rdi)
-; AVX2-NEXT: movq %r8, 48(%rdi)
-; AVX2-NEXT: movq %rcx, 40(%rdi)
-; AVX2-NEXT: movq %rdx, 32(%rdi)
-; AVX2-NEXT: movq %rsi, 24(%rdi)
+; AVX2-NEXT: movq %r9, 56(%rax)
+; AVX2-NEXT: movq %rcx, 48(%rax)
+; AVX2-NEXT: movq %rdx, 40(%rax)
+; AVX2-NEXT: movq %rdi, 32(%rax)
+; AVX2-NEXT: movq %rsi, 24(%rax)
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovaps %xmm0, (%rdi)
-; AVX2-NEXT: movq $0, 16(%rdi)
+; AVX2-NEXT: vmovaps %xmm0, (%rax)
+; AVX2-NEXT: movq $0, 16(%rax)
; AVX2-NEXT: retq
;
; AVX512-LABEL: shl_i512_200:
; AVX512: # %bb.0:
; AVX512-NEXT: movq %rdi, %rax
+; AVX512-NEXT: movq %rsi, %rdi
+; AVX512-NEXT: shrdq $56, %rdx, %rdi
+; AVX512-NEXT: shrdq $56, %rcx, %rdx
+; AVX512-NEXT: shrdq $56, %r8, %rcx
; AVX512-NEXT: shldq $8, %r8, %r9
-; AVX512-NEXT: shldq $8, %rcx, %r8
-; AVX512-NEXT: shldq $8, %rdx, %rcx
-; AVX512-NEXT: shldq $8, %rsi, %rdx
; AVX512-NEXT: shlq $8, %rsi
-; AVX512-NEXT: movq %r9, 56(%rdi)
-; AVX512-NEXT: movq %r8, 48(%rdi)
-; AVX512-NEXT: movq %rcx, 40(%rdi)
-; AVX512-NEXT: movq %rdx, 32(%rdi)
-; AVX512-NEXT: movq %rsi, 24(%rdi)
+; AVX512-NEXT: movq %r9, 56(%rax)
+; AVX512-NEXT: movq %rcx, 48(%rax)
+; AVX512-NEXT: movq %rdx, 40(%rax)
+; AVX512-NEXT: movq %rdi, 32(%rax)
+; AVX512-NEXT: movq %rsi, 24(%rax)
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, (%rdi)
-; AVX512-NEXT: movq $0, 16(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, (%rax)
+; AVX512-NEXT: movq $0, 16(%rax)
; AVX512-NEXT: retq
%r = shl i512 %a0, 200
ret i512 %r
diff --git a/llvm/test/CodeGen/X86/smax.ll b/llvm/test/CodeGen/X86/smax.ll
index 509d4443e930a..c12a66247ca02 100644
--- a/llvm/test/CodeGen/X86/smax.ll
+++ b/llvm/test/CodeGen/X86/smax.ll
@@ -727,21 +727,21 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
; X86-NEXT: andl $-16, %esp
; X86-NEXT: movl 32(%ebp), %esi
; X86-NEXT: movl 36(%ebp), %eax
-; X86-NEXT: movl 48(%ebp), %ecx
-; X86-NEXT: movl 52(%ebp), %edx
-; X86-NEXT: shrdl $28, %edx, %ecx
-; X86-NEXT: sarl $28, %edx
-; X86-NEXT: cmpl %esi, %ecx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl 48(%ebp), %edx
+; X86-NEXT: movl 52(%ebp), %ecx
+; X86-NEXT: shrdl $28, %ecx, %edx
+; X86-NEXT: sarl $28, %ecx
+; X86-NEXT: cmpl %esi, %edx
+; X86-NEXT: movl %ecx, %edi
; X86-NEXT: sbbl %eax, %edi
-; X86-NEXT: cmovll %esi, %ecx
-; X86-NEXT: cmovll %eax, %edx
+; X86-NEXT: cmovll %esi, %edx
+; X86-NEXT: cmovll %eax, %ecx
; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %edx, 4(%eax)
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: sarl $31, %edx
-; X86-NEXT: movl %edx, 12(%eax)
-; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl %ecx, 8(%eax)
; X86-NEXT: leal -8(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/smin.ll b/llvm/test/CodeGen/X86/smin.ll
index 5e9fe27b41d2c..cdfb484be0b6c 100644
--- a/llvm/test/CodeGen/X86/smin.ll
+++ b/llvm/test/CodeGen/X86/smin.ll
@@ -727,21 +727,21 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
; X86-NEXT: andl $-16, %esp
; X86-NEXT: movl 32(%ebp), %esi
; X86-NEXT: movl 36(%ebp), %eax
-; X86-NEXT: movl 48(%ebp), %ecx
-; X86-NEXT: movl 52(%ebp), %edx
-; X86-NEXT: shrdl $28, %edx, %ecx
-; X86-NEXT: sarl $28, %edx
-; X86-NEXT: cmpl %ecx, %esi
+; X86-NEXT: movl 48(%ebp), %edx
+; X86-NEXT: movl 52(%ebp), %ecx
+; X86-NEXT: shrdl $28, %ecx, %edx
+; X86-NEXT: sarl $28, %ecx
+; X86-NEXT: cmpl %edx, %esi
; X86-NEXT: movl %eax, %edi
-; X86-NEXT: sbbl %edx, %edi
-; X86-NEXT: cmovll %esi, %ecx
-; X86-NEXT: cmovll %eax, %edx
+; X86-NEXT: sbbl %ecx, %edi
+; X86-NEXT: cmovll %esi, %edx
+; X86-NEXT: cmovll %eax, %ecx
; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %edx, 4(%eax)
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: sarl $31, %edx
-; X86-NEXT: movl %edx, 12(%eax)
-; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl %ecx, 8(%eax)
; X86-NEXT: leal -8(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
index 3da5973f9f903..ea4dc0330df71 100644
--- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
@@ -180,10 +180,10 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: callq __udivti3 at PLT
; X64-NEXT: cmpq $2, %rdx
-; X64-NEXT: movq $-1, %rcx
-; X64-NEXT: cmovaeq %rcx, %rax
; X64-NEXT: movl $1, %ecx
; X64-NEXT: cmovbq %rdx, %rcx
+; X64-NEXT: movq $-1, %rdx
+; X64-NEXT: cmovaeq %rdx, %rax
; X64-NEXT: shrdq $1, %rcx, %rax
; X64-NEXT: popq %rcx
; X64-NEXT: retq
@@ -384,9 +384,10 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: subl $8, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax), %ecx
; X86-NEXT: shrl $31, %eax
@@ -398,18 +399,18 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: calll __udivdi3
; X86-NEXT: addl $16, %esp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: leal (%ebx,%ebx), %eax
-; X86-NEXT: shrl $31, %ebx
-; X86-NEXT: shldl $31, %eax, %ebx
+; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT: leal (%edi,%edi), %eax
+; X86-NEXT: shrl $31, %edi
+; X86-NEXT: shldl $31, %eax, %edi
; X86-NEXT: pushl $0
-; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
; X86-NEXT: pushl $0
; X86-NEXT: calll __udivdi3
; X86-NEXT: addl $16, %esp
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: leal (%esi,%esi), %eax
; X86-NEXT: shrl $31, %esi
; X86-NEXT: shldl $31, %eax, %esi
@@ -419,44 +420,45 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: pushl $0
; X86-NEXT: calll __udivdi3
; X86-NEXT: addl $16, %esp
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: leal (%edx,%edx), %ecx
-; X86-NEXT: shrl $31, %edx
-; X86-NEXT: shldl $31, %ecx, %edx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: cmpl $2, %esi
-; X86-NEXT: movl $-1, %edx
-; X86-NEXT: cmovael %edx, %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: leal (%ebp,%ebp), %eax
+; X86-NEXT: shrl $31, %ebp
+; X86-NEXT: shldl $31, %eax, %ebp
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: cmpl $2, %edx
; X86-NEXT: movl $1, %ebp
-; X86-NEXT: cmovael %ebp, %esi
-; X86-NEXT: shldl $31, %eax, %esi
-; X86-NEXT: cmpl $2, %ebx
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: cmovael %edx, %eax
-; X86-NEXT: cmovael %ebp, %ebx
-; X86-NEXT: shldl $31, %eax, %ebx
+; X86-NEXT: cmovael %ebp, %edx
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: cmovael %ecx, %esi
+; X86-NEXT: shrdl $1, %edx, %esi
; X86-NEXT: cmpl $2, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: cmovael %edx, %eax
; X86-NEXT: cmovael %ebp, %edi
-; X86-NEXT: shldl $31, %eax, %edi
+; X86-NEXT: movl $-1, %edx
+; X86-NEXT: cmovael %edx, %ebx
+; X86-NEXT: shrdl $1, %edi, %ebx
+; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT: cmpl $2, %ecx
+; X86-NEXT: cmovael %ebp, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: cmovael %edx, %edi
+; X86-NEXT: shrdl $1, %ecx, %edi
; X86-NEXT: pushl $0
; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %eax
; X86-NEXT: pushl $0
; X86-NEXT: calll __udivdi3
; X86-NEXT: addl $16, %esp
; X86-NEXT: cmpl $2, %edx
+; X86-NEXT: cmovbl %edx, %ebp
; X86-NEXT: movl $-1, %ecx
; X86-NEXT: cmovael %ecx, %eax
-; X86-NEXT: cmovbl %edx, %ebp
-; X86-NEXT: shldl $31, %eax, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ebp, 12(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %ebx, 4(%eax)
-; X86-NEXT: movl %esi, (%eax)
+; X86-NEXT: shrdl $1, %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %eax, 12(%ecx)
+; X86-NEXT: movl %edi, 8(%ecx)
+; X86-NEXT: movl %ebx, 4(%ecx)
+; X86-NEXT: movl %esi, (%ecx)
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: addl $8, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll
index 7ef859978cdbf..9943956e50efc 100644
--- a/llvm/test/CodeGen/X86/umax.ll
+++ b/llvm/test/CodeGen/X86/umax.ll
@@ -1325,21 +1325,21 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
; X86-NEXT: andl $-16, %esp
; X86-NEXT: movl 32(%ebp), %esi
; X86-NEXT: movl 36(%ebp), %eax
-; X86-NEXT: movl 48(%ebp), %ecx
-; X86-NEXT: movl 52(%ebp), %edx
-; X86-NEXT: shrdl $28, %edx, %ecx
-; X86-NEXT: sarl $28, %edx
-; X86-NEXT: cmpl %esi, %ecx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl 48(%ebp), %edx
+; X86-NEXT: movl 52(%ebp), %ecx
+; X86-NEXT: shrdl $28, %ecx, %edx
+; X86-NEXT: sarl $28, %ecx
+; X86-NEXT: cmpl %esi, %edx
+; X86-NEXT: movl %ecx, %edi
; X86-NEXT: sbbl %eax, %edi
-; X86-NEXT: cmovbl %esi, %ecx
-; X86-NEXT: cmovbl %eax, %edx
+; X86-NEXT: cmovbl %esi, %edx
+; X86-NEXT: cmovbl %eax, %ecx
; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %edx, 4(%eax)
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: sarl $31, %edx
-; X86-NEXT: movl %edx, 12(%eax)
-; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl %ecx, 8(%eax)
; X86-NEXT: leal -8(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/umin.ll b/llvm/test/CodeGen/X86/umin.ll
index c927abf3a4263..79e6eb32605d9 100644
--- a/llvm/test/CodeGen/X86/umin.ll
+++ b/llvm/test/CodeGen/X86/umin.ll
@@ -736,21 +736,21 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
; X86-NEXT: andl $-16, %esp
; X86-NEXT: movl 32(%ebp), %esi
; X86-NEXT: movl 36(%ebp), %eax
-; X86-NEXT: movl 48(%ebp), %ecx
-; X86-NEXT: movl 52(%ebp), %edx
-; X86-NEXT: shrdl $28, %edx, %ecx
-; X86-NEXT: sarl $28, %edx
-; X86-NEXT: cmpl %ecx, %esi
+; X86-NEXT: movl 48(%ebp), %edx
+; X86-NEXT: movl 52(%ebp), %ecx
+; X86-NEXT: shrdl $28, %ecx, %edx
+; X86-NEXT: sarl $28, %ecx
+; X86-NEXT: cmpl %edx, %esi
; X86-NEXT: movl %eax, %edi
-; X86-NEXT: sbbl %edx, %edi
-; X86-NEXT: cmovbl %esi, %ecx
-; X86-NEXT: cmovbl %eax, %edx
+; X86-NEXT: sbbl %ecx, %edi
+; X86-NEXT: cmovbl %esi, %edx
+; X86-NEXT: cmovbl %eax, %ecx
; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %edx, 4(%eax)
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: sarl $31, %edx
-; X86-NEXT: movl %edx, 12(%eax)
-; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl %ecx, 8(%eax)
; X86-NEXT: leal -8(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll
index 1e11ea97396da..6ce34991050ac 100644
--- a/llvm/test/CodeGen/X86/vector-sext.ll
+++ b/llvm/test/CodeGen/X86/vector-sext.ll
@@ -3609,12 +3609,9 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) {
; SSE2-NEXT: psrad $15, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movl 8(%rdi), %ecx
-; SSE2-NEXT: shll $28, %ecx
-; SSE2-NEXT: movq %rax, %rdx
-; SSE2-NEXT: shrq $51, %rdx
-; SSE2-NEXT: shll $15, %edx
-; SSE2-NEXT: orl %ecx, %edx
-; SSE2-NEXT: movd %edx, %xmm1
+; SSE2-NEXT: shldq $13, %rax, %rcx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: pslld $15, %xmm1
; SSE2-NEXT: psrad $15, %xmm1
; SSE2-NEXT: shrq $34, %rax
; SSE2-NEXT: movd %eax, %xmm2
@@ -3637,12 +3634,9 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) {
; SSSE3-NEXT: psrad $15, %xmm1
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movl 8(%rdi), %ecx
-; SSSE3-NEXT: shll $28, %ecx
-; SSSE3-NEXT: movq %rax, %rdx
-; SSSE3-NEXT: shrq $51, %rdx
-; SSSE3-NEXT: shll $15, %edx
-; SSSE3-NEXT: orl %ecx, %edx
-; SSSE3-NEXT: movd %edx, %xmm1
+; SSSE3-NEXT: shldq $13, %rax, %rcx
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: pslld $15, %xmm1
; SSSE3-NEXT: psrad $15, %xmm1
; SSSE3-NEXT: shrq $34, %rax
; SSSE3-NEXT: movd %eax, %xmm2
@@ -3655,51 +3649,45 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) {
; SSE41-LABEL: sext_4i17_to_4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movq (%rdi), %rax
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq $17, %rcx
-; SSE41-NEXT: shll $15, %ecx
-; SSE41-NEXT: sarl $15, %ecx
; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: movq %rax, %rcx
+; SSE41-NEXT: movl 8(%rdi), %edx
+; SSE41-NEXT: shldq $13, %rax, %rdx
+; SSE41-NEXT: shrq $17, %rax
+; SSE41-NEXT: shll $15, %eax
+; SSE41-NEXT: sarl $15, %eax
; SSE41-NEXT: pslld $15, %xmm0
; SSE41-NEXT: psrad $15, %xmm0
-; SSE41-NEXT: pinsrd $1, %ecx, %xmm0
-; SSE41-NEXT: movq %rax, %rcx
+; SSE41-NEXT: pinsrd $1, %eax, %xmm0
; SSE41-NEXT: shrq $34, %rcx
; SSE41-NEXT: shll $15, %ecx
; SSE41-NEXT: sarl $15, %ecx
; SSE41-NEXT: pinsrd $2, %ecx, %xmm0
-; SSE41-NEXT: movl 8(%rdi), %ecx
-; SSE41-NEXT: shll $28, %ecx
-; SSE41-NEXT: shrq $51, %rax
-; SSE41-NEXT: shll $15, %eax
-; SSE41-NEXT: orl %ecx, %eax
-; SSE41-NEXT: sarl $15, %eax
-; SSE41-NEXT: pinsrd $3, %eax, %xmm0
+; SSE41-NEXT: shll $15, %edx
+; SSE41-NEXT: sarl $15, %edx
+; SSE41-NEXT: pinsrd $3, %edx, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: sext_4i17_to_4i32:
; AVX: # %bb.0:
; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shrq $17, %rcx
-; AVX-NEXT: shll $15, %ecx
-; AVX-NEXT: sarl $15, %ecx
; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: movq %rax, %rcx
+; AVX-NEXT: movl 8(%rdi), %edx
+; AVX-NEXT: shldq $13, %rax, %rdx
+; AVX-NEXT: shrq $17, %rax
+; AVX-NEXT: shll $15, %eax
+; AVX-NEXT: sarl $15, %eax
; AVX-NEXT: vpslld $15, %xmm0, %xmm0
; AVX-NEXT: vpsrad $15, %xmm0, %xmm0
-; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
+; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
; AVX-NEXT: shrq $34, %rcx
; AVX-NEXT: shll $15, %ecx
; AVX-NEXT: sarl $15, %ecx
; AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movl 8(%rdi), %ecx
-; AVX-NEXT: shll $28, %ecx
-; AVX-NEXT: shrq $51, %rax
-; AVX-NEXT: shll $15, %eax
-; AVX-NEXT: orl %ecx, %eax
-; AVX-NEXT: sarl $15, %eax
-; AVX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX-NEXT: shll $15, %edx
+; AVX-NEXT: sarl $15, %edx
+; AVX-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
; AVX-NEXT: retq
;
; X86-SSE2-LABEL: sext_4i17_to_4i32:
@@ -3719,8 +3707,8 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) {
; X86-SSE2-NEXT: movd %eax, %xmm0
; X86-SSE2-NEXT: pslld $15, %xmm0
; X86-SSE2-NEXT: psrad $15, %xmm0
-; X86-SSE2-NEXT: shldl $15, %eax, %ecx
-; X86-SSE2-NEXT: movd %ecx, %xmm2
+; X86-SSE2-NEXT: shrdl $17, %ecx, %eax
+; X86-SSE2-NEXT: movd %eax, %xmm2
; X86-SSE2-NEXT: pslld $15, %xmm2
; X86-SSE2-NEXT: psrad $15, %xmm2
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
@@ -3729,30 +3717,24 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) {
;
; X86-SSE41-LABEL: sext_4i17_to_4i32:
; X86-SSE41: # %bb.0:
-; X86-SSE41-NEXT: pushl %esi
-; X86-SSE41-NEXT: .cfi_def_cfa_offset 8
-; X86-SSE41-NEXT: .cfi_offset %esi, -8
-; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE41-NEXT: movl (%edx), %eax
-; X86-SSE41-NEXT: movl 4(%edx), %ecx
-; X86-SSE41-NEXT: movl %ecx, %esi
-; X86-SSE41-NEXT: movl 8(%edx), %edx
-; X86-SSE41-NEXT: shldl $13, %ecx, %edx
-; X86-SSE41-NEXT: shldl $15, %eax, %ecx
-; X86-SSE41-NEXT: shll $15, %ecx
-; X86-SSE41-NEXT: sarl $15, %ecx
-; X86-SSE41-NEXT: movd %eax, %xmm0
-; X86-SSE41-NEXT: pslld $15, %xmm0
-; X86-SSE41-NEXT: psrad $15, %xmm0
-; X86-SSE41-NEXT: pinsrd $1, %ecx, %xmm0
-; X86-SSE41-NEXT: shll $13, %esi
-; X86-SSE41-NEXT: sarl $15, %esi
-; X86-SSE41-NEXT: pinsrd $2, %esi, %xmm0
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE41-NEXT: movl (%ecx), %edx
+; X86-SSE41-NEXT: movl 4(%ecx), %eax
+; X86-SSE41-NEXT: movd %edx, %xmm0
+; X86-SSE41-NEXT: shrdl $17, %eax, %edx
; X86-SSE41-NEXT: shll $15, %edx
; X86-SSE41-NEXT: sarl $15, %edx
-; X86-SSE41-NEXT: pinsrd $3, %edx, %xmm0
-; X86-SSE41-NEXT: popl %esi
-; X86-SSE41-NEXT: .cfi_def_cfa_offset 4
+; X86-SSE41-NEXT: pslld $15, %xmm0
+; X86-SSE41-NEXT: psrad $15, %xmm0
+; X86-SSE41-NEXT: pinsrd $1, %edx, %xmm0
+; X86-SSE41-NEXT: movl 8(%ecx), %ecx
+; X86-SSE41-NEXT: shldl $13, %eax, %ecx
+; X86-SSE41-NEXT: shll $13, %eax
+; X86-SSE41-NEXT: sarl $15, %eax
+; X86-SSE41-NEXT: pinsrd $2, %eax, %xmm0
+; X86-SSE41-NEXT: shll $15, %ecx
+; X86-SSE41-NEXT: sarl $15, %ecx
+; X86-SSE41-NEXT: pinsrd $3, %ecx, %xmm0
; X86-SSE41-NEXT: retl
%a = load <4 x i17>, ptr %ptr
%b = sext <4 x i17> %a to <4 x i32>
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll
index 7b0f1c9f8a660..e061e4e35e1e7 100644
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -2332,11 +2332,8 @@ define <4 x i32> @zext_4i17_to_4i32(ptr %ptr) {
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movl 8(%rdi), %ecx
-; SSE2-NEXT: shll $13, %ecx
-; SSE2-NEXT: movq %rax, %rdx
-; SSE2-NEXT: shrq $51, %rdx
-; SSE2-NEXT: orl %ecx, %edx
-; SSE2-NEXT: movd %edx, %xmm1
+; SSE2-NEXT: shldq $13, %rax, %rcx
+; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: shrq $34, %rax
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
@@ -2353,11 +2350,8 @@ define <4 x i32> @zext_4i17_to_4i32(ptr %ptr) {
; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movl 8(%rdi), %ecx
-; SSSE3-NEXT: shll $13, %ecx
-; SSSE3-NEXT: movq %rax, %rdx
-; SSSE3-NEXT: shrq $51, %rdx
-; SSSE3-NEXT: orl %ecx, %edx
-; SSSE3-NEXT: movd %edx, %xmm1
+; SSSE3-NEXT: shldq $13, %rax, %rcx
+; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: shrq $34, %rax
; SSSE3-NEXT: movd %eax, %xmm2
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
@@ -2367,15 +2361,12 @@ define <4 x i32> @zext_4i17_to_4i32(ptr %ptr) {
;
; SSE41-LABEL: zext_4i17_to_4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movl 8(%rdi), %eax
-; SSE41-NEXT: shll $13, %eax
-; SSE41-NEXT: movq (%rdi), %rcx
-; SSE41-NEXT: movq %rcx, %rdx
-; SSE41-NEXT: shrq $51, %rdx
-; SSE41-NEXT: orl %eax, %edx
-; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: movq (%rdi), %rax
+; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: movq %rax, %rcx
+; SSE41-NEXT: movl 8(%rdi), %edx
+; SSE41-NEXT: shldq $13, %rax, %rdx
; SSE41-NEXT: shrq $17, %rax
-; SSE41-NEXT: movd %ecx, %xmm0
; SSE41-NEXT: pinsrd $1, %eax, %xmm0
; SSE41-NEXT: shrq $34, %rcx
; SSE41-NEXT: pinsrd $2, %ecx, %xmm0
@@ -2385,15 +2376,12 @@ define <4 x i32> @zext_4i17_to_4i32(ptr %ptr) {
;
; AVX1-LABEL: zext_4i17_to_4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: movl 8(%rdi), %eax
-; AVX1-NEXT: shll $13, %eax
-; AVX1-NEXT: movq (%rdi), %rcx
-; AVX1-NEXT: movq %rcx, %rdx
-; AVX1-NEXT: shrq $51, %rdx
-; AVX1-NEXT: orl %eax, %edx
-; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: movq (%rdi), %rax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: movl 8(%rdi), %edx
+; AVX1-NEXT: shldq $13, %rax, %rdx
; AVX1-NEXT: shrq $17, %rax
-; AVX1-NEXT: vmovd %ecx, %xmm0
; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
; AVX1-NEXT: shrq $34, %rcx
; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
@@ -2403,15 +2391,12 @@ define <4 x i32> @zext_4i17_to_4i32(ptr %ptr) {
;
; AVX2-LABEL: zext_4i17_to_4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: movl 8(%rdi), %eax
-; AVX2-NEXT: shll $13, %eax
-; AVX2-NEXT: movq (%rdi), %rcx
-; AVX2-NEXT: movq %rcx, %rdx
-; AVX2-NEXT: shrq $51, %rdx
-; AVX2-NEXT: orl %eax, %edx
-; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: movq (%rdi), %rax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: movl 8(%rdi), %edx
+; AVX2-NEXT: shldq $13, %rax, %rdx
; AVX2-NEXT: shrq $17, %rax
-; AVX2-NEXT: vmovd %ecx, %xmm0
; AVX2-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
; AVX2-NEXT: shrq $34, %rcx
; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
@@ -2422,15 +2407,12 @@ define <4 x i32> @zext_4i17_to_4i32(ptr %ptr) {
;
; AVX512-LABEL: zext_4i17_to_4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: movl 8(%rdi), %eax
-; AVX512-NEXT: shll $13, %eax
-; AVX512-NEXT: movq (%rdi), %rcx
-; AVX512-NEXT: movq %rcx, %rdx
-; AVX512-NEXT: shrq $51, %rdx
-; AVX512-NEXT: orl %eax, %edx
-; AVX512-NEXT: movq %rcx, %rax
+; AVX512-NEXT: movq (%rdi), %rax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: movl 8(%rdi), %edx
+; AVX512-NEXT: shldq $13, %rax, %rdx
; AVX512-NEXT: shrq $17, %rax
-; AVX512-NEXT: vmovd %ecx, %xmm0
; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
; AVX512-NEXT: shrq $34, %rcx
; AVX512-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
>From f11b90dc5733983d8d504341f9475cb61f432cb4 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Tue, 10 Feb 2026 19:24:46 -0800
Subject: [PATCH 3/3] [DAGCombiner] Combine (fshl A, B, S) | (fshr C, D, BW-S)
--> (fshl (A|C), (B|D), S)
This is similar to the FSHL/FSHR handling in hoistLogicOpWithSameOpcodeHands.
Here the opcodes aren't exactly the same, but the operations are
equivalent.
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 16 +++++++++++++++
llvm/test/CodeGen/X86/icmp-shift-opt.ll | 20 ++++++++-----------
2 files changed, 24 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 11f0f6d8611c7..ac169e3b7361c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8527,6 +8527,22 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
}
}
+ // (fshl A, B, S) | (fshr C, D, BW-S) --> fshl (A|C), (B|D), S
+ if (N0.getOpcode() == ISD::FSHL && N1.getOpcode() == ISD::FSHR &&
+ N0.hasOneUse() && N1.hasOneUse()) {
+ auto *S0 = dyn_cast<ConstantSDNode>(N0.getOperand(2));
+ auto *S1 = dyn_cast<ConstantSDNode>(N1.getOperand(2));
+ if (S0 && S1 && (S0->getZExtValue() + S1->getZExtValue()) == BW) {
+ SDValue A = N0.getOperand(0);
+ SDValue B = N0.getOperand(1);
+ SDValue C = N1.getOperand(0);
+ SDValue D = N1.getOperand(1);
+ SDValue NewLHS = DAG.getNode(ISD::OR, DL, VT, A, C);
+ SDValue NewRHS = DAG.getNode(ISD::OR, DL, VT, B, D);
+ return DAG.getNode(ISD::FSHL, DL, VT, NewLHS, NewRHS, N0.getOperand(2));
+ }
+ }
+
// Attempt to match a legalized build_pair-esque pattern:
// or(shl(aext(Hi),BW/2),zext(Lo))
SDValue Lo, Hi;
diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll
index 7ba7f6212d517..0296c2a011e25 100644
--- a/llvm/test/CodeGen/X86/icmp-shift-opt.ll
+++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll
@@ -83,14 +83,12 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind {
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: shrl $17, %eax
; X86-NEXT: orl 20(%ebp), %ecx
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: shldl $15, %ecx, %edx
-; X86-NEXT: orl 8(%ebp), %eax
-; X86-NEXT: shrdl $17, %ecx, %eax
-; X86-NEXT: orl %edx, %eax
+; X86-NEXT: orl 16(%ebp), %ecx
+; X86-NEXT: orl %eax, %ecx
; X86-NEXT: sete %al
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
@@ -114,14 +112,12 @@ define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind {
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: shrl $17, %eax
; X86-NEXT: orl 20(%ebp), %ecx
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: shldl $15, %ecx, %edx
-; X86-NEXT: orl 8(%ebp), %eax
-; X86-NEXT: shrdl $17, %ecx, %eax
-; X86-NEXT: orl %edx, %eax
+; X86-NEXT: orl 16(%ebp), %ecx
+; X86-NEXT: orl %eax, %ecx
; X86-NEXT: setne %al
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
- Previous message: [llvm] [DAGCombiner] Combine (fshl A, X, Y) | (shl X, Y) --> fshl (A|X), X, Y (PR #180888)
- Next message: [llvm] [DAGCombiner] Combine (fshl A, B, S) | (fshr C, D, BW-S) --> (fshl (A|C), (B|D), S) (PR #180889)
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
More information about the llvm-commits
mailing list