[llvm] 24a3192 - [MachineSink] sink more profitable loads
Chen Zheng via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 1 18:16:38 PST 2020
Author: Chen Zheng
Date: 2020-11-01T21:13:27-05:00
New Revision: 24a31922ce2ab1ae9e6bda1a315774010bf402d6
URL: https://github.com/llvm/llvm-project/commit/24a31922ce2ab1ae9e6bda1a315774010bf402d6
DIFF: https://github.com/llvm/llvm-project/commit/24a31922ce2ab1ae9e6bda1a315774010bf402d6.diff
LOG: [MachineSink] sink more profitable loads
Reviewed By: qcolombet
Differential Revision: https://reviews.llvm.org/D86864
Added:
Modified:
llvm/lib/CodeGen/MachineSink.cpp
llvm/test/CodeGen/RISCV/select-optimize-multiple.ll
llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
llvm/test/CodeGen/X86/MachineSink-eflags.ll
llvm/test/CodeGen/X86/avx2-masked-gather.ll
llvm/test/CodeGen/X86/cmovcmov.ll
llvm/test/CodeGen/X86/select.ll
llvm/test/CodeGen/X86/vec_int_to_fp.ll
Removed:
################################################################################
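[Editor's sketch, not part of the commit] The new MachineSinking::hasStoreBetween in the patch below works as follows: it only handles "straight-line" block pairs (From dominates To and To post-dominates From), walks every block lying between them, conservatively gives up on calls and ordered memory operations, records the stores it finds, and memoizes the answer per block pair so later loads only rerun the cheap alias checks. Here is a minimal standalone sketch of that idea over a toy CFG; Block, Instr, the integer alias-class model, and the dominance lambdas are illustrative stand-ins, not LLVM's API.

#include <functional>
#include <iostream>
#include <map>
#include <set>
#include <utility>
#include <vector>

struct Instr {
  bool IsCall = false;
  bool Ordered = false;   // ordered memory operation (e.g. volatile/atomic)
  bool MayStore = false;
  int AliasClass = 0;     // toy alias model: equal classes may alias
};

struct Block {
  int ID = 0;
  std::vector<Instr> Insts;
  std::vector<Block *> Succs;
};

using DomFn = std::function<bool(Block *, Block *)>;

struct SinkChecker {
  DomFn Dominates;      // stand-in for DT->dominates(A, B)
  DomFn PostDominates;  // stand-in for PDT->dominates(A, B)
  // Two-level cache mirroring HasStoreCache / StoreInstrCache in the patch:
  // a definite yes/no per block pair, plus the stores seen between a pair.
  std::map<std::pair<Block *, Block *>, bool> HasStoreCache;
  std::map<std::pair<Block *, Block *>, std::vector<Instr *>> StoreCache;

  bool hasStoreBetween(Block *From, Block *To, const Instr &Load) {
    // Only straight-line pairs are handled; everything else is
    // conservatively reported as "has a store".
    if (!Dominates(From, To) || !PostDominates(To, From))
      return true;
    auto Key = std::make_pair(From, To);
    if (auto It = HasStoreCache.find(Key); It != HasStoreCache.end())
      return It->second;                 // definite cached answer
    if (auto It = StoreCache.find(Key); It != StoreCache.end()) {
      for (Instr *S : It->second)        // rerun only the alias checks
        if (S->AliasClass == Load.AliasClass)
          return true;
      return false;
    }
    bool SawStore = false, Aliased = false;
    std::set<Block *> Seen = {From};
    std::vector<Block *> Work = {From};
    while (!Work.empty()) {              // DFS over blocks reachable from From
      Block *B = Work.back();
      Work.pop_back();
      for (Block *S : B->Succs)
        if (Seen.insert(S).second)
          Work.push_back(S);
      // Only blocks strictly between From and To matter: skip the endpoints
      // and anything To does not post-dominate (off the straight line).
      if (B == From || B == To || !PostDominates(To, B))
        continue;
      for (Instr &I : B->Insts) {
        if (I.IsCall || I.Ordered) {     // conservatively aliases everything
          HasStoreCache[Key] = true;
          return true;
        }
        if (I.MayStore) {
          SawStore = true;
          if (I.AliasClass == Load.AliasClass)
            Aliased = true;
          StoreCache[Key].push_back(&I); // remember for later queries
        }
      }
    }
    if (!SawStore)
      HasStoreCache[Key] = false;        // definitely store-free: cache it
    return Aliased;
  }
};

int main() {
  // Diamond CFG: Entry -> {L, R} -> Join, with one store in L.
  Block Entry{0}, L{1}, R{2}, Join{3};
  Entry.Succs = {&L, &R};
  L.Succs = {&Join};
  R.Succs = {&Join};
  L.Insts.push_back({/*IsCall=*/false, /*Ordered=*/false,
                     /*MayStore=*/true, /*AliasClass=*/1});
  SinkChecker C;
  C.Dominates = [&](Block *A, Block *B) { return A == &Entry && B == &Join; };
  C.PostDominates = [&](Block *A, Block *B) { return A == &Join; };
  Instr Load{/*IsCall=*/false, /*Ordered=*/false,
             /*MayStore=*/false, /*AliasClass=*/2};
  // The store in L does not alias the load, so sinking is allowed (prints 0).
  std::cout << C.hasStoreBetween(&Entry, &Join, Load) << "\n";
}
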
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 1914075d22e2..0c7c1cb67723 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -127,6 +127,12 @@ namespace {
/// current block.
DenseSet<DebugVariable> SeenDbgVars;
+ std::map<std::pair<MachineBasicBlock *, MachineBasicBlock *>, bool>
+ HasStoreCache;
+ std::map<std::pair<MachineBasicBlock *, MachineBasicBlock *>,
+ std::vector<MachineInstr *>>
+ StoreInstrCache;
+
public:
static char ID; // Pass identification
@@ -159,6 +165,9 @@ namespace {
MachineBasicBlock *From,
MachineBasicBlock *To);
+ bool hasStoreBetween(MachineBasicBlock *From, MachineBasicBlock *To,
+ MachineInstr &MI);
+
/// Postpone the splitting of the given critical
/// edge (\p From, \p To).
///
@@ -359,6 +368,9 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
EverMadeChange = true;
}
+ HasStoreCache.clear();
+ StoreInstrCache.clear();
+
// Now clear any kill flags for recorded registers.
for (auto I : RegsToClearKillFlags)
MRI->clearKillFlags(I);
@@ -919,6 +931,73 @@ static void performSink(MachineInstr &MI, MachineBasicBlock &SuccToSinkTo,
}
}
+/// hasStoreBetween - check if there is a store between the straight-line
+/// blocks From and To.
+bool MachineSinking::hasStoreBetween(MachineBasicBlock *From,
+ MachineBasicBlock *To, MachineInstr &MI) {
+ // Make sure From and To are on a straight-line path, meaning From
+ // dominates To and To post-dominates From.
+ if (!DT->dominates(From, To) || !PDT->dominates(To, From))
+ return true;
+
+ auto BlockPair = std::make_pair(From, To);
+
+ // Has this pair of blocks been queried before, with a definite cached
+ // result?
+ if (HasStoreCache.find(BlockPair) != HasStoreCache.end())
+ return HasStoreCache[BlockPair];
+
+ if (StoreInstrCache.find(BlockPair) != StoreInstrCache.end())
+ return std::any_of(
+ StoreInstrCache[BlockPair].begin(), StoreInstrCache[BlockPair].end(),
+ [&](MachineInstr *I) { return I->mayAlias(AA, MI, false); });
+
+ bool SawStore = false;
+ bool HasAliasedStore = false;
+ DenseSet<MachineBasicBlock *> HandledBlocks;
+ // Go through all blocks reachable from From.
+ for (MachineBasicBlock *BB : depth_first(From)) {
+ // We insert the instruction at the start of block To, so no need to worry
+ // about stores inside To.
+ // Stores in block From have already been accounted for on entry to
+ // SinkInstruction.
+ if (BB == To || BB == From)
+ continue;
+
+ // We already handled this BB in a previous iteration.
+ if (HandledBlocks.count(BB))
+ continue;
+
+ HandledBlocks.insert(BB);
+ // If To post-dominates BB, then BB lies on a path from block From to To.
+ if (PDT->dominates(To, BB)) {
+ for (MachineInstr &I : *BB) {
+ // Conservatively treat a call or an ordered memory operation as
+ // aliasing.
+ if (I.isCall() || I.hasOrderedMemoryRef()) {
+ HasStoreCache[BlockPair] = true;
+ return true;
+ }
+
+ if (I.mayStore()) {
+ SawStore = true;
+ // We still have a chance to sink MI if none of the intervening stores
+ // alias it.
+ // Cache all store instructions so that we don't need to walk all blocks
+ // reachable from From again for the next load instruction.
+ if (I.mayAlias(AA, MI, false))
+ HasAliasedStore = true;
+ StoreInstrCache[BlockPair].push_back(&I);
+ }
+ }
+ }
+ }
+ // If there is no store at all, cache the result.
+ if (!SawStore)
+ HasStoreCache[BlockPair] = false;
+ return HasAliasedStore;
+}
+
/// SinkInstruction - Determine whether it is safe to sink the specified machine
/// instruction out of its current block into a successor.
bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
@@ -979,8 +1058,9 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
// We cannot sink a load across a critical edge - there may be stores in
// other code paths.
bool TryBreak = false;
- bool store = true;
- if (!MI.isSafeToMove(AA, store)) {
+ bool Store =
+ MI.mayLoad() ? hasStoreBetween(ParentBlock, SuccToSinkTo, MI) : true;
+ if (!MI.isSafeToMove(AA, Store)) {
LLVM_DEBUG(dbgs() << " *** NOTE: Won't sink load along critical edge.\n");
TryBreak = true;
}
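
[Editor's sketch, not part of the commit] The kind of code this enables sinking for looks roughly like the hypothetical C++ below (not drawn from the tests that follow): a load defined early whose only use comes after a join, with a possibly-aliasing store on one of the intervening paths. If alias analysis can separate the two pointers, the load can now move down next to its use instead of being pinned at its definition.

// Hypothetical example. From-block = entry, To-block = the code after the
// `if`: entry dominates the join and the join post-dominates entry, so
// hasStoreBetween applies. If AA proves p != q, the conditional store no
// longer blocks the load, and MachineSink can move it down to its use.
int sinkable(const int *p, int *q, bool c) {
  int v = *p; // candidate load, defined early...
  if (c)
    *q = 1;   // ...with a store on one path in between...
  return v;   // ...but only used here, after the join
}

The RISC-V and X86 test updates below show this effect at the machine level: loads that previously executed unconditionally in a dominating block now appear in the later blocks that actually use their results.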
diff --git a/llvm/test/CodeGen/RISCV/select-optimize-multiple.ll b/llvm/test/CodeGen/RISCV/select-optimize-multiple.ll
index d38f3d577147..9813df4fbb85 100644
--- a/llvm/test/CodeGen/RISCV/select-optimize-multiple.ll
+++ b/llvm/test/CodeGen/RISCV/select-optimize-multiple.ll
@@ -40,40 +40,39 @@ define i128 @cmovcc128(i64 signext %a, i128 %b, i128 %c) nounwind {
; RV32I-LABEL: cmovcc128:
; RV32I: # %bb.0: # %entry
; RV32I-NEXT: xori a1, a1, 123
-; RV32I-NEXT: or a2, a1, a2
-; RV32I-NEXT: mv a1, a3
-; RV32I-NEXT: beqz a2, .LBB1_2
+; RV32I-NEXT: or a1, a1, a2
+; RV32I-NEXT: mv a2, a3
+; RV32I-NEXT: beqz a1, .LBB1_2
; RV32I-NEXT: # %bb.1: # %entry
-; RV32I-NEXT: mv a1, a4
+; RV32I-NEXT: mv a2, a4
; RV32I-NEXT: .LBB1_2: # %entry
-; RV32I-NEXT: lw a6, 0(a1)
-; RV32I-NEXT: beqz a2, .LBB1_6
+; RV32I-NEXT: beqz a1, .LBB1_5
; RV32I-NEXT: # %bb.3: # %entry
-; RV32I-NEXT: addi a1, a4, 4
-; RV32I-NEXT: lw a5, 0(a1)
-; RV32I-NEXT: bnez a2, .LBB1_7
+; RV32I-NEXT: addi a7, a4, 4
+; RV32I-NEXT: bnez a1, .LBB1_6
; RV32I-NEXT: .LBB1_4:
-; RV32I-NEXT: addi a1, a3, 8
-; RV32I-NEXT: lw a1, 0(a1)
-; RV32I-NEXT: bnez a2, .LBB1_8
+; RV32I-NEXT: addi a5, a3, 8
+; RV32I-NEXT: j .LBB1_7
; RV32I-NEXT: .LBB1_5:
-; RV32I-NEXT: addi a2, a3, 12
-; RV32I-NEXT: j .LBB1_9
-; RV32I-NEXT: .LBB1_6:
-; RV32I-NEXT: addi a1, a3, 4
-; RV32I-NEXT: lw a5, 0(a1)
-; RV32I-NEXT: beqz a2, .LBB1_4
+; RV32I-NEXT: addi a7, a3, 4
+; RV32I-NEXT: beqz a1, .LBB1_4
+; RV32I-NEXT: .LBB1_6: # %entry
+; RV32I-NEXT: addi a5, a4, 8
; RV32I-NEXT: .LBB1_7: # %entry
-; RV32I-NEXT: addi a1, a4, 8
+; RV32I-NEXT: lw a6, 0(a2)
+; RV32I-NEXT: lw a7, 0(a7)
+; RV32I-NEXT: lw a2, 0(a5)
+; RV32I-NEXT: beqz a1, .LBB1_9
+; RV32I-NEXT: # %bb.8: # %entry
+; RV32I-NEXT: addi a1, a4, 12
+; RV32I-NEXT: j .LBB1_10
+; RV32I-NEXT: .LBB1_9:
+; RV32I-NEXT: addi a1, a3, 12
+; RV32I-NEXT: .LBB1_10: # %entry
; RV32I-NEXT: lw a1, 0(a1)
-; RV32I-NEXT: beqz a2, .LBB1_5
-; RV32I-NEXT: .LBB1_8: # %entry
-; RV32I-NEXT: addi a2, a4, 12
-; RV32I-NEXT: .LBB1_9: # %entry
-; RV32I-NEXT: lw a2, 0(a2)
-; RV32I-NEXT: sw a2, 12(a0)
-; RV32I-NEXT: sw a1, 8(a0)
-; RV32I-NEXT: sw a5, 4(a0)
+; RV32I-NEXT: sw a1, 12(a0)
+; RV32I-NEXT: sw a2, 8(a0)
+; RV32I-NEXT: sw a7, 4(a0)
; RV32I-NEXT: sw a6, 0(a0)
; RV32I-NEXT: ret
;
@@ -124,40 +123,39 @@ entry:
define i128 @cmov128(i1 %a, i128 %b, i128 %c) nounwind {
; RV32I-LABEL: cmov128:
; RV32I: # %bb.0: # %entry
-; RV32I-NEXT: andi a4, a1, 1
-; RV32I-NEXT: mv a1, a2
-; RV32I-NEXT: bnez a4, .LBB3_2
+; RV32I-NEXT: andi a1, a1, 1
+; RV32I-NEXT: mv a4, a2
+; RV32I-NEXT: bnez a1, .LBB3_2
; RV32I-NEXT: # %bb.1: # %entry
-; RV32I-NEXT: mv a1, a3
+; RV32I-NEXT: mv a4, a3
; RV32I-NEXT: .LBB3_2: # %entry
-; RV32I-NEXT: lw a6, 0(a1)
-; RV32I-NEXT: bnez a4, .LBB3_6
+; RV32I-NEXT: bnez a1, .LBB3_5
; RV32I-NEXT: # %bb.3: # %entry
-; RV32I-NEXT: addi a1, a3, 4
-; RV32I-NEXT: lw a5, 0(a1)
-; RV32I-NEXT: beqz a4, .LBB3_7
+; RV32I-NEXT: addi a7, a3, 4
+; RV32I-NEXT: beqz a1, .LBB3_6
; RV32I-NEXT: .LBB3_4:
-; RV32I-NEXT: addi a1, a2, 8
-; RV32I-NEXT: lw a1, 0(a1)
-; RV32I-NEXT: beqz a4, .LBB3_8
+; RV32I-NEXT: addi a5, a2, 8
+; RV32I-NEXT: j .LBB3_7
; RV32I-NEXT: .LBB3_5:
-; RV32I-NEXT: addi a2, a2, 12
-; RV32I-NEXT: j .LBB3_9
-; RV32I-NEXT: .LBB3_6:
-; RV32I-NEXT: addi a1, a2, 4
-; RV32I-NEXT: lw a5, 0(a1)
-; RV32I-NEXT: bnez a4, .LBB3_4
+; RV32I-NEXT: addi a7, a2, 4
+; RV32I-NEXT: bnez a1, .LBB3_4
+; RV32I-NEXT: .LBB3_6: # %entry
+; RV32I-NEXT: addi a5, a3, 8
; RV32I-NEXT: .LBB3_7: # %entry
-; RV32I-NEXT: addi a1, a3, 8
+; RV32I-NEXT: lw a6, 0(a4)
+; RV32I-NEXT: lw a7, 0(a7)
+; RV32I-NEXT: lw a4, 0(a5)
+; RV32I-NEXT: bnez a1, .LBB3_9
+; RV32I-NEXT: # %bb.8: # %entry
+; RV32I-NEXT: addi a1, a3, 12
+; RV32I-NEXT: j .LBB3_10
+; RV32I-NEXT: .LBB3_9:
+; RV32I-NEXT: addi a1, a2, 12
+; RV32I-NEXT: .LBB3_10: # %entry
; RV32I-NEXT: lw a1, 0(a1)
-; RV32I-NEXT: bnez a4, .LBB3_5
-; RV32I-NEXT: .LBB3_8: # %entry
-; RV32I-NEXT: addi a2, a3, 12
-; RV32I-NEXT: .LBB3_9: # %entry
-; RV32I-NEXT: lw a2, 0(a2)
-; RV32I-NEXT: sw a2, 12(a0)
-; RV32I-NEXT: sw a1, 8(a0)
-; RV32I-NEXT: sw a5, 4(a0)
+; RV32I-NEXT: sw a1, 12(a0)
+; RV32I-NEXT: sw a4, 8(a0)
+; RV32I-NEXT: sw a7, 4(a0)
; RV32I-NEXT: sw a6, 0(a0)
; RV32I-NEXT: ret
;
diff --git a/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll b/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
index 08ddd4222842..9d7f8f8724db 100644
--- a/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
+++ b/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
@@ -14,40 +14,40 @@ define void @foo(i32* %a0, i32* %a1, i32* %a2, i32* %a3, i32* %a4, i32* %a5) {
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
-; CHECK-NEXT: movslq (%rdi), %rax
+; CHECK-NEXT: movslq (%rdi), %rdi
; CHECK-NEXT: movslq (%rsi), %r8
; CHECK-NEXT: movslq (%rdx), %r10
-; CHECK-NEXT: movl (%rcx), %edi
-; CHECK-NEXT: movslq (%r9), %rcx
-; CHECK-NEXT: movq %rsp, %rdx
-; CHECK-NEXT: subl %eax, %r8d
-; CHECK-NEXT: movslq %r8d, %rsi
+; CHECK-NEXT: movl (%rcx), %esi
+; CHECK-NEXT: movq %rsp, %rcx
+; CHECK-NEXT: subl %edi, %r8d
+; CHECK-NEXT: movslq %r8d, %rdx
; CHECK-NEXT: js .LBB0_1
; CHECK-NEXT: # %bb.11: # %b63
-; CHECK-NEXT: testq %rsi, %rsi
+; CHECK-NEXT: testq %rdx, %rdx
; CHECK-NEXT: js .LBB0_14
; CHECK-NEXT: # %bb.12:
-; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_13: # %a25b
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: testb %dil, %dil
; CHECK-NEXT: je .LBB0_13
; CHECK-NEXT: .LBB0_14: # %b85
; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB0_1
; CHECK-NEXT: # %bb.15:
-; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_16: # %a25b140
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: testb %dil, %dil
; CHECK-NEXT: je .LBB0_16
; CHECK-NEXT: .LBB0_1: # %a29b
-; CHECK-NEXT: cmpl %r10d, %edi
+; CHECK-NEXT: cmpl %r10d, %esi
; CHECK-NEXT: js .LBB0_10
; CHECK-NEXT: # %bb.2: # %b158
+; CHECK-NEXT: movslq (%r9), %rsi
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: movb $1, %r10b
@@ -77,7 +77,7 @@ define void @foo(i32* %a0, i32* %a1, i32* %a2, i32* %a3, i32* %a4, i32* %a5) {
; CHECK-NEXT: js .LBB0_4
; CHECK-NEXT: # %bb.17: # %b179
; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: testq %rsi, %rsi
+; CHECK-NEXT: testq %rdx, %rdx
; CHECK-NEXT: js .LBB0_18
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_37: # %a30b
@@ -97,7 +97,7 @@ define void @foo(i32* %a0, i32* %a1, i32* %a2, i32* %a3, i32* %a4, i32* %a5) {
; CHECK-NEXT: je .LBB0_19
; CHECK-NEXT: .LBB0_4: # %a33b
; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: orl %r8d, %eax
; CHECK-NEXT: movl %eax, %r9d
; CHECK-NEXT: shrl $31, %r9d
@@ -106,7 +106,7 @@ define void @foo(i32* %a0, i32* %a1, i32* %a2, i32* %a3, i32* %a4, i32* %a5) {
; CHECK-NEXT: .LBB0_5: # %a50b
; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: movl %r8d, %eax
-; CHECK-NEXT: orl %ecx, %eax
+; CHECK-NEXT: orl %esi, %eax
; CHECK-NEXT: movl %eax, %r11d
; CHECK-NEXT: shrl $31, %r11d
; CHECK-NEXT: testl %eax, %eax
@@ -156,7 +156,7 @@ define void @foo(i32* %a0, i32* %a1, i32* %a2, i32* %a3, i32* %a4, i32* %a5) {
; CHECK-NEXT: # Parent Loop BB0_3 Depth=1
; CHECK-NEXT: # => This Loop Header: Depth=2
; CHECK-NEXT: # Child Loop BB0_21 Depth 3
-; CHECK-NEXT: testq %rcx, %rcx
+; CHECK-NEXT: testq %rsi, %rsi
; CHECK-NEXT: js .LBB0_22
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_21: # %a35b
@@ -169,14 +169,14 @@ define void @foo(i32* %a0, i32* %a1, i32* %a2, i32* %a3, i32* %a4, i32* %a5) {
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_28: # %b1016
; CHECK-NEXT: # in Loop: Header=BB0_26 Depth=2
-; CHECK-NEXT: testq %rcx, %rcx
+; CHECK-NEXT: testq %rsi, %rsi
; CHECK-NEXT: jle .LBB0_6
; CHECK-NEXT: .LBB0_26: # %b858
; CHECK-NEXT: # Parent Loop BB0_3 Depth=1
; CHECK-NEXT: # => This Loop Header: Depth=2
; CHECK-NEXT: # Child Loop BB0_38 Depth 3
; CHECK-NEXT: # Child Loop BB0_29 Depth 3
-; CHECK-NEXT: testq %rsi, %rsi
+; CHECK-NEXT: testq %rdx, %rdx
; CHECK-NEXT: js .LBB0_27
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_38: # %a53b
@@ -194,38 +194,38 @@ define void @foo(i32* %a0, i32* %a1, i32* %a2, i32* %a3, i32* %a4, i32* %a5) {
; CHECK-NEXT: # Parent Loop BB0_3 Depth=1
; CHECK-NEXT: # Parent Loop BB0_26 Depth=2
; CHECK-NEXT: # => This Inner Loop Header: Depth=3
-; CHECK-NEXT: testq %rsi, %rsi
+; CHECK-NEXT: testq %rdx, %rdx
; CHECK-NEXT: jle .LBB0_29
; CHECK-NEXT: jmp .LBB0_28
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_32: # %b1263
; CHECK-NEXT: # in Loop: Header=BB0_30 Depth=2
-; CHECK-NEXT: testq %rsi, %rsi
+; CHECK-NEXT: testq %rdx, %rdx
; CHECK-NEXT: jle .LBB0_7
; CHECK-NEXT: .LBB0_30: # %b1117
; CHECK-NEXT: # Parent Loop BB0_3 Depth=1
; CHECK-NEXT: # => This Loop Header: Depth=2
; CHECK-NEXT: # Child Loop BB0_39 Depth 3
; CHECK-NEXT: # Child Loop BB0_33 Depth 3
-; CHECK-NEXT: testq %rcx, %rcx
+; CHECK-NEXT: testq %rsi, %rsi
; CHECK-NEXT: js .LBB0_31
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_39: # %a63b
; CHECK-NEXT: # Parent Loop BB0_3 Depth=1
; CHECK-NEXT: # Parent Loop BB0_30 Depth=2
; CHECK-NEXT: # => This Inner Loop Header: Depth=3
-; CHECK-NEXT: testq %rcx, %rcx
+; CHECK-NEXT: testq %rsi, %rsi
; CHECK-NEXT: jle .LBB0_39
; CHECK-NEXT: .LBB0_31: # %b1139
; CHECK-NEXT: # in Loop: Header=BB0_30 Depth=2
-; CHECK-NEXT: testq %rcx, %rcx
+; CHECK-NEXT: testq %rsi, %rsi
; CHECK-NEXT: jle .LBB0_32
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_33: # %a63b1266
; CHECK-NEXT: # Parent Loop BB0_3 Depth=1
; CHECK-NEXT: # Parent Loop BB0_30 Depth=2
; CHECK-NEXT: # => This Inner Loop Header: Depth=3
-; CHECK-NEXT: testq %rcx, %rcx
+; CHECK-NEXT: testq %rsi, %rsi
; CHECK-NEXT: jle .LBB0_33
; CHECK-NEXT: jmp .LBB0_32
; CHECK-NEXT: .p2align 4, 0x90
@@ -237,7 +237,7 @@ define void @foo(i32* %a0, i32* %a1, i32* %a2, i32* %a3, i32* %a4, i32* %a5) {
; CHECK-NEXT: # Parent Loop BB0_3 Depth=1
; CHECK-NEXT: # => This Loop Header: Depth=2
; CHECK-NEXT: # Child Loop BB0_24 Depth 3
-; CHECK-NEXT: testq %rsi, %rsi
+; CHECK-NEXT: testq %rdx, %rdx
; CHECK-NEXT: js .LBB0_25
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_24: # %a45b
diff --git a/llvm/test/CodeGen/X86/MachineSink-eflags.ll b/llvm/test/CodeGen/X86/MachineSink-eflags.ll
index 4e6041b9c7a9..5637e1fcb3e4 100644
--- a/llvm/test/CodeGen/X86/MachineSink-eflags.ll
+++ b/llvm/test/CodeGen/X86/MachineSink-eflags.ll
@@ -16,31 +16,30 @@ define void @foo(i8* nocapture %_stubArgs) nounwind {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $152, %rsp
; CHECK-NEXT: movq 48(%rdi), %rax
-; CHECK-NEXT: movl 64(%rdi), %edx
+; CHECK-NEXT: movl 64(%rdi), %ecx
; CHECK-NEXT: movl $200, %esi
; CHECK-NEXT: addl 68(%rdi), %esi
-; CHECK-NEXT: imull $46, %edx, %ecx
-; CHECK-NEXT: addq %rsi, %rcx
-; CHECK-NEXT: shlq $4, %rcx
-; CHECK-NEXT: imull $47, %edx, %edx
+; CHECK-NEXT: imull $46, %ecx, %edx
; CHECK-NEXT: addq %rsi, %rdx
; CHECK-NEXT: shlq $4, %rdx
-; CHECK-NEXT: movaps (%rax,%rdx), %xmm0
+; CHECK-NEXT: imull $47, %ecx, %ecx
+; CHECK-NEXT: addq %rsi, %rcx
+; CHECK-NEXT: shlq $4, %rcx
; CHECK-NEXT: cmpl $0, (%rdi)
; CHECK-NEXT: jne .LBB0_1
; CHECK-NEXT: # %bb.2: # %entry
-; CHECK-NEXT: xorps %xmm1, %xmm1
-; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: je .LBB0_4
-; CHECK-NEXT: jmp .LBB0_5
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: jmp .LBB0_3
; CHECK-NEXT: .LBB0_1:
+; CHECK-NEXT: movaps (%rax,%rdx), %xmm0
+; CHECK-NEXT: .LBB0_3: # %entry
; CHECK-NEXT: movaps (%rax,%rcx), %xmm1
-; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: jne .LBB0_5
-; CHECK-NEXT: .LBB0_4: # %entry
-; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: # %bb.4: # %entry
+; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: .LBB0_5: # %entry
-; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: addq $152, %rsp
; CHECK-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/avx2-masked-gather.ll b/llvm/test/CodeGen/X86/avx2-masked-gather.ll
index 62326113ed0d..9b3635fa1c9e 100644
--- a/llvm/test/CodeGen/X86/avx2-masked-gather.ll
+++ b/llvm/test/CodeGen/X86/avx2-masked-gather.ll
@@ -358,44 +358,57 @@ define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i3
;
; NOGATHER-LABEL: masked_gather_v8i32:
; NOGATHER: # %bb.0: # %entry
-; NOGATHER-NEXT: vmovdqa (%rdi), %ymm3
-; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm2
+; NOGATHER-NEXT: vmovdqa (%rdi), %ymm2
; NOGATHER-NEXT: vpsllw $15, %xmm0, %xmm0
; NOGATHER-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; NOGATHER-NEXT: vpmovmskb %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB6_2
; NOGATHER-NEXT: # %bb.1: # %cond.load
-; NOGATHER-NEXT: vmovq %xmm3, %rcx
+; NOGATHER-NEXT: vmovq %xmm2, %rcx
; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm1, %xmm0
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB6_2: # %else
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: je .LBB6_4
; NOGATHER-NEXT: # %bb.3: # %cond.load1
-; NOGATHER-NEXT: vpextrq $1, %xmm3, %rcx
+; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx
; NOGATHER-NEXT: vpinsrd $1, (%rcx), %xmm1, %xmm0
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB6_4: # %else2
-; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm0
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
; NOGATHER-NEXT: testb $4, %al
-; NOGATHER-NEXT: jne .LBB6_5
-; NOGATHER-NEXT: # %bb.6: # %else5
+; NOGATHER-NEXT: je .LBB6_6
+; NOGATHER-NEXT: # %bb.5: # %cond.load4
+; NOGATHER-NEXT: vmovq %xmm0, %rcx
+; NOGATHER-NEXT: vpinsrd $2, (%rcx), %xmm1, %xmm2
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; NOGATHER-NEXT: .LBB6_6: # %else5
; NOGATHER-NEXT: testb $8, %al
-; NOGATHER-NEXT: jne .LBB6_7
+; NOGATHER-NEXT: je .LBB6_8
+; NOGATHER-NEXT: # %bb.7: # %cond.load7
+; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx
+; NOGATHER-NEXT: vpinsrd $3, (%rcx), %xmm1, %xmm0
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB6_8: # %else8
+; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm0
; NOGATHER-NEXT: testb $16, %al
-; NOGATHER-NEXT: jne .LBB6_9
+; NOGATHER-NEXT: je .LBB6_10
+; NOGATHER-NEXT: # %bb.9: # %cond.load10
+; NOGATHER-NEXT: vmovq %xmm0, %rcx
+; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
+; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2
+; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB6_10: # %else11
; NOGATHER-NEXT: testb $32, %al
; NOGATHER-NEXT: je .LBB6_12
-; NOGATHER-NEXT: .LBB6_11: # %cond.load13
-; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx
-; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
-; NOGATHER-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0
-; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; NOGATHER-NEXT: # %bb.11: # %cond.load13
+; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx
+; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
+; NOGATHER-NEXT: vpinsrd $1, (%rcx), %xmm2, %xmm2
+; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB6_12: # %else14
-; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
+; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
; NOGATHER-NEXT: testb $64, %al
; NOGATHER-NEXT: jne .LBB6_13
; NOGATHER-NEXT: # %bb.14: # %else17
@@ -404,26 +417,6 @@ define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i3
; NOGATHER-NEXT: .LBB6_16: # %else20
; NOGATHER-NEXT: vmovaps %ymm1, %ymm0
; NOGATHER-NEXT: retq
-; NOGATHER-NEXT: .LBB6_5: # %cond.load4
-; NOGATHER-NEXT: vmovq %xmm0, %rcx
-; NOGATHER-NEXT: vpinsrd $2, (%rcx), %xmm1, %xmm3
-; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; NOGATHER-NEXT: testb $8, %al
-; NOGATHER-NEXT: je .LBB6_8
-; NOGATHER-NEXT: .LBB6_7: # %cond.load7
-; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx
-; NOGATHER-NEXT: vpinsrd $3, (%rcx), %xmm1, %xmm0
-; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; NOGATHER-NEXT: testb $16, %al
-; NOGATHER-NEXT: je .LBB6_10
-; NOGATHER-NEXT: .LBB6_9: # %cond.load10
-; NOGATHER-NEXT: vmovq %xmm2, %rcx
-; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
-; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm0, %xmm0
-; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; NOGATHER-NEXT: testb $32, %al
-; NOGATHER-NEXT: jne .LBB6_11
-; NOGATHER-NEXT: jmp .LBB6_12
; NOGATHER-NEXT: .LBB6_13: # %cond.load16
; NOGATHER-NEXT: vmovq %xmm0, %rcx
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
@@ -472,44 +465,58 @@ define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <
;
; NOGATHER-LABEL: masked_gather_v8float:
; NOGATHER: # %bb.0: # %entry
-; NOGATHER-NEXT: vmovdqa (%rdi), %ymm3
-; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm2
+; NOGATHER-NEXT: vmovdqa (%rdi), %ymm2
; NOGATHER-NEXT: vpsllw $15, %xmm0, %xmm0
; NOGATHER-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; NOGATHER-NEXT: vpmovmskb %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB7_2
; NOGATHER-NEXT: # %bb.1: # %cond.load
-; NOGATHER-NEXT: vmovq %xmm3, %rcx
+; NOGATHER-NEXT: vmovq %xmm2, %rcx
; NOGATHER-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; NOGATHER-NEXT: .LBB7_2: # %else
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: je .LBB7_4
; NOGATHER-NEXT: # %bb.3: # %cond.load1
-; NOGATHER-NEXT: vpextrq $1, %xmm3, %rcx
+; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx
; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],mem[0],xmm1[2,3]
; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB7_4: # %else2
-; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm0
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
; NOGATHER-NEXT: testb $4, %al
-; NOGATHER-NEXT: jne .LBB7_5
-; NOGATHER-NEXT: # %bb.6: # %else5
+; NOGATHER-NEXT: je .LBB7_6
+; NOGATHER-NEXT: # %bb.5: # %cond.load4
+; NOGATHER-NEXT: vmovq %xmm0, %rcx
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm1[0,1],mem[0],xmm1[3]
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; NOGATHER-NEXT: .LBB7_6: # %else5
; NOGATHER-NEXT: testb $8, %al
-; NOGATHER-NEXT: jne .LBB7_7
+; NOGATHER-NEXT: je .LBB7_8
+; NOGATHER-NEXT: # %bb.7: # %cond.load7
+; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],mem[0]
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB7_8: # %else8
+; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm0
; NOGATHER-NEXT: testb $16, %al
-; NOGATHER-NEXT: jne .LBB7_9
+; NOGATHER-NEXT: je .LBB7_10
+; NOGATHER-NEXT: # %bb.9: # %cond.load10
+; NOGATHER-NEXT: vmovq %xmm0, %rcx
+; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
+; NOGATHER-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; NOGATHER-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
+; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB7_10: # %else11
; NOGATHER-NEXT: testb $32, %al
; NOGATHER-NEXT: je .LBB7_12
-; NOGATHER-NEXT: .LBB7_11: # %cond.load13
-; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx
-; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
-; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; NOGATHER-NEXT: # %bb.11: # %cond.load13
+; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx
+; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB7_12: # %else14
-; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
+; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
; NOGATHER-NEXT: testb $64, %al
; NOGATHER-NEXT: jne .LBB7_13
; NOGATHER-NEXT: # %bb.14: # %else17
@@ -518,27 +525,6 @@ define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <
; NOGATHER-NEXT: .LBB7_16: # %else20
; NOGATHER-NEXT: vmovaps %ymm1, %ymm0
; NOGATHER-NEXT: retq
-; NOGATHER-NEXT: .LBB7_5: # %cond.load4
-; NOGATHER-NEXT: vmovq %xmm0, %rcx
-; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1],mem[0],xmm1[3]
-; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; NOGATHER-NEXT: testb $8, %al
-; NOGATHER-NEXT: je .LBB7_8
-; NOGATHER-NEXT: .LBB7_7: # %cond.load7
-; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx
-; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],mem[0]
-; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; NOGATHER-NEXT: testb $16, %al
-; NOGATHER-NEXT: je .LBB7_10
-; NOGATHER-NEXT: .LBB7_9: # %cond.load10
-; NOGATHER-NEXT: vmovq %xmm2, %rcx
-; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
-; NOGATHER-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; NOGATHER-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
-; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; NOGATHER-NEXT: testb $32, %al
-; NOGATHER-NEXT: jne .LBB7_11
-; NOGATHER-NEXT: jmp .LBB7_12
; NOGATHER-NEXT: .LBB7_13: # %cond.load16
; NOGATHER-NEXT: vmovq %xmm0, %rcx
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2
diff --git a/llvm/test/CodeGen/X86/cmovcmov.ll b/llvm/test/CodeGen/X86/cmovcmov.ll
index 7afc8bf731dc..64d83474cf0c 100644
--- a/llvm/test/CodeGen/X86/cmovcmov.ll
+++ b/llvm/test/CodeGen/X86/cmovcmov.ll
@@ -165,14 +165,13 @@ define <4 x i32> @test_select_fcmp_oeq_v4i32(float %a, float %b, <4 x i32> %c, <
; NOCMOV-NEXT: fnstsw %ax
; NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax
; NOCMOV-NEXT: sahf
-; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %eax
+; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %ecx
; NOCMOV-NEXT: jne .LBB4_3
; NOCMOV-NEXT: # %bb.1: # %entry
; NOCMOV-NEXT: jp .LBB4_3
; NOCMOV-NEXT: # %bb.2: # %entry
-; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %eax
+; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %ecx
; NOCMOV-NEXT: .LBB4_3: # %entry
-; NOCMOV-NEXT: movl (%eax), %ecx
; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %edx
; NOCMOV-NEXT: jne .LBB4_6
; NOCMOV-NEXT: # %bb.4: # %entry
@@ -181,7 +180,6 @@ define <4 x i32> @test_select_fcmp_oeq_v4i32(float %a, float %b, <4 x i32> %c, <
; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %edx
; NOCMOV-NEXT: .LBB4_6: # %entry
; NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT: movl (%edx), %edx
; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %esi
; NOCMOV-NEXT: jne .LBB4_9
; NOCMOV-NEXT: # %bb.7: # %entry
@@ -189,6 +187,8 @@ define <4 x i32> @test_select_fcmp_oeq_v4i32(float %a, float %b, <4 x i32> %c, <
; NOCMOV-NEXT: # %bb.8: # %entry
; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %esi
; NOCMOV-NEXT: .LBB4_9: # %entry
+; NOCMOV-NEXT: movl (%ecx), %ecx
+; NOCMOV-NEXT: movl (%edx), %edx
; NOCMOV-NEXT: movl (%esi), %esi
; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %edi
; NOCMOV-NEXT: jne .LBB4_12
diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll
index 7c46c5982bfd..ebd5c5495a57 100644
--- a/llvm/test/CodeGen/X86/select.ll
+++ b/llvm/test/CodeGen/X86/select.ll
@@ -557,63 +557,59 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
; MCU-NEXT: testb $1, %al
; MCU-NEXT: jne .LBB7_1
; MCU-NEXT: # %bb.2:
-; MCU-NEXT: leal {{[0-9]+}}(%esp), %eax
-; MCU-NEXT: movl (%eax), %eax
+; MCU-NEXT: leal {{[0-9]+}}(%esp), %edi
; MCU-NEXT: je .LBB7_5
; MCU-NEXT: .LBB7_4:
; MCU-NEXT: leal {{[0-9]+}}(%esp), %ecx
-; MCU-NEXT: movl (%ecx), %ecx
; MCU-NEXT: je .LBB7_8
; MCU-NEXT: .LBB7_7:
; MCU-NEXT: leal {{[0-9]+}}(%esp), %esi
-; MCU-NEXT: movl (%esi), %esi
; MCU-NEXT: je .LBB7_11
; MCU-NEXT: .LBB7_10:
-; MCU-NEXT: leal {{[0-9]+}}(%esp), %edi
-; MCU-NEXT: movl (%edi), %edi
+; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp
; MCU-NEXT: je .LBB7_14
; MCU-NEXT: .LBB7_13:
-; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebx
-; MCU-NEXT: movl (%ebx), %ebx
-; MCU-NEXT: je .LBB7_17
-; MCU-NEXT: .LBB7_16:
-; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp
-; MCU-NEXT: jmp .LBB7_18
-; MCU-NEXT: .LBB7_1:
; MCU-NEXT: leal {{[0-9]+}}(%esp), %eax
-; MCU-NEXT: movl (%eax), %eax
+; MCU-NEXT: jmp .LBB7_15
+; MCU-NEXT: .LBB7_1:
+; MCU-NEXT: leal {{[0-9]+}}(%esp), %edi
; MCU-NEXT: jne .LBB7_4
; MCU-NEXT: .LBB7_5:
; MCU-NEXT: leal {{[0-9]+}}(%esp), %ecx
-; MCU-NEXT: movl (%ecx), %ecx
; MCU-NEXT: jne .LBB7_7
; MCU-NEXT: .LBB7_8:
; MCU-NEXT: leal {{[0-9]+}}(%esp), %esi
-; MCU-NEXT: movl (%esi), %esi
; MCU-NEXT: jne .LBB7_10
; MCU-NEXT: .LBB7_11:
-; MCU-NEXT: leal {{[0-9]+}}(%esp), %edi
-; MCU-NEXT: movl (%edi), %edi
+; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp
; MCU-NEXT: jne .LBB7_13
; MCU-NEXT: .LBB7_14:
-; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebx
-; MCU-NEXT: movl (%ebx), %ebx
+; MCU-NEXT: leal {{[0-9]+}}(%esp), %eax
+; MCU-NEXT: .LBB7_15:
+; MCU-NEXT: movl (%edi), %ebx
+; MCU-NEXT: movl (%ecx), %edi
+; MCU-NEXT: movl (%esi), %esi
+; MCU-NEXT: movl (%ebp), %ecx
+; MCU-NEXT: movl (%eax), %eax
; MCU-NEXT: jne .LBB7_16
-; MCU-NEXT: .LBB7_17:
+; MCU-NEXT: # %bb.17:
+; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp
+; MCU-NEXT: jmp .LBB7_18
+; MCU-NEXT: .LBB7_16:
; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp
; MCU-NEXT: .LBB7_18:
; MCU-NEXT: movl (%ebp), %ebp
; MCU-NEXT: decl %ebp
-; MCU-NEXT: decl %ebx
-; MCU-NEXT: decl %edi
-; MCU-NEXT: decl %esi
-; MCU-NEXT: decl %ecx
; MCU-NEXT: decl %eax
-; MCU-NEXT: movl %eax, 20(%edx)
-; MCU-NEXT: movl %ecx, 16(%edx)
+; MCU-NEXT: decl %ecx
+; MCU-NEXT: decl %esi
+; MCU-NEXT: decl %edi
+; MCU-NEXT: decl %ebx
+; MCU-NEXT: movl %ebx, 20(%edx)
+; MCU-NEXT: movl %edi, 16(%edx)
; MCU-NEXT: movl %esi, 12(%edx)
-; MCU-NEXT: movl %edi, 8(%edx)
-; MCU-NEXT: movl %ebx, 4(%edx)
+; MCU-NEXT: movl %ecx, 8(%edx)
+; MCU-NEXT: movl %eax, 4(%edx)
; MCU-NEXT: movl %ebp, (%edx)
; MCU-NEXT: popl %esi
; MCU-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index e31d71f8010e..d48cd14b092e 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -4361,7 +4361,6 @@ define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) {
define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE2-LABEL: uitofp_load_4i64_to_4f32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm2
; SSE2-NEXT: movdqa 16(%rdi), %xmm0
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
@@ -4377,6 +4376,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm1
; SSE2-NEXT: .LBB83_3:
+; SSE2-NEXT: movdqa (%rdi), %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
@@ -4710,40 +4710,38 @@ define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) {
define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-LABEL: uitofp_load_8i64_to_8f32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm5
; SSE2-NEXT: movdqa 16(%rdi), %xmm0
-; SSE2-NEXT: movdqa 32(%rdi), %xmm2
-; SSE2-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_1
; SSE2-NEXT: # %bb.2:
-; SSE2-NEXT: cvtsi2ss %rax, %xmm3
+; SSE2-NEXT: cvtsi2ss %rax, %xmm2
; SSE2-NEXT: jmp .LBB87_3
; SSE2-NEXT: .LBB87_1:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: cvtsi2ss %rax, %xmm3
-; SSE2-NEXT: addss %xmm3, %xmm3
+; SSE2-NEXT: cvtsi2ss %rax, %xmm2
+; SSE2-NEXT: addss %xmm2, %xmm2
; SSE2-NEXT: .LBB87_3:
+; SSE2-NEXT: movdqa (%rdi), %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_4
; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: cvtsi2ss %rax, %xmm4
+; SSE2-NEXT: cvtsi2ss %rax, %xmm1
; SSE2-NEXT: jmp .LBB87_6
; SSE2-NEXT: .LBB87_4:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: cvtsi2ss %rax, %xmm4
-; SSE2-NEXT: addss %xmm4, %xmm4
+; SSE2-NEXT: cvtsi2ss %rax, %xmm1
+; SSE2-NEXT: addss %xmm1, %xmm1
; SSE2-NEXT: .LBB87_6:
-; SSE2-NEXT: movq %xmm5, %rax
+; SSE2-NEXT: movq %xmm3, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_7
; SSE2-NEXT: # %bb.8:
@@ -4759,55 +4757,59 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
; SSE2-NEXT: addss %xmm0, %xmm0
; SSE2-NEXT: .LBB87_9:
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
-; SSE2-NEXT: movq %xmm5, %rax
+; SSE2-NEXT: movdqa 48(%rdi), %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; SSE2-NEXT: movq %xmm3, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_10
; SSE2-NEXT: # %bb.11:
-; SSE2-NEXT: cvtsi2ss %rax, %xmm6
+; SSE2-NEXT: cvtsi2ss %rax, %xmm4
; SSE2-NEXT: jmp .LBB87_12
; SSE2-NEXT: .LBB87_10:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: cvtsi2ss %rax, %xmm6
-; SSE2-NEXT: addss %xmm6, %xmm6
+; SSE2-NEXT: cvtsi2ss %rax, %xmm4
+; SSE2-NEXT: addss %xmm4, %xmm4
; SSE2-NEXT: .LBB87_12:
-; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: movq %xmm6, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_13
; SSE2-NEXT: # %bb.14:
-; SSE2-NEXT: xorps %xmm5, %xmm5
-; SSE2-NEXT: cvtsi2ss %rax, %xmm5
+; SSE2-NEXT: xorps %xmm3, %xmm3
+; SSE2-NEXT: cvtsi2ss %rax, %xmm3
; SSE2-NEXT: jmp .LBB87_15
; SSE2-NEXT: .LBB87_13:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: xorps %xmm5, %xmm5
-; SSE2-NEXT: cvtsi2ss %rax, %xmm5
-; SSE2-NEXT: addss %xmm5, %xmm5
+; SSE2-NEXT: xorps %xmm3, %xmm3
+; SSE2-NEXT: cvtsi2ss %rax, %xmm3
+; SSE2-NEXT: addss %xmm3, %xmm3
; SSE2-NEXT: .LBB87_15:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: movdqa 32(%rdi), %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
+; SSE2-NEXT: movq %xmm6, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_16
; SSE2-NEXT: # %bb.17:
-; SSE2-NEXT: cvtsi2ss %rax, %xmm7
+; SSE2-NEXT: xorps %xmm6, %xmm6
+; SSE2-NEXT: cvtsi2ss %rax, %xmm6
; SSE2-NEXT: jmp .LBB87_18
; SSE2-NEXT: .LBB87_16:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: cvtsi2ss %rax, %xmm7
-; SSE2-NEXT: addss %xmm7, %xmm7
+; SSE2-NEXT: xorps %xmm6, %xmm6
+; SSE2-NEXT: cvtsi2ss %rax, %xmm6
+; SSE2-NEXT: addss %xmm6, %xmm6
; SSE2-NEXT: .LBB87_18:
-; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
-; SSE2-NEXT: movq %xmm2, %rax
+; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT: movq %xmm5, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_19
; SSE2-NEXT: # %bb.20:
@@ -4823,9 +4825,9 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm1
; SSE2-NEXT: .LBB87_21:
-; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB87_22
@@ -4843,7 +4845,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: addss %xmm2, %xmm2
; SSE2-NEXT: .LBB87_24:
; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: uitofp_load_8i64_to_8f32: