[llvm] 628467e - [MachineCSE] Allow PRE of instructions that read physical registers

John Brawn via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 27 06:15:24 PDT 2022


Author: John Brawn
Date: 2022-10-27T14:14:57+01:00
New Revision: 628467e53f4ceecd2b5f0797f07591c66d9d9d2a

URL: https://github.com/llvm/llvm-project/commit/628467e53f4ceecd2b5f0797f07591c66d9d9d2a
DIFF: https://github.com/llvm/llvm-project/commit/628467e53f4ceecd2b5f0797f07591c66d9d9d2a.diff

LOG: [MachineCSE] Allow PRE of instructions that read physical registers

Currently MachineCSE forbids PRE when the instruction reads a physical
register. Relax this so that it's allowed when the value being read is
the same as what would be read in the place the instruction would be
hoisted to.

This is being done in preparation for adding FPCR handling to the
AArch64 backend, in order to prevent it from worsening the
generated code, but for targets that already have a similar register
it should improve things.

This patch affects code generation in several tests. The new code
looks better except for in Thumb2/LowOverheadLoops/memcall.ll where
we perform PRE but the LowOverheadLoops transformation then undoes
it. Also in AMDGPU/selectcc-opt.ll the CHECK makes things look worse,
but actually the function as a whole is better (as a MOV is PRE'd).

Differential Revision: https://reviews.llvm.org/D136675

Added: 
    llvm/test/CodeGen/PowerPC/machine-cse-rm-pre.mir

Modified: 
    llvm/lib/CodeGen/MachineCSE.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
    llvm/test/CodeGen/AMDGPU/selectcc-opt.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp
index 2de5879e26b09..6b5320b71d9a7 100644
--- a/llvm/lib/CodeGen/MachineCSE.cpp
+++ b/llvm/lib/CodeGen/MachineCSE.cpp
@@ -145,7 +145,7 @@ namespace {
                          DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren);
     bool PerformCSE(MachineDomTreeNode *Node);
 
-    bool isPRECandidate(MachineInstr *MI);
+    bool isPRECandidate(MachineInstr *MI, SmallSet<MCRegister, 8> &PhysRefs);
     bool ProcessBlockPRE(MachineDominatorTree *MDT, MachineBasicBlock *MBB);
     bool PerformSimplePRE(MachineDominatorTree *DT);
     /// Heuristics to see if it's profitable to move common computations of MBB
@@ -798,7 +798,8 @@ bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) {
 // We use stronger checks for PRE candidate rather than for CSE ones to embrace
 // checks inside ProcessBlockCSE(), not only inside isCSECandidate(). This helps
 // to exclude instrs created by PRE that won't be CSEed later.
-bool MachineCSE::isPRECandidate(MachineInstr *MI) {
+bool MachineCSE::isPRECandidate(MachineInstr *MI,
+                                SmallSet<MCRegister, 8> &PhysRefs) {
   if (!isCSECandidate(MI) ||
       MI->isNotDuplicable() ||
       MI->mayLoad() ||
@@ -813,7 +814,7 @@ bool MachineCSE::isPRECandidate(MachineInstr *MI) {
 
   for (const auto &use : MI->uses())
     if (use.isReg() && !Register::isVirtualRegister(use.getReg()))
-      return false;
+      PhysRefs.insert(use.getReg());
 
   return true;
 }
@@ -822,7 +823,8 @@ bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT,
                                  MachineBasicBlock *MBB) {
   bool Changed = false;
   for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) {
-    if (!isPRECandidate(&MI))
+    SmallSet<MCRegister, 8> PhysRefs;
+    if (!isPRECandidate(&MI, PhysRefs))
       continue;
 
     if (!PREMap.count(&MI)) {
@@ -858,6 +860,15 @@ bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT,
         if (MI.isConvergent() && CMBB != MBB)
           continue;
 
+        // If this instruction uses physical registers then we can only do PRE
+        // if it's using the value that is live at the place we're hoisting to.
+        bool NonLocal;
+        PhysDefVector PhysDefs;
+        if (!PhysRefs.empty() &&
+            !PhysRegDefsReach(&*(CMBB->getFirstTerminator()), &MI, PhysRefs,
+                              PhysDefs, NonLocal))
+          continue;
+
         assert(MI.getOperand(0).isDef() &&
                "First operand of instr with one explicit def must be this def");
         Register VReg = MI.getOperand(0).getReg();

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll
index a6a3237ee929f..d6c675a636e9a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll
@@ -22,8 +22,6 @@ define amdgpu_kernel void @dynamic_shared_array_0(float addrspace(1)* %out) {
 }
 
 ; CHECK-LABEL: {{^}}dynamic_shared_array_1:
-; CHECK: v_lshlrev_b32_e32 {{v[0-9]+}}, 2, {{v[0-9]+}}
-; CHECK: v_lshlrev_b32_e32 {{v[0-9]+}}, 2, {{v[0-9]+}}
 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
 ; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0xc00, [[IDX]]
 define amdgpu_kernel void @dynamic_shared_array_1(float addrspace(1)* %out, i32 %cond) {

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 346b7d2deb18b..4ddd0c6583104 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -191,131 +191,131 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    s_mov_b32 s5, -1
 ; CHECK-NEXT:    s_and_b64 s[6:7], s[6:7], s[4:5]
 ; CHECK-NEXT:    v_cmp_ne_u64_e64 vcc, s[6:7], 0
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s2
 ; CHECK-NEXT:    s_cbranch_vccz .LBB1_2
 ; CHECK-NEXT:  ; %bb.1:
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s3
+; CHECK-NEXT:    v_mov_b32_e32 v0, s3
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; CHECK-NEXT:    s_sub_u32 s4, 0, s2
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v2
+; CHECK-NEXT:    v_madmk_f32 v1, v1, 0x4f800000, v2
 ; CHECK-NEXT:    s_subb_u32 s5, 0, s3
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; CHECK-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; CHECK-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
-; CHECK-NEXT:    v_trunc_f32_e32 v2, v2
-; CHECK-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT:    v_mul_lo_u32 v4, s4, v2
-; CHECK-NEXT:    v_mul_lo_u32 v5, s4, v0
-; CHECK-NEXT:    v_mul_lo_u32 v6, s5, v0
-; CHECK-NEXT:    v_mul_hi_u32 v7, s4, v0
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT:    v_mul_lo_u32 v6, v2, v5
-; CHECK-NEXT:    v_mul_hi_u32 v8, v0, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v2, v5
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v4
-; CHECK-NEXT:    v_mul_lo_u32 v9, v2, v4
-; CHECK-NEXT:    v_mul_hi_u32 v10, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v2, v4
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v1
+; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
+; CHECK-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v4
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_lo_u32 v5, s4, v4
+; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v1
+; CHECK-NEXT:    v_mul_lo_u32 v7, s5, v1
+; CHECK-NEXT:    v_mul_hi_u32 v8, s4, v1
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; CHECK-NEXT:    v_mul_lo_u32 v7, v4, v6
+; CHECK-NEXT:    v_mul_hi_u32 v9, v1, v6
+; CHECK-NEXT:    v_mul_hi_u32 v6, v4, v6
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; CHECK-NEXT:    v_mul_lo_u32 v8, v1, v5
+; CHECK-NEXT:    v_mul_lo_u32 v10, v4, v5
+; CHECK-NEXT:    v_mul_hi_u32 v11, v1, v5
+; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v10, v9
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v5, s4, v1
+; CHECK-NEXT:    v_mul_lo_u32 v6, s5, v1
+; CHECK-NEXT:    v_mul_hi_u32 v7, s4, v1
+; CHECK-NEXT:    v_mul_lo_u32 v8, s4, v4
+; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
+; CHECK-NEXT:    v_mul_hi_u32 v10, v1, v5
+; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v6
+; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v6
+; CHECK-NEXT:    v_mul_hi_u32 v11, v1, v6
+; CHECK-NEXT:    v_mul_hi_u32 v6, v4, v6
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v8
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v4, s4, v0
-; CHECK-NEXT:    v_mul_lo_u32 v5, s5, v0
-; CHECK-NEXT:    v_mul_hi_u32 v6, s4, v0
-; CHECK-NEXT:    v_mul_lo_u32 v7, s4, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v4
-; CHECK-NEXT:    v_mul_hi_u32 v9, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v2, v4
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT:    v_mul_lo_u32 v6, v0, v5
-; CHECK-NEXT:    v_mul_lo_u32 v7, v2, v5
-; CHECK-NEXT:    v_mul_hi_u32 v10, v0, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v2, v5
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v2, v5, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v4, s1, v0
-; CHECK-NEXT:    v_mul_hi_u32 v5, s0, v0
-; CHECK-NEXT:    v_mul_hi_u32 v0, s1, v0
-; CHECK-NEXT:    v_mul_lo_u32 v6, s0, v2
-; CHECK-NEXT:    v_mul_lo_u32 v7, s1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v8, s0, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, s1, v2
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v5, s1, v1
+; CHECK-NEXT:    v_mul_hi_u32 v6, s0, v1
+; CHECK-NEXT:    v_mul_hi_u32 v1, s1, v1
+; CHECK-NEXT:    v_mul_lo_u32 v7, s0, v4
+; CHECK-NEXT:    v_mul_lo_u32 v8, s1, v4
+; CHECK-NEXT:    v_mul_hi_u32 v9, s0, v4
+; CHECK-NEXT:    v_mul_hi_u32 v4, s1, v4
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT:    v_mul_lo_u32 v5, s2, v0
-; CHECK-NEXT:    v_mul_lo_u32 v6, s3, v0
-; CHECK-NEXT:    v_mul_hi_u32 v7, s2, v0
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, 1, v0
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, 1, v8
-; CHECK-NEXT:    v_mul_lo_u32 v2, s2, v2
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
-; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, s0, v5
-; CHECK-NEXT:    v_subb_u32_e64 v3, s[4:5], v3, v2, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v2, s[4:5], s1, v2
-; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s2, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s3, v3
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT:    v_mul_lo_u32 v6, s2, v1
+; CHECK-NEXT:    v_mul_lo_u32 v7, s3, v1
+; CHECK-NEXT:    v_mul_hi_u32 v8, s2, v1
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, 1, v1
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, 1, v9
+; CHECK-NEXT:    v_mul_lo_u32 v4, s2, v4
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, s0, v6
+; CHECK-NEXT:    v_subb_u32_e64 v3, s[4:5], v3, v4, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v4, s[4:5], s1, v4
+; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s2, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s3, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CHECK-NEXT:    v_subb_u32_e32 v0, vcc, v4, v0, vcc
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v7, v6, vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v5
-; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
-; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v1
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v8, v4, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v8, v7, vcc
+; CHECK-NEXT:    v_subrev_i32_e32 v4, vcc, s2, v6
+; CHECK-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s2, v4
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v9, v5, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; CHECK-NEXT:    s_mov_b32 s5, 0
 ; CHECK-NEXT:    s_branch .LBB1_3
 ; CHECK-NEXT:  .LBB1_2:
@@ -326,9 +326,8 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB1_5
 ; CHECK-NEXT:  ; %bb.4:
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v2
 ; CHECK-NEXT:    s_sub_i32 s1, 0, s2
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; CHECK-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; CHECK-NEXT:    v_mul_lo_u32 v1, s1, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index f2ad41481eca4..3fd860ab72e39 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -188,130 +188,130 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    s_mov_b32 s5, -1
 ; CHECK-NEXT:    s_and_b64 s[6:7], s[6:7], s[4:5]
 ; CHECK-NEXT:    v_cmp_ne_u64_e64 vcc, s[6:7], 0
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s2
 ; CHECK-NEXT:    s_cbranch_vccz .LBB1_2
 ; CHECK-NEXT:  ; %bb.1:
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s3
+; CHECK-NEXT:    v_mov_b32_e32 v0, s3
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; CHECK-NEXT:    s_sub_u32 s4, 0, s2
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v2
+; CHECK-NEXT:    v_madmk_f32 v1, v1, 0x4f800000, v2
 ; CHECK-NEXT:    s_subb_u32 s5, 0, s3
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; CHECK-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; CHECK-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
-; CHECK-NEXT:    v_trunc_f32_e32 v2, v2
-; CHECK-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT:    v_mul_lo_u32 v4, s4, v2
-; CHECK-NEXT:    v_mul_lo_u32 v5, s4, v0
-; CHECK-NEXT:    v_mul_lo_u32 v6, s5, v0
-; CHECK-NEXT:    v_mul_hi_u32 v7, s4, v0
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT:    v_mul_lo_u32 v6, v2, v5
-; CHECK-NEXT:    v_mul_hi_u32 v8, v0, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v2, v5
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v4
-; CHECK-NEXT:    v_mul_lo_u32 v9, v2, v4
-; CHECK-NEXT:    v_mul_hi_u32 v10, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v2, v4
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v1
+; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
+; CHECK-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v4
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_lo_u32 v5, s4, v4
+; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v1
+; CHECK-NEXT:    v_mul_lo_u32 v7, s5, v1
+; CHECK-NEXT:    v_mul_hi_u32 v8, s4, v1
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; CHECK-NEXT:    v_mul_lo_u32 v7, v4, v6
+; CHECK-NEXT:    v_mul_hi_u32 v9, v1, v6
+; CHECK-NEXT:    v_mul_hi_u32 v6, v4, v6
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; CHECK-NEXT:    v_mul_lo_u32 v8, v1, v5
+; CHECK-NEXT:    v_mul_lo_u32 v10, v4, v5
+; CHECK-NEXT:    v_mul_hi_u32 v11, v1, v5
+; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v10, v9
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v5, s4, v1
+; CHECK-NEXT:    v_mul_lo_u32 v6, s5, v1
+; CHECK-NEXT:    v_mul_hi_u32 v7, s4, v1
+; CHECK-NEXT:    v_mul_lo_u32 v8, s4, v4
+; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
+; CHECK-NEXT:    v_mul_hi_u32 v10, v1, v5
+; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v6
+; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v6
+; CHECK-NEXT:    v_mul_hi_u32 v11, v1, v6
+; CHECK-NEXT:    v_mul_hi_u32 v6, v4, v6
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v8
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v4, s4, v0
-; CHECK-NEXT:    v_mul_lo_u32 v5, s5, v0
-; CHECK-NEXT:    v_mul_hi_u32 v6, s4, v0
-; CHECK-NEXT:    v_mul_lo_u32 v7, s4, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v4
-; CHECK-NEXT:    v_mul_hi_u32 v9, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v2, v4
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT:    v_mul_lo_u32 v6, v0, v5
-; CHECK-NEXT:    v_mul_lo_u32 v7, v2, v5
-; CHECK-NEXT:    v_mul_hi_u32 v10, v0, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v2, v5
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v2, v5, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v4, s1, v0
-; CHECK-NEXT:    v_mul_hi_u32 v5, s0, v0
-; CHECK-NEXT:    v_mul_hi_u32 v0, s1, v0
-; CHECK-NEXT:    v_mul_lo_u32 v6, s0, v2
-; CHECK-NEXT:    v_mul_lo_u32 v7, s1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v8, s0, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, s1, v2
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v5, s1, v1
+; CHECK-NEXT:    v_mul_hi_u32 v6, s0, v1
+; CHECK-NEXT:    v_mul_hi_u32 v1, s1, v1
+; CHECK-NEXT:    v_mul_lo_u32 v7, s0, v4
+; CHECK-NEXT:    v_mul_lo_u32 v8, s1, v4
+; CHECK-NEXT:    v_mul_hi_u32 v9, s0, v4
+; CHECK-NEXT:    v_mul_hi_u32 v4, s1, v4
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT:    v_mul_lo_u32 v5, s2, v0
-; CHECK-NEXT:    v_mul_lo_u32 v6, s3, v0
-; CHECK-NEXT:    v_mul_hi_u32 v0, s2, v0
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; CHECK-NEXT:    v_mul_lo_u32 v2, s2, v2
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, s0, v5
-; CHECK-NEXT:    v_subb_u32_e64 v3, s[4:5], v3, v0, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v0, s[4:5], s1, v0
-; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s2, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s3, v3
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT:    v_mul_lo_u32 v6, s2, v1
+; CHECK-NEXT:    v_mul_lo_u32 v7, s3, v1
+; CHECK-NEXT:    v_mul_hi_u32 v1, s2, v1
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; CHECK-NEXT:    v_mul_lo_u32 v4, s2, v4
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
+; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, s0, v6
+; CHECK-NEXT:    v_subb_u32_e64 v3, s[4:5], v3, v1, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], s1, v1
+; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s2, v4
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; CHECK-NEXT:    v_subb_u32_e32 v0, vcc, v0, v1, vcc
+; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s3, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; CHECK-NEXT:    v_subb_u32_e32 v0, vcc, v1, v0, vcc
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc
+; CHECK-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v4
 ; CHECK-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
 ; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
-; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v5, vcc, s2, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; CHECK-NEXT:    v_subrev_i32_e32 v6, vcc, s2, v3
 ; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v7, v5, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v3, v6, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; CHECK-NEXT:    s_mov_b32 s5, 0
 ; CHECK-NEXT:    s_branch .LBB1_3
 ; CHECK-NEXT:  .LBB1_2:
@@ -322,9 +322,8 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB1_5
 ; CHECK-NEXT:  ; %bb.4:
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v2
 ; CHECK-NEXT:    s_sub_i32 s1, 0, s2
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; CHECK-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; CHECK-NEXT:    v_mul_lo_u32 v1, s1, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll b/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll
index 8e9409188daad..563d86daa55cb 100644
--- a/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll
@@ -33,6 +33,7 @@ ENDIF:
 
 ; EG-LABEL: {{^}}test_b:
 ; EG: SET{{[GTEQN]+}}_DX10
+; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
 ; EG-NEXT: PRED_
 ; EG-NEXT: ALU clause starting
 define amdgpu_kernel void @test_b(i32 addrspace(1)* %out, float %in) {

diff  --git a/llvm/test/CodeGen/PowerPC/machine-cse-rm-pre.mir b/llvm/test/CodeGen/PowerPC/machine-cse-rm-pre.mir
new file mode 100644
index 0000000000000..36484be012362
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/machine-cse-rm-pre.mir
@@ -0,0 +1,173 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=powerpc-unknown-unknown -run-pass=machine-cse -verify-machineinstrs | FileCheck %s
+--- |
+  define void @can_pre() {
+  entry:
+    br label %for.body
+
+  for.body:
+    br i1 undef, label %if.then, label %if.else
+
+  if.then:
+    br label %if.end
+
+  if.else:
+    br label %if.end
+
+  if.end:
+    br label %for.body
+  }
+
+  define void @cannot_pre() {
+  entry:
+    br label %for.body
+
+  for.body:
+    br i1 undef, label %if.then, label %if.else
+
+  if.then:
+    br label %if.end
+
+  if.else:
+    br label %if.end
+
+  if.end:
+    br label %for.body
+  }
+...
+---
+name: can_pre
+registers:
+  - { id: 0, class: f8rc, preferred-register: '' }
+  - { id: 1, class: f8rc, preferred-register: '' }
+  - { id: 2, class: gprc, preferred-register: '' }
+  - { id: 3, class: gprc, preferred-register: '' }
+  - { id: 4, class: f8rc, preferred-register: '' }
+  - { id: 5, class: f8rc, preferred-register: '' }
+liveins:
+  - { reg: '$r1', virtual-reg: '%2' }
+  - { reg: '$r2', virtual-reg: '%3' }
+  - { reg: '$f1', virtual-reg: '%4' }
+  - { reg: '$f2', virtual-reg: '%5' }
+body:             |
+  ; CHECK-LABEL: name: can_pre
+  ; CHECK: bb.0.for.body:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT:   liveins: $r1, $r2, $f1, $f2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:f8rc = COPY $f2
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:f8rc = COPY $f1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gprc = COPY $r2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gprc = COPY $r1
+  ; CHECK-NEXT:   $cr0 = CMPLWI [[COPY3]], 0
+  ; CHECK-NEXT:   %6:f8rc = nofpexcept FDIV [[COPY1]], [[COPY]], implicit $rm
+  ; CHECK-NEXT:   BCC 44, $cr0, %bb.1
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.if.then:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   B %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.if.else:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.if.end:
+  ; CHECK-NEXT:   BLR implicit $lr, implicit $rm
+  bb.0.for.body:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $r1, $r2, $f1, $f2
+
+    %5:f8rc = COPY $f2
+    %4:f8rc = COPY $f1
+    %3:gprc = COPY $r2
+    %2:gprc = COPY $r1
+    $cr0 = CMPLWI %2, 0
+    BCC 44, $cr0, %bb.1
+    B %bb.2
+
+  bb.1.if.then:
+    successors: %bb.3(0x80000000)
+
+    %0:f8rc = nofpexcept FDIV %4, %5, implicit $rm
+    B %bb.3
+
+  bb.2.if.else:
+    successors: %bb.3(0x80000000)
+
+    %1:f8rc = nofpexcept FDIV %4, %5, implicit $rm
+
+  bb.3.if.end:
+    BLR implicit $lr, implicit $rm
+...
+---
+name: cannot_pre
+registers:
+  - { id: 0, class: f8rc, preferred-register: '' }
+  - { id: 1, class: f8rc, preferred-register: '' }
+  - { id: 2, class: gprc, preferred-register: '' }
+  - { id: 3, class: gprc, preferred-register: '' }
+  - { id: 4, class: f8rc, preferred-register: '' }
+  - { id: 5, class: f8rc, preferred-register: '' }
+  - { id: 6, class: f8rc, preferred-register: '' }
+liveins:
+  - { reg: '$r1', virtual-reg: '%2' }
+  - { reg: '$r2', virtual-reg: '%3' }
+  - { reg: '$f1', virtual-reg: '%4' }
+  - { reg: '$f2', virtual-reg: '%5' }
+body:             |
+  ; CHECK-LABEL: name: cannot_pre
+  ; CHECK: bb.0.for.body:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT:   liveins: $r1, $r2, $f1, $f2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:f8rc = COPY $f2
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:f8rc = COPY $f1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gprc = COPY $r2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gprc = COPY $r1
+  ; CHECK-NEXT:   $cr0 = CMPLWI [[COPY3]], 0
+  ; CHECK-NEXT:   BCC 44, $cr0, %bb.1
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.if.then:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[SETRND:%[0-9]+]]:f8rc = SETRND [[COPY2]], implicit-def $rm, implicit $rm
+  ; CHECK-NEXT:   %0:f8rc = nofpexcept FDIV [[COPY1]], [[COPY]], implicit $rm
+  ; CHECK-NEXT:   B %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.if.else:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   %1:f8rc = nofpexcept FDIV [[COPY1]], [[COPY]], implicit $rm
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.if.end:
+  ; CHECK-NEXT:   BLR implicit $lr, implicit $rm
+  bb.0.for.body:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $r1, $r2, $f1, $f2
+
+    %5:f8rc = COPY $f2
+    %4:f8rc = COPY $f1
+    %3:gprc = COPY $r2
+    %2:gprc = COPY $r1
+    $cr0 = CMPLWI %2, 0
+    BCC 44, $cr0, %bb.1
+    B %bb.2
+
+  bb.1.if.then:
+    successors: %bb.3(0x80000000)
+
+    %6:f8rc = SETRND %3, implicit-def $rm, implicit $rm
+    %0:f8rc = nofpexcept FDIV %4, %5, implicit $rm
+    B %bb.3
+
+  bb.2.if.else:
+    successors: %bb.3(0x80000000)
+
+    %1:f8rc = nofpexcept FDIV %4, %5, implicit $rm
+
+  bb.3.if.end:
+    BLR implicit $lr, implicit $rm
+...

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
index 040e026e6a80a..f0495def81858 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
@@ -277,23 +277,23 @@ for.body:                                         ; preds = %entry, %for.body
 define void @test_memset_preheader(i8* %x, i8* %y, i32 %n) {
 ; CHECK-LABEL: test_memset_preheader:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r7, lr}
-; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    cbz r2, .LBB6_5
 ; CHECK-NEXT:  @ %bb.1: @ %prehead
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    mov r12, r0
+; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    wlstp.8 lr, r2, .LBB6_3
 ; CHECK-NEXT:  .LBB6_2: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vstrb.8 q0, [r12], #16
+; CHECK-NEXT:    vstrb.8 q0, [r4], #16
 ; CHECK-NEXT:    letp lr, .LBB6_2
 ; CHECK-NEXT:  .LBB6_3: @ %prehead
 ; CHECK-NEXT:    dls lr, r2
-; CHECK-NEXT:    mov r12, r0
+; CHECK-NEXT:    mov r3, r0
 ; CHECK-NEXT:  .LBB6_4: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldrb r3, [r12], #1
-; CHECK-NEXT:    strb r3, [r1], #1
+; CHECK-NEXT:    ldrb r4, [r3], #1
+; CHECK-NEXT:    strb r4, [r1], #1
 ; CHECK-NEXT:    le lr, .LBB6_4
 ; CHECK-NEXT:  .LBB6_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
@@ -302,7 +302,7 @@ define void @test_memset_preheader(i8* %x, i8* %y, i32 %n) {
 ; CHECK-NEXT:    vstrb.8 q0, [r0], #16
 ; CHECK-NEXT:    letp lr, .LBB6_6
 ; CHECK-NEXT:  .LBB6_7: @ %for.cond.cleanup
-; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %cmp6 = icmp ne i32 %n, 0
   br i1 %cmp6, label %prehead, label %for.cond.cleanup


        


More information about the llvm-commits mailing list