[llvm] 10ed1ec - [MachineSink] Allow sinking of constant or ignorable physreg uses

Vang Thao via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 18 06:19:11 PST 2022


Author: Vang Thao
Date: 2022-01-18T14:17:40Z
New Revision: 10ed1eca241f893085b8db40138e588e72aaee3a

URL: https://github.com/llvm/llvm-project/commit/10ed1eca241f893085b8db40138e588e72aaee3a
DIFF: https://github.com/llvm/llvm-project/commit/10ed1eca241f893085b8db40138e588e72aaee3a.diff

LOG: [MachineSink] Allow sinking of constant or ignorable physreg uses

For AMDGPU, any use of the physical register EXEC prevents sinking even if it is not a real
physical register read. Add a check to see whether a physical register use can be ignored
for sinking.

Also perform the same constant and ignorable physical register checks when considering sinking in loops.

https://reviews.llvm.org/D116053
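
For context, TargetInstrInfo::isIgnorableUse (whose doc comment is extended below to cover
sinking) is the hook a target overrides to mark such reads. The following is a minimal
sketch of such an override, loosely modeled on AMDGPU's SIInstrInfo::isIgnorableUse; here
ExecReg and isVALU() are hypothetical stand-ins for the target's execution-mask register
and its vector-ALU predicate, not exact AMDGPU code:

    // Sketch of a target override of the isIgnorableUse() hook.
    // ExecReg and isVALU() are placeholders for target-specific details
    // (on AMDGPU, roughly AMDGPU::EXEC and SIInstrInfo::isVALU()).
    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/CodeGen/MachineOperand.h"
    #include "llvm/CodeGen/TargetInstrInfo.h"

    class MyTargetInstrInfo : public llvm::TargetInstrInfo {
      llvm::MCRegister ExecReg; // the lane/execution mask register

      // Placeholder: does this instruction execute per-lane under the
      // execution mask (a "vector ALU" op on this target)?
      bool isVALU(const llvm::MachineInstr &MI) const;

    public:
      bool isIgnorableUse(const llvm::MachineOperand &MO) const override {
        // An implicit exec-mask use on a VALU instruction is not a real
        // register read: the values produced in the active lanes do not
        // depend on where the mask was last defined, so such a use should
        // not block rematerialization or sinking.
        return MO.getReg() == ExecReg && MO.isImplicit() &&
               isVALU(*MO.getParent());
      }
    };

With an override like this in place, the MachineSink changes below treat such uses the same
way as constant physreg reads instead of refusing to sink the instruction.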

Added: 
    llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir

Modified: 
    llvm/include/llvm/CodeGen/TargetInstrInfo.h
    llvm/lib/CodeGen/MachineSink.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
    llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
    llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
    llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
    llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 58b8e59b68d7b..411811d08c183 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -130,7 +130,7 @@ class TargetInstrInfo : public MCInstrInfo {
   }
 
   /// Given \p MO is a PhysReg use return if it can be ignored for the purpose
-  /// of instruction rematerialization.
+  /// of instruction rematerialization or sinking.
   virtual bool isIgnorableUse(const MachineOperand &MO) const {
     return false;
   }

diff  --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 54c478645dcfa..0dbbc218e9464 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -796,9 +796,14 @@ bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI,
     if (Reg == 0)
       continue;
 
-    // Don't handle physical register.
-    if (Register::isPhysicalRegister(Reg))
+    if (Register::isPhysicalRegister(Reg)) {
+      if (MO.isUse() &&
+          (MRI->isConstantPhysReg(Reg) || TII->isIgnorableUse(MO)))
+        continue;
+
+      // Don't handle non-constant and non-ignorable physical register.
       return false;
+    }
 
     // Users for the defs are all dominated by SuccToSinkTo.
     if (MO.isDef()) {
@@ -898,7 +903,7 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
         // If the physreg has no defs anywhere, it's just an ambient register
         // and we can freely move its uses. Alternatively, if it's allocatable,
         // it could get allocated to something with a def during allocation.
-        if (!MRI->isConstantPhysReg(Reg))
+        if (!MRI->isConstantPhysReg(Reg) && !TII->isIgnorableUse(MO))
           return nullptr;
       } else if (!MO.isDead()) {
         // A def that isn't dead. We can't move it.

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 65c62c47a823d..73416dbb3096e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -2962,19 +2962,18 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-LABEL: v_sdiv_v2i64_pow2_shl_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b64 s[4:5], 0x1000
+; CGP-NEXT:    s_mov_b64 s[6:7], 0x1000
 ; CGP-NEXT:    v_mov_b32_e32 v5, v2
 ; CGP-NEXT:    v_mov_b32_e32 v7, v3
-; CGP-NEXT:    v_lshl_b64 v[2:3], s[4:5], v4
+; CGP-NEXT:    v_lshl_b64 v[2:3], s[6:7], v4
 ; CGP-NEXT:    v_mov_b32_e32 v9, v1
 ; CGP-NEXT:    v_mov_b32_e32 v8, v0
 ; CGP-NEXT:    v_or_b32_e32 v1, v9, v3
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
-; CGP-NEXT:    v_lshl_b64 v[10:11], s[4:5], v6
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
+; CGP-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_2
 ; CGP-NEXT:  ; %bb.1:
 ; CGP-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
@@ -2984,134 +2983,134 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_xor_b32_e32 v2, v2, v0
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v1
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
-; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v9
+; CGP-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
 ; CGP-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v6
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v9, v6, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v10
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v9, v10, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
 ; CGP-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v3
 ; CGP-NEXT:    v_trunc_f32_e32 v9, v9
 ; CGP-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v9
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v9, v9
-; CGP-NEXT:    v_sub_i32_e32 v12, vcc, 0, v1
-; CGP-NEXT:    v_subb_u32_e32 v13, vcc, 0, v2, vcc
-; CGP-NEXT:    v_mul_lo_u32 v14, v13, v3
-; CGP-NEXT:    v_mul_lo_u32 v15, v12, v9
-; CGP-NEXT:    v_mul_hi_u32 v17, v12, v3
-; CGP-NEXT:    v_mul_lo_u32 v16, v12, v3
-; CGP-NEXT:    v_xor_b32_e32 v4, v4, v6
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
+; CGP-NEXT:    v_sub_i32_e32 v11, vcc, 0, v1
+; CGP-NEXT:    v_subb_u32_e32 v12, vcc, 0, v2, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v12, v3
+; CGP-NEXT:    v_mul_lo_u32 v14, v11, v9
+; CGP-NEXT:    v_mul_hi_u32 v16, v11, v3
+; CGP-NEXT:    v_mul_lo_u32 v15, v11, v3
+; CGP-NEXT:    v_xor_b32_e32 v4, v4, v10
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT:    v_mul_lo_u32 v14, v9, v15
+; CGP-NEXT:    v_mul_lo_u32 v16, v3, v13
+; CGP-NEXT:    v_mul_hi_u32 v17, v3, v15
+; CGP-NEXT:    v_mul_hi_u32 v15, v9, v15
+; CGP-NEXT:    v_xor_b32_e32 v8, v8, v10
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT:    v_mul_lo_u32 v15, v9, v16
-; CGP-NEXT:    v_mul_lo_u32 v17, v3, v14
-; CGP-NEXT:    v_mul_hi_u32 v18, v3, v16
-; CGP-NEXT:    v_mul_hi_u32 v16, v9, v16
-; CGP-NEXT:    v_xor_b32_e32 v8, v8, v6
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v18, v9, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v17, v9, v13
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT:    v_mul_hi_u32 v16, v3, v13
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
-; CGP-NEXT:    v_mul_hi_u32 v17, v3, v14
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v18, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v17
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
-; CGP-NEXT:    v_mul_hi_u32 v14, v9, v14
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v15
-; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v14, vcc
-; CGP-NEXT:    v_mul_lo_u32 v13, v13, v3
-; CGP-NEXT:    v_mul_lo_u32 v14, v12, v9
-; CGP-NEXT:    v_mul_lo_u32 v15, v12, v3
-; CGP-NEXT:    v_mul_hi_u32 v12, v12, v3
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT:    v_mul_lo_u32 v13, v9, v15
-; CGP-NEXT:    v_mul_lo_u32 v14, v3, v12
-; CGP-NEXT:    v_mul_hi_u32 v16, v3, v15
-; CGP-NEXT:    v_mul_hi_u32 v15, v9, v15
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v16, v9, v12
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT:    v_mul_hi_u32 v14, v3, v12
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT:    v_mul_hi_u32 v13, v9, v13
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
-; CGP-NEXT:    v_mul_hi_u32 v12, v9, v12
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v13
-; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v12, vcc
-; CGP-NEXT:    v_mul_lo_u32 v12, v8, v3
-; CGP-NEXT:    v_mul_lo_u32 v13, v4, v9
-; CGP-NEXT:    v_mul_hi_u32 v14, v4, v3
-; CGP-NEXT:    v_mul_hi_u32 v3, v8, v3
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v14
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v13, vcc
+; CGP-NEXT:    v_mul_lo_u32 v12, v12, v3
+; CGP-NEXT:    v_mul_lo_u32 v13, v11, v9
+; CGP-NEXT:    v_mul_lo_u32 v14, v11, v3
+; CGP-NEXT:    v_mul_hi_u32 v11, v11, v3
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_lo_u32 v12, v9, v14
+; CGP-NEXT:    v_mul_lo_u32 v13, v3, v11
+; CGP-NEXT:    v_mul_hi_u32 v15, v3, v14
+; CGP-NEXT:    v_mul_hi_u32 v14, v9, v14
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v14, v8, v9
+; CGP-NEXT:    v_mul_lo_u32 v15, v9, v11
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT:    v_mul_hi_u32 v13, v4, v9
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v14, v3
+; CGP-NEXT:    v_mul_hi_u32 v13, v3, v11
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v13
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_mul_hi_u32 v11, v9, v11
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v12
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v11, vcc
+; CGP-NEXT:    v_mul_lo_u32 v11, v8, v3
+; CGP-NEXT:    v_mul_lo_u32 v12, v4, v9
+; CGP-NEXT:    v_mul_hi_u32 v13, v4, v3
+; CGP-NEXT:    v_mul_hi_u32 v3, v8, v3
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v12, v4, v9
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v13, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; CGP-NEXT:    v_mul_lo_u32 v12, v2, v3
-; CGP-NEXT:    v_mul_lo_u32 v13, v1, v9
-; CGP-NEXT:    v_mul_hi_u32 v15, v1, v3
-; CGP-NEXT:    v_mul_lo_u32 v14, v1, v3
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v4, v14
-; CGP-NEXT:    v_subb_u32_e64 v13, s[4:5], v8, v12, vcc
-; CGP-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v12
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v13, v2
+; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT:    v_mul_lo_u32 v11, v2, v3
+; CGP-NEXT:    v_mul_lo_u32 v12, v1, v9
+; CGP-NEXT:    v_mul_hi_u32 v14, v1, v3
+; CGP-NEXT:    v_mul_lo_u32 v13, v1, v3
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v4, v13
+; CGP-NEXT:    v_subb_u32_e64 v12, s[4:5], v8, v11, vcc
+; CGP-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v11
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v2
 ; CGP-NEXT:    v_subb_u32_e32 v8, vcc, v8, v2, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v1
 ; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v4, v1
 ; CGP-NEXT:    v_subbrev_u32_e32 v8, vcc, 0, v8, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v13, v2
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, 1, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s[4:5]
-; CGP-NEXT:    v_addc_u32_e32 v14, vcc, 0, v9, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v2
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, 1, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v11, v11, v13, s[4:5]
+; CGP-NEXT:    v_addc_u32_e32 v13, vcc, 0, v9, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v2
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, -1, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v2
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v15, v1, vcc
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, 1, v13
-; CGP-NEXT:    v_addc_u32_e32 v4, vcc, 0, v14, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v14, v1, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, 1, v12
+; CGP-NEXT:    v_addc_u32_e32 v4, vcc, 0, v13, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v13, v2, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v14, v4, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v12, v2, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v13, v4, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; CGP-NEXT:    v_xor_b32_e32 v3, v6, v0
+; CGP-NEXT:    v_xor_b32_e32 v3, v10, v0
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v0, v1, v3
 ; CGP-NEXT:    v_xor_b32_e32 v1, v2, v3
@@ -3120,8 +3119,9 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    ; implicit-def: $vgpr8
 ; CGP-NEXT:  .LBB8_2: ; %Flow2
-; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
-; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
+; CGP-NEXT:    s_or_saveexec_b64 s[8:9], s[8:9]
+; CGP-NEXT:    v_lshl_b64 v[9:10], s[6:7], v6
+; CGP-NEXT:    s_xor_b64 exec, exec, s[8:9]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_4
 ; CGP-NEXT:  ; %bb.3:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v2
@@ -3145,8 +3145,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v1, 0
 ; CGP-NEXT:  .LBB8_4:
-; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
-; CGP-NEXT:    v_or_b32_e32 v3, v7, v11
+; CGP-NEXT:    s_or_b64 exec, exec, s[8:9]
+; CGP-NEXT:    v_or_b32_e32 v3, v7, v10
 ; CGP-NEXT:    v_mov_b32_e32 v2, 0
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
@@ -3154,9 +3154,9 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_6
 ; CGP-NEXT:  ; %bb.5:
-; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v11
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v10, v2
-; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v11, v2, vcc
+; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v10
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v9, v2
+; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v10, v2, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v2
 ; CGP-NEXT:    v_xor_b32_e32 v4, v4, v2
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v3
@@ -3294,15 +3294,15 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_xor_b32_e32 v3, v4, v5
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
 ; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v5, vcc
-; CGP-NEXT:    ; implicit-def: $vgpr10_vgpr11
+; CGP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; CGP-NEXT:    ; implicit-def: $vgpr5
 ; CGP-NEXT:  .LBB8_6: ; %Flow
 ; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_8
 ; CGP-NEXT:  ; %bb.7:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v10
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v10
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v9
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v9
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
@@ -3310,15 +3310,15 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; CGP-NEXT:    v_mul_hi_u32 v2, v5, v2
-; CGP-NEXT:    v_mul_lo_u32 v3, v2, v10
+; CGP-NEXT:    v_mul_lo_u32 v3, v2, v9
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v5, v3
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v10
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v9
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v3, v10
+; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v3, v9
 ; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v10
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v9
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:  .LBB8_8:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index e75db654647b4..5e60c7ca2415a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -2918,19 +2918,18 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-LABEL: v_srem_v2i64_pow2_shl_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b64 s[4:5], 0x1000
+; CGP-NEXT:    s_mov_b64 s[6:7], 0x1000
 ; CGP-NEXT:    v_mov_b32_e32 v5, v2
 ; CGP-NEXT:    v_mov_b32_e32 v7, v3
-; CGP-NEXT:    v_lshl_b64 v[2:3], s[4:5], v4
+; CGP-NEXT:    v_lshl_b64 v[2:3], s[6:7], v4
 ; CGP-NEXT:    v_mov_b32_e32 v9, v1
 ; CGP-NEXT:    v_mov_b32_e32 v8, v0
 ; CGP-NEXT:    v_or_b32_e32 v1, v9, v3
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
-; CGP-NEXT:    v_lshl_b64 v[10:11], s[4:5], v6
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
+; CGP-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_2
 ; CGP-NEXT:  ; %bb.1:
 ; CGP-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
@@ -2944,129 +2943,129 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v8, v4
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v9, v4, vcc
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v9, v4, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v2
-; CGP-NEXT:    v_trunc_f32_e32 v8, v8
-; CGP-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v8
+; CGP-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v2
+; CGP-NEXT:    v_trunc_f32_e32 v9, v9
+; CGP-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v9
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v1
-; CGP-NEXT:    v_subb_u32_e32 v12, vcc, 0, v0, vcc
-; CGP-NEXT:    v_mul_lo_u32 v13, v12, v2
-; CGP-NEXT:    v_mul_lo_u32 v14, v9, v8
-; CGP-NEXT:    v_mul_hi_u32 v16, v9, v2
-; CGP-NEXT:    v_mul_lo_u32 v15, v9, v2
+; CGP-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; CGP-NEXT:    v_sub_i32_e32 v10, vcc, 0, v1
+; CGP-NEXT:    v_subb_u32_e32 v11, vcc, 0, v0, vcc
+; CGP-NEXT:    v_mul_lo_u32 v12, v11, v2
+; CGP-NEXT:    v_mul_lo_u32 v13, v10, v9
+; CGP-NEXT:    v_mul_hi_u32 v15, v10, v2
+; CGP-NEXT:    v_mul_lo_u32 v14, v10, v2
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; CGP-NEXT:    v_mul_lo_u32 v13, v9, v14
+; CGP-NEXT:    v_mul_lo_u32 v15, v2, v12
+; CGP-NEXT:    v_mul_hi_u32 v16, v2, v14
+; CGP-NEXT:    v_mul_hi_u32 v14, v9, v14
+; CGP-NEXT:    v_xor_b32_e32 v8, v8, v4
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
-; CGP-NEXT:    v_mul_lo_u32 v14, v8, v15
-; CGP-NEXT:    v_mul_lo_u32 v16, v2, v13
-; CGP-NEXT:    v_mul_hi_u32 v17, v2, v15
-; CGP-NEXT:    v_mul_hi_u32 v15, v8, v15
-; CGP-NEXT:    v_xor_b32_e32 v6, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v17, v8, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v16, v9, v12
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
+; CGP-NEXT:    v_mul_hi_u32 v15, v2, v12
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
-; CGP-NEXT:    v_mul_hi_u32 v16, v2, v13
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
-; CGP-NEXT:    v_mul_hi_u32 v13, v8, v13
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v14
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v13, vcc
-; CGP-NEXT:    v_mul_lo_u32 v12, v12, v2
-; CGP-NEXT:    v_mul_lo_u32 v13, v9, v8
-; CGP-NEXT:    v_mul_lo_u32 v14, v9, v2
-; CGP-NEXT:    v_mul_hi_u32 v9, v9, v2
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
-; CGP-NEXT:    v_mul_lo_u32 v12, v8, v14
-; CGP-NEXT:    v_mul_lo_u32 v13, v2, v9
-; CGP-NEXT:    v_mul_hi_u32 v15, v2, v14
-; CGP-NEXT:    v_mul_hi_u32 v14, v8, v14
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v15, v8, v9
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT:    v_mul_hi_u32 v13, v2, v9
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_mul_hi_u32 v12, v9, v12
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v12, vcc
+; CGP-NEXT:    v_mul_lo_u32 v11, v11, v2
+; CGP-NEXT:    v_mul_lo_u32 v12, v10, v9
+; CGP-NEXT:    v_mul_lo_u32 v13, v10, v2
+; CGP-NEXT:    v_mul_hi_u32 v10, v10, v2
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_mul_lo_u32 v11, v9, v13
+; CGP-NEXT:    v_mul_lo_u32 v12, v2, v10
+; CGP-NEXT:    v_mul_hi_u32 v14, v2, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v9, v13
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v14, v9, v10
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v12, v2, v10
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
-; CGP-NEXT:    v_mul_lo_u32 v9, v6, v2
-; CGP-NEXT:    v_mul_lo_u32 v12, v3, v8
-; CGP-NEXT:    v_mul_hi_u32 v13, v3, v2
-; CGP-NEXT:    v_mul_hi_u32 v2, v6, v2
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v13, v6, v8
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
-; CGP-NEXT:    v_mul_hi_u32 v12, v3, v8
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v13, v2
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT:    v_mul_hi_u32 v10, v9, v10
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT:    v_mul_hi_u32 v8, v6, v8
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; CGP-NEXT:    v_mul_lo_u32 v9, v0, v2
-; CGP-NEXT:    v_mul_lo_u32 v8, v1, v8
-; CGP-NEXT:    v_mul_lo_u32 v12, v1, v2
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v10, vcc
+; CGP-NEXT:    v_mul_lo_u32 v10, v8, v2
+; CGP-NEXT:    v_mul_lo_u32 v11, v3, v9
+; CGP-NEXT:    v_mul_hi_u32 v12, v3, v2
+; CGP-NEXT:    v_mul_hi_u32 v2, v8, v2
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v12, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_mul_hi_u32 v11, v3, v9
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v12, v2
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; CGP-NEXT:    v_mul_lo_u32 v10, v0, v2
+; CGP-NEXT:    v_mul_lo_u32 v9, v1, v9
+; CGP-NEXT:    v_mul_lo_u32 v11, v1, v2
 ; CGP-NEXT:    v_mul_hi_u32 v2, v1, v2
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v8, v2
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v3, v12
-; CGP-NEXT:    v_subb_u32_e64 v8, s[4:5], v6, v2, vcc
-; CGP-NEXT:    v_sub_i32_e64 v2, s[4:5], v6, v2
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v0
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v3, v11
+; CGP-NEXT:    v_subb_u32_e64 v9, s[4:5], v8, v2, vcc
+; CGP-NEXT:    v_sub_i32_e64 v2, s[4:5], v8, v2
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v0
+; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v0
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v0
 ; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v2, v0, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v3, v1
-; CGP-NEXT:    v_subbrev_u32_e64 v12, s[4:5], 0, v2, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v0
+; CGP-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; CGP-NEXT:    v_sub_i32_e32 v10, vcc, v3, v1
+; CGP-NEXT:    v_subbrev_u32_e64 v11, s[4:5], 0, v2, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v0
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v0
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v11, v0
 ; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v2, v0, vcc
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v9, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[4:5]
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v10, v1
+; CGP-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[4:5]
 ; CGP-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; CGP-NEXT:    v_xor_b32_e32 v2, v0, v4
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v1, v4
@@ -3074,7 +3073,8 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    ; implicit-def: $vgpr8
 ; CGP-NEXT:  .LBB8_2: ; %Flow2
-; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
+; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[8:9]
+; CGP-NEXT:    v_lshl_b64 v[9:10], s[6:7], v6
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_4
 ; CGP-NEXT:  ; %bb.3:
@@ -3098,7 +3098,7 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mov_b32_e32 v1, 0
 ; CGP-NEXT:  .LBB8_4:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CGP-NEXT:    v_or_b32_e32 v3, v7, v11
+; CGP-NEXT:    v_or_b32_e32 v3, v7, v10
 ; CGP-NEXT:    v_mov_b32_e32 v2, 0
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
@@ -3106,9 +3106,9 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_6
 ; CGP-NEXT:  ; %bb.5:
-; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v11
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v10, v2
-; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v11, v2, vcc
+; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v10
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v9, v2
+; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v10, v2, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v2
 ; CGP-NEXT:    v_xor_b32_e32 v2, v4, v2
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v3
@@ -3244,15 +3244,15 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_xor_b32_e32 v4, v2, v8
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v3, v8
 ; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v4, v8, vcc
-; CGP-NEXT:    ; implicit-def: $vgpr10_vgpr11
+; CGP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; CGP-NEXT:    ; implicit-def: $vgpr5
 ; CGP-NEXT:  .LBB8_6: ; %Flow
 ; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_8
 ; CGP-NEXT:  ; %bb.7:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v10
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v10
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v9
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v9
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
@@ -3260,13 +3260,13 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; CGP-NEXT:    v_mul_hi_u32 v2, v5, v2
-; CGP-NEXT:    v_mul_lo_u32 v2, v2, v10
+; CGP-NEXT:    v_mul_lo_u32 v2, v2, v9
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v2, v10
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v10
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v2, v9
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v9
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v2, v10
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v10
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v2, v9
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v9
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:  .LBB8_8:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 5c5dd2ca5805d..bf3c080cafaea 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -1504,21 +1504,20 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mov_b32_e32 v9, v1
 ; CGP-NEXT:    v_mov_b32_e32 v5, v2
 ; CGP-NEXT:    v_mov_b32_e32 v7, v3
-; CGP-NEXT:    s_mov_b64 s[4:5], 0x1000
-; CGP-NEXT:    v_lshl_b64 v[2:3], s[4:5], v4
-; CGP-NEXT:    v_lshl_b64 v[10:11], s[4:5], v6
+; CGP-NEXT:    s_mov_b64 s[6:7], 0x1000
+; CGP-NEXT:    v_lshl_b64 v[2:3], s[6:7], v4
 ; CGP-NEXT:    v_or_b32_e32 v1, v9, v3
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
+; CGP-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_2
 ; CGP-NEXT:  ; %bb.1:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v2
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v1, v3
 ; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v2
-; CGP-NEXT:    v_subb_u32_e32 v6, vcc, 0, v3, vcc
+; CGP-NEXT:    v_subb_u32_e32 v10, vcc, 0, v3, vcc
 ; CGP-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; CGP-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1527,124 +1526,125 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; CGP-NEXT:    v_mul_lo_u32 v12, v4, v1
-; CGP-NEXT:    v_mul_lo_u32 v13, v4, v0
-; CGP-NEXT:    v_mul_lo_u32 v14, v6, v0
-; CGP-NEXT:    v_mul_hi_u32 v15, v4, v0
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
-; CGP-NEXT:    v_mul_lo_u32 v14, v1, v13
-; CGP-NEXT:    v_mul_hi_u32 v16, v0, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v1, v13
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; CGP-NEXT:    v_mul_lo_u32 v15, v0, v12
-; CGP-NEXT:    v_mul_lo_u32 v17, v1, v12
-; CGP-NEXT:    v_mul_hi_u32 v18, v0, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v1, v12
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v17, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v16
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v12, vcc
+; CGP-NEXT:    v_mul_lo_u32 v11, v4, v1
 ; CGP-NEXT:    v_mul_lo_u32 v12, v4, v0
-; CGP-NEXT:    v_mul_lo_u32 v6, v6, v0
-; CGP-NEXT:    v_mul_hi_u32 v13, v4, v0
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, v1
-; CGP-NEXT:    v_mul_lo_u32 v14, v1, v12
+; CGP-NEXT:    v_mul_lo_u32 v13, v10, v0
+; CGP-NEXT:    v_mul_hi_u32 v14, v4, v0
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT:    v_mul_lo_u32 v13, v1, v12
 ; CGP-NEXT:    v_mul_hi_u32 v15, v0, v12
 ; CGP-NEXT:    v_mul_hi_u32 v12, v1, v12
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v13
-; CGP-NEXT:    v_mul_lo_u32 v6, v0, v4
-; CGP-NEXT:    v_mul_lo_u32 v13, v1, v4
-; CGP-NEXT:    v_mul_hi_u32 v16, v0, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v14, v6
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; CGP-NEXT:    v_mul_lo_u32 v14, v0, v11
+; CGP-NEXT:    v_mul_lo_u32 v16, v1, v11
+; CGP-NEXT:    v_mul_hi_u32 v17, v0, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v1, v11
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v16, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v14, v6
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v15
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v12
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v11, vcc
+; CGP-NEXT:    v_mul_lo_u32 v11, v4, v0
+; CGP-NEXT:    v_mul_lo_u32 v10, v10, v0
+; CGP-NEXT:    v_mul_hi_u32 v12, v4, v0
+; CGP-NEXT:    v_mul_lo_u32 v4, v4, v1
+; CGP-NEXT:    v_mul_lo_u32 v13, v1, v11
+; CGP-NEXT:    v_mul_hi_u32 v14, v0, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v1, v11
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; CGP-NEXT:    v_mul_lo_u32 v10, v0, v4
+; CGP-NEXT:    v_mul_lo_u32 v12, v1, v4
+; CGP-NEXT:    v_mul_hi_u32 v15, v0, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v1, v4
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
 ; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v4, v9, v0
-; CGP-NEXT:    v_mul_hi_u32 v6, v8, v0
+; CGP-NEXT:    v_mul_hi_u32 v10, v8, v0
 ; CGP-NEXT:    v_mul_hi_u32 v0, v9, v0
-; CGP-NEXT:    v_mul_lo_u32 v12, v8, v1
-; CGP-NEXT:    v_mul_lo_u32 v13, v9, v1
-; CGP-NEXT:    v_mul_hi_u32 v14, v8, v1
+; CGP-NEXT:    v_mul_lo_u32 v11, v8, v1
+; CGP-NEXT:    v_mul_lo_u32 v12, v9, v1
+; CGP-NEXT:    v_mul_hi_u32 v13, v8, v1
 ; CGP-NEXT:    v_mul_hi_u32 v1, v9, v1
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v13, v0
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v12, v4
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_mul_lo_u32 v6, v2, v0
-; CGP-NEXT:    v_mul_lo_u32 v12, v3, v0
-; CGP-NEXT:    v_mul_hi_u32 v13, v2, v0
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT:    v_mul_lo_u32 v10, v2, v0
+; CGP-NEXT:    v_mul_lo_u32 v11, v3, v0
+; CGP-NEXT:    v_mul_hi_u32 v12, v2, v0
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; CGP-NEXT:    v_mul_lo_u32 v4, v2, v1
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, 1, v0
-; CGP-NEXT:    v_addc_u32_e32 v15, vcc, 0, v1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v12, v4
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, 1, v14
-; CGP-NEXT:    v_addc_u32_e32 v16, vcc, 0, v15, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v13
-; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v8, v6
-; CGP-NEXT:    v_subb_u32_e64 v8, s[4:5], v9, v4, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, 1, v0
+; CGP-NEXT:    v_addc_u32_e32 v14, vcc, 0, v1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, 1, v13
+; CGP-NEXT:    v_addc_u32_e32 v15, vcc, 0, v14, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v8, v10
+; CGP-NEXT:    v_subb_u32_e64 v10, s[4:5], v9, v4, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v9, v4
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
 ; CGP-NEXT:    v_subb_u32_e32 v4, vcc, v4, v3, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v3
-; CGP-NEXT:    v_cndmask_b32_e32 v8, v13, v9, vcc
-; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v6, v2
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v10, v3
+; CGP-NEXT:    v_cndmask_b32_e32 v9, v12, v9, vcc
+; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v8, v2
 ; CGP-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v14, v12, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v15, v16, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v13, v11, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v14, v15, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    ; implicit-def: $vgpr8
 ; CGP-NEXT:  .LBB8_2: ; %Flow2
-; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
-; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
+; CGP-NEXT:    s_or_saveexec_b64 s[8:9], s[8:9]
+; CGP-NEXT:    v_lshl_b64 v[9:10], s[6:7], v6
+; CGP-NEXT:    s_xor_b64 exec, exec, s[8:9]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_4
 ; CGP-NEXT:  ; %bb.3:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v2
@@ -1668,8 +1668,8 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v1, 0
 ; CGP-NEXT:  .LBB8_4:
-; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
-; CGP-NEXT:    v_or_b32_e32 v3, v7, v11
+; CGP-NEXT:    s_or_b64 exec, exec, s[8:9]
+; CGP-NEXT:    v_or_b32_e32 v3, v7, v10
 ; CGP-NEXT:    v_mov_b32_e32 v2, 0
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
@@ -1677,10 +1677,10 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_6
 ; CGP-NEXT:  ; %bb.5:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v10
-; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v11
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v10
-; CGP-NEXT:    v_subb_u32_e32 v6, vcc, 0, v11, vcc
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v9
+; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v10
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v9
+; CGP-NEXT:    v_subb_u32_e32 v6, vcc, 0, v10, vcc
 ; CGP-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
@@ -1690,13 +1690,13 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CGP-NEXT:    v_mul_lo_u32 v8, v4, v3
-; CGP-NEXT:    v_mul_lo_u32 v9, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v11, v4, v2
 ; CGP-NEXT:    v_mul_lo_u32 v12, v6, v2
 ; CGP-NEXT:    v_mul_hi_u32 v13, v4, v2
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
-; CGP-NEXT:    v_mul_lo_u32 v12, v3, v9
-; CGP-NEXT:    v_mul_hi_u32 v14, v2, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v3, v9
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v11
+; CGP-NEXT:    v_mul_hi_u32 v14, v2, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v3, v11
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
 ; CGP-NEXT:    v_mul_lo_u32 v13, v2, v8
 ; CGP-NEXT:    v_mul_lo_u32 v15, v3, v8
@@ -1704,46 +1704,46 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_hi_u32 v8, v3, v8
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v15, v9
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v15, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v16
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v14
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
 ; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v8, v4, v2
 ; CGP-NEXT:    v_mul_lo_u32 v6, v6, v2
-; CGP-NEXT:    v_mul_hi_u32 v9, v4, v2
+; CGP-NEXT:    v_mul_hi_u32 v11, v4, v2
 ; CGP-NEXT:    v_mul_lo_u32 v4, v4, v3
 ; CGP-NEXT:    v_mul_lo_u32 v12, v3, v8
 ; CGP-NEXT:    v_mul_hi_u32 v13, v2, v8
 ; CGP-NEXT:    v_mul_hi_u32 v8, v3, v8
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
 ; CGP-NEXT:    v_mul_lo_u32 v6, v2, v4
-; CGP-NEXT:    v_mul_lo_u32 v9, v3, v4
+; CGP-NEXT:    v_mul_lo_u32 v11, v3, v4
 ; CGP-NEXT:    v_mul_hi_u32 v14, v2, v4
 ; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
@@ -1751,50 +1751,50 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_hi_u32 v6, v5, v2
 ; CGP-NEXT:    v_mul_hi_u32 v2, v7, v2
 ; CGP-NEXT:    v_mul_lo_u32 v8, v5, v3
-; CGP-NEXT:    v_mul_lo_u32 v9, v7, v3
+; CGP-NEXT:    v_mul_lo_u32 v11, v7, v3
 ; CGP-NEXT:    v_mul_hi_u32 v12, v5, v3
 ; CGP-NEXT:    v_mul_hi_u32 v3, v7, v3
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v11, v2
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v11, v6
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_mul_lo_u32 v6, v10, v2
-; CGP-NEXT:    v_mul_lo_u32 v8, v11, v2
-; CGP-NEXT:    v_mul_hi_u32 v9, v10, v2
+; CGP-NEXT:    v_mul_lo_u32 v6, v9, v2
+; CGP-NEXT:    v_mul_lo_u32 v8, v10, v2
+; CGP-NEXT:    v_mul_hi_u32 v11, v9, v2
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT:    v_mul_lo_u32 v4, v10, v3
+; CGP-NEXT:    v_mul_lo_u32 v4, v9, v3
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, 1, v2
 ; CGP-NEXT:    v_addc_u32_e32 v13, vcc, 0, v3, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, 1, v12
 ; CGP-NEXT:    v_addc_u32_e32 v14, vcc, 0, v13, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
 ; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v6
 ; CGP-NEXT:    v_subb_u32_e64 v6, s[4:5], v7, v4, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v7, v4
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v10
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT:    v_subb_u32_e32 v4, vcc, v4, v11, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v11
-; CGP-NEXT:    v_cndmask_b32_e32 v6, v9, v7, vcc
-; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v10
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT:    v_subb_u32_e32 v4, vcc, v4, v10, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v10
+; CGP-NEXT:    v_cndmask_b32_e32 v6, v11, v7, vcc
+; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
 ; CGP-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v10
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v11
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v11
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v10
 ; CGP-NEXT:    v_cndmask_b32_e32 v4, v7, v5, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; CGP-NEXT:    v_cndmask_b32_e32 v4, v12, v8, vcc
@@ -1802,15 +1802,15 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; CGP-NEXT:    ; implicit-def: $vgpr10_vgpr11
+; CGP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; CGP-NEXT:    ; implicit-def: $vgpr5
 ; CGP-NEXT:  .LBB8_6: ; %Flow
 ; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_8
 ; CGP-NEXT:  ; %bb.7:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v10
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v10
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v9
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v9
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
@@ -1818,15 +1818,15 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; CGP-NEXT:    v_mul_hi_u32 v2, v5, v2
-; CGP-NEXT:    v_mul_lo_u32 v3, v2, v10
+; CGP-NEXT:    v_mul_lo_u32 v3, v2, v9
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v5, v3
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v10
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v9
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v3, v10
+; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v3, v9
 ; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v10
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v9
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:  .LBB8_8:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index fdf5ba53330c1..97806c56204de 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -2045,21 +2045,20 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mov_b32_e32 v9, v1
 ; CGP-NEXT:    v_mov_b32_e32 v5, v2
 ; CGP-NEXT:    v_mov_b32_e32 v7, v3
-; CGP-NEXT:    s_mov_b64 s[4:5], 0x1000
-; CGP-NEXT:    v_lshl_b64 v[2:3], s[4:5], v4
-; CGP-NEXT:    v_lshl_b64 v[10:11], s[4:5], v6
+; CGP-NEXT:    s_mov_b64 s[6:7], 0x1000
+; CGP-NEXT:    v_lshl_b64 v[2:3], s[6:7], v4
 ; CGP-NEXT:    v_or_b32_e32 v1, v9, v3
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
+; CGP-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_2
 ; CGP-NEXT:  ; %bb.1:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v2
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v1, v3
 ; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v2
-; CGP-NEXT:    v_subb_u32_e32 v6, vcc, 0, v3, vcc
+; CGP-NEXT:    v_subb_u32_e32 v10, vcc, 0, v3, vcc
 ; CGP-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; CGP-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2068,122 +2067,123 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; CGP-NEXT:    v_mul_lo_u32 v12, v4, v1
-; CGP-NEXT:    v_mul_lo_u32 v13, v4, v0
-; CGP-NEXT:    v_mul_lo_u32 v14, v6, v0
-; CGP-NEXT:    v_mul_hi_u32 v15, v4, v0
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
-; CGP-NEXT:    v_mul_lo_u32 v14, v1, v13
-; CGP-NEXT:    v_mul_hi_u32 v16, v0, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v1, v13
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; CGP-NEXT:    v_mul_lo_u32 v15, v0, v12
-; CGP-NEXT:    v_mul_lo_u32 v17, v1, v12
-; CGP-NEXT:    v_mul_hi_u32 v18, v0, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v1, v12
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v17, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v16
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v12, vcc
+; CGP-NEXT:    v_mul_lo_u32 v11, v4, v1
 ; CGP-NEXT:    v_mul_lo_u32 v12, v4, v0
-; CGP-NEXT:    v_mul_lo_u32 v6, v6, v0
-; CGP-NEXT:    v_mul_hi_u32 v13, v4, v0
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, v1
-; CGP-NEXT:    v_mul_lo_u32 v14, v1, v12
+; CGP-NEXT:    v_mul_lo_u32 v13, v10, v0
+; CGP-NEXT:    v_mul_hi_u32 v14, v4, v0
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT:    v_mul_lo_u32 v13, v1, v12
 ; CGP-NEXT:    v_mul_hi_u32 v15, v0, v12
 ; CGP-NEXT:    v_mul_hi_u32 v12, v1, v12
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v13
-; CGP-NEXT:    v_mul_lo_u32 v6, v0, v4
-; CGP-NEXT:    v_mul_lo_u32 v13, v1, v4
-; CGP-NEXT:    v_mul_hi_u32 v16, v0, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v14, v6
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; CGP-NEXT:    v_mul_lo_u32 v14, v0, v11
+; CGP-NEXT:    v_mul_lo_u32 v16, v1, v11
+; CGP-NEXT:    v_mul_hi_u32 v17, v0, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v1, v11
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v16, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v14, v6
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v15
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v12
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v11, vcc
+; CGP-NEXT:    v_mul_lo_u32 v11, v4, v0
+; CGP-NEXT:    v_mul_lo_u32 v10, v10, v0
+; CGP-NEXT:    v_mul_hi_u32 v12, v4, v0
+; CGP-NEXT:    v_mul_lo_u32 v4, v4, v1
+; CGP-NEXT:    v_mul_lo_u32 v13, v1, v11
+; CGP-NEXT:    v_mul_hi_u32 v14, v0, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v1, v11
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; CGP-NEXT:    v_mul_lo_u32 v10, v0, v4
+; CGP-NEXT:    v_mul_lo_u32 v12, v1, v4
+; CGP-NEXT:    v_mul_hi_u32 v15, v0, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v1, v4
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
 ; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v4, v9, v0
-; CGP-NEXT:    v_mul_hi_u32 v6, v8, v0
+; CGP-NEXT:    v_mul_hi_u32 v10, v8, v0
 ; CGP-NEXT:    v_mul_hi_u32 v0, v9, v0
-; CGP-NEXT:    v_mul_lo_u32 v12, v8, v1
-; CGP-NEXT:    v_mul_lo_u32 v13, v9, v1
-; CGP-NEXT:    v_mul_hi_u32 v14, v8, v1
+; CGP-NEXT:    v_mul_lo_u32 v11, v8, v1
+; CGP-NEXT:    v_mul_lo_u32 v12, v9, v1
+; CGP-NEXT:    v_mul_hi_u32 v13, v8, v1
 ; CGP-NEXT:    v_mul_hi_u32 v1, v9, v1
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v13, v0
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v12, v4
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_mul_lo_u32 v6, v2, v0
-; CGP-NEXT:    v_mul_lo_u32 v12, v3, v0
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT:    v_mul_lo_u32 v10, v2, v0
+; CGP-NEXT:    v_mul_lo_u32 v11, v3, v0
 ; CGP-NEXT:    v_mul_hi_u32 v0, v2, v0
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; CGP-NEXT:    v_mul_lo_u32 v1, v2, v1
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v12, v1
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v8, v6
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v8, v10
 ; CGP-NEXT:    v_subb_u32_e64 v4, s[4:5], v9, v0, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v9, v0
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v2
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
 ; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v0, v3, vcc
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; CGP-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
-; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v1, v2
-; CGP-NEXT:    v_subbrev_u32_e64 v9, s[4:5], 0, v0, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v1, v2
+; CGP-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v0, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v2
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
 ; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v0, v3, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v9, v2
 ; CGP-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v3
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v13, v12, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v10, v3
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v12, v11, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v9, v0, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v10, v0, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    ; implicit-def: $vgpr8
 ; CGP-NEXT:  .LBB8_2: ; %Flow2
-; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
+; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[8:9]
+; CGP-NEXT:    v_lshl_b64 v[9:10], s[6:7], v6
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_4
 ; CGP-NEXT:  ; %bb.3:
@@ -2207,7 +2207,7 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mov_b32_e32 v1, 0
 ; CGP-NEXT:  .LBB8_4:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CGP-NEXT:    v_or_b32_e32 v3, v7, v11
+; CGP-NEXT:    v_or_b32_e32 v3, v7, v10
 ; CGP-NEXT:    v_mov_b32_e32 v2, 0
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
@@ -2215,10 +2215,10 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_6
 ; CGP-NEXT:  ; %bb.5:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v10
-; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v11
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v10
-; CGP-NEXT:    v_subb_u32_e32 v6, vcc, 0, v11, vcc
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v9
+; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v10
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v9
+; CGP-NEXT:    v_subb_u32_e32 v6, vcc, 0, v10, vcc
 ; CGP-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
@@ -2228,13 +2228,13 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CGP-NEXT:    v_mul_lo_u32 v8, v4, v3
-; CGP-NEXT:    v_mul_lo_u32 v9, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v11, v4, v2
 ; CGP-NEXT:    v_mul_lo_u32 v12, v6, v2
 ; CGP-NEXT:    v_mul_hi_u32 v13, v4, v2
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
-; CGP-NEXT:    v_mul_lo_u32 v12, v3, v9
-; CGP-NEXT:    v_mul_hi_u32 v14, v2, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v3, v9
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v11
+; CGP-NEXT:    v_mul_hi_u32 v14, v2, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v3, v11
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
 ; CGP-NEXT:    v_mul_lo_u32 v13, v2, v8
 ; CGP-NEXT:    v_mul_lo_u32 v15, v3, v8
@@ -2242,46 +2242,46 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_hi_u32 v8, v3, v8
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v15, v9
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v15, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v16
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v14
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
 ; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v8, v4, v2
 ; CGP-NEXT:    v_mul_lo_u32 v6, v6, v2
-; CGP-NEXT:    v_mul_hi_u32 v9, v4, v2
+; CGP-NEXT:    v_mul_hi_u32 v11, v4, v2
 ; CGP-NEXT:    v_mul_lo_u32 v4, v4, v3
 ; CGP-NEXT:    v_mul_lo_u32 v12, v3, v8
 ; CGP-NEXT:    v_mul_hi_u32 v13, v2, v8
 ; CGP-NEXT:    v_mul_hi_u32 v8, v3, v8
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
 ; CGP-NEXT:    v_mul_lo_u32 v6, v2, v4
-; CGP-NEXT:    v_mul_lo_u32 v9, v3, v4
+; CGP-NEXT:    v_mul_lo_u32 v11, v3, v4
 ; CGP-NEXT:    v_mul_hi_u32 v14, v2, v4
 ; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
@@ -2289,65 +2289,65 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_hi_u32 v6, v5, v2
 ; CGP-NEXT:    v_mul_hi_u32 v2, v7, v2
 ; CGP-NEXT:    v_mul_lo_u32 v8, v5, v3
-; CGP-NEXT:    v_mul_lo_u32 v9, v7, v3
+; CGP-NEXT:    v_mul_lo_u32 v11, v7, v3
 ; CGP-NEXT:    v_mul_hi_u32 v12, v5, v3
 ; CGP-NEXT:    v_mul_hi_u32 v3, v7, v3
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v11, v2
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v11, v6
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_mul_lo_u32 v6, v10, v2
-; CGP-NEXT:    v_mul_lo_u32 v8, v11, v2
-; CGP-NEXT:    v_mul_hi_u32 v2, v10, v2
+; CGP-NEXT:    v_mul_lo_u32 v6, v9, v2
+; CGP-NEXT:    v_mul_lo_u32 v8, v10, v2
+; CGP-NEXT:    v_mul_hi_u32 v2, v9, v2
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT:    v_mul_lo_u32 v3, v10, v3
+; CGP-NEXT:    v_mul_lo_u32 v3, v9, v3
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v8, v3
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v5, v6
 ; CGP-NEXT:    v_subb_u32_e64 v4, s[4:5], v7, v2, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v2, s[4:5], v7, v2
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v10
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v11
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v2, v11, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v11
+; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v2, v10, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v10
 ; CGP-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v3, v10
+; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v3, v9
 ; CGP-NEXT:    v_subbrev_u32_e64 v7, s[4:5], 0, v2, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v10
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v2, v11, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; CGP-NEXT:    v_sub_i32_e32 v10, vcc, v6, v10
+; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v2, v10, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v6, v9
 ; CGP-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v11
-; CGP-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v10
+; CGP-NEXT:    v_cndmask_b32_e32 v8, v11, v8, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; CGP-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v7, v7, v2, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v3, v6, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v3, v4, v7, vcc
-; CGP-NEXT:    ; implicit-def: $vgpr10_vgpr11
+; CGP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; CGP-NEXT:    ; implicit-def: $vgpr5
 ; CGP-NEXT:  .LBB8_6: ; %Flow
 ; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_8
 ; CGP-NEXT:  ; %bb.7:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v10
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v10
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v9
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v9
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
@@ -2355,13 +2355,13 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; CGP-NEXT:    v_mul_hi_u32 v2, v5, v2
-; CGP-NEXT:    v_mul_lo_u32 v2, v2, v10
+; CGP-NEXT:    v_mul_lo_u32 v2, v2, v9
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v2, v10
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v10
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v2, v9
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v9
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v2, v10
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v10
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v2, v9
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v9
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:  .LBB8_8:

diff  --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 8f30a866bc8f1..5ee80c8a238cf 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -156,18 +156,18 @@ define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 } addrspace(5)* b
 
 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block:
 
-; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6
-
-; GFX9-MUBUF:   v_lshrrev_b32_e64 [[SP:v[0-9]+]], 6, s32
-; GFX9-FLATSCR: v_mov_b32_e32 [[SP:v[0-9]+]], s32
-
 ; GCN: s_and_saveexec_b64
 
 ; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 glc{{$}}
-; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]]
-
 ; GFX9-MUBUF:   buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 glc{{$}}
 ; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4 glc{{$}}
+
+; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6
+; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]]
+
+; GFX9-MUBUF:   v_lshrrev_b32_e64 [[SP:v[0-9]+]], 6, s32
+; GFX9-FLATSCR: v_mov_b32_e32 [[SP:v[0-9]+]], s32
+
 ; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]]
 
 ; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]]

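For context on the new test below: it exercises the target side of the
TargetInstrInfo::isIgnorableUse() hook. The AMDGPU override lives in
SIInstrInfo.cpp and is not part of this diff; a minimal sketch of it (the
exact in-tree body may differ) looks like:

  bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
    // An implicit use of EXEC by a VALU instruction is not a real register
    // read for this purpose, so by itself it need not block sinking.
    return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
           isVALU(*MO.getParent());
  }

With this hook, the V_FMAC_F32_e64 instructions below (which carry
"implicit $exec") become candidates for sinking, while an implicit use of
$mode that is not provably constant (see no_sink_fmac_not_constant_mode,
where $mode is clobbered by IMPLICIT_DEF) still blocks it.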
diff  --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
new file mode 100644
index 0000000000000..5adb6d42cdd63
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
@@ -0,0 +1,734 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink -o - %s | FileCheck -check-prefixes=GFX9 %s
+
+---
+name:            test_sink_fmac_to_only_use
+alignment:       1
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX9-LABEL: name: test_sink_fmac_to_only_use
+  ; GFX9: bb.0:
+  ; GFX9-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+  ; GFX9-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]]
+  ; GFX9-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-NEXT:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; GFX9-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT:   S_BRANCH %bb.1
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1:
+  ; GFX9-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   %9:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   %10:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %9, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   %12:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   %13:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %12, 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %9, %10, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %12, %13, implicit $mode, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.2:
+  ; GFX9-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1
+  ; GFX9-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1
+  ; GFX9-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.3:
+  ; GFX9-NEXT:   S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    %1:vgpr_32 = COPY $vgpr0
+    %2:vgpr_32 = COPY $vgpr1
+    %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %5:sreg_64 = S_MOV_B64 0
+    %6:sreg_64 = S_MOV_B64 0
+    %7:vreg_64 = COPY %5
+    %8:vreg_64 = COPY %6
+    %9:vgpr_32 = GLOBAL_LOAD_DWORD killed %7, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    %10:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %9, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec
+    %11:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %10, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec
+    %12:vgpr_32 = GLOBAL_LOAD_DWORD killed %8, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    %13:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %12, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec
+    %14:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %13, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec
+    %15:vgpr_32(s32) = COPY $vgpr0
+    %16:sreg_32 = S_MOV_B32 1
+    %17:sreg_64 = V_CMP_LT_I32_e64 %15(s32), %16, implicit $exec
+    %18:sreg_64 = COPY %17
+    %19:sreg_64 = SI_IF %18, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    %20:vgpr_32 = V_ADD_F32_e32 %10, %11, implicit $mode, implicit $exec
+    %21:vgpr_32 = V_ADD_F32_e32 %13, %14, implicit $mode, implicit $exec
+
+  bb.2:
+    %22:vgpr_32 = PHI %3, %bb.0, %20, %bb.1
+    %23:vgpr_32 = PHI %4, %bb.0, %21, %bb.1
+    SI_END_CF %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.3:
+    S_ENDPGM 0, implicit %22, implicit %23
+...
+---
+name:            test_no_sink_into_if_cond_multiple_uses
+alignment:       1
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX9-LABEL: name: test_no_sink_into_if_cond_multiple_uses
+  ; GFX9: bb.0:
+  ; GFX9-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+  ; GFX9-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]]
+  ; GFX9-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-NEXT:   %9:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   %10:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %9, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-NEXT:   %12:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   %13:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %12, 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; GFX9-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT:   S_BRANCH %bb.1
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1:
+  ; GFX9-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %9, %10, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %12, %13, implicit $mode, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.2:
+  ; GFX9-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1
+  ; GFX9-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1
+  ; GFX9-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.3:
+  ; GFX9-NEXT:   [[V_ADD_F32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %13, %10, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    %1:vgpr_32 = COPY $vgpr0
+    %2:vgpr_32 = COPY $vgpr1
+    %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %5:sreg_64 = S_MOV_B64 0
+    %6:sreg_64 = S_MOV_B64 0
+    %7:vreg_64 = COPY %5
+    %8:vreg_64 = COPY %6
+    %9:vgpr_32 = GLOBAL_LOAD_DWORD killed %7, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    %10:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %9, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec
+    %11:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %10, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec
+    %12:vgpr_32 = GLOBAL_LOAD_DWORD killed %8, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    %13:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %12, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec
+    %14:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %13, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec
+    %15:vgpr_32(s32) = COPY $vgpr0
+    %16:sreg_32 = S_MOV_B32 1
+    %17:sreg_64 = V_CMP_LT_I32_e64 %15(s32), %16, implicit $exec
+    %18:sreg_64 = COPY %17
+    %19:sreg_64 = SI_IF %18, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    %20:vgpr_32 = V_ADD_F32_e32 %10, %11, implicit $mode, implicit $exec
+    %21:vgpr_32 = V_ADD_F32_e32 %13, %14, implicit $mode, implicit $exec
+
+  bb.2:
+    %22:vgpr_32 = PHI %3, %bb.0, %20, %bb.1
+    %23:vgpr_32 = PHI %4, %bb.0, %21, %bb.1
+    SI_END_CF %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.3:
+    %24:vgpr_32 = V_ADD_F32_e32 %14, %11, implicit $mode, implicit $exec
+    S_ENDPGM 0, implicit %22, implicit %23
+...
+---
+name:            no_sink_fmac_not_constant_mode
+alignment:       1
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX9-LABEL: name: no_sink_fmac_not_constant_mode
+  ; GFX9: bb.0:
+  ; GFX9-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   $mode = IMPLICIT_DEF
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+  ; GFX9-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]]
+  ; GFX9-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-NEXT:   %9:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   %10:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %9, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-NEXT:   %12:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   %13:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %12, 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; GFX9-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT:   S_BRANCH %bb.1
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1:
+  ; GFX9-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %9, %10, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %12, %13, implicit $mode, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.2:
+  ; GFX9-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1
+  ; GFX9-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1
+  ; GFX9-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.3:
+  ; GFX9-NEXT:   S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    $mode = IMPLICIT_DEF
+    %1:vgpr_32 = COPY $vgpr0
+    %2:vgpr_32 = COPY $vgpr1
+    %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %5:sreg_64 = S_MOV_B64 0
+    %6:sreg_64 = S_MOV_B64 0
+    %7:vreg_64 = COPY %5
+    %8:vreg_64 = COPY %6
+    %9:vgpr_32 = GLOBAL_LOAD_DWORD killed %7, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    %10:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %9, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec
+    %11:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %10, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec
+    %12:vgpr_32 = GLOBAL_LOAD_DWORD killed %8, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    %13:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %12, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec
+    %14:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %13, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec
+    %15:vgpr_32(s32) = COPY $vgpr0
+    %16:sreg_32 = S_MOV_B32 1
+    %17:sreg_64 = V_CMP_LT_I32_e64 %15(s32), %16, implicit $exec
+    %18:sreg_64 = COPY %17
+    %19:sreg_64 = SI_IF %18, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    %20:vgpr_32 = V_ADD_F32_e32 %10, %11, implicit $mode, implicit $exec
+    %21:vgpr_32 = V_ADD_F32_e32 %13, %14, implicit $mode, implicit $exec
+
+  bb.2:
+    %22:vgpr_32 = PHI %3, %bb.0, %20, %bb.1
+    %23:vgpr_32 = PHI %4, %bb.0, %21, %bb.1
+    SI_END_CF %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.3:
+    S_ENDPGM 0, implicit %22, implicit %23
+...
+---
+name:            test_no_sink_fmac_wwm
+alignment:       1
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX9-LABEL: name: test_no_sink_fmac_wwm
+  ; GFX9: bb.0:
+  ; GFX9-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+  ; GFX9-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-NEXT:   %5:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   early-clobber %6:vgpr_32 = STRICT_WWM %5, implicit $exec
+  ; GFX9-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; GFX9-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY3]](s32), [[S_MOV_B32_]], implicit $exec
+  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT:   S_BRANCH %bb.1
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1:
+  ; GFX9-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.2:
+  ; GFX9-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   S_NOP 0, implicit %5
+  ; GFX9-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.3:
+  ; GFX9-NEXT:   S_ENDPGM 0, implicit %6
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    %1:vgpr_32 = COPY $vgpr0
+    %2:vgpr_32 = COPY $vgpr1
+
+    %20:sreg_64 = S_MOV_B64 0
+    %30:vreg_64 = COPY %20
+    %29:vgpr_32 = GLOBAL_LOAD_DWORD killed %30, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    %6:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %29, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec
+    %9:vgpr_32 = STRICT_WWM %6, implicit $exec
+
+    %16:vgpr_32(s32) = COPY $vgpr0
+    %23:sreg_32 = S_MOV_B32 1
+    %24:sreg_64 = V_CMP_LT_I32_e64 %16(s32), %23, implicit $exec
+    %0:sreg_64 = COPY %24
+    %5:sreg_64 = SI_IF %0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+
+  bb.2:
+    S_NOP 0, implicit %6
+    SI_END_CF %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.3:
+    S_ENDPGM 0, implicit %9
+...
+---
+name:            test_def_and_use_in_loop_sink_fmac
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX9-LABEL: name: test_def_and_use_in_loop_sink_fmac
+  ; GFX9: bb.0.entry:
+  ; GFX9-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1:
+  ; GFX9-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+  ; GFX9-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-NEXT:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; GFX9-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT:   S_BRANCH %bb.2
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.2:
+  ; GFX9-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   S_NOP 0
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.3:
+  ; GFX9-NEXT:   successors: %bb.4(0x40000000), %bb.6(0x40000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   %6:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   %8:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   S_NOP 0, implicit %6, implicit %8
+  ; GFX9-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT:   S_CBRANCH_EXECZ %bb.6, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.4:
+  ; GFX9-NEXT:   successors: %bb.5(0x04000000), %bb.4(0x7c000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   S_NOP 0
+  ; GFX9-NEXT:   S_CBRANCH_EXECZ %bb.4, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.5:
+  ; GFX9-NEXT:   successors: %bb.6(0x80000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   S_NOP 0
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.6:
+  ; GFX9-NEXT:   successors: %bb.7(0x04000000), %bb.1(0x7c000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   S_CBRANCH_VCCZ %bb.1, implicit $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.7:
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   S_ENDPGM 0
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+    %101:vgpr_32 = COPY $vgpr0
+    %102:vgpr_32 = COPY $vgpr1
+    %15:vreg_64 = COPY $vgpr2_vgpr3
+
+  bb.1:
+    successors: %bb.2(0x40000000), %bb.3(0x40000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+
+    %20:sreg_64 = S_MOV_B64 0
+    %30:vreg_64 = COPY %20
+    %29:vgpr_32 = GLOBAL_LOAD_DWORD %30, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    %6:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %29, 0, %101, 0, %102, 0, 0, implicit $mode, implicit $exec
+    %31:vgpr_32 = GLOBAL_LOAD_DWORD %15, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    %7:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %31, 0, %101, 0, %102, 0, 0, implicit $mode, implicit $exec
+    %16:vgpr_32(s32) = COPY $vgpr0
+    %23:sreg_32 = S_MOV_B32 1
+    %24:sreg_64 = V_CMP_LT_I32_e64 %16(s32), %23, implicit $exec
+    %0:sreg_64 = COPY %24
+    %5:sreg_64 = SI_IF %0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+
+    S_NOP 0
+
+  bb.3:
+    successors: %bb.4(0x40000000), %bb.6(0x40000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+
+    S_NOP 0, implicit %6, implicit %7
+    SI_END_CF %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_CBRANCH_EXECZ %bb.6, implicit $exec
+
+  bb.4:
+    successors: %bb.5(0x04000000), %bb.4(0x7c000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+
+    S_NOP 0
+    S_CBRANCH_EXECZ %bb.4, implicit $exec
+
+  bb.5:
+    successors: %bb.6(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+
+    S_NOP 0
+
+  bb.6:
+    successors: %bb.7(0x04000000), %bb.1(0x7c000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+
+  bb.7:
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+    S_ENDPGM 0
+...
+---
+name:            test_no_sink_def_into_loop
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX9-LABEL: name: test_no_sink_def_into_loop
+  ; GFX9: bb.0.entry:
+  ; GFX9-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+  ; GFX9-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+  ; GFX9-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-NEXT:   %6:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-NEXT:   %8:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1:
+  ; GFX9-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   S_NOP 0, implicit %6, implicit %8
+  ; GFX9-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; GFX9-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT:   S_BRANCH %bb.2
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.2:
+  ; GFX9-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   S_NOP 0
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.3:
+  ; GFX9-NEXT:   successors: %bb.4(0x40000000), %bb.6(0x40000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT:   S_CBRANCH_EXECZ %bb.6, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.4:
+  ; GFX9-NEXT:   successors: %bb.5(0x04000000), %bb.4(0x7c000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   S_NOP 0
+  ; GFX9-NEXT:   S_CBRANCH_EXECZ %bb.4, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.5:
+  ; GFX9-NEXT:   successors: %bb.6(0x80000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   S_NOP 0
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.6:
+  ; GFX9-NEXT:   successors: %bb.7(0x04000000), %bb.1(0x7c000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   S_CBRANCH_VCCZ %bb.1, implicit $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.7:
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   S_ENDPGM 0
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+    %101:vgpr_32 = COPY $vgpr0
+    %102:vgpr_32 = COPY $vgpr1
+    %15:vreg_64 = COPY $vgpr2_vgpr3
+    %20:sreg_64 = S_MOV_B64 0
+    %30:vreg_64 = COPY %20
+    %29:vgpr_32 = GLOBAL_LOAD_DWORD killed %30, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    %6:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %29, 0, %101, 0, %102, 0, 0, implicit $mode, implicit $exec
+    %31:vgpr_32 = GLOBAL_LOAD_DWORD killed %15, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    %7:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %31, 0, %101, 0, %102, 0, 0, implicit $mode, implicit $exec
+
+  bb.1:
+    successors: %bb.2(0x40000000), %bb.3(0x40000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+
+    S_NOP 0, implicit %6, implicit %7
+    %16:vgpr_32(s32) = COPY $vgpr0
+    %23:sreg_32 = S_MOV_B32 1
+    %24:sreg_64 = V_CMP_LT_I32_e64 %16(s32), %23, implicit $exec
+    %0:sreg_64 = COPY %24
+    %5:sreg_64 = SI_IF %0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+
+    S_NOP 0
+
+  bb.3:
+    successors: %bb.4(0x40000000), %bb.6(0x40000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+
+    SI_END_CF %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_CBRANCH_EXECZ %bb.6, implicit $exec
+
+  bb.4:
+    successors: %bb.5(0x04000000), %bb.4(0x7c000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+
+    S_NOP 0
+    S_CBRANCH_EXECZ %bb.4, implicit $exec
+
+  bb.5:
+    successors: %bb.6(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+
+    S_NOP 0
+
+  bb.6:
+    successors: %bb.7(0x04000000), %bb.1(0x7c000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+
+  bb.7:
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+    S_ENDPGM 0
+...
+---
+name:            test_no_sink_def_into_loop2
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX9-LABEL: name: test_no_sink_def_into_loop2
+  ; GFX9: bb.0.entry:
+  ; GFX9-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+  ; GFX9-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+  ; GFX9-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-NEXT:   %6:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-NEXT:   %8:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX9-NEXT:   S_BRANCH %bb.1
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1:
+  ; GFX9-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   S_NOP 0
+  ; GFX9-NEXT:   S_BRANCH %bb.2
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.2:
+  ; GFX9-NEXT:   successors: %bb.3(0x40000000), %bb.4(0x40000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   S_NOP 0, implicit %6, implicit %8
+  ; GFX9-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; GFX9-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT:   S_BRANCH %bb.3
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.3:
+  ; GFX9-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   S_NOP 0
+  ; GFX9-NEXT:   S_BRANCH %bb.4
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.4:
+  ; GFX9-NEXT:   successors: %bb.5(0x40000000), %bb.7(0x40000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT:   S_CBRANCH_EXECZ %bb.7, implicit $exec
+  ; GFX9-NEXT:   S_BRANCH %bb.5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.5:
+  ; GFX9-NEXT:   successors: %bb.6(0x04000000), %bb.5(0x7c000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   S_NOP 0
+  ; GFX9-NEXT:   S_CBRANCH_EXECZ %bb.5, implicit $exec
+  ; GFX9-NEXT:   S_BRANCH %bb.6
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.6:
+  ; GFX9-NEXT:   successors: %bb.7(0x80000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   S_NOP 0
+  ; GFX9-NEXT:   S_BRANCH %bb.7
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.7:
+  ; GFX9-NEXT:   successors: %bb.8(0x04000000), %bb.2(0x7c000000)
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   S_CBRANCH_VCCZ %bb.2, implicit $vcc
+  ; GFX9-NEXT:   S_BRANCH %bb.8
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.8:
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   S_ENDPGM 0
+  bb.0.entry:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+    %101:vgpr_32 = COPY $vgpr0
+    %102:vgpr_32 = COPY $vgpr1
+    %15:vreg_64 = COPY $vgpr2_vgpr3
+    %20:sreg_64 = S_MOV_B64 0
+    %30:vreg_64 = COPY %20
+    %29:vgpr_32 = GLOBAL_LOAD_DWORD killed %30, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    %6:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %29, 0, %101, 0, %102, 0, 0, implicit $mode, implicit $exec
+    %31:vgpr_32 = GLOBAL_LOAD_DWORD killed %15, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    %7:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %31, 0, %101, 0, %102, 0, 0, implicit $mode, implicit $exec
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+    S_NOP 0
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3(0x40000000), %bb.4(0x40000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+
+    S_NOP 0, implicit %6, implicit %7
+    %16:vgpr_32(s32) = COPY $vgpr0
+    %23:sreg_32 = S_MOV_B32 1
+    %24:sreg_64 = V_CMP_LT_I32_e64 %16(s32), %23, implicit $exec
+    %0:sreg_64 = COPY %24
+    %5:sreg_64 = SI_IF %0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.4(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+
+    S_NOP 0
+    S_BRANCH %bb.4
+
+  bb.4:
+    successors: %bb.5(0x40000000), %bb.7(0x40000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+
+    SI_END_CF %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_CBRANCH_EXECZ %bb.7, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.5:
+    successors: %bb.6(0x04000000), %bb.5(0x7c000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+
+    S_NOP 0
+    S_CBRANCH_EXECZ %bb.5, implicit $exec
+    S_BRANCH %bb.6
+
+  bb.6:
+    successors: %bb.7(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+
+    S_NOP 0
+    S_BRANCH %bb.7
+
+  bb.7:
+    successors: %bb.8(0x04000000), %bb.2(0x7c000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+
+    S_CBRANCH_VCCZ %bb.2, implicit $vcc
+    S_BRANCH %bb.8
+
+  bb.8:
+    liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+    S_ENDPGM 0
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index 2fac70d96bc31..d1970a735a17a 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -54,17 +54,17 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3
 ; GFX9-LABEL: lsr_order_mul24_1:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v5, 1, v18
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
-; GFX9-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
-; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX9-NEXT:  ; %bb.1: ; %bb19
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v4, v4, v0
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffffff, v6
-; GFX9-NEXT:    v_lshl_add_u32 v6, v4, 2, v3
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v7
+; GFX9-NEXT:    v_and_b32_e32 v8, 1, v18
+; GFX9-NEXT:    v_add_u32_e32 v4, v4, v0
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v7
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX9-NEXT:    v_lshl_add_u32 v7, v4, 2, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 2, v2
 ; GFX9-NEXT:    v_add_u32_e32 v9, v17, v12
 ; GFX9-NEXT:    s_mov_b64 s[10:11], 0
@@ -76,7 +76,7 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3
 ; GFX9-NEXT:    v_add_u32_e32 v12, v17, v0
 ; GFX9-NEXT:    v_add_u32_e32 v19, v9, v0
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_madak_f32 v3, v3, v7, 0x3727c5ac
+; GFX9-NEXT:    v_madak_f32 v3, v3, v6, 0x3727c5ac
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v18, v3, v5
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v16
@@ -97,8 +97,8 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3
 ; GFX9-NEXT:    s_or_b64 s[10:11], s[6:7], s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[4:5]
-; GFX9-NEXT:    ds_write_b32 v6, v3
-; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
+; GFX9-NEXT:    ds_write_b32 v7, v3
+; GFX9-NEXT:    v_add_u32_e32 v7, v7, v8
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB1_2
 ; GFX9-NEXT:  .LBB1_3: ; %Flow3

diff  --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index 383760c7809fb..84cec8366259d 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1373,7 +1373,6 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) {
 ; SI-LABEL: complex_loop:
 ; SI:       ; %bb.0: ; %.entry
 ; SI-NEXT:    s_cmp_lt_i32 s0, 1
-; SI-NEXT:    v_mov_b32_e32 v2, -1
 ; SI-NEXT:    s_cbranch_scc1 .LBB15_7
 ; SI-NEXT:  ; %bb.1: ; %.lr.ph
 ; SI-NEXT:    s_mov_b64 s[2:3], exec
@@ -1405,7 +1404,10 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) {
 ; SI-NEXT:    s_branch .LBB15_2
 ; SI-NEXT:  .LBB15_6: ; %Flow
 ; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
-; SI-NEXT:  .LBB15_7: ; %._crit_edge
+; SI-NEXT:    exp mrt0 v2, v2, v0, v0 done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  .LBB15_7:
+; SI-NEXT:    v_mov_b32_e32 v2, -1
 ; SI-NEXT:    exp mrt0 v2, v2, v0, v0 done vm
 ; SI-NEXT:    s_endpgm
 ; SI-NEXT:  .LBB15_8:
@@ -1415,7 +1417,6 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) {
 ;
 ; GFX10-WAVE64-LABEL: complex_loop:
 ; GFX10-WAVE64:       ; %bb.0: ; %.entry
-; GFX10-WAVE64-NEXT:    v_mov_b32_e32 v2, -1
 ; GFX10-WAVE64-NEXT:    s_cmp_lt_i32 s0, 1
 ; GFX10-WAVE64-NEXT:    s_cbranch_scc1 .LBB15_7
 ; GFX10-WAVE64-NEXT:  ; %bb.1: ; %.lr.ph
@@ -1448,7 +1449,10 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) {
 ; GFX10-WAVE64-NEXT:    s_branch .LBB15_2
 ; GFX10-WAVE64-NEXT:  .LBB15_6: ; %Flow
 ; GFX10-WAVE64-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX10-WAVE64-NEXT:  .LBB15_7: ; %._crit_edge
+; GFX10-WAVE64-NEXT:    exp mrt0 v2, v2, v0, v0 done vm
+; GFX10-WAVE64-NEXT:    s_endpgm
+; GFX10-WAVE64-NEXT:  .LBB15_7:
+; GFX10-WAVE64-NEXT:    v_mov_b32_e32 v2, -1
 ; GFX10-WAVE64-NEXT:    exp mrt0 v2, v2, v0, v0 done vm
 ; GFX10-WAVE64-NEXT:    s_endpgm
 ; GFX10-WAVE64-NEXT:  .LBB15_8:
@@ -1458,7 +1462,6 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) {
 ;
 ; GFX10-WAVE32-LABEL: complex_loop:
 ; GFX10-WAVE32:       ; %bb.0: ; %.entry
-; GFX10-WAVE32-NEXT:    v_mov_b32_e32 v2, -1
 ; GFX10-WAVE32-NEXT:    s_cmp_lt_i32 s0, 1
 ; GFX10-WAVE32-NEXT:    s_cbranch_scc1 .LBB15_7
 ; GFX10-WAVE32-NEXT:  ; %bb.1: ; %.lr.ph
@@ -1491,7 +1494,10 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) {
 ; GFX10-WAVE32-NEXT:    s_branch .LBB15_2
 ; GFX10-WAVE32-NEXT:  .LBB15_6: ; %Flow
 ; GFX10-WAVE32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX10-WAVE32-NEXT:  .LBB15_7: ; %._crit_edge
+; GFX10-WAVE32-NEXT:    exp mrt0 v2, v2, v0, v0 done vm
+; GFX10-WAVE32-NEXT:    s_endpgm
+; GFX10-WAVE32-NEXT:  .LBB15_7:
+; GFX10-WAVE32-NEXT:    v_mov_b32_e32 v2, -1
 ; GFX10-WAVE32-NEXT:    exp mrt0 v2, v2, v0, v0 done vm
 ; GFX10-WAVE32-NEXT:    s_endpgm
 ; GFX10-WAVE32-NEXT:  .LBB15_8:

diff  --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
index 2ba35cd971972..d3e2df6763d5d 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
@@ -20,7 +20,6 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <
 ; MUBUF-NEXT:    v_mov_b32_e32 v3, 0
 ; MUBUF-NEXT:    v_mov_b32_e32 v4, 0x400000
 ; MUBUF-NEXT:    s_mov_b32 s32, 0xc0000
-; MUBUF-NEXT:    v_add_nc_u32_e64 v40, 4, 0x4000
 ; MUBUF-NEXT:    s_getpc_b64 s[4:5]
 ; MUBUF-NEXT:    s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4
 ; MUBUF-NEXT:    s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12
@@ -33,11 +32,12 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <
 ; MUBUF-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; MUBUF-NEXT:    s_cbranch_execz .LBB0_2
 ; MUBUF-NEXT:  ; %bb.1: ; %if.then4.i
+; MUBUF-NEXT:    v_add_nc_u32_e64 v0, 4, 0x4000
 ; MUBUF-NEXT:    s_clause 0x1
-; MUBUF-NEXT:    buffer_load_dword v0, v40, s[36:39], 0 offen
-; MUBUF-NEXT:    buffer_load_dword v1, v40, s[36:39], 0 offen offset:4
+; MUBUF-NEXT:    buffer_load_dword v1, v0, s[36:39], 0 offen
+; MUBUF-NEXT:    buffer_load_dword v2, v0, s[36:39], 0 offen offset:4
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
-; MUBUF-NEXT:    v_add_nc_u32_e32 v0, v1, v0
+; MUBUF-NEXT:    v_add_nc_u32_e32 v0, v2, v1
 ; MUBUF-NEXT:    v_mul_lo_u32 v0, 0x41c64e6d, v0
 ; MUBUF-NEXT:    v_add_nc_u32_e32 v0, 0x3039, v0
 ; MUBUF-NEXT:    buffer_store_dword v0, v0, s[36:39], 0 offen

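Check lines of the shape shown in these diffs are typically produced by the update script rather than edited by hand, so after a codegen change like this one they are usually regenerated wholesale. A typical invocation (paths as shown above; exact flags depend on the test's RUN lines):

    llvm/utils/update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll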