[llvm] [MachineSink] Add capability for aggressive loop sinking (PR #117247)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 9 08:40:45 PST 2024


https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/117247

>From b32aa2510a9724fcb815d3186dab1be469acc225 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 20 Nov 2024 14:24:09 -0800
Subject: [PATCH 1/8] [MachineSink] Add option for aggressive loop sinking

Change-Id: I62a6c6fc2c372523ce9ec98d084a434548609ead
---
 llvm/lib/CodeGen/MachineSink.cpp              | 184 +++++++++
 .../aggressive-loop-sink-nonstandard.ll       |  20 +
 .../machine-sink-ignorable-exec-use.mir       | 360 ++++++++++++++++++
 .../CodeGen/AMDGPU/machine-sink-lane-mask.mir | 208 ++++++----
 4 files changed, 703 insertions(+), 69 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll

diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index c470bd71dfb29f..d8dd6e8478686d 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -100,6 +100,12 @@ static cl::opt<bool>
                                 "register spills"),
                        cl::init(false), cl::Hidden);
 
+// Opt-in (off by default, hidden) switch guarding aggressive cycle sinking.
+static cl::opt<bool> AggressivelySinkInstsIntoCycle(
+    "aggressively-sink-insts-to-avoid-spills", cl::Hidden, cl::init(false),
+    cl::desc("Aggressively sink instructions into cycles to avoid register "
+             "spills"));
+
 static cl::opt<unsigned> SinkIntoCycleLimit(
     "machine-sink-cycle-limit",
     cl::desc(
@@ -256,6 +262,13 @@ class MachineSinking : public MachineFunctionPass {
                                SmallVectorImpl<MachineInstr *> &Candidates);
   bool SinkIntoCycle(MachineCycle *Cycle, MachineInstr &I);
 
+  bool isDead(const MachineInstr *MI) const;
+  bool AggressivelySinkIntoCycle(
+      MachineCycle *Cycle, MachineInstr &I,
+      DenseMap<MachineInstr *,
+               std::list<std::pair<MachineBasicBlock *, MachineInstr *>>>
+          SunkInstrs);
+
   bool isProfitableToSinkTo(Register Reg, MachineInstr &MI,
                             MachineBasicBlock *MBB,
                             MachineBasicBlock *SuccToSinkTo,
@@ -679,6 +692,10 @@ void MachineSinking::FindCycleSinkCandidates(
     SmallVectorImpl<MachineInstr *> &Candidates) {
   for (auto &MI : *BB) {
     LLVM_DEBUG(dbgs() << "CycleSink: Analysing candidate: " << MI);
+    if (MI.isDebugInstr()) {
+      LLVM_DEBUG(dbgs() << "CycleSink: Dont sink debug instructions\n");
+      continue;
+    }
     if (!TII->shouldSink(MI)) {
       LLVM_DEBUG(dbgs() << "CycleSink: Instruction not a candidate for this "
                            "target\n");
@@ -799,6 +816,30 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
+  if (AggressivelySinkInstsIntoCycle) {
+    SmallVector<MachineCycle *, 8> Cycles(CI->toplevel_cycles());
+    DenseMap<MachineInstr *,
+             std::list<std::pair<MachineBasicBlock *, MachineInstr *>>>
+        SunkInstrs;
+    for (auto *Cycle : Cycles) {
+      MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
+      if (!Preheader) {
+        LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Can't find preheader\n");
+        continue;
+      }
+      SmallVector<MachineInstr *, 8> Candidates;
+      FindCycleSinkCandidates(Cycle, Preheader, Candidates);
+
+      // Visit candidates in reverse so the use end of any def-use chain is
+      // handled first; candidates the sinker rejects are not counted.
+      for (MachineInstr *I : llvm::reverse(Candidates)) {
+        const bool Sunk = AggressivelySinkIntoCycle(Cycle, *I, SunkInstrs);
+        EverMadeChange |= Sunk;
+        NumCycleSunk += Sunk ? 1 : 0;
+      }
+    }
+  }
+
   HasStoreCache.clear();
   StoreInstrCache.clear();
 
@@ -1574,6 +1615,149 @@ bool MachineSinking::hasStoreBetween(MachineBasicBlock *From,
   return HasAliasedStore;
 }
 
+/// Returns true if \p MI can be deleted because none of its defs are read.
+/// Copied from DeadMachineInstructionElimImpl. A def of a physical register
+/// is always treated as live; a virtual register def is live if it has a
+/// non-debug use by an instruction other than \p MI. Past those checks,
+/// inline asm is never dead and lifetime markers always are.
+bool MachineSinking::isDead(const MachineInstr *MI) const {
+  // Instructions without side-effects are dead iff they only define dead regs.
+  // The def scan runs first because it bails out early in the common case;
+  // the remaining checks are only reached when no def has a real user.
+  for (const MachineOperand &DefMO : MI->all_defs()) {
+    const Register DefReg = DefMO.getReg();
+    if (DefReg.isPhysical())
+      return false;
+
+    if (DefMO.isDead()) {
+#ifndef NDEBUG
+      // Basic check on the register. All of them should be 'undef'.
+      for (auto &U : MRI->use_nodbg_operands(DefReg))
+        assert(U.isUndef() && "'Undef' use on a 'dead' register is found!");
+#endif
+      continue;
+    }
+
+    // A non-debug use by any instruction other than MI itself means the def
+    // is still needed. Don't delete the instruction!
+    for (const MachineInstr &User : MRI->use_nodbg_instructions(DefReg))
+      if (&User != MI)
+        return false;
+  }
+
+  // Technically speaking inline asm without side effects and no defs can still
+  // be deleted. But there is so much bad inline asm code out there, we should
+  // let them be.
+  if (MI->isInlineAsm())
+    return false;
+
+  // FIXME: See issue #105950 for why LIFETIME markers are considered dead here.
+  // Otherwise no def has a user, so defer to wouldBeTriviallyDead().
+  return MI->isLifetimeMarker() || MI->wouldBeTriviallyDead();
+}
+
+/// Aggressively sink \p I from the preheader of \p Cycle to reduce RP. Each
+/// non-PHI, non-prologue user inside the cycle is rewired to a clone of \p I
+/// inserted after the PHIs/labels of the user's block, with no limit on the
+/// amount or type of ops sunk. \p I is erased once it is dead. \p SunkInstrs
+/// is a by-value copy, so entries recorded here stay local to this call.
+/// \returns true if a use was rewritten or \p I was erased.
+bool MachineSinking::AggressivelySinkIntoCycle(
+    MachineCycle *Cycle, MachineInstr &I,
+    DenseMap<MachineInstr *,
+             std::list<std::pair<MachineBasicBlock *, MachineInstr *>>>
+        SunkInstrs) {
+  LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Finding sink block for: " << I);
+  assert(Cycle->getCyclePreheader() && "Cycle sink needs a preheader block");
+  // TODO: support instructions with multiple defs
+  if (I.getNumDefs() > 1)
+    return false;
+
+  // Operand 0 is treated as the def being sunk.
+  const Register DefReg = I.getOperand(0).getReg();
+  const unsigned DefSubReg = I.getOperand(0).getSubReg();
+
+  // Collect the users first so the use list is not walked while being rewritten.
+  SmallVector<MachineInstr *, 8> Uses;
+  for (MachineInstr &MI : MRI->use_instructions(DefReg))
+    Uses.push_back(&MI);
+
+  bool Changed = false;
+  for (MachineInstr *MI : Uses) {
+    LLVM_DEBUG(dbgs() << "AggressiveCycleSink:   Analysing use: " << *MI);
+    if (MI->isPHI()) {
+      LLVM_DEBUG(
+          dbgs() << "AggressiveCycleSink:   Not attempting to sink for PHI.\n");
+      continue;
+    }
+    // We cannot sink before the prologue
+    if (TII->isBasicBlockPrologue(*MI) || MI->isPosition()) {
+      LLVM_DEBUG(dbgs() << "AggressiveCycleSink:   Use is BasicBlock prologue, "
+                           "can't sink.\n");
+      continue;
+    }
+    if (!Cycle->contains(MI->getParent())) {
+      LLVM_DEBUG(
+          dbgs() << "AggressiveCycleSink:   Use not in cycle, can't sink.\n");
+      continue;
+    }
+
+    MachineBasicBlock *SinkBlock = MI->getParent();
+    MachineInstr *NewMI = nullptr;
+
+    // Check for the case in which we have already sunk a copy of this
+    // instruction into the user block.
+    auto SunkIt = SunkInstrs.find(&I);
+    if (SunkIt != SunkInstrs.end()) {
+      for (const auto &SunkEntry : SunkIt->second) {
+        if (SunkEntry.first != SinkBlock)
+          continue;
+        LLVM_DEBUG(dbgs() << "AggressiveCycleSink:   Already sunk to block: "
+                          << printMBBReference(*SinkBlock) << "\n");
+        NewMI = SunkEntry.second;
+        break;
+      }
+    }
+
+    // Create a copy of the instruction in the use block.
+    if (!NewMI) {
+      LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Sinking instruction to block: "
+                        << printMBBReference(*SinkBlock) << "\n");
+
+      NewMI = I.getMF()->CloneMachineInstr(&I);
+      if (DefReg.isVirtual()) {
+        const TargetRegisterClass *TRC = MRI->getRegClass(DefReg);
+        Register DestReg = MRI->createVirtualRegister(TRC);
+        NewMI->substituteRegister(DefReg, DestReg, DefSubReg, *TRI);
+      }
+      SinkBlock->insert(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()),
+                        NewMI);
+      SunkInstrs[&I].push_back({SinkBlock, NewMI});
+
+      // Conservatively clear any kill flags on uses of sunk instruction
+      for (MachineOperand &MO : NewMI->operands())
+        if (MO.isReg() && MO.readsReg())
+          RegsToClearKillFlags.insert(MO.getReg());
+
+      // The instruction is moved from its basic block, so do not retain the
+      // debug information.
+      assert(!NewMI->isDebugInstr() && "Should not sink debug inst");
+      NewMI->setDebugLoc(DebugLoc());
+    }
+
+    // Replace the use with the newly created virtual register.
+    MI->substituteRegister(DefReg, NewMI->getOperand(0).getReg(), DefSubReg,
+                           *TRI);
+    Changed = true;
+  }
+  // If we have replaced all uses, then delete the dead instruction
+  if (isDead(&I)) {
+    I.eraseFromParent();
+    Changed = true;
+  }
+  return Changed;
+}
+
 /// Sink instructions into cycles if profitable. This especially tries to
 /// prevent register spills caused by register pressure if there is little to no
 /// overhead moving instructions into cycles.
diff --git a/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll b/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
new file mode 100644
index 00000000000000..72b4495297a1c5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 --aggressively-sink-insts-to-avoid-spills=1  < %s | FileCheck -check-prefix=SUNK %s
+
+; Check that various edge cases do not crash the compiler
+
+; Multiple uses of sunk valu, chain of sink candidates
+
+define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) {
+; SUNK-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst
+  ret half %result
+}
+
+; Sink candidates with multiple defs
+
+define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) {
+; SUNK-LABEL: memmove_p5_p5:
+entry:
+  tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
index efa21052e3ae2f..f93d8f3dde21b6 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
@@ -1,5 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink -o - %s | FileCheck -check-prefixes=GFX9 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink --aggressively-sink-insts-to-avoid-spills=1 -o - %s | FileCheck -check-prefixes=GFX9-SUNK %s
+
 
 ---
 name:            test_sink_fmac_to_only_use
@@ -48,6 +50,47 @@ body:             |
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT: bb.3:
   ; GFX9-NEXT:   S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
+  ;
+  ; GFX9-SUNK-LABEL: name: test_sink_fmac_to_only_use
+  ; GFX9-SUNK: bb.0:
+  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-SUNK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-SUNK-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+  ; GFX9-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]]
+  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-SUNK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; GFX9-SUNK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; GFX9-SUNK-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+  ; GFX9-SUNK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.1:
+  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.2:
+  ; GFX9-SUNK-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1
+  ; GFX9-SUNK-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1
+  ; GFX9-SUNK-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.3:
+  ; GFX9-SUNK-NEXT:   S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2
     %1:vgpr_32 = COPY $vgpr0
@@ -131,6 +174,48 @@ body:             |
   ; GFX9-NEXT: bb.3:
   ; GFX9-NEXT:   [[V_ADD_F32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_3]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
   ; GFX9-NEXT:   S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
+  ;
+  ; GFX9-SUNK-LABEL: name: test_no_sink_into_if_cond_multiple_uses
+  ; GFX9-SUNK: bb.0:
+  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-SUNK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-SUNK-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+  ; GFX9-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]]
+  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; GFX9-SUNK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; GFX9-SUNK-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+  ; GFX9-SUNK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.1:
+  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.2:
+  ; GFX9-SUNK-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1
+  ; GFX9-SUNK-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1
+  ; GFX9-SUNK-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.3:
+  ; GFX9-SUNK-NEXT:   [[V_ADD_F32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_3]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2
     %1:vgpr_32 = COPY $vgpr0
@@ -215,6 +300,48 @@ body:             |
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT: bb.3:
   ; GFX9-NEXT:   S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
+  ;
+  ; GFX9-SUNK-LABEL: name: no_sink_fmac_not_constant_mode
+  ; GFX9-SUNK: bb.0:
+  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   $mode = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-SUNK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-SUNK-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+  ; GFX9-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]]
+  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; GFX9-SUNK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; GFX9-SUNK-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+  ; GFX9-SUNK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.1:
+  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.2:
+  ; GFX9-SUNK-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1
+  ; GFX9-SUNK-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1
+  ; GFX9-SUNK-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.3:
+  ; GFX9-SUNK-NEXT:   S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2
     $mode = IMPLICIT_DEF
@@ -287,6 +414,36 @@ body:             |
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT: bb.3:
   ; GFX9-NEXT:   S_ENDPGM 0, implicit %6
+  ;
+  ; GFX9-SUNK-LABEL: name: test_no_sink_fmac_wwm
+  ; GFX9-SUNK: bb.0:
+  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-SUNK-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   early-clobber %6:vgpr_32 = STRICT_WWM [[V_FMAC_F32_e64_]], implicit $exec
+  ; GFX9-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; GFX9-SUNK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; GFX9-SUNK-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY3]](s32), [[S_MOV_B32_]], implicit $exec
+  ; GFX9-SUNK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.1:
+  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.2:
+  ; GFX9-SUNK-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_NOP 0, implicit [[V_FMAC_F32_e64_]]
+  ; GFX9-SUNK-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.3:
+  ; GFX9-SUNK-NEXT:   S_ENDPGM 0, implicit %6
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2
     %1:vgpr_32 = COPY $vgpr0
@@ -382,6 +539,69 @@ body:             |
   ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX9-SUNK-LABEL: name: test_def_and_use_in_loop_sink_fmac
+  ; GFX9-SUNK: bb.0.entry:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.1:
+  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-SUNK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; GFX9-SUNK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; GFX9-SUNK-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+  ; GFX9-SUNK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.2
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.2:
+  ; GFX9-SUNK-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_NOP 0
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.3:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x40000000), %bb.6(0x40000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]]
+  ; GFX9-SUNK-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.6, implicit $exec
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.4:
+  ; GFX9-SUNK-NEXT:   successors: %bb.5(0x04000000), %bb.4(0x7c000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_NOP 0
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.4, implicit $exec
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.5:
+  ; GFX9-SUNK-NEXT:   successors: %bb.6(0x80000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_NOP 0
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.6:
+  ; GFX9-SUNK-NEXT:   successors: %bb.7(0x04000000), %bb.1(0x7c000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_VCCZ %bb.1, implicit $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.7:
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_ENDPGM 0
   bb.0.entry:
     successors: %bb.1(0x80000000)
 
@@ -512,6 +732,69 @@ body:             |
   ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX9-SUNK-LABEL: name: test_no_sink_def_into_loop
+  ; GFX9-SUNK: bb.0.entry:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+  ; GFX9-SUNK-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.1:
+  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]]
+  ; GFX9-SUNK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; GFX9-SUNK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; GFX9-SUNK-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+  ; GFX9-SUNK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.2
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.2:
+  ; GFX9-SUNK-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_NOP 0
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.3:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x40000000), %bb.6(0x40000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.6, implicit $exec
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.4:
+  ; GFX9-SUNK-NEXT:   successors: %bb.5(0x04000000), %bb.4(0x7c000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_NOP 0
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.4, implicit $exec
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.5:
+  ; GFX9-SUNK-NEXT:   successors: %bb.6(0x80000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_NOP 0
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.6:
+  ; GFX9-SUNK-NEXT:   successors: %bb.7(0x04000000), %bb.1(0x7c000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_VCCZ %bb.1, implicit $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.7:
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_ENDPGM 0
   bb.0.entry:
     successors: %bb.1(0x80000000)
 
@@ -656,6 +939,83 @@ body:             |
   ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX9-SUNK-LABEL: name: test_no_sink_def_into_loop2
+  ; GFX9-SUNK: bb.0.entry:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX9-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+  ; GFX9-SUNK-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.1:
+  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_NOP 0
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.2
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.2:
+  ; GFX9-SUNK-NEXT:   successors: %bb.3(0x40000000), %bb.4(0x40000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]]
+  ; GFX9-SUNK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; GFX9-SUNK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; GFX9-SUNK-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+  ; GFX9-SUNK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.3
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.3:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_NOP 0
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.4:
+  ; GFX9-SUNK-NEXT:   successors: %bb.5(0x40000000), %bb.7(0x40000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.7, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.5
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.5:
+  ; GFX9-SUNK-NEXT:   successors: %bb.6(0x04000000), %bb.5(0x7c000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_NOP 0
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.5, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.6
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.6:
+  ; GFX9-SUNK-NEXT:   successors: %bb.7(0x80000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_NOP 0
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.7
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.7:
+  ; GFX9-SUNK-NEXT:   successors: %bb.8(0x04000000), %bb.2(0x7c000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_VCCZ %bb.2, implicit $vcc
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.8
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.8:
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_ENDPGM 0
   bb.0.entry:
     successors: %bb.1(0x40000000), %bb.2 (0x40000000)
 
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
index 04c80582f6f079..2a14b85cf2bd56 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
@@ -1,78 +1,148 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink -o -  %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink -o -  %s | FileCheck -check-prefixes=GFX10 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink -aggressively-sink-insts-to-avoid-spills=1 -o -  %s | FileCheck -check-prefixes=GFX10-SUNK %s
 
 ---
 name: multi_else_break
 tracksRegLiveness: true
 body: |
-  ; CHECK-LABEL: name: multi_else_break
-  ; CHECK: bb.0:
-  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
-  ; CHECK-NEXT:   liveins: $vgpr4, $vgpr5
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; CHECK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], implicit $exec
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.1:
-  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %9, %bb.6
-  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, %11, %bb.6
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.5(0x40000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]], %bb.1, %13, %bb.5
-  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, %15, %bb.5
-  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.1, %17, %bb.5
-  ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, %19, %bb.5
-  ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[PHI5]], [[COPY1]], implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
-  ; CHECK-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI3]], $exec_lo, implicit-def $scc
-  ; CHECK-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI2]], $exec_lo, implicit-def $scc
-  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_I32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; CHECK-NEXT:   S_BRANCH %bb.4
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.3:
-  ; CHECK-NEXT:   SI_END_CF %9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; CHECK-NEXT:   S_ENDPGM 0
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.4:
-  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI5]], [[S_MOV_B32_1]], 0, implicit $exec
-  ; CHECK-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[COPY]], [[V_ADD_U32_e64_]], implicit $exec
-  ; CHECK-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_]], $exec_lo, implicit-def $scc
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_ANDN2_B32_]]
-  ; CHECK-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_1]], $exec_lo, implicit-def $scc
-  ; CHECK-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc
-  ; CHECK-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_1]], [[S_AND_B32_]], implicit-def $scc
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.5:
-  ; CHECK-NEXT:   successors: %bb.6(0x04000000), %bb.2(0x7c000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]], %bb.2, [[S_OR_B32_2]], %bb.4
-  ; CHECK-NEXT:   [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]], %bb.2, [[COPY4]], %bb.4
-  ; CHECK-NEXT:   [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.2, [[V_ADD_U32_e64_]], %bb.4
-  ; CHECK-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; CHECK-NEXT:   [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI6]], [[PHI4]], implicit-def dead $scc
-  ; CHECK-NEXT:   SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; CHECK-NEXT:   S_BRANCH %bb.6
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.6:
-  ; CHECK-NEXT:   successors: %bb.3(0x04000000), %bb.1(0x7c000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PHI9:%[0-9]+]]:vgpr_32 = PHI [[PHI8]], %bb.5
-  ; CHECK-NEXT:   SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; CHECK-NEXT:   [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI7]], [[PHI]], implicit-def dead $scc
-  ; CHECK-NEXT:   SI_LOOP [[SI_IF_BREAK1]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; GFX10-LABEL: name: multi_else_break
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-NEXT:   liveins: $vgpr4, $vgpr5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], implicit $exec
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %9, %bb.6
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, %11, %bb.6
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.5(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]], %bb.1, %13, %bb.5
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, %15, %bb.5
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.1, %17, %bb.5
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, %19, %bb.5
+  ; GFX10-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[PHI5]], [[COPY1]], implicit $exec
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
+  ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI3]], $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI2]], $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_I32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX10-NEXT:   S_BRANCH %bb.4
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.3:
+  ; GFX10-NEXT:   SI_END_CF %9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.4:
+  ; GFX10-NEXT:   successors: %bb.5(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI5]], [[S_MOV_B32_1]], 0, implicit $exec
+  ; GFX10-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[COPY]], [[V_ADD_U32_e64_]], implicit $exec
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_]], $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_ANDN2_B32_]]
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_1]], $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_1]], [[S_AND_B32_]], implicit-def $scc
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.5:
+  ; GFX10-NEXT:   successors: %bb.6(0x04000000), %bb.2(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]], %bb.2, [[S_OR_B32_2]], %bb.4
+  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]], %bb.2, [[COPY4]], %bb.4
+  ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.2, [[V_ADD_U32_e64_]], %bb.4
+  ; GFX10-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI6]], [[PHI4]], implicit-def dead $scc
+  ; GFX10-NEXT:   SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX10-NEXT:   S_BRANCH %bb.6
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.6:
+  ; GFX10-NEXT:   successors: %bb.3(0x04000000), %bb.1(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI9:%[0-9]+]]:vgpr_32 = PHI [[PHI8]], %bb.5
+  ; GFX10-NEXT:   SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI7]], [[PHI]], implicit-def dead $scc
+  ; GFX10-NEXT:   SI_LOOP [[SI_IF_BREAK1]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX10-NEXT:   S_BRANCH %bb.3
+  ;
+  ; GFX10-SUNK-LABEL: name: multi_else_break
+  ; GFX10-SUNK: bb.0:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-SUNK-NEXT:   liveins: $vgpr4, $vgpr5
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+  ; GFX10-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; GFX10-SUNK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GFX10-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], implicit $exec
+  ; GFX10-SUNK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.1:
+  ; GFX10-SUNK-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %9, %bb.6
+  ; GFX10-SUNK-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, %11, %bb.6
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.2:
+  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x40000000), %bb.5(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]], %bb.1, %13, %bb.5
+  ; GFX10-SUNK-NEXT:   [[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, %15, %bb.5
+  ; GFX10-SUNK-NEXT:   [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.1, %17, %bb.5
+  ; GFX10-SUNK-NEXT:   [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, %19, %bb.5
+  ; GFX10-SUNK-NEXT:   [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[PHI5]], [[COPY1]], implicit $exec
+  ; GFX10-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
+  ; GFX10-SUNK-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI3]], $exec_lo, implicit-def $scc
+  ; GFX10-SUNK-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI2]], $exec_lo, implicit-def $scc
+  ; GFX10-SUNK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_I32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.3:
+  ; GFX10-SUNK-NEXT:   SI_END_CF %9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX10-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.4:
+  ; GFX10-SUNK-NEXT:   successors: %bb.5(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI5]], [[S_MOV_B32_1]], 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[COPY]], [[V_ADD_U32_e64_]], implicit $exec
+  ; GFX10-SUNK-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_]], $exec_lo, implicit-def $scc
+  ; GFX10-SUNK-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_ANDN2_B32_]]
+  ; GFX10-SUNK-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_1]], $exec_lo, implicit-def $scc
+  ; GFX10-SUNK-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc
+  ; GFX10-SUNK-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_1]], [[S_AND_B32_]], implicit-def $scc
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.5:
+  ; GFX10-SUNK-NEXT:   successors: %bb.6(0x04000000), %bb.2(0x7c000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]], %bb.2, [[S_OR_B32_2]], %bb.4
+  ; GFX10-SUNK-NEXT:   [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]], %bb.2, [[COPY4]], %bb.4
+  ; GFX10-SUNK-NEXT:   [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.2, [[V_ADD_U32_e64_]], %bb.4
+  ; GFX10-SUNK-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI6]], [[PHI4]], implicit-def dead $scc
+  ; GFX10-SUNK-NEXT:   SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.6
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.6:
+  ; GFX10-SUNK-NEXT:   successors: %bb.3(0x04000000), %bb.1(0x7c000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[PHI9:%[0-9]+]]:vgpr_32 = PHI [[PHI8]], %bb.5
+  ; GFX10-SUNK-NEXT:   SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI7]], [[PHI]], implicit-def dead $scc
+  ; GFX10-SUNK-NEXT:   SI_LOOP [[SI_IF_BREAK1]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.3
   bb.0:
     successors: %bb.1(0x80000000)
     liveins: $vgpr4, $vgpr5

>From 0813b87be351315d2a23c58e6f95c5be1eba5d45 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 22 Nov 2024 13:22:26 -0800
Subject: [PATCH 2/8] Address review comments

Change-Id: I975fab6cf7dba21788fb5677a5484916ef29d959
---
 llvm/lib/CodeGen/MachineSink.cpp              | 122 ++++-----
 .../aggressive-loop-sink-nonstandard.ll       | 245 +++++++++++++++++-
 .../machine-sink-ignorable-exec-use.mir       |   2 +-
 .../CodeGen/AMDGPU/machine-sink-lane-mask.mir |   4 +-
 4 files changed, 297 insertions(+), 76 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index d8dd6e8478686d..151348e6b1c1ba 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -101,7 +101,7 @@ static cl::opt<bool>
                        cl::init(false), cl::Hidden);
 
 static cl::opt<bool> AggressivelySinkInstsIntoCycle(
-    "aggressively-sink-insts-to-avoid-spills",
+    "aggressive-sink-insts-into-cycles",
     cl::desc("Aggressively sink instructions into cycles to avoid "
              "register spills"),
     cl::init(false), cl::Hidden);
@@ -118,6 +118,8 @@ STATISTIC(NumSplit, "Number of critical edges split");
 STATISTIC(NumCoalesces, "Number of copies coalesced");
 STATISTIC(NumPostRACopySink, "Number of copies sunk after RA");
 
+using RegSubRegPair = TargetInstrInfo::RegSubRegPair;
+
 namespace {
 
 class MachineSinking : public MachineFunctionPass {
@@ -263,11 +265,10 @@ class MachineSinking : public MachineFunctionPass {
   bool SinkIntoCycle(MachineCycle *Cycle, MachineInstr &I);
 
   bool isDead(const MachineInstr *MI) const;
-  bool AggressivelySinkIntoCycle(
+  bool aggressivelySinkIntoCycle(
       MachineCycle *Cycle, MachineInstr &I,
-      DenseMap<MachineInstr *,
-               std::list<std::pair<MachineBasicBlock *, MachineInstr *>>>
-          SunkInstrs);
+      DenseMap<std::pair<MachineInstr *, MachineBasicBlock *>, MachineInstr *>
+          &SunkInstrs);
 
   bool isProfitableToSinkTo(Register Reg, MachineInstr &MI,
                             MachineBasicBlock *MBB,
@@ -692,8 +693,8 @@ void MachineSinking::FindCycleSinkCandidates(
     SmallVectorImpl<MachineInstr *> &Candidates) {
   for (auto &MI : *BB) {
     LLVM_DEBUG(dbgs() << "CycleSink: Analysing candidate: " << MI);
-    if (MI.isDebugInstr()) {
-      LLVM_DEBUG(dbgs() << "CycleSink: Dont sink debug instructions\n");
+    if (MI.isMetaInstruction()) {
+      LLVM_DEBUG(dbgs() << "CycleSink: Dont sink meta instructions\n");
       continue;
     }
     if (!TII->shouldSink(MI)) {
@@ -786,8 +787,11 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
     EverMadeChange = true;
   }
 
-  if (SinkInstsIntoCycle) {
+  if (SinkInstsIntoCycle || AggressivelySinkInstsIntoCycle) {
     SmallVector<MachineCycle *, 8> Cycles(CI->toplevel_cycles());
+
+    DenseMap<std::pair<MachineInstr *, MachineBasicBlock *>, MachineInstr *>
+        SunkInstrs;
     for (auto *Cycle : Cycles) {
       MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
       if (!Preheader) {
@@ -801,7 +805,18 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
       // of a def-use chain, if there is any.
       // TODO: Sort the candidates using a cost-model.
       unsigned i = 0;
+
       for (MachineInstr *I : llvm::reverse(Candidates)) {
+        // AggressivelySinkInstsIntoCycle sinks a superset of instructions
+        // relative to regular cycle sinking. Thus, this option supersedes
+        // regular cycle sinking and captures all of its sinking opportunities.
+        if (AggressivelySinkInstsIntoCycle) {
+          aggressivelySinkIntoCycle(Cycle, *I, SunkInstrs);
+          EverMadeChange = true;
+          ++NumCycleSunk;
+          continue;
+        }
+
         if (i++ == SinkIntoCycleLimit) {
           LLVM_DEBUG(dbgs() << "CycleSink:   Limit reached of instructions to "
                                "be analysed.");
@@ -816,30 +831,6 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
-  if (AggressivelySinkInstsIntoCycle) {
-    SmallVector<MachineCycle *, 8> Cycles(CI->toplevel_cycles());
-    DenseMap<MachineInstr *,
-             std::list<std::pair<MachineBasicBlock *, MachineInstr *>>>
-        SunkInstrs;
-    for (auto *Cycle : Cycles) {
-      MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
-      if (!Preheader) {
-        LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Can't find preheader\n");
-        continue;
-      }
-      SmallVector<MachineInstr *, 8> Candidates;
-      FindCycleSinkCandidates(Cycle, Preheader, Candidates);
-
-      // Walk the candidates in reverse order so that we start with the use
-      // of a def-use chain, if there is any.
-      for (MachineInstr *I : llvm::reverse(Candidates)) {
-        AggressivelySinkIntoCycle(Cycle, *I, SunkInstrs);
-        EverMadeChange = true;
-        ++NumCycleSunk;
-      }
-    }
-  }
-
   HasStoreCache.clear();
   StoreInstrCache.clear();
 
@@ -1615,31 +1606,27 @@ bool MachineSinking::hasStoreBetween(MachineBasicBlock *From,
   return HasAliasedStore;
 }
 
-/// Copy paste from DeadMachineInstructionElimImpl
-
 bool MachineSinking::isDead(const MachineInstr *MI) const {
   // Instructions without side-effects are dead iff they only define dead regs.
   // This function is hot and this loop returns early in the common case,
   // so only perform additional checks before this if absolutely necessary.
+
   for (const MachineOperand &MO : MI->all_defs()) {
     Register Reg = MO.getReg();
-    if (Reg.isPhysical()) {
+    if (Reg.isPhysical())
       return false;
-    } else {
-      if (MO.isDead()) {
+
+    if (MO.isDead()) {
 #ifndef NDEBUG
-        // Basic check on the register. All of them should be 'undef'.
-        for (auto &U : MRI->use_nodbg_operands(Reg))
-          assert(U.isUndef() && "'Undef' use on a 'dead' register is found!");
+      // Basic check on the register. All of them should be 'undef'.
+      for (auto &U : MRI->use_nodbg_operands(Reg))
+        assert(U.isUndef() && "'Undef' use on a 'dead' register is found!");
 #endif
-        continue;
-      }
-      for (const MachineInstr &Use : MRI->use_nodbg_instructions(Reg)) {
-        if (&Use != MI)
-          // This def has a non-debug use. Don't delete the instruction!
-          return false;
-      }
+      continue;
     }
+
+    if (!(MRI->hasAtMostUserInstrs(Reg, 0)))
+      return false;
   }
 
   // Technically speaking inline asm without side effects and no defs can still
@@ -1661,25 +1648,24 @@ bool MachineSinking::isDead(const MachineInstr *MI) const {
 /// In particular, it will sink into multiple successor blocks without limits
 /// based on the amount of sinking, or the type of ops being sunk (so long as
 /// they are safe to sink).
-bool MachineSinking::AggressivelySinkIntoCycle(
+bool MachineSinking::aggressivelySinkIntoCycle(
     MachineCycle *Cycle, MachineInstr &I,
-    DenseMap<MachineInstr *,
-             std::list<std::pair<MachineBasicBlock *, MachineInstr *>>>
-        SunkInstrs) {
+    DenseMap<std::pair<MachineInstr *, MachineBasicBlock *>, MachineInstr *>
+        &SunkInstrs) {
   LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Finding sink block for: " << I);
   MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
   assert(Preheader && "Cycle sink needs a preheader block");
-  SmallVector<std::pair<MachineOperand, MachineInstr *>> Uses;
+  SmallVector<std::pair<RegSubRegPair, MachineInstr *>> Uses;
   // TODO: support instructions with multiple defs
   if (I.getNumDefs() > 1)
     return false;
 
-  MachineOperand DefMO = I.getOperand(0);
+  MachineOperand &DefMO = I.getOperand(0);
   for (MachineInstr &MI : MRI->use_instructions(DefMO.getReg())) {
-    Uses.push_back({DefMO, &MI});
+    Uses.push_back({{DefMO.getReg(), DefMO.getSubReg()}, &MI});
   }
 
-  for (std::pair<MachineOperand, MachineInstr *> Entry : Uses) {
+  for (std::pair<RegSubRegPair, MachineInstr *> Entry : Uses) {
     MachineInstr *MI = Entry.second;
     LLVM_DEBUG(dbgs() << "AggressiveCycleSink:   Analysing use: " << MI);
     if (MI->isPHI()) {
@@ -1701,22 +1687,14 @@ bool MachineSinking::AggressivelySinkIntoCycle(
 
     MachineBasicBlock *SinkBlock = MI->getParent();
     MachineInstr *NewMI = nullptr;
+    std::pair<MachineInstr *, MachineBasicBlock *> MapEntry(&I, SinkBlock);
 
     // Check for the case in which we have already sunk a copy of this
     // instruction into the user block.
-    if (SunkInstrs.contains(&I)) {
-      auto SunkBlocks = SunkInstrs[&I];
-      auto Match = std::find_if(
-          SunkBlocks.begin(), SunkBlocks.end(),
-          [&SinkBlock](
-              std::pair<MachineBasicBlock *, MachineInstr *> SunkEntry) {
-            return SunkEntry.first == SinkBlock;
-          });
-      if (Match != SunkBlocks.end()) {
-        LLVM_DEBUG(dbgs() << "AggressiveCycleSink:   Already sunk to block: "
-                          << printMBBReference(*SinkBlock) << "\n");
-        NewMI = Match->second;
-      }
+    if (SunkInstrs.contains(MapEntry)) {
+      LLVM_DEBUG(dbgs() << "AggressiveCycleSink:   Already sunk to block: "
+                        << printMBBReference(*SinkBlock) << "\n");
+      NewMI = SunkInstrs[MapEntry];
     }
 
     // Create a copy of the instruction in the use block.
@@ -1733,7 +1711,7 @@ bool MachineSinking::AggressivelySinkIntoCycle(
       }
       SinkBlock->insert(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()),
                         NewMI);
-      SunkInstrs[&I].push_back({SinkBlock, NewMI});
+      SunkInstrs[MapEntry] = NewMI;
     }
 
     // Conservatively clear any kill flags on uses of sunk instruction
@@ -1748,9 +1726,9 @@ bool MachineSinking::AggressivelySinkIntoCycle(
     NewMI->setDebugLoc(DebugLoc());
 
     // Replace the use with the newly created virtual register.
-    MachineOperand UseMO = Entry.first;
-    MI->substituteRegister(UseMO.getReg(), NewMI->getOperand(0).getReg(),
-                           UseMO.getSubReg(), *TRI);
+    RegSubRegPair &UseReg = Entry.first;
+    MI->substituteRegister(UseReg.Reg, NewMI->getOperand(0).getReg(),
+                           UseReg.SubReg, *TRI);
   }
   // If we have replaced all uses, then delete the dead instruction
   if (isDead(&I))
diff --git a/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll b/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
index 72b4495297a1c5..9e53b8434cc021 100644
--- a/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
+++ b/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 --aggressively-sink-insts-to-avoid-spills=1  < %s | FileCheck -check-prefix=SUNK %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 --aggressive-sink-insts-into-cycles=1 < %s | FileCheck -check-prefix=SUNK %s
 
 ; Check that various edge cases do not crash the compiler
 
@@ -6,6 +7,39 @@
 
 define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) {
 ; SUNK-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
+; SUNK:       ; %bb.0:
+; SUNK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SUNK-NEXT:    v_mov_b32_e32 v3, v0
+; SUNK-NEXT:    v_and_b32_e32 v0, -4, v3
+; SUNK-NEXT:    global_load_dword v4, v[0:1], off
+; SUNK-NEXT:    v_and_b32_e32 v3, 3, v3
+; SUNK-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
+; SUNK-NEXT:    s_mov_b32 s2, 0xffff
+; SUNK-NEXT:    v_lshlrev_b32_e64 v5, v3, s2
+; SUNK-NEXT:    s_mov_b64 s[0:1], 0
+; SUNK-NEXT:    v_not_b32_e32 v5, v5
+; SUNK-NEXT:    v_max_f16_e32 v2, v2, v2
+; SUNK-NEXT:  .LBB0_1: ; %atomicrmw.start
+; SUNK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SUNK-NEXT:    s_waitcnt vmcnt(0)
+; SUNK-NEXT:    v_mov_b32_e32 v7, v4
+; SUNK-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
+; SUNK-NEXT:    v_max_f16_e32 v4, v4, v4
+; SUNK-NEXT:    v_min_f16_e32 v4, v4, v2
+; SUNK-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
+; SUNK-NEXT:    v_and_or_b32 v6, v7, v5, v4
+; SUNK-NEXT:    buffer_wbl2 sc1
+; SUNK-NEXT:    global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0
+; SUNK-NEXT:    s_waitcnt vmcnt(0)
+; SUNK-NEXT:    buffer_inv sc1
+; SUNK-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
+; SUNK-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SUNK-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; SUNK-NEXT:    s_cbranch_execnz .LBB0_1
+; SUNK-NEXT:  ; %bb.2: ; %atomicrmw.end
+; SUNK-NEXT:    s_or_b64 exec, exec, s[0:1]
+; SUNK-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
+; SUNK-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst
   ret half %result
 }
@@ -14,7 +48,216 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
 
 define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) {
 ; SUNK-LABEL: memmove_p5_p5:
+; SUNK:       ; %bb.0: ; %entry
+; SUNK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SUNK-NEXT:    v_and_b32_e32 v4, 15, v2
+; SUNK-NEXT:    v_mov_b32_e32 v5, 0
+; SUNK-NEXT:    v_and_b32_e32 v6, -16, v2
+; SUNK-NEXT:    v_mov_b32_e32 v7, v3
+; SUNK-NEXT:    v_cmp_ne_u64_e64 s[0:1], 0, v[4:5]
+; SUNK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; SUNK-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v0
+; SUNK-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
+; SUNK-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SUNK-NEXT:    s_cbranch_execnz .LBB1_3
+; SUNK-NEXT:  ; %bb.1: ; %Flow46
+; SUNK-NEXT:    s_andn2_saveexec_b64 s[2:3], s[4:5]
+; SUNK-NEXT:    s_cbranch_execnz .LBB1_10
+; SUNK-NEXT:  .LBB1_2: ; %Flow47
+; SUNK-NEXT:    s_or_b64 exec, exec, s[2:3]
+; SUNK-NEXT:    s_waitcnt vmcnt(0)
+; SUNK-NEXT:    s_setpc_b64 s[30:31]
+; SUNK-NEXT:  .LBB1_3: ; %memmove_copy_forward
+; SUNK-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SUNK-NEXT:    s_cbranch_execz .LBB1_6
+; SUNK-NEXT:  ; %bb.4: ; %memmove_fwd_main_loop.preheader
+; SUNK-NEXT:    s_mov_b64 s[8:9], 0
+; SUNK-NEXT:    v_mov_b32_e32 v3, v1
+; SUNK-NEXT:    v_mov_b32_e32 v8, v0
+; SUNK-NEXT:  .LBB1_5: ; %memmove_fwd_main_loop
+; SUNK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SUNK-NEXT:    scratch_load_dwordx4 v[10:13], v3, off
+; SUNK-NEXT:    v_lshl_add_u64 v[6:7], v[6:7], 0, -16
+; SUNK-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[6:7]
+; SUNK-NEXT:    v_add_u32_e32 v3, 16, v3
+; SUNK-NEXT:    s_or_b64 s[8:9], s[2:3], s[8:9]
+; SUNK-NEXT:    s_waitcnt vmcnt(0)
+; SUNK-NEXT:    scratch_store_dwordx4 v8, v[10:13], off
+; SUNK-NEXT:    v_add_u32_e32 v8, 16, v8
+; SUNK-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; SUNK-NEXT:    s_cbranch_execnz .LBB1_5
+; SUNK-NEXT:  .LBB1_6: ; %Flow41
+; SUNK-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SUNK-NEXT:    s_and_saveexec_b64 s[6:7], s[0:1]
+; SUNK-NEXT:    s_cbranch_execz .LBB1_9
+; SUNK-NEXT:  ; %bb.7: ; %memmove_fwd_residual_loop.preheader
+; SUNK-NEXT:    v_and_b32_e32 v2, -16, v2
+; SUNK-NEXT:    v_add_u32_e32 v0, v0, v2
+; SUNK-NEXT:    v_add_u32_e32 v1, v1, v2
+; SUNK-NEXT:    s_mov_b64 s[8:9], 0
+; SUNK-NEXT:  .LBB1_8: ; %memmove_fwd_residual_loop
+; SUNK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SUNK-NEXT:    scratch_load_ubyte v2, v1, off
+; SUNK-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, -1
+; SUNK-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[4:5]
+; SUNK-NEXT:    v_add_u32_e32 v1, 1, v1
+; SUNK-NEXT:    s_or_b64 s[8:9], s[2:3], s[8:9]
+; SUNK-NEXT:    s_waitcnt vmcnt(0)
+; SUNK-NEXT:    scratch_store_byte v0, v2, off
+; SUNK-NEXT:    v_add_u32_e32 v0, 1, v0
+; SUNK-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; SUNK-NEXT:    s_cbranch_execnz .LBB1_8
+; SUNK-NEXT:  .LBB1_9: ; %Flow39
+; SUNK-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SUNK-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SUNK-NEXT:    ; implicit-def: $vgpr0
+; SUNK-NEXT:    ; implicit-def: $vgpr1
+; SUNK-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; SUNK-NEXT:    s_andn2_saveexec_b64 s[2:3], s[4:5]
+; SUNK-NEXT:    s_cbranch_execz .LBB1_2
+; SUNK-NEXT:  .LBB1_10: ; %memmove_copy_backwards
+; SUNK-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; SUNK-NEXT:    s_cbranch_execz .LBB1_13
+; SUNK-NEXT:  ; %bb.11: ; %memmove_bwd_residual_loop.preheader
+; SUNK-NEXT:    v_add_u32_e32 v7, -1, v2
+; SUNK-NEXT:    v_add_u32_e32 v6, v0, v7
+; SUNK-NEXT:    v_add_u32_e32 v7, v1, v7
+; SUNK-NEXT:    s_mov_b64 s[6:7], 0
+; SUNK-NEXT:  .LBB1_12: ; %memmove_bwd_residual_loop
+; SUNK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SUNK-NEXT:    scratch_load_ubyte v8, v7, off
+; SUNK-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, -1
+; SUNK-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[4:5]
+; SUNK-NEXT:    v_add_u32_e32 v7, -1, v7
+; SUNK-NEXT:    s_or_b64 s[6:7], s[0:1], s[6:7]
+; SUNK-NEXT:    s_waitcnt vmcnt(0)
+; SUNK-NEXT:    scratch_store_byte v6, v8, off
+; SUNK-NEXT:    v_add_u32_e32 v6, -1, v6
+; SUNK-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; SUNK-NEXT:    s_cbranch_execnz .LBB1_12
+; SUNK-NEXT:  .LBB1_13: ; %Flow45
+; SUNK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SUNK-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; SUNK-NEXT:    s_cbranch_execz .LBB1_16
+; SUNK-NEXT:  ; %bb.14: ; %memmove_bwd_main_loop.preheader
+; SUNK-NEXT:    v_and_b32_e32 v5, -16, v2
+; SUNK-NEXT:    v_add_u32_e32 v4, -16, v5
+; SUNK-NEXT:    v_add_u32_e32 v2, v0, v4
+; SUNK-NEXT:    v_sub_co_u32_e32 v0, vcc, 0, v5
+; SUNK-NEXT:    v_add_u32_e32 v4, v1, v4
+; SUNK-NEXT:    s_mov_b64 s[4:5], 0
+; SUNK-NEXT:    v_subb_co_u32_e32 v1, vcc, 0, v3, vcc
+; SUNK-NEXT:  .LBB1_15: ; %memmove_bwd_main_loop
+; SUNK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SUNK-NEXT:    scratch_load_dwordx4 v[6:9], v4, off
+; SUNK-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 16
+; SUNK-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; SUNK-NEXT:    v_add_u32_e32 v4, -16, v4
+; SUNK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; SUNK-NEXT:    s_waitcnt vmcnt(0)
+; SUNK-NEXT:    scratch_store_dwordx4 v2, v[6:9], off
+; SUNK-NEXT:    v_add_u32_e32 v2, -16, v2
+; SUNK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; SUNK-NEXT:    s_cbranch_execnz .LBB1_15
+; SUNK-NEXT:  .LBB1_16: ; %Flow43
+; SUNK-NEXT:    s_or_b64 exec, exec, s[0:1]
+; SUNK-NEXT:    s_or_b64 exec, exec, s[2:3]
+; SUNK-NEXT:    s_waitcnt vmcnt(0)
+; SUNK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false)
   ret void
 }
+
+; We should not sink the mfma into the if/else as it is convergent
+
+define void @convergent_sink(<4 x i16> %in0, <4 x i16> %in1, i32 %val, i32 %v, ptr addrspace(1) %outptr) #2 {
+; SUNK-LABEL: convergent_sink:
+; SUNK:       ; %bb.0: ; %entry
+; SUNK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SUNK-NEXT:    v_mfma_f32_32x32x8_bf16 a[0:15], v[0:1], v[2:3], 0
+; SUNK-NEXT:    v_lshl_add_u32 v0, v5, 1, v5
+; SUNK-NEXT:    v_lshlrev_b32_e32 v2, 1, v5
+; SUNK-NEXT:    s_mov_b32 s4, 0
+; SUNK-NEXT:    s_mov_b64 s[0:1], 0
+; SUNK-NEXT:    v_mov_b32_e32 v5, 0xde
+; SUNK-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; SUNK-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; SUNK-NEXT:    s_branch .LBB2_2
+; SUNK-NEXT:  .LBB2_1: ; %end
+; SUNK-NEXT:    ; in Loop: Header=BB2_2 Depth=1
+; SUNK-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v4
+; SUNK-NEXT:    s_add_i32 s4, s4, 1
+; SUNK-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; SUNK-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; SUNK-NEXT:    s_cbranch_execz .LBB2_7
+; SUNK-NEXT:  .LBB2_2: ; %loop.body
+; SUNK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SUNK-NEXT:    s_cmp_lt_i32 s4, 6
+; SUNK-NEXT:    global_store_dword v[6:7], v5, off
+; SUNK-NEXT:    s_cbranch_scc0 .LBB2_4
+; SUNK-NEXT:  ; %bb.3: ; %else
+; SUNK-NEXT:    ; in Loop: Header=BB2_2 Depth=1
+; SUNK-NEXT:    v_lshl_add_u64 v[8:9], v[0:1], 3, v[6:7]
+; SUNK-NEXT:    global_store_dwordx4 v[8:9], a[12:15], off offset:48
+; SUNK-NEXT:    global_store_dwordx4 v[8:9], a[8:11], off offset:32
+; SUNK-NEXT:    global_store_dwordx4 v[8:9], a[4:7], off offset:16
+; SUNK-NEXT:    global_store_dwordx4 v[8:9], a[0:3], off
+; SUNK-NEXT:    s_mov_b64 s[2:3], 0
+; SUNK-NEXT:    s_branch .LBB2_5
+; SUNK-NEXT:  .LBB2_4: ; in Loop: Header=BB2_2 Depth=1
+; SUNK-NEXT:    s_mov_b64 s[2:3], -1
+; SUNK-NEXT:  .LBB2_5: ; %Flow
+; SUNK-NEXT:    ; in Loop: Header=BB2_2 Depth=1
+; SUNK-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; SUNK-NEXT:    v_mov_b32_e32 v8, v0
+; SUNK-NEXT:    s_cbranch_vccnz .LBB2_1
+; SUNK-NEXT:  ; %bb.6: ; %if
+; SUNK-NEXT:    ; in Loop: Header=BB2_2 Depth=1
+; SUNK-NEXT:    v_lshl_add_u64 v[8:9], v[2:3], 3, v[6:7]
+; SUNK-NEXT:    global_store_dwordx4 v[8:9], a[12:15], off offset:48
+; SUNK-NEXT:    global_store_dwordx4 v[8:9], a[8:11], off offset:32
+; SUNK-NEXT:    global_store_dwordx4 v[8:9], a[4:7], off offset:16
+; SUNK-NEXT:    global_store_dwordx4 v[8:9], a[0:3], off
+; SUNK-NEXT:    v_mov_b32_e32 v8, v2
+; SUNK-NEXT:    s_branch .LBB2_1
+; SUNK-NEXT:  .LBB2_7: ; %exit
+; SUNK-NEXT:    s_or_b64 exec, exec, s[0:1]
+; SUNK-NEXT:    global_store_dwordx4 v[6:7], a[12:15], off offset:48
+; SUNK-NEXT:    global_store_dwordx4 v[6:7], a[8:11], off offset:32
+; SUNK-NEXT:    global_store_dwordx4 v[6:7], a[4:7], off offset:16
+; SUNK-NEXT:    global_store_dwordx4 v[6:7], a[0:3], off
+; SUNK-NEXT:    s_waitcnt vmcnt(0)
+; SUNK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %1005 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %in0, <4 x i16> %in1, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
+  br label %loop.body
+
+loop.body:
+  %i = phi i32 [0, %entry], [%i.inc, %end]
+  store i32 222, ptr addrspace(1) %outptr
+  %cc = icmp sgt i32 %i, 5
+  br i1 %cc, label %if, label %else
+
+if:
+  %v.if = mul i32 %v, 2
+  %sptr.if =  getelementptr <4 x i16>, ptr addrspace(1) %outptr, i32 %v.if
+  store <16 x float> %1005, ptr addrspace(1) %sptr.if
+  br label %end
+
+else:
+  %v.else = mul i32 %v, 3
+  %sptr.else =  getelementptr <4 x i16>, ptr addrspace(1) %outptr, i32 %v.else
+  store <16 x float> %1005, ptr addrspace(1) %sptr.else
+  br label %end
+
+end:
+  %r = phi i32 [ %v.if, %if ], [ %v.else, %else ]
+  %cmp = icmp ne i32 %r, %val
+  %i.inc = add i32 %i, 1
+  br i1 %cmp, label %loop.body, label %exit
+
+exit:
+  store <16 x float> %1005, ptr addrspace(1) %outptr
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
index f93d8f3dde21b6..259abae6d92c87 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink -o - %s | FileCheck -check-prefixes=GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink --aggressively-sink-insts-to-avoid-spills=1 -o - %s | FileCheck -check-prefixes=GFX9-SUNK %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o - %s | FileCheck -check-prefixes=GFX9-SUNK %s
 
 
 ---
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
index 2a14b85cf2bd56..fafad600c47458 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink -o -  %s | FileCheck -check-prefixes=GFX10 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink -aggressively-sink-insts-to-avoid-spills=1 -o -  %s | FileCheck -check-prefixes=GFX10-SUNK %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o -  %s | FileCheck -check-prefixes=GFX10-SUNK %s
 
 ---
 name: multi_else_break
@@ -86,6 +86,7 @@ body: |
   ; GFX10-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], implicit $exec
   ; GFX10-SUNK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; GFX10-SUNK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; GFX10-SUNK-NEXT: {{  $}}
   ; GFX10-SUNK-NEXT: bb.1:
   ; GFX10-SUNK-NEXT:   successors: %bb.2(0x80000000)
@@ -100,7 +101,6 @@ body: |
   ; GFX10-SUNK-NEXT:   [[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, %15, %bb.5
   ; GFX10-SUNK-NEXT:   [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.1, %17, %bb.5
   ; GFX10-SUNK-NEXT:   [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, %19, %bb.5
-  ; GFX10-SUNK-NEXT:   [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; GFX10-SUNK-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[PHI5]], [[COPY1]], implicit $exec
   ; GFX10-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
   ; GFX10-SUNK-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI3]], $exec_lo, implicit-def $scc

>From 7e3caf54cfdd9eb757944f8cec657af2ff8c0efd Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Sat, 23 Nov 2024 14:00:44 -0800
Subject: [PATCH 3/8] Fix SystemZ test

Change-Id: I8f1138f9fc82251538f2c428f1e67fa2941266b5
---
 llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir b/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir
index 43c286a830b42e..52c9d1067220ee 100644
--- a/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir
+++ b/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir
@@ -25,14 +25,14 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[LARL:%[0-9]+]]:addr64bit = LARL @b
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:gr64bit = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:gr64bit = IMPLICIT_DEF
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[LA:%[0-9]+]]:gr64bit = LA [[LARL]], 49, $noreg
   ; CHECK-NEXT:   [[LGHI:%[0-9]+]]:gr64bit = LGHI 7
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:gr64bit = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:gr64bit = IMPLICIT_DEF
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0
   ; CHECK-NEXT:   $r2d = COPY [[DEF]]
   ; CHECK-NEXT:   $r3d = COPY [[LA]]

>From d09d4f1bdaec14b01c1cc1bd2cd8bccd58d0c8b4 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 2 Dec 2024 16:03:05 -0800
Subject: [PATCH 4/8] Add low latency check

Change-Id: Iec36f11060ca1b46b6c33130d4ee02863360c671
---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   |   7 +-
 llvm/lib/CodeGen/MachineSink.cpp              |  11 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |   8 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |   2 +-
 .../machine-sink-aggressive-latency.mir       | 107 ++++++++++++++++++
 5 files changed, 129 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/machine-sink-aggressive-latency.mir

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 07b59b241d9f9a..c4c69e5129602c 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1806,8 +1806,13 @@ class TargetInstrInfo : public MCInstrInfo {
   unsigned defaultDefLatency(const MCSchedModel &SchedModel,
                              const MachineInstr &DefMI) const;
 
+  /// Return true if this instruction is considered low latency.
+  virtual bool isLowLatencyInstruction(const MachineInstr &MI) const {
+    return false;
+  };
+
   /// Return true if this opcode has high latency to its result.
-  virtual bool isHighLatencyDef(int opc) const { return false; }
+  virtual bool isHighLatencyDef(int opc) const { return false; };
 
   /// Compute operand latency between a def of 'Reg'
   /// and a use in the current loop. Return true if the target considered
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 151348e6b1c1ba..8e4f02495229a2 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -1652,13 +1652,18 @@ bool MachineSinking::aggressivelySinkIntoCycle(
     MachineCycle *Cycle, MachineInstr &I,
     DenseMap<std::pair<MachineInstr *, MachineBasicBlock *>, MachineInstr *>
         &SunkInstrs) {
+  // TODO: support instructions with multiple defs
+  if (I.getNumDefs() > 1)
+    return false;
+
+  // Only sink instructions which the target considers to be low latency
+  if (!TII->isLowLatencyInstruction(I))
+    return false;
+
   LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Finding sink block for: " << I);
   MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
   assert(Preheader && "Cycle sink needs a preheader block");
   SmallVector<std::pair<RegSubRegPair, MachineInstr *>> Uses;
-  // TODO: support instructions with multiple defs
-  if (I.getNumDefs() > 1)
-    return false;
 
   MachineOperand &DefMO = I.getOperand(0);
   for (MachineInstr &MI : MRI->use_instructions(DefMO.getReg())) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c864f03f1f0f9e..f3ef9a25dd82c1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8676,7 +8676,13 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
   unsigned Opc = MI.getOpcode();
 
-  return isSMRD(Opc);
+  if (MI.isCopy() || isSMRD(Opc))
+    return true;
+
+  if (SchedModel.hasInstrSchedModel())
+    return SchedModel.computeInstrLatency(Opc) < 4;
+
+  return false;
 }
 
 bool SIInstrInfo::isHighLatencyDef(int Opc) const {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 1f7fff76d15210..f103eb9e97e9b4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1291,7 +1291,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   uint64_t getDefaultRsrcDataFormat() const;
   uint64_t getScratchRsrcWords23() const;
 
-  bool isLowLatencyInstruction(const MachineInstr &MI) const;
+  bool isLowLatencyInstruction(const MachineInstr &MI) const override;
   bool isHighLatencyDef(int Opc) const override;
 
   /// Return the descriptor of the target-specific machine instruction
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-aggressive-latency.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-aggressive-latency.mir
new file mode 100644
index 00000000000000..b5296a85b31352
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-aggressive-latency.mir
@@ -0,0 +1,107 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o -  %s | FileCheck -check-prefixes=GFX10-SUNK %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o -  %s | FileCheck -check-prefixes=GFX9-SUNK %s
+
+---
+name: latency_cycle_sink
+tracksRegLiveness: true
+body: |
+  ; GFX10-SUNK-LABEL: name: latency_cycle_sink
+  ; GFX10-SUNK: bb.0:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-SUNK-NEXT:   liveins: $vgpr4, $vgpr5
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[V_PK_MUL_LO_U16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[DEF]], 8, [[DEF]], 0, 0, 0, 0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.1:
+  ; GFX10-SUNK-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit undef $scc
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.2
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.2:
+  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_]], 8, [[V_PK_MUL_LO_U16_]], 0, 0, 0, 0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.3:
+  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_]], 8, [[V_PK_MUL_LO_U16_]], 0, 0, 0, 0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.4:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.5
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.5:
+  ; GFX10-SUNK-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX9-SUNK-LABEL: name: latency_cycle_sink
+  ; GFX9-SUNK: bb.0:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX9-SUNK-NEXT:   liveins: $vgpr4, $vgpr5
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.1:
+  ; GFX9-SUNK-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit undef $scc
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.2
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.2:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[V_PK_MUL_LO_U16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[DEF]], 8, [[DEF]], 0, 0, 0, 0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_]], 8, [[V_PK_MUL_LO_U16_]], 0, 0, 0, 0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.3:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[V_PK_MUL_LO_U16_2:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[DEF]], 8, [[DEF]], 0, 0, 0, 0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_2]], 8, [[V_PK_MUL_LO_U16_2]], 0, 0, 0, 0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.4:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.5
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.5:
+  ; GFX9-SUNK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $vgpr4, $vgpr5
+    %83:vgpr_32 = IMPLICIT_DEF
+    %80:vgpr_32 = V_PK_MUL_LO_U16 8, %83, 8, %83, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_CBRANCH_SCC1 %bb.3, implicit undef $scc
+    S_BRANCH %bb.2
+
+
+  bb.2:
+    %90:vgpr_32 = V_PK_MUL_LO_U16 8, %80, 8, %80, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    %90:vgpr_32 = V_PK_MUL_LO_U16 8, %80, 8, %80, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+    S_BRANCH %bb.5
+
+  bb.5:
+    S_ENDPGM 0
+...

>From 16b16e38a0b3d6b1de887830228e960aba30d87a Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 3 Dec 2024 14:38:28 -0800
Subject: [PATCH 5/8] Extra semicolon

Change-Id: I17405578571a711f53db71df0e9329600c01fceb
---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index c4c69e5129602c..bfc3450c97c024 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1812,7 +1812,7 @@ class TargetInstrInfo : public MCInstrInfo {
   };
 
   /// Return true if this opcode has high latency to its result.
-  virtual bool isHighLatencyDef(int opc) const { return false; };
+  virtual bool isHighLatencyDef(int opc) const { return false; }
 
   /// Compute operand latency between a def of 'Reg'
   /// and a use in the current loop. Return true if the target considered

>From 5dddd8498e30fdb1352a4015b52b50f4d3e5dcc5 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 5 Dec 2024 11:00:25 -0800
Subject: [PATCH 6/8] Use stage approach

Change-Id: I4082bd57dd03236e4d578dac4804949544f4dcf2
---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   |    5 -
 llvm/lib/CodeGen/MachineSink.cpp              |  208 ++-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |    8 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |    2 +-
 .../aggressive-loop-sink-nonstandard.ll       |  263 ----
 .../machine-sink-aggressive-latency.mir       |  107 --
 .../CodeGen/AMDGPU/machine-sink-cycle.mir     | 1272 +++++++++++++++++
 .../machine-sink-ignorable-exec-use.mir       |  360 +----
 .../CodeGen/AMDGPU/machine-sink-lane-mask.mir |   71 +-
 .../SystemZ/machinelicm-sunk-kill-flags.mir   |   14 +-
 10 files changed, 1364 insertions(+), 946 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/machine-sink-aggressive-latency.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/machine-sink-cycle.mir

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index bfc3450c97c024..07b59b241d9f9a 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1806,11 +1806,6 @@ class TargetInstrInfo : public MCInstrInfo {
   unsigned defaultDefLatency(const MCSchedModel &SchedModel,
                              const MachineInstr &DefMI) const;
 
-  /// Return true if this instruction is considered low latency.
-  virtual bool isLowLatencyInstruction(const MachineInstr &MI) const {
-    return false;
-  };
-
   /// Return true if this opcode has high latency to its result.
   virtual bool isHighLatencyDef(int opc) const { return false; }
 
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 8e4f02495229a2..3f2e790b059041 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -44,6 +44,7 @@
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/DebugInfoMetadata.h"
@@ -100,12 +101,6 @@ static cl::opt<bool>
                                 "register spills"),
                        cl::init(false), cl::Hidden);
 
-static cl::opt<bool> AggressivelySinkInstsIntoCycle(
-    "aggressive-sink-insts-into-cycles",
-    cl::desc("Aggressively sink instructions into cycles to avoid "
-             "register spills"),
-    cl::init(false), cl::Hidden);
-
 static cl::opt<unsigned> SinkIntoCycleLimit(
     "machine-sink-cycle-limit",
     cl::desc(
@@ -135,6 +130,7 @@ class MachineSinking : public MachineFunctionPass {
   const MachineBranchProbabilityInfo *MBPI = nullptr;
   AliasAnalysis *AA = nullptr;
   RegisterClassInfo RegClassInfo;
+  TargetSchedModel SchedModel;
 
   // Remember which edges have been considered for breaking.
   SmallSet<std::pair<MachineBasicBlock *, MachineBasicBlock *>, 8>
@@ -262,7 +258,6 @@ class MachineSinking : public MachineFunctionPass {
 
   void FindCycleSinkCandidates(MachineCycle *Cycle, MachineBasicBlock *BB,
                                SmallVectorImpl<MachineInstr *> &Candidates);
-  bool SinkIntoCycle(MachineCycle *Cycle, MachineInstr &I);
 
   bool isDead(const MachineInstr *MI) const;
   bool aggressivelySinkIntoCycle(
@@ -284,11 +279,14 @@ class MachineSinking : public MachineFunctionPass {
   GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB,
                          AllSuccsCache &AllSuccessors) const;
 
-  std::vector<unsigned> &getBBRegisterPressure(const MachineBasicBlock &MBB);
+  std::vector<unsigned> &getBBRegisterPressure(const MachineBasicBlock &MBB,
+                                               bool UseCache = true);
 
   bool registerPressureSetExceedsLimit(unsigned NRegs,
                                        const TargetRegisterClass *RC,
                                        const MachineBasicBlock &MBB);
+
+  bool registerPressureExceedsLimit(const MachineBasicBlock &MBB);
 };
 
 } // end anonymous namespace
@@ -787,48 +785,63 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
     EverMadeChange = true;
   }
 
-  if (SinkInstsIntoCycle || AggressivelySinkInstsIntoCycle) {
+  if (SinkInstsIntoCycle) {
     SmallVector<MachineCycle *, 8> Cycles(CI->toplevel_cycles());
+    SchedModel.init(STI);
+    enum CycleSinkStage { COPY, LOW_LATENCY, AGGRESSIVE, END };
 
-    DenseMap<std::pair<MachineInstr *, MachineBasicBlock *>, MachineInstr *>
-        SunkInstrs;
-    for (auto *Cycle : Cycles) {
-      MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
-      if (!Preheader) {
-        LLVM_DEBUG(dbgs() << "CycleSink: Can't find preheader\n");
-        continue;
-      }
-      SmallVector<MachineInstr *, 8> Candidates;
-      FindCycleSinkCandidates(Cycle, Preheader, Candidates);
-
-      // Walk the candidates in reverse order so that we start with the use
-      // of a def-use chain, if there is any.
-      // TODO: Sort the candidates using a cost-model.
-      unsigned i = 0;
-
-      for (MachineInstr *I : llvm::reverse(Candidates)) {
-        // AggressivelySinkInstsIntoCycle sinks a superset of instructions
-        // relative to regular cycle sinking. Thus, this option supercedes
-        // captures all sinking opportunites done
-        if (AggressivelySinkInstsIntoCycle) {
-          aggressivelySinkIntoCycle(Cycle, *I, SunkInstrs);
-          EverMadeChange = true;
-          ++NumCycleSunk;
+    CycleSinkStage Stage = CycleSinkStage::COPY;
+    bool HasHighPressure;
+    do {
+      HasHighPressure = false;
+      DenseMap<std::pair<MachineInstr *, MachineBasicBlock *>, MachineInstr *>
+          SunkInstrs;
+      for (auto *Cycle : Cycles) {
+        MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
+        if (!Preheader) {
+          LLVM_DEBUG(dbgs() << "CycleSink: Can't find preheader\n");
           continue;
         }
+        SmallVector<MachineInstr *, 8> Candidates;
+        FindCycleSinkCandidates(Cycle, Preheader, Candidates);
+
+        unsigned i = 0;
+
+        // Walk the candidates in reverse order so that we start with the use
+        // of a def-use chain, if there is any.
+        // TODO: Sort the candidates using a cost-model.
+        for (MachineInstr *I : llvm::reverse(Candidates)) {
+          // CycleSinkStage::COPY: Sink a limited number of copies
+          if (Stage == CycleSinkStage::COPY) {
+            if (i++ == SinkIntoCycleLimit) {
+              LLVM_DEBUG(dbgs()
+                         << "CycleSink:   Limit reached of instructions to "
+                            "be analysed.");
+              break;
+            }
+
+            if (!I->isCopy())
+              continue;
+          }
 
-        if (i++ == SinkIntoCycleLimit) {
-          LLVM_DEBUG(dbgs() << "CycleSink:   Limit reached of instructions to "
-                               "be analysed.");
-          break;
+          // CycleSinkStage::LOW_LATENCY: sink unlimited number of instructions
+          // which the target specifies as low-latency
+          if (Stage == CycleSinkStage::LOW_LATENCY &&
+              !TII->hasLowDefLatency(SchedModel, *I, 0))
+            continue;
+
+          if (!aggressivelySinkIntoCycle(Cycle, *I, SunkInstrs))
+            break;
+          EverMadeChange = true;
+          ++NumCycleSunk;
         }
 
-        if (!SinkIntoCycle(Cycle, *I))
-          break;
-        EverMadeChange = true;
-        ++NumCycleSunk;
+        // Recalculate the pressure after sinking
+        if (!HasHighPressure)
+          HasHighPressure = registerPressureExceedsLimit(*Preheader);
       }
-    }
+      Stage = (CycleSinkStage)(Stage + 1);
+    } while (HasHighPressure && Stage < CycleSinkStage::END);
   }
 
   HasStoreCache.clear();
@@ -1081,13 +1094,15 @@ bool MachineSinking::PostponeSplitCriticalEdge(MachineInstr &MI,
 }
 
 std::vector<unsigned> &
-MachineSinking::getBBRegisterPressure(const MachineBasicBlock &MBB) {
+MachineSinking::getBBRegisterPressure(const MachineBasicBlock &MBB,
+                                      bool UseCache) {
   // Currently to save compiling time, MBB's register pressure will not change
   // in one ProcessBlock iteration because of CachedRegisterPressure. but MBB's
   // register pressure is changed after sinking any instructions into it.
   // FIXME: need a accurate and cheap register pressure estiminate model here.
+
   auto RP = CachedRegisterPressure.find(&MBB);
-  if (RP != CachedRegisterPressure.end())
+  if (UseCache && RP != CachedRegisterPressure.end())
     return RP->second;
 
   RegionPressure Pressure;
@@ -1111,6 +1126,12 @@ MachineSinking::getBBRegisterPressure(const MachineBasicBlock &MBB) {
   }
 
   RPTracker.closeRegion();
+
+  if (RP != CachedRegisterPressure.end()) {
+    CachedRegisterPressure[&MBB] = RPTracker.getPressure().MaxSetPressure;
+    return CachedRegisterPressure[&MBB];
+  }
+
   auto It = CachedRegisterPressure.insert(
       std::make_pair(&MBB, RPTracker.getPressure().MaxSetPressure));
   return It.first->second;
@@ -1129,6 +1150,21 @@ bool MachineSinking::registerPressureSetExceedsLimit(
   return false;
 }
 
+// Recalculate MBB's register pressure (bypassing the cache) and check whether
+// any pressure set has reached (>=) its getRegPressureSetLimit value.
+bool MachineSinking::registerPressureExceedsLimit(
+    const MachineBasicBlock &MBB) {
+  // getBBRegisterPressure returns a reference; bind to it instead of copying.
+  const std::vector<unsigned> &BBRegisterPressure =
+      getBBRegisterPressure(MBB, /*UseCache=*/false);
+
+  for (unsigned PS = 0; PS < BBRegisterPressure.size(); ++PS)
+    if (BBRegisterPressure[PS] >=
+        TRI->getRegPressureSetLimit(*MBB.getParent(), PS))
+      return true;
+  return false;
+}
+
 /// isProfitableToSinkTo - Return true if it is profitable to sink MI.
 bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI,
                                           MachineBasicBlock *MBB,
@@ -1656,10 +1692,6 @@ bool MachineSinking::aggressivelySinkIntoCycle(
   if (I.getNumDefs() > 1)
     return false;
 
-  // Only sink instructions which the target considers to be low latency
-  if (!TII->isLowLatencyInstruction(I))
-    return false;
-
   LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Finding sink block for: " << I);
   MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
   assert(Preheader && "Cycle sink needs a preheader block");
@@ -1741,86 +1773,6 @@ bool MachineSinking::aggressivelySinkIntoCycle(
   return true;
 }
 
-/// Sink instructions into cycles if profitable. This especially tries to
-/// prevent register spills caused by register pressure if there is little to no
-/// overhead moving instructions into cycles.
-bool MachineSinking::SinkIntoCycle(MachineCycle *Cycle, MachineInstr &I) {
-  LLVM_DEBUG(dbgs() << "CycleSink: Finding sink block for: " << I);
-  MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
-  assert(Preheader && "Cycle sink needs a preheader block");
-  MachineBasicBlock *SinkBlock = nullptr;
-  bool CanSink = true;
-  const MachineOperand &MO = I.getOperand(0);
-
-  for (MachineInstr &MI : MRI->use_instructions(MO.getReg())) {
-    LLVM_DEBUG(dbgs() << "CycleSink:   Analysing use: " << MI);
-    if (!Cycle->contains(MI.getParent())) {
-      LLVM_DEBUG(dbgs() << "CycleSink:   Use not in cycle, can't sink.\n");
-      CanSink = false;
-      break;
-    }
-
-    // FIXME: Come up with a proper cost model that estimates whether sinking
-    // the instruction (and thus possibly executing it on every cycle
-    // iteration) is more expensive than a register.
-    // For now assumes that copies are cheap and thus almost always worth it.
-    if (!MI.isCopy()) {
-      LLVM_DEBUG(dbgs() << "CycleSink:   Use is not a copy\n");
-      CanSink = false;
-      break;
-    }
-    if (!SinkBlock) {
-      SinkBlock = MI.getParent();
-      LLVM_DEBUG(dbgs() << "CycleSink:   Setting sink block to: "
-                        << printMBBReference(*SinkBlock) << "\n");
-      continue;
-    }
-    SinkBlock = DT->findNearestCommonDominator(SinkBlock, MI.getParent());
-    if (!SinkBlock) {
-      LLVM_DEBUG(dbgs() << "CycleSink:   Can't find nearest dominator\n");
-      CanSink = false;
-      break;
-    }
-    LLVM_DEBUG(dbgs() << "CycleSink:   Setting nearest common dom block: "
-                      << printMBBReference(*SinkBlock) << "\n");
-  }
-
-  if (!CanSink) {
-    LLVM_DEBUG(dbgs() << "CycleSink: Can't sink instruction.\n");
-    return false;
-  }
-  if (!SinkBlock) {
-    LLVM_DEBUG(dbgs() << "CycleSink: Not sinking, can't find sink block.\n");
-    return false;
-  }
-  if (SinkBlock == Preheader) {
-    LLVM_DEBUG(
-        dbgs() << "CycleSink: Not sinking, sink block is the preheader\n");
-    return false;
-  }
-  if (SinkBlock->sizeWithoutDebugLargerThan(SinkLoadInstsPerBlockThreshold)) {
-    LLVM_DEBUG(
-        dbgs() << "CycleSink: Not Sinking, block too large to analyse.\n");
-    return false;
-  }
-
-  LLVM_DEBUG(dbgs() << "CycleSink: Sinking instruction!\n");
-  SinkBlock->splice(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()), Preheader,
-                    I);
-
-  // Conservatively clear any kill flags on uses of sunk instruction
-  for (MachineOperand &MO : I.operands()) {
-    if (MO.isReg() && MO.readsReg())
-      RegsToClearKillFlags.insert(MO.getReg());
-  }
-
-  // The instruction is moved from its basic block, so do not retain the
-  // debug information.
-  assert(!I.isDebugInstr() && "Should not sink debug inst");
-  I.setDebugLoc(DebugLoc());
-  return true;
-}
-
 /// SinkInstruction - Determine whether it is safe to sink the specified machine
 /// instruction out of its current block into a successor.
 bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f3ef9a25dd82c1..c864f03f1f0f9e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8676,13 +8676,7 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
   unsigned Opc = MI.getOpcode();
 
-  if (MI.isCopy() || isSMRD(Opc))
-    return true;
-
-  if (SchedModel.hasInstrSchedModel())
-    return SchedModel.computeInstrLatency(Opc) < 4;
-
-  return false;
+  return isSMRD(Opc);
 }
 
 bool SIInstrInfo::isHighLatencyDef(int Opc) const {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index f103eb9e97e9b4..1f7fff76d15210 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1291,7 +1291,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   uint64_t getDefaultRsrcDataFormat() const;
   uint64_t getScratchRsrcWords23() const;
 
-  bool isLowLatencyInstruction(const MachineInstr &MI) const override;
+  bool isLowLatencyInstruction(const MachineInstr &MI) const;
   bool isHighLatencyDef(int Opc) const override;
 
   /// Return the descriptor of the target-specific machine instruction
diff --git a/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll b/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
deleted file mode 100644
index 9e53b8434cc021..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
+++ /dev/null
@@ -1,263 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 --aggressive-sink-insts-into-cycles=1 < %s | FileCheck -check-prefix=SUNK %s
-
-; Check that various edge cases do not crash the compiler
-
-; Multiple uses of sunk valu, chain of sink candidates
-
-define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) {
-; SUNK-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
-; SUNK:       ; %bb.0:
-; SUNK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SUNK-NEXT:    v_mov_b32_e32 v3, v0
-; SUNK-NEXT:    v_and_b32_e32 v0, -4, v3
-; SUNK-NEXT:    global_load_dword v4, v[0:1], off
-; SUNK-NEXT:    v_and_b32_e32 v3, 3, v3
-; SUNK-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
-; SUNK-NEXT:    s_mov_b32 s2, 0xffff
-; SUNK-NEXT:    v_lshlrev_b32_e64 v5, v3, s2
-; SUNK-NEXT:    s_mov_b64 s[0:1], 0
-; SUNK-NEXT:    v_not_b32_e32 v5, v5
-; SUNK-NEXT:    v_max_f16_e32 v2, v2, v2
-; SUNK-NEXT:  .LBB0_1: ; %atomicrmw.start
-; SUNK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SUNK-NEXT:    s_waitcnt vmcnt(0)
-; SUNK-NEXT:    v_mov_b32_e32 v7, v4
-; SUNK-NEXT:    v_lshrrev_b32_e32 v4, v3, v7
-; SUNK-NEXT:    v_max_f16_e32 v4, v4, v4
-; SUNK-NEXT:    v_min_f16_e32 v4, v4, v2
-; SUNK-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
-; SUNK-NEXT:    v_and_or_b32 v6, v7, v5, v4
-; SUNK-NEXT:    buffer_wbl2 sc1
-; SUNK-NEXT:    global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0
-; SUNK-NEXT:    s_waitcnt vmcnt(0)
-; SUNK-NEXT:    buffer_inv sc1
-; SUNK-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
-; SUNK-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; SUNK-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; SUNK-NEXT:    s_cbranch_execnz .LBB0_1
-; SUNK-NEXT:  ; %bb.2: ; %atomicrmw.end
-; SUNK-NEXT:    s_or_b64 exec, exec, s[0:1]
-; SUNK-NEXT:    v_lshrrev_b32_e32 v0, v3, v4
-; SUNK-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst
-  ret half %result
-}
-
-; Sink candidates with multiple defs
-
-define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) {
-; SUNK-LABEL: memmove_p5_p5:
-; SUNK:       ; %bb.0: ; %entry
-; SUNK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SUNK-NEXT:    v_and_b32_e32 v4, 15, v2
-; SUNK-NEXT:    v_mov_b32_e32 v5, 0
-; SUNK-NEXT:    v_and_b32_e32 v6, -16, v2
-; SUNK-NEXT:    v_mov_b32_e32 v7, v3
-; SUNK-NEXT:    v_cmp_ne_u64_e64 s[0:1], 0, v[4:5]
-; SUNK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; SUNK-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v0
-; SUNK-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
-; SUNK-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; SUNK-NEXT:    s_cbranch_execnz .LBB1_3
-; SUNK-NEXT:  ; %bb.1: ; %Flow46
-; SUNK-NEXT:    s_andn2_saveexec_b64 s[2:3], s[4:5]
-; SUNK-NEXT:    s_cbranch_execnz .LBB1_10
-; SUNK-NEXT:  .LBB1_2: ; %Flow47
-; SUNK-NEXT:    s_or_b64 exec, exec, s[2:3]
-; SUNK-NEXT:    s_waitcnt vmcnt(0)
-; SUNK-NEXT:    s_setpc_b64 s[30:31]
-; SUNK-NEXT:  .LBB1_3: ; %memmove_copy_forward
-; SUNK-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; SUNK-NEXT:    s_cbranch_execz .LBB1_6
-; SUNK-NEXT:  ; %bb.4: ; %memmove_fwd_main_loop.preheader
-; SUNK-NEXT:    s_mov_b64 s[8:9], 0
-; SUNK-NEXT:    v_mov_b32_e32 v3, v1
-; SUNK-NEXT:    v_mov_b32_e32 v8, v0
-; SUNK-NEXT:  .LBB1_5: ; %memmove_fwd_main_loop
-; SUNK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SUNK-NEXT:    scratch_load_dwordx4 v[10:13], v3, off
-; SUNK-NEXT:    v_lshl_add_u64 v[6:7], v[6:7], 0, -16
-; SUNK-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[6:7]
-; SUNK-NEXT:    v_add_u32_e32 v3, 16, v3
-; SUNK-NEXT:    s_or_b64 s[8:9], s[2:3], s[8:9]
-; SUNK-NEXT:    s_waitcnt vmcnt(0)
-; SUNK-NEXT:    scratch_store_dwordx4 v8, v[10:13], off
-; SUNK-NEXT:    v_add_u32_e32 v8, 16, v8
-; SUNK-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; SUNK-NEXT:    s_cbranch_execnz .LBB1_5
-; SUNK-NEXT:  .LBB1_6: ; %Flow41
-; SUNK-NEXT:    s_or_b64 exec, exec, s[6:7]
-; SUNK-NEXT:    s_and_saveexec_b64 s[6:7], s[0:1]
-; SUNK-NEXT:    s_cbranch_execz .LBB1_9
-; SUNK-NEXT:  ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; SUNK-NEXT:    v_and_b32_e32 v2, -16, v2
-; SUNK-NEXT:    v_add_u32_e32 v0, v0, v2
-; SUNK-NEXT:    v_add_u32_e32 v1, v1, v2
-; SUNK-NEXT:    s_mov_b64 s[8:9], 0
-; SUNK-NEXT:  .LBB1_8: ; %memmove_fwd_residual_loop
-; SUNK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SUNK-NEXT:    scratch_load_ubyte v2, v1, off
-; SUNK-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, -1
-; SUNK-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[4:5]
-; SUNK-NEXT:    v_add_u32_e32 v1, 1, v1
-; SUNK-NEXT:    s_or_b64 s[8:9], s[2:3], s[8:9]
-; SUNK-NEXT:    s_waitcnt vmcnt(0)
-; SUNK-NEXT:    scratch_store_byte v0, v2, off
-; SUNK-NEXT:    v_add_u32_e32 v0, 1, v0
-; SUNK-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; SUNK-NEXT:    s_cbranch_execnz .LBB1_8
-; SUNK-NEXT:  .LBB1_9: ; %Flow39
-; SUNK-NEXT:    s_or_b64 exec, exec, s[6:7]
-; SUNK-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; SUNK-NEXT:    ; implicit-def: $vgpr0
-; SUNK-NEXT:    ; implicit-def: $vgpr1
-; SUNK-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; SUNK-NEXT:    s_andn2_saveexec_b64 s[2:3], s[4:5]
-; SUNK-NEXT:    s_cbranch_execz .LBB1_2
-; SUNK-NEXT:  .LBB1_10: ; %memmove_copy_backwards
-; SUNK-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
-; SUNK-NEXT:    s_cbranch_execz .LBB1_13
-; SUNK-NEXT:  ; %bb.11: ; %memmove_bwd_residual_loop.preheader
-; SUNK-NEXT:    v_add_u32_e32 v7, -1, v2
-; SUNK-NEXT:    v_add_u32_e32 v6, v0, v7
-; SUNK-NEXT:    v_add_u32_e32 v7, v1, v7
-; SUNK-NEXT:    s_mov_b64 s[6:7], 0
-; SUNK-NEXT:  .LBB1_12: ; %memmove_bwd_residual_loop
-; SUNK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SUNK-NEXT:    scratch_load_ubyte v8, v7, off
-; SUNK-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, -1
-; SUNK-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[4:5]
-; SUNK-NEXT:    v_add_u32_e32 v7, -1, v7
-; SUNK-NEXT:    s_or_b64 s[6:7], s[0:1], s[6:7]
-; SUNK-NEXT:    s_waitcnt vmcnt(0)
-; SUNK-NEXT:    scratch_store_byte v6, v8, off
-; SUNK-NEXT:    v_add_u32_e32 v6, -1, v6
-; SUNK-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; SUNK-NEXT:    s_cbranch_execnz .LBB1_12
-; SUNK-NEXT:  .LBB1_13: ; %Flow45
-; SUNK-NEXT:    s_or_b64 exec, exec, s[4:5]
-; SUNK-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; SUNK-NEXT:    s_cbranch_execz .LBB1_16
-; SUNK-NEXT:  ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; SUNK-NEXT:    v_and_b32_e32 v5, -16, v2
-; SUNK-NEXT:    v_add_u32_e32 v4, -16, v5
-; SUNK-NEXT:    v_add_u32_e32 v2, v0, v4
-; SUNK-NEXT:    v_sub_co_u32_e32 v0, vcc, 0, v5
-; SUNK-NEXT:    v_add_u32_e32 v4, v1, v4
-; SUNK-NEXT:    s_mov_b64 s[4:5], 0
-; SUNK-NEXT:    v_subb_co_u32_e32 v1, vcc, 0, v3, vcc
-; SUNK-NEXT:  .LBB1_15: ; %memmove_bwd_main_loop
-; SUNK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SUNK-NEXT:    scratch_load_dwordx4 v[6:9], v4, off
-; SUNK-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 16
-; SUNK-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; SUNK-NEXT:    v_add_u32_e32 v4, -16, v4
-; SUNK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; SUNK-NEXT:    s_waitcnt vmcnt(0)
-; SUNK-NEXT:    scratch_store_dwordx4 v2, v[6:9], off
-; SUNK-NEXT:    v_add_u32_e32 v2, -16, v2
-; SUNK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; SUNK-NEXT:    s_cbranch_execnz .LBB1_15
-; SUNK-NEXT:  .LBB1_16: ; %Flow43
-; SUNK-NEXT:    s_or_b64 exec, exec, s[0:1]
-; SUNK-NEXT:    s_or_b64 exec, exec, s[2:3]
-; SUNK-NEXT:    s_waitcnt vmcnt(0)
-; SUNK-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false)
-  ret void
-}
-
-; We should not sink the mfma into the if/else as it is convergent
-
-define void @convergent_sink(<4 x i16> %in0, <4 x i16> %in1, i32 %val, i32 %v, ptr addrspace(1) %outptr) #2 {
-; SUNK-LABEL: convergent_sink:
-; SUNK:       ; %bb.0: ; %entry
-; SUNK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SUNK-NEXT:    v_mfma_f32_32x32x8_bf16 a[0:15], v[0:1], v[2:3], 0
-; SUNK-NEXT:    v_lshl_add_u32 v0, v5, 1, v5
-; SUNK-NEXT:    v_lshlrev_b32_e32 v2, 1, v5
-; SUNK-NEXT:    s_mov_b32 s4, 0
-; SUNK-NEXT:    s_mov_b64 s[0:1], 0
-; SUNK-NEXT:    v_mov_b32_e32 v5, 0xde
-; SUNK-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; SUNK-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; SUNK-NEXT:    s_branch .LBB2_2
-; SUNK-NEXT:  .LBB2_1: ; %end
-; SUNK-NEXT:    ; in Loop: Header=BB2_2 Depth=1
-; SUNK-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v4
-; SUNK-NEXT:    s_add_i32 s4, s4, 1
-; SUNK-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; SUNK-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; SUNK-NEXT:    s_cbranch_execz .LBB2_7
-; SUNK-NEXT:  .LBB2_2: ; %loop.body
-; SUNK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SUNK-NEXT:    s_cmp_lt_i32 s4, 6
-; SUNK-NEXT:    global_store_dword v[6:7], v5, off
-; SUNK-NEXT:    s_cbranch_scc0 .LBB2_4
-; SUNK-NEXT:  ; %bb.3: ; %else
-; SUNK-NEXT:    ; in Loop: Header=BB2_2 Depth=1
-; SUNK-NEXT:    v_lshl_add_u64 v[8:9], v[0:1], 3, v[6:7]
-; SUNK-NEXT:    global_store_dwordx4 v[8:9], a[12:15], off offset:48
-; SUNK-NEXT:    global_store_dwordx4 v[8:9], a[8:11], off offset:32
-; SUNK-NEXT:    global_store_dwordx4 v[8:9], a[4:7], off offset:16
-; SUNK-NEXT:    global_store_dwordx4 v[8:9], a[0:3], off
-; SUNK-NEXT:    s_mov_b64 s[2:3], 0
-; SUNK-NEXT:    s_branch .LBB2_5
-; SUNK-NEXT:  .LBB2_4: ; in Loop: Header=BB2_2 Depth=1
-; SUNK-NEXT:    s_mov_b64 s[2:3], -1
-; SUNK-NEXT:  .LBB2_5: ; %Flow
-; SUNK-NEXT:    ; in Loop: Header=BB2_2 Depth=1
-; SUNK-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; SUNK-NEXT:    v_mov_b32_e32 v8, v0
-; SUNK-NEXT:    s_cbranch_vccnz .LBB2_1
-; SUNK-NEXT:  ; %bb.6: ; %if
-; SUNK-NEXT:    ; in Loop: Header=BB2_2 Depth=1
-; SUNK-NEXT:    v_lshl_add_u64 v[8:9], v[2:3], 3, v[6:7]
-; SUNK-NEXT:    global_store_dwordx4 v[8:9], a[12:15], off offset:48
-; SUNK-NEXT:    global_store_dwordx4 v[8:9], a[8:11], off offset:32
-; SUNK-NEXT:    global_store_dwordx4 v[8:9], a[4:7], off offset:16
-; SUNK-NEXT:    global_store_dwordx4 v[8:9], a[0:3], off
-; SUNK-NEXT:    v_mov_b32_e32 v8, v2
-; SUNK-NEXT:    s_branch .LBB2_1
-; SUNK-NEXT:  .LBB2_7: ; %exit
-; SUNK-NEXT:    s_or_b64 exec, exec, s[0:1]
-; SUNK-NEXT:    global_store_dwordx4 v[6:7], a[12:15], off offset:48
-; SUNK-NEXT:    global_store_dwordx4 v[6:7], a[8:11], off offset:32
-; SUNK-NEXT:    global_store_dwordx4 v[6:7], a[4:7], off offset:16
-; SUNK-NEXT:    global_store_dwordx4 v[6:7], a[0:3], off
-; SUNK-NEXT:    s_waitcnt vmcnt(0)
-; SUNK-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %1005 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %in0, <4 x i16> %in1, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
-  br label %loop.body
-
-loop.body:
-  %i = phi i32 [0, %entry], [%i.inc, %end]
-  store i32 222, ptr addrspace(1) %outptr
-  %cc = icmp sgt i32 %i, 5
-  br i1 %cc, label %if, label %else
-
-if:
-  %v.if = mul i32 %v, 2
-  %sptr.if =  getelementptr <4 x i16>, ptr addrspace(1) %outptr, i32 %v.if
-  store <16 x float> %1005, ptr addrspace(1) %sptr.if
-  br label %end
-
-else:
-  %v.else = mul i32 %v, 3
-  %sptr.else =  getelementptr <4 x i16>, ptr addrspace(1) %outptr, i32 %v.else
-  store <16 x float> %1005, ptr addrspace(1) %sptr.else
-  br label %end
-
-end:
-  %r = phi i32 [ %v.if, %if ], [ %v.else, %else ]
-  %cmp = icmp ne i32 %r, %val
-  %i.inc = add i32 %i, 1
-  br i1 %cmp, label %loop.body, label %exit
-
-exit:
-  store <16 x float> %1005, ptr addrspace(1) %outptr
-  ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-aggressive-latency.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-aggressive-latency.mir
deleted file mode 100644
index b5296a85b31352..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-aggressive-latency.mir
+++ /dev/null
@@ -1,107 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o -  %s | FileCheck -check-prefixes=GFX10-SUNK %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o -  %s | FileCheck -check-prefixes=GFX9-SUNK %s
-
----
-name: latency_cycle_sink
-tracksRegLiveness: true
-body: |
-  ; GFX10-SUNK-LABEL: name: latency_cycle_sink
-  ; GFX10-SUNK: bb.0:
-  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x80000000)
-  ; GFX10-SUNK-NEXT:   liveins: $vgpr4, $vgpr5
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX10-SUNK-NEXT:   [[V_PK_MUL_LO_U16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[DEF]], 8, [[DEF]], 0, 0, 0, 0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.1
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT: bb.1:
-  ; GFX10-SUNK-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit undef $scc
-  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.2
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT: bb.2:
-  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_]], 8, [[V_PK_MUL_LO_U16_]], 0, 0, 0, 0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT: bb.3:
-  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_]], 8, [[V_PK_MUL_LO_U16_]], 0, 0, 0, 0, 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT: bb.4:
-  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
-  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.5
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT: bb.5:
-  ; GFX10-SUNK-NEXT:   S_ENDPGM 0
-  ;
-  ; GFX9-SUNK-LABEL: name: latency_cycle_sink
-  ; GFX9-SUNK: bb.0:
-  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x80000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr4, $vgpr5
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.1:
-  ; GFX9-SUNK-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit undef $scc
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.2
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.2:
-  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[V_PK_MUL_LO_U16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[DEF]], 8, [[DEF]], 0, 0, 0, 0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_]], 8, [[V_PK_MUL_LO_U16_]], 0, 0, 0, 0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.3:
-  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[V_PK_MUL_LO_U16_2:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[DEF]], 8, [[DEF]], 0, 0, 0, 0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_2]], 8, [[V_PK_MUL_LO_U16_2]], 0, 0, 0, 0, 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.4:
-  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.5
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.5:
-  ; GFX9-SUNK-NEXT:   S_ENDPGM 0
-  bb.0:
-    successors: %bb.1(0x80000000)
-    liveins: $vgpr4, $vgpr5
-    %83:vgpr_32 = IMPLICIT_DEF
-    %80:vgpr_32 = V_PK_MUL_LO_U16 8, %83, 8, %83, 0, 0, 0, 0, 0, implicit $exec
-    S_BRANCH %bb.1
-
-  bb.1:
-    S_CBRANCH_SCC1 %bb.3, implicit undef $scc
-    S_BRANCH %bb.2
-
-
-  bb.2:
-    %90:vgpr_32 = V_PK_MUL_LO_U16 8, %80, 8, %80, 0, 0, 0, 0, 0, implicit $exec
-    S_BRANCH %bb.4
-
-  bb.3:
-    %90:vgpr_32 = V_PK_MUL_LO_U16 8, %80, 8, %80, 0, 0, 0, 0, 0, implicit $exec
-    S_BRANCH %bb.4
-
-  bb.4:
-    S_CBRANCH_SCC1 %bb.1, implicit undef $scc
-    S_BRANCH %bb.5
-
-  bb.5:
-    S_ENDPGM 0
-...
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-cycle.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-cycle.mir
new file mode 100644
index 00000000000000..bca1517ed183ac
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-cycle.mir
@@ -0,0 +1,1272 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 --sink-insts-to-avoid-spills=1 --stop-after=machine-sink -o -  %s | FileCheck -check-prefixes=GFX9-SUNK %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 --sink-insts-to-avoid-spills=1 -mattr=+wavefrontsize64 --stop-after=machine-sink -o -  %s | FileCheck -check-prefixes=GFX10-SUNK %s
+
+---
+name:            test_sink_copy
+alignment:       1
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX9-SUNK-LABEL: name: test_sink_copy
+  ; GFX9-SUNK: bb.0:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.1:
+  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.3
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.2:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[COPY:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+  ; GFX9-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+  ; GFX9-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+  ; GFX9-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+  ; GFX9-SUNK-NEXT:   [[COPY4:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY]], implicit [[COPY1]], implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY4]]
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.3:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[COPY5:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+  ; GFX9-SUNK-NEXT:   [[COPY6:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+  ; GFX9-SUNK-NEXT:   [[COPY7:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+  ; GFX9-SUNK-NEXT:   [[COPY8:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+  ; GFX9-SUNK-NEXT:   [[COPY9:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY5]], implicit [[COPY6]], implicit [[COPY7]], implicit [[COPY8]], implicit [[COPY9]]
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.4:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.5
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.5:
+  ; GFX9-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.6.entry:
+  ;
+  ; GFX10-SUNK-LABEL: name: test_sink_copy
+  ; GFX10-SUNK: bb.0:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.1:
+  ; GFX10-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.3
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.2:
+  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[COPY:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+  ; GFX10-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+  ; GFX10-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+  ; GFX10-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+  ; GFX10-SUNK-NEXT:   [[COPY4:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY]], implicit [[COPY1]], implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY4]]
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.3:
+  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[COPY5:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+  ; GFX10-SUNK-NEXT:   [[COPY6:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+  ; GFX10-SUNK-NEXT:   [[COPY7:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+  ; GFX10-SUNK-NEXT:   [[COPY8:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+  ; GFX10-SUNK-NEXT:   [[COPY9:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY5]], implicit [[COPY6]], implicit [[COPY7]], implicit [[COPY8]], implicit [[COPY9]]
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.4:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.5
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.5:
+  ; GFX10-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.6.entry:
+  bb.0:
+    %0:vreg_256 = IMPLICIT_DEF
+    %1:vreg_256 = IMPLICIT_DEF
+    %2:vreg_256 = IMPLICIT_DEF
+    %3:vreg_256 = IMPLICIT_DEF
+    %4:vreg_256 = IMPLICIT_DEF
+    %5:vreg_256 = COPY %4
+    %6:vreg_256 = COPY %4
+    %7:vreg_256 = COPY %4
+    %8:vreg_256 = COPY %4
+    %9:vreg_256 = COPY %4
+
+
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.2:
+    INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+    INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9
+    S_BRANCH %bb.4
+
+  bb.3:
+    INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+    INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+    S_BRANCH %bb.5
+
+  bb.5:
+    S_ENDPGM 0
+...
+
+# For gfx9, after sinking the copies, pressure is within the desired limit
+
+---
+name:            test_sink_multi_stage
+alignment:       1
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX9-SUNK-LABEL: name: test_sink_multi_stage
+  ; GFX9-SUNK: bb.0:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.1:
+  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.3
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.2:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[COPY:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+  ; GFX9-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+  ; GFX9-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+  ; GFX9-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+  ; GFX9-SUNK-NEXT:   [[COPY4:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY]], implicit [[COPY1]], implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY4]]
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]]
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.3:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[COPY5:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+  ; GFX9-SUNK-NEXT:   [[COPY6:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+  ; GFX9-SUNK-NEXT:   [[COPY7:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+  ; GFX9-SUNK-NEXT:   [[COPY8:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+  ; GFX9-SUNK-NEXT:   [[COPY9:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY5]], implicit [[COPY6]], implicit [[COPY7]], implicit [[COPY8]], implicit [[COPY9]]
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]]
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.4:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.5
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.5:
+  ; GFX9-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.6.entry:
+  ;
+  ; GFX10-SUNK-LABEL: name: test_sink_multi_stage
+  ; GFX10-SUNK: bb.0:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.1:
+  ; GFX10-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.3
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.2:
+  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[COPY:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+  ; GFX10-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+  ; GFX10-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+  ; GFX10-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+  ; GFX10-SUNK-NEXT:   [[COPY4:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY]], implicit [[COPY1]], implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY4]]
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]]
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.3:
+  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[COPY5:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+  ; GFX10-SUNK-NEXT:   [[COPY6:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+  ; GFX10-SUNK-NEXT:   [[COPY7:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+  ; GFX10-SUNK-NEXT:   [[COPY8:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+  ; GFX10-SUNK-NEXT:   [[COPY9:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY5]], implicit [[COPY6]], implicit [[COPY7]], implicit [[COPY8]], implicit [[COPY9]]
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]]
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.4:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.5
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.5:
+  ; GFX10-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.6.entry:
+  bb.0:
+    %0:vreg_256 = IMPLICIT_DEF
+    %1:vreg_256 = IMPLICIT_DEF
+    %2:vreg_256 = IMPLICIT_DEF
+    %3:vreg_256 = IMPLICIT_DEF
+    %4:vreg_256 = IMPLICIT_DEF
+    %5:vreg_256 = COPY %4
+    %6:vreg_256 = COPY %4
+    %7:vreg_256 = COPY %4
+    %8:vreg_256 = COPY %4
+    %9:vreg_256 = COPY %4
+    %10:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %11:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %12:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %13:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %14:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.2:
+    INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+    INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9
+    INLINEASM &"", 1, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14
+    S_BRANCH %bb.4
+
+  bb.3:
+    INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+    INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9
+    INLINEASM &"", 1, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+    S_BRANCH %bb.5
+
+  bb.5:
+    S_ENDPGM 0
+...
+
+---
+name:            test_sink_low_rp
+alignment:       1
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX9-SUNK-LABEL: name: test_sink_low_rp
+  ; GFX9-SUNK: bb.0:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.1:
+  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.3
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.2:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]]
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.3:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]]
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.4:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.5
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.5:
+  ; GFX9-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.6.entry:
+  ;
+  ; GFX10-SUNK-LABEL: name: test_sink_low_rp
+  ; GFX10-SUNK: bb.0:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.1:
+  ; GFX10-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.3
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.2:
+  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]]
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.3:
+  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]], implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_28]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_31]], implicit [[V_ADD_U32_e64_32]], implicit [[V_ADD_U32_e64_33]], implicit [[V_ADD_U32_e64_34]], implicit [[V_ADD_U32_e64_35]], implicit [[V_ADD_U32_e64_36]], implicit [[V_ADD_U32_e64_37]], implicit [[V_ADD_U32_e64_38]], implicit [[V_ADD_U32_e64_39]], implicit [[V_ADD_U32_e64_40]], implicit [[V_ADD_U32_e64_41]]
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.4:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.5
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.5:
+  ; GFX10-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.6.entry:
+  bb.0:
+    %0:vreg_256 = IMPLICIT_DEF
+    %1:vreg_256 = IMPLICIT_DEF
+    %2:vreg_256 = IMPLICIT_DEF
+    %3:vreg_256 = IMPLICIT_DEF
+    %4:vreg_256 = IMPLICIT_DEF
+    %5:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %6:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %7:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %8:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %9:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %10:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %11:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %12:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %13:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %14:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %15:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %16:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %17:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %18:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %19:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %20:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %21:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %22:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %23:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %24:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %25:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.2:
+    INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+    INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25
+    S_BRANCH %bb.4
+
+  bb.3:
+    INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+    INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+    S_BRANCH %bb.5
+
+  bb.5:
+    S_ENDPGM 0
+...
+
+---
+name:            test_sink_high_rp
+alignment:       1
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX9-SUNK-LABEL: name: test_sink_high_rp
+  ; GFX9-SUNK: bb.0:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.1:
+  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.3
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.2:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.3:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_42:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_43:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_44:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_45:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_46:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_47:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_48:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_49:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_50:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_51:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_52:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_53:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_28]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_31]], implicit [[V_ADD_U32_e64_32]], implicit [[V_ADD_U32_e64_33]], implicit [[V_ADD_U32_e64_34]], implicit [[V_ADD_U32_e64_35]], implicit [[V_ADD_U32_e64_36]], implicit [[V_ADD_U32_e64_37]], implicit [[V_ADD_U32_e64_38]], implicit [[V_ADD_U32_e64_39]], implicit [[V_ADD_U32_e64_40]], implicit [[V_ADD_U32_e64_41]], implicit [[V_ADD_U32_e64_42]], implicit [[V_ADD_U32_e64_43]], implicit [[V_ADD_U32_e64_44]], implicit [[V_ADD_U32_e64_45]], implicit [[V_ADD_U32_e64_46]], implicit [[V_ADD_U32_e64_47]], implicit [[V_ADD_U32_e64_48]], implicit [[V_ADD_U32_e64_49]], implicit [[V_ADD_U32_e64_50]], implicit [[V_ADD_U32_e64_51]], implicit [[V_ADD_U32_e64_52]], implicit [[V_ADD_U32_e64_53]]
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.4:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.5
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.5:
+  ; GFX9-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.6.entry:
+  ;
+  ; GFX10-SUNK-LABEL: name: test_sink_high_rp
+  ; GFX10-SUNK: bb.0:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.1:
+  ; GFX10-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.3
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.2:
+  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.3:
+  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_42:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_43:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_44:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_45:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_46:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_47:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_48:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_49:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_50:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_51:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_52:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_53:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_28]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_31]], implicit [[V_ADD_U32_e64_32]], implicit [[V_ADD_U32_e64_33]], implicit [[V_ADD_U32_e64_34]], implicit [[V_ADD_U32_e64_35]], implicit [[V_ADD_U32_e64_36]], implicit [[V_ADD_U32_e64_37]], implicit [[V_ADD_U32_e64_38]], implicit [[V_ADD_U32_e64_39]], implicit [[V_ADD_U32_e64_40]], implicit [[V_ADD_U32_e64_41]], implicit [[V_ADD_U32_e64_42]], implicit [[V_ADD_U32_e64_43]], implicit [[V_ADD_U32_e64_44]], implicit [[V_ADD_U32_e64_45]], implicit [[V_ADD_U32_e64_46]], implicit [[V_ADD_U32_e64_47]], implicit [[V_ADD_U32_e64_48]], implicit [[V_ADD_U32_e64_49]], implicit [[V_ADD_U32_e64_50]], implicit [[V_ADD_U32_e64_51]], implicit [[V_ADD_U32_e64_52]], implicit [[V_ADD_U32_e64_53]]
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.4:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.5
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.5:
+  ; GFX10-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.6.entry:
+  bb.0:
+    %0:vreg_256 = IMPLICIT_DEF
+    %1:vreg_256 = IMPLICIT_DEF
+    %2:vreg_256 = IMPLICIT_DEF
+    %3:vreg_256 = IMPLICIT_DEF
+    %4:vreg_256 = IMPLICIT_DEF
+    %5:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %6:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %7:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %8:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %9:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %10:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %11:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %12:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %13:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %14:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %15:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %16:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %17:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %18:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %19:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %20:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %21:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %22:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %23:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %24:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %25:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %26:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %27:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %28:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %29:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub3:vreg_256, 0, implicit $exec
+    %30:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub3:vreg_256, 0, implicit $exec
+    %31:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub3:vreg_256, 0, implicit $exec
+
+    S_BRANCH %bb.1
+
+  bb.1:
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.2:
+    INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+    INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, implicit %31
+    S_BRANCH %bb.4
+
+  bb.3:
+    INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+    INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, implicit %31
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+    S_BRANCH %bb.5
+
+  bb.5:
+    S_ENDPGM 0
+...
+
+# Convergent ops (here an MFMA) must not be sunk into the cycle: the V_MFMA is expected to stay in bb.0.
+
+---
+name:            test_sink_convergent
+alignment:       1
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX9-SUNK-LABEL: name: test_sink_convergent
+  ; GFX9-SUNK: bb.0:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF6:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 [[DEF5]], [[DEF7]], [[DEF6]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.1:
+  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF8]], implicit [[V_MFMA_F32_4X4X1F32_e64_]]
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.3
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.2:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.3:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_42:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_43:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_44:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_45:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_46:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_47:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_48:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_49:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_50:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_51:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_52:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_53:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_28]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_31]], implicit [[V_ADD_U32_e64_32]], implicit [[V_ADD_U32_e64_33]], implicit [[V_ADD_U32_e64_34]], implicit [[V_ADD_U32_e64_35]], implicit [[V_ADD_U32_e64_36]], implicit [[V_ADD_U32_e64_37]], implicit [[V_ADD_U32_e64_38]], implicit [[V_ADD_U32_e64_39]], implicit [[V_ADD_U32_e64_40]], implicit [[V_ADD_U32_e64_41]], implicit [[V_ADD_U32_e64_42]], implicit [[V_ADD_U32_e64_43]], implicit [[V_ADD_U32_e64_44]], implicit [[V_ADD_U32_e64_45]], implicit [[V_ADD_U32_e64_46]], implicit [[V_ADD_U32_e64_47]], implicit [[V_ADD_U32_e64_48]], implicit [[V_ADD_U32_e64_49]], implicit [[V_ADD_U32_e64_50]], implicit [[V_ADD_U32_e64_51]], implicit [[V_ADD_U32_e64_52]], implicit [[V_ADD_U32_e64_53]]
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.4:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.5
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.5:
+  ; GFX9-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.6.entry:
+  ;
+  ; GFX10-SUNK-LABEL: name: test_sink_convergent
+  ; GFX10-SUNK: bb.0:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF6:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 [[DEF5]], [[DEF7]], [[DEF6]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.1:
+  ; GFX10-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF8]], implicit [[V_MFMA_F32_4X4X1F32_e64_]]
+  ; GFX10-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.3
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.2:
+  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.3:
+  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_42:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_43:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_44:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_45:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_46:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_47:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_48:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_49:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_50:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_51:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_52:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_53:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_28]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_31]], implicit [[V_ADD_U32_e64_32]], implicit [[V_ADD_U32_e64_33]], implicit [[V_ADD_U32_e64_34]], implicit [[V_ADD_U32_e64_35]], implicit [[V_ADD_U32_e64_36]], implicit [[V_ADD_U32_e64_37]], implicit [[V_ADD_U32_e64_38]], implicit [[V_ADD_U32_e64_39]], implicit [[V_ADD_U32_e64_40]], implicit [[V_ADD_U32_e64_41]], implicit [[V_ADD_U32_e64_42]], implicit [[V_ADD_U32_e64_43]], implicit [[V_ADD_U32_e64_44]], implicit [[V_ADD_U32_e64_45]], implicit [[V_ADD_U32_e64_46]], implicit [[V_ADD_U32_e64_47]], implicit [[V_ADD_U32_e64_48]], implicit [[V_ADD_U32_e64_49]], implicit [[V_ADD_U32_e64_50]], implicit [[V_ADD_U32_e64_51]], implicit [[V_ADD_U32_e64_52]], implicit [[V_ADD_U32_e64_53]]
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.4:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.5
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.5:
+  ; GFX10-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.6.entry:
+  bb.0:
+    %0:vreg_256 = IMPLICIT_DEF
+    %1:vreg_256 = IMPLICIT_DEF
+    %2:vreg_256 = IMPLICIT_DEF
+    %3:vreg_256 = IMPLICIT_DEF
+    %4:vreg_256 = IMPLICIT_DEF
+    %5:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %6:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %7:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %8:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %9:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %10:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %11:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %12:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %13:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %14:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %15:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %16:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %17:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %18:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %19:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %20:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %21:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %22:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %23:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %24:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %25:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %26:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %27:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %28:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %29:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub3:vreg_256, 0, implicit $exec
+    %30:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub3:vreg_256, 0, implicit $exec
+    %31:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub3:vreg_256, 0, implicit $exec
+    %40:vgpr_32 = IMPLICIT_DEF
+    %41:areg_128_align2 = IMPLICIT_DEF
+    %42:vgpr_32 = IMPLICIT_DEF
+    %43:vgpr_32 = IMPLICIT_DEF
+    %44:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 %40, %42, %41, 0, 0, 0, implicit $mode, implicit $exec
+
+
+    S_BRANCH %bb.1
+
+  bb.1:
+    INLINEASM &"", 1, implicit %43, implicit %44
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.2:
+    INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+    INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, implicit %31
+    S_BRANCH %bb.4
+
+  bb.3:
+    INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+    INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, implicit %31
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+    S_BRANCH %bb.5
+
+  bb.5:
+    S_ENDPGM 0
+...
+
+# Do not sink instructions with multiple defs
+
+---
+name:            test_sink_multi_def
+alignment:       1
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX9-SUNK-LABEL: name: test_sink_multi_def
+  ; GFX9-SUNK: bb.0:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF]].sub2, [[DEF1]].sub4, 0, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.1:
+  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_CO_U32_e64_]], implicit [[V_ADD_CO_U32_e64_1]]
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.3
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.2:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.3:
+  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX9-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.4:
+  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.5
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.5:
+  ; GFX9-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX9-SUNK-NEXT: {{  $}}
+  ; GFX9-SUNK-NEXT: bb.6.entry:
+  ;
+  ; GFX10-SUNK-LABEL: name: test_sink_multi_def
+  ; GFX10-SUNK: bb.0:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF2:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF3:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[DEF4:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF]].sub2, [[DEF1]].sub4, 0, implicit $exec
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.1
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.1:
+  ; GFX10-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_CO_U32_e64_]], implicit [[V_ADD_CO_U32_e64_1]]
+  ; GFX10-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.3
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.2:
+  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.3:
+  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+  ; GFX10-SUNK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.4:
+  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.5
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.5:
+  ; GFX10-SUNK-NEXT:   S_ENDPGM 0
+  ; GFX10-SUNK-NEXT: {{  $}}
+  ; GFX10-SUNK-NEXT: bb.6.entry:
+  bb.0:
+    %0:vreg_256 = IMPLICIT_DEF
+    %1:vreg_256 = IMPLICIT_DEF
+    %2:vreg_256 = IMPLICIT_DEF
+    %3:vreg_256 = IMPLICIT_DEF
+    %4:vreg_256 = IMPLICIT_DEF
+    %5:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %6:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %7:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %8:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %9:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %10:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %11:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %12:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+    %13:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %14:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %15:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %16:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %17:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %18:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %19:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %20:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+    %21:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %22:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %23:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %24:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %25:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %26:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %27:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %28:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+    %29:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub3:vreg_256, 0, implicit $exec
+    %30:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub3:vreg_256, 0, implicit $exec
+    %31:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub3:vreg_256, 0, implicit $exec
+    %32:vgpr_32, %33:sreg_64_xexec = V_ADD_CO_U32_e64 %0.sub2:vreg_256, %1.sub4:vreg_256, 0, implicit $exec
+
+    S_BRANCH %bb.1
+
+  bb.1:
+    INLINEASM &"", 1, implicit %32, implicit %33
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.2:
+    INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+    INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, implicit %31
+    S_BRANCH %bb.4
+
+  bb.3:
+    INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+    INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, implicit %31
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+    S_BRANCH %bb.5
+
+  bb.5:
+    S_ENDPGM 0
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
index 259abae6d92c87..0fc31ea9d64379 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink -o - %s | FileCheck -check-prefixes=GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o - %s | FileCheck -check-prefixes=GFX9-SUNK %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink --sink-insts-to-avoid-spills=1 -o - %s | FileCheck -check-prefixes=GFX9 %s
 
 
 ---
@@ -50,47 +50,6 @@ body:             |
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT: bb.3:
   ; GFX9-NEXT:   S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
-  ;
-  ; GFX9-SUNK-LABEL: name: test_sink_fmac_to_only_use
-  ; GFX9-SUNK: bb.0:
-  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX9-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX9-SUNK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-  ; GFX9-SUNK-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-  ; GFX9-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
-  ; GFX9-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]]
-  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
-  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
-  ; GFX9-SUNK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-  ; GFX9-SUNK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-  ; GFX9-SUNK-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
-  ; GFX9-SUNK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.1:
-  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x80000000)
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.2:
-  ; GFX9-SUNK-NEXT:   successors: %bb.3(0x80000000)
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1
-  ; GFX9-SUNK-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1
-  ; GFX9-SUNK-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.3:
-  ; GFX9-SUNK-NEXT:   S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2
     %1:vgpr_32 = COPY $vgpr0
@@ -174,48 +133,6 @@ body:             |
   ; GFX9-NEXT: bb.3:
   ; GFX9-NEXT:   [[V_ADD_F32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_3]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
   ; GFX9-NEXT:   S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
-  ;
-  ; GFX9-SUNK-LABEL: name: test_no_sink_into_if_cond_multiple_uses
-  ; GFX9-SUNK: bb.0:
-  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX9-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX9-SUNK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-  ; GFX9-SUNK-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-  ; GFX9-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
-  ; GFX9-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]]
-  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
-  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
-  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-  ; GFX9-SUNK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-  ; GFX9-SUNK-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
-  ; GFX9-SUNK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.1:
-  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x80000000)
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.2:
-  ; GFX9-SUNK-NEXT:   successors: %bb.3(0x80000000)
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1
-  ; GFX9-SUNK-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1
-  ; GFX9-SUNK-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.3:
-  ; GFX9-SUNK-NEXT:   [[V_ADD_F32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_3]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2
     %1:vgpr_32 = COPY $vgpr0
@@ -300,48 +217,6 @@ body:             |
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT: bb.3:
   ; GFX9-NEXT:   S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
-  ;
-  ; GFX9-SUNK-LABEL: name: no_sink_fmac_not_constant_mode
-  ; GFX9-SUNK: bb.0:
-  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   $mode = IMPLICIT_DEF
-  ; GFX9-SUNK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX9-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX9-SUNK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-  ; GFX9-SUNK-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-  ; GFX9-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
-  ; GFX9-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]]
-  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
-  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
-  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-  ; GFX9-SUNK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-  ; GFX9-SUNK-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
-  ; GFX9-SUNK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.1:
-  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x80000000)
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.2:
-  ; GFX9-SUNK-NEXT:   successors: %bb.3(0x80000000)
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1
-  ; GFX9-SUNK-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1
-  ; GFX9-SUNK-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.3:
-  ; GFX9-SUNK-NEXT:   S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2
     $mode = IMPLICIT_DEF
@@ -414,36 +289,6 @@ body:             |
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT: bb.3:
   ; GFX9-NEXT:   S_ENDPGM 0, implicit %6
-  ;
-  ; GFX9-SUNK-LABEL: name: test_no_sink_fmac_wwm
-  ; GFX9-SUNK: bb.0:
-  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX9-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX9-SUNK-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-  ; GFX9-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
-  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
-  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   early-clobber %6:vgpr_32 = STRICT_WWM [[V_FMAC_F32_e64_]], implicit $exec
-  ; GFX9-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-  ; GFX9-SUNK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-  ; GFX9-SUNK-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY3]](s32), [[S_MOV_B32_]], implicit $exec
-  ; GFX9-SUNK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.1:
-  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x80000000)
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.2:
-  ; GFX9-SUNK-NEXT:   successors: %bb.3(0x80000000)
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   S_NOP 0, implicit [[V_FMAC_F32_e64_]]
-  ; GFX9-SUNK-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.3:
-  ; GFX9-SUNK-NEXT:   S_ENDPGM 0, implicit %6
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2
     %1:vgpr_32 = COPY $vgpr0
@@ -539,69 +384,6 @@ body:             |
   ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT:   S_ENDPGM 0
-  ;
-  ; GFX9-SUNK-LABEL: name: test_def_and_use_in_loop_sink_fmac
-  ; GFX9-SUNK: bb.0.entry:
-  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x80000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX9-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX9-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.1:
-  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-  ; GFX9-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
-  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
-  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
-  ; GFX9-SUNK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-  ; GFX9-SUNK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-  ; GFX9-SUNK-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
-  ; GFX9-SUNK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.2
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.2:
-  ; GFX9-SUNK-NEXT:   successors: %bb.3(0x80000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   S_NOP 0
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.3:
-  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x40000000), %bb.6(0x40000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]]
-  ; GFX9-SUNK-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.6, implicit $exec
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.4:
-  ; GFX9-SUNK-NEXT:   successors: %bb.5(0x04000000), %bb.4(0x7c000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   S_NOP 0
-  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.4, implicit $exec
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.5:
-  ; GFX9-SUNK-NEXT:   successors: %bb.6(0x80000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   S_NOP 0
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.6:
-  ; GFX9-SUNK-NEXT:   successors: %bb.7(0x04000000), %bb.1(0x7c000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   S_CBRANCH_VCCZ %bb.1, implicit $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.7:
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   S_ENDPGM 0
   bb.0.entry:
     successors: %bb.1(0x80000000)
 
@@ -732,69 +514,6 @@ body:             |
   ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT:   S_ENDPGM 0
-  ;
-  ; GFX9-SUNK-LABEL: name: test_no_sink_def_into_loop
-  ; GFX9-SUNK: bb.0.entry:
-  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x80000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX9-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX9-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-  ; GFX9-SUNK-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-  ; GFX9-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
-  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
-  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.1:
-  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]]
-  ; GFX9-SUNK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-  ; GFX9-SUNK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-  ; GFX9-SUNK-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
-  ; GFX9-SUNK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.2
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.2:
-  ; GFX9-SUNK-NEXT:   successors: %bb.3(0x80000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   S_NOP 0
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.3:
-  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x40000000), %bb.6(0x40000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.6, implicit $exec
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.4:
-  ; GFX9-SUNK-NEXT:   successors: %bb.5(0x04000000), %bb.4(0x7c000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   S_NOP 0
-  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.4, implicit $exec
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.5:
-  ; GFX9-SUNK-NEXT:   successors: %bb.6(0x80000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   S_NOP 0
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.6:
-  ; GFX9-SUNK-NEXT:   successors: %bb.7(0x04000000), %bb.1(0x7c000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   S_CBRANCH_VCCZ %bb.1, implicit $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.7:
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   S_ENDPGM 0
   bb.0.entry:
     successors: %bb.1(0x80000000)
 
@@ -939,83 +658,6 @@ body:             |
   ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT:   S_ENDPGM 0
-  ;
-  ; GFX9-SUNK-LABEL: name: test_no_sink_def_into_loop2
-  ; GFX9-SUNK: bb.0.entry:
-  ; GFX9-SUNK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX9-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX9-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-  ; GFX9-SUNK-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-  ; GFX9-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
-  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
-  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
-  ; GFX9-SUNK-NEXT:   [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.1
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.1:
-  ; GFX9-SUNK-NEXT:   successors: %bb.2(0x80000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   S_NOP 0
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.2
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.2:
-  ; GFX9-SUNK-NEXT:   successors: %bb.3(0x40000000), %bb.4(0x40000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]]
-  ; GFX9-SUNK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-  ; GFX9-SUNK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-  ; GFX9-SUNK-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
-  ; GFX9-SUNK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.3
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.3:
-  ; GFX9-SUNK-NEXT:   successors: %bb.4(0x80000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   S_NOP 0
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.4
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.4:
-  ; GFX9-SUNK-NEXT:   successors: %bb.5(0x40000000), %bb.7(0x40000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.7, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.5
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.5:
-  ; GFX9-SUNK-NEXT:   successors: %bb.6(0x04000000), %bb.5(0x7c000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   S_NOP 0
-  ; GFX9-SUNK-NEXT:   S_CBRANCH_EXECZ %bb.5, implicit $exec
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.6
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.6:
-  ; GFX9-SUNK-NEXT:   successors: %bb.7(0x80000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   S_NOP 0
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.7
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.7:
-  ; GFX9-SUNK-NEXT:   successors: %bb.8(0x04000000), %bb.2(0x7c000000)
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   S_CBRANCH_VCCZ %bb.2, implicit $vcc
-  ; GFX9-SUNK-NEXT:   S_BRANCH %bb.8
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT: bb.8:
-  ; GFX9-SUNK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
-  ; GFX9-SUNK-NEXT: {{  $}}
-  ; GFX9-SUNK-NEXT:   S_ENDPGM 0
   bb.0.entry:
     successors: %bb.1(0x40000000), %bb.2 (0x40000000)
 
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
index fafad600c47458..ef6771278b06f3 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink -o -  %s | FileCheck -check-prefixes=GFX10 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o -  %s | FileCheck -check-prefixes=GFX10-SUNK %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink --sink-insts-to-avoid-spills=1 -o -  %s | FileCheck -check-prefixes=GFX10 %s
 
 ---
 name: multi_else_break
@@ -74,75 +74,6 @@ body: |
   ; GFX10-NEXT:   [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI7]], [[PHI]], implicit-def dead $scc
   ; GFX10-NEXT:   SI_LOOP [[SI_IF_BREAK1]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX10-NEXT:   S_BRANCH %bb.3
-  ;
-  ; GFX10-SUNK-LABEL: name: multi_else_break
-  ; GFX10-SUNK: bb.0:
-  ; GFX10-SUNK-NEXT:   successors: %bb.1(0x80000000)
-  ; GFX10-SUNK-NEXT:   liveins: $vgpr4, $vgpr5
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-  ; GFX10-SUNK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; GFX10-SUNK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GFX10-SUNK-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], implicit $exec
-  ; GFX10-SUNK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; GFX10-SUNK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; GFX10-SUNK-NEXT:   [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT: bb.1:
-  ; GFX10-SUNK-NEXT:   successors: %bb.2(0x80000000)
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %9, %bb.6
-  ; GFX10-SUNK-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, %11, %bb.6
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT: bb.2:
-  ; GFX10-SUNK-NEXT:   successors: %bb.4(0x40000000), %bb.5(0x40000000)
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]], %bb.1, %13, %bb.5
-  ; GFX10-SUNK-NEXT:   [[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, %15, %bb.5
-  ; GFX10-SUNK-NEXT:   [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.1, %17, %bb.5
-  ; GFX10-SUNK-NEXT:   [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, %19, %bb.5
-  ; GFX10-SUNK-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[PHI5]], [[COPY1]], implicit $exec
-  ; GFX10-SUNK-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
-  ; GFX10-SUNK-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI3]], $exec_lo, implicit-def $scc
-  ; GFX10-SUNK-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI2]], $exec_lo, implicit-def $scc
-  ; GFX10-SUNK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_I32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.4
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT: bb.3:
-  ; GFX10-SUNK-NEXT:   SI_END_CF %9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX10-SUNK-NEXT:   S_ENDPGM 0
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT: bb.4:
-  ; GFX10-SUNK-NEXT:   successors: %bb.5(0x80000000)
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-  ; GFX10-SUNK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI5]], [[S_MOV_B32_1]], 0, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[COPY]], [[V_ADD_U32_e64_]], implicit $exec
-  ; GFX10-SUNK-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_]], $exec_lo, implicit-def $scc
-  ; GFX10-SUNK-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_ANDN2_B32_]]
-  ; GFX10-SUNK-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_1]], $exec_lo, implicit-def $scc
-  ; GFX10-SUNK-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc
-  ; GFX10-SUNK-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_1]], [[S_AND_B32_]], implicit-def $scc
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT: bb.5:
-  ; GFX10-SUNK-NEXT:   successors: %bb.6(0x04000000), %bb.2(0x7c000000)
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]], %bb.2, [[S_OR_B32_2]], %bb.4
-  ; GFX10-SUNK-NEXT:   [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]], %bb.2, [[COPY4]], %bb.4
-  ; GFX10-SUNK-NEXT:   [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.2, [[V_ADD_U32_e64_]], %bb.4
-  ; GFX10-SUNK-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI6]], [[PHI4]], implicit-def dead $scc
-  ; GFX10-SUNK-NEXT:   SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.6
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT: bb.6:
-  ; GFX10-SUNK-NEXT:   successors: %bb.3(0x04000000), %bb.1(0x7c000000)
-  ; GFX10-SUNK-NEXT: {{  $}}
-  ; GFX10-SUNK-NEXT:   [[PHI9:%[0-9]+]]:vgpr_32 = PHI [[PHI8]], %bb.5
-  ; GFX10-SUNK-NEXT:   SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX10-SUNK-NEXT:   [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI7]], [[PHI]], implicit-def dead $scc
-  ; GFX10-SUNK-NEXT:   SI_LOOP [[SI_IF_BREAK1]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX10-SUNK-NEXT:   S_BRANCH %bb.3
   bb.0:
     successors: %bb.1(0x80000000)
     liveins: $vgpr4, $vgpr5
diff --git a/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir b/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir
index 52c9d1067220ee..f23afe52f97de8 100644
--- a/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir
+++ b/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir
@@ -25,23 +25,24 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[LARL:%[0-9]+]]:addr64bit = LARL @b
+  ; CHECK-NEXT:   [[LA:%[0-9]+]]:gr64bit = LA killed [[LARL]], 49, $noreg
+  ; CHECK-NEXT:   [[LGHI:%[0-9]+]]:gr64bit = LGHI 7
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:gr64bit = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:gr64bit = IMPLICIT_DEF
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[LA:%[0-9]+]]:gr64bit = LA [[LARL]], 49, $noreg
-  ; CHECK-NEXT:   [[LGHI:%[0-9]+]]:gr64bit = LGHI 7
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gr64bit = COPY [[LA]]
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0
   ; CHECK-NEXT:   $r2d = COPY [[DEF]]
-  ; CHECK-NEXT:   $r3d = COPY [[LA]]
+  ; CHECK-NEXT:   $r3d = COPY [[COPY]]
   ; CHECK-NEXT:   $r4d = COPY [[LGHI]]
   ; CHECK-NEXT:   CallBRASL &memcpy, $r2d, $r3d, $r4d, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $r2d
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0
   ; CHECK-NEXT:   $r2d = COPY [[DEF1]]
-  ; CHECK-NEXT:   $r3d = COPY [[LA]]
+  ; CHECK-NEXT:   $r3d = COPY [[COPY]]
   ; CHECK-NEXT:   $r4d = COPY [[LGHI]]
   ; CHECK-NEXT:   CallBRASL &memcpy, $r2d, $r3d, $r4d, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $r2d
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0
@@ -54,19 +55,20 @@ body:             |
     %2:gr64bit = LGHI 7
     %3:gr64bit = IMPLICIT_DEF
     %5:gr64bit = IMPLICIT_DEF
+    %6:gr64bit = COPY killed %0
 
   bb.1:
     successors: %bb.1(0x80000000)
 
     ADJCALLSTACKDOWN 0, 0
     $r2d = COPY %3
-    $r3d = COPY %0
+    $r3d = COPY %6
     $r4d = COPY %2
     CallBRASL &memcpy, $r2d, $r3d, $r4d, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $r2d
     ADJCALLSTACKUP 0, 0
     ADJCALLSTACKDOWN 0, 0
     $r2d = COPY %5
-    $r3d = COPY %0
+    $r3d = COPY %6
     $r4d = COPY %2
     CallBRASL &memcpy, $r2d, $r3d, $r4d, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $r2d
     ADJCALLSTACKUP 0, 0

>From 4255e2a2265d1496c5f2deb1c8c08ba8b6a30ad9 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 6 Dec 2024 12:50:16 -0800
Subject: [PATCH 7/8] Review comments

Change-Id: I3738cc0f14d7ab2db35109f3e02a2f7e4fa9f2e1
---
 llvm/lib/CodeGen/MachineSink.cpp | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 3f2e790b059041..2712d3324ebe75 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -788,14 +788,15 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
   if (SinkInstsIntoCycle) {
     SmallVector<MachineCycle *, 8> Cycles(CI->toplevel_cycles());
     SchedModel.init(STI);
-    enum CycleSinkStage { COPY, LOW_LATENCY, AGGRESSIVE, END };
-
-    CycleSinkStage Stage = CycleSinkStage::COPY;
     bool HasHighPressure;
-    do {
+    DenseMap<std::pair<MachineInstr *, MachineBasicBlock *>, MachineInstr *>
+        SunkInstrs;
+
+    enum CycleSinkStage { COPY, LOW_LATENCY, AGGRESSIVE, END };
+    for (unsigned Stage = CycleSinkStage::COPY; Stage != CycleSinkStage::END;
+         ++Stage) {
       HasHighPressure = false;
-      DenseMap<std::pair<MachineInstr *, MachineBasicBlock *>, MachineInstr *>
-          SunkInstrs;
+      SunkInstrs.clear();
       for (auto *Cycle : Cycles) {
         MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
         if (!Preheader) {
@@ -816,7 +817,7 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
             if (i++ == SinkIntoCycleLimit) {
               LLVM_DEBUG(dbgs()
                          << "CycleSink:   Limit reached of instructions to "
-                            "be analysed.");
+                            "be analyzed.");
               break;
             }
 
@@ -840,8 +841,9 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
         if (!HasHighPressure)
           HasHighPressure = registerPressureExceedsLimit(*Preheader);
       }
-      Stage = (CycleSinkStage)(Stage + 1);
-    } while (HasHighPressure && Stage < CycleSinkStage::END);
+      if (!HasHighPressure)
+        break;
+    }
   }
 
   HasStoreCache.clear();
@@ -1726,12 +1728,14 @@ bool MachineSinking::aggressivelySinkIntoCycle(
     MachineInstr *NewMI = nullptr;
     std::pair<MachineInstr *, MachineBasicBlock *> MapEntry(&I, SinkBlock);
 
+    auto SI = SunkInstrs.find(MapEntry);
+
     // Check for the case in which we have already sunk a copy of this
     // instruction into the user block.
-    if (SunkInstrs.contains(MapEntry)) {
+    if (SI != SunkInstrs.end()) {
       LLVM_DEBUG(dbgs() << "AggressiveCycleSink:   Already sunk to block: "
                         << printMBBReference(*SinkBlock) << "\n");
-      NewMI = SunkInstrs[MapEntry];
+      NewMI = SI->second;
     }
 
     // Create a copy of the instruction in the use block.
@@ -1748,12 +1752,12 @@ bool MachineSinking::aggressivelySinkIntoCycle(
       }
       SinkBlock->insert(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()),
                         NewMI);
-      SunkInstrs[MapEntry] = NewMI;
+      SunkInstrs.insert({MapEntry, NewMI});
     }
 
     // Conservatively clear any kill flags on uses of sunk instruction
-    for (MachineOperand &MO : NewMI->operands()) {
-      if (MO.isReg() && MO.readsReg())
+    for (MachineOperand &MO : NewMI->all_uses()) {
+      if (MO.isReg())
         RegsToClearKillFlags.insert(MO.getReg());
     }
 

>From 672150ecd6c92244a7556a91ad8efa284e165624 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 9 Dec 2024 08:34:23 -0800
Subject: [PATCH 8/8] Review comments

Change-Id: I4d70eed99499df33f4bde04be05e88ea0c2de877
---
 llvm/lib/CodeGen/MachineSink.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 2712d3324ebe75..977ba940750c94 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -794,9 +794,9 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
 
     enum CycleSinkStage { COPY, LOW_LATENCY, AGGRESSIVE, END };
     for (unsigned Stage = CycleSinkStage::COPY; Stage != CycleSinkStage::END;
-         ++Stage) {
+         ++Stage, SunkInstrs.clear()) {
       HasHighPressure = false;
-      SunkInstrs.clear();
+
       for (auto *Cycle : Cycles) {
         MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
         if (!Preheader) {
@@ -1757,8 +1757,8 @@ bool MachineSinking::aggressivelySinkIntoCycle(
 
     // Conservatively clear any kill flags on uses of sunk instruction
     for (MachineOperand &MO : NewMI->all_uses()) {
-      if (MO.isReg())
-        RegsToClearKillFlags.insert(MO.getReg());
+      assert(MO.isReg() && MO.isUse());
+      RegsToClearKillFlags.insert(MO.getReg());
     }
 
     // The instruction is moved from its basic block, so do not retain the



More information about the llvm-commits mailing list