[llvm] [MachineSink] Add capability for aggressive loop sinking (PR #117247)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 9 08:40:45 PST 2024
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/117247
>From b32aa2510a9724fcb815d3186dab1be469acc225 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 20 Nov 2024 14:24:09 -0800
Subject: [PATCH 1/8] [MachineSink] Add option for aggressive loop sinking
Change-Id: I62a6c6fc2c372523ce9ec98d084a434548609ead
---
llvm/lib/CodeGen/MachineSink.cpp | 184 +++++++++
.../aggressive-loop-sink-nonstandard.ll | 20 +
.../machine-sink-ignorable-exec-use.mir | 360 ++++++++++++++++++
.../CodeGen/AMDGPU/machine-sink-lane-mask.mir | 208 ++++++----
4 files changed, 703 insertions(+), 69 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index c470bd71dfb29f..d8dd6e8478686d 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -100,6 +100,12 @@ static cl::opt<bool>
"register spills"),
cl::init(false), cl::Hidden);
+static cl::opt<bool> AggressivelySinkInstsIntoCycle(
+ "aggressively-sink-insts-to-avoid-spills",
+ cl::desc("Aggressively sink instructions into cycles to avoid "
+ "register spills"),
+ cl::init(false), cl::Hidden);
+
static cl::opt<unsigned> SinkIntoCycleLimit(
"machine-sink-cycle-limit",
cl::desc(
@@ -256,6 +262,13 @@ class MachineSinking : public MachineFunctionPass {
SmallVectorImpl<MachineInstr *> &Candidates);
bool SinkIntoCycle(MachineCycle *Cycle, MachineInstr &I);
+ bool isDead(const MachineInstr *MI) const;
+ bool AggressivelySinkIntoCycle(
+ MachineCycle *Cycle, MachineInstr &I,
+ DenseMap<MachineInstr *,
+ std::list<std::pair<MachineBasicBlock *, MachineInstr *>>>
+ SunkInstrs);
+
bool isProfitableToSinkTo(Register Reg, MachineInstr &MI,
MachineBasicBlock *MBB,
MachineBasicBlock *SuccToSinkTo,
@@ -679,6 +692,10 @@ void MachineSinking::FindCycleSinkCandidates(
SmallVectorImpl<MachineInstr *> &Candidates) {
for (auto &MI : *BB) {
LLVM_DEBUG(dbgs() << "CycleSink: Analysing candidate: " << MI);
+ if (MI.isDebugInstr()) {
+ LLVM_DEBUG(dbgs() << "CycleSink: Dont sink debug instructions\n");
+ continue;
+ }
if (!TII->shouldSink(MI)) {
LLVM_DEBUG(dbgs() << "CycleSink: Instruction not a candidate for this "
"target\n");
@@ -799,6 +816,30 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
}
}
+ if (AggressivelySinkInstsIntoCycle) {
+ SmallVector<MachineCycle *, 8> Cycles(CI->toplevel_cycles());
+ DenseMap<MachineInstr *,
+ std::list<std::pair<MachineBasicBlock *, MachineInstr *>>>
+ SunkInstrs;
+ for (auto *Cycle : Cycles) {
+ MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
+ if (!Preheader) {
+ LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Can't find preheader\n");
+ continue;
+ }
+ SmallVector<MachineInstr *, 8> Candidates;
+ FindCycleSinkCandidates(Cycle, Preheader, Candidates);
+
+ // Walk the candidates in reverse order so that we start with the use
+ // of a def-use chain, if there is any.
+ for (MachineInstr *I : llvm::reverse(Candidates)) {
+ AggressivelySinkIntoCycle(Cycle, *I, SunkInstrs);
+ EverMadeChange = true;
+ ++NumCycleSunk;
+ }
+ }
+ }
+
HasStoreCache.clear();
StoreInstrCache.clear();
@@ -1574,6 +1615,149 @@ bool MachineSinking::hasStoreBetween(MachineBasicBlock *From,
return HasAliasedStore;
}
+/// Copied from DeadMachineInstructionElimImpl. TODO: factor the duplicated
+/// logic into a shared helper instead of copy-pasting it here.
+
+// Returns true if \p MI has no side effects and every register it defines is
+// either dead or used only by \p MI itself, i.e. the instruction can be
+// erased once its last real use has been rewritten.
+bool MachineSinking::isDead(const MachineInstr *MI) const {
+  // Instructions without side-effects are dead iff they only define dead regs.
+  // This function is hot and this loop returns early in the common case,
+  // so only perform additional checks before this if absolutely necessary.
+  for (const MachineOperand &MO : MI->all_defs()) {
+    Register Reg = MO.getReg();
+    // Conservatively treat any physical-register def as live.
+    if (Reg.isPhysical()) {
+      return false;
+    } else {
+      if (MO.isDead()) {
+#ifndef NDEBUG
+        // Basic check on the register. All of them should be 'undef'.
+        for (auto &U : MRI->use_nodbg_operands(Reg))
+          assert(U.isUndef() && "'Undef' use on a 'dead' register is found!");
+#endif
+        continue;
+      }
+      // A def that is not marked dead: scan its non-debug uses. A use by MI
+      // itself (e.g. a tied operand) does not keep the instruction alive.
+      for (const MachineInstr &Use : MRI->use_nodbg_instructions(Reg)) {
+        if (&Use != MI)
+          // This def has a non-debug use. Don't delete the instruction!
+          return false;
+      }
+    }
+  }
+
+  // Technically speaking inline asm without side effects and no defs can still
+  // be deleted. But there is so much bad inline asm code out there, we should
+  // let them be.
+  if (MI->isInlineAsm())
+    return false;
+
+  // FIXME: See issue #105950 for why LIFETIME markers are considered dead here.
+  if (MI->isLifetimeMarker())
+    return true;
+
+  // If there are no defs with uses, the instruction might be dead.
+  return MI->wouldBeTriviallyDead();
+}
+
+/// Aggressively sink instructions into cycles. This will aggressively try to
+/// sink all instructions in the top-most preheaders in an attempt to reduce RP.
+/// In particular, it will sink into multiple successor blocks without limits
+/// based on the amount of sinking, or the type of ops being sunk (so long as
+/// they are safe to sink).
+///
+/// \p SunkInstrs caches, per candidate instruction, the blocks it has already
+/// been sunk into together with the clone created there, so multiple uses in
+/// the same block share a single sunk copy.
+///
+/// Returns false only for unsupported candidates (multiple defs); otherwise
+/// returns true, whether or not any use was actually rewritten.
+bool MachineSinking::AggressivelySinkIntoCycle(
+    MachineCycle *Cycle, MachineInstr &I,
+    DenseMap<MachineInstr *,
+             std::list<std::pair<MachineBasicBlock *, MachineInstr *>>>
+        SunkInstrs) {
+  // FIXME(review): SunkInstrs is taken by value, so the cache of already-sunk
+  // clones never propagates back to the caller across candidates, and the
+  // whole map is copied on every call. This should be a reference (requires a
+  // matching change to the declaration).
+  LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Finding sink block for: " << I);
+  MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
+  assert(Preheader && "Cycle sink needs a preheader block");
+  SmallVector<std::pair<MachineOperand, MachineInstr *>> Uses;
+  // TODO: support instructions with multiple defs
+  if (I.getNumDefs() > 1)
+    return false;
+
+  // Snapshot the uses first: the loop below rewrites them via
+  // substituteRegister, which would invalidate a live use-list iteration.
+  MachineOperand DefMO = I.getOperand(0);
+  for (MachineInstr &MI : MRI->use_instructions(DefMO.getReg()))
+    Uses.push_back({DefMO, &MI});
+
+  for (const std::pair<MachineOperand, MachineInstr *> &Entry : Uses) {
+    MachineInstr *MI = Entry.second;
+    LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Analysing use: " << MI);
+    if (MI->isPHI()) {
+      LLVM_DEBUG(
+          dbgs() << "AggressiveCycleSink: Not attempting to sink for PHI.\n");
+      continue;
+    }
+    // We cannot sink before the prologue
+    if (TII->isBasicBlockPrologue(*MI) || MI->isPosition()) {
+      LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Use is BasicBlock prologue, "
+                           "can't sink.\n");
+      continue;
+    }
+    if (!Cycle->contains(MI->getParent())) {
+      LLVM_DEBUG(
+          dbgs() << "AggressiveCycleSink: Use not in cycle, can't sink.\n");
+      continue;
+    }
+
+    MachineBasicBlock *SinkBlock = MI->getParent();
+    MachineInstr *NewMI = nullptr;
+
+    // Check for the case in which we have already sunk a copy of this
+    // instruction into the user block. Single map lookup, and bind the
+    // per-instruction list by reference to avoid copying it.
+    auto SunkIt = SunkInstrs.find(&I);
+    if (SunkIt != SunkInstrs.end()) {
+      auto &SunkBlocks = SunkIt->second;
+      auto Match = llvm::find_if(
+          SunkBlocks,
+          [SinkBlock](const std::pair<MachineBasicBlock *, MachineInstr *>
+                          &SunkEntry) { return SunkEntry.first == SinkBlock; });
+      if (Match != SunkBlocks.end()) {
+        LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Already sunk to block: "
+                          << printMBBReference(*SinkBlock) << "\n");
+        NewMI = Match->second;
+      }
+    }
+
+    // Create a copy of the instruction in the use block.
+    if (!NewMI) {
+      LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Sinking instruction to block: "
+                        << printMBBReference(*SinkBlock) << "\n");
+
+      NewMI = I.getMF()->CloneMachineInstr(&I);
+      if (DefMO.getReg().isVirtual()) {
+        // Give the clone its own vreg so each sunk copy has a distinct def.
+        const TargetRegisterClass *TRC = MRI->getRegClass(DefMO.getReg());
+        Register DestReg = MRI->createVirtualRegister(TRC);
+        NewMI->substituteRegister(DefMO.getReg(), DestReg, DefMO.getSubReg(),
+                                  *TRI);
+      }
+      SinkBlock->insert(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()),
+                        NewMI);
+      SunkInstrs[&I].push_back({SinkBlock, NewMI});
+    }
+
+    // Conservatively clear any kill flags on uses of sunk instruction
+    for (MachineOperand &MO : NewMI->operands()) {
+      if (MO.isReg() && MO.readsReg())
+        RegsToClearKillFlags.insert(MO.getReg());
+    }
+
+    // The instruction is moved from its basic block, so do not retain the
+    // debug information.
+    assert(!NewMI->isDebugInstr() && "Should not sink debug inst");
+    NewMI->setDebugLoc(DebugLoc());
+
+    // Replace the use with the newly created virtual register.
+    MachineOperand UseMO = Entry.first;
+    MI->substituteRegister(UseMO.getReg(), NewMI->getOperand(0).getReg(),
+                           UseMO.getSubReg(), *TRI);
+  }
+  // If we have replaced all uses, then delete the dead instruction
+  if (isDead(&I))
+    I.eraseFromParent();
+  return true;
+}
+
/// Sink instructions into cycles if profitable. This especially tries to
/// prevent register spills caused by register pressure if there is little to no
/// overhead moving instructions into cycles.
diff --git a/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll b/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
new file mode 100644
index 00000000000000..72b4495297a1c5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 --aggressively-sink-insts-to-avoid-spills=1 < %s | FileCheck -check-prefix=SUNK %s
+
+; Check that various edge cases do not crash the compiler
+
+; Multiple uses of sunk valu, chain of sink candidates
+
+define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) {
+; SUNK-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst
+ ret half %result
+}
+
+; Sink candidates with multiple defs
+
+define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) {
+; SUNK-LABEL: memmove_p5_p5:
+entry:
+ tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
index efa21052e3ae2f..f93d8f3dde21b6 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
@@ -1,5 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink -o - %s | FileCheck -check-prefixes=GFX9 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink --aggressively-sink-insts-to-avoid-spills=1 -o - %s | FileCheck -check-prefixes=GFX9-SUNK %s
+
---
name: test_sink_fmac_to_only_use
@@ -48,6 +50,47 @@ body: |
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.3:
; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
+ ;
+ ; GFX9-SUNK-LABEL: name: test_sink_fmac_to_only_use
+ ; GFX9-SUNK: bb.0:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-SUNK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-SUNK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+ ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]]
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX9-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+ ; GFX9-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.3(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1
+ ; GFX9-SUNK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1
+ ; GFX9-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
%1:vgpr_32 = COPY $vgpr0
@@ -131,6 +174,48 @@ body: |
; GFX9-NEXT: bb.3:
; GFX9-NEXT: [[V_ADD_F32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_3]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
+ ;
+ ; GFX9-SUNK-LABEL: name: test_no_sink_into_if_cond_multiple_uses
+ ; GFX9-SUNK: bb.0:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-SUNK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-SUNK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+ ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]]
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX9-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+ ; GFX9-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.3(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1
+ ; GFX9-SUNK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1
+ ; GFX9-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: [[V_ADD_F32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_3]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
%1:vgpr_32 = COPY $vgpr0
@@ -215,6 +300,48 @@ body: |
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.3:
; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
+ ;
+ ; GFX9-SUNK-LABEL: name: no_sink_fmac_not_constant_mode
+ ; GFX9-SUNK: bb.0:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: $mode = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-SUNK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-SUNK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+ ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]]
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX9-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+ ; GFX9-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.3(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1
+ ; GFX9-SUNK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1
+ ; GFX9-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
$mode = IMPLICIT_DEF
@@ -287,6 +414,36 @@ body: |
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.3:
; GFX9-NEXT: S_ENDPGM 0, implicit %6
+ ;
+ ; GFX9-SUNK-LABEL: name: test_no_sink_fmac_wwm
+ ; GFX9-SUNK: bb.0:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-SUNK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: early-clobber %6:vgpr_32 = STRICT_WWM [[V_FMAC_F32_e64_]], implicit $exec
+ ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX9-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY3]](s32), [[S_MOV_B32_]], implicit $exec
+ ; GFX9-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.3(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]]
+ ; GFX9-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0, implicit %6
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
%1:vgpr_32 = COPY $vgpr0
@@ -382,6 +539,69 @@ body: |
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX9-SUNK-LABEL: name: test_def_and_use_in_loop_sink_fmac
+ ; GFX9-SUNK: bb.0.entry:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX9-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+ ; GFX9-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.2
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.3(0x80000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]]
+ ; GFX9-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.4:
+ ; GFX9-SUNK-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0
+ ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.5:
+ ; GFX9-SUNK-NEXT: successors: %bb.6(0x80000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.6:
+ ; GFX9-SUNK-NEXT: successors: %bb.7(0x04000000), %bb.1(0x7c000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.7:
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0
bb.0.entry:
successors: %bb.1(0x80000000)
@@ -512,6 +732,69 @@ body: |
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX9-SUNK-LABEL: name: test_no_sink_def_into_loop
+ ; GFX9-SUNK: bb.0.entry:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+ ; GFX9-SUNK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]]
+ ; GFX9-SUNK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX9-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+ ; GFX9-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.2
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.3(0x80000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.4:
+ ; GFX9-SUNK-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0
+ ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.5:
+ ; GFX9-SUNK-NEXT: successors: %bb.6(0x80000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.6:
+ ; GFX9-SUNK-NEXT: successors: %bb.7(0x04000000), %bb.1(0x7c000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.7:
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0
bb.0.entry:
successors: %bb.1(0x80000000)
@@ -656,6 +939,83 @@ body: |
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX9-SUNK-LABEL: name: test_no_sink_def_into_loop2
+ ; GFX9-SUNK: bb.0.entry:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+ ; GFX9-SUNK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x80000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.2
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]]
+ ; GFX9-SUNK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX9-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+ ; GFX9-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.3
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.4:
+ ; GFX9-SUNK-NEXT: successors: %bb.5(0x40000000), %bb.7(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.5
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.5:
+ ; GFX9-SUNK-NEXT: successors: %bb.6(0x04000000), %bb.5(0x7c000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0
+ ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.6
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.6:
+ ; GFX9-SUNK-NEXT: successors: %bb.7(0x80000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.7
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.7:
+ ; GFX9-SUNK-NEXT: successors: %bb.8(0x04000000), %bb.2(0x7c000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_CBRANCH_VCCZ %bb.2, implicit $vcc
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.8
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.8:
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0
bb.0.entry:
successors: %bb.1(0x40000000), %bb.2 (0x40000000)
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
index 04c80582f6f079..2a14b85cf2bd56 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
@@ -1,78 +1,148 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink -o - %s | FileCheck -check-prefixes=GFX10 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink -aggressively-sink-insts-to-avoid-spills=1 -o - %s | FileCheck -check-prefixes=GFX10-SUNK %s
---
name: multi_else_break
tracksRegLiveness: true
body: |
- ; CHECK-LABEL: name: multi_else_break
- ; CHECK: bb.0:
- ; CHECK-NEXT: successors: %bb.1(0x80000000)
- ; CHECK-NEXT: liveins: $vgpr4, $vgpr5
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], implicit $exec
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: successors: %bb.2(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %9, %bb.6
- ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, %11, %bb.6
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]], %bb.1, %13, %bb.5
- ; CHECK-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, %15, %bb.5
- ; CHECK-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.1, %17, %bb.5
- ; CHECK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, %19, %bb.5
- ; CHECK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[PHI5]], [[COPY1]], implicit $exec
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
- ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI3]], $exec_lo, implicit-def $scc
- ; CHECK-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI2]], $exec_lo, implicit-def $scc
- ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_I32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: S_BRANCH %bb.4
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.3:
- ; CHECK-NEXT: SI_END_CF %9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: S_ENDPGM 0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.4:
- ; CHECK-NEXT: successors: %bb.5(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI5]], [[S_MOV_B32_1]], 0, implicit $exec
- ; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[COPY]], [[V_ADD_U32_e64_]], implicit $exec
- ; CHECK-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_]], $exec_lo, implicit-def $scc
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_ANDN2_B32_]]
- ; CHECK-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_1]], $exec_lo, implicit-def $scc
- ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc
- ; CHECK-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_1]], [[S_AND_B32_]], implicit-def $scc
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.5:
- ; CHECK-NEXT: successors: %bb.6(0x04000000), %bb.2(0x7c000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]], %bb.2, [[S_OR_B32_2]], %bb.4
- ; CHECK-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]], %bb.2, [[COPY4]], %bb.4
- ; CHECK-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.2, [[V_ADD_U32_e64_]], %bb.4
- ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI6]], [[PHI4]], implicit-def dead $scc
- ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: S_BRANCH %bb.6
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.6:
- ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.1(0x7c000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI9:%[0-9]+]]:vgpr_32 = PHI [[PHI8]], %bb.5
- ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI7]], [[PHI]], implicit-def dead $scc
- ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK1]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: S_BRANCH %bb.3
+ ; GFX10-LABEL: name: multi_else_break
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-NEXT: liveins: $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], implicit $exec
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %9, %bb.6
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, %11, %bb.6
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]], %bb.1, %13, %bb.5
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, %15, %bb.5
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.1, %17, %bb.5
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, %19, %bb.5
+ ; GFX10-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[PHI5]], [[COPY1]], implicit $exec
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
+ ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI3]], $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI2]], $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_I32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-NEXT: S_BRANCH %bb.4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.3:
+ ; GFX10-NEXT: SI_END_CF %9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.4:
+ ; GFX10-NEXT: successors: %bb.5(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI5]], [[S_MOV_B32_1]], 0, implicit $exec
+ ; GFX10-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[COPY]], [[V_ADD_U32_e64_]], implicit $exec
+ ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_]], $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_ANDN2_B32_]]
+ ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_1]], $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_1]], [[S_AND_B32_]], implicit-def $scc
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.5:
+ ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.2(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]], %bb.2, [[S_OR_B32_2]], %bb.4
+ ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]], %bb.2, [[COPY4]], %bb.4
+ ; GFX10-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.2, [[V_ADD_U32_e64_]], %bb.4
+ ; GFX10-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI6]], [[PHI4]], implicit-def dead $scc
+ ; GFX10-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-NEXT: S_BRANCH %bb.6
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.6:
+ ; GFX10-NEXT: successors: %bb.3(0x04000000), %bb.1(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI9:%[0-9]+]]:vgpr_32 = PHI [[PHI8]], %bb.5
+ ; GFX10-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-NEXT: [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI7]], [[PHI]], implicit-def dead $scc
+ ; GFX10-NEXT: SI_LOOP [[SI_IF_BREAK1]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-NEXT: S_BRANCH %bb.3
+ ;
+ ; GFX10-SUNK-LABEL: name: multi_else_break
+ ; GFX10-SUNK: bb.0:
+ ; GFX10-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-SUNK-NEXT: liveins: $vgpr4, $vgpr5
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX10-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX10-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX10-SUNK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], implicit $exec
+ ; GFX10-SUNK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.1:
+ ; GFX10-SUNK-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %9, %bb.6
+ ; GFX10-SUNK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, %11, %bb.6
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.2:
+ ; GFX10-SUNK-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]], %bb.1, %13, %bb.5
+ ; GFX10-SUNK-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, %15, %bb.5
+ ; GFX10-SUNK-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.1, %17, %bb.5
+ ; GFX10-SUNK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, %19, %bb.5
+ ; GFX10-SUNK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[PHI5]], [[COPY1]], implicit $exec
+ ; GFX10-SUNK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
+ ; GFX10-SUNK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI3]], $exec_lo, implicit-def $scc
+ ; GFX10-SUNK-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI2]], $exec_lo, implicit-def $scc
+ ; GFX10-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_I32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.3:
+ ; GFX10-SUNK-NEXT: SI_END_CF %9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-SUNK-NEXT: S_ENDPGM 0
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.4:
+ ; GFX10-SUNK-NEXT: successors: %bb.5(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI5]], [[S_MOV_B32_1]], 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[COPY]], [[V_ADD_U32_e64_]], implicit $exec
+ ; GFX10-SUNK-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_]], $exec_lo, implicit-def $scc
+ ; GFX10-SUNK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_ANDN2_B32_]]
+ ; GFX10-SUNK-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_1]], $exec_lo, implicit-def $scc
+ ; GFX10-SUNK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc
+ ; GFX10-SUNK-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_1]], [[S_AND_B32_]], implicit-def $scc
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.5:
+ ; GFX10-SUNK-NEXT: successors: %bb.6(0x04000000), %bb.2(0x7c000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]], %bb.2, [[S_OR_B32_2]], %bb.4
+ ; GFX10-SUNK-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]], %bb.2, [[COPY4]], %bb.4
+ ; GFX10-SUNK-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.2, [[V_ADD_U32_e64_]], %bb.4
+ ; GFX10-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-SUNK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI6]], [[PHI4]], implicit-def dead $scc
+ ; GFX10-SUNK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.6
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.6:
+ ; GFX10-SUNK-NEXT: successors: %bb.3(0x04000000), %bb.1(0x7c000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[PHI9:%[0-9]+]]:vgpr_32 = PHI [[PHI8]], %bb.5
+ ; GFX10-SUNK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-SUNK-NEXT: [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI7]], [[PHI]], implicit-def dead $scc
+ ; GFX10-SUNK-NEXT: SI_LOOP [[SI_IF_BREAK1]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.3
bb.0:
successors: %bb.1(0x80000000)
liveins: $vgpr4, $vgpr5
>From 0813b87be351315d2a23c58e6f95c5be1eba5d45 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 22 Nov 2024 13:22:26 -0800
Subject: [PATCH 2/8] Address review comments
Change-Id: I975fab6cf7dba21788fb5677a5484916ef29d959
---
llvm/lib/CodeGen/MachineSink.cpp | 122 ++++-----
.../aggressive-loop-sink-nonstandard.ll | 245 +++++++++++++++++-
.../machine-sink-ignorable-exec-use.mir | 2 +-
.../CodeGen/AMDGPU/machine-sink-lane-mask.mir | 4 +-
4 files changed, 297 insertions(+), 76 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index d8dd6e8478686d..151348e6b1c1ba 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -101,7 +101,7 @@ static cl::opt<bool>
cl::init(false), cl::Hidden);
static cl::opt<bool> AggressivelySinkInstsIntoCycle(
- "aggressively-sink-insts-to-avoid-spills",
+ "aggressive-sink-insts-into-cycles",
cl::desc("Aggressively sink instructions into cycles to avoid "
"register spills"),
cl::init(false), cl::Hidden);
@@ -118,6 +118,8 @@ STATISTIC(NumSplit, "Number of critical edges split");
STATISTIC(NumCoalesces, "Number of copies coalesced");
STATISTIC(NumPostRACopySink, "Number of copies sunk after RA");
+using RegSubRegPair = TargetInstrInfo::RegSubRegPair;
+
namespace {
class MachineSinking : public MachineFunctionPass {
@@ -263,11 +265,10 @@ class MachineSinking : public MachineFunctionPass {
bool SinkIntoCycle(MachineCycle *Cycle, MachineInstr &I);
bool isDead(const MachineInstr *MI) const;
- bool AggressivelySinkIntoCycle(
+ bool aggressivelySinkIntoCycle(
MachineCycle *Cycle, MachineInstr &I,
- DenseMap<MachineInstr *,
- std::list<std::pair<MachineBasicBlock *, MachineInstr *>>>
- SunkInstrs);
+ DenseMap<std::pair<MachineInstr *, MachineBasicBlock *>, MachineInstr *>
+ &SunkInstrs);
bool isProfitableToSinkTo(Register Reg, MachineInstr &MI,
MachineBasicBlock *MBB,
@@ -692,8 +693,8 @@ void MachineSinking::FindCycleSinkCandidates(
SmallVectorImpl<MachineInstr *> &Candidates) {
for (auto &MI : *BB) {
LLVM_DEBUG(dbgs() << "CycleSink: Analysing candidate: " << MI);
- if (MI.isDebugInstr()) {
- LLVM_DEBUG(dbgs() << "CycleSink: Dont sink debug instructions\n");
+ if (MI.isMetaInstruction()) {
+ LLVM_DEBUG(dbgs() << "CycleSink: Dont sink meta instructions\n");
continue;
}
if (!TII->shouldSink(MI)) {
@@ -786,8 +787,11 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
EverMadeChange = true;
}
- if (SinkInstsIntoCycle) {
+ if (SinkInstsIntoCycle || AggressivelySinkInstsIntoCycle) {
SmallVector<MachineCycle *, 8> Cycles(CI->toplevel_cycles());
+
+ DenseMap<std::pair<MachineInstr *, MachineBasicBlock *>, MachineInstr *>
+ SunkInstrs;
for (auto *Cycle : Cycles) {
MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
if (!Preheader) {
@@ -801,7 +805,18 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
// of a def-use chain, if there is any.
// TODO: Sort the candidates using a cost-model.
unsigned i = 0;
+
for (MachineInstr *I : llvm::reverse(Candidates)) {
+ // AggressivelySinkInstsIntoCycle sinks a superset of instructions
+ // relative to regular cycle sinking. Thus, this option supersedes
+ // regular sinking and captures all of its sinking opportunities.
+ if (AggressivelySinkInstsIntoCycle) {
+ aggressivelySinkIntoCycle(Cycle, *I, SunkInstrs);
+ EverMadeChange = true;
+ ++NumCycleSunk;
+ continue;
+ }
+
if (i++ == SinkIntoCycleLimit) {
LLVM_DEBUG(dbgs() << "CycleSink: Limit reached of instructions to "
"be analysed.");
@@ -816,30 +831,6 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
}
}
- if (AggressivelySinkInstsIntoCycle) {
- SmallVector<MachineCycle *, 8> Cycles(CI->toplevel_cycles());
- DenseMap<MachineInstr *,
- std::list<std::pair<MachineBasicBlock *, MachineInstr *>>>
- SunkInstrs;
- for (auto *Cycle : Cycles) {
- MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
- if (!Preheader) {
- LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Can't find preheader\n");
- continue;
- }
- SmallVector<MachineInstr *, 8> Candidates;
- FindCycleSinkCandidates(Cycle, Preheader, Candidates);
-
- // Walk the candidates in reverse order so that we start with the use
- // of a def-use chain, if there is any.
- for (MachineInstr *I : llvm::reverse(Candidates)) {
- AggressivelySinkIntoCycle(Cycle, *I, SunkInstrs);
- EverMadeChange = true;
- ++NumCycleSunk;
- }
- }
- }
-
HasStoreCache.clear();
StoreInstrCache.clear();
@@ -1615,31 +1606,27 @@ bool MachineSinking::hasStoreBetween(MachineBasicBlock *From,
return HasAliasedStore;
}
-/// Copy paste from DeadMachineInstructionElimImpl
-
bool MachineSinking::isDead(const MachineInstr *MI) const {
// Instructions without side-effects are dead iff they only define dead regs.
// This function is hot and this loop returns early in the common case,
// so only perform additional checks before this if absolutely necessary.
+
for (const MachineOperand &MO : MI->all_defs()) {
Register Reg = MO.getReg();
- if (Reg.isPhysical()) {
+ if (Reg.isPhysical())
return false;
- } else {
- if (MO.isDead()) {
+
+ if (MO.isDead()) {
#ifndef NDEBUG
- // Basic check on the register. All of them should be 'undef'.
- for (auto &U : MRI->use_nodbg_operands(Reg))
- assert(U.isUndef() && "'Undef' use on a 'dead' register is found!");
+ // Basic check on the register. All of them should be 'undef'.
+ for (auto &U : MRI->use_nodbg_operands(Reg))
+ assert(U.isUndef() && "'Undef' use on a 'dead' register is found!");
#endif
- continue;
- }
- for (const MachineInstr &Use : MRI->use_nodbg_instructions(Reg)) {
- if (&Use != MI)
- // This def has a non-debug use. Don't delete the instruction!
- return false;
- }
+ continue;
}
+
+ if (!(MRI->hasAtMostUserInstrs(Reg, 0)))
+ return false;
}
// Technically speaking inline asm without side effects and no defs can still
@@ -1661,25 +1648,24 @@ bool MachineSinking::isDead(const MachineInstr *MI) const {
/// In particular, it will sink into multiple successor blocks without limits
/// based on the amount of sinking, or the type of ops being sunk (so long as
/// they are safe to sink).
-bool MachineSinking::AggressivelySinkIntoCycle(
+bool MachineSinking::aggressivelySinkIntoCycle(
MachineCycle *Cycle, MachineInstr &I,
- DenseMap<MachineInstr *,
- std::list<std::pair<MachineBasicBlock *, MachineInstr *>>>
- SunkInstrs) {
+ DenseMap<std::pair<MachineInstr *, MachineBasicBlock *>, MachineInstr *>
+ &SunkInstrs) {
LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Finding sink block for: " << I);
MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
assert(Preheader && "Cycle sink needs a preheader block");
- SmallVector<std::pair<MachineOperand, MachineInstr *>> Uses;
+ SmallVector<std::pair<RegSubRegPair, MachineInstr *>> Uses;
// TODO: support instructions with multiple defs
if (I.getNumDefs() > 1)
return false;
- MachineOperand DefMO = I.getOperand(0);
+ MachineOperand &DefMO = I.getOperand(0);
for (MachineInstr &MI : MRI->use_instructions(DefMO.getReg())) {
- Uses.push_back({DefMO, &MI});
+ Uses.push_back({{DefMO.getReg(), DefMO.getSubReg()}, &MI});
}
- for (std::pair<MachineOperand, MachineInstr *> Entry : Uses) {
+ for (std::pair<RegSubRegPair, MachineInstr *> Entry : Uses) {
MachineInstr *MI = Entry.second;
LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Analysing use: " << MI);
if (MI->isPHI()) {
@@ -1701,22 +1687,14 @@ bool MachineSinking::AggressivelySinkIntoCycle(
MachineBasicBlock *SinkBlock = MI->getParent();
MachineInstr *NewMI = nullptr;
+ std::pair<MachineInstr *, MachineBasicBlock *> MapEntry(&I, SinkBlock);
// Check for the case in which we have already sunk a copy of this
// instruction into the user block.
- if (SunkInstrs.contains(&I)) {
- auto SunkBlocks = SunkInstrs[&I];
- auto Match = std::find_if(
- SunkBlocks.begin(), SunkBlocks.end(),
- [&SinkBlock](
- std::pair<MachineBasicBlock *, MachineInstr *> SunkEntry) {
- return SunkEntry.first == SinkBlock;
- });
- if (Match != SunkBlocks.end()) {
- LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Already sunk to block: "
- << printMBBReference(*SinkBlock) << "\n");
- NewMI = Match->second;
- }
+ if (SunkInstrs.contains(MapEntry)) {
+ LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Already sunk to block: "
+ << printMBBReference(*SinkBlock) << "\n");
+ NewMI = SunkInstrs[MapEntry];
}
// Create a copy of the instruction in the use block.
@@ -1733,7 +1711,7 @@ bool MachineSinking::AggressivelySinkIntoCycle(
}
SinkBlock->insert(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()),
NewMI);
- SunkInstrs[&I].push_back({SinkBlock, NewMI});
+ SunkInstrs[MapEntry] = NewMI;
}
// Conservatively clear any kill flags on uses of sunk instruction
@@ -1748,9 +1726,9 @@ bool MachineSinking::AggressivelySinkIntoCycle(
NewMI->setDebugLoc(DebugLoc());
// Replace the use with the newly created virtual register.
- MachineOperand UseMO = Entry.first;
- MI->substituteRegister(UseMO.getReg(), NewMI->getOperand(0).getReg(),
- UseMO.getSubReg(), *TRI);
+ RegSubRegPair &UseReg = Entry.first;
+ MI->substituteRegister(UseReg.Reg, NewMI->getOperand(0).getReg(),
+ UseReg.SubReg, *TRI);
}
// If we have replaced all uses, then delete the dead instruction
if (isDead(&I))
diff --git a/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll b/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
index 72b4495297a1c5..9e53b8434cc021 100644
--- a/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
+++ b/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 --aggressively-sink-insts-to-avoid-spills=1 < %s | FileCheck -check-prefix=SUNK %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 --aggressive-sink-insts-into-cycles=1 < %s | FileCheck -check-prefix=SUNK %s
; Check that various edge cases do not crash the compiler
@@ -6,6 +7,39 @@
define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) {
; SUNK-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
+; SUNK: ; %bb.0:
+; SUNK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SUNK-NEXT: v_mov_b32_e32 v3, v0
+; SUNK-NEXT: v_and_b32_e32 v0, -4, v3
+; SUNK-NEXT: global_load_dword v4, v[0:1], off
+; SUNK-NEXT: v_and_b32_e32 v3, 3, v3
+; SUNK-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; SUNK-NEXT: s_mov_b32 s2, 0xffff
+; SUNK-NEXT: v_lshlrev_b32_e64 v5, v3, s2
+; SUNK-NEXT: s_mov_b64 s[0:1], 0
+; SUNK-NEXT: v_not_b32_e32 v5, v5
+; SUNK-NEXT: v_max_f16_e32 v2, v2, v2
+; SUNK-NEXT: .LBB0_1: ; %atomicrmw.start
+; SUNK-NEXT: ; =>This Inner Loop Header: Depth=1
+; SUNK-NEXT: s_waitcnt vmcnt(0)
+; SUNK-NEXT: v_mov_b32_e32 v7, v4
+; SUNK-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; SUNK-NEXT: v_max_f16_e32 v4, v4, v4
+; SUNK-NEXT: v_min_f16_e32 v4, v4, v2
+; SUNK-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; SUNK-NEXT: v_and_or_b32 v6, v7, v5, v4
+; SUNK-NEXT: buffer_wbl2 sc1
+; SUNK-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0
+; SUNK-NEXT: s_waitcnt vmcnt(0)
+; SUNK-NEXT: buffer_inv sc1
+; SUNK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; SUNK-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; SUNK-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; SUNK-NEXT: s_cbranch_execnz .LBB0_1
+; SUNK-NEXT: ; %bb.2: ; %atomicrmw.end
+; SUNK-NEXT: s_or_b64 exec, exec, s[0:1]
+; SUNK-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; SUNK-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst
ret half %result
}
@@ -14,7 +48,216 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) {
; SUNK-LABEL: memmove_p5_p5:
+; SUNK: ; %bb.0: ; %entry
+; SUNK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SUNK-NEXT: v_and_b32_e32 v4, 15, v2
+; SUNK-NEXT: v_mov_b32_e32 v5, 0
+; SUNK-NEXT: v_and_b32_e32 v6, -16, v2
+; SUNK-NEXT: v_mov_b32_e32 v7, v3
+; SUNK-NEXT: v_cmp_ne_u64_e64 s[0:1], 0, v[4:5]
+; SUNK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; SUNK-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v0
+; SUNK-NEXT: s_and_saveexec_b64 s[4:5], s[2:3]
+; SUNK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; SUNK-NEXT: s_cbranch_execnz .LBB1_3
+; SUNK-NEXT: ; %bb.1: ; %Flow46
+; SUNK-NEXT: s_andn2_saveexec_b64 s[2:3], s[4:5]
+; SUNK-NEXT: s_cbranch_execnz .LBB1_10
+; SUNK-NEXT: .LBB1_2: ; %Flow47
+; SUNK-NEXT: s_or_b64 exec, exec, s[2:3]
+; SUNK-NEXT: s_waitcnt vmcnt(0)
+; SUNK-NEXT: s_setpc_b64 s[30:31]
+; SUNK-NEXT: .LBB1_3: ; %memmove_copy_forward
+; SUNK-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; SUNK-NEXT: s_cbranch_execz .LBB1_6
+; SUNK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
+; SUNK-NEXT: s_mov_b64 s[8:9], 0
+; SUNK-NEXT: v_mov_b32_e32 v3, v1
+; SUNK-NEXT: v_mov_b32_e32 v8, v0
+; SUNK-NEXT: .LBB1_5: ; %memmove_fwd_main_loop
+; SUNK-NEXT: ; =>This Inner Loop Header: Depth=1
+; SUNK-NEXT: scratch_load_dwordx4 v[10:13], v3, off
+; SUNK-NEXT: v_lshl_add_u64 v[6:7], v[6:7], 0, -16
+; SUNK-NEXT: v_cmp_eq_u64_e64 s[2:3], 0, v[6:7]
+; SUNK-NEXT: v_add_u32_e32 v3, 16, v3
+; SUNK-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9]
+; SUNK-NEXT: s_waitcnt vmcnt(0)
+; SUNK-NEXT: scratch_store_dwordx4 v8, v[10:13], off
+; SUNK-NEXT: v_add_u32_e32 v8, 16, v8
+; SUNK-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SUNK-NEXT: s_cbranch_execnz .LBB1_5
+; SUNK-NEXT: .LBB1_6: ; %Flow41
+; SUNK-NEXT: s_or_b64 exec, exec, s[6:7]
+; SUNK-NEXT: s_and_saveexec_b64 s[6:7], s[0:1]
+; SUNK-NEXT: s_cbranch_execz .LBB1_9
+; SUNK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
+; SUNK-NEXT: v_and_b32_e32 v2, -16, v2
+; SUNK-NEXT: v_add_u32_e32 v0, v0, v2
+; SUNK-NEXT: v_add_u32_e32 v1, v1, v2
+; SUNK-NEXT: s_mov_b64 s[8:9], 0
+; SUNK-NEXT: .LBB1_8: ; %memmove_fwd_residual_loop
+; SUNK-NEXT: ; =>This Inner Loop Header: Depth=1
+; SUNK-NEXT: scratch_load_ubyte v2, v1, off
+; SUNK-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, -1
+; SUNK-NEXT: v_cmp_eq_u64_e64 s[2:3], 0, v[4:5]
+; SUNK-NEXT: v_add_u32_e32 v1, 1, v1
+; SUNK-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9]
+; SUNK-NEXT: s_waitcnt vmcnt(0)
+; SUNK-NEXT: scratch_store_byte v0, v2, off
+; SUNK-NEXT: v_add_u32_e32 v0, 1, v0
+; SUNK-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SUNK-NEXT: s_cbranch_execnz .LBB1_8
+; SUNK-NEXT: .LBB1_9: ; %Flow39
+; SUNK-NEXT: s_or_b64 exec, exec, s[6:7]
+; SUNK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; SUNK-NEXT: ; implicit-def: $vgpr0
+; SUNK-NEXT: ; implicit-def: $vgpr1
+; SUNK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; SUNK-NEXT: s_andn2_saveexec_b64 s[2:3], s[4:5]
+; SUNK-NEXT: s_cbranch_execz .LBB1_2
+; SUNK-NEXT: .LBB1_10: ; %memmove_copy_backwards
+; SUNK-NEXT: s_and_saveexec_b64 s[4:5], s[0:1]
+; SUNK-NEXT: s_cbranch_execz .LBB1_13
+; SUNK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
+; SUNK-NEXT: v_add_u32_e32 v7, -1, v2
+; SUNK-NEXT: v_add_u32_e32 v6, v0, v7
+; SUNK-NEXT: v_add_u32_e32 v7, v1, v7
+; SUNK-NEXT: s_mov_b64 s[6:7], 0
+; SUNK-NEXT: .LBB1_12: ; %memmove_bwd_residual_loop
+; SUNK-NEXT: ; =>This Inner Loop Header: Depth=1
+; SUNK-NEXT: scratch_load_ubyte v8, v7, off
+; SUNK-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, -1
+; SUNK-NEXT: v_cmp_eq_u64_e64 s[0:1], 0, v[4:5]
+; SUNK-NEXT: v_add_u32_e32 v7, -1, v7
+; SUNK-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7]
+; SUNK-NEXT: s_waitcnt vmcnt(0)
+; SUNK-NEXT: scratch_store_byte v6, v8, off
+; SUNK-NEXT: v_add_u32_e32 v6, -1, v6
+; SUNK-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; SUNK-NEXT: s_cbranch_execnz .LBB1_12
+; SUNK-NEXT: .LBB1_13: ; %Flow45
+; SUNK-NEXT: s_or_b64 exec, exec, s[4:5]
+; SUNK-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; SUNK-NEXT: s_cbranch_execz .LBB1_16
+; SUNK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
+; SUNK-NEXT: v_and_b32_e32 v5, -16, v2
+; SUNK-NEXT: v_add_u32_e32 v4, -16, v5
+; SUNK-NEXT: v_add_u32_e32 v2, v0, v4
+; SUNK-NEXT: v_sub_co_u32_e32 v0, vcc, 0, v5
+; SUNK-NEXT: v_add_u32_e32 v4, v1, v4
+; SUNK-NEXT: s_mov_b64 s[4:5], 0
+; SUNK-NEXT: v_subb_co_u32_e32 v1, vcc, 0, v3, vcc
+; SUNK-NEXT: .LBB1_15: ; %memmove_bwd_main_loop
+; SUNK-NEXT: ; =>This Inner Loop Header: Depth=1
+; SUNK-NEXT: scratch_load_dwordx4 v[6:9], v4, off
+; SUNK-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 16
+; SUNK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; SUNK-NEXT: v_add_u32_e32 v4, -16, v4
+; SUNK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; SUNK-NEXT: s_waitcnt vmcnt(0)
+; SUNK-NEXT: scratch_store_dwordx4 v2, v[6:9], off
+; SUNK-NEXT: v_add_u32_e32 v2, -16, v2
+; SUNK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; SUNK-NEXT: s_cbranch_execnz .LBB1_15
+; SUNK-NEXT: .LBB1_16: ; %Flow43
+; SUNK-NEXT: s_or_b64 exec, exec, s[0:1]
+; SUNK-NEXT: s_or_b64 exec, exec, s[2:3]
+; SUNK-NEXT: s_waitcnt vmcnt(0)
+; SUNK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}
+
+; We should not sink the mfma into the if/else as it is convergent
+
+define void @convergent_sink(<4 x i16> %in0, <4 x i16> %in1, i32 %val, i32 %v, ptr addrspace(1) %outptr) #2 {
+; SUNK-LABEL: convergent_sink:
+; SUNK: ; %bb.0: ; %entry
+; SUNK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SUNK-NEXT: v_mfma_f32_32x32x8_bf16 a[0:15], v[0:1], v[2:3], 0
+; SUNK-NEXT: v_lshl_add_u32 v0, v5, 1, v5
+; SUNK-NEXT: v_lshlrev_b32_e32 v2, 1, v5
+; SUNK-NEXT: s_mov_b32 s4, 0
+; SUNK-NEXT: s_mov_b64 s[0:1], 0
+; SUNK-NEXT: v_mov_b32_e32 v5, 0xde
+; SUNK-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; SUNK-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SUNK-NEXT: s_branch .LBB2_2
+; SUNK-NEXT: .LBB2_1: ; %end
+; SUNK-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; SUNK-NEXT: v_cmp_eq_u32_e32 vcc, v8, v4
+; SUNK-NEXT: s_add_i32 s4, s4, 1
+; SUNK-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; SUNK-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; SUNK-NEXT: s_cbranch_execz .LBB2_7
+; SUNK-NEXT: .LBB2_2: ; %loop.body
+; SUNK-NEXT: ; =>This Inner Loop Header: Depth=1
+; SUNK-NEXT: s_cmp_lt_i32 s4, 6
+; SUNK-NEXT: global_store_dword v[6:7], v5, off
+; SUNK-NEXT: s_cbranch_scc0 .LBB2_4
+; SUNK-NEXT: ; %bb.3: ; %else
+; SUNK-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; SUNK-NEXT: v_lshl_add_u64 v[8:9], v[0:1], 3, v[6:7]
+; SUNK-NEXT: global_store_dwordx4 v[8:9], a[12:15], off offset:48
+; SUNK-NEXT: global_store_dwordx4 v[8:9], a[8:11], off offset:32
+; SUNK-NEXT: global_store_dwordx4 v[8:9], a[4:7], off offset:16
+; SUNK-NEXT: global_store_dwordx4 v[8:9], a[0:3], off
+; SUNK-NEXT: s_mov_b64 s[2:3], 0
+; SUNK-NEXT: s_branch .LBB2_5
+; SUNK-NEXT: .LBB2_4: ; in Loop: Header=BB2_2 Depth=1
+; SUNK-NEXT: s_mov_b64 s[2:3], -1
+; SUNK-NEXT: .LBB2_5: ; %Flow
+; SUNK-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; SUNK-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; SUNK-NEXT: v_mov_b32_e32 v8, v0
+; SUNK-NEXT: s_cbranch_vccnz .LBB2_1
+; SUNK-NEXT: ; %bb.6: ; %if
+; SUNK-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; SUNK-NEXT: v_lshl_add_u64 v[8:9], v[2:3], 3, v[6:7]
+; SUNK-NEXT: global_store_dwordx4 v[8:9], a[12:15], off offset:48
+; SUNK-NEXT: global_store_dwordx4 v[8:9], a[8:11], off offset:32
+; SUNK-NEXT: global_store_dwordx4 v[8:9], a[4:7], off offset:16
+; SUNK-NEXT: global_store_dwordx4 v[8:9], a[0:3], off
+; SUNK-NEXT: v_mov_b32_e32 v8, v2
+; SUNK-NEXT: s_branch .LBB2_1
+; SUNK-NEXT: .LBB2_7: ; %exit
+; SUNK-NEXT: s_or_b64 exec, exec, s[0:1]
+; SUNK-NEXT: global_store_dwordx4 v[6:7], a[12:15], off offset:48
+; SUNK-NEXT: global_store_dwordx4 v[6:7], a[8:11], off offset:32
+; SUNK-NEXT: global_store_dwordx4 v[6:7], a[4:7], off offset:16
+; SUNK-NEXT: global_store_dwordx4 v[6:7], a[0:3], off
+; SUNK-NEXT: s_waitcnt vmcnt(0)
+; SUNK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %1005 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %in0, <4 x i16> %in1, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
+ br label %loop.body
+
+loop.body:
+ %i = phi i32 [0, %entry], [%i.inc, %end]
+ store i32 222, ptr addrspace(1) %outptr
+ %cc = icmp sgt i32 %i, 5
+ br i1 %cc, label %if, label %else
+
+if:
+ %v.if = mul i32 %v, 2
+ %sptr.if = getelementptr <4 x i16>, ptr addrspace(1) %outptr, i32 %v.if
+ store <16 x float> %1005, ptr addrspace(1) %sptr.if
+ br label %end
+
+else:
+ %v.else = mul i32 %v, 3
+ %sptr.else = getelementptr <4 x i16>, ptr addrspace(1) %outptr, i32 %v.else
+ store <16 x float> %1005, ptr addrspace(1) %sptr.else
+ br label %end
+
+end:
+ %r = phi i32 [ %v.if, %if ], [ %v.else, %else ]
+ %cmp = icmp ne i32 %r, %val
+ %i.inc = add i32 %i, 1
+ br i1 %cmp, label %loop.body, label %exit
+
+exit:
+ store <16 x float> %1005, ptr addrspace(1) %outptr
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
index f93d8f3dde21b6..259abae6d92c87 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink -o - %s | FileCheck -check-prefixes=GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink --aggressively-sink-insts-to-avoid-spills=1 -o - %s | FileCheck -check-prefixes=GFX9-SUNK %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o - %s | FileCheck -check-prefixes=GFX9-SUNK %s
---
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
index 2a14b85cf2bd56..fafad600c47458 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink -o - %s | FileCheck -check-prefixes=GFX10 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink -aggressively-sink-insts-to-avoid-spills=1 -o - %s | FileCheck -check-prefixes=GFX10-SUNK %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o - %s | FileCheck -check-prefixes=GFX10-SUNK %s
---
name: multi_else_break
@@ -86,6 +86,7 @@ body: |
; GFX10-SUNK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], implicit $exec
; GFX10-SUNK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX10-SUNK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX10-SUNK-NEXT: {{ $}}
; GFX10-SUNK-NEXT: bb.1:
; GFX10-SUNK-NEXT: successors: %bb.2(0x80000000)
@@ -100,7 +101,6 @@ body: |
; GFX10-SUNK-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, %15, %bb.5
; GFX10-SUNK-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.1, %17, %bb.5
; GFX10-SUNK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, %19, %bb.5
- ; GFX10-SUNK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX10-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[PHI5]], [[COPY1]], implicit $exec
; GFX10-SUNK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
; GFX10-SUNK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI3]], $exec_lo, implicit-def $scc
>From 7e3caf54cfdd9eb757944f8cec657af2ff8c0efd Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Sat, 23 Nov 2024 14:00:44 -0800
Subject: [PATCH 3/8] Fix SystemZ test
Change-Id: I8f1138f9fc82251538f2c428f1e67fa2941266b5
---
llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir b/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir
index 43c286a830b42e..52c9d1067220ee 100644
--- a/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir
+++ b/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir
@@ -25,14 +25,14 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[LARL:%[0-9]+]]:addr64bit = LARL @b
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:gr64bit = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:gr64bit = IMPLICIT_DEF
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[LA:%[0-9]+]]:gr64bit = LA [[LARL]], 49, $noreg
; CHECK-NEXT: [[LGHI:%[0-9]+]]:gr64bit = LGHI 7
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:gr64bit = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:gr64bit = IMPLICIT_DEF
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0
; CHECK-NEXT: $r2d = COPY [[DEF]]
; CHECK-NEXT: $r3d = COPY [[LA]]
>From d09d4f1bdaec14b01c1cc1bd2cd8bccd58d0c8b4 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 2 Dec 2024 16:03:05 -0800
Subject: [PATCH 4/8] Add low latency check
Change-Id: Iec36f11060ca1b46b6c33130d4ee02863360c671
---
llvm/include/llvm/CodeGen/TargetInstrInfo.h | 7 +-
llvm/lib/CodeGen/MachineSink.cpp | 11 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 8 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +-
.../machine-sink-aggressive-latency.mir | 107 ++++++++++++++++++
5 files changed, 129 insertions(+), 6 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/machine-sink-aggressive-latency.mir
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 07b59b241d9f9a..c4c69e5129602c 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1806,8 +1806,13 @@ class TargetInstrInfo : public MCInstrInfo {
unsigned defaultDefLatency(const MCSchedModel &SchedModel,
const MachineInstr &DefMI) const;
+ /// Return true if this instruction is considered low latency.
+ virtual bool isLowLatencyInstruction(const MachineInstr &MI) const {
+ return false;
+ };
+
/// Return true if this opcode has high latency to its result.
- virtual bool isHighLatencyDef(int opc) const { return false; }
+ virtual bool isHighLatencyDef(int opc) const { return false; };
/// Compute operand latency between a def of 'Reg'
/// and a use in the current loop. Return true if the target considered
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 151348e6b1c1ba..8e4f02495229a2 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -1652,13 +1652,18 @@ bool MachineSinking::aggressivelySinkIntoCycle(
MachineCycle *Cycle, MachineInstr &I,
DenseMap<std::pair<MachineInstr *, MachineBasicBlock *>, MachineInstr *>
&SunkInstrs) {
+ // TODO: support instructions with multiple defs
+ if (I.getNumDefs() > 1)
+ return false;
+
+ // Only sink instructions which the target considers to be low latency
+ if (!TII->isLowLatencyInstruction(I))
+ return false;
+
LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Finding sink block for: " << I);
MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
assert(Preheader && "Cycle sink needs a preheader block");
SmallVector<std::pair<RegSubRegPair, MachineInstr *>> Uses;
- // TODO: support instructions with multiple defs
- if (I.getNumDefs() > 1)
- return false;
MachineOperand &DefMO = I.getOperand(0);
for (MachineInstr &MI : MRI->use_instructions(DefMO.getReg())) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c864f03f1f0f9e..f3ef9a25dd82c1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8676,7 +8676,13 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
- return isSMRD(Opc);
+ if (MI.isCopy() || isSMRD(Opc))
+ return true;
+
+ if (SchedModel.hasInstrSchedModel())
+ return SchedModel.computeInstrLatency(Opc) < 4;
+
+ return false;
}
bool SIInstrInfo::isHighLatencyDef(int Opc) const {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 1f7fff76d15210..f103eb9e97e9b4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1291,7 +1291,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
uint64_t getDefaultRsrcDataFormat() const;
uint64_t getScratchRsrcWords23() const;
- bool isLowLatencyInstruction(const MachineInstr &MI) const;
+ bool isLowLatencyInstruction(const MachineInstr &MI) const override;
bool isHighLatencyDef(int Opc) const override;
/// Return the descriptor of the target-specific machine instruction
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-aggressive-latency.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-aggressive-latency.mir
new file mode 100644
index 00000000000000..b5296a85b31352
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-aggressive-latency.mir
@@ -0,0 +1,107 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o - %s | FileCheck -check-prefixes=GFX10-SUNK %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o - %s | FileCheck -check-prefixes=GFX9-SUNK %s
+
+---
+name: latency_cycle_sink
+tracksRegLiveness: true
+body: |
+ ; GFX10-SUNK-LABEL: name: latency_cycle_sink
+ ; GFX10-SUNK: bb.0:
+ ; GFX10-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-SUNK-NEXT: liveins: $vgpr4, $vgpr5
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[V_PK_MUL_LO_U16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[DEF]], 8, [[DEF]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.1:
+ ; GFX10-SUNK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit undef $scc
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.2
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.2:
+ ; GFX10-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_]], 8, [[V_PK_MUL_LO_U16_]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.3:
+ ; GFX10-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_]], 8, [[V_PK_MUL_LO_U16_]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.4:
+ ; GFX10-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.5
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.5:
+ ; GFX10-SUNK-NEXT: S_ENDPGM 0
+ ;
+ ; GFX9-SUNK-LABEL: name: latency_cycle_sink
+ ; GFX9-SUNK: bb.0:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr4, $vgpr5
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit undef $scc
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.2
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[V_PK_MUL_LO_U16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[DEF]], 8, [[DEF]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_]], 8, [[V_PK_MUL_LO_U16_]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[V_PK_MUL_LO_U16_2:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[DEF]], 8, [[DEF]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_2]], 8, [[V_PK_MUL_LO_U16_2]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.4:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.5
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.5:
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $vgpr4, $vgpr5
+ %83:vgpr_32 = IMPLICIT_DEF
+ %80:vgpr_32 = V_PK_MUL_LO_U16 8, %83, 8, %83, 0, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ S_CBRANCH_SCC1 %bb.3, implicit undef $scc
+ S_BRANCH %bb.2
+
+
+ bb.2:
+ %90:vgpr_32 = V_PK_MUL_LO_U16 8, %80, 8, %80, 0, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.4
+
+ bb.3:
+ %90:vgpr_32 = V_PK_MUL_LO_U16 8, %80, 8, %80, 0, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.4
+
+ bb.4:
+ S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ S_BRANCH %bb.5
+
+ bb.5:
+ S_ENDPGM 0
+...
>From 16b16e38a0b3d6b1de887830228e960aba30d87a Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 3 Dec 2024 14:38:28 -0800
Subject: [PATCH 5/8] Extra semicolon
Change-Id: I17405578571a711f53db71df0e9329600c01fceb
---
llvm/include/llvm/CodeGen/TargetInstrInfo.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index c4c69e5129602c..bfc3450c97c024 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1812,7 +1812,7 @@ class TargetInstrInfo : public MCInstrInfo {
};
/// Return true if this opcode has high latency to its result.
- virtual bool isHighLatencyDef(int opc) const { return false; };
+ virtual bool isHighLatencyDef(int opc) const { return false; }
/// Compute operand latency between a def of 'Reg'
/// and a use in the current loop. Return true if the target considered
>From 5dddd8498e30fdb1352a4015b52b50f4d3e5dcc5 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 5 Dec 2024 11:00:25 -0800
Subject: [PATCH 6/8] Use stage approach
Change-Id: I4082bd57dd03236e4d578dac4804949544f4dcf2
---
llvm/include/llvm/CodeGen/TargetInstrInfo.h | 5 -
llvm/lib/CodeGen/MachineSink.cpp | 208 ++-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 8 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +-
.../aggressive-loop-sink-nonstandard.ll | 263 ----
.../machine-sink-aggressive-latency.mir | 107 --
.../CodeGen/AMDGPU/machine-sink-cycle.mir | 1272 +++++++++++++++++
.../machine-sink-ignorable-exec-use.mir | 360 +----
.../CodeGen/AMDGPU/machine-sink-lane-mask.mir | 71 +-
.../SystemZ/machinelicm-sunk-kill-flags.mir | 14 +-
10 files changed, 1364 insertions(+), 946 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/machine-sink-aggressive-latency.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/machine-sink-cycle.mir
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index bfc3450c97c024..07b59b241d9f9a 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1806,11 +1806,6 @@ class TargetInstrInfo : public MCInstrInfo {
unsigned defaultDefLatency(const MCSchedModel &SchedModel,
const MachineInstr &DefMI) const;
- /// Return true if this instruction is considered low latency.
- virtual bool isLowLatencyInstruction(const MachineInstr &MI) const {
- return false;
- };
-
/// Return true if this opcode has high latency to its result.
virtual bool isHighLatencyDef(int opc) const { return false; }
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 8e4f02495229a2..3f2e790b059041 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -44,6 +44,7 @@
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -100,12 +101,6 @@ static cl::opt<bool>
"register spills"),
cl::init(false), cl::Hidden);
-static cl::opt<bool> AggressivelySinkInstsIntoCycle(
- "aggressive-sink-insts-into-cycles",
- cl::desc("Aggressively sink instructions into cycles to avoid "
- "register spills"),
- cl::init(false), cl::Hidden);
-
static cl::opt<unsigned> SinkIntoCycleLimit(
"machine-sink-cycle-limit",
cl::desc(
@@ -135,6 +130,7 @@ class MachineSinking : public MachineFunctionPass {
const MachineBranchProbabilityInfo *MBPI = nullptr;
AliasAnalysis *AA = nullptr;
RegisterClassInfo RegClassInfo;
+ TargetSchedModel SchedModel;
// Remember which edges have been considered for breaking.
SmallSet<std::pair<MachineBasicBlock *, MachineBasicBlock *>, 8>
@@ -262,7 +258,6 @@ class MachineSinking : public MachineFunctionPass {
void FindCycleSinkCandidates(MachineCycle *Cycle, MachineBasicBlock *BB,
SmallVectorImpl<MachineInstr *> &Candidates);
- bool SinkIntoCycle(MachineCycle *Cycle, MachineInstr &I);
bool isDead(const MachineInstr *MI) const;
bool aggressivelySinkIntoCycle(
@@ -284,11 +279,14 @@ class MachineSinking : public MachineFunctionPass {
GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB,
AllSuccsCache &AllSuccessors) const;
- std::vector<unsigned> &getBBRegisterPressure(const MachineBasicBlock &MBB);
+ std::vector<unsigned> &getBBRegisterPressure(const MachineBasicBlock &MBB,
+ bool UseCache = true);
bool registerPressureSetExceedsLimit(unsigned NRegs,
const TargetRegisterClass *RC,
const MachineBasicBlock &MBB);
+
+ bool registerPressureExceedsLimit(const MachineBasicBlock &MBB);
};
} // end anonymous namespace
@@ -787,48 +785,63 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
EverMadeChange = true;
}
- if (SinkInstsIntoCycle || AggressivelySinkInstsIntoCycle) {
+ if (SinkInstsIntoCycle) {
SmallVector<MachineCycle *, 8> Cycles(CI->toplevel_cycles());
+ SchedModel.init(STI);
+ enum CycleSinkStage { COPY, LOW_LATENCY, AGGRESSIVE, END };
- DenseMap<std::pair<MachineInstr *, MachineBasicBlock *>, MachineInstr *>
- SunkInstrs;
- for (auto *Cycle : Cycles) {
- MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
- if (!Preheader) {
- LLVM_DEBUG(dbgs() << "CycleSink: Can't find preheader\n");
- continue;
- }
- SmallVector<MachineInstr *, 8> Candidates;
- FindCycleSinkCandidates(Cycle, Preheader, Candidates);
-
- // Walk the candidates in reverse order so that we start with the use
- // of a def-use chain, if there is any.
- // TODO: Sort the candidates using a cost-model.
- unsigned i = 0;
-
- for (MachineInstr *I : llvm::reverse(Candidates)) {
- // AggressivelySinkInstsIntoCycle sinks a superset of instructions
- // relative to regular cycle sinking. Thus, this option supercedes
- // captures all sinking opportunites done
- if (AggressivelySinkInstsIntoCycle) {
- aggressivelySinkIntoCycle(Cycle, *I, SunkInstrs);
- EverMadeChange = true;
- ++NumCycleSunk;
+ CycleSinkStage Stage = CycleSinkStage::COPY;
+ bool HasHighPressure;
+ do {
+ HasHighPressure = false;
+ DenseMap<std::pair<MachineInstr *, MachineBasicBlock *>, MachineInstr *>
+ SunkInstrs;
+ for (auto *Cycle : Cycles) {
+ MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
+ if (!Preheader) {
+ LLVM_DEBUG(dbgs() << "CycleSink: Can't find preheader\n");
continue;
}
+ SmallVector<MachineInstr *, 8> Candidates;
+ FindCycleSinkCandidates(Cycle, Preheader, Candidates);
+
+ unsigned i = 0;
+
+ // Walk the candidates in reverse order so that we start with the use
+ // of a def-use chain, if there is any.
+ // TODO: Sort the candidates using a cost-model.
+ for (MachineInstr *I : llvm::reverse(Candidates)) {
+ // CycleSinkStage::COPY: Sink a limited number of copies
+ if (Stage == CycleSinkStage::COPY) {
+ if (i++ == SinkIntoCycleLimit) {
+ LLVM_DEBUG(dbgs()
+ << "CycleSink: Limit reached of instructions to "
+ "be analysed.");
+ break;
+ }
+
+ if (!I->isCopy())
+ continue;
+ }
- if (i++ == SinkIntoCycleLimit) {
- LLVM_DEBUG(dbgs() << "CycleSink: Limit reached of instructions to "
- "be analysed.");
- break;
+ // CycleSinkStage::LOW_LATENCY: sink unlimited number of instructions
+ // which the target specifies as low-latency
+ if (Stage == CycleSinkStage::LOW_LATENCY &&
+ !TII->hasLowDefLatency(SchedModel, *I, 0))
+ continue;
+
+ if (!aggressivelySinkIntoCycle(Cycle, *I, SunkInstrs))
+ break;
+ EverMadeChange = true;
+ ++NumCycleSunk;
}
- if (!SinkIntoCycle(Cycle, *I))
- break;
- EverMadeChange = true;
- ++NumCycleSunk;
+ // Recalculate the pressure after sinking
+ if (!HasHighPressure)
+ HasHighPressure = registerPressureExceedsLimit(*Preheader);
}
- }
+ Stage = (CycleSinkStage)(Stage + 1);
+ } while (HasHighPressure && Stage < CycleSinkStage::END);
}
HasStoreCache.clear();
@@ -1081,13 +1094,15 @@ bool MachineSinking::PostponeSplitCriticalEdge(MachineInstr &MI,
}
std::vector<unsigned> &
-MachineSinking::getBBRegisterPressure(const MachineBasicBlock &MBB) {
+MachineSinking::getBBRegisterPressure(const MachineBasicBlock &MBB,
+ bool UseCache) {
// Currently to save compiling time, MBB's register pressure will not change
// in one ProcessBlock iteration because of CachedRegisterPressure. but MBB's
// register pressure is changed after sinking any instructions into it.
// FIXME: need a accurate and cheap register pressure estiminate model here.
+
auto RP = CachedRegisterPressure.find(&MBB);
- if (RP != CachedRegisterPressure.end())
+ if (UseCache && RP != CachedRegisterPressure.end())
return RP->second;
RegionPressure Pressure;
@@ -1111,6 +1126,12 @@ MachineSinking::getBBRegisterPressure(const MachineBasicBlock &MBB) {
}
RPTracker.closeRegion();
+
+ if (RP != CachedRegisterPressure.end()) {
+ CachedRegisterPressure[&MBB] = RPTracker.getPressure().MaxSetPressure;
+ return CachedRegisterPressure[&MBB];
+ }
+
auto It = CachedRegisterPressure.insert(
std::make_pair(&MBB, RPTracker.getPressure().MaxSetPressure));
return It.first->second;
@@ -1129,6 +1150,21 @@ bool MachineSinking::registerPressureSetExceedsLimit(
return false;
}
+// Recalculate RP and check if any pressure set exceeds the set limit.
+bool MachineSinking::registerPressureExceedsLimit(
+ const MachineBasicBlock &MBB) {
+ std::vector<unsigned> BBRegisterPressure = getBBRegisterPressure(MBB, false);
+
+ for (unsigned PS = 0; PS < BBRegisterPressure.size(); ++PS) {
+ if (BBRegisterPressure[PS] >=
+ TRI->getRegPressureSetLimit(*MBB.getParent(), PS)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
/// isProfitableToSinkTo - Return true if it is profitable to sink MI.
bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI,
MachineBasicBlock *MBB,
@@ -1656,10 +1692,6 @@ bool MachineSinking::aggressivelySinkIntoCycle(
if (I.getNumDefs() > 1)
return false;
- // Only sink instructions which the target considers to be low latency
- if (!TII->isLowLatencyInstruction(I))
- return false;
-
LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Finding sink block for: " << I);
MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
assert(Preheader && "Cycle sink needs a preheader block");
@@ -1741,86 +1773,6 @@ bool MachineSinking::aggressivelySinkIntoCycle(
return true;
}
-/// Sink instructions into cycles if profitable. This especially tries to
-/// prevent register spills caused by register pressure if there is little to no
-/// overhead moving instructions into cycles.
-bool MachineSinking::SinkIntoCycle(MachineCycle *Cycle, MachineInstr &I) {
- LLVM_DEBUG(dbgs() << "CycleSink: Finding sink block for: " << I);
- MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
- assert(Preheader && "Cycle sink needs a preheader block");
- MachineBasicBlock *SinkBlock = nullptr;
- bool CanSink = true;
- const MachineOperand &MO = I.getOperand(0);
-
- for (MachineInstr &MI : MRI->use_instructions(MO.getReg())) {
- LLVM_DEBUG(dbgs() << "CycleSink: Analysing use: " << MI);
- if (!Cycle->contains(MI.getParent())) {
- LLVM_DEBUG(dbgs() << "CycleSink: Use not in cycle, can't sink.\n");
- CanSink = false;
- break;
- }
-
- // FIXME: Come up with a proper cost model that estimates whether sinking
- // the instruction (and thus possibly executing it on every cycle
- // iteration) is more expensive than a register.
- // For now assumes that copies are cheap and thus almost always worth it.
- if (!MI.isCopy()) {
- LLVM_DEBUG(dbgs() << "CycleSink: Use is not a copy\n");
- CanSink = false;
- break;
- }
- if (!SinkBlock) {
- SinkBlock = MI.getParent();
- LLVM_DEBUG(dbgs() << "CycleSink: Setting sink block to: "
- << printMBBReference(*SinkBlock) << "\n");
- continue;
- }
- SinkBlock = DT->findNearestCommonDominator(SinkBlock, MI.getParent());
- if (!SinkBlock) {
- LLVM_DEBUG(dbgs() << "CycleSink: Can't find nearest dominator\n");
- CanSink = false;
- break;
- }
- LLVM_DEBUG(dbgs() << "CycleSink: Setting nearest common dom block: "
- << printMBBReference(*SinkBlock) << "\n");
- }
-
- if (!CanSink) {
- LLVM_DEBUG(dbgs() << "CycleSink: Can't sink instruction.\n");
- return false;
- }
- if (!SinkBlock) {
- LLVM_DEBUG(dbgs() << "CycleSink: Not sinking, can't find sink block.\n");
- return false;
- }
- if (SinkBlock == Preheader) {
- LLVM_DEBUG(
- dbgs() << "CycleSink: Not sinking, sink block is the preheader\n");
- return false;
- }
- if (SinkBlock->sizeWithoutDebugLargerThan(SinkLoadInstsPerBlockThreshold)) {
- LLVM_DEBUG(
- dbgs() << "CycleSink: Not Sinking, block too large to analyse.\n");
- return false;
- }
-
- LLVM_DEBUG(dbgs() << "CycleSink: Sinking instruction!\n");
- SinkBlock->splice(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()), Preheader,
- I);
-
- // Conservatively clear any kill flags on uses of sunk instruction
- for (MachineOperand &MO : I.operands()) {
- if (MO.isReg() && MO.readsReg())
- RegsToClearKillFlags.insert(MO.getReg());
- }
-
- // The instruction is moved from its basic block, so do not retain the
- // debug information.
- assert(!I.isDebugInstr() && "Should not sink debug inst");
- I.setDebugLoc(DebugLoc());
- return true;
-}
-
/// SinkInstruction - Determine whether it is safe to sink the specified machine
/// instruction out of its current block into a successor.
bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f3ef9a25dd82c1..c864f03f1f0f9e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8676,13 +8676,7 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
- if (MI.isCopy() || isSMRD(Opc))
- return true;
-
- if (SchedModel.hasInstrSchedModel())
- return SchedModel.computeInstrLatency(Opc) < 4;
-
- return false;
+ return isSMRD(Opc);
}
bool SIInstrInfo::isHighLatencyDef(int Opc) const {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index f103eb9e97e9b4..1f7fff76d15210 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1291,7 +1291,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
uint64_t getDefaultRsrcDataFormat() const;
uint64_t getScratchRsrcWords23() const;
- bool isLowLatencyInstruction(const MachineInstr &MI) const override;
+ bool isLowLatencyInstruction(const MachineInstr &MI) const;
bool isHighLatencyDef(int Opc) const override;
/// Return the descriptor of the target-specific machine instruction
diff --git a/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll b/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
deleted file mode 100644
index 9e53b8434cc021..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
+++ /dev/null
@@ -1,263 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 --aggressive-sink-insts-into-cycles=1 < %s | FileCheck -check-prefix=SUNK %s
-
-; Check that various edge cases do not crash the compiler
-
-; Multiple uses of sunk valu, chain of sink candidates
-
-define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) {
-; SUNK-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
-; SUNK: ; %bb.0:
-; SUNK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SUNK-NEXT: v_mov_b32_e32 v3, v0
-; SUNK-NEXT: v_and_b32_e32 v0, -4, v3
-; SUNK-NEXT: global_load_dword v4, v[0:1], off
-; SUNK-NEXT: v_and_b32_e32 v3, 3, v3
-; SUNK-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; SUNK-NEXT: s_mov_b32 s2, 0xffff
-; SUNK-NEXT: v_lshlrev_b32_e64 v5, v3, s2
-; SUNK-NEXT: s_mov_b64 s[0:1], 0
-; SUNK-NEXT: v_not_b32_e32 v5, v5
-; SUNK-NEXT: v_max_f16_e32 v2, v2, v2
-; SUNK-NEXT: .LBB0_1: ; %atomicrmw.start
-; SUNK-NEXT: ; =>This Inner Loop Header: Depth=1
-; SUNK-NEXT: s_waitcnt vmcnt(0)
-; SUNK-NEXT: v_mov_b32_e32 v7, v4
-; SUNK-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; SUNK-NEXT: v_max_f16_e32 v4, v4, v4
-; SUNK-NEXT: v_min_f16_e32 v4, v4, v2
-; SUNK-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; SUNK-NEXT: v_and_or_b32 v6, v7, v5, v4
-; SUNK-NEXT: buffer_wbl2 sc1
-; SUNK-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0
-; SUNK-NEXT: s_waitcnt vmcnt(0)
-; SUNK-NEXT: buffer_inv sc1
-; SUNK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
-; SUNK-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SUNK-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; SUNK-NEXT: s_cbranch_execnz .LBB0_1
-; SUNK-NEXT: ; %bb.2: ; %atomicrmw.end
-; SUNK-NEXT: s_or_b64 exec, exec, s[0:1]
-; SUNK-NEXT: v_lshrrev_b32_e32 v0, v3, v4
-; SUNK-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst
- ret half %result
-}
-
-; Sink candidates with multiple defs
-
-define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) {
-; SUNK-LABEL: memmove_p5_p5:
-; SUNK: ; %bb.0: ; %entry
-; SUNK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SUNK-NEXT: v_and_b32_e32 v4, 15, v2
-; SUNK-NEXT: v_mov_b32_e32 v5, 0
-; SUNK-NEXT: v_and_b32_e32 v6, -16, v2
-; SUNK-NEXT: v_mov_b32_e32 v7, v3
-; SUNK-NEXT: v_cmp_ne_u64_e64 s[0:1], 0, v[4:5]
-; SUNK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; SUNK-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v0
-; SUNK-NEXT: s_and_saveexec_b64 s[4:5], s[2:3]
-; SUNK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; SUNK-NEXT: s_cbranch_execnz .LBB1_3
-; SUNK-NEXT: ; %bb.1: ; %Flow46
-; SUNK-NEXT: s_andn2_saveexec_b64 s[2:3], s[4:5]
-; SUNK-NEXT: s_cbranch_execnz .LBB1_10
-; SUNK-NEXT: .LBB1_2: ; %Flow47
-; SUNK-NEXT: s_or_b64 exec, exec, s[2:3]
-; SUNK-NEXT: s_waitcnt vmcnt(0)
-; SUNK-NEXT: s_setpc_b64 s[30:31]
-; SUNK-NEXT: .LBB1_3: ; %memmove_copy_forward
-; SUNK-NEXT: s_and_saveexec_b64 s[6:7], vcc
-; SUNK-NEXT: s_cbranch_execz .LBB1_6
-; SUNK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
-; SUNK-NEXT: s_mov_b64 s[8:9], 0
-; SUNK-NEXT: v_mov_b32_e32 v3, v1
-; SUNK-NEXT: v_mov_b32_e32 v8, v0
-; SUNK-NEXT: .LBB1_5: ; %memmove_fwd_main_loop
-; SUNK-NEXT: ; =>This Inner Loop Header: Depth=1
-; SUNK-NEXT: scratch_load_dwordx4 v[10:13], v3, off
-; SUNK-NEXT: v_lshl_add_u64 v[6:7], v[6:7], 0, -16
-; SUNK-NEXT: v_cmp_eq_u64_e64 s[2:3], 0, v[6:7]
-; SUNK-NEXT: v_add_u32_e32 v3, 16, v3
-; SUNK-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9]
-; SUNK-NEXT: s_waitcnt vmcnt(0)
-; SUNK-NEXT: scratch_store_dwordx4 v8, v[10:13], off
-; SUNK-NEXT: v_add_u32_e32 v8, 16, v8
-; SUNK-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SUNK-NEXT: s_cbranch_execnz .LBB1_5
-; SUNK-NEXT: .LBB1_6: ; %Flow41
-; SUNK-NEXT: s_or_b64 exec, exec, s[6:7]
-; SUNK-NEXT: s_and_saveexec_b64 s[6:7], s[0:1]
-; SUNK-NEXT: s_cbranch_execz .LBB1_9
-; SUNK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; SUNK-NEXT: v_and_b32_e32 v2, -16, v2
-; SUNK-NEXT: v_add_u32_e32 v0, v0, v2
-; SUNK-NEXT: v_add_u32_e32 v1, v1, v2
-; SUNK-NEXT: s_mov_b64 s[8:9], 0
-; SUNK-NEXT: .LBB1_8: ; %memmove_fwd_residual_loop
-; SUNK-NEXT: ; =>This Inner Loop Header: Depth=1
-; SUNK-NEXT: scratch_load_ubyte v2, v1, off
-; SUNK-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, -1
-; SUNK-NEXT: v_cmp_eq_u64_e64 s[2:3], 0, v[4:5]
-; SUNK-NEXT: v_add_u32_e32 v1, 1, v1
-; SUNK-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9]
-; SUNK-NEXT: s_waitcnt vmcnt(0)
-; SUNK-NEXT: scratch_store_byte v0, v2, off
-; SUNK-NEXT: v_add_u32_e32 v0, 1, v0
-; SUNK-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SUNK-NEXT: s_cbranch_execnz .LBB1_8
-; SUNK-NEXT: .LBB1_9: ; %Flow39
-; SUNK-NEXT: s_or_b64 exec, exec, s[6:7]
-; SUNK-NEXT: ; implicit-def: $vgpr2_vgpr3
-; SUNK-NEXT: ; implicit-def: $vgpr0
-; SUNK-NEXT: ; implicit-def: $vgpr1
-; SUNK-NEXT: ; implicit-def: $vgpr4_vgpr5
-; SUNK-NEXT: s_andn2_saveexec_b64 s[2:3], s[4:5]
-; SUNK-NEXT: s_cbranch_execz .LBB1_2
-; SUNK-NEXT: .LBB1_10: ; %memmove_copy_backwards
-; SUNK-NEXT: s_and_saveexec_b64 s[4:5], s[0:1]
-; SUNK-NEXT: s_cbranch_execz .LBB1_13
-; SUNK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
-; SUNK-NEXT: v_add_u32_e32 v7, -1, v2
-; SUNK-NEXT: v_add_u32_e32 v6, v0, v7
-; SUNK-NEXT: v_add_u32_e32 v7, v1, v7
-; SUNK-NEXT: s_mov_b64 s[6:7], 0
-; SUNK-NEXT: .LBB1_12: ; %memmove_bwd_residual_loop
-; SUNK-NEXT: ; =>This Inner Loop Header: Depth=1
-; SUNK-NEXT: scratch_load_ubyte v8, v7, off
-; SUNK-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, -1
-; SUNK-NEXT: v_cmp_eq_u64_e64 s[0:1], 0, v[4:5]
-; SUNK-NEXT: v_add_u32_e32 v7, -1, v7
-; SUNK-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7]
-; SUNK-NEXT: s_waitcnt vmcnt(0)
-; SUNK-NEXT: scratch_store_byte v6, v8, off
-; SUNK-NEXT: v_add_u32_e32 v6, -1, v6
-; SUNK-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; SUNK-NEXT: s_cbranch_execnz .LBB1_12
-; SUNK-NEXT: .LBB1_13: ; %Flow45
-; SUNK-NEXT: s_or_b64 exec, exec, s[4:5]
-; SUNK-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; SUNK-NEXT: s_cbranch_execz .LBB1_16
-; SUNK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; SUNK-NEXT: v_and_b32_e32 v5, -16, v2
-; SUNK-NEXT: v_add_u32_e32 v4, -16, v5
-; SUNK-NEXT: v_add_u32_e32 v2, v0, v4
-; SUNK-NEXT: v_sub_co_u32_e32 v0, vcc, 0, v5
-; SUNK-NEXT: v_add_u32_e32 v4, v1, v4
-; SUNK-NEXT: s_mov_b64 s[4:5], 0
-; SUNK-NEXT: v_subb_co_u32_e32 v1, vcc, 0, v3, vcc
-; SUNK-NEXT: .LBB1_15: ; %memmove_bwd_main_loop
-; SUNK-NEXT: ; =>This Inner Loop Header: Depth=1
-; SUNK-NEXT: scratch_load_dwordx4 v[6:9], v4, off
-; SUNK-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 16
-; SUNK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; SUNK-NEXT: v_add_u32_e32 v4, -16, v4
-; SUNK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; SUNK-NEXT: s_waitcnt vmcnt(0)
-; SUNK-NEXT: scratch_store_dwordx4 v2, v[6:9], off
-; SUNK-NEXT: v_add_u32_e32 v2, -16, v2
-; SUNK-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; SUNK-NEXT: s_cbranch_execnz .LBB1_15
-; SUNK-NEXT: .LBB1_16: ; %Flow43
-; SUNK-NEXT: s_or_b64 exec, exec, s[0:1]
-; SUNK-NEXT: s_or_b64 exec, exec, s[2:3]
-; SUNK-NEXT: s_waitcnt vmcnt(0)
-; SUNK-NEXT: s_setpc_b64 s[30:31]
-entry:
- tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false)
- ret void
-}
-
-; We should not sink the mfma into the if/else as it is convergent
-
-define void @convergent_sink(<4 x i16> %in0, <4 x i16> %in1, i32 %val, i32 %v, ptr addrspace(1) %outptr) #2 {
-; SUNK-LABEL: convergent_sink:
-; SUNK: ; %bb.0: ; %entry
-; SUNK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SUNK-NEXT: v_mfma_f32_32x32x8_bf16 a[0:15], v[0:1], v[2:3], 0
-; SUNK-NEXT: v_lshl_add_u32 v0, v5, 1, v5
-; SUNK-NEXT: v_lshlrev_b32_e32 v2, 1, v5
-; SUNK-NEXT: s_mov_b32 s4, 0
-; SUNK-NEXT: s_mov_b64 s[0:1], 0
-; SUNK-NEXT: v_mov_b32_e32 v5, 0xde
-; SUNK-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; SUNK-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; SUNK-NEXT: s_branch .LBB2_2
-; SUNK-NEXT: .LBB2_1: ; %end
-; SUNK-NEXT: ; in Loop: Header=BB2_2 Depth=1
-; SUNK-NEXT: v_cmp_eq_u32_e32 vcc, v8, v4
-; SUNK-NEXT: s_add_i32 s4, s4, 1
-; SUNK-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SUNK-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; SUNK-NEXT: s_cbranch_execz .LBB2_7
-; SUNK-NEXT: .LBB2_2: ; %loop.body
-; SUNK-NEXT: ; =>This Inner Loop Header: Depth=1
-; SUNK-NEXT: s_cmp_lt_i32 s4, 6
-; SUNK-NEXT: global_store_dword v[6:7], v5, off
-; SUNK-NEXT: s_cbranch_scc0 .LBB2_4
-; SUNK-NEXT: ; %bb.3: ; %else
-; SUNK-NEXT: ; in Loop: Header=BB2_2 Depth=1
-; SUNK-NEXT: v_lshl_add_u64 v[8:9], v[0:1], 3, v[6:7]
-; SUNK-NEXT: global_store_dwordx4 v[8:9], a[12:15], off offset:48
-; SUNK-NEXT: global_store_dwordx4 v[8:9], a[8:11], off offset:32
-; SUNK-NEXT: global_store_dwordx4 v[8:9], a[4:7], off offset:16
-; SUNK-NEXT: global_store_dwordx4 v[8:9], a[0:3], off
-; SUNK-NEXT: s_mov_b64 s[2:3], 0
-; SUNK-NEXT: s_branch .LBB2_5
-; SUNK-NEXT: .LBB2_4: ; in Loop: Header=BB2_2 Depth=1
-; SUNK-NEXT: s_mov_b64 s[2:3], -1
-; SUNK-NEXT: .LBB2_5: ; %Flow
-; SUNK-NEXT: ; in Loop: Header=BB2_2 Depth=1
-; SUNK-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; SUNK-NEXT: v_mov_b32_e32 v8, v0
-; SUNK-NEXT: s_cbranch_vccnz .LBB2_1
-; SUNK-NEXT: ; %bb.6: ; %if
-; SUNK-NEXT: ; in Loop: Header=BB2_2 Depth=1
-; SUNK-NEXT: v_lshl_add_u64 v[8:9], v[2:3], 3, v[6:7]
-; SUNK-NEXT: global_store_dwordx4 v[8:9], a[12:15], off offset:48
-; SUNK-NEXT: global_store_dwordx4 v[8:9], a[8:11], off offset:32
-; SUNK-NEXT: global_store_dwordx4 v[8:9], a[4:7], off offset:16
-; SUNK-NEXT: global_store_dwordx4 v[8:9], a[0:3], off
-; SUNK-NEXT: v_mov_b32_e32 v8, v2
-; SUNK-NEXT: s_branch .LBB2_1
-; SUNK-NEXT: .LBB2_7: ; %exit
-; SUNK-NEXT: s_or_b64 exec, exec, s[0:1]
-; SUNK-NEXT: global_store_dwordx4 v[6:7], a[12:15], off offset:48
-; SUNK-NEXT: global_store_dwordx4 v[6:7], a[8:11], off offset:32
-; SUNK-NEXT: global_store_dwordx4 v[6:7], a[4:7], off offset:16
-; SUNK-NEXT: global_store_dwordx4 v[6:7], a[0:3], off
-; SUNK-NEXT: s_waitcnt vmcnt(0)
-; SUNK-NEXT: s_setpc_b64 s[30:31]
-entry:
- %1005 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %in0, <4 x i16> %in1, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
- br label %loop.body
-
-loop.body:
- %i = phi i32 [0, %entry], [%i.inc, %end]
- store i32 222, ptr addrspace(1) %outptr
- %cc = icmp sgt i32 %i, 5
- br i1 %cc, label %if, label %else
-
-if:
- %v.if = mul i32 %v, 2
- %sptr.if = getelementptr <4 x i16>, ptr addrspace(1) %outptr, i32 %v.if
- store <16 x float> %1005, ptr addrspace(1) %sptr.if
- br label %end
-
-else:
- %v.else = mul i32 %v, 3
- %sptr.else = getelementptr <4 x i16>, ptr addrspace(1) %outptr, i32 %v.else
- store <16 x float> %1005, ptr addrspace(1) %sptr.else
- br label %end
-
-end:
- %r = phi i32 [ %v.if, %if ], [ %v.else, %else ]
- %cmp = icmp ne i32 %r, %val
- %i.inc = add i32 %i, 1
- br i1 %cmp, label %loop.body, label %exit
-
-exit:
- store <16 x float> %1005, ptr addrspace(1) %outptr
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-aggressive-latency.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-aggressive-latency.mir
deleted file mode 100644
index b5296a85b31352..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-aggressive-latency.mir
+++ /dev/null
@@ -1,107 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o - %s | FileCheck -check-prefixes=GFX10-SUNK %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o - %s | FileCheck -check-prefixes=GFX9-SUNK %s
-
----
-name: latency_cycle_sink
-tracksRegLiveness: true
-body: |
- ; GFX10-SUNK-LABEL: name: latency_cycle_sink
- ; GFX10-SUNK: bb.0:
- ; GFX10-SUNK-NEXT: successors: %bb.1(0x80000000)
- ; GFX10-SUNK-NEXT: liveins: $vgpr4, $vgpr5
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; GFX10-SUNK-NEXT: [[V_PK_MUL_LO_U16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[DEF]], 8, [[DEF]], 0, 0, 0, 0, 0, implicit $exec
- ; GFX10-SUNK-NEXT: S_BRANCH %bb.1
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: bb.1:
- ; GFX10-SUNK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit undef $scc
- ; GFX10-SUNK-NEXT: S_BRANCH %bb.2
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: bb.2:
- ; GFX10-SUNK-NEXT: successors: %bb.4(0x80000000)
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_]], 8, [[V_PK_MUL_LO_U16_]], 0, 0, 0, 0, 0, implicit $exec
- ; GFX10-SUNK-NEXT: S_BRANCH %bb.4
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: bb.3:
- ; GFX10-SUNK-NEXT: successors: %bb.4(0x80000000)
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_]], 8, [[V_PK_MUL_LO_U16_]], 0, 0, 0, 0, 0, implicit $exec
- ; GFX10-SUNK-NEXT: S_BRANCH %bb.4
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: bb.4:
- ; GFX10-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
- ; GFX10-SUNK-NEXT: S_BRANCH %bb.5
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: bb.5:
- ; GFX10-SUNK-NEXT: S_ENDPGM 0
- ;
- ; GFX9-SUNK-LABEL: name: latency_cycle_sink
- ; GFX9-SUNK: bb.0:
- ; GFX9-SUNK-NEXT: successors: %bb.1(0x80000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr4, $vgpr5
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.1:
- ; GFX9-SUNK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit undef $scc
- ; GFX9-SUNK-NEXT: S_BRANCH %bb.2
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.2:
- ; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: [[V_PK_MUL_LO_U16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[DEF]], 8, [[DEF]], 0, 0, 0, 0, 0, implicit $exec
- ; GFX9-SUNK-NEXT: [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_]], 8, [[V_PK_MUL_LO_U16_]], 0, 0, 0, 0, 0, implicit $exec
- ; GFX9-SUNK-NEXT: S_BRANCH %bb.4
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.3:
- ; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: [[V_PK_MUL_LO_U16_2:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[DEF]], 8, [[DEF]], 0, 0, 0, 0, 0, implicit $exec
- ; GFX9-SUNK-NEXT: [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_2]], 8, [[V_PK_MUL_LO_U16_2]], 0, 0, 0, 0, 0, implicit $exec
- ; GFX9-SUNK-NEXT: S_BRANCH %bb.4
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.4:
- ; GFX9-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
- ; GFX9-SUNK-NEXT: S_BRANCH %bb.5
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.5:
- ; GFX9-SUNK-NEXT: S_ENDPGM 0
- bb.0:
- successors: %bb.1(0x80000000)
- liveins: $vgpr4, $vgpr5
- %83:vgpr_32 = IMPLICIT_DEF
- %80:vgpr_32 = V_PK_MUL_LO_U16 8, %83, 8, %83, 0, 0, 0, 0, 0, implicit $exec
- S_BRANCH %bb.1
-
- bb.1:
- S_CBRANCH_SCC1 %bb.3, implicit undef $scc
- S_BRANCH %bb.2
-
-
- bb.2:
- %90:vgpr_32 = V_PK_MUL_LO_U16 8, %80, 8, %80, 0, 0, 0, 0, 0, implicit $exec
- S_BRANCH %bb.4
-
- bb.3:
- %90:vgpr_32 = V_PK_MUL_LO_U16 8, %80, 8, %80, 0, 0, 0, 0, 0, implicit $exec
- S_BRANCH %bb.4
-
- bb.4:
- S_CBRANCH_SCC1 %bb.1, implicit undef $scc
- S_BRANCH %bb.5
-
- bb.5:
- S_ENDPGM 0
-...
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-cycle.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-cycle.mir
new file mode 100644
index 00000000000000..bca1517ed183ac
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-cycle.mir
@@ -0,0 +1,1272 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 --sink-insts-to-avoid-spills=1 --stop-after=machine-sink -o - %s | FileCheck -check-prefixes=GFX9-SUNK %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 --sink-insts-to-avoid-spills=1 -mattr=+wavefrontsize64 --stop-after=machine-sink -o - %s | FileCheck -check-prefixes=GFX10-SUNK %s
+
+---
+name: test_sink_copy
+alignment: 1
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GFX9-SUNK-LABEL: name: test_sink_copy
+ ; GFX9-SUNK: bb.0:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[DEF:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF3:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF4:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.3
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+ ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+ ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+ ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+ ; GFX9-SUNK-NEXT: [[COPY4:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY]], implicit [[COPY1]], implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY4]]
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[COPY5:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+ ; GFX9-SUNK-NEXT: [[COPY6:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+ ; GFX9-SUNK-NEXT: [[COPY7:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+ ; GFX9-SUNK-NEXT: [[COPY8:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+ ; GFX9-SUNK-NEXT: [[COPY9:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY5]], implicit [[COPY6]], implicit [[COPY7]], implicit [[COPY8]], implicit [[COPY9]]
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.4:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.5
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.5:
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.6.entry:
+ ;
+ ; GFX10-SUNK-LABEL: name: test_sink_copy
+ ; GFX10-SUNK: bb.0:
+ ; GFX10-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF2:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF3:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF4:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.1:
+ ; GFX10-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.3
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.2:
+ ; GFX10-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[COPY:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+ ; GFX10-SUNK-NEXT: [[COPY1:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+ ; GFX10-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+ ; GFX10-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+ ; GFX10-SUNK-NEXT: [[COPY4:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY]], implicit [[COPY1]], implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY4]]
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.3:
+ ; GFX10-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[COPY5:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+ ; GFX10-SUNK-NEXT: [[COPY6:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+ ; GFX10-SUNK-NEXT: [[COPY7:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+ ; GFX10-SUNK-NEXT: [[COPY8:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+ ; GFX10-SUNK-NEXT: [[COPY9:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY5]], implicit [[COPY6]], implicit [[COPY7]], implicit [[COPY8]], implicit [[COPY9]]
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.4:
+ ; GFX10-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.5
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.5:
+ ; GFX10-SUNK-NEXT: S_ENDPGM 0
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.6.entry:
+ bb.0:
+ %0:vreg_256 = IMPLICIT_DEF
+ %1:vreg_256 = IMPLICIT_DEF
+ %2:vreg_256 = IMPLICIT_DEF
+ %3:vreg_256 = IMPLICIT_DEF
+ %4:vreg_256 = IMPLICIT_DEF
+ %5:vreg_256 = COPY %4
+ %6:vreg_256 = COPY %4
+ %7:vreg_256 = COPY %4
+ %8:vreg_256 = COPY %4
+ %9:vreg_256 = COPY %4
+
+
+ S_BRANCH %bb.1
+
+ bb.1:
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+ S_BRANCH %bb.3
+
+ bb.2:
+ INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+ INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9
+ S_BRANCH %bb.4
+
+ bb.3:
+ INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+ INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9
+ S_BRANCH %bb.4
+
+ bb.4:
+ S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ S_BRANCH %bb.5
+
+ bb.5:
+ S_ENDPGM 0
+...
+
+# For gfx9, after sinking the copies, pressure is within the desired limit
+
+---
+name: test_sink_multi_stage
+alignment: 1
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GFX9-SUNK-LABEL: name: test_sink_multi_stage
+ ; GFX9-SUNK: bb.0:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[DEF:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF3:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF4:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.3
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+ ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+ ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+ ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+ ; GFX9-SUNK-NEXT: [[COPY4:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY]], implicit [[COPY1]], implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY4]]
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]]
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[COPY5:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+ ; GFX9-SUNK-NEXT: [[COPY6:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+ ; GFX9-SUNK-NEXT: [[COPY7:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+ ; GFX9-SUNK-NEXT: [[COPY8:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+ ; GFX9-SUNK-NEXT: [[COPY9:%[0-9]+]]:vreg_256_align2 = COPY [[DEF4]]
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY5]], implicit [[COPY6]], implicit [[COPY7]], implicit [[COPY8]], implicit [[COPY9]]
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]]
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.4:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.5
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.5:
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.6.entry:
+ ;
+ ; GFX10-SUNK-LABEL: name: test_sink_multi_stage
+ ; GFX10-SUNK: bb.0:
+ ; GFX10-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF2:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF3:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF4:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.1:
+ ; GFX10-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.3
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.2:
+ ; GFX10-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[COPY:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+ ; GFX10-SUNK-NEXT: [[COPY1:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+ ; GFX10-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+ ; GFX10-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+ ; GFX10-SUNK-NEXT: [[COPY4:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY]], implicit [[COPY1]], implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY4]]
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]]
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.3:
+ ; GFX10-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[COPY5:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+ ; GFX10-SUNK-NEXT: [[COPY6:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+ ; GFX10-SUNK-NEXT: [[COPY7:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+ ; GFX10-SUNK-NEXT: [[COPY8:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+ ; GFX10-SUNK-NEXT: [[COPY9:%[0-9]+]]:vreg_256 = COPY [[DEF4]]
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[COPY5]], implicit [[COPY6]], implicit [[COPY7]], implicit [[COPY8]], implicit [[COPY9]]
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]]
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.4:
+ ; GFX10-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.5
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.5:
+ ; GFX10-SUNK-NEXT: S_ENDPGM 0
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.6.entry:
+ bb.0:
+ %0:vreg_256 = IMPLICIT_DEF
+ %1:vreg_256 = IMPLICIT_DEF
+ %2:vreg_256 = IMPLICIT_DEF
+ %3:vreg_256 = IMPLICIT_DEF
+ %4:vreg_256 = IMPLICIT_DEF
+ %5:vreg_256 = COPY %4
+ %6:vreg_256 = COPY %4
+ %7:vreg_256 = COPY %4
+ %8:vreg_256 = COPY %4
+ %9:vreg_256 = COPY %4
+ %10:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %11:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %12:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %13:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %14:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+ S_BRANCH %bb.3
+
+ bb.2:
+ INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+ INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9
+ INLINEASM &"", 1, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14
+ S_BRANCH %bb.4
+
+ bb.3:
+ INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+ INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9
+ INLINEASM &"", 1, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14
+ S_BRANCH %bb.4
+
+ bb.4:
+ S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ S_BRANCH %bb.5
+
+ bb.5:
+ S_ENDPGM 0
+...
+
+---
+name: test_sink_low_rp
+alignment: 1
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GFX9-SUNK-LABEL: name: test_sink_low_rp
+ ; GFX9-SUNK: bb.0:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[DEF:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF3:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF4:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.3
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]]
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]]
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.4:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.5
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.5:
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.6.entry:
+ ;
+ ; GFX10-SUNK-LABEL: name: test_sink_low_rp
+ ; GFX10-SUNK: bb.0:
+ ; GFX10-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF2:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF3:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF4:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.1:
+ ; GFX10-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.3
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.2:
+ ; GFX10-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]]
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.3:
+ ; GFX10-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]], implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_28]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_31]], implicit [[V_ADD_U32_e64_32]], implicit [[V_ADD_U32_e64_33]], implicit [[V_ADD_U32_e64_34]], implicit [[V_ADD_U32_e64_35]], implicit [[V_ADD_U32_e64_36]], implicit [[V_ADD_U32_e64_37]], implicit [[V_ADD_U32_e64_38]], implicit [[V_ADD_U32_e64_39]], implicit [[V_ADD_U32_e64_40]], implicit [[V_ADD_U32_e64_41]]
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.4:
+ ; GFX10-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.5
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.5:
+ ; GFX10-SUNK-NEXT: S_ENDPGM 0
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.6.entry:
+ bb.0:
+ %0:vreg_256 = IMPLICIT_DEF
+ %1:vreg_256 = IMPLICIT_DEF
+ %2:vreg_256 = IMPLICIT_DEF
+ %3:vreg_256 = IMPLICIT_DEF
+ %4:vreg_256 = IMPLICIT_DEF
+ %5:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %6:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %7:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %8:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %9:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %10:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %11:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %12:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %13:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %14:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %15:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %16:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %17:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %18:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %19:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %20:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %21:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %22:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %23:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %24:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %25:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+
+ S_BRANCH %bb.1
+
+ bb.1:
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+ S_BRANCH %bb.3
+
+ bb.2:
+ INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+ INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25
+ S_BRANCH %bb.4
+
+ bb.3:
+ INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+ INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25
+ S_BRANCH %bb.4
+
+ bb.4:
+ S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ S_BRANCH %bb.5
+
+ bb.5:
+ S_ENDPGM 0
+...
+
+---
+name: test_sink_high_rp
+alignment: 1
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GFX9-SUNK-LABEL: name: test_sink_high_rp
+ ; GFX9-SUNK: bb.0:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[DEF:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF3:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF4:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.3
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_42:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_43:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_44:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_45:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_46:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_47:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_48:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_49:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_50:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_51:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_52:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_53:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_28]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_31]], implicit [[V_ADD_U32_e64_32]], implicit [[V_ADD_U32_e64_33]], implicit [[V_ADD_U32_e64_34]], implicit [[V_ADD_U32_e64_35]], implicit [[V_ADD_U32_e64_36]], implicit [[V_ADD_U32_e64_37]], implicit [[V_ADD_U32_e64_38]], implicit [[V_ADD_U32_e64_39]], implicit [[V_ADD_U32_e64_40]], implicit [[V_ADD_U32_e64_41]], implicit [[V_ADD_U32_e64_42]], implicit [[V_ADD_U32_e64_43]], implicit [[V_ADD_U32_e64_44]], implicit [[V_ADD_U32_e64_45]], implicit [[V_ADD_U32_e64_46]], implicit [[V_ADD_U32_e64_47]], implicit [[V_ADD_U32_e64_48]], implicit [[V_ADD_U32_e64_49]], implicit [[V_ADD_U32_e64_50]], implicit [[V_ADD_U32_e64_51]], implicit [[V_ADD_U32_e64_52]], implicit [[V_ADD_U32_e64_53]]
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.4:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.5
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.5:
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.6.entry:
+ ;
+ ; GFX10-SUNK-LABEL: name: test_sink_high_rp
+ ; GFX10-SUNK: bb.0:
+ ; GFX10-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF2:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF3:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF4:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.1:
+ ; GFX10-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.3
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.2:
+ ; GFX10-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.3:
+ ; GFX10-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_42:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_43:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_44:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_45:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_46:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_47:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_48:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_49:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_50:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_51:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_52:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_53:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_28]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_31]], implicit [[V_ADD_U32_e64_32]], implicit [[V_ADD_U32_e64_33]], implicit [[V_ADD_U32_e64_34]], implicit [[V_ADD_U32_e64_35]], implicit [[V_ADD_U32_e64_36]], implicit [[V_ADD_U32_e64_37]], implicit [[V_ADD_U32_e64_38]], implicit [[V_ADD_U32_e64_39]], implicit [[V_ADD_U32_e64_40]], implicit [[V_ADD_U32_e64_41]], implicit [[V_ADD_U32_e64_42]], implicit [[V_ADD_U32_e64_43]], implicit [[V_ADD_U32_e64_44]], implicit [[V_ADD_U32_e64_45]], implicit [[V_ADD_U32_e64_46]], implicit [[V_ADD_U32_e64_47]], implicit [[V_ADD_U32_e64_48]], implicit [[V_ADD_U32_e64_49]], implicit [[V_ADD_U32_e64_50]], implicit [[V_ADD_U32_e64_51]], implicit [[V_ADD_U32_e64_52]], implicit [[V_ADD_U32_e64_53]]
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.4:
+ ; GFX10-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.5
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.5:
+ ; GFX10-SUNK-NEXT: S_ENDPGM 0
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.6.entry:
+ bb.0:
+ %0:vreg_256 = IMPLICIT_DEF
+ %1:vreg_256 = IMPLICIT_DEF
+ %2:vreg_256 = IMPLICIT_DEF
+ %3:vreg_256 = IMPLICIT_DEF
+ %4:vreg_256 = IMPLICIT_DEF
+ %5:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %6:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %7:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %8:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %9:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %10:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %11:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %12:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %13:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %14:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %15:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %16:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %17:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %18:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %19:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %20:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %21:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %22:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %23:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %24:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %25:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %26:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %27:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %28:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %29:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub3:vreg_256, 0, implicit $exec
+ %30:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub3:vreg_256, 0, implicit $exec
+ %31:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub3:vreg_256, 0, implicit $exec
+
+ S_BRANCH %bb.1
+
+ bb.1:
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+ S_BRANCH %bb.3
+
+ bb.2:
+ INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+ INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, implicit %31
+ S_BRANCH %bb.4
+
+ bb.3:
+ INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+ INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, implicit %31
+ S_BRANCH %bb.4
+
+ bb.4:
+ S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ S_BRANCH %bb.5
+
+ bb.5:
+ S_ENDPGM 0
+...
+
+# Do not sink convergent operations (e.g. MFMA) into the cycle.
+
+---
+name: test_sink_convergent
+alignment: 1
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GFX9-SUNK-LABEL: name: test_sink_convergent
+ ; GFX9-SUNK: bb.0:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[DEF:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF3:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF4:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF6:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 [[DEF5]], [[DEF7]], [[DEF6]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF8]], implicit [[V_MFMA_F32_4X4X1F32_e64_]]
+ ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.3
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_42:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_43:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_44:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_45:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_46:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_47:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_48:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_49:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_50:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_51:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_52:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_53:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_28]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_31]], implicit [[V_ADD_U32_e64_32]], implicit [[V_ADD_U32_e64_33]], implicit [[V_ADD_U32_e64_34]], implicit [[V_ADD_U32_e64_35]], implicit [[V_ADD_U32_e64_36]], implicit [[V_ADD_U32_e64_37]], implicit [[V_ADD_U32_e64_38]], implicit [[V_ADD_U32_e64_39]], implicit [[V_ADD_U32_e64_40]], implicit [[V_ADD_U32_e64_41]], implicit [[V_ADD_U32_e64_42]], implicit [[V_ADD_U32_e64_43]], implicit [[V_ADD_U32_e64_44]], implicit [[V_ADD_U32_e64_45]], implicit [[V_ADD_U32_e64_46]], implicit [[V_ADD_U32_e64_47]], implicit [[V_ADD_U32_e64_48]], implicit [[V_ADD_U32_e64_49]], implicit [[V_ADD_U32_e64_50]], implicit [[V_ADD_U32_e64_51]], implicit [[V_ADD_U32_e64_52]], implicit [[V_ADD_U32_e64_53]]
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.4:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.5
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.5:
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.6.entry:
+ ;
+ ; GFX10-SUNK-LABEL: name: test_sink_convergent
+ ; GFX10-SUNK: bb.0:
+ ; GFX10-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF2:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF3:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF4:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF6:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 [[DEF5]], [[DEF7]], [[DEF6]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.1:
+ ; GFX10-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF8]], implicit [[V_MFMA_F32_4X4X1F32_e64_]]
+ ; GFX10-SUNK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.3
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.2:
+ ; GFX10-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.3:
+ ; GFX10-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_42:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_43:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_44:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_45:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_46:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_47:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_48:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_49:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_50:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_51:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_52:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_53:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_27]], implicit [[V_ADD_U32_e64_28]], implicit [[V_ADD_U32_e64_29]], implicit [[V_ADD_U32_e64_30]], implicit [[V_ADD_U32_e64_31]], implicit [[V_ADD_U32_e64_32]], implicit [[V_ADD_U32_e64_33]], implicit [[V_ADD_U32_e64_34]], implicit [[V_ADD_U32_e64_35]], implicit [[V_ADD_U32_e64_36]], implicit [[V_ADD_U32_e64_37]], implicit [[V_ADD_U32_e64_38]], implicit [[V_ADD_U32_e64_39]], implicit [[V_ADD_U32_e64_40]], implicit [[V_ADD_U32_e64_41]], implicit [[V_ADD_U32_e64_42]], implicit [[V_ADD_U32_e64_43]], implicit [[V_ADD_U32_e64_44]], implicit [[V_ADD_U32_e64_45]], implicit [[V_ADD_U32_e64_46]], implicit [[V_ADD_U32_e64_47]], implicit [[V_ADD_U32_e64_48]], implicit [[V_ADD_U32_e64_49]], implicit [[V_ADD_U32_e64_50]], implicit [[V_ADD_U32_e64_51]], implicit [[V_ADD_U32_e64_52]], implicit [[V_ADD_U32_e64_53]]
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.4:
+ ; GFX10-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.5
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.5:
+ ; GFX10-SUNK-NEXT: S_ENDPGM 0
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.6.entry:
+ bb.0:
+ %0:vreg_256 = IMPLICIT_DEF
+ %1:vreg_256 = IMPLICIT_DEF
+ %2:vreg_256 = IMPLICIT_DEF
+ %3:vreg_256 = IMPLICIT_DEF
+ %4:vreg_256 = IMPLICIT_DEF
+ %5:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %6:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %7:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %8:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %9:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %10:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %11:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %12:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %13:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %14:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %15:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %16:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %17:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %18:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %19:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %20:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %21:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %22:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %23:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %24:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %25:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %26:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %27:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %28:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %29:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub3:vreg_256, 0, implicit $exec
+ %30:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub3:vreg_256, 0, implicit $exec
+ %31:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub3:vreg_256, 0, implicit $exec
+ %40:vgpr_32 = IMPLICIT_DEF
+ %41:areg_128_align2 = IMPLICIT_DEF
+ %42:vgpr_32 = IMPLICIT_DEF
+ %43:vgpr_32 = IMPLICIT_DEF
+ %44:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 %40, %42, %41, 0, 0, 0, implicit $mode, implicit $exec
+
+
+ S_BRANCH %bb.1
+
+ bb.1:
+ INLINEASM &"", 1, implicit %43, implicit %44
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+ S_BRANCH %bb.3
+
+ bb.2:
+ INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+ INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, implicit %31
+ S_BRANCH %bb.4
+
+ bb.3:
+ INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+ INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, implicit %31
+ S_BRANCH %bb.4
+
+ bb.4:
+ S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ S_BRANCH %bb.5
+
+ bb.5:
+ S_ENDPGM 0
+...
+
+# Do not sink instructions with multiple defs
+
+---
+name: test_sink_multi_def
+alignment: 1
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GFX9-SUNK-LABEL: name: test_sink_multi_def
+ ; GFX9-SUNK: bb.0:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[DEF:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF1:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF3:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[DEF4:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF]].sub2, [[DEF1]].sub4, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_CO_U32_e64_]], implicit [[V_ADD_CO_U32_e64_1]]
+ ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.3
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX9-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.4:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.5
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.5:
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.6.entry:
+ ;
+ ; GFX10-SUNK-LABEL: name: test_sink_multi_def
+ ; GFX10-SUNK: bb.0:
+ ; GFX10-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF2:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF3:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF4:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub1, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub3, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub4, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub5, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub6, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub7, [[DEF1]].sub2, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub0, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub1, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DEF]].sub2, [[DEF1]].sub3, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF]].sub2, [[DEF1]].sub4, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.1:
+ ; GFX10-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_CO_U32_e64_]], implicit [[V_ADD_CO_U32_e64_1]]
+ ; GFX10-SUNK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.3
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.2:
+ ; GFX10-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.3:
+ ; GFX10-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
+ ; GFX10-SUNK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]], implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_2]], implicit [[V_ADD_U32_e64_3]], implicit [[V_ADD_U32_e64_4]], implicit [[V_ADD_U32_e64_5]], implicit [[V_ADD_U32_e64_6]], implicit [[V_ADD_U32_e64_7]], implicit [[V_ADD_U32_e64_8]], implicit [[V_ADD_U32_e64_9]], implicit [[V_ADD_U32_e64_10]], implicit [[V_ADD_U32_e64_11]], implicit [[V_ADD_U32_e64_12]], implicit [[V_ADD_U32_e64_13]], implicit [[V_ADD_U32_e64_14]], implicit [[V_ADD_U32_e64_15]], implicit [[V_ADD_U32_e64_16]], implicit [[V_ADD_U32_e64_17]], implicit [[V_ADD_U32_e64_18]], implicit [[V_ADD_U32_e64_19]], implicit [[V_ADD_U32_e64_20]], implicit [[V_ADD_U32_e64_21]], implicit [[V_ADD_U32_e64_22]], implicit [[V_ADD_U32_e64_23]], implicit [[V_ADD_U32_e64_24]], implicit [[V_ADD_U32_e64_25]], implicit [[V_ADD_U32_e64_26]]
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.4:
+ ; GFX10-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.5
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.5:
+ ; GFX10-SUNK-NEXT: S_ENDPGM 0
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.6.entry:
+ bb.0:
+ %0:vreg_256 = IMPLICIT_DEF
+ %1:vreg_256 = IMPLICIT_DEF
+ %2:vreg_256 = IMPLICIT_DEF
+ %3:vreg_256 = IMPLICIT_DEF
+ %4:vreg_256 = IMPLICIT_DEF
+ %5:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %6:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %7:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %8:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %9:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %10:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %11:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %12:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub0:vreg_256, 0, implicit $exec
+ %13:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %14:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %15:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %16:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %17:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %18:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %19:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %20:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub1:vreg_256, 0, implicit $exec
+ %21:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %22:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %23:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %24:vgpr_32 = V_ADD_U32_e64 %0.sub3:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %25:vgpr_32 = V_ADD_U32_e64 %0.sub4:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %26:vgpr_32 = V_ADD_U32_e64 %0.sub5:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %27:vgpr_32 = V_ADD_U32_e64 %0.sub6:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %28:vgpr_32 = V_ADD_U32_e64 %0.sub7:vreg_256, %1.sub2:vreg_256, 0, implicit $exec
+ %29:vgpr_32 = V_ADD_U32_e64 %0.sub0:vreg_256, %1.sub3:vreg_256, 0, implicit $exec
+ %30:vgpr_32 = V_ADD_U32_e64 %0.sub1:vreg_256, %1.sub3:vreg_256, 0, implicit $exec
+ %31:vgpr_32 = V_ADD_U32_e64 %0.sub2:vreg_256, %1.sub3:vreg_256, 0, implicit $exec
+ %32:vgpr_32, %33:sreg_64_xexec = V_ADD_CO_U32_e64 %0.sub2:vreg_256, %1.sub4:vreg_256, 0, implicit $exec
+
+ S_BRANCH %bb.1
+
+ bb.1:
+ INLINEASM &"", 1, implicit %32, implicit %33
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+ S_BRANCH %bb.3
+
+ bb.2:
+ INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+ INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, implicit %31
+ S_BRANCH %bb.4
+
+ bb.3:
+ INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
+ INLINEASM &"", 1, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, implicit %31
+ S_BRANCH %bb.4
+
+ bb.4:
+ S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ S_BRANCH %bb.5
+
+ bb.5:
+ S_ENDPGM 0
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
index 259abae6d92c87..0fc31ea9d64379 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink -o - %s | FileCheck -check-prefixes=GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o - %s | FileCheck -check-prefixes=GFX9-SUNK %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink --sink-insts-to-avoid-spills=1 -o - %s | FileCheck -check-prefixes=GFX9 %s
---
@@ -50,47 +50,6 @@ body: |
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.3:
; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
- ;
- ; GFX9-SUNK-LABEL: name: test_sink_fmac_to_only_use
- ; GFX9-SUNK: bb.0:
- ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX9-SUNK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX9-SUNK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX9-SUNK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
- ; GFX9-SUNK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
- ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
- ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]]
- ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-SUNK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
- ; GFX9-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; GFX9-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
- ; GFX9-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.1:
- ; GFX9-SUNK-NEXT: successors: %bb.2(0x80000000)
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.2:
- ; GFX9-SUNK-NEXT: successors: %bb.3(0x80000000)
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1
- ; GFX9-SUNK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1
- ; GFX9-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.3:
- ; GFX9-SUNK-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
%1:vgpr_32 = COPY $vgpr0
@@ -174,48 +133,6 @@ body: |
; GFX9-NEXT: bb.3:
; GFX9-NEXT: [[V_ADD_F32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_3]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
- ;
- ; GFX9-SUNK-LABEL: name: test_no_sink_into_if_cond_multiple_uses
- ; GFX9-SUNK: bb.0:
- ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX9-SUNK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX9-SUNK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX9-SUNK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
- ; GFX9-SUNK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
- ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
- ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]]
- ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
- ; GFX9-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; GFX9-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
- ; GFX9-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.1:
- ; GFX9-SUNK-NEXT: successors: %bb.2(0x80000000)
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.2:
- ; GFX9-SUNK-NEXT: successors: %bb.3(0x80000000)
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1
- ; GFX9-SUNK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1
- ; GFX9-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.3:
- ; GFX9-SUNK-NEXT: [[V_ADD_F32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_3]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
%1:vgpr_32 = COPY $vgpr0
@@ -300,48 +217,6 @@ body: |
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.3:
; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
- ;
- ; GFX9-SUNK-LABEL: name: no_sink_fmac_not_constant_mode
- ; GFX9-SUNK: bb.0:
- ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: $mode = IMPLICIT_DEF
- ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX9-SUNK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX9-SUNK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX9-SUNK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
- ; GFX9-SUNK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
- ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
- ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]]
- ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
- ; GFX9-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; GFX9-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
- ; GFX9-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.1:
- ; GFX9-SUNK-NEXT: successors: %bb.2(0x80000000)
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.2:
- ; GFX9-SUNK-NEXT: successors: %bb.3(0x80000000)
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1
- ; GFX9-SUNK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1
- ; GFX9-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.3:
- ; GFX9-SUNK-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
$mode = IMPLICIT_DEF
@@ -414,36 +289,6 @@ body: |
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.3:
; GFX9-NEXT: S_ENDPGM 0, implicit %6
- ;
- ; GFX9-SUNK-LABEL: name: test_no_sink_fmac_wwm
- ; GFX9-SUNK: bb.0:
- ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX9-SUNK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
- ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
- ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: early-clobber %6:vgpr_32 = STRICT_WWM [[V_FMAC_F32_e64_]], implicit $exec
- ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
- ; GFX9-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; GFX9-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY3]](s32), [[S_MOV_B32_]], implicit $exec
- ; GFX9-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.1:
- ; GFX9-SUNK-NEXT: successors: %bb.2(0x80000000)
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.2:
- ; GFX9-SUNK-NEXT: successors: %bb.3(0x80000000)
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]]
- ; GFX9-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.3:
- ; GFX9-SUNK-NEXT: S_ENDPGM 0, implicit %6
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
%1:vgpr_32 = COPY $vgpr0
@@ -539,69 +384,6 @@ body: |
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: S_ENDPGM 0
- ;
- ; GFX9-SUNK-LABEL: name: test_def_and_use_in_loop_sink_fmac
- ; GFX9-SUNK: bb.0.entry:
- ; GFX9-SUNK-NEXT: successors: %bb.1(0x80000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.1:
- ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
- ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
- ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-SUNK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
- ; GFX9-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; GFX9-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
- ; GFX9-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX9-SUNK-NEXT: S_BRANCH %bb.2
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.2:
- ; GFX9-SUNK-NEXT: successors: %bb.3(0x80000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: S_NOP 0
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.3:
- ; GFX9-SUNK-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]]
- ; GFX9-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.4:
- ; GFX9-SUNK-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: S_NOP 0
- ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.5:
- ; GFX9-SUNK-NEXT: successors: %bb.6(0x80000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: S_NOP 0
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.6:
- ; GFX9-SUNK-NEXT: successors: %bb.7(0x04000000), %bb.1(0x7c000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.7:
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: S_ENDPGM 0
bb.0.entry:
successors: %bb.1(0x80000000)
@@ -732,69 +514,6 @@ body: |
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: S_ENDPGM 0
- ;
- ; GFX9-SUNK-LABEL: name: test_no_sink_def_into_loop
- ; GFX9-SUNK: bb.0.entry:
- ; GFX9-SUNK-NEXT: successors: %bb.1(0x80000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
- ; GFX9-SUNK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
- ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
- ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.1:
- ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]]
- ; GFX9-SUNK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
- ; GFX9-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; GFX9-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
- ; GFX9-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX9-SUNK-NEXT: S_BRANCH %bb.2
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.2:
- ; GFX9-SUNK-NEXT: successors: %bb.3(0x80000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: S_NOP 0
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.3:
- ; GFX9-SUNK-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.4:
- ; GFX9-SUNK-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: S_NOP 0
- ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.5:
- ; GFX9-SUNK-NEXT: successors: %bb.6(0x80000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: S_NOP 0
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.6:
- ; GFX9-SUNK-NEXT: successors: %bb.7(0x04000000), %bb.1(0x7c000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.7:
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: S_ENDPGM 0
bb.0.entry:
successors: %bb.1(0x80000000)
@@ -939,83 +658,6 @@ body: |
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: S_ENDPGM 0
- ;
- ; GFX9-SUNK-LABEL: name: test_no_sink_def_into_loop2
- ; GFX9-SUNK: bb.0.entry:
- ; GFX9-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
- ; GFX9-SUNK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
- ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
- ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
- ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.1:
- ; GFX9-SUNK-NEXT: successors: %bb.2(0x80000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: S_NOP 0
- ; GFX9-SUNK-NEXT: S_BRANCH %bb.2
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.2:
- ; GFX9-SUNK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]]
- ; GFX9-SUNK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
- ; GFX9-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; GFX9-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
- ; GFX9-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX9-SUNK-NEXT: S_BRANCH %bb.3
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.3:
- ; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: S_NOP 0
- ; GFX9-SUNK-NEXT: S_BRANCH %bb.4
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.4:
- ; GFX9-SUNK-NEXT: successors: %bb.5(0x40000000), %bb.7(0x40000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec
- ; GFX9-SUNK-NEXT: S_BRANCH %bb.5
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.5:
- ; GFX9-SUNK-NEXT: successors: %bb.6(0x04000000), %bb.5(0x7c000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: S_NOP 0
- ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec
- ; GFX9-SUNK-NEXT: S_BRANCH %bb.6
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.6:
- ; GFX9-SUNK-NEXT: successors: %bb.7(0x80000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: S_NOP 0
- ; GFX9-SUNK-NEXT: S_BRANCH %bb.7
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.7:
- ; GFX9-SUNK-NEXT: successors: %bb.8(0x04000000), %bb.2(0x7c000000)
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: S_CBRANCH_VCCZ %bb.2, implicit $vcc
- ; GFX9-SUNK-NEXT: S_BRANCH %bb.8
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: bb.8:
- ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
- ; GFX9-SUNK-NEXT: {{ $}}
- ; GFX9-SUNK-NEXT: S_ENDPGM 0
bb.0.entry:
successors: %bb.1(0x40000000), %bb.2 (0x40000000)
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
index fafad600c47458..ef6771278b06f3 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink -o - %s | FileCheck -check-prefixes=GFX10 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o - %s | FileCheck -check-prefixes=GFX10-SUNK %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink --sink-insts-to-avoid-spills=1 -o - %s | FileCheck -check-prefixes=GFX10 %s
---
name: multi_else_break
@@ -74,75 +74,6 @@ body: |
; GFX10-NEXT: [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI7]], [[PHI]], implicit-def dead $scc
; GFX10-NEXT: SI_LOOP [[SI_IF_BREAK1]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX10-NEXT: S_BRANCH %bb.3
- ;
- ; GFX10-SUNK-LABEL: name: multi_else_break
- ; GFX10-SUNK: bb.0:
- ; GFX10-SUNK-NEXT: successors: %bb.1(0x80000000)
- ; GFX10-SUNK-NEXT: liveins: $vgpr4, $vgpr5
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; GFX10-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX10-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX10-SUNK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], implicit $exec
- ; GFX10-SUNK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GFX10-SUNK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GFX10-SUNK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: bb.1:
- ; GFX10-SUNK-NEXT: successors: %bb.2(0x80000000)
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %9, %bb.6
- ; GFX10-SUNK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, %11, %bb.6
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: bb.2:
- ; GFX10-SUNK-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000)
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]], %bb.1, %13, %bb.5
- ; GFX10-SUNK-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, %15, %bb.5
- ; GFX10-SUNK-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.1, %17, %bb.5
- ; GFX10-SUNK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, %19, %bb.5
- ; GFX10-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[PHI5]], [[COPY1]], implicit $exec
- ; GFX10-SUNK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
- ; GFX10-SUNK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI3]], $exec_lo, implicit-def $scc
- ; GFX10-SUNK-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI2]], $exec_lo, implicit-def $scc
- ; GFX10-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_I32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX10-SUNK-NEXT: S_BRANCH %bb.4
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: bb.3:
- ; GFX10-SUNK-NEXT: SI_END_CF %9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX10-SUNK-NEXT: S_ENDPGM 0
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: bb.4:
- ; GFX10-SUNK-NEXT: successors: %bb.5(0x80000000)
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI5]], [[S_MOV_B32_1]], 0, implicit $exec
- ; GFX10-SUNK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[COPY]], [[V_ADD_U32_e64_]], implicit $exec
- ; GFX10-SUNK-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_]], $exec_lo, implicit-def $scc
- ; GFX10-SUNK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_ANDN2_B32_]]
- ; GFX10-SUNK-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_1]], $exec_lo, implicit-def $scc
- ; GFX10-SUNK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc
- ; GFX10-SUNK-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_1]], [[S_AND_B32_]], implicit-def $scc
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: bb.5:
- ; GFX10-SUNK-NEXT: successors: %bb.6(0x04000000), %bb.2(0x7c000000)
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]], %bb.2, [[S_OR_B32_2]], %bb.4
- ; GFX10-SUNK-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]], %bb.2, [[COPY4]], %bb.4
- ; GFX10-SUNK-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.2, [[V_ADD_U32_e64_]], %bb.4
- ; GFX10-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX10-SUNK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI6]], [[PHI4]], implicit-def dead $scc
- ; GFX10-SUNK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX10-SUNK-NEXT: S_BRANCH %bb.6
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: bb.6:
- ; GFX10-SUNK-NEXT: successors: %bb.3(0x04000000), %bb.1(0x7c000000)
- ; GFX10-SUNK-NEXT: {{ $}}
- ; GFX10-SUNK-NEXT: [[PHI9:%[0-9]+]]:vgpr_32 = PHI [[PHI8]], %bb.5
- ; GFX10-SUNK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX10-SUNK-NEXT: [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI7]], [[PHI]], implicit-def dead $scc
- ; GFX10-SUNK-NEXT: SI_LOOP [[SI_IF_BREAK1]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; GFX10-SUNK-NEXT: S_BRANCH %bb.3
bb.0:
successors: %bb.1(0x80000000)
liveins: $vgpr4, $vgpr5
diff --git a/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir b/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir
index 52c9d1067220ee..f23afe52f97de8 100644
--- a/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir
+++ b/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir
@@ -25,23 +25,24 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[LARL:%[0-9]+]]:addr64bit = LARL @b
+ ; CHECK-NEXT: [[LA:%[0-9]+]]:gr64bit = LA killed [[LARL]], 49, $noreg
+ ; CHECK-NEXT: [[LGHI:%[0-9]+]]:gr64bit = LGHI 7
; CHECK-NEXT: [[DEF:%[0-9]+]]:gr64bit = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:gr64bit = IMPLICIT_DEF
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[LA:%[0-9]+]]:gr64bit = LA [[LARL]], 49, $noreg
- ; CHECK-NEXT: [[LGHI:%[0-9]+]]:gr64bit = LGHI 7
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64bit = COPY [[LA]]
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0
; CHECK-NEXT: $r2d = COPY [[DEF]]
- ; CHECK-NEXT: $r3d = COPY [[LA]]
+ ; CHECK-NEXT: $r3d = COPY [[COPY]]
; CHECK-NEXT: $r4d = COPY [[LGHI]]
; CHECK-NEXT: CallBRASL &memcpy, $r2d, $r3d, $r4d, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $r2d
; CHECK-NEXT: ADJCALLSTACKUP 0, 0
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0
; CHECK-NEXT: $r2d = COPY [[DEF1]]
- ; CHECK-NEXT: $r3d = COPY [[LA]]
+ ; CHECK-NEXT: $r3d = COPY [[COPY]]
; CHECK-NEXT: $r4d = COPY [[LGHI]]
; CHECK-NEXT: CallBRASL &memcpy, $r2d, $r3d, $r4d, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $r2d
; CHECK-NEXT: ADJCALLSTACKUP 0, 0
@@ -54,19 +55,20 @@ body: |
%2:gr64bit = LGHI 7
%3:gr64bit = IMPLICIT_DEF
%5:gr64bit = IMPLICIT_DEF
+ %6:gr64bit = COPY killed %0
bb.1:
successors: %bb.1(0x80000000)
ADJCALLSTACKDOWN 0, 0
$r2d = COPY %3
- $r3d = COPY %0
+ $r3d = COPY %6
$r4d = COPY %2
CallBRASL &memcpy, $r2d, $r3d, $r4d, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $r2d
ADJCALLSTACKUP 0, 0
ADJCALLSTACKDOWN 0, 0
$r2d = COPY %5
- $r3d = COPY %0
+ $r3d = COPY %6
$r4d = COPY %2
CallBRASL &memcpy, $r2d, $r3d, $r4d, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $r2d
ADJCALLSTACKUP 0, 0
>From 4255e2a2265d1496c5f2deb1c8c08ba8b6a30ad9 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 6 Dec 2024 12:50:16 -0800
Subject: [PATCH 7/8] Review comments
Change-Id: I3738cc0f14d7ab2db35109f3e02a2f7e4fa9f2e1
---
llvm/lib/CodeGen/MachineSink.cpp | 32 ++++++++++++++++++--------------
1 file changed, 18 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 3f2e790b059041..2712d3324ebe75 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -788,14 +788,15 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
if (SinkInstsIntoCycle) {
SmallVector<MachineCycle *, 8> Cycles(CI->toplevel_cycles());
SchedModel.init(STI);
- enum CycleSinkStage { COPY, LOW_LATENCY, AGGRESSIVE, END };
-
- CycleSinkStage Stage = CycleSinkStage::COPY;
bool HasHighPressure;
- do {
+ DenseMap<std::pair<MachineInstr *, MachineBasicBlock *>, MachineInstr *>
+ SunkInstrs;
+
+ enum CycleSinkStage { COPY, LOW_LATENCY, AGGRESSIVE, END };
+ for (unsigned Stage = CycleSinkStage::COPY; Stage != CycleSinkStage::END;
+ ++Stage) {
HasHighPressure = false;
- DenseMap<std::pair<MachineInstr *, MachineBasicBlock *>, MachineInstr *>
- SunkInstrs;
+ SunkInstrs.clear();
for (auto *Cycle : Cycles) {
MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
if (!Preheader) {
@@ -816,7 +817,7 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
if (i++ == SinkIntoCycleLimit) {
LLVM_DEBUG(dbgs()
<< "CycleSink: Limit reached of instructions to "
- "be analysed.");
+ "be analyzed.");
break;
}
@@ -840,8 +841,9 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
if (!HasHighPressure)
HasHighPressure = registerPressureExceedsLimit(*Preheader);
}
- Stage = (CycleSinkStage)(Stage + 1);
- } while (HasHighPressure && Stage < CycleSinkStage::END);
+ if (!HasHighPressure)
+ break;
+ }
}
HasStoreCache.clear();
@@ -1726,12 +1728,14 @@ bool MachineSinking::aggressivelySinkIntoCycle(
MachineInstr *NewMI = nullptr;
std::pair<MachineInstr *, MachineBasicBlock *> MapEntry(&I, SinkBlock);
+ auto SI = SunkInstrs.find(MapEntry);
+
// Check for the case in which we have already sunk a copy of this
// instruction into the user block.
- if (SunkInstrs.contains(MapEntry)) {
+ if (SI != SunkInstrs.end()) {
LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Already sunk to block: "
<< printMBBReference(*SinkBlock) << "\n");
- NewMI = SunkInstrs[MapEntry];
+ NewMI = SI->second;
}
// Create a copy of the instruction in the use block.
@@ -1748,12 +1752,12 @@ bool MachineSinking::aggressivelySinkIntoCycle(
}
SinkBlock->insert(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()),
NewMI);
- SunkInstrs[MapEntry] = NewMI;
+ SunkInstrs.insert({MapEntry, NewMI});
}
// Conservatively clear any kill flags on uses of sunk instruction
- for (MachineOperand &MO : NewMI->operands()) {
- if (MO.isReg() && MO.readsReg())
+ for (MachineOperand &MO : NewMI->all_uses()) {
+ if (MO.isReg())
RegsToClearKillFlags.insert(MO.getReg());
}
>From 672150ecd6c92244a7556a91ad8efa284e165624 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 9 Dec 2024 08:34:23 -0800
Subject: [PATCH 8/8] Review comments
Change-Id: I4d70eed99499df33f4bde04be05e88ea0c2de877
---
llvm/lib/CodeGen/MachineSink.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 2712d3324ebe75..977ba940750c94 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -794,9 +794,9 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
enum CycleSinkStage { COPY, LOW_LATENCY, AGGRESSIVE, END };
for (unsigned Stage = CycleSinkStage::COPY; Stage != CycleSinkStage::END;
- ++Stage) {
+ ++Stage, SunkInstrs.clear()) {
HasHighPressure = false;
- SunkInstrs.clear();
+
for (auto *Cycle : Cycles) {
MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
if (!Preheader) {
@@ -1757,8 +1757,8 @@ bool MachineSinking::aggressivelySinkIntoCycle(
// Conservatively clear any kill flags on uses of sunk instruction
for (MachineOperand &MO : NewMI->all_uses()) {
- if (MO.isReg())
- RegsToClearKillFlags.insert(MO.getReg());
+ assert(MO.isReg() && MO.isUse());
+ RegsToClearKillFlags.insert(MO.getReg());
}
// The instruction is moved from its basic block, so do not retain the
More information about the llvm-commits
mailing list