[llvm] [MachineSink] Add option for aggressive loop sinking (PR #117247)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 3 14:38:54 PST 2024
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/117247
From b32aa2510a9724fcb815d3186dab1be469acc225 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 20 Nov 2024 14:24:09 -0800
Subject: [PATCH 1/5] [MachineSink] Add option for aggressive loop sinking
Change-Id: I62a6c6fc2c372523ce9ec98d084a434548609ead
---
llvm/lib/CodeGen/MachineSink.cpp | 184 +++++++++
.../aggressive-loop-sink-nonstandard.ll | 20 +
.../machine-sink-ignorable-exec-use.mir | 360 ++++++++++++++++++
.../CodeGen/AMDGPU/machine-sink-lane-mask.mir | 208 ++++++----
4 files changed, 703 insertions(+), 69 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index c470bd71dfb29f..d8dd6e8478686d 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -100,6 +100,12 @@ static cl::opt<bool>
"register spills"),
cl::init(false), cl::Hidden);
+static cl::opt<bool> AggressivelySinkInstsIntoCycle(
+ "aggressively-sink-insts-to-avoid-spills",
+ cl::desc("Aggressively sink instructions into cycles to avoid "
+ "register spills"),
+ cl::init(false), cl::Hidden);
+
static cl::opt<unsigned> SinkIntoCycleLimit(
"machine-sink-cycle-limit",
cl::desc(
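Usage note: the new option is cl::Hidden, so it is only reachable as an internal flag (directly on llc, or from clang via -mllvm). For example, mirroring the RUN line of the new test below, with input.ll standing in for any input module:

  llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 \
      --aggressively-sink-insts-to-avoid-spills=1 input.ll -o -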
@@ -256,6 +262,13 @@ class MachineSinking : public MachineFunctionPass {
SmallVectorImpl<MachineInstr *> &Candidates);
bool SinkIntoCycle(MachineCycle *Cycle, MachineInstr &I);
+ bool isDead(const MachineInstr *MI) const;
+ bool AggressivelySinkIntoCycle(
+ MachineCycle *Cycle, MachineInstr &I,
+ DenseMap<MachineInstr *,
+ std::list<std::pair<MachineBasicBlock *, MachineInstr *>>>
+ SunkInstrs);
+
bool isProfitableToSinkTo(Register Reg, MachineInstr &MI,
MachineBasicBlock *MBB,
MachineBasicBlock *SuccToSinkTo,
@@ -679,6 +692,10 @@ void MachineSinking::FindCycleSinkCandidates(
SmallVectorImpl<MachineInstr *> &Candidates) {
for (auto &MI : *BB) {
LLVM_DEBUG(dbgs() << "CycleSink: Analysing candidate: " << MI);
+ if (MI.isDebugInstr()) {
+      LLVM_DEBUG(dbgs() << "CycleSink: Don't sink debug instructions\n");
+ continue;
+ }
if (!TII->shouldSink(MI)) {
LLVM_DEBUG(dbgs() << "CycleSink: Instruction not a candidate for this "
"target\n");
@@ -799,6 +816,30 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
}
}
+ if (AggressivelySinkInstsIntoCycle) {
+ SmallVector<MachineCycle *, 8> Cycles(CI->toplevel_cycles());
+ DenseMap<MachineInstr *,
+ std::list<std::pair<MachineBasicBlock *, MachineInstr *>>>
+ SunkInstrs;
+ for (auto *Cycle : Cycles) {
+ MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
+ if (!Preheader) {
+ LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Can't find preheader\n");
+ continue;
+ }
+ SmallVector<MachineInstr *, 8> Candidates;
+ FindCycleSinkCandidates(Cycle, Preheader, Candidates);
+
+ // Walk the candidates in reverse order so that we start with the use
+ // of a def-use chain, if there is any.
+ for (MachineInstr *I : llvm::reverse(Candidates)) {
+ AggressivelySinkIntoCycle(Cycle, *I, SunkInstrs);
+ EverMadeChange = true;
+ ++NumCycleSunk;
+ }
+ }
+ }
+
HasStoreCache.clear();
StoreInstrCache.clear();
@@ -1574,6 +1615,149 @@ bool MachineSinking::hasStoreBetween(MachineBasicBlock *From,
return HasAliasedStore;
}
+/// Copied from DeadMachineInstructionElimImpl.
+
+bool MachineSinking::isDead(const MachineInstr *MI) const {
+ // Instructions without side-effects are dead iff they only define dead regs.
+ // This function is hot and this loop returns early in the common case,
+ // so only perform additional checks before this if absolutely necessary.
+ for (const MachineOperand &MO : MI->all_defs()) {
+ Register Reg = MO.getReg();
+ if (Reg.isPhysical()) {
+ return false;
+ } else {
+ if (MO.isDead()) {
+#ifndef NDEBUG
+ // Basic check on the register. All of them should be 'undef'.
+ for (auto &U : MRI->use_nodbg_operands(Reg))
+ assert(U.isUndef() && "'Undef' use on a 'dead' register is found!");
+#endif
+ continue;
+ }
+ for (const MachineInstr &Use : MRI->use_nodbg_instructions(Reg)) {
+ if (&Use != MI)
+ // This def has a non-debug use. Don't delete the instruction!
+ return false;
+ }
+ }
+ }
+
+ // Technically speaking inline asm without side effects and no defs can still
+ // be deleted. But there is so much bad inline asm code out there, we should
+ // let them be.
+ if (MI->isInlineAsm())
+ return false;
+
+ // FIXME: See issue #105950 for why LIFETIME markers are considered dead here.
+ if (MI->isLifetimeMarker())
+ return true;
+
+ // If there are no defs with uses, the instruction might be dead.
+ return MI->wouldBeTriviallyDead();
+}
+
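For readers who have not seen DeadMachineInstructionElim: setting aside the MO.isDead() fast path, the loop above reduces to "every def is virtual and has no non-debug reader other than MI itself". A minimal standalone sketch of that per-def test, using the same MRI APIs as above (the helper name is hypothetical, not part of the patch):

  // True if Reg looks removable from MI's perspective: Reg is virtual
  // and its only non-debug reader, if any, is MI itself (e.g. the tied
  // self-read of a two-address instruction).
  static bool defIsRemovable(const MachineRegisterInfo &MRI, Register Reg,
                             const MachineInstr *MI) {
    if (Reg.isPhysical())
      return false; // physical defs may have readers we cannot see
    for (const MachineInstr &Use : MRI.use_nodbg_instructions(Reg))
      if (&Use != MI)
        return false; // a real external user keeps MI alive
    return true;
  }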
+/// Aggressively sink instructions into cycles. This aggressively tries to
+/// sink all instructions in the top-most preheaders in an attempt to reduce
+/// register pressure (RP).
+/// In particular, it will sink into multiple successor blocks without limits
+/// based on the amount of sinking, or the type of ops being sunk (so long as
+/// they are safe to sink).
+bool MachineSinking::AggressivelySinkIntoCycle(
+ MachineCycle *Cycle, MachineInstr &I,
+ DenseMap<MachineInstr *,
+ std::list<std::pair<MachineBasicBlock *, MachineInstr *>>>
+ SunkInstrs) {
+ LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Finding sink block for: " << I);
+ MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
+ assert(Preheader && "Cycle sink needs a preheader block");
+ SmallVector<std::pair<MachineOperand, MachineInstr *>> Uses;
+ // TODO: support instructions with multiple defs
+ if (I.getNumDefs() > 1)
+ return false;
+
+ MachineOperand DefMO = I.getOperand(0);
+ for (MachineInstr &MI : MRI->use_instructions(DefMO.getReg())) {
+ Uses.push_back({DefMO, &MI});
+ }
+
+ for (std::pair<MachineOperand, MachineInstr *> Entry : Uses) {
+ MachineInstr *MI = Entry.second;
+ LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Analysing use: " << MI);
+ if (MI->isPHI()) {
+ LLVM_DEBUG(
+ dbgs() << "AggressiveCycleSink: Not attempting to sink for PHI.\n");
+ continue;
+ }
+ // We cannot sink before the prologue
+ if (TII->isBasicBlockPrologue(*MI) || MI->isPosition()) {
+ LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Use is BasicBlock prologue, "
+ "can't sink.\n");
+ continue;
+ }
+ if (!Cycle->contains(MI->getParent())) {
+ LLVM_DEBUG(
+ dbgs() << "AggressiveCycleSink: Use not in cycle, can't sink.\n");
+ continue;
+ }
+
+ MachineBasicBlock *SinkBlock = MI->getParent();
+ MachineInstr *NewMI = nullptr;
+
+ // Check for the case in which we have already sunk a copy of this
+ // instruction into the user block.
+ if (SunkInstrs.contains(&I)) {
+ auto SunkBlocks = SunkInstrs[&I];
+ auto Match = std::find_if(
+ SunkBlocks.begin(), SunkBlocks.end(),
+ [&SinkBlock](
+ std::pair<MachineBasicBlock *, MachineInstr *> SunkEntry) {
+ return SunkEntry.first == SinkBlock;
+ });
+ if (Match != SunkBlocks.end()) {
+ LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Already sunk to block: "
+ << printMBBReference(*SinkBlock) << "\n");
+ NewMI = Match->second;
+ }
+ }
+
+ // Create a copy of the instruction in the use block.
+ if (!NewMI) {
+ LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Sinking instruction to block: "
+ << printMBBReference(*SinkBlock) << "\n");
+
+ NewMI = I.getMF()->CloneMachineInstr(&I);
+ if (DefMO.getReg().isVirtual()) {
+ const TargetRegisterClass *TRC = MRI->getRegClass(DefMO.getReg());
+ Register DestReg = MRI->createVirtualRegister(TRC);
+ NewMI->substituteRegister(DefMO.getReg(), DestReg, DefMO.getSubReg(),
+ *TRI);
+ }
+ SinkBlock->insert(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()),
+ NewMI);
+ SunkInstrs[&I].push_back({SinkBlock, NewMI});
+ }
+
+    // Conservatively clear any kill flags on uses of the sunk instruction
+ for (MachineOperand &MO : NewMI->operands()) {
+ if (MO.isReg() && MO.readsReg())
+ RegsToClearKillFlags.insert(MO.getReg());
+ }
+
+ // The instruction is moved from its basic block, so do not retain the
+ // debug information.
+ assert(!NewMI->isDebugInstr() && "Should not sink debug inst");
+ NewMI->setDebugLoc(DebugLoc());
+
+ // Replace the use with the newly created virtual register.
+ MachineOperand UseMO = Entry.first;
+ MI->substituteRegister(UseMO.getReg(), NewMI->getOperand(0).getReg(),
+ UseMO.getSubReg(), *TRI);
+ }
+ // If we have replaced all uses, then delete the dead instruction
+ if (isDead(&I))
+ I.eraseFromParent();
+ return true;
+}
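Condensed, the per-use mechanics above are: clone the instruction into the use block, give the clone a fresh vreg so each sink block defines a distinct value, then rewire the single use. A sketch with the same APIs (dedup cache, PHI/prologue checks, and subregister handling elided):

  // One in-cycle use MI of I's single def:
  Register OldReg = I.getOperand(0).getReg();
  MachineInstr *NewMI = I.getMF()->CloneMachineInstr(&I);
  if (OldReg.isVirtual()) {
    // The original def may still have other users, so the clone gets its
    // own destination register.
    Register NewReg = MRI->createVirtualRegister(MRI->getRegClass(OldReg));
    NewMI->substituteRegister(OldReg, NewReg, /*SubIdx=*/0, *TRI);
  }
  // Place the clone after any PHIs/labels at the top of the use block.
  SinkBlock->insert(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()), NewMI);
  // Redirect this use to read the clone's def instead.
  MI->substituteRegister(OldReg, NewMI->getOperand(0).getReg(),
                         /*SubIdx=*/0, *TRI);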
+
/// Sink instructions into cycles if profitable. This especially tries to
/// prevent register spills caused by register pressure if there is little to no
/// overhead moving instructions into cycles.
diff --git a/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll b/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
new file mode 100644
index 00000000000000..72b4495297a1c5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 --aggressively-sink-insts-to-avoid-spills=1 < %s | FileCheck -check-prefix=SUNK %s
+
+; Check that various edge cases do not crash the compiler
+
+; Multiple uses of sunk VALU, chain of sink candidates
+
+define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) {
+; SUNK-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst
+ ret half %result
+}
+
+; Sink candidates with multiple defs
+
+define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) {
+; SUNK-LABEL: memmove_p5_p5:
+entry:
+ tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
index efa21052e3ae2f..f93d8f3dde21b6 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
@@ -1,5 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink -o - %s | FileCheck -check-prefixes=GFX9 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink --aggressively-sink-insts-to-avoid-spills=1 -o - %s | FileCheck -check-prefixes=GFX9-SUNK %s
+
---
name: test_sink_fmac_to_only_use
@@ -48,6 +50,47 @@ body: |
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.3:
; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
+ ;
+ ; GFX9-SUNK-LABEL: name: test_sink_fmac_to_only_use
+ ; GFX9-SUNK: bb.0:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-SUNK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-SUNK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+ ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]]
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX9-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+ ; GFX9-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.3(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1
+ ; GFX9-SUNK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1
+ ; GFX9-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
%1:vgpr_32 = COPY $vgpr0
@@ -131,6 +174,48 @@ body: |
; GFX9-NEXT: bb.3:
; GFX9-NEXT: [[V_ADD_F32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_3]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
+ ;
+ ; GFX9-SUNK-LABEL: name: test_no_sink_into_if_cond_multiple_uses
+ ; GFX9-SUNK: bb.0:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-SUNK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-SUNK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+ ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]]
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX9-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+ ; GFX9-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.3(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1
+ ; GFX9-SUNK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1
+ ; GFX9-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: [[V_ADD_F32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_3]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
%1:vgpr_32 = COPY $vgpr0
@@ -215,6 +300,48 @@ body: |
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.3:
; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
+ ;
+ ; GFX9-SUNK-LABEL: name: no_sink_fmac_not_constant_mode
+ ; GFX9-SUNK: bb.0:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: $mode = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-SUNK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-SUNK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+ ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]]
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX9-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+ ; GFX9-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.3(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1
+ ; GFX9-SUNK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1
+ ; GFX9-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
$mode = IMPLICIT_DEF
@@ -287,6 +414,36 @@ body: |
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.3:
; GFX9-NEXT: S_ENDPGM 0, implicit %6
+ ;
+ ; GFX9-SUNK-LABEL: name: test_no_sink_fmac_wwm
+ ; GFX9-SUNK: bb.0:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-SUNK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: early-clobber %6:vgpr_32 = STRICT_WWM [[V_FMAC_F32_e64_]], implicit $exec
+ ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX9-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY3]](s32), [[S_MOV_B32_]], implicit $exec
+ ; GFX9-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.3(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]]
+ ; GFX9-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0, implicit %6
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
%1:vgpr_32 = COPY $vgpr0
@@ -382,6 +539,69 @@ body: |
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX9-SUNK-LABEL: name: test_def_and_use_in_loop_sink_fmac
+ ; GFX9-SUNK: bb.0.entry:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX9-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+ ; GFX9-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.2
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.3(0x80000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]]
+ ; GFX9-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.4:
+ ; GFX9-SUNK-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0
+ ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.5:
+ ; GFX9-SUNK-NEXT: successors: %bb.6(0x80000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.6:
+ ; GFX9-SUNK-NEXT: successors: %bb.7(0x04000000), %bb.1(0x7c000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.7:
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0
bb.0.entry:
successors: %bb.1(0x80000000)
@@ -512,6 +732,69 @@ body: |
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX9-SUNK-LABEL: name: test_no_sink_def_into_loop
+ ; GFX9-SUNK: bb.0.entry:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+ ; GFX9-SUNK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]]
+ ; GFX9-SUNK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX9-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+ ; GFX9-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.2
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.3(0x80000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.4:
+ ; GFX9-SUNK-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0
+ ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.5:
+ ; GFX9-SUNK-NEXT: successors: %bb.6(0x80000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.6:
+ ; GFX9-SUNK-NEXT: successors: %bb.7(0x04000000), %bb.1(0x7c000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.7:
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0
bb.0.entry:
successors: %bb.1(0x80000000)
@@ -656,6 +939,83 @@ body: |
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX9-SUNK-LABEL: name: test_no_sink_def_into_loop2
+ ; GFX9-SUNK: bb.0.entry:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-SUNK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+ ; GFX9-SUNK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-SUNK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-SUNK-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.2(0x80000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.2
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]]
+ ; GFX9-SUNK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX9-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX9-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
+ ; GFX9-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.3
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.4:
+ ; GFX9-SUNK-NEXT: successors: %bb.5(0x40000000), %bb.7(0x40000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.5
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.5:
+ ; GFX9-SUNK-NEXT: successors: %bb.6(0x04000000), %bb.5(0x7c000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0
+ ; GFX9-SUNK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.6
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.6:
+ ; GFX9-SUNK-NEXT: successors: %bb.7(0x80000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_NOP 0
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.7
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.7:
+ ; GFX9-SUNK-NEXT: successors: %bb.8(0x04000000), %bb.2(0x7c000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_CBRANCH_VCCZ %bb.2, implicit $vcc
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.8
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.8:
+ ; GFX9-SUNK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0
bb.0.entry:
successors: %bb.1(0x40000000), %bb.2 (0x40000000)
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
index 04c80582f6f079..2a14b85cf2bd56 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
@@ -1,78 +1,148 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink -o - %s | FileCheck -check-prefixes=GFX10 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink -aggressively-sink-insts-to-avoid-spills=1 -o - %s | FileCheck -check-prefixes=GFX10-SUNK %s
---
name: multi_else_break
tracksRegLiveness: true
body: |
- ; CHECK-LABEL: name: multi_else_break
- ; CHECK: bb.0:
- ; CHECK-NEXT: successors: %bb.1(0x80000000)
- ; CHECK-NEXT: liveins: $vgpr4, $vgpr5
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], implicit $exec
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: successors: %bb.2(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %9, %bb.6
- ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, %11, %bb.6
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]], %bb.1, %13, %bb.5
- ; CHECK-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, %15, %bb.5
- ; CHECK-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.1, %17, %bb.5
- ; CHECK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, %19, %bb.5
- ; CHECK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[PHI5]], [[COPY1]], implicit $exec
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
- ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI3]], $exec_lo, implicit-def $scc
- ; CHECK-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI2]], $exec_lo, implicit-def $scc
- ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_I32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: S_BRANCH %bb.4
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.3:
- ; CHECK-NEXT: SI_END_CF %9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: S_ENDPGM 0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.4:
- ; CHECK-NEXT: successors: %bb.5(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI5]], [[S_MOV_B32_1]], 0, implicit $exec
- ; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[COPY]], [[V_ADD_U32_e64_]], implicit $exec
- ; CHECK-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_]], $exec_lo, implicit-def $scc
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_ANDN2_B32_]]
- ; CHECK-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_1]], $exec_lo, implicit-def $scc
- ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc
- ; CHECK-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_1]], [[S_AND_B32_]], implicit-def $scc
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.5:
- ; CHECK-NEXT: successors: %bb.6(0x04000000), %bb.2(0x7c000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]], %bb.2, [[S_OR_B32_2]], %bb.4
- ; CHECK-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]], %bb.2, [[COPY4]], %bb.4
- ; CHECK-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.2, [[V_ADD_U32_e64_]], %bb.4
- ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI6]], [[PHI4]], implicit-def dead $scc
- ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: S_BRANCH %bb.6
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.6:
- ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.1(0x7c000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI9:%[0-9]+]]:vgpr_32 = PHI [[PHI8]], %bb.5
- ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI7]], [[PHI]], implicit-def dead $scc
- ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK1]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: S_BRANCH %bb.3
+ ; GFX10-LABEL: name: multi_else_break
+ ; GFX10: bb.0:
+ ; GFX10-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-NEXT: liveins: $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], implicit $exec
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.1:
+ ; GFX10-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %9, %bb.6
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, %11, %bb.6
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]], %bb.1, %13, %bb.5
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, %15, %bb.5
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.1, %17, %bb.5
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, %19, %bb.5
+ ; GFX10-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[PHI5]], [[COPY1]], implicit $exec
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
+ ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI3]], $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI2]], $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_I32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-NEXT: S_BRANCH %bb.4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.3:
+ ; GFX10-NEXT: SI_END_CF %9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.4:
+ ; GFX10-NEXT: successors: %bb.5(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI5]], [[S_MOV_B32_1]], 0, implicit $exec
+ ; GFX10-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[COPY]], [[V_ADD_U32_e64_]], implicit $exec
+ ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_]], $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_ANDN2_B32_]]
+ ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_1]], $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_1]], [[S_AND_B32_]], implicit-def $scc
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.5:
+ ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.2(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]], %bb.2, [[S_OR_B32_2]], %bb.4
+ ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]], %bb.2, [[COPY4]], %bb.4
+ ; GFX10-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.2, [[V_ADD_U32_e64_]], %bb.4
+ ; GFX10-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI6]], [[PHI4]], implicit-def dead $scc
+ ; GFX10-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-NEXT: S_BRANCH %bb.6
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.6:
+ ; GFX10-NEXT: successors: %bb.3(0x04000000), %bb.1(0x7c000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI9:%[0-9]+]]:vgpr_32 = PHI [[PHI8]], %bb.5
+ ; GFX10-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-NEXT: [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI7]], [[PHI]], implicit-def dead $scc
+ ; GFX10-NEXT: SI_LOOP [[SI_IF_BREAK1]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-NEXT: S_BRANCH %bb.3
+ ;
+ ; GFX10-SUNK-LABEL: name: multi_else_break
+ ; GFX10-SUNK: bb.0:
+ ; GFX10-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-SUNK-NEXT: liveins: $vgpr4, $vgpr5
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX10-SUNK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX10-SUNK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX10-SUNK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], implicit $exec
+ ; GFX10-SUNK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.1:
+ ; GFX10-SUNK-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %9, %bb.6
+ ; GFX10-SUNK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, %11, %bb.6
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.2:
+ ; GFX10-SUNK-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]], %bb.1, %13, %bb.5
+ ; GFX10-SUNK-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, %15, %bb.5
+ ; GFX10-SUNK-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.1, %17, %bb.5
+ ; GFX10-SUNK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, %19, %bb.5
+ ; GFX10-SUNK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[PHI5]], [[COPY1]], implicit $exec
+ ; GFX10-SUNK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
+ ; GFX10-SUNK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI3]], $exec_lo, implicit-def $scc
+ ; GFX10-SUNK-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI2]], $exec_lo, implicit-def $scc
+ ; GFX10-SUNK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_I32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.3:
+ ; GFX10-SUNK-NEXT: SI_END_CF %9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-SUNK-NEXT: S_ENDPGM 0
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.4:
+ ; GFX10-SUNK-NEXT: successors: %bb.5(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX10-SUNK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI5]], [[S_MOV_B32_1]], 0, implicit $exec
+ ; GFX10-SUNK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[COPY]], [[V_ADD_U32_e64_]], implicit $exec
+ ; GFX10-SUNK-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_]], $exec_lo, implicit-def $scc
+ ; GFX10-SUNK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_ANDN2_B32_]]
+ ; GFX10-SUNK-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_1]], $exec_lo, implicit-def $scc
+ ; GFX10-SUNK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc
+ ; GFX10-SUNK-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_1]], [[S_AND_B32_]], implicit-def $scc
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.5:
+ ; GFX10-SUNK-NEXT: successors: %bb.6(0x04000000), %bb.2(0x7c000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]], %bb.2, [[S_OR_B32_2]], %bb.4
+ ; GFX10-SUNK-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]], %bb.2, [[COPY4]], %bb.4
+ ; GFX10-SUNK-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.2, [[V_ADD_U32_e64_]], %bb.4
+ ; GFX10-SUNK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-SUNK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI6]], [[PHI4]], implicit-def dead $scc
+ ; GFX10-SUNK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.6
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.6:
+ ; GFX10-SUNK-NEXT: successors: %bb.3(0x04000000), %bb.1(0x7c000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[PHI9:%[0-9]+]]:vgpr_32 = PHI [[PHI8]], %bb.5
+ ; GFX10-SUNK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-SUNK-NEXT: [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI7]], [[PHI]], implicit-def dead $scc
+ ; GFX10-SUNK-NEXT: SI_LOOP [[SI_IF_BREAK1]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.3
bb.0:
successors: %bb.1(0x80000000)
liveins: $vgpr4, $vgpr5
From 0813b87be351315d2a23c58e6f95c5be1eba5d45 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 22 Nov 2024 13:22:26 -0800
Subject: [PATCH 2/5] Address review comments
Change-Id: I975fab6cf7dba21788fb5677a5484916ef29d959
---
llvm/lib/CodeGen/MachineSink.cpp | 122 ++++-----
.../aggressive-loop-sink-nonstandard.ll | 245 +++++++++++++++++-
.../machine-sink-ignorable-exec-use.mir | 2 +-
.../CodeGen/AMDGPU/machine-sink-lane-mask.mir | 4 +-
4 files changed, 297 insertions(+), 76 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index d8dd6e8478686d..151348e6b1c1ba 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -101,7 +101,7 @@ static cl::opt<bool>
cl::init(false), cl::Hidden);
static cl::opt<bool> AggressivelySinkInstsIntoCycle(
- "aggressively-sink-insts-to-avoid-spills",
+ "aggressive-sink-insts-into-cycles",
cl::desc("Aggressively sink instructions into cycles to avoid "
"register spills"),
cl::init(false), cl::Hidden);
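After the rename, the example invocation from above becomes the following (input.ll again a stand-in; the RUN line of the .ll test below is updated to the new spelling):

  llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 \
      --aggressive-sink-insts-into-cycles=1 input.ll -o -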
@@ -118,6 +118,8 @@ STATISTIC(NumSplit, "Number of critical edges split");
STATISTIC(NumCoalesces, "Number of copies coalesced");
STATISTIC(NumPostRACopySink, "Number of copies sunk after RA");
+using RegSubRegPair = TargetInstrInfo::RegSubRegPair;
+
namespace {
class MachineSinking : public MachineFunctionPass {
@@ -263,11 +265,10 @@ class MachineSinking : public MachineFunctionPass {
bool SinkIntoCycle(MachineCycle *Cycle, MachineInstr &I);
bool isDead(const MachineInstr *MI) const;
- bool AggressivelySinkIntoCycle(
+ bool aggressivelySinkIntoCycle(
MachineCycle *Cycle, MachineInstr &I,
- DenseMap<MachineInstr *,
- std::list<std::pair<MachineBasicBlock *, MachineInstr *>>>
- SunkInstrs);
+ DenseMap<std::pair<MachineInstr *, MachineBasicBlock *>, MachineInstr *>
+ &SunkInstrs);
bool isProfitableToSinkTo(Register Reg, MachineInstr &MI,
MachineBasicBlock *MBB,
@@ -692,8 +693,8 @@ void MachineSinking::FindCycleSinkCandidates(
SmallVectorImpl<MachineInstr *> &Candidates) {
for (auto &MI : *BB) {
LLVM_DEBUG(dbgs() << "CycleSink: Analysing candidate: " << MI);
- if (MI.isDebugInstr()) {
-    LLVM_DEBUG(dbgs() << "CycleSink: Don't sink debug instructions\n");
+ if (MI.isMetaInstruction()) {
+    LLVM_DEBUG(dbgs() << "CycleSink: Don't sink meta instructions\n");
continue;
}
if (!TII->shouldSink(MI)) {
@@ -786,8 +787,11 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
EverMadeChange = true;
}
- if (SinkInstsIntoCycle) {
+ if (SinkInstsIntoCycle || AggressivelySinkInstsIntoCycle) {
SmallVector<MachineCycle *, 8> Cycles(CI->toplevel_cycles());
+
+ DenseMap<std::pair<MachineInstr *, MachineBasicBlock *>, MachineInstr *>
+ SunkInstrs;
for (auto *Cycle : Cycles) {
MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
if (!Preheader) {
@@ -801,7 +805,18 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
// of a def-use chain, if there is any.
// TODO: Sort the candidates using a cost-model.
unsigned i = 0;
+
for (MachineInstr *I : llvm::reverse(Candidates)) {
+        // AggressivelySinkInstsIntoCycle sinks a superset of the instructions
+        // sunk by regular cycle sinking. Thus, this option supersedes regular
+        // cycle sinking and captures all of its sinking opportunities.
+ if (AggressivelySinkInstsIntoCycle) {
+ aggressivelySinkIntoCycle(Cycle, *I, SunkInstrs);
+ EverMadeChange = true;
+ ++NumCycleSunk;
+ continue;
+ }
+
if (i++ == SinkIntoCycleLimit) {
LLVM_DEBUG(dbgs() << "CycleSink: Limit reached of instructions to "
"be analysed.");
@@ -816,30 +831,6 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
}
}
- if (AggressivelySinkInstsIntoCycle) {
- SmallVector<MachineCycle *, 8> Cycles(CI->toplevel_cycles());
- DenseMap<MachineInstr *,
- std::list<std::pair<MachineBasicBlock *, MachineInstr *>>>
- SunkInstrs;
- for (auto *Cycle : Cycles) {
- MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
- if (!Preheader) {
- LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Can't find preheader\n");
- continue;
- }
- SmallVector<MachineInstr *, 8> Candidates;
- FindCycleSinkCandidates(Cycle, Preheader, Candidates);
-
- // Walk the candidates in reverse order so that we start with the use
- // of a def-use chain, if there is any.
- for (MachineInstr *I : llvm::reverse(Candidates)) {
- AggressivelySinkIntoCycle(Cycle, *I, SunkInstrs);
- EverMadeChange = true;
- ++NumCycleSunk;
- }
- }
- }
-
HasStoreCache.clear();
StoreInstrCache.clear();
@@ -1615,31 +1606,27 @@ bool MachineSinking::hasStoreBetween(MachineBasicBlock *From,
return HasAliasedStore;
}
-/// Copied from DeadMachineInstructionElimImpl.
-
bool MachineSinking::isDead(const MachineInstr *MI) const {
// Instructions without side-effects are dead iff they only define dead regs.
// This function is hot and this loop returns early in the common case,
// so only perform additional checks before this if absolutely necessary.
+
for (const MachineOperand &MO : MI->all_defs()) {
Register Reg = MO.getReg();
- if (Reg.isPhysical()) {
+ if (Reg.isPhysical())
return false;
- } else {
- if (MO.isDead()) {
+
+ if (MO.isDead()) {
#ifndef NDEBUG
- // Basic check on the register. All of them should be 'undef'.
- for (auto &U : MRI->use_nodbg_operands(Reg))
- assert(U.isUndef() && "'Undef' use on a 'dead' register is found!");
+ // Basic check on the register. All of them should be 'undef'.
+ for (auto &U : MRI->use_nodbg_operands(Reg))
+ assert(U.isUndef() && "'Undef' use on a 'dead' register is found!");
#endif
- continue;
- }
- for (const MachineInstr &Use : MRI->use_nodbg_instructions(Reg)) {
- if (&Use != MI)
- // This def has a non-debug use. Don't delete the instruction!
- return false;
- }
+ continue;
}
+
+      if (!MRI->hasAtMostUserInstrs(Reg, 0))
+ return false;
}
// Technically speaking inline asm without side effects and no defs can still
@@ -1661,25 +1648,24 @@ bool MachineSinking::isDead(const MachineInstr *MI) const {
/// In particular, it will sink into multiple successor blocks without limits
/// based on the amount of sinking, or the type of ops being sunk (so long as
/// they are safe to sink).
-bool MachineSinking::AggressivelySinkIntoCycle(
+bool MachineSinking::aggressivelySinkIntoCycle(
MachineCycle *Cycle, MachineInstr &I,
- DenseMap<MachineInstr *,
- std::list<std::pair<MachineBasicBlock *, MachineInstr *>>>
- SunkInstrs) {
+ DenseMap<std::pair<MachineInstr *, MachineBasicBlock *>, MachineInstr *>
+ &SunkInstrs) {
LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Finding sink block for: " << I);
MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
assert(Preheader && "Cycle sink needs a preheader block");
- SmallVector<std::pair<MachineOperand, MachineInstr *>> Uses;
+ SmallVector<std::pair<RegSubRegPair, MachineInstr *>> Uses;
// TODO: support instructions with multiple defs
if (I.getNumDefs() > 1)
return false;
- MachineOperand DefMO = I.getOperand(0);
+ MachineOperand &DefMO = I.getOperand(0);
for (MachineInstr &MI : MRI->use_instructions(DefMO.getReg())) {
- Uses.push_back({DefMO, &MI});
+ Uses.push_back({{DefMO.getReg(), DefMO.getSubReg()}, &MI});
}
- for (std::pair<MachineOperand, MachineInstr *> Entry : Uses) {
+ for (std::pair<RegSubRegPair, MachineInstr *> Entry : Uses) {
MachineInstr *MI = Entry.second;
LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Analysing use: " << MI);
if (MI->isPHI()) {
@@ -1701,22 +1687,14 @@ bool MachineSinking::AggressivelySinkIntoCycle(
MachineBasicBlock *SinkBlock = MI->getParent();
MachineInstr *NewMI = nullptr;
+ std::pair<MachineInstr *, MachineBasicBlock *> MapEntry(&I, SinkBlock);
// Check for the case in which we have already sunk a copy of this
// instruction into the user block.
- if (SunkInstrs.contains(&I)) {
- auto SunkBlocks = SunkInstrs[&I];
- auto Match = std::find_if(
- SunkBlocks.begin(), SunkBlocks.end(),
- [&SinkBlock](
- std::pair<MachineBasicBlock *, MachineInstr *> SunkEntry) {
- return SunkEntry.first == SinkBlock;
- });
- if (Match != SunkBlocks.end()) {
- LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Already sunk to block: "
- << printMBBReference(*SinkBlock) << "\n");
- NewMI = Match->second;
- }
+ if (SunkInstrs.contains(MapEntry)) {
+ LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Already sunk to block: "
+ << printMBBReference(*SinkBlock) << "\n");
+ NewMI = SunkInstrs[MapEntry];
}
// Create a copy of the instruction in the use block.
@@ -1733,7 +1711,7 @@ bool MachineSinking::AggressivelySinkIntoCycle(
}
SinkBlock->insert(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()),
NewMI);
- SunkInstrs[&I].push_back({SinkBlock, NewMI});
+ SunkInstrs[MapEntry] = NewMI;
}
// Conservatively clear any kill flags on uses of the sunk instruction
@@ -1748,9 +1726,9 @@ bool MachineSinking::AggressivelySinkIntoCycle(
NewMI->setDebugLoc(DebugLoc());
// Replace the use with the newly created virtual register.
- MachineOperand UseMO = Entry.first;
- MI->substituteRegister(UseMO.getReg(), NewMI->getOperand(0).getReg(),
- UseMO.getSubReg(), *TRI);
+ RegSubRegPair &UseReg = Entry.first;
+ MI->substituteRegister(UseReg.Reg, NewMI->getOperand(0).getReg(),
+ UseReg.SubReg, *TRI);
}
// If we have replaced all uses, then delete the dead instruction
if (isDead(&I))
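
A note on the two data-structure changes above. Keying SunkInstrs on the
(instruction, sink block) pair replaces the old per-instruction list scan
with a single map lookup, and recording uses as plain (register,
subregister) values matters because substituteRegister mutates the use
operands mid-loop; snapshotting the values up front keeps the recorded
uses valid. Below is a minimal sketch of the resulting clone-or-reuse
step -- illustrative only, getOrCreateSunkCopy is not the patch's code,
and the real patch also rewrites the clone's def to a fresh virtual
register before rewriting uses:

// Sketch, assuming the pair-keyed map introduced in this patch.
using SinkKey = std::pair<MachineInstr *, MachineBasicBlock *>;

static MachineInstr *
getOrCreateSunkCopy(MachineInstr &I, MachineBasicBlock *SinkBlock,
                    DenseMap<SinkKey, MachineInstr *> &SunkInstrs) {
  SinkKey Key(&I, SinkBlock);
  if (auto It = SunkInstrs.find(Key); It != SunkInstrs.end())
    return It->second; // A copy was already sunk into this block; reuse it.
  MachineFunction &MF = *SinkBlock->getParent();
  MachineInstr *NewMI = MF.CloneMachineInstr(&I);
  SinkBlock->insert(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()), NewMI);
  SunkInstrs[Key] = NewMI;
  return NewMI;
}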
diff --git a/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll b/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
index 72b4495297a1c5..9e53b8434cc021 100644
--- a/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
+++ b/llvm/test/CodeGen/AMDGPU/aggressive-loop-sink-nonstandard.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 --aggressively-sink-insts-to-avoid-spills=1 < %s | FileCheck -check-prefix=SUNK %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 --aggressive-sink-insts-into-cycles=1 < %s | FileCheck -check-prefix=SUNK %s
; Check that various edge cases do not crash the compiler
@@ -6,6 +7,39 @@
define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) {
; SUNK-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
+; SUNK: ; %bb.0:
+; SUNK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SUNK-NEXT: v_mov_b32_e32 v3, v0
+; SUNK-NEXT: v_and_b32_e32 v0, -4, v3
+; SUNK-NEXT: global_load_dword v4, v[0:1], off
+; SUNK-NEXT: v_and_b32_e32 v3, 3, v3
+; SUNK-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; SUNK-NEXT: s_mov_b32 s2, 0xffff
+; SUNK-NEXT: v_lshlrev_b32_e64 v5, v3, s2
+; SUNK-NEXT: s_mov_b64 s[0:1], 0
+; SUNK-NEXT: v_not_b32_e32 v5, v5
+; SUNK-NEXT: v_max_f16_e32 v2, v2, v2
+; SUNK-NEXT: .LBB0_1: ; %atomicrmw.start
+; SUNK-NEXT: ; =>This Inner Loop Header: Depth=1
+; SUNK-NEXT: s_waitcnt vmcnt(0)
+; SUNK-NEXT: v_mov_b32_e32 v7, v4
+; SUNK-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; SUNK-NEXT: v_max_f16_e32 v4, v4, v4
+; SUNK-NEXT: v_min_f16_e32 v4, v4, v2
+; SUNK-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; SUNK-NEXT: v_and_or_b32 v6, v7, v5, v4
+; SUNK-NEXT: buffer_wbl2 sc1
+; SUNK-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0
+; SUNK-NEXT: s_waitcnt vmcnt(0)
+; SUNK-NEXT: buffer_inv sc1
+; SUNK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; SUNK-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; SUNK-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; SUNK-NEXT: s_cbranch_execnz .LBB0_1
+; SUNK-NEXT: ; %bb.2: ; %atomicrmw.end
+; SUNK-NEXT: s_or_b64 exec, exec, s[0:1]
+; SUNK-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; SUNK-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst
ret half %result
}
@@ -14,7 +48,216 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) {
; SUNK-LABEL: memmove_p5_p5:
+; SUNK: ; %bb.0: ; %entry
+; SUNK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SUNK-NEXT: v_and_b32_e32 v4, 15, v2
+; SUNK-NEXT: v_mov_b32_e32 v5, 0
+; SUNK-NEXT: v_and_b32_e32 v6, -16, v2
+; SUNK-NEXT: v_mov_b32_e32 v7, v3
+; SUNK-NEXT: v_cmp_ne_u64_e64 s[0:1], 0, v[4:5]
+; SUNK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; SUNK-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v0
+; SUNK-NEXT: s_and_saveexec_b64 s[4:5], s[2:3]
+; SUNK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; SUNK-NEXT: s_cbranch_execnz .LBB1_3
+; SUNK-NEXT: ; %bb.1: ; %Flow46
+; SUNK-NEXT: s_andn2_saveexec_b64 s[2:3], s[4:5]
+; SUNK-NEXT: s_cbranch_execnz .LBB1_10
+; SUNK-NEXT: .LBB1_2: ; %Flow47
+; SUNK-NEXT: s_or_b64 exec, exec, s[2:3]
+; SUNK-NEXT: s_waitcnt vmcnt(0)
+; SUNK-NEXT: s_setpc_b64 s[30:31]
+; SUNK-NEXT: .LBB1_3: ; %memmove_copy_forward
+; SUNK-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; SUNK-NEXT: s_cbranch_execz .LBB1_6
+; SUNK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
+; SUNK-NEXT: s_mov_b64 s[8:9], 0
+; SUNK-NEXT: v_mov_b32_e32 v3, v1
+; SUNK-NEXT: v_mov_b32_e32 v8, v0
+; SUNK-NEXT: .LBB1_5: ; %memmove_fwd_main_loop
+; SUNK-NEXT: ; =>This Inner Loop Header: Depth=1
+; SUNK-NEXT: scratch_load_dwordx4 v[10:13], v3, off
+; SUNK-NEXT: v_lshl_add_u64 v[6:7], v[6:7], 0, -16
+; SUNK-NEXT: v_cmp_eq_u64_e64 s[2:3], 0, v[6:7]
+; SUNK-NEXT: v_add_u32_e32 v3, 16, v3
+; SUNK-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9]
+; SUNK-NEXT: s_waitcnt vmcnt(0)
+; SUNK-NEXT: scratch_store_dwordx4 v8, v[10:13], off
+; SUNK-NEXT: v_add_u32_e32 v8, 16, v8
+; SUNK-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SUNK-NEXT: s_cbranch_execnz .LBB1_5
+; SUNK-NEXT: .LBB1_6: ; %Flow41
+; SUNK-NEXT: s_or_b64 exec, exec, s[6:7]
+; SUNK-NEXT: s_and_saveexec_b64 s[6:7], s[0:1]
+; SUNK-NEXT: s_cbranch_execz .LBB1_9
+; SUNK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
+; SUNK-NEXT: v_and_b32_e32 v2, -16, v2
+; SUNK-NEXT: v_add_u32_e32 v0, v0, v2
+; SUNK-NEXT: v_add_u32_e32 v1, v1, v2
+; SUNK-NEXT: s_mov_b64 s[8:9], 0
+; SUNK-NEXT: .LBB1_8: ; %memmove_fwd_residual_loop
+; SUNK-NEXT: ; =>This Inner Loop Header: Depth=1
+; SUNK-NEXT: scratch_load_ubyte v2, v1, off
+; SUNK-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, -1
+; SUNK-NEXT: v_cmp_eq_u64_e64 s[2:3], 0, v[4:5]
+; SUNK-NEXT: v_add_u32_e32 v1, 1, v1
+; SUNK-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9]
+; SUNK-NEXT: s_waitcnt vmcnt(0)
+; SUNK-NEXT: scratch_store_byte v0, v2, off
+; SUNK-NEXT: v_add_u32_e32 v0, 1, v0
+; SUNK-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SUNK-NEXT: s_cbranch_execnz .LBB1_8
+; SUNK-NEXT: .LBB1_9: ; %Flow39
+; SUNK-NEXT: s_or_b64 exec, exec, s[6:7]
+; SUNK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; SUNK-NEXT: ; implicit-def: $vgpr0
+; SUNK-NEXT: ; implicit-def: $vgpr1
+; SUNK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; SUNK-NEXT: s_andn2_saveexec_b64 s[2:3], s[4:5]
+; SUNK-NEXT: s_cbranch_execz .LBB1_2
+; SUNK-NEXT: .LBB1_10: ; %memmove_copy_backwards
+; SUNK-NEXT: s_and_saveexec_b64 s[4:5], s[0:1]
+; SUNK-NEXT: s_cbranch_execz .LBB1_13
+; SUNK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
+; SUNK-NEXT: v_add_u32_e32 v7, -1, v2
+; SUNK-NEXT: v_add_u32_e32 v6, v0, v7
+; SUNK-NEXT: v_add_u32_e32 v7, v1, v7
+; SUNK-NEXT: s_mov_b64 s[6:7], 0
+; SUNK-NEXT: .LBB1_12: ; %memmove_bwd_residual_loop
+; SUNK-NEXT: ; =>This Inner Loop Header: Depth=1
+; SUNK-NEXT: scratch_load_ubyte v8, v7, off
+; SUNK-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, -1
+; SUNK-NEXT: v_cmp_eq_u64_e64 s[0:1], 0, v[4:5]
+; SUNK-NEXT: v_add_u32_e32 v7, -1, v7
+; SUNK-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7]
+; SUNK-NEXT: s_waitcnt vmcnt(0)
+; SUNK-NEXT: scratch_store_byte v6, v8, off
+; SUNK-NEXT: v_add_u32_e32 v6, -1, v6
+; SUNK-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; SUNK-NEXT: s_cbranch_execnz .LBB1_12
+; SUNK-NEXT: .LBB1_13: ; %Flow45
+; SUNK-NEXT: s_or_b64 exec, exec, s[4:5]
+; SUNK-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; SUNK-NEXT: s_cbranch_execz .LBB1_16
+; SUNK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
+; SUNK-NEXT: v_and_b32_e32 v5, -16, v2
+; SUNK-NEXT: v_add_u32_e32 v4, -16, v5
+; SUNK-NEXT: v_add_u32_e32 v2, v0, v4
+; SUNK-NEXT: v_sub_co_u32_e32 v0, vcc, 0, v5
+; SUNK-NEXT: v_add_u32_e32 v4, v1, v4
+; SUNK-NEXT: s_mov_b64 s[4:5], 0
+; SUNK-NEXT: v_subb_co_u32_e32 v1, vcc, 0, v3, vcc
+; SUNK-NEXT: .LBB1_15: ; %memmove_bwd_main_loop
+; SUNK-NEXT: ; =>This Inner Loop Header: Depth=1
+; SUNK-NEXT: scratch_load_dwordx4 v[6:9], v4, off
+; SUNK-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 16
+; SUNK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; SUNK-NEXT: v_add_u32_e32 v4, -16, v4
+; SUNK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; SUNK-NEXT: s_waitcnt vmcnt(0)
+; SUNK-NEXT: scratch_store_dwordx4 v2, v[6:9], off
+; SUNK-NEXT: v_add_u32_e32 v2, -16, v2
+; SUNK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; SUNK-NEXT: s_cbranch_execnz .LBB1_15
+; SUNK-NEXT: .LBB1_16: ; %Flow43
+; SUNK-NEXT: s_or_b64 exec, exec, s[0:1]
+; SUNK-NEXT: s_or_b64 exec, exec, s[2:3]
+; SUNK-NEXT: s_waitcnt vmcnt(0)
+; SUNK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}
+
+; We should not sink the mfma into the if/else as it is convergent
+
+define void @convergent_sink(<4 x i16> %in0, <4 x i16> %in1, i32 %val, i32 %v, ptr addrspace(1) %outptr) #2 {
+; SUNK-LABEL: convergent_sink:
+; SUNK: ; %bb.0: ; %entry
+; SUNK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SUNK-NEXT: v_mfma_f32_32x32x8_bf16 a[0:15], v[0:1], v[2:3], 0
+; SUNK-NEXT: v_lshl_add_u32 v0, v5, 1, v5
+; SUNK-NEXT: v_lshlrev_b32_e32 v2, 1, v5
+; SUNK-NEXT: s_mov_b32 s4, 0
+; SUNK-NEXT: s_mov_b64 s[0:1], 0
+; SUNK-NEXT: v_mov_b32_e32 v5, 0xde
+; SUNK-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; SUNK-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SUNK-NEXT: s_branch .LBB2_2
+; SUNK-NEXT: .LBB2_1: ; %end
+; SUNK-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; SUNK-NEXT: v_cmp_eq_u32_e32 vcc, v8, v4
+; SUNK-NEXT: s_add_i32 s4, s4, 1
+; SUNK-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; SUNK-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; SUNK-NEXT: s_cbranch_execz .LBB2_7
+; SUNK-NEXT: .LBB2_2: ; %loop.body
+; SUNK-NEXT: ; =>This Inner Loop Header: Depth=1
+; SUNK-NEXT: s_cmp_lt_i32 s4, 6
+; SUNK-NEXT: global_store_dword v[6:7], v5, off
+; SUNK-NEXT: s_cbranch_scc0 .LBB2_4
+; SUNK-NEXT: ; %bb.3: ; %else
+; SUNK-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; SUNK-NEXT: v_lshl_add_u64 v[8:9], v[0:1], 3, v[6:7]
+; SUNK-NEXT: global_store_dwordx4 v[8:9], a[12:15], off offset:48
+; SUNK-NEXT: global_store_dwordx4 v[8:9], a[8:11], off offset:32
+; SUNK-NEXT: global_store_dwordx4 v[8:9], a[4:7], off offset:16
+; SUNK-NEXT: global_store_dwordx4 v[8:9], a[0:3], off
+; SUNK-NEXT: s_mov_b64 s[2:3], 0
+; SUNK-NEXT: s_branch .LBB2_5
+; SUNK-NEXT: .LBB2_4: ; in Loop: Header=BB2_2 Depth=1
+; SUNK-NEXT: s_mov_b64 s[2:3], -1
+; SUNK-NEXT: .LBB2_5: ; %Flow
+; SUNK-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; SUNK-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; SUNK-NEXT: v_mov_b32_e32 v8, v0
+; SUNK-NEXT: s_cbranch_vccnz .LBB2_1
+; SUNK-NEXT: ; %bb.6: ; %if
+; SUNK-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; SUNK-NEXT: v_lshl_add_u64 v[8:9], v[2:3], 3, v[6:7]
+; SUNK-NEXT: global_store_dwordx4 v[8:9], a[12:15], off offset:48
+; SUNK-NEXT: global_store_dwordx4 v[8:9], a[8:11], off offset:32
+; SUNK-NEXT: global_store_dwordx4 v[8:9], a[4:7], off offset:16
+; SUNK-NEXT: global_store_dwordx4 v[8:9], a[0:3], off
+; SUNK-NEXT: v_mov_b32_e32 v8, v2
+; SUNK-NEXT: s_branch .LBB2_1
+; SUNK-NEXT: .LBB2_7: ; %exit
+; SUNK-NEXT: s_or_b64 exec, exec, s[0:1]
+; SUNK-NEXT: global_store_dwordx4 v[6:7], a[12:15], off offset:48
+; SUNK-NEXT: global_store_dwordx4 v[6:7], a[8:11], off offset:32
+; SUNK-NEXT: global_store_dwordx4 v[6:7], a[4:7], off offset:16
+; SUNK-NEXT: global_store_dwordx4 v[6:7], a[0:3], off
+; SUNK-NEXT: s_waitcnt vmcnt(0)
+; SUNK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %1005 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %in0, <4 x i16> %in1, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
+ br label %loop.body
+
+loop.body:
+ %i = phi i32 [0, %entry], [%i.inc, %end]
+ store i32 222, ptr addrspace(1) %outptr
+ %cc = icmp sgt i32 %i, 5
+ br i1 %cc, label %if, label %else
+
+if:
+ %v.if = mul i32 %v, 2
+ %sptr.if = getelementptr <4 x i16>, ptr addrspace(1) %outptr, i32 %v.if
+ store <16 x float> %1005, ptr addrspace(1) %sptr.if
+ br label %end
+
+else:
+ %v.else = mul i32 %v, 3
+ %sptr.else = getelementptr <4 x i16>, ptr addrspace(1) %outptr, i32 %v.else
+ store <16 x float> %1005, ptr addrspace(1) %sptr.else
+ br label %end
+
+end:
+ %r = phi i32 [ %v.if, %if ], [ %v.else, %else ]
+ %cmp = icmp ne i32 %r, %val
+ %i.inc = add i32 %i, 1
+ br i1 %cmp, label %loop.body, label %exit
+
+exit:
+ store <16 x float> %1005, ptr addrspace(1) %outptr
+ ret void
+}
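
For readers wondering why the mfma must stay put: duplicating it into the
if/else arms would change the set of active lanes it executes under, which
is exactly what convergent operations forbid. A hedged sketch of the kind
of guard the candidate filter needs (the exact predicate set used in the
patch may differ):

// Illustrative only: refuse to duplicate candidates into divergent
// control flow.
static bool mayAggressivelySink(const MachineInstr &MI) {
  if (MI.isConvergent())
    return false; // Result depends on which lanes are active, e.g. mfma.
  if (MI.mayLoadOrStore())
    return false; // Moving memory ops needs alias checks; stay safe.
  return true;
}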
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
index f93d8f3dde21b6..259abae6d92c87 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink -o - %s | FileCheck -check-prefixes=GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink --aggressively-sink-insts-to-avoid-spills=1 -o - %s | FileCheck -check-prefixes=GFX9-SUNK %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o - %s | FileCheck -check-prefixes=GFX9-SUNK %s
---
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
index 2a14b85cf2bd56..fafad600c47458 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink -o - %s | FileCheck -check-prefixes=GFX10 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink -aggressively-sink-insts-to-avoid-spills=1 -o - %s | FileCheck -check-prefixes=GFX10-SUNK %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o - %s | FileCheck -check-prefixes=GFX10-SUNK %s
---
name: multi_else_break
@@ -86,6 +86,7 @@ body: |
; GFX10-SUNK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], implicit $exec
; GFX10-SUNK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX10-SUNK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX10-SUNK-NEXT: {{ $}}
; GFX10-SUNK-NEXT: bb.1:
; GFX10-SUNK-NEXT: successors: %bb.2(0x80000000)
@@ -100,7 +101,6 @@ body: |
; GFX10-SUNK-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, %15, %bb.5
; GFX10-SUNK-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.1, %17, %bb.5
; GFX10-SUNK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, %19, %bb.5
- ; GFX10-SUNK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX10-SUNK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[PHI5]], [[COPY1]], implicit $exec
; GFX10-SUNK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
; GFX10-SUNK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI3]], $exec_lo, implicit-def $scc
>From 7e3caf54cfdd9eb757944f8cec657af2ff8c0efd Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Sat, 23 Nov 2024 14:00:44 -0800
Subject: [PATCH 3/5] Fix SystemZ test
Change-Id: I8f1138f9fc82251538f2c428f1e67fa2941266b5
---
llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir b/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir
index 43c286a830b42e..52c9d1067220ee 100644
--- a/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir
+++ b/llvm/test/CodeGen/SystemZ/machinelicm-sunk-kill-flags.mir
@@ -25,14 +25,14 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[LARL:%[0-9]+]]:addr64bit = LARL @b
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:gr64bit = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:gr64bit = IMPLICIT_DEF
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[LA:%[0-9]+]]:gr64bit = LA [[LARL]], 49, $noreg
; CHECK-NEXT: [[LGHI:%[0-9]+]]:gr64bit = LGHI 7
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:gr64bit = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:gr64bit = IMPLICIT_DEF
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0
; CHECK-NEXT: $r2d = COPY [[DEF]]
; CHECK-NEXT: $r3d = COPY [[LA]]
>From d09d4f1bdaec14b01c1cc1bd2cd8bccd58d0c8b4 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 2 Dec 2024 16:03:05 -0800
Subject: [PATCH 4/5] Add low latency check
Change-Id: Iec36f11060ca1b46b6c33130d4ee02863360c671
---
llvm/include/llvm/CodeGen/TargetInstrInfo.h | 7 +-
llvm/lib/CodeGen/MachineSink.cpp | 11 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 8 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +-
.../machine-sink-aggressive-latency.mir | 107 ++++++++++++++++++
5 files changed, 129 insertions(+), 6 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/machine-sink-aggressive-latency.mir
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 07b59b241d9f9a..c4c69e5129602c 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1806,8 +1806,13 @@ class TargetInstrInfo : public MCInstrInfo {
unsigned defaultDefLatency(const MCSchedModel &SchedModel,
const MachineInstr &DefMI) const;
+ /// Return true if this instruction is considered low latency.
+ virtual bool isLowLatencyInstruction(const MachineInstr &MI) const {
+ return false;
+ };
+
/// Return true if this opcode has high latency to its result.
- virtual bool isHighLatencyDef(int opc) const { return false; }
+ virtual bool isHighLatencyDef(int opc) const { return false; };
/// Compute operand latency between a def of 'Reg'
/// and a use in the current loop. Return true if the target considered
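
Since the base implementation returns false, the new hook makes aggressive
sinking effectively opt-in per target: a backend that never overrides it
sees no behavior change even with the flag set. As a sketch, a hypothetical
out-of-tree override (not part of this patch) could start as simply as:

// Hypothetical override, for illustration only. Returning true means
// "cloning this instruction into every use block inside the loop is
// cheaper than spilling its result across the loop".
bool MyTargetInstrInfo::isLowLatencyInstruction(
    const MachineInstr &MI) const {
  // Plain register-to-register copies are always cheap to re-execute.
  return MI.isCopy();
}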
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 151348e6b1c1ba..8e4f02495229a2 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -1652,13 +1652,18 @@ bool MachineSinking::aggressivelySinkIntoCycle(
MachineCycle *Cycle, MachineInstr &I,
DenseMap<std::pair<MachineInstr *, MachineBasicBlock *>, MachineInstr *>
&SunkInstrs) {
+ // TODO: support instructions with multiple defs
+ if (I.getNumDefs() > 1)
+ return false;
+
+  // Only sink instructions that the target considers to be low latency.
+ if (!TII->isLowLatencyInstruction(I))
+ return false;
+
LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Finding sink block for: " << I);
MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
assert(Preheader && "Cycle sink needs a preheader block");
SmallVector<std::pair<RegSubRegPair, MachineInstr *>> Uses;
- // TODO: support instructions with multiple defs
- if (I.getNumDefs() > 1)
- return false;
MachineOperand &DefMO = I.getOperand(0);
for (MachineInstr &MI : MRI->use_instructions(DefMO.getReg())) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c864f03f1f0f9e..f3ef9a25dd82c1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8676,7 +8676,13 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
- return isSMRD(Opc);
+ if (MI.isCopy() || isSMRD(Opc))
+ return true;
+
+ if (SchedModel.hasInstrSchedModel())
+ return SchedModel.computeInstrLatency(Opc) < 4;
+
+ return false;
}
bool SIInstrInfo::isHighLatencyDef(int Opc) const {
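
The "< 4" cutoff leans on the subtarget's per-instruction scheduling model
and falls back to "not low latency" when none is loaded. A minimal sketch
of the query itself, assuming an initialized TargetSchedModel (SIInstrInfo
keeps one as a member):

// Sketch: the modeled latency drives the heuristic; the 4-cycle
// threshold mirrors the value chosen in this patch.
static bool isCheapToReplicate(const TargetSchedModel &SchedModel,
                               unsigned Opc) {
  if (!SchedModel.hasInstrSchedModel())
    return false; // No per-instruction data; stay conservative.
  return SchedModel.computeInstrLatency(Opc) < 4;
}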
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 1f7fff76d15210..f103eb9e97e9b4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1291,7 +1291,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
uint64_t getDefaultRsrcDataFormat() const;
uint64_t getScratchRsrcWords23() const;
- bool isLowLatencyInstruction(const MachineInstr &MI) const;
+ bool isLowLatencyInstruction(const MachineInstr &MI) const override;
bool isHighLatencyDef(int Opc) const override;
/// Return the descriptor of the target-specific machine instruction
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-aggressive-latency.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-aggressive-latency.mir
new file mode 100644
index 00000000000000..b5296a85b31352
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-aggressive-latency.mir
@@ -0,0 +1,107 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o - %s | FileCheck -check-prefixes=GFX10-SUNK %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=machine-sink --aggressive-sink-insts-into-cycles=1 -o - %s | FileCheck -check-prefixes=GFX9-SUNK %s
+
+---
+name: latency_cycle_sink
+tracksRegLiveness: true
+body: |
+ ; GFX10-SUNK-LABEL: name: latency_cycle_sink
+ ; GFX10-SUNK: bb.0:
+ ; GFX10-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX10-SUNK-NEXT: liveins: $vgpr4, $vgpr5
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX10-SUNK-NEXT: [[V_PK_MUL_LO_U16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[DEF]], 8, [[DEF]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.1:
+ ; GFX10-SUNK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit undef $scc
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.2
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.2:
+ ; GFX10-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_]], 8, [[V_PK_MUL_LO_U16_]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.3:
+ ; GFX10-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_]], 8, [[V_PK_MUL_LO_U16_]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.4:
+ ; GFX10-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; GFX10-SUNK-NEXT: S_BRANCH %bb.5
+ ; GFX10-SUNK-NEXT: {{ $}}
+ ; GFX10-SUNK-NEXT: bb.5:
+ ; GFX10-SUNK-NEXT: S_ENDPGM 0
+ ;
+ ; GFX9-SUNK-LABEL: name: latency_cycle_sink
+ ; GFX9-SUNK: bb.0:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x80000000)
+ ; GFX9-SUNK-NEXT: liveins: $vgpr4, $vgpr5
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.1
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.1:
+ ; GFX9-SUNK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit undef $scc
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.2
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.2:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[V_PK_MUL_LO_U16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[DEF]], 8, [[DEF]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_]], 8, [[V_PK_MUL_LO_U16_]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.3:
+ ; GFX9-SUNK-NEXT: successors: %bb.4(0x80000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: [[V_PK_MUL_LO_U16_2:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[DEF]], 8, [[DEF]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: [[V_PK_MUL_LO_U16_1:%[0-9]+]]:vgpr_32 = V_PK_MUL_LO_U16 8, [[V_PK_MUL_LO_U16_2]], 8, [[V_PK_MUL_LO_U16_2]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.4
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.4:
+ ; GFX9-SUNK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; GFX9-SUNK-NEXT: S_BRANCH %bb.5
+ ; GFX9-SUNK-NEXT: {{ $}}
+ ; GFX9-SUNK-NEXT: bb.5:
+ ; GFX9-SUNK-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $vgpr4, $vgpr5
+ %83:vgpr_32 = IMPLICIT_DEF
+ %80:vgpr_32 = V_PK_MUL_LO_U16 8, %83, 8, %83, 0, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ S_CBRANCH_SCC1 %bb.3, implicit undef $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ %90:vgpr_32 = V_PK_MUL_LO_U16 8, %80, 8, %80, 0, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.4
+
+ bb.3:
+ %90:vgpr_32 = V_PK_MUL_LO_U16 8, %80, 8, %80, 0, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.4
+
+ bb.4:
+ S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ S_BRANCH %bb.5
+
+ bb.5:
+ S_ENDPGM 0
+...
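
The two RUN lines make the hook's effect visible on identical input: under
gfx942 the first V_PK_MUL is cloned into bb.2 and bb.3, while under gfx1031
it stays in the preheader, evidently because the two scheduling models
report different latencies for it. A sketch of the per-candidate gate,
hoisted to the caller for illustration (in the patch the check lives inside
aggressivelySinkIntoCycle):

// Illustrative: the only difference between the two RUN lines is the
// answer from the target hook.
for (MachineInstr *I : llvm::reverse(Candidates)) {
  if (!TII->isLowLatencyInstruction(*I))
    continue; // gfx1031 here: leave the V_PK_MUL in the preheader.
  aggressivelySinkIntoCycle(Cycle, *I, SunkInstrs); // gfx942: duplicate.
}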
>From 16b16e38a0b3d6b1de887830228e960aba30d87a Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 3 Dec 2024 14:38:28 -0800
Subject: [PATCH 5/5] Extra semicolon
Change-Id: I17405578571a711f53db71df0e9329600c01fceb
---
llvm/include/llvm/CodeGen/TargetInstrInfo.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index c4c69e5129602c..bfc3450c97c024 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1812,7 +1812,7 @@ class TargetInstrInfo : public MCInstrInfo {
};
/// Return true if this opcode has high latency to its result.
- virtual bool isHighLatencyDef(int opc) const { return false; };
+ virtual bool isHighLatencyDef(int opc) const { return false; }
/// Compute operand latency between a def of 'Reg'
/// and a use in the current loop. Return true if the target considered