[llvm] r301228 - [AMDGPU] Merge M0 initializations
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 24 12:37:54 PDT 2017
Author: rampitec
Date: Mon Apr 24 14:37:54 2017
New Revision: 301228
URL: http://llvm.org/viewvc/llvm-project?rev=301228&view=rev
Log:
[AMDGPU] Merge M0 initializations
Merges equivalent initializations of M0 and hoists them into a common
dominator block. Technically the same code can be used with any
register, physical or virtual.
Differential Revision: https://reviews.llvm.org/D32279
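To illustrate the transformation, here is a minimal before/after sketch in MIR
(hypothetical input, reduced from the kind of CFG the added merge-m0.mir test
exercises; not part of the patch itself):

  ; Before: each successor of bb.0 initializes M0 to the same immediate.
  bb.0:
    S_CBRANCH_VCCZ %bb.1, implicit undef %vcc
    S_BRANCH %bb.2
  bb.1:
    SI_INIT_M0 -1, implicit-def %m0
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
  bb.2:
    SI_INIT_M0 -1, implicit-def %m0
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec

  ; After: the identical inits are merged into a single SI_INIT_M0 hoisted to
  ; the first non-PHI position of the nearest common dominator (bb.0).
  bb.0:
    SI_INIT_M0 -1, implicit-def %m0
    S_CBRANCH_VCCZ %bb.1, implicit undef %vcc
    S_BRANCH %bb.2
  bb.1:
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
  bb.2:
    DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec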
Added:
llvm/trunk/test/CodeGen/AMDGPU/merge-m0.mir
Modified:
llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll
Modified: llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp?rev=301228&r1=301227&r2=301228&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp Mon Apr 24 14:37:54 2017
@@ -81,6 +81,11 @@ using namespace llvm;
#define DEBUG_TYPE "si-fix-sgpr-copies"
+static cl::opt<bool> EnableM0Merge(
+ "amdgpu-enable-merge-m0",
+ cl::desc("Merge and hoist M0 initializations"),
+ cl::init(false));
+
namespace {
class SIFixSGPRCopies : public MachineFunctionPass {
@@ -108,7 +113,7 @@ public:
INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
"SI Fix SGPR copies", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
"SI Fix SGPR copies", false, false)
@@ -332,27 +337,186 @@ static bool isSafeToFoldImmIntoCopy(cons
return true;
}
-static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
- const TargetRegisterInfo *TRI) {
- DenseSet<MachineBasicBlock*> Visited;
+template <class UnaryPredicate>
+bool searchPredecessors(const MachineBasicBlock *MBB,
+ const MachineBasicBlock *CutOff,
+ UnaryPredicate Predicate) {
+
+ if (MBB == CutOff)
+ return false;
+
+ DenseSet<const MachineBasicBlock*> Visited;
SmallVector<MachineBasicBlock*, 4> Worklist(MBB->pred_begin(),
MBB->pred_end());
while (!Worklist.empty()) {
- MachineBasicBlock *mbb = Worklist.back();
- Worklist.pop_back();
+ MachineBasicBlock *MBB = Worklist.pop_back_val();
- if (!Visited.insert(mbb).second)
+ if (!Visited.insert(MBB).second)
+ continue;
+ if (MBB == CutOff)
continue;
- if (hasTerminatorThatModifiesExec(*mbb, *TRI))
+ if (Predicate(MBB))
return true;
- Worklist.insert(Worklist.end(), mbb->pred_begin(), mbb->pred_end());
+ Worklist.append(MBB->pred_begin(), MBB->pred_end());
}
return false;
}
+static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
+ const TargetRegisterInfo *TRI) {
+ return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) {
+ return hasTerminatorThatModifiesExec(*MBB, *TRI); });
+}
+
+// Checks if there is a potential path from instruction From to instruction To.
+// If CutOff is specified and it sits on that path, the portion of the path
+// above CutOff is ignored and To is reported as not reachable.
+static bool isReachable(const MachineInstr *From,
+ const MachineInstr *To,
+ const MachineBasicBlock *CutOff,
+ MachineDominatorTree &MDT) {
+ // If either From block dominates To block or instructions are in the same
+ // block and From is higher.
+ if (MDT.dominates(From, To))
+ return true;
+
+ const MachineBasicBlock *MBBFrom = From->getParent();
+ const MachineBasicBlock *MBBTo = To->getParent();
+ if (MBBFrom == MBBTo)
+ return false;
+
+ // Instructions are in different blocks; do a predecessor search.
+ // We should almost never get here since we do not usually produce M0 stores
+ // other than -1.
+ return searchPredecessors(MBBTo, CutOff, [MBBFrom]
+ (const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
+}
+
+// Hoist and merge identical SGPR initializations into a common predecessor.
+// This is intended to combine M0 initializations, but can work with any
+// SGPR. A VGPR cannot be processed since we cannot guarantee vector
+// execution.
+static bool hoistAndMergeSGPRInits(unsigned Reg,
+ const MachineRegisterInfo &MRI,
+ MachineDominatorTree &MDT) {
+ // List of inits by immediate value.
+ typedef std::map<unsigned, std::list<MachineInstr*>> InitListMap;
+ InitListMap Inits;
+ // List of clobbering instructions.
+ SmallVector<MachineInstr*, 8> Clobbers;
+ bool Changed = false;
+
+ for (auto &MI : MRI.def_instructions(Reg)) {
+ MachineOperand *Imm = nullptr;
+ for (auto &MO: MI.operands()) {
+ if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
+ (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
+ Imm = nullptr;
+ break;
+ } else if (MO.isImm())
+ Imm = &MO;
+ }
+ if (Imm)
+ Inits[Imm->getImm()].push_front(&MI);
+ else
+ Clobbers.push_back(&MI);
+ }
+
+ for (auto &Init : Inits) {
+ auto &Defs = Init.second;
+
+ for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) {
+ MachineInstr *MI1 = *I1;
+
+ for (auto I2 = std::next(I1); I2 != E; ) {
+ MachineInstr *MI2 = *I2;
+
+ // Check any possible interference
+ auto intereferes = [&](MachineBasicBlock::iterator From,
+ MachineBasicBlock::iterator To) -> bool {
+
+ assert(MDT.dominates(&*To, &*From));
+
+ auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool {
+ const MachineBasicBlock *MBBFrom = From->getParent();
+ const MachineBasicBlock *MBBTo = To->getParent();
+ bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT);
+ bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT);
+ if (!MayClobberFrom && !MayClobberTo)
+ return false;
+ if ((MayClobberFrom && !MayClobberTo) ||
+ (!MayClobberFrom && MayClobberTo))
+ return true;
+ // Both can clobber: this is not an interference only if both are
+ // dominated by Clobber and belong to the same block, or if Clobber
+ // properly dominates To. Since To dominates From, Clobber then
+ // dominates both and sits in their common dominator.
+ return !((MBBFrom == MBBTo &&
+ MDT.dominates(Clobber, &*From) &&
+ MDT.dominates(Clobber, &*To)) ||
+ MDT.properlyDominates(Clobber->getParent(), MBBTo));
+ };
+
+ return (any_of(Clobbers, interferes)) ||
+ (any_of(Inits, [&](InitListMap::value_type &C) {
+ return C.first != Init.first && any_of(C.second, interferes);
+ }));
+ };
+
+ if (MDT.dominates(MI1, MI2)) {
+ if (!intereferes(MI2, MI1)) {
+ DEBUG(dbgs() << "Erasing from BB#" << MI2->getParent()->getNumber()
+ << " " << *MI2);
+ MI2->eraseFromParent();
+ Defs.erase(I2++);
+ Changed = true;
+ continue;
+ }
+ } else if (MDT.dominates(MI2, MI1)) {
+ if (!intereferes(MI1, MI2)) {
+ DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber()
+ << " " << *MI1);
+ MI1->eraseFromParent();
+ Defs.erase(I1++);
+ Changed = true;
+ break;
+ }
+ } else {
+ auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(),
+ MI2->getParent());
+ if (!MBB) {
+ ++I2;
+ continue;
+ }
+
+ MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
+ if (!intereferes(MI1, I) && !intereferes(MI2, I)) {
+ DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber()
+ << " " << *MI1 << "and moving from BB#"
+ << MI2->getParent()->getNumber() << " to BB#"
+ << I->getParent()->getNumber() << " " << *MI2);
+ I->getParent()->splice(I, MI2->getParent(), MI2);
+ MI1->eraseFromParent();
+ Defs.erase(I1++);
+ Changed = true;
+ break;
+ }
+ }
+ ++I2;
+ }
+ ++I1;
+ }
+ }
+
+ if (Changed)
+ MRI.clearKillFlags(Reg);
+
+ return Changed;
+}
+
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -485,5 +649,8 @@ bool SIFixSGPRCopies::runOnMachineFuncti
}
}
+ if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
+ hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT);
+
return true;
}
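Note that the merge is off by default: EnableM0Merge defaults to false and the
call is additionally gated on the optimization level, so existing pipelines are
unchanged. The new MIR test below enables it explicitly, with an invocation
along the lines of:

  llc -march=amdgcn -amdgpu-enable-merge-m0 -verify-machineinstrs \
      -run-pass si-fix-sgpr-copies merge-m0.mir -o -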
Modified: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp?rev=301228&r1=301227&r2=301228&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp Mon Apr 24 14:37:54 2017
@@ -146,6 +146,9 @@ BitVector SIRegisterInfo::getReservedReg
reserveRegisterTuples(Reserved, AMDGPU::EXEC);
reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
+ // M0 has to be reserved so that llvm accepts it as a live-in into a block.
+ reserveRegisterTuples(Reserved, AMDGPU::M0);
+
// Reserve the memory aperture registers.
reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
Added: llvm/trunk/test/CodeGen/AMDGPU/merge-m0.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/merge-m0.mir?rev=301228&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/merge-m0.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/merge-m0.mir Mon Apr 24 14:37:54 2017
@@ -0,0 +1,132 @@
+# RUN: llc -march=amdgcn -amdgpu-enable-merge-m0 -verify-machineinstrs -run-pass si-fix-sgpr-copies %s -o - | FileCheck -check-prefix=GCN %s
+
+# GCN: bb.0.entry:
+# GCN: SI_INIT_M0 -1
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: SI_INIT_M0 65536
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: SI_INIT_M0 -1
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: SI_INIT_M0 65536
+# GCN-NEXT: DS_WRITE_B32
+
+# GCN: bb.1:
+# GCN: SI_INIT_M0 -1
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: DS_WRITE_B32
+
+# GCN: bb.2:
+# GCN: SI_INIT_M0 65536
+# GCN-NEXT: DS_WRITE_B32
+
+# GCN: bb.3:
+# GCN: SI_INIT_M0 3
+
+# GCN: bb.4:
+# GCN-NOT: SI_INIT_M0
+# GCN: DS_WRITE_B32
+# GCN-NEXT: SI_INIT_M0 4
+# GCN-NEXT: DS_WRITE_B32
+
+# GCN: bb.5:
+# GCN-NOT: SI_INIT_M0
+# GCN: DS_WRITE_B32
+# GCN-NEXT: SI_INIT_M0 4
+# GCN-NEXT: DS_WRITE_B32
+
+# GCN: bb.6:
+# GCN: SI_INIT_M0 -1,
+# GCN-NEXT: DS_WRITE_B32
+# GCN: SI_INIT_M0 %2
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: SI_INIT_M0 %2
+# GCN-NEXT: DS_WRITE_B32
+# GCN-NEXT: SI_INIT_M0 -1
+# GCN-NEXT: DS_WRITE_B32
+
+---
+name: test
+alignment: 0
+exposesReturnsTwice: false
+noVRegs: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: sreg_32_xm0 }
+body: |
+ bb.0.entry:
+ successors: %bb.1, %bb.2
+
+ %0 = IMPLICIT_DEF
+ %1 = IMPLICIT_DEF
+ SI_INIT_M0 -1, implicit-def %m0
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ SI_INIT_M0 -1, implicit-def %m0
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ SI_INIT_M0 65536, implicit-def %m0
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ SI_INIT_M0 65536, implicit-def %m0
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ SI_INIT_M0 -1, implicit-def %m0
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ SI_INIT_M0 65536, implicit-def %m0
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ S_CBRANCH_VCCZ %bb.1, implicit undef %vcc
+ S_BRANCH %bb.2
+
+ bb.1:
+ successors: %bb.2
+ SI_INIT_M0 -1, implicit-def %m0
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ SI_INIT_M0 -1, implicit-def %m0
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ S_BRANCH %bb.2
+
+ bb.2:
+ successors: %bb.3
+ SI_INIT_M0 65536, implicit-def %m0
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ S_BRANCH %bb.3
+
+ bb.3:
+ successors: %bb.4, %bb.5
+ S_CBRANCH_VCCZ %bb.4, implicit undef %vcc
+ S_BRANCH %bb.5
+
+ bb.4:
+ successors: %bb.6
+ SI_INIT_M0 3, implicit-def %m0
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ SI_INIT_M0 4, implicit-def %m0
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ S_BRANCH %bb.6
+
+ bb.5:
+ successors: %bb.6
+ SI_INIT_M0 3, implicit-def %m0
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ SI_INIT_M0 4, implicit-def %m0
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ S_BRANCH %bb.6
+
+ bb.6:
+ successors: %bb.0.entry, %bb.6
+ SI_INIT_M0 -1, implicit-def %m0
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ %2 = IMPLICIT_DEF
+ SI_INIT_M0 %2, implicit-def %m0
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ SI_INIT_M0 %2, implicit-def %m0
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ SI_INIT_M0 -1, implicit-def %m0
+ DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec
+ S_CBRANCH_VCCZ %bb.6, implicit undef %vcc
+ S_BRANCH %bb.0.entry
+
+...
Modified: llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll?rev=301228&r1=301227&r2=301228&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll Mon Apr 24 14:37:54 2017
@@ -69,19 +69,20 @@ endif:
; TOSMEM-NOT: s_m0
; TOSMEM: s_add_u32 m0, s7, 0x100
; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
-; TOSMEM-NOT: m0
+; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it
+; FIXME-TOSMEM-NOT: m0
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
; TOSMEM: s_add_u32 m0, s7, 0x200
; TOSMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
; TOSMEM: s_mov_b64 exec,
; TOSMEM: s_cbranch_execz
; TOSMEM: s_branch
; TOSMEM: BB{{[0-9]+_[0-9]+}}:
-; TOSMEM-NEXT: s_add_u32 m0, s7, 0x200
+; TOSMEM: s_add_u32 m0, s7, 0x200
; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
@@ -130,7 +131,7 @@ endif:
; TOSMEM: s_branch
; TOSMEM: BB{{[0-9]+_[0-9]+}}:
-; TOSMEM-NEXT: s_add_u32 m0, s3, 0x100
+; TOSMEM: s_add_u32 m0, s3, 0x100
; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
; GCN-NOT: v_readlane_b32 m0
@@ -159,13 +160,14 @@ endif:
; GCN-LABEL: {{^}}restore_m0_lds:
; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]]
; TOSMEM: s_cmp_eq_u32
-; TOSMEM-NOT: m0
+; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it
+; FIXME-TOSMEM-NOT: m0
; TOSMEM: s_add_u32 m0, s3, 0x100
; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
; TOSMEM: s_add_u32 m0, s3, 0x300
; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
; TOSMEM: s_cbranch_scc1
; TOSMEM: s_mov_b32 m0, -1
@@ -178,10 +180,10 @@ endif:
; TOSMEM: ds_write_b64
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
; TOSMEM: s_add_u32 m0, s3, 0x300
; TOSMEM: s_buffer_load_dword s0, s[88:91], m0 ; 4-byte Folded Reload
-; TOSMEM-NOT: m0
+; FIXME-TOSMEM-NOT: m0
; TOSMEM: s_waitcnt lgkmcnt(0)
; TOSMEM-NOT: m0
; TOSMEM: s_mov_b32 m0, s0