[llvm] 53eb0f8 - [AMDGPU] Attempt to reschedule without clustering
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 27 10:27:25 PST 2020
Author: Stanislav Mekhanoshin
Date: 2020-01-27T10:27:16-08:00
New Revision: 53eb0f8c07130d19cc79a439fbd797ffd45a49da
URL: https://github.com/llvm/llvm-project/commit/53eb0f8c07130d19cc79a439fbd797ffd45a49da
DIFF: https://github.com/llvm/llvm-project/commit/53eb0f8c07130d19cc79a439fbd797ffd45a49da.diff
LOG: [AMDGPU] Attempt to reschedule without clustering
We want to have more load/store clustering, but we also want
to maintain low register pressure, and these are opposing goals.
Allow the scheduler to reschedule regions without mutations
applied if we hit a register limit.
Differential Revision: https://reviews.llvm.org/D73386
Added:
llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
Modified:
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index e109eed5f607..98added38b9c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -316,13 +316,13 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
ST(MF.getSubtarget<GCNSubtarget>()),
MFI(*MF.getInfo<SIMachineFunctionInfo>()),
StartingOccupancy(MFI.getOccupancy()),
- MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) {
+ MinOccupancy(StartingOccupancy), Stage(Collect), RegionIdx(0) {
LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
}
void GCNScheduleDAGMILive::schedule() {
- if (Stage == 0) {
+ if (Stage == Collect) {
// Just record regions at the first pass.
Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
return;
@@ -348,6 +348,7 @@ void GCNScheduleDAGMILive::schedule() {
ScheduleDAGMILive::schedule();
Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
+ RescheduleRegions[RegionIdx] = false;
if (!LIS)
return;
@@ -389,20 +390,28 @@ void GCNScheduleDAGMILive::schedule() {
<< MinOccupancy << ".\n");
}
+ unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
+ unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
+ if (PressureAfter.getVGPRNum() > MaxVGPRs ||
+ PressureAfter.getSGPRNum() > MaxSGPRs)
+ RescheduleRegions[RegionIdx] = true;
+
if (WavesAfter >= MinOccupancy) {
- unsigned TotalVGPRs = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
- unsigned TotalSGPRs = AMDGPU::IsaInfo::getAddressableNumSGPRs(&ST);
- if (WavesAfter > MFI.getMinWavesPerEU() ||
+ if (Stage == UnclusteredReschedule &&
+ !PressureAfter.less(ST, PressureBefore)) {
+ LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
+ } else if (WavesAfter > MFI.getMinWavesPerEU() ||
PressureAfter.less(ST, PressureBefore) ||
- (TotalVGPRs >= PressureAfter.getVGPRNum() &&
- TotalSGPRs >= PressureAfter.getSGPRNum())) {
+ !RescheduleRegions[RegionIdx]) {
Pressure[RegionIdx] = PressureAfter;
return;
+ } else {
+ LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
}
- LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
}
LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
+ RescheduleRegions[RegionIdx] = true;
RegionEnd = RegionBegin;
for (MachineInstr *MI : Unsched) {
if (MI->isDebugInstr())
@@ -532,33 +541,55 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
LiveIns.resize(Regions.size());
Pressure.resize(Regions.size());
+ RescheduleRegions.resize(Regions.size());
+ RescheduleRegions.set();
if (!Regions.empty())
BBLiveInMap = getBBLiveInMap();
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
+
do {
Stage++;
RegionIdx = 0;
MachineBasicBlock *MBB = nullptr;
- if (Stage > 1) {
+ if (Stage > InitialSchedule) {
+ if (!LIS)
+ break;
+
// Retry function scheduling if we found resulting occupancy and it is
// lower than used for first pass scheduling. This will give more freedom
// to schedule low register pressure blocks.
// Code is partially copied from MachineSchedulerBase::scheduleRegions().
- if (!LIS || StartingOccupancy <= MinOccupancy)
- break;
+ if (Stage == UnclusteredReschedule) {
+ if (RescheduleRegions.none())
+ continue;
+ LLVM_DEBUG(dbgs() <<
+ "Retrying function scheduling without clustering.\n");
+ }
+
+ if (Stage == ClusteredLowOccupancyReschedule) {
+ if (StartingOccupancy <= MinOccupancy)
+ break;
- LLVM_DEBUG(
- dbgs()
- << "Retrying function scheduling with lowest recorded occupancy "
- << MinOccupancy << ".\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "Retrying function scheduling with lowest recorded occupancy "
+ << MinOccupancy << ".\n");
- S.setTargetOccupancy(MinOccupancy);
+ S.setTargetOccupancy(MinOccupancy);
+ }
}
+ if (Stage == UnclusteredReschedule)
+ SavedMutations.swap(Mutations);
+
for (auto Region : Regions) {
+ if (Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx])
+ continue;
+
RegionBegin = Region.first;
RegionEnd = Region.second;
@@ -566,7 +597,7 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
if (MBB) finishBlock();
MBB = RegionBegin->getParent();
startBlock(MBB);
- if (Stage == 1)
+ if (Stage == InitialSchedule)
computeBlockPressure(MBB);
}
@@ -594,5 +625,7 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
}
finishBlock();
- } while (Stage < 2);
+ if (Stage == UnclusteredReschedule)
+ SavedMutations.swap(Mutations);
+ } while (Stage != LastStage);
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index dd687a930c79..2d81d9977c31 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -64,6 +64,14 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
+ enum : unsigned {
+ Collect,
+ InitialSchedule,
+ UnclusteredReschedule,
+ ClusteredLowOccupancyReschedule,
+ LastStage = ClusteredLowOccupancyReschedule
+ };
+
const GCNSubtarget &ST;
SIMachineFunctionInfo &MFI;
@@ -84,6 +92,10 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
SmallVector<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>, 32> Regions;
+ // Records if a region is not yet scheduled, or schedule has been reverted,
+ // or we generally desire to reschedule it.
+ BitVector RescheduleRegions;
+
// Region live-in cache.
SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
new file mode 100644
index 000000000000..884d0cbd4dbe
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
@@ -0,0 +1,36 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Interleave loads and stores to fit into 9 VGPR limit.
+; This requires to avoid load/store clustering.
+
+; GCN: global_load_dwordx4
+; GCN: global_store_dwordx4
+; GCN: global_load_dwordx4
+; GCN: global_store_dwordx4
+; GCN: global_load_dwordx4
+; GCN: global_store_dwordx4
+; GCN: NumVgprs: {{[0-9]$}}
+; GCN: ScratchSize: 0{{$}}
+
+define amdgpu_kernel void @load_store_max_9vgprs(<4 x i32> addrspace(1)* nocapture noalias readonly %arg, <4 x i32> addrspace(1)* nocapture noalias %arg1) #1 {
+bb:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %base = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %id
+ %tmp = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 1
+ %tmp2 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp, align 4
+ %tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 3
+ %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 4
+ %tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 5
+ %tmp6 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp5, align 4
+ store <4 x i32> %tmp2, <4 x i32> addrspace(1)* %arg1, align 4
+ %tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 3
+ store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp7, align 4
+ %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 5
+ store <4 x i32> %tmp6, <4 x i32> addrspace(1)* %tmp8, align 4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { "amdgpu-num-vgpr"="9" }
More information about the llvm-commits
mailing list