[PATCH] R600: Use a refined heuristic to choose when switching clause
Vincent Lejeune
vljn at ovi.com
Thu Jun 6 15:15:20 PDT 2013
---
lib/Target/R600/R600MachineScheduler.cpp | 129 +++++++++++++++++++++++++++++--
lib/Target/R600/R600MachineScheduler.h | 5 +-
2 files changed, 125 insertions(+), 9 deletions(-)
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index e4a78bf..af57022 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -38,6 +38,8 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
+ AluInstCount = 0;
+ FetchInstCount = 0;
}
void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
@@ -47,6 +49,90 @@ void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
QSrc.clear();
}
+static
+unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
+ switch (GPRCount) {
+ case 0:
+ case 1:
+ return 248;
+ case 2:
+ return 124;
+ case 3:
+ return 82;
+ case 4:
+ return 62;
+ case 5:
+ return 49;
+ case 6:
+ return 41;
+ case 7:
+ return 35;
+ case 8:
+ return 31;
+ case 9:
+ return 27;
+ case 10:
+ return 24;
+ case 11:
+ return 22;
+ case 12:
+ return 20;
+ case 13:
+ return 19;
+ case 14:
+ return 17;
+ case 15:
+ return 16;
+ case 16:
+ return 15;
+ case 17:
+ return 14;
+ case 18:
+ case 19:
+ return 13;
+ case 20:
+ return 12;
+ case 21:
+ case 22:
+ return 11;
+ case 23:
+ case 24:
+ return 10;
+ case 25:
+ case 26:
+ case 27:
+ return 9;
+ case 28:
+ case 29:
+ case 30:
+ case 31:
+ return 8;
+ case 32:
+ case 33:
+ case 34:
+ case 35:
+ return 7;
+ case 36:
+ case 37:
+ case 38:
+ case 39:
+ case 40:
+ case 41:
+ return 6;
+ case 42:
+ case 43:
+ case 44:
+ case 45:
+ case 46:
+ case 47:
+ case 48:
+ case 49:
+ return 5;
+ default:
+ return 2;
+ }
+}
+
SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
SUnit *SU = 0;
NextInstKind = IDOther;
@@ -59,6 +145,32 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
(!Available[IDFetch].empty() || !Available[IDOther].empty());
+ if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
+ // We use the heuristic provided by AMD Accelerated Parallel Processing
+ // OpenCL Programming Guide :
+ // The approx. number of WF that allows TEX inst to hide ALU inst is :
+ // 500 (cycles for TEX) / (AluFetchRatio * 4 (cycles for ALU))
+ float ALUFetchRatioEstimate =
+ static_cast<float>(AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
+ (FetchInstCount + Available[IDFetch].size());
+ unsigned NeededWF = 125.0f / ALUFetchRatioEstimate;
+ DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
+ // We "predict" the number of required GPRs to be :
+ // std::sqrt(std::sqrt(AluInstCount / 4.0f)) :
+ // GPRS so far (TODO : use RegisterPressure)
+ // Available[IDFetch].size() * 0.5 : GPRs required in the Fetch clause
+ // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (using
+ // already existing GPR) or TmXYZW = TnXYZW (requiring a new GPR).
+ // If we are going to use too many GPRs, we flush the Fetch instructions to
+ // lower register pressure on 128-bit regs.
+ unsigned NearRegisterRequirement =
+ std::sqrt(std::sqrt(AluInstCount / 4.0f)) +
+ Available[IDFetch].size() * 0.5;
+ if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
+ AllowSwitchFromAlu = true;
+ }
+
+
// We want to scheduled AR defs as soon as possible to make sure they aren't
// put in a different ALU clause from their uses.
if (!SU && !UnscheduledARDefs.empty()) {
@@ -132,6 +244,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
}
if (CurInstKind == IDAlu) {
+ AluInstCount ++;
switch (getAluKind(SU)) {
case AluT_XYZW:
CurEmitted += 4;
@@ -157,7 +270,8 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
if (CurInstKind != IDFetch) {
MoveUnits(Pending[IDFetch], Available[IDFetch]);
- }
+ } else
+ FetchInstCount++;
}
static bool
@@ -369,16 +483,15 @@ SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
return UnslotedSU;
}
-bool R600SchedStrategy::isAvailablesAluEmpty() const {
- return Pending[IDAlu].empty() && AvailableAlus[AluAny].empty() &&
- AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&
- AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&
- AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty() &&
- AvailableAlus[AluPredX].empty();
+unsigned R600SchedStrategy::AvailablesAluCount() const {
+ return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
+ AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
+ AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
+ AvailableAlus[AluDiscarded].size() + AvailableAlus[AluPredX].size();
}
SUnit* R600SchedStrategy::pickAlu() {
- while (!isAvailablesAluEmpty()) {
+ while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
if (!OccupedSlotsMask) {
// Bottom up scheduling : predX must comes first
if (!AvailableAlus[AluPredX].empty()) {
diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
index c5024d2..aae8b3f 100644
--- a/lib/Target/R600/R600MachineScheduler.h
+++ b/lib/Target/R600/R600MachineScheduler.h
@@ -60,6 +60,9 @@ class R600SchedStrategy : public MachineSchedStrategy {
int CurEmitted;
InstKind NextInstKind;
+ unsigned AluInstCount;
+ unsigned FetchInstCount;
+
int InstKindLimit[IDLast];
int OccupedSlotsMask;
@@ -85,7 +88,7 @@ private:
bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const;
AluKind getAluKind(SUnit *SU) const;
void LoadAlu();
- bool isAvailablesAluEmpty() const;
+ unsigned AvailablesAluCount() const;
SUnit *AttemptFillSlot (unsigned Slot);
void PrepareNextSlot();
SUnit *PopInst(std::vector<SUnit*> &Q);
--
1.8.2.1
More information about the llvm-commits
mailing list