[PATCH] R600: Use a refined heuristic to choose when to switch clauses

Thu Jun 6 15:15:20 PDT 2013

---
 lib/Target/R600/R600MachineScheduler.cpp | 129 +++++++++++++++++++++++++++++--
 lib/Target/R600/R600MachineScheduler.h   |   5 +-
 2 files changed, 125 insertions(+), 9 deletions(-)

diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index e4a78bf..af57022 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -38,6 +38,8 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
 
   const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
   InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
+  AluInstCount = 0;
+  FetchInstCount = 0;
 }
 
 void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
@@ -47,6 +49,90 @@ void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
   QSrc.clear();
 }
 
+static
+unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
+  switch (GPRCount) {
+  case 0:
+  case 1:
+    return 248;
+  case 2:
+    return 124;
+  case 3:
+    return 82;
+  case 4:
+    return 62;
+  case 5:
+    return 49;
+  case 6:
+    return 41;
+  case 7:
+    return 35;
+  case 8:
+    return 31;
+  case 9:
+    return 27;
+  case 10:
+    return 24;
+  case 11:
+    return 22;
+  case 12:
+    return 20;
+  case 13:
+    return 19;
+  case 14:
+    return 17;
+  case 15:
+    return 16;
+  case 16:
+    return 15;
+  case 17:
+    return 14;
+  case 18:
+  case 19:
+    return 13;
+  case 20:
+    return 12;
+  case 21:
+  case 22:
+    return 11;
+  case 23:
+  case 24:
+    return 10;
+  case 25:
+  case 26:
+  case 27:
+    return 9;
+  case 28:
+  case 29:
+  case 30:
+  case 31:
+    return 8;
+  case 32:
+  case 33:
+  case 34:
+  case 35:
+    return 7;
+  case 36:
+  case 37:
+  case 38:
+  case 39:
+  case 40:
+  case 41:
+    return 6;
+  case 42:
+  case 43:
+  case 44:
+  case 45:
+  case 46:
+  case 47:
+  case 48:
+  case 49:
+    return 5;
+  default:
+    return 2;
+  }
+}
+
 SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
   SUnit *SU = 0;
   NextInstKind = IDOther;
@@ -59,6 +145,32 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
   bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
       (!Available[IDFetch].empty() || !Available[IDOther].empty());
 
+  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
+    // We use the heuristic provided by AMD Accelerated Parallel Processing
+    // OpenCL Programming Guide :
+    // The approx. number of WF that allows TEX inst to hide ALU inst is :
+    // 500 (cycles for TEX) / (AluFetchRatio * 4 (cycles for ALU))
+    // Divide as floats : integer division would truncate the ratio.
+    float ALUFetchRationEstimate =
+        float(AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
+        (FetchInstCount + Available[IDFetch].size());
+    unsigned NeededWF = 125.0f / ALUFetchRationEstimate;
+    DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
+    // We "predict" the number of required GPRs to be the sum of :
+    // std::sqrt(std::sqrt(AluInstCount / 4.0f)) :
+    //     GPRs so far (TODO : use RegisterPressure)
+    // Available[IDFetch].size() * 0.5 : GPRs required in the Fetch clause
+    // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (using
+    // already existing GPR) or TmXYZW = TnXYZW (requiring a new GPR).
+    // If we are going to use too many GPRs, we flush Fetch instructions to
+    // lower register pressure on 128-bit regs.
+    unsigned NearRegisterRequirement =
+        std::sqrt(std::sqrt(AluInstCount / 4.0f)) +
+        Available[IDFetch].size() * 0.5;
+    if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
+      AllowSwitchFromAlu = true;
+  }
+
   // We want to scheduled AR defs as soon as possible to make sure they aren't
   // put in a different ALU clause from their uses.
   if (!SU && !UnscheduledARDefs.empty()) {
@@ -132,6 +244,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
   }
 
   if (CurInstKind == IDAlu) {
+    AluInstCount ++;
     switch (getAluKind(SU)) {
     case AluT_XYZW:
       CurEmitted += 4;
@@ -157,7 +270,8 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
 
   if (CurInstKind != IDFetch) {
     MoveUnits(Pending[IDFetch], Available[IDFetch]);
-  }
+  } else
+    FetchInstCount++;
 }
 
 static bool
@@ -369,16 +483,15 @@ SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
   return UnslotedSU;
 }
 
-bool R600SchedStrategy::isAvailablesAluEmpty() const {
-  return Pending[IDAlu].empty() && AvailableAlus[AluAny].empty() &&
-      AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&
-      AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&
-      AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty() &&
-      AvailableAlus[AluPredX].empty();
+unsigned R600SchedStrategy::AvailablesAluCount() const {
+  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
+      AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
+      AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
+      AvailableAlus[AluDiscarded].size() + AvailableAlus[AluPredX].size();
 }
 
 SUnit* R600SchedStrategy::pickAlu() {
-  while (!isAvailablesAluEmpty()) {
+  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
     if (!OccupedSlotsMask) {
       // Bottom up scheduling : predX must comes first
       if (!AvailableAlus[AluPredX].empty()) {
diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
index c5024d2..aae8b3f 100644
--- a/lib/Target/R600/R600MachineScheduler.h
+++ b/lib/Target/R600/R600MachineScheduler.h
@@ -60,6 +60,9 @@ class R600SchedStrategy : public MachineSchedStrategy {
   int CurEmitted;
   InstKind NextInstKind;
 
+  unsigned AluInstCount;
+  unsigned FetchInstCount;
+
   int InstKindLimit[IDLast];
 
   int OccupedSlotsMask;
@@ -85,7 +88,7 @@ private:
   bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const;
   AluKind getAluKind(SUnit *SU) const;
   void LoadAlu();
-  bool isAvailablesAluEmpty() const;
+  unsigned AvailablesAluCount() const;
   SUnit *AttemptFillSlot (unsigned Slot);
   void PrepareNextSlot();
   SUnit *PopInst(std::vector<SUnit*> &Q);
-- 
1.8.2.1