[llvm] r183593 - R600: Use a refined heuristic to choose when switching clause

Fri Jun 7 16:30:34 PDT 2013

Author: vljn
Date: Fri Jun  7 18:30:34 2013
New Revision: 183593

URL: http://llvm.org/viewvc/llvm-project?rev=183593&view=rev
Log:
R600: Use a refined heuristic to choose when switching clause

This is using a hint from AMD APP OpenCL Programming Guide with
empirically tweaked parameters.
I used Unigine Heaven 3.0 to determine best parameters on my system
(i7 2600/Radeon 6950/Kernel 3.9.4) the benchmark :
it went from 38.8 average fps to 39.6, which is ~3% gain.
(Lightmark 2008.2 gain is much more marginal: from 537 to 539)

There is no lit test provided as the parameter were determined
empirically and it it would be nearly impossiblet to find a test
program that check for optimal behavior.

Modified:
    llvm/trunk/lib/Target/R600/R600MachineScheduler.cpp
    llvm/trunk/lib/Target/R600/R600MachineScheduler.h

Modified: llvm/trunk/lib/Target/R600/R600MachineScheduler.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/R600MachineScheduler.cpp?rev=183593&r1=183592&r2=183593&view=diff
==============================================================================

--- llvm/trunk/lib/Target/R600/R600MachineScheduler.cpp (original)
+++ llvm/trunk/lib/Target/R600/R600MachineScheduler.cpp Fri Jun  7 18:30:34 2013
@@ -38,7 +38,8 @@ void R600SchedStrategy::initialize(Sched
 
   const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
   InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
-
+  AluInstCount = 0;
+  FetchInstCount = 0;
 }
 
 void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
@@ -48,6 +49,12 @@ void R600SchedStrategy::MoveUnits(std::v
   QSrc.clear();
 }
 
+static
+unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
+  assert (GPRCount && "GPRCount cannot be 0");
+  return 248 / GPRCount;
+}
+
 SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
   SUnit *SU = 0;
   NextInstKind = IDOther;
@@ -60,6 +67,32 @@ SUnit* R600SchedStrategy::pickNode(bool
   bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
       (!Available[IDFetch].empty() || !Available[IDOther].empty());
 
+  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
+    // We use the heuristic provided by AMD Accelerated Parallel Processing
+    // OpenCL Programming Guide :
+    // The approx. number of WF that allows TEX inst to hide ALU inst is :
+    // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
+    float ALUFetchRationEstimate = 
+        (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
+        (FetchInstCount + Available[IDFetch].size());
+    unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
+    DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
+    // We assume the local GPR requirements to be "dominated" by the requirement
+    // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
+    // after TEX are indeed likely to consume or generate values from/for the
+    // TEX clause.
+    // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
+    // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
+    // one GPR) or TmXYZW = TnXYZW (need 2 GPR).
+    // (TODO : use RegisterPressure)
+    // If we are going too use too many GPR, we flush Fetch instruction to lower
+    // register pressure on 128 bits regs.
+    unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
+    if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
+      AllowSwitchFromAlu = true;
+  }
+
+
   // We want to scheduled AR defs as soon as possible to make sure they aren't
   // put in a different ALU clause from their uses.
   if (!SU && !UnscheduledARDefs.empty()) {
@@ -133,6 +166,7 @@ void R600SchedStrategy::schedNode(SUnit
   }
 
   if (CurInstKind == IDAlu) {
+    AluInstCount ++;
     switch (getAluKind(SU)) {
     case AluT_XYZW:
       CurEmitted += 4;
@@ -158,7 +192,8 @@ void R600SchedStrategy::schedNode(SUnit
 
   if (CurInstKind != IDFetch) {
     MoveUnits(Pending[IDFetch], Available[IDFetch]);
-  }
+  } else
+    FetchInstCount++;
 }
 
 static bool
@@ -370,16 +405,15 @@ SUnit *R600SchedStrategy::AttemptFillSlo
   return UnslotedSU;
 }
 
-bool R600SchedStrategy::isAvailablesAluEmpty() const {
-  return Pending[IDAlu].empty() && AvailableAlus[AluAny].empty() &&
-      AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&
-      AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&
-      AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty() &&
-      AvailableAlus[AluPredX].empty();
+unsigned R600SchedStrategy::AvailablesAluCount() const {
+  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
+      AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
+      AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
+      AvailableAlus[AluDiscarded].size() + AvailableAlus[AluPredX].size();
 }
 
 SUnit* R600SchedStrategy::pickAlu() {
-  while (!isAvailablesAluEmpty()) {
+  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
     if (!OccupedSlotsMask) {
       // Bottom up scheduling : predX must comes first
       if (!AvailableAlus[AluPredX].empty()) {

Modified: llvm/trunk/lib/Target/R600/R600MachineScheduler.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/R600MachineScheduler.h?rev=183593&r1=183592&r2=183593&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/R600MachineScheduler.h (original)
+++ llvm/trunk/lib/Target/R600/R600MachineScheduler.h Fri Jun  7 18:30:34 2013
@@ -60,6 +60,9 @@ class R600SchedStrategy : public Machine
   int CurEmitted;
   InstKind NextInstKind;
 
+  unsigned AluInstCount;
+  unsigned FetchInstCount;
+
   int InstKindLimit[IDLast];
 
   int OccupedSlotsMask;
@@ -85,7 +88,7 @@ private:
   bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const;
   AluKind getAluKind(SUnit *SU) const;
   void LoadAlu();
-  bool isAvailablesAluEmpty() const;
+  unsigned AvailablesAluCount() const;
   SUnit *AttemptFillSlot (unsigned Slot);
   void PrepareNextSlot();
   SUnit *PopInst(std::vector<SUnit*> &Q);