[PATCH] R600: Use a refined heuristic to choose when switching clause

Fri Jun 7 08:38:58 PDT 2013



----- Mail original -----
> De : Tom Stellard <tom at stellard.net>
> À : Vincent Lejeune <vljn at ovi.com>
> Cc : llvm-commits at cs.uiuc.edu
> Envoyé le : Vendredi 7 juin 2013 4h52
> Objet : Re: [PATCH] R600: Use a refined heuristic to choose when switching clause
> 
> On Fri, Jun 07, 2013 at 12:15:20AM +0200, Vincent Lejeune wrote:
>>  ---
>>   lib/Target/R600/R600MachineScheduler.cpp | 129 +++++++++++++++++++++++++++++--
>>   lib/Target/R600/R600MachineScheduler.h   |   5 +-
>>   2 files changed, 125 insertions(+), 9 deletions(-)
>> 
> 
> I really don't think there is a good way to test this with lit, so you
> might want to add a comment to the commit message explaining why and
> maybe also mention which app this improves performance for and by how
> much.
> 
>>  diff --git a/lib/Target/R600/R600MachineScheduler.cpp 
> b/lib/Target/R600/R600MachineScheduler.cpp
>>  index e4a78bf..af57022 100644
>>  --- a/lib/Target/R600/R600MachineScheduler.cpp
>>  +++ b/lib/Target/R600/R600MachineScheduler.cpp
>>  @@ -38,6 +38,8 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
>>   
>>     const AMDGPUSubtarget &ST = 
> DAG->TM.getSubtarget<AMDGPUSubtarget>();
>>     InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
>>  +  AluInstCount = 0;
>>  +  FetchInstCount = 0;
>>   }
>>   
>>   void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
>>  @@ -47,6 +49,90 @@ void R600SchedStrategy::MoveUnits(std::vector<SUnit 
> *> &QSrc,
>>     QSrc.clear();
>>   }
>>   
>>  +static
>>  +unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
> 
> There is a formula for this, and it is:
> 
> WFCountLimitedByGPR = 248 / GPRCount
> 
> I think the value 248 is the maximum number of threads per SIMD, which can be
> different depending on the GPU.  It should be easier to add per GPU
> constants like this once my outstanding device info patches are merged,
> so for now I think we should use the formula and hard-code the value to
> 248 and then add a TODO comment for adding a device specific query later.
> 
> -Tom
> 
>>  +  switch (GPRCount) {
>>  +  case 0:
>>  +  case 1:
>>  +    return 248;
>>  +  case 2:
>>  +    return 124;
>>  +  case 3:
>>  +    return 82;
>>  +  case 4:
>>  +    return 62;
>>  +  case 5:
>>  +    return 49;
>>  +  case 6:
>>  +    return 41;
>>  +  case 7:
>>  +    return 35;
>>  +  case 8:
>>  +    return 31;
>>  +  case 9:
>>  +    return 27;
>>  +  case 10:
>>  +    return 24;
>>  +  case 11:
>>  +    return 22;
>>  +  case 12:
>>  +    return 20;
>>  +  case 13:
>>  +    return 19;
>>  +  case 14:
>>  +    return 17;
>>  +  case 15:
>>  +    return 16;
>>  +  case 16:
>>  +    return 15;
>>  +  case 17:
>>  +    return 14;
>>  +  case 18:
>>  +  case 19:
>>  +    return 13;
>>  +  case 20:
>>  +    return 12;
>>  +  case 21:
>>  +  case 22:
>>  +    return 11;
>>  +  case 23:
>>  +  case 24:
>>  +    return 10;
>>  +  case 25:
>>  +  case 26:
>>  +  case 27:
>>  +    return 9;
>>  +  case 28:
>>  +  case 29:
>>  +  case 30:
>>  +  case 31:
>>  +    return 8;
>>  +  case 32:
>>  +  case 33:
>>  +  case 34:
>>  +  case 35:
>>  +    return 7;
>>  +  case 36:
>>  +  case 37:
>>  +  case 38:
>>  +  case 39:
>>  +  case 40:
>>  +  case 41:
>>  +    return 6;
>>  +  case 42:
>>  +  case 43:
>>  +  case 44:
>>  +  case 45:
>>  +  case 46:
>>  +  case 47:
>>  +  case 48:
>>  +  case 49:
>>  +    return 5;
>>  +  default:
>>  +    return 2;
>>  +  }
>>  +}
>>  +
>>   SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
>>     SUnit *SU = 0;
>>     NextInstKind = IDOther;
>>  @@ -59,6 +145,32 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) 
> {
>>     bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) 
> &&
>>         (!Available[IDFetch].empty() || !Available[IDOther].empty());
>>   
>>  +  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
>>  +    // We use the heuristic provided by AMD Accelerated Parallel 
> Processing
>>  +    // OpenCL Programming Guide :
>>  +    // The approx. number of WF that allows TEX inst to hide ALU inst is :
>>  +    // 500 (cycles for TEX) / (AluFetchRatio * 4 (cycles for ALU))
>>  +    float ALUFetchRationEstimate = 
>>  +        (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
>>  +        (FetchInstCount + Available[IDFetch].size());
>>  +    unsigned NeededWF = 125.0f / ALUFetchRationEstimate;
>>  +    DEBUG( dbgs() << NeededWF << " approx. Wavefronts 
> Required\n" );
>>  +    // We "predict" the number of required gpr to be :
>>  +    // std::sqrt(std::sqrt(AluInstCount / 4.0f)) :
>>  +    //     GPRS so far (TODO : use RegisterPressure)
>>  +    // Available[IDFetch].size() * 0.5 : GPRs required in the Fetch clause
>>  +    // We assume that fetch instructions are either TnXYZW = TEX TnXYZW 
> (using
>>  +    // already existing GPR) or TmXYZW = TnXYZW (requiring a new GPR).
>>  +    // If we are going to use too many GPR, we flush Fetch instruction to lower
>>  +    // register pressure on 128 bits regs.
>>  +    unsigned NearRegisterRequirement = 
>>  +        std::sqrt(std::sqrt(AluInstCount / 4.0f)) +
>>  +        Available[IDFetch].size() * 0.5;
>>  +    if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
>>  +      AllowSwitchFromAlu = true;
>>  +  }
>>  +
>>  +
>>     // We want to scheduled AR defs as soon as possible to make sure they 
> aren't
>>     // put in a different ALU clause from their uses.
>>     if (!SU && !UnscheduledARDefs.empty()) {
>>  @@ -132,6 +244,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool 
> IsTopNode) {
>>     }
>>   
>>     if (CurInstKind == IDAlu) {
>>  +    AluInstCount ++;
>>       switch (getAluKind(SU)) {
>>       case AluT_XYZW:
>>         CurEmitted += 4;
>>  @@ -157,7 +270,8 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool 
> IsTopNode) {
>>   
>>     if (CurInstKind != IDFetch) {
>>       MoveUnits(Pending[IDFetch], Available[IDFetch]);
>>  -  }
>>  +  } else
>>  +    FetchInstCount++;
>>   }
>>   
>>   static bool
>>  @@ -369,16 +483,15 @@ SUnit *R600SchedStrategy::AttemptFillSlot(unsigned 
> Slot) {
>>     return UnslotedSU;
>>   }
>>   
>>  -bool R600SchedStrategy::isAvailablesAluEmpty() const {
>>  -  return Pending[IDAlu].empty() && AvailableAlus[AluAny].empty() 
> &&
>>  -      AvailableAlus[AluT_XYZW].empty() && 
> AvailableAlus[AluT_X].empty() &&
>>  -      AvailableAlus[AluT_Y].empty() && 
> AvailableAlus[AluT_Z].empty() &&
>>  -      AvailableAlus[AluT_W].empty() && 
> AvailableAlus[AluDiscarded].empty() &&
>>  -      AvailableAlus[AluPredX].empty();
>>  +unsigned R600SchedStrategy::AvailablesAluCount() const {
>>  +  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
>>  +      AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
>>  +      AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
>>  +      AvailableAlus[AluDiscarded].size() + AvailableAlus[AluPredX].size();
>>   }
>>   
>>   SUnit* R600SchedStrategy::pickAlu() {
>>  -  while (!isAvailablesAluEmpty()) {
>>  +  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
>>       if (!OccupedSlotsMask) {
>>         // Bottom up scheduling : predX must comes first
>>         if (!AvailableAlus[AluPredX].empty()) {
>>  diff --git a/lib/Target/R600/R600MachineScheduler.h 
> b/lib/Target/R600/R600MachineScheduler.h
>>  index c5024d2..aae8b3f 100644
>>  --- a/lib/Target/R600/R600MachineScheduler.h
>>  +++ b/lib/Target/R600/R600MachineScheduler.h
>>  @@ -60,6 +60,9 @@ class R600SchedStrategy : public MachineSchedStrategy {
>>     int CurEmitted;
>>     InstKind NextInstKind;
>>   
>>  +  unsigned AluInstCount;
>>  +  unsigned FetchInstCount;
>>  +
>>     int InstKindLimit[IDLast];
>>   
>>     int OccupedSlotsMask;
>>  @@ -85,7 +88,7 @@ private:
>>     bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) 
> const;
>>     AluKind getAluKind(SUnit *SU) const;
>>     void LoadAlu();
>>  -  bool isAvailablesAluEmpty() const;
>>  +  unsigned AvailablesAluCount() const;
>>     SUnit *AttemptFillSlot (unsigned Slot);
>>     void PrepareNextSlot();
>>     SUnit *PopInst(std::vector<SUnit*> &Q);
>>  -- 
>>  1.8.2.1
>> 
>>  _______________________________________________
>>  llvm-commits mailing list
>>  llvm-commits at cs.uiuc.edu
>>  http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
> 
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-R600-Use-a-refined-heuristic-to-choose-when-switchin.patch
Type: application/octet-stream
Size: 5788 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20130607/0b9f1f8c/attachment.obj>