[PATCH] R600: Use a refined heuristic to choose when switching clause

Thu Jun 6 19:52:13 PDT 2013

On Fri, Jun 07, 2013 at 12:15:20AM +0200, Vincent Lejeune wrote:
> ---
>  lib/Target/R600/R600MachineScheduler.cpp | 129 +++++++++++++++++++++++++++++--
>  lib/Target/R600/R600MachineScheduler.h   |   5 +-
>  2 files changed, 125 insertions(+), 9 deletions(-)
>

I really don't think there is a good way to test this with lit, so you
might want to add a note to the commit message explaining why there is no
test case, and maybe also mention which app this improves performance for
and by how much.

> diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
> index e4a78bf..af57022 100644
> --- a/lib/Target/R600/R600MachineScheduler.cpp
> +++ b/lib/Target/R600/R600MachineScheduler.cpp
> @@ -38,6 +38,8 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
>  
>    const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
>    InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
> +  AluInstCount = 0;
> +  FetchInstCount = 0;
>  }
>  
>  void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
> @@ -47,6 +49,90 @@ void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
>    QSrc.clear();
>  }
>  
> +static
> +unsigned getWFCountLimitedByGPR(unsigned GPRCount) {

There is a formula for this, and it is:

WFCountLimitedByGPR = 248 / GPRCount

I think the value 248 is the maximum number of threads per SIMD, which can be
different depending on the GPU.  It should be easier to add per-GPU
constants like this once my outstanding device info patches are merged.
For now, I think we should use the formula with the value hard-coded to
248, and add a TODO comment about replacing it with a device-specific
query later.

-Tom

> +  switch (GPRCount) {
> +  case 0:
> +  case 1:
> +    return 248;
> +  case 2:
> +    return 124;
> +  case 3:
> +    return 82;
> +  case 4:
> +    return 62;
> +  case 5:
> +    return 49;
> +  case 6:
> +    return 41;
> +  case 7:
> +    return 35;
> +  case 8:
> +    return 31;
> +  case 9:
> +    return 27;
> +  case 10:
> +    return 24;
> +  case 11:
> +    return 22;
> +  case 12:
> +    return 20;
> +  case 13:
> +    return 19;
> +  case 14:
> +    return 17;
> +  case 15:
> +    return 16;
> +  case 16:
> +    return 15;
> +  case 17:
> +    return 14;
> +  case 18:
> +  case 19:
> +    return 13;
> +  case 20:
> +    return 12;
> +  case 21:
> +  case 22:
> +    return 11;
> +  case 23:
> +  case 24:
> +    return 10;
> +  case 25:
> +  case 26:
> +  case 27:
> +    return 9;
> +  case 28:
> +  case 29:
> +  case 30:
> +  case 31:
> +    return 8;
> +  case 32:
> +  case 33:
> +  case 34:
> +  case 35:
> +    return 7;
> +  case 36:
> +  case 37:
> +  case 38:
> +  case 39:
> +  case 40:
> +  case 41:
> +    return 6;
> +  case 42:
> +  case 43:
> +  case 44:
> +  case 45:
> +  case 46:
> +  case 47:
> +  case 48:
> +  case 49:
> +    return 5;
> +  default:
> +    return 2;
> +  }
> +}
> +
>  SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
>    SUnit *SU = 0;
>    NextInstKind = IDOther;
> @@ -59,6 +145,32 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
>    bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
>        (!Available[IDFetch].empty() || !Available[IDOther].empty());
>  
> +  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
> +    // We use the heuristic provided by AMD Accelerated Parallel Processing
> +    // OpenCL Programming Guide :
> +    // The approx. number of WF that allows TEX inst to hide ALU inst is :
> +    // 500 (cycles for TEX) / (AluFetchRatio * 4 (cycles for ALU))
> +    float ALUFetchRationEstimate = 
> +        (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
> +        (FetchInstCount + Available[IDFetch].size());
> +    unsigned NeededWF = 125.0f / ALUFetchRationEstimate;
> +    DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
> +    // We "predict" the number of required gpr to be :
> +    // std::sqrt(std::sqrt(AluInstCount / 4.0f)) :
> +    //     GPRS so far (TODO : use RegisterPressure)
> +    // Available[IDFetch].size() * 0.5 : GPRs required in the Fetch clause
> +    // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (using
> +    // already existing GPR) or TmXYZW = TnXYZW (requiring a new GPR).
> +    // If we are going too use too many GPR, we flush Fetch instruction to lower
> +    // register pressure on 128 bits regs.
> +    unsigned NearRegisterRequirement = 
> +        std::sqrt(std::sqrt(AluInstCount / 4.0f)) +
> +        Available[IDFetch].size() * 0.5;
> +    if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
> +      AllowSwitchFromAlu = true;
> +  }
> +
> +
>    // We want to scheduled AR defs as soon as possible to make sure they aren't
>    // put in a different ALU clause from their uses.
>    if (!SU && !UnscheduledARDefs.empty()) {
> @@ -132,6 +244,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
>    }
>  
>    if (CurInstKind == IDAlu) {
> +    AluInstCount ++;
>      switch (getAluKind(SU)) {
>      case AluT_XYZW:
>        CurEmitted += 4;
> @@ -157,7 +270,8 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
>  
>    if (CurInstKind != IDFetch) {
>      MoveUnits(Pending[IDFetch], Available[IDFetch]);
> -  }
> +  } else
> +    FetchInstCount++;
>  }
>  
>  static bool
> @@ -369,16 +483,15 @@ SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
>    return UnslotedSU;
>  }
>  
> -bool R600SchedStrategy::isAvailablesAluEmpty() const {
> -  return Pending[IDAlu].empty() && AvailableAlus[AluAny].empty() &&
> -      AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&
> -      AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&
> -      AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty() &&
> -      AvailableAlus[AluPredX].empty();
> +unsigned R600SchedStrategy::AvailablesAluCount() const {
> +  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
> +      AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
> +      AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
> +      AvailableAlus[AluDiscarded].size() + AvailableAlus[AluPredX].size();
>  }
>  
>  SUnit* R600SchedStrategy::pickAlu() {
> -  while (!isAvailablesAluEmpty()) {
> +  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
>      if (!OccupedSlotsMask) {
>        // Bottom up scheduling : predX must comes first
>        if (!AvailableAlus[AluPredX].empty()) {
> diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
> index c5024d2..aae8b3f 100644
> --- a/lib/Target/R600/R600MachineScheduler.h
> +++ b/lib/Target/R600/R600MachineScheduler.h
> @@ -60,6 +60,9 @@ class R600SchedStrategy : public MachineSchedStrategy {
>    int CurEmitted;
>    InstKind NextInstKind;
>  
> +  unsigned AluInstCount;
> +  unsigned FetchInstCount;
> +
>    int InstKindLimit[IDLast];
>  
>    int OccupedSlotsMask;
> @@ -85,7 +88,7 @@ private:
>    bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const;
>    AluKind getAluKind(SUnit *SU) const;
>    void LoadAlu();
> -  bool isAvailablesAluEmpty() const;
> +  unsigned AvailablesAluCount() const;
>    SUnit *AttemptFillSlot (unsigned Slot);
>    void PrepareNextSlot();
>    SUnit *PopInst(std::vector<SUnit*> &Q);
> -- 
> 1.8.2.1
> 
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits