[PATCH] R600: Use a refined heuristic to choose when switching clause

Fri Jun 7 08:57:10 PDT 2013

> From 951ebad474e2548a76aabcd3e1df3792f99d8a1d Mon Sep 17 00:00:00 2001
> From: Vincent Lejeune <vljn at ovi.com>
> Date: Thu, 6 Jun 2013 19:43:51 +0200
> Subject: [PATCH] R600: Use a refined heuristic to choose when switching clause
> 
> This is using a hint from AMD APP OpenCL Programming Guide with
> empirically tweaked parameters.
> I used Unigine Heaven 3.0 to determine the best parameters on my system
> (i7 2600/Radeon 6950/Kernel 3.9.4): the benchmark
> went from 38.8 average fps to 39.6, which is a ~2% gain.
> (Lightmark 2008.2 gain is much more marginal: from 537 to 539)

I still think you should add a comment about why there are no lit tests,
maybe something like:

There are no lit tests, because a test program for this would be extremely
complex and hard to maintain across future changes to the scheduler.

> ---
>  lib/Target/R600/R600MachineScheduler.cpp | 50 +++++++++++++++++++++++++++-----
>  lib/Target/R600/R600MachineScheduler.h   |  5 +++-
>  2 files changed, 46 insertions(+), 9 deletions(-)
> 
> diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
> index e1badba..e7e4921 100644
> --- a/lib/Target/R600/R600MachineScheduler.cpp
> +++ b/lib/Target/R600/R600MachineScheduler.cpp
> @@ -243,6 +243,8 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
>  
>    const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
>    InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
> +  AluInstCount = 0;
> +  FetchInstCount = 0;
>  }
>  
>  void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
> @@ -252,6 +254,11 @@ void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
>    QSrc.clear();
>  }
>  
> +static
> +unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
> +  return 248 / GPRCount;

You need a special case here for when GPRCount is zero.

With those changes:

Reviewed-by: Tom Stellard <thomas.stellard at amd.com>

> +}
> +
>  SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
>    SUnit *SU = 0;
>    NextInstKind = IDOther;
> @@ -264,6 +271,32 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
>    bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
>        (!Available[IDFetch].empty() || !Available[IDOther].empty());
>  
> +  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
> +    // We use the heuristic provided by AMD Accelerated Parallel Processing
> +    // OpenCL Programming Guide :
> +    // The approx. number of WF that allows TEX inst to hide ALU inst is :
> +    // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
> +    float ALUFetchRationEstimate = 
> +        (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
> +        (FetchInstCount + Available[IDFetch].size());
> +    unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
> +    DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
> +    // We assume the local GPR requirements to be "dominated" by the requirement
> +    // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
> +    // after TEX are indeed likely to consume or generate values from/for the
> +    // TEX clause.
> +    // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
> +    // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
> +    // one GPR) or TmXYZW = TnXYZW (need 2 GPR).
> +    // (TODO : use RegisterPressure)
> +    // If we are going to use too many GPR, we flush Fetch instruction to lower
> +    // register pressure on 128 bits regs.
> +    unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
> +    if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
> +      AllowSwitchFromAlu = true;
> +  }
> +
> +
>    // We want to scheduled AR defs as soon as possible to make sure they aren't
>    // put in a different ALU clause from their uses.
>    if (!SU && !UnscheduledARDefs.empty()) {
> @@ -337,6 +370,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
>    }
>  
>    if (CurInstKind == IDAlu) {
> +    AluInstCount ++;
>      switch (getAluKind(SU)) {
>      case AluT_XYZW:
>        CurEmitted += 4;
> @@ -362,7 +396,8 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
>  
>    if (CurInstKind != IDFetch) {
>      MoveUnits(Pending[IDFetch], Available[IDFetch]);
> -  }
> +  } else
> +    FetchInstCount++;
>  }
>  
>  static bool
> @@ -574,16 +609,15 @@ SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
>    return UnslotedSU;
>  }
>  
> -bool R600SchedStrategy::isAvailablesAluEmpty() const {
> -  return Pending[IDAlu].empty() && AvailableAlus[AluAny].empty() &&
> -      AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&
> -      AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&
> -      AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty() &&
> -      AvailableAlus[AluPredX].empty();
> +unsigned R600SchedStrategy::AvailablesAluCount() const {
> +  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
> +      AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
> +      AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
> +      AvailableAlus[AluDiscarded].size() + AvailableAlus[AluPredX].size();
>  }
>  
>  SUnit* R600SchedStrategy::pickAlu() {
> -  while (!isAvailablesAluEmpty()) {
> +  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
>      if (!OccupedSlotsMask) {
>        // Bottom up scheduling : predX must comes first
>        if (!AvailableAlus[AluPredX].empty()) {
> diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
> index ff13f25..5dbe266 100644
> --- a/lib/Target/R600/R600MachineScheduler.h
> +++ b/lib/Target/R600/R600MachineScheduler.h
> @@ -61,6 +61,9 @@ class R600SchedStrategy : public MachineSchedStrategy {
>    int CurEmitted;
>    InstKind NextInstKind;
>  
> +  unsigned AluInstCount;
> +  unsigned FetchInstCount;
> +
>    int InstKindLimit[IDLast];
>  
>    int OccupedSlotsMask;
> @@ -86,7 +89,7 @@ private:
>    bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const;
>    AluKind getAluKind(SUnit *SU) const;
>    void LoadAlu();
> -  bool isAvailablesAluEmpty() const;
> +  unsigned AvailablesAluCount() const;
>    SUnit *AttemptFillSlot (unsigned Slot);
>    void PrepareNextSlot();
>    SUnit *PopInst(std::vector<SUnit*> &Q);
> -- 
> 1.8.2.1
>