[PATCH] R600: Use a refined heuristic to choose when switching clause
Tom Stellard
tom at stellard.net
Fri Jun 7 08:57:10 PDT 2013
> From 951ebad474e2548a76aabcd3e1df3792f99d8a1d Mon Sep 17 00:00:00 2001
> From: Vincent Lejeune <vljn at ovi.com>
> Date: Thu, 6 Jun 2013 19:43:51 +0200
> Subject: [PATCH] R600: Use a refined heuristic to choose when switching clause
>
> This uses a hint from the AMD APP OpenCL Programming Guide, with
> empirically tweaked parameters.
> I used Unigine Heaven 3.0 on my system (i7 2600/Radeon 6950/Kernel 3.9.4)
> to determine the best parameters: the benchmark went from 38.8 average fps
> to 39.6, which is a ~2% gain.
> (Lightmark 2008.2 gain is much more marginal: from 537 to 539)
I still think you should add a comment about why there are no lit tests,
maybe something like:
There are no lit tests, because a test program for this would be extremely
complex and hard to maintain across future changes to the scheduler.
> ---
> lib/Target/R600/R600MachineScheduler.cpp | 50 +++++++++++++++++++++++++++-----
> lib/Target/R600/R600MachineScheduler.h | 5 +++-
> 2 files changed, 46 insertions(+), 9 deletions(-)
>
> diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
> index e1badba..e7e4921 100644
> --- a/lib/Target/R600/R600MachineScheduler.cpp
> +++ b/lib/Target/R600/R600MachineScheduler.cpp
> @@ -243,6 +243,8 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
>
> const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
> InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
> + AluInstCount = 0;
> + FetchInstCount = 0;
> }
>
> void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
> @@ -252,6 +254,11 @@ void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
> QSrc.clear();
> }
>
> +static
> +unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
> + return 248 / GPRCount;
You need a special case here for when GPRCount is zero, otherwise this
divides by zero.
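Something like this would work (a minimal sketch; the fallback value used
when GPRCount is zero is just a suggestion):

  static
  unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
    // Avoid dividing by zero when no GPRs are required yet; assume the
    // full wavefront budget in that case.
    if (GPRCount == 0)
      return 248;
    return 248 / GPRCount;
  }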
With those changes:
Reviewed-by: Tom Stellard <thomas.stellard at amd.com>
> +}
> +
> SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
> SUnit *SU = 0;
> NextInstKind = IDOther;
> @@ -264,6 +271,32 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
> bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
> (!Available[IDFetch].empty() || !Available[IDOther].empty());
>
> + if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
> + // We use the heuristic provided by AMD Accelerated Parallel Processing
> + // OpenCL Programming Guide :
> + // The approx. number of WF that allows TEX inst to hide ALU inst is :
> + // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
> + float ALUFetchRationEstimate =
> + (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
> + (FetchInstCount + Available[IDFetch].size());
> + unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
> + DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
> + // We assume the local GPR requirements to be "dominated" by the requirement
> + // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
> + // after TEX are indeed likely to consume or generate values from/for the
> + // TEX clause.
> + // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
> + // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
> + // one GPR) or TmXYZW = TnXYZW (need 2 GPR).
> + // (TODO : use RegisterPressure)
> + // If we are going too use too many GPR, we flush Fetch instruction to lower
> + // register pressure on 128 bits regs.
> + unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
> + if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
> + AllowSwitchFromAlu = true;
> + }
> +
> +
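Just to sanity-check the numbers, here is a rough standalone sketch of what
this heuristic computes for one made-up scheduling state (the names loosely
follow the patch, but every value below is illustrative):

  #include <cstdio>

  int main() {
    // Say 40 ALU instructions are counted or available against 4 emitted
    // fetches and 8 available fetches.
    float AluFetchRatio = (40.f + 0.f + 0.f) / (4.f + 8.f);   // ~3.33
    unsigned NeededWF = 62.5f / AluFetchRatio;                // 18 wavefronts
    // 8 available fetches, assumed to need 2 GPRs each.
    unsigned NearRegisterRequirement = 2 * 8;                 // 16 GPRs
    unsigned WFLimitedByGPR = 248 / NearRegisterRequirement;  // 15 wavefronts
    // 18 > 15, so the heuristic would set AllowSwitchFromAlu here.
    printf("needed %u WF, GPR budget allows %u WF\n", NeededWF, WFLimitedByGPR);
    return 0;
  }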
> // We want to scheduled AR defs as soon as possible to make sure they aren't
> // put in a different ALU clause from their uses.
> if (!SU && !UnscheduledARDefs.empty()) {
> @@ -337,6 +370,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
> }
>
> if (CurInstKind == IDAlu) {
> + AluInstCount ++;
> switch (getAluKind(SU)) {
> case AluT_XYZW:
> CurEmitted += 4;
> @@ -362,7 +396,8 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
>
> if (CurInstKind != IDFetch) {
> MoveUnits(Pending[IDFetch], Available[IDFetch]);
> - }
> + } else
> + FetchInstCount++;
> }
>
> static bool
> @@ -574,16 +609,15 @@ SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
> return UnslotedSU;
> }
>
> -bool R600SchedStrategy::isAvailablesAluEmpty() const {
> - return Pending[IDAlu].empty() && AvailableAlus[AluAny].empty() &&
> - AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&
> - AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&
> - AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty() &&
> - AvailableAlus[AluPredX].empty();
> +unsigned R600SchedStrategy::AvailablesAluCount() const {
> + return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
> + AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
> + AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
> + AvailableAlus[AluDiscarded].size() + AvailableAlus[AluPredX].size();
> }
>
> SUnit* R600SchedStrategy::pickAlu() {
> - while (!isAvailablesAluEmpty()) {
> + while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
> if (!OccupedSlotsMask) {
> // Bottom up scheduling : predX must comes first
> if (!AvailableAlus[AluPredX].empty()) {
> diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
> index ff13f25..5dbe266 100644
> --- a/lib/Target/R600/R600MachineScheduler.h
> +++ b/lib/Target/R600/R600MachineScheduler.h
> @@ -61,6 +61,9 @@ class R600SchedStrategy : public MachineSchedStrategy {
> int CurEmitted;
> InstKind NextInstKind;
>
> + unsigned AluInstCount;
> + unsigned FetchInstCount;
> +
> int InstKindLimit[IDLast];
>
> int OccupedSlotsMask;
> @@ -86,7 +89,7 @@ private:
> bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const;
> AluKind getAluKind(SUnit *SU) const;
> void LoadAlu();
> - bool isAvailablesAluEmpty() const;
> + unsigned AvailablesAluCount() const;
> SUnit *AttemptFillSlot (unsigned Slot);
> void PrepareNextSlot();
> SUnit *PopInst(std::vector<SUnit*> &Q);
> --
> 1.8.2.1
>