[PATCH] R600: Use a refined heuristic to choose when switching clause
Vincent Lejeune
vljn at ovi.com
Fri Jun 7 08:38:58 PDT 2013
----- Mail original -----
> De : Tom Stellard <tom at stellard.net>
> À : Vincent Lejeune <vljn at ovi.com>
> Cc : llvm-commits at cs.uiuc.edu
> Envoyé le : Vendredi 7 juin 2013 4h52
> Objet : Re: [PATCH] R600: Use a refined heuristic to choose when switching clause
>
> On Fri, Jun 07, 2013 at 12:15:20AM +0200, Vincent Lejeune wrote:
>> ---
>> lib/Target/R600/R600MachineScheduler.cpp | 129
> +++++++++++++++++++++++++++++--
>> lib/Target/R600/R600MachineScheduler.h | 5 +-
>> 2 files changed, 125 insertions(+), 9 deletions(-)
>>
>
> I really don't think there is a good way to test this with lit, so you
> might want to add a comment to the commit message explaining why there is
> no test, and maybe also mention which app this improves performance for
> and by how much.
>
>> diff --git a/lib/Target/R600/R600MachineScheduler.cpp
> b/lib/Target/R600/R600MachineScheduler.cpp
>> index e4a78bf..af57022 100644
>> --- a/lib/Target/R600/R600MachineScheduler.cpp
>> +++ b/lib/Target/R600/R600MachineScheduler.cpp
>> @@ -38,6 +38,8 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
>>
>> const AMDGPUSubtarget &ST =
> DAG->TM.getSubtarget<AMDGPUSubtarget>();
>> InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
>> + AluInstCount = 0;
>> + FetchInstCount = 0;
>> }
>>
>> void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
>> @@ -47,6 +49,90 @@ void R600SchedStrategy::MoveUnits(std::vector<SUnit
> *> &QSrc,
>> QSrc.clear();
>> }
>>
>> +static
>> +unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
>
> There is a formula for this, and it is:
>
> WFCountLimitedByGPR = 248 / GPRCount
>
> I think the value 248 is the maximum number of threads per SIMD, which can be
> different depending on the GPU. It should be easier to add per GPU
> constants like this once my outstanding device info patches are merged,
> so for now I think we should use the formula and hard-code the value to
> 248 and then add a TODO comment for adding a device specific query later.
>
> -Tom
>
>> + switch (GPRCount) {
>> + case 0:
>> + case 1:
>> + return 248;
>> + case 2:
>> + return 124;
>> + case 3:
>> + return 82;
>> + case 4:
>> + return 62;
>> + case 5:
>> + return 49;
>> + case 6:
>> + return 41;
>> + case 7:
>> + return 35;
>> + case 8:
>> + return 31;
>> + case 9:
>> + return 27;
>> + case 10:
>> + return 24;
>> + case 11:
>> + return 22;
>> + case 12:
>> + return 20;
>> + case 13:
>> + return 19;
>> + case 14:
>> + return 17;
>> + case 15:
>> + return 16;
>> + case 16:
>> + return 15;
>> + case 17:
>> + return 14;
>> + case 18:
>> + case 19:
>> + return 13;
>> + case 20:
>> + return 12;
>> + case 21:
>> + case 22:
>> + return 11;
>> + case 23:
>> + case 24:
>> + return 10;
>> + case 25:
>> + case 26:
>> + case 27:
>> + return 9;
>> + case 28:
>> + case 29:
>> + case 30:
>> + case 31:
>> + return 8;
>> + case 32:
>> + case 33:
>> + case 34:
>> + case 35:
>> + return 7;
>> + case 36:
>> + case 37:
>> + case 38:
>> + case 39:
>> + case 40:
>> + case 41:
>> + return 6;
>> + case 42:
>> + case 43:
>> + case 44:
>> + case 45:
>> + case 46:
>> + case 47:
>> + case 48:
>> + case 49:
>> + return 5;
>> + default:
>> + return 2;
>> + }
>> +}
>> +
>> SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
>> SUnit *SU = 0;
>> NextInstKind = IDOther;
>> @@ -59,6 +145,32 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode)
> {
>> bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind])
> &&
>> (!Available[IDFetch].empty() || !Available[IDOther].empty());
>>
>> + if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
>> + // We use the heuristic provided by AMD Accelerated Parallel
> Processing
>> + // OpenCL Programming Guide :
>> + // The approx. number of WF that allows TEX inst to hide ALU inst is :
>> + // 500 (cycles for TEX) / (AluFetchRatio * 4 (cycles for ALU))
>> + float ALUFetchRationEstimate =
>> + (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
>> + (FetchInstCount + Available[IDFetch].size());
>> + unsigned NeededWF = 125.0f / ALUFetchRationEstimate;
>> + DEBUG( dbgs() << NeededWF << " approx. Wavefronts
> Required\n" );
>> + // We "predict" the number of required gpr to be :
>> + // std::sqrt(std::sqrt(AluInstCount / 4.0f)) :
>> + // GPRS so far (TODO : use RegisterPressure)
>> + // Available[IDFetch].size() * 0.5 : GPRs required in the Fetch clause
>> + // We assume that fetch instructions are either TnXYZW = TEX TnXYZW
> (using
>> + // already existing GPR) or TmXYZW = TnXYZW (requiring a new GPR).
>> + // If we are going too use too many GPR, we flush Fetch instruction to
> lower
>> + // register pressure on 128 bits regs.
>> + unsigned NearRegisterRequirement =
>> + std::sqrt(std::sqrt(AluInstCount / 4.0f)) +
>> + Available[IDFetch].size() * 0.5;
>> + if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
>> + AllowSwitchFromAlu = true;
>> + }
>> +
>> +
>> // We want to scheduled AR defs as soon as possible to make sure they
> aren't
>> // put in a different ALU clause from their uses.
>> if (!SU && !UnscheduledARDefs.empty()) {
>> @@ -132,6 +244,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool
> IsTopNode) {
>> }
>>
>> if (CurInstKind == IDAlu) {
>> + AluInstCount ++;
>> switch (getAluKind(SU)) {
>> case AluT_XYZW:
>> CurEmitted += 4;
>> @@ -157,7 +270,8 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool
> IsTopNode) {
>>
>> if (CurInstKind != IDFetch) {
>> MoveUnits(Pending[IDFetch], Available[IDFetch]);
>> - }
>> + } else
>> + FetchInstCount++;
>> }
>>
>> static bool
>> @@ -369,16 +483,15 @@ SUnit *R600SchedStrategy::AttemptFillSlot(unsigned
> Slot) {
>> return UnslotedSU;
>> }
>>
>> -bool R600SchedStrategy::isAvailablesAluEmpty() const {
>> - return Pending[IDAlu].empty() && AvailableAlus[AluAny].empty()
> &&
>> - AvailableAlus[AluT_XYZW].empty() &&
> AvailableAlus[AluT_X].empty() &&
>> - AvailableAlus[AluT_Y].empty() &&
> AvailableAlus[AluT_Z].empty() &&
>> - AvailableAlus[AluT_W].empty() &&
> AvailableAlus[AluDiscarded].empty() &&
>> - AvailableAlus[AluPredX].empty();
>> +unsigned R600SchedStrategy::AvailablesAluCount() const {
>> + return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
>> + AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
>> + AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
>> + AvailableAlus[AluDiscarded].size() + AvailableAlus[AluPredX].size();
>> }
>>
>> SUnit* R600SchedStrategy::pickAlu() {
>> - while (!isAvailablesAluEmpty()) {
>> + while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
>> if (!OccupedSlotsMask) {
>> // Bottom up scheduling : predX must comes first
>> if (!AvailableAlus[AluPredX].empty()) {
>> diff --git a/lib/Target/R600/R600MachineScheduler.h
> b/lib/Target/R600/R600MachineScheduler.h
>> index c5024d2..aae8b3f 100644
>> --- a/lib/Target/R600/R600MachineScheduler.h
>> +++ b/lib/Target/R600/R600MachineScheduler.h
>> @@ -60,6 +60,9 @@ class R600SchedStrategy : public MachineSchedStrategy {
>> int CurEmitted;
>> InstKind NextInstKind;
>>
>> + unsigned AluInstCount;
>> + unsigned FetchInstCount;
>> +
>> int InstKindLimit[IDLast];
>>
>> int OccupedSlotsMask;
>> @@ -85,7 +88,7 @@ private:
>> bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC)
> const;
>> AluKind getAluKind(SUnit *SU) const;
>> void LoadAlu();
>> - bool isAvailablesAluEmpty() const;
>> + unsigned AvailablesAluCount() const;
>> SUnit *AttemptFillSlot (unsigned Slot);
>> void PrepareNextSlot();
>> SUnit *PopInst(std::vector<SUnit*> &Q);
>> --
>> 1.8.2.1
>>
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-R600-Use-a-refined-heuristic-to-choose-when-switchin.patch
Type: application/octet-stream
Size: 5788 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20130607/0b9f1f8c/attachment.obj>
More information about the llvm-commits
mailing list