[llvm] r183593 - R600: Use a refined heuristic to choose when switching clause
Vincent Lejeune
vljn at ovi.com
Fri Jun 7 16:30:34 PDT 2013
Author: vljn
Date: Fri Jun 7 18:30:34 2013
New Revision: 183593
URL: http://llvm.org/viewvc/llvm-project?rev=183593&view=rev
Log:
R600: Use a refined heuristic to choose when switching clause
This is using a hint from AMD APP OpenCL Programming Guide with
empirically tweaked parameters.
I used Unigine Heaven 3.0 to determine best parameters on my system
(i7 2600/Radeon 6950/Kernel 3.9.4) the benchmark :
it went from 38.8 average fps to 39.6, which is ~3% gain.
(Lightmark 2008.2 gain is much more marginal: from 537 to 539)
There is no lit test provided as the parameter were determined
empirically and it it would be nearly impossiblet to find a test
program that check for optimal behavior.
Modified:
llvm/trunk/lib/Target/R600/R600MachineScheduler.cpp
llvm/trunk/lib/Target/R600/R600MachineScheduler.h
Modified: llvm/trunk/lib/Target/R600/R600MachineScheduler.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/R600MachineScheduler.cpp?rev=183593&r1=183592&r2=183593&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/R600MachineScheduler.cpp (original)
+++ llvm/trunk/lib/Target/R600/R600MachineScheduler.cpp Fri Jun 7 18:30:34 2013
@@ -38,7 +38,8 @@ void R600SchedStrategy::initialize(Sched
const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
-
+ AluInstCount = 0;
+ FetchInstCount = 0;
}
void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
@@ -48,6 +49,12 @@ void R600SchedStrategy::MoveUnits(std::v
QSrc.clear();
}
+static
+unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
+ assert (GPRCount && "GPRCount cannot be 0");
+ return 248 / GPRCount;
+}
+
SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
SUnit *SU = 0;
NextInstKind = IDOther;
@@ -60,6 +67,32 @@ SUnit* R600SchedStrategy::pickNode(bool
bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
(!Available[IDFetch].empty() || !Available[IDOther].empty());
+ if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
+ // We use the heuristic provided by AMD Accelerated Parallel Processing
+ // OpenCL Programming Guide :
+ // The approx. number of WF that allows TEX inst to hide ALU inst is :
+ // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
+ float ALUFetchRationEstimate =
+ (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
+ (FetchInstCount + Available[IDFetch].size());
+ unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
+ DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
+ // We assume the local GPR requirements to be "dominated" by the requirement
+ // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
+ // after TEX are indeed likely to consume or generate values from/for the
+ // TEX clause.
+ // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
+ // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
+ // one GPR) or TmXYZW = TnXYZW (need 2 GPR).
+ // (TODO : use RegisterPressure)
+ // If we are going too use too many GPR, we flush Fetch instruction to lower
+ // register pressure on 128 bits regs.
+ unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
+ if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
+ AllowSwitchFromAlu = true;
+ }
+
+
// We want to scheduled AR defs as soon as possible to make sure they aren't
// put in a different ALU clause from their uses.
if (!SU && !UnscheduledARDefs.empty()) {
@@ -133,6 +166,7 @@ void R600SchedStrategy::schedNode(SUnit
}
if (CurInstKind == IDAlu) {
+ AluInstCount ++;
switch (getAluKind(SU)) {
case AluT_XYZW:
CurEmitted += 4;
@@ -158,7 +192,8 @@ void R600SchedStrategy::schedNode(SUnit
if (CurInstKind != IDFetch) {
MoveUnits(Pending[IDFetch], Available[IDFetch]);
- }
+ } else
+ FetchInstCount++;
}
static bool
@@ -370,16 +405,15 @@ SUnit *R600SchedStrategy::AttemptFillSlo
return UnslotedSU;
}
-bool R600SchedStrategy::isAvailablesAluEmpty() const {
- return Pending[IDAlu].empty() && AvailableAlus[AluAny].empty() &&
- AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&
- AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&
- AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty() &&
- AvailableAlus[AluPredX].empty();
+unsigned R600SchedStrategy::AvailablesAluCount() const {
+ return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
+ AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
+ AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
+ AvailableAlus[AluDiscarded].size() + AvailableAlus[AluPredX].size();
}
SUnit* R600SchedStrategy::pickAlu() {
- while (!isAvailablesAluEmpty()) {
+ while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
if (!OccupedSlotsMask) {
// Bottom up scheduling : predX must comes first
if (!AvailableAlus[AluPredX].empty()) {
Modified: llvm/trunk/lib/Target/R600/R600MachineScheduler.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/R600MachineScheduler.h?rev=183593&r1=183592&r2=183593&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/R600MachineScheduler.h (original)
+++ llvm/trunk/lib/Target/R600/R600MachineScheduler.h Fri Jun 7 18:30:34 2013
@@ -60,6 +60,9 @@ class R600SchedStrategy : public Machine
int CurEmitted;
InstKind NextInstKind;
+ unsigned AluInstCount;
+ unsigned FetchInstCount;
+
int InstKindLimit[IDLast];
int OccupedSlotsMask;
@@ -85,7 +88,7 @@ private:
bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const;
AluKind getAluKind(SUnit *SU) const;
void LoadAlu();
- bool isAvailablesAluEmpty() const;
+ unsigned AvailablesAluCount() const;
SUnit *AttemptFillSlot (unsigned Slot);
void PrepareNextSlot();
SUnit *PopInst(std::vector<SUnit*> &Q);
More information about the llvm-commits
mailing list