[llvm] r298368 - [AMDGPU] Iterative scheduling infrastructure + minimal register scheduler
Valery Pykhtin via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 21 06:15:46 PDT 2017
Author: vpykhtin
Date: Tue Mar 21 08:15:46 2017
New Revision: 298368
URL: http://llvm.org/viewvc/llvm-project?rev=298368&view=rev
Log:
[AMDGPU] Iterative scheduling infrastructure + minimal register scheduler
Differential revision: https://reviews.llvm.org/D31046
Added:
llvm/trunk/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
llvm/trunk/lib/Target/AMDGPU/GCNIterativeScheduler.h
llvm/trunk/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.cpp
llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.h
llvm/trunk/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.cpp
llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.h
llvm/trunk/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
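The new schedulers are registered with the machine scheduler registry and can
be selected via -misched, as exercised by the updated tests below; for
example (the input file name here is just a placeholder):

  llc -march=amdgcn -misched=gcn-minreg -verify-machineinstrs < input.ll
  llc -march=amdgcn -misched=gcn-max-occupancy-experimental -verify-machineinstrs < input.ll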
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h?rev=298368&r1=298367&r2=298368&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h Tue Mar 21 08:15:46 2017
@@ -22,6 +22,7 @@
#include "SIInstrInfo.h"
#include "SIISelLowering.h"
#include "SIFrameLowering.h"
+#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
@@ -317,6 +318,11 @@ public:
/// the given LDS memory size is the only constraint.
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
+ unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
+ const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ return getOccupancyWithLocalMemSize(MFI->getLDSSize(), *MF.getFunction());
+ }
+
bool hasFP16Denormals() const {
return FP64FP16Denormals;
}
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp?rev=298368&r1=298367&r2=298368&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp Tue Mar 21 08:15:46 2017
@@ -24,6 +24,7 @@
#endif
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
+#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
#include "SIMachineScheduler.h"
@@ -155,6 +156,20 @@ createGCNMaxOccupancyMachineScheduler(Ma
return DAG;
}
+static ScheduleDAGInstrs *
+createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
+ auto DAG = new GCNIterativeScheduler(C,
+ GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
+ DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+ return DAG;
+}
+
+static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
+ return new GCNIterativeScheduler(C,
+ GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
+}
+
static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
createR600MachineScheduler);
@@ -168,6 +183,16 @@ GCNMaxOccupancySchedRegistry("gcn-max-oc
"Run GCN scheduler to maximize occupancy",
createGCNMaxOccupancyMachineScheduler);
+static MachineSchedRegistry
+IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
+ "Run GCN scheduler to maximize occupancy (experimental)",
+ createIterativeGCNMaxOccupancyMachineScheduler);
+
+static MachineSchedRegistry
+GCNMinRegSchedRegistry("gcn-minreg",
+ "Run GCN iterative scheduler for minimal register usage (experimental)",
+ createMinRegScheduler);
+
static StringRef computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {
// 32-bit pointers.
Modified: llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt?rev=298368&r1=298367&r2=298368&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt (original)
+++ llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt Tue Mar 21 08:15:46 2017
@@ -94,6 +94,9 @@ add_llvm_target(AMDGPUCodeGen
SIShrinkInstructions.cpp
SITypeRewriter.cpp
SIWholeQuadMode.cpp
+ GCNIterativeScheduler.cpp
+ GCNMinRegStrategy.cpp
+ GCNRegPressure.cpp
${GLOBAL_ISEL_BUILD_FILES}
)
Added: llvm/trunk/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/GCNIterativeScheduler.cpp?rev=298368&view=auto
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNIterativeScheduler.cpp (added)
+++ llvm/trunk/lib/Target/AMDGPU/GCNIterativeScheduler.cpp Tue Mar 21 08:15:46 2017
@@ -0,0 +1,528 @@
+//===--------------------- GCNIterativeScheduler.cpp - --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
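+/// This file implements GCNIterativeScheduler: it records scheduling regions
+/// as the generic scheduler visits them and then reschedules them in
+/// finalizeSchedule() with the selected strategy (minimal register usage or
+/// legacy maximum occupancy).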
+//
+//===----------------------------------------------------------------------===//
+
+#include "GCNIterativeScheduler.h"
+#include "GCNSchedStrategy.h"
+#include "SIMachineFunctionInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+namespace llvm {
+ std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots,
+ const ScheduleDAG &DAG);
+}
+
+// shim accessors for different order containers
+static inline MachineInstr *getMachineInstr(MachineInstr *MI) {
+ return MI;
+}
+static inline MachineInstr *getMachineInstr(const SUnit *SU) {
+ return SU->getInstr();
+}
+static inline MachineInstr *getMachineInstr(const SUnit &SU) {
+ return SU.getInstr();
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+static void printRegion(raw_ostream &OS,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ const LiveIntervals *LIS,
+ unsigned MaxInstNum =
+ std::numeric_limits<unsigned>::max()) {
+ auto BB = Begin->getParent();
+ OS << BB->getParent()->getName() << ":BB#" << BB->getNumber()
+ << ' ' << BB->getName() << ":\n";
+ auto I = Begin;
+ MaxInstNum = std::max(MaxInstNum, 1u);
+ for (; I != End && MaxInstNum; ++I, --MaxInstNum) {
+ if (!I->isDebugValue() && LIS)
+ OS << LIS->getInstructionIndex(*I);
+ OS << '\t' << *I;
+ }
+ if (I != End) {
+ OS << "\t...\n";
+ I = std::prev(End);
+ if (!I->isDebugValue() && LIS)
+ OS << LIS->getInstructionIndex(*I);
+ OS << '\t' << *I;
+ }
+ if (End != BB->end()) { // print boundary inst if present
+ OS << "----\n";
+ if (LIS) OS << LIS->getInstructionIndex(*End) << '\t';
+ OS << *End;
+ }
+}
+
+LLVM_DUMP_METHOD
+static void printLivenessInfo(raw_ostream &OS,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ const LiveIntervals *LIS) {
+ const auto BB = Begin->getParent();
+ const auto &MRI = BB->getParent()->getRegInfo();
+
+ const auto LiveIns = getLiveRegsBefore(*Begin, *LIS);
+ OS << "LIn RP: ";
+ getRegPressure(MRI, LiveIns).print(OS);
+
+ const auto BottomMI = End == BB->end() ? std::prev(End) : End;
+ const auto LiveOuts = getLiveRegsAfter(*BottomMI, *LIS);
+ OS << "LOt RP: ";
+ getRegPressure(MRI, LiveOuts).print(OS);
+}
+
+LLVM_DUMP_METHOD
+void GCNIterativeScheduler::printRegions(raw_ostream &OS) const {
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+ for (const auto R : Regions) {
+ OS << "Region to schedule ";
+ printRegion(OS, R->Begin, R->End, LIS, 1);
+ printLivenessInfo(OS, R->Begin, R->End, LIS);
+ OS << "Max RP: ";
+ R->MaxPressure.print(OS, &ST);
+ }
+}
+
+LLVM_DUMP_METHOD
+void GCNIterativeScheduler::printSchedResult(raw_ostream &OS,
+ const Region *R,
+ const GCNRegPressure &RP) const {
+ OS << "\nAfter scheduling ";
+ printRegion(OS, R->Begin, R->End, LIS);
+ printSchedRP(OS, R->MaxPressure, RP);
+ OS << '\n';
+}
+
+LLVM_DUMP_METHOD
+void GCNIterativeScheduler::printSchedRP(raw_ostream &OS,
+ const GCNRegPressure &Before,
+ const GCNRegPressure &After) const {
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+ OS << "RP before: ";
+ Before.print(OS, &ST);
+ OS << "RP after: ";
+ After.print(OS, &ST);
+}
+
+#endif
+
+// DAG builder helper
+class GCNIterativeScheduler::BuildDAG {
+ GCNIterativeScheduler &Sch;
+ SmallVector<SUnit*, 8> TopRoots;
+public:
+ BuildDAG(const Region &R, GCNIterativeScheduler &_Sch)
+ : Sch(_Sch) {
+ auto BB = R.Begin->getParent();
+ Sch.BaseClass::startBlock(BB);
+ Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs);
+
+ Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr,
+ /*TrackLaneMask*/true);
+ Sch.Topo.InitDAGTopologicalSorting();
+
+ SmallVector<SUnit*, 8> BotRoots;
+ Sch.findRootsAndBiasEdges(TopRoots, BotRoots);
+ }
+ ~BuildDAG() {
+ Sch.BaseClass::exitRegion();
+ Sch.BaseClass::finishBlock();
+ }
+ ArrayRef<const SUnit*> getTopRoots() const {
+ return TopRoots;
+ }
+};
+
+class GCNIterativeScheduler::OverrideLegacyStrategy {
+ GCNIterativeScheduler &Sch;
+ Region &Rgn;
+ std::unique_ptr<MachineSchedStrategy> SaveSchedImpl;
+ GCNRegPressure SaveMaxRP;
+public:
+ OverrideLegacyStrategy(Region &R,
+ MachineSchedStrategy &OverrideStrategy,
+ GCNIterativeScheduler &_Sch)
+ : Sch(_Sch)
+ , Rgn(R)
+ , SaveSchedImpl(std::move(_Sch.SchedImpl))
+ , SaveMaxRP(R.MaxPressure) {
+ Sch.SchedImpl.reset(&OverrideStrategy);
+ auto BB = R.Begin->getParent();
+ Sch.BaseClass::startBlock(BB);
+ Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs);
+ }
+ ~OverrideLegacyStrategy() {
+ Sch.BaseClass::exitRegion();
+ Sch.BaseClass::finishBlock();
+ Sch.SchedImpl.release();
+ Sch.SchedImpl = std::move(SaveSchedImpl);
+ }
+ void schedule() {
+ assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End);
+ DEBUG(dbgs() << "\nScheduling ";
+ printRegion(dbgs(), Rgn.Begin, Rgn.End, Sch.LIS, 2));
+ Sch.BaseClass::schedule();
+
+    // Unfortunately placeDebugValues incorrectly modifies RegionEnd, restore
+ Sch.RegionEnd = Rgn.End;
+ //assert(Rgn.End == Sch.RegionEnd);
+ Rgn.Begin = Sch.RegionBegin;
+ Rgn.MaxPressure.clear();
+ }
+ void restoreOrder() {
+ assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End);
+ // DAG SUnits are stored using original region's order
+ // so just use SUnits as the restoring schedule
+ Sch.scheduleRegion(Rgn, Sch.SUnits, SaveMaxRP);
+ }
+};
+
+// just a stub to make base class happy
+class SchedStrategyStub : public MachineSchedStrategy {
+public:
+ bool shouldTrackPressure() const override { return false; }
+ bool shouldTrackLaneMasks() const override { return false; }
+ void initialize(ScheduleDAGMI *DAG) override {}
+ SUnit *pickNode(bool &IsTopNode) override { return nullptr; }
+ void schedNode(SUnit *SU, bool IsTopNode) override {}
+ void releaseTopNode(SUnit *SU) override {}
+ void releaseBottomNode(SUnit *SU) override {}
+};
+
+GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C,
+ StrategyKind S)
+ : BaseClass(C, llvm::make_unique<SchedStrategyStub>())
+ , Context(C)
+ , Strategy(S)
+ , UPTracker(*LIS) {
+}
+
+// returns max pressure for a region
+GCNRegPressure
+GCNIterativeScheduler::getRegionPressure(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End)
+ const {
+  // For the purpose of pressure tracking, the bottom inst of the region
+  // should also be processed. End is either the BB end, the BB terminator
+  // inst or a sched boundary inst.
+ auto const BBEnd = Begin->getParent()->end();
+ auto const BottomMI = End == BBEnd ? std::prev(End) : End;
+
+  // scheduleRegions walks bottom to top, so it's likely we just get the next
+  // instruction to track.
+ auto AfterBottomMI = std::next(BottomMI);
+ if (AfterBottomMI == BBEnd ||
+ &*AfterBottomMI != UPTracker.getLastTrackedMI()) {
+ UPTracker.reset(*BottomMI);
+ } else {
+ assert(UPTracker.isValid());
+ }
+
+ for (auto I = BottomMI; I != Begin; --I)
+ UPTracker.recede(*I);
+
+ UPTracker.recede(*Begin);
+
+ assert(UPTracker.isValid() ||
+ (dbgs() << "Tracked region ",
+ printRegion(dbgs(), Begin, End, LIS), false));
+ return UPTracker.moveMaxPressure();
+}
+
+// returns max pressure for a tentative schedule
+template <typename Range> GCNRegPressure
+GCNIterativeScheduler::getSchedulePressure(const Region &R,
+ Range &&Schedule) const {
+ auto const BBEnd = R.Begin->getParent()->end();
+ GCNUpwardRPTracker RPTracker(*LIS);
+ if (R.End != BBEnd) {
+ // R.End points to the boundary instruction but the
+ // schedule doesn't include it
+ RPTracker.reset(*R.End);
+ RPTracker.recede(*R.End);
+ } else {
+ // R.End doesn't point to the boundary instruction
+ RPTracker.reset(*std::prev(BBEnd));
+ }
+ for (auto I = Schedule.end(), B = Schedule.begin(); I != B;) {
+ RPTracker.recede(*getMachineInstr(*--I));
+ }
+ return RPTracker.moveMaxPressure();
+}
+
+void GCNIterativeScheduler::enterRegion(MachineBasicBlock *BB, // overridden
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) {
+ BaseClass::enterRegion(BB, Begin, End, NumRegionInstrs);
+ if (NumRegionInstrs > 2) {
+ Regions.push_back(
+ new (Alloc.Allocate())
+ Region { Begin, End, NumRegionInstrs,
+ getRegionPressure(Begin, End), nullptr });
+ }
+}
+
+void GCNIterativeScheduler::schedule() { // overridden
+ // do nothing
+ DEBUG(
+ printLivenessInfo(dbgs(), RegionBegin, RegionEnd, LIS);
+ if (!Regions.empty() && Regions.back()->Begin == RegionBegin) {
+ dbgs() << "Max RP: ";
+ Regions.back()->MaxPressure.print(dbgs(), &MF.getSubtarget<SISubtarget>());
+ }
+ dbgs() << '\n';
+ );
+}
+
+void GCNIterativeScheduler::finalizeSchedule() { // overridden
+ if (Regions.empty())
+ return;
+ switch (Strategy) {
+ case SCHEDULE_MINREGONLY: scheduleMinReg(); break;
+ case SCHEDULE_MINREGFORCED: scheduleMinReg(true); break;
+ case SCHEDULE_LEGACYMAXOCCUPANCY: scheduleLegacyMaxOccupancy(); break;
+ }
+}
+
+// Detach schedule from SUnits and interleave it with debug values.
+// Returned schedule becomes independent of DAG state.
+std::vector<MachineInstr*>
+GCNIterativeScheduler::detachSchedule(ScheduleRef Schedule) const {
+ std::vector<MachineInstr*> Res;
+ Res.reserve(Schedule.size() * 2);
+
+ if (FirstDbgValue)
+ Res.push_back(FirstDbgValue);
+
+ const auto DbgB = DbgValues.begin(), DbgE = DbgValues.end();
+ for (auto SU : Schedule) {
+ Res.push_back(SU->getInstr());
+ const auto &D = std::find_if(DbgB, DbgE, [SU](decltype(*DbgB) &P) {
+ return P.second == SU->getInstr();
+ });
+ if (D != DbgE)
+ Res.push_back(D->first);
+ }
+ return Res;
+}
+
+void GCNIterativeScheduler::setBestSchedule(Region &R,
+ ScheduleRef Schedule,
+ const GCNRegPressure &MaxRP) {
+ R.BestSchedule.reset(
+ new TentativeSchedule{ detachSchedule(Schedule), MaxRP });
+}
+
+void GCNIterativeScheduler::scheduleBest(Region &R) {
+ assert(R.BestSchedule.get() && "No schedule specified");
+ scheduleRegion(R, R.BestSchedule->Schedule, R.BestSchedule->MaxPressure);
+ R.BestSchedule.reset();
+}
+
+// Minimal required region scheduler; works for ranges of SUnit*,
+// SUnit or MachineInstr*.
+template <typename Range>
+void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
+ const GCNRegPressure &MaxRP) {
+ assert(RegionBegin == R.Begin && RegionEnd == R.End);
+ assert(LIS != nullptr);
+#ifndef NDEBUG
+ const auto SchedMaxRP = getSchedulePressure(R, Schedule);
+#endif
+ auto BB = R.Begin->getParent();
+ auto Top = R.Begin;
+ for (const auto &I : Schedule) {
+ auto MI = getMachineInstr(I);
+ if (MI != &*Top) {
+ BB->remove(MI);
+ BB->insert(Top, MI);
+ if (!MI->isDebugValue())
+ LIS->handleMove(*MI, true);
+ }
+ if (!MI->isDebugValue()) {
+      // Reset read-undef flags and update them later.
+ for (auto &Op : MI->operands())
+ if (Op.isReg() && Op.isDef())
+ Op.setIsUndef(false);
+
+ RegisterOperands RegOpers;
+ RegOpers.collect(*MI, *TRI, MRI, /*ShouldTrackLaneMasks*/true,
+ /*IgnoreDead*/false);
+ // Adjust liveness and add missing dead+read-undef flags.
+ auto SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
+ RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
+ }
+ Top = std::next(MI->getIterator());
+ }
+ RegionBegin = getMachineInstr(Schedule.front());
+
+  // A schedule consisting of MachineInstr* is considered 'detached' and
+  // already interleaved with debug values. Note: *begin() yields a reference,
+  // which must be stripped before comparing against MachineInstr*.
+  if (!std::is_same<typename std::remove_reference<decltype(
+        *Schedule.begin())>::type, MachineInstr*>::value) {
+ placeDebugValues();
+    // Unfortunately placeDebugValues incorrectly modifies RegionEnd, restore
+ //assert(R.End == RegionEnd);
+ RegionEnd = R.End;
+ }
+
+ R.Begin = RegionBegin;
+ R.MaxPressure = MaxRP;
+
+#ifndef NDEBUG
+ const auto RegionMaxRP = getRegionPressure(R);
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+#endif
+ assert((SchedMaxRP == RegionMaxRP && (MaxRP.empty() || SchedMaxRP == MaxRP))
+ || (dbgs() << "Max RP mismatch!!!\n"
+ "RP for schedule (calculated): ",
+ SchedMaxRP.print(dbgs(), &ST),
+ dbgs() << "RP for schedule (reported): ",
+ MaxRP.print(dbgs(), &ST),
+ dbgs() << "RP after scheduling: ",
+ RegionMaxRP.print(dbgs(), &ST),
+ false));
+}
+
+// Sort recorded regions by pressure - highest at the front
+void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+ std::sort(Regions.begin(), Regions.end(),
+ [&ST, TargetOcc](const Region *R1, const Region *R2) {
+ return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc);
+ });
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Legacy MaxOccupancy Strategy
+
+// Tries to increase occupancy by applying the minreg scheduler to a sequence
+// of the most demanding regions. Obtained schedules are saved as the
+// BestSchedule of a region.
+// TargetOcc is the best achievable occupancy for a kernel.
+// Returns the better occupancy on success or the current occupancy on failure.
+// BestSchedules aren't deleted on failure.
+unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
+ // TODO: assert Regions are sorted descending by pressure
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+ const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
+  DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc
+ << ", current = " << Occ << '\n');
+
+ auto NewOcc = TargetOcc;
+ for (auto R : Regions) {
+ if (R->MaxPressure.getOccupancy(ST) >= NewOcc)
+ break;
+
+ DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
+ printLivenessInfo(dbgs(), R->Begin, R->End, LIS));
+
+ BuildDAG DAG(*R, *this);
+ const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
+ const auto MaxRP = getSchedulePressure(*R, MinSchedule);
+ DEBUG(dbgs() << "Occupancy improvement attempt:\n";
+ printSchedRP(dbgs(), R->MaxPressure, MaxRP));
+
+ NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST));
+ if (NewOcc <= Occ)
+ break;
+
+ setBestSchedule(*R, MinSchedule, MaxRP);
+ }
+ DEBUG(dbgs() << "New occupancy = " << NewOcc
+ << ", prev occupancy = " << Occ << '\n');
+ return std::max(NewOcc, Occ);
+}
+
+void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
+ bool TryMaximizeOccupancy) {
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+ auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF);
+
+ sortRegionsByPressure(TgtOcc);
+ auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
+
+ if (TryMaximizeOccupancy && Occ < TgtOcc)
+ Occ = tryMaximizeOccupancy(TgtOcc);
+
+  // This is really weird, but for some magic reason scheduling regions twice
+  // gives a performance improvement.
+ const int NumPasses = Occ < TgtOcc ? 2 : 1;
+
+ TgtOcc = std::min(Occ, TgtOcc);
+ DEBUG(dbgs() << "Scheduling using default scheduler, "
+ "target occupancy = " << TgtOcc << '\n');
+ GCNMaxOccupancySchedStrategy LStrgy(Context);
+
+ for (int I = 0; I < NumPasses; ++I) {
+    // Running the first pass with TargetOccupancy = 0 mimics the previous
+    // scheduling approach and is performance magic.
+ LStrgy.setTargetOccupancy(I == 0 ? 0 : TgtOcc);
+ for (auto R : Regions) {
+ OverrideLegacyStrategy Ovr(*R, LStrgy, *this);
+
+ Ovr.schedule();
+ const auto RP = getRegionPressure(*R);
+ DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
+
+ if (RP.getOccupancy(ST) < TgtOcc) {
+ DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
+ if (R->BestSchedule.get() &&
+ R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
+          DEBUG(dbgs() << ", scheduling for minimal register usage\n");
+ scheduleBest(*R);
+ } else {
+ DEBUG(dbgs() << ", restoring\n");
+ Ovr.restoreOrder();
+ assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc);
+ }
+ }
+ }
+ }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Minimal Register Strategy
+
+void GCNIterativeScheduler::scheduleMinReg(bool force) {
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+ const auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF);
+ sortRegionsByPressure(TgtOcc);
+
+ auto MaxPressure = Regions.front()->MaxPressure;
+ for (auto R : Regions) {
+ if (!force && R->MaxPressure.less(ST, MaxPressure, TgtOcc))
+ break;
+
+ BuildDAG DAG(*R, *this);
+ const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
+
+ const auto RP = getSchedulePressure(*R, MinSchedule);
+ DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) {
+ dbgs() << "\nWarning: Pressure becomes worse after minreg!";
+ printSchedRP(dbgs(), R->MaxPressure, RP);
+ });
+
+ if (!force && MaxPressure.less(ST, RP, TgtOcc))
+ break;
+
+ scheduleRegion(*R, MinSchedule, RP);
+ DEBUG(printSchedResult(dbgs(), R, RP));
+
+ MaxPressure = RP;
+ }
+}
Added: llvm/trunk/lib/Target/AMDGPU/GCNIterativeScheduler.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/GCNIterativeScheduler.h?rev=298368&view=auto
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNIterativeScheduler.h (added)
+++ llvm/trunk/lib/Target/AMDGPU/GCNIterativeScheduler.h Tue Mar 21 08:15:46 2017
@@ -0,0 +1,118 @@
+//===--------- GCNIterativeScheduler.h - GCN Scheduler -*- C++ -*----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
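+/// This file declares GCNIterativeScheduler, a ScheduleDAGMILive subclass
+/// that records the scheduling regions it visits and defers their actual
+/// scheduling to finalizeSchedule(), where one of the StrategyKind
+/// strategies is applied.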
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
+#define LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
+
+#include "GCNRegPressure.h"
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+class GCNIterativeScheduler : public ScheduleDAGMILive {
+ typedef ScheduleDAGMILive BaseClass;
+public:
+ enum StrategyKind {
+ SCHEDULE_MINREGONLY,
+ SCHEDULE_MINREGFORCED,
+ SCHEDULE_LEGACYMAXOCCUPANCY
+ };
+
+ GCNIterativeScheduler(MachineSchedContext *C,
+ StrategyKind S);
+
+ void schedule() override;
+
+ void enterRegion(MachineBasicBlock *BB,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned RegionInstrs) override;
+
+ void finalizeSchedule() override;
+
+protected:
+
+ typedef ArrayRef<const SUnit*> ScheduleRef;
+
+ struct TentativeSchedule {
+ std::vector<MachineInstr*> Schedule;
+ GCNRegPressure MaxPressure;
+ };
+
+ struct Region {
+    // Fields except for BestSchedule are supposed to reflect the current IR
+    // state. `const` fields emphasize that they shouldn't change for any
+    // schedule.
+ MachineBasicBlock::iterator Begin;
+ // End is either a boundary instruction or end of basic block
+ const MachineBasicBlock::iterator End;
+ const unsigned NumRegionInstrs;
+ GCNRegPressure MaxPressure;
+
+ // best schedule for the region so far (not scheduled yet)
+ std::unique_ptr<TentativeSchedule> BestSchedule;
+ };
+
+ SpecificBumpPtrAllocator<Region> Alloc;
+ std::vector<Region*> Regions;
+
+ MachineSchedContext *Context;
+ const StrategyKind Strategy;
+ mutable GCNUpwardRPTracker UPTracker;
+
+ class BuildDAG;
+ class OverrideLegacyStrategy;
+
+ template <typename Range>
+ GCNRegPressure getSchedulePressure(const Region &R,
+ Range &&Schedule) const;
+
+ GCNRegPressure getRegionPressure(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End) const;
+
+ GCNRegPressure getRegionPressure(const Region &R) const {
+ return getRegionPressure(R.Begin, R.End);
+ }
+
+ void setBestSchedule(Region &R,
+ ScheduleRef Schedule,
+ const GCNRegPressure &MaxRP = GCNRegPressure());
+
+ void scheduleBest(Region &R);
+
+ std::vector<MachineInstr*> detachSchedule(ScheduleRef Schedule) const;
+
+ void sortRegionsByPressure(unsigned TargetOcc);
+
+ template <typename Range>
+ void scheduleRegion(Region &R, Range &&Schedule,
+ const GCNRegPressure &MaxRP = GCNRegPressure());
+
+ unsigned tryMaximizeOccupancy(unsigned TargetOcc =
+ std::numeric_limits<unsigned>::max());
+
+ void scheduleLegacyMaxOccupancy(bool TryMaximizeOccupancy = true);
+ void scheduleMinReg(bool force = false);
+
+ void printRegions(raw_ostream &OS) const;
+ void printSchedResult(raw_ostream &OS,
+ const Region *R,
+ const GCNRegPressure &RP) const;
+ void printSchedRP(raw_ostream &OS,
+ const GCNRegPressure &Before,
+ const GCNRegPressure &After) const;
+};
+
+} // End namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
Added: llvm/trunk/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/GCNMinRegStrategy.cpp?rev=298368&view=auto
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNMinRegStrategy.cpp (added)
+++ llvm/trunk/lib/Target/AMDGPU/GCNMinRegStrategy.cpp Tue Mar 21 08:15:46 2017
@@ -0,0 +1,266 @@
+//===----------------------- GCNMinRegStrategy.cpp - ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
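+/// This file implements GCNMinRegScheduler, a top-down list scheduler that
+/// prefers candidates leaving the fewest non-ready successors behind, aiming
+/// at minimal register usage. It is exposed via makeMinRegSchedule().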
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ScheduleDAG.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+class GCNMinRegScheduler {
+ struct Candidate : ilist_node<Candidate> {
+ const SUnit *SU;
+ int Priority;
+
+ Candidate(const SUnit *SU_, int Priority_ = 0)
+ : SU(SU_), Priority(Priority_) {}
+ };
+
+ SpecificBumpPtrAllocator<Candidate> Alloc;
+ typedef simple_ilist<Candidate> Queue;
+ Queue RQ; // Ready queue
+
+ std::vector<unsigned> NumPreds;
+
+ bool isScheduled(const SUnit *SU) const {
+ assert(!SU->isBoundaryNode());
+ return NumPreds[SU->NodeNum] == std::numeric_limits<unsigned>::max();
+ }
+
+ void setIsScheduled(const SUnit *SU) {
+ assert(!SU->isBoundaryNode());
+ NumPreds[SU->NodeNum] = std::numeric_limits<unsigned>::max();
+ }
+
+ unsigned getNumPreds(const SUnit *SU) const {
+ assert(!SU->isBoundaryNode());
+ assert(NumPreds[SU->NodeNum] != std::numeric_limits<unsigned>::max());
+ return NumPreds[SU->NodeNum];
+ }
+
+ unsigned decNumPreds(const SUnit *SU) {
+ assert(!SU->isBoundaryNode());
+ assert(NumPreds[SU->NodeNum] != std::numeric_limits<unsigned>::max());
+ return --NumPreds[SU->NodeNum];
+ }
+
+ void initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits);
+
+ int getReadySuccessors(const SUnit *SU) const;
+ int getNotReadySuccessors(const SUnit *SU) const;
+
+ template <typename Calc>
+ unsigned findMax(unsigned Num, Calc C);
+
+ Candidate* pickCandidate();
+
+ void bumpPredsPriority(const SUnit *SchedSU, int Priority);
+ void releaseSuccessors(const SUnit* SU, int Priority);
+
+public:
+ std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> TopRoots,
+ const ScheduleDAG &DAG);
+};
+
+void GCNMinRegScheduler::initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits) {
+ NumPreds.resize(SUnits.size());
+ for (unsigned I = 0; I < SUnits.size(); ++I)
+ NumPreds[I] = SUnits[I].NumPredsLeft;
+}
+
+int GCNMinRegScheduler::getReadySuccessors(const SUnit *SU) const {
+ unsigned NumSchedSuccs = 0;
+ for (auto SDep : SU->Succs) {
+ bool wouldBeScheduled = true;
+ for (auto PDep : SDep.getSUnit()->Preds) {
+ auto PSU = PDep.getSUnit();
+ assert(!PSU->isBoundaryNode());
+ if (PSU != SU && !isScheduled(PSU)) {
+ wouldBeScheduled = false;
+ break;
+ }
+ }
+ NumSchedSuccs += wouldBeScheduled ? 1 : 0;
+ }
+ return NumSchedSuccs;
+}
+
+int GCNMinRegScheduler::getNotReadySuccessors(const SUnit *SU) const {
+ return SU->Succs.size() - getReadySuccessors(SU);
+}
+
+template <typename Calc>
+unsigned GCNMinRegScheduler::findMax(unsigned Num, Calc C) {
+ assert(!RQ.empty() && Num <= RQ.size());
+ typedef decltype(C(*RQ.begin())) T;
+ T Max = std::numeric_limits<T>::min();
+ unsigned NumMax = 0;
+ for (auto I = RQ.begin(); Num; --Num) {
+ T Cur = C(*I);
+ if (Cur >= Max) {
+ if (Cur > Max) {
+ Max = Cur;
+ NumMax = 1;
+ } else
+ ++NumMax;
+ auto &Cand = *I++;
+ RQ.remove(Cand);
+ RQ.push_front(Cand);
+ continue;
+ }
+ ++I;
+ }
+ return NumMax;
+}
+
+GCNMinRegScheduler::Candidate* GCNMinRegScheduler::pickCandidate() {
+ do {
+ unsigned Num = RQ.size();
+ if (Num == 1) break;
+
+ DEBUG(dbgs() << "\nSelecting max priority candidates among " << Num << '\n');
+ Num = findMax(Num, [=](const Candidate &C) { return C.Priority; });
+ if (Num == 1) break;
+
+ DEBUG(dbgs() << "\nSelecting min non-ready producing candidate among "
+ << Num << '\n');
+ Num = findMax(Num, [=](const Candidate &C) {
+ auto SU = C.SU;
+ int Res = getNotReadySuccessors(SU);
+      DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would leave "
+                   << Res << " non-ready successors, metric = " << -Res << '\n');
+ return -Res;
+ });
+ if (Num == 1) break;
+
+ DEBUG(dbgs() << "\nSelecting most producing candidate among "
+ << Num << '\n');
+ Num = findMax(Num, [=](const Candidate &C) {
+ auto SU = C.SU;
+ auto Res = getReadySuccessors(SU);
+      DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would make "
+                   << Res << " successors ready, metric = " << Res << '\n');
+ return Res;
+ });
+ if (Num == 1) break;
+
+ Num = Num ? Num : RQ.size();
+ DEBUG(dbgs() << "\nCan't find best candidate, selecting in program order among "
+ << Num << '\n');
+ Num = findMax(Num, [=](const Candidate &C) { return -(int64_t)C.SU->NodeNum; });
+ assert(Num == 1);
+ } while (false);
+
+ return &RQ.front();
+}
+
+void GCNMinRegScheduler::bumpPredsPriority(const SUnit *SchedSU, int Priority) {
+ SmallPtrSet<const SUnit*, 32> Set;
+ for (const auto &S : SchedSU->Succs) {
+ if (S.getSUnit()->isBoundaryNode() || isScheduled(S.getSUnit()) ||
+ S.getKind() != SDep::Data)
+ continue;
+ for (const auto &P : S.getSUnit()->Preds) {
+ auto PSU = P.getSUnit();
+ assert(!PSU->isBoundaryNode());
+ if (PSU != SchedSU && !isScheduled(PSU)) {
+ Set.insert(PSU);
+ }
+ }
+ }
+ SmallVector<const SUnit*, 32> Worklist(Set.begin(), Set.end());
+ while (!Worklist.empty()) {
+ auto SU = Worklist.pop_back_val();
+ assert(!SU->isBoundaryNode());
+ for (const auto &P : SU->Preds) {
+ if (!P.getSUnit()->isBoundaryNode() && !isScheduled(P.getSUnit()) &&
+ Set.insert(P.getSUnit()).second)
+ Worklist.push_back(P.getSUnit());
+ }
+ }
+  DEBUG(dbgs() << "Bump the priority of the predecessors of SU("
+               << SchedSU->NodeNum << ")'s non-ready successors to "
+               << Priority << " in the ready queue: ");
+ const auto SetEnd = Set.end();
+ for (auto &C : RQ) {
+ if (Set.find(C.SU) != SetEnd) {
+ C.Priority = Priority;
+ DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')');
+ }
+ }
+ DEBUG(dbgs() << '\n');
+}
+
+void GCNMinRegScheduler::releaseSuccessors(const SUnit* SU, int Priority) {
+ for (const auto &S : SU->Succs) {
+ auto SuccSU = S.getSUnit();
+ if (S.isWeak())
+ continue;
+ assert(SuccSU->isBoundaryNode() || getNumPreds(SuccSU) > 0);
+ if (!SuccSU->isBoundaryNode() && decNumPreds(SuccSU) == 0)
+ RQ.push_front(*new (Alloc.Allocate()) Candidate(SuccSU, Priority));
+ }
+}
+
+std::vector<const SUnit*>
+GCNMinRegScheduler::schedule(ArrayRef<const SUnit*> TopRoots,
+ const ScheduleDAG &DAG) {
+ const auto &SUnits = DAG.SUnits;
+ std::vector<const SUnit*> Schedule;
+ Schedule.reserve(SUnits.size());
+
+ initNumPreds(SUnits);
+
+ int StepNo = 0;
+
+ for (auto SU : TopRoots) {
+ RQ.push_back(*new (Alloc.Allocate()) Candidate(SU, StepNo));
+ }
+ releaseSuccessors(&DAG.EntrySU, StepNo);
+
+ while (!RQ.empty()) {
+ DEBUG(
+ dbgs() << "\n=== Picking candidate, Step = " << StepNo << "\n"
+ "Ready queue:";
+ for (auto &C : RQ)
+ dbgs() << ' ' << C.SU->NodeNum << "(P" << C.Priority << ')';
+ dbgs() << '\n';
+ );
+
+ auto C = pickCandidate();
+ assert(C);
+ RQ.remove(*C);
+ auto SU = C->SU;
+ DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
+
+ releaseSuccessors(SU, StepNo);
+ Schedule.push_back(SU);
+ setIsScheduled(SU);
+
+ if (getReadySuccessors(SU) == 0)
+ bumpPredsPriority(SU, StepNo);
+
+ ++StepNo;
+ }
+ assert(SUnits.size() == Schedule.size());
+
+ return Schedule;
+}
+
+namespace llvm {
+std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots,
+ const ScheduleDAG &DAG) {
+ GCNMinRegScheduler S;
+ return S.schedule(TopRoots, DAG);
+}
+}
Added: llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.cpp?rev=298368&view=auto
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.cpp (added)
+++ llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.cpp Tue Mar 21 08:15:46 2017
@@ -0,0 +1,355 @@
+//===------------------------- GCNRegPressure.cpp - -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
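+/// This file implements GCNRegPressure (SGPR/VGPR pressure bookkeeping) and
+/// GCNUpwardRPTracker, which recomputes live register sets and pressure by
+/// walking instructions bottom-up using LiveIntervals.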
+//
+//===----------------------------------------------------------------------===//
+
+#include "GCNRegPressure.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+void llvm::printLivesAt(SlotIndex SI,
+ const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI) {
+ dbgs() << "Live regs at " << SI << ": "
+ << *LIS.getInstructionFromIndex(SI);
+ unsigned Num = 0;
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ const unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ if (MRI.reg_nodbg_empty(Reg))
+ continue;
+ const auto &LI = LIS.getInterval(Reg);
+ if (LI.hasSubRanges()) {
+ bool firstTime = true;
+ for (const auto &S : LI.subranges()) {
+ if (!S.liveAt(SI)) continue;
+ if (firstTime) {
+ dbgs() << " " << PrintReg(Reg, MRI.getTargetRegisterInfo())
+ << '\n';
+ firstTime = false;
+ }
+ dbgs() << " " << S << '\n';
+ ++Num;
+ }
+ } else if (LI.liveAt(SI)) {
+ dbgs() << " " << LI << '\n';
+ ++Num;
+ }
+ }
+ if (!Num) dbgs() << " <none>\n";
+}
+
+static bool isEqual(const GCNRPTracker::LiveRegSet &S1,
+ const GCNRPTracker::LiveRegSet &S2) {
+ if (S1.size() != S2.size())
+ return false;
+
+ for (const auto &P : S1) {
+ auto I = S2.find(P.first);
+ if (I == S2.end() || I->second != P.second)
+ return false;
+ }
+ return true;
+}
+
+static GCNRPTracker::LiveRegSet
+stripEmpty(const GCNRPTracker::LiveRegSet &LR) {
+ GCNRPTracker::LiveRegSet Res;
+ for (const auto &P : LR) {
+ if (P.second.any())
+ Res.insert(P);
+ }
+ return Res;
+}
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// GCNRegPressure
+
+unsigned GCNRegPressure::getRegKind(unsigned Reg,
+ const MachineRegisterInfo &MRI) {
+ assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ const auto RC = MRI.getRegClass(Reg);
+ auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
+ return STI->isSGPRClass(RC) ?
+ (RC->getSize() == 4 ? SGPR32 : SGPR_TUPLE) :
+ (RC->getSize() == 4 ? VGPR32 : VGPR_TUPLE);
+}
+
+void GCNRegPressure::inc(unsigned Reg,
+ LaneBitmask PrevMask,
+ LaneBitmask NewMask,
+ const MachineRegisterInfo &MRI) {
+ if (NewMask == PrevMask)
+ return;
+
+ int Sign = 1;
+ if (NewMask < PrevMask) {
+ std::swap(NewMask, PrevMask);
+ Sign = -1;
+ }
+#ifndef NDEBUG
+ const auto MaxMask = MRI.getMaxLaneMaskForVReg(Reg);
+#endif
+ switch (auto Kind = getRegKind(Reg, MRI)) {
+ case SGPR32:
+ case VGPR32:
+ assert(PrevMask.none() && NewMask == MaxMask);
+ Value[Kind] += Sign;
+ break;
+
+ case SGPR_TUPLE:
+ case VGPR_TUPLE:
+ assert(NewMask < MaxMask || NewMask == MaxMask);
+ assert(PrevMask < NewMask);
+
+ Value[Kind == SGPR_TUPLE ? SGPR32 : VGPR32] +=
+ Sign * countPopulation((~PrevMask & NewMask).getAsInteger());
+
+ if (PrevMask.none()) {
+ assert(NewMask.any());
+ Value[Kind] += Sign * MRI.getPressureSets(Reg).getWeight();
+ }
+ break;
+
+ default: llvm_unreachable("Unknown register kind");
+ }
+}
+
+bool GCNRegPressure::less(const SISubtarget &ST,
+ const GCNRegPressure& O,
+ unsigned MaxOccupancy) const {
+ const auto SGPROcc = std::min(MaxOccupancy,
+ ST.getOccupancyWithNumSGPRs(getSGRPNum()));
+ const auto VGPROcc = std::min(MaxOccupancy,
+ ST.getOccupancyWithNumVGPRs(getVGRPNum()));
+ const auto OtherSGPROcc = std::min(MaxOccupancy,
+ ST.getOccupancyWithNumSGPRs(O.getSGRPNum()));
+ const auto OtherVGPROcc = std::min(MaxOccupancy,
+ ST.getOccupancyWithNumVGPRs(O.getVGRPNum()));
+
+ const auto Occ = std::min(SGPROcc, VGPROcc);
+ const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
+ if (Occ != OtherOcc)
+ return Occ > OtherOcc;
+
+ bool SGPRImportant = SGPROcc < VGPROcc;
+ const bool OtherSGPRImportant = OtherSGPROcc < OtherVGPROcc;
+
+  // If both pressures disagree on what is more important, compare VGPRs.
+ if (SGPRImportant != OtherSGPRImportant) {
+ SGPRImportant = false;
+ }
+
+ // compare large regs pressure
+ bool SGPRFirst = SGPRImportant;
+ for (int I = 2; I > 0; --I, SGPRFirst = !SGPRFirst) {
+ if (SGPRFirst) {
+ auto SW = getSGPRTuplesWeight();
+ auto OtherSW = O.getSGPRTuplesWeight();
+ if (SW != OtherSW)
+ return SW < OtherSW;
+ } else {
+ auto VW = getVGPRTuplesWeight();
+ auto OtherVW = O.getVGPRTuplesWeight();
+ if (VW != OtherVW)
+ return VW < OtherVW;
+ }
+ }
+ return SGPRImportant ? (getSGRPNum() < O.getSGRPNum()):
+ (getVGRPNum() < O.getVGRPNum());
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const {
+ OS << "VGPRs: " << getVGRPNum();
+ if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGRPNum()) << ')';
+ OS << ", SGPRs: " << getSGRPNum();
+ if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGRPNum()) << ')';
+ OS << ", LVGPR WT: " << getVGPRTuplesWeight()
+ << ", LSGPR WT: " << getSGPRTuplesWeight();
+ if (ST) OS << " -> Occ: " << getOccupancy(*ST);
+ OS << '\n';
+}
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// GCNRPTracker
+
+LaneBitmask llvm::getLiveLaneMask(unsigned Reg,
+ SlotIndex SI,
+ const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI) {
+ assert(!MRI.reg_nodbg_empty(Reg));
+ LaneBitmask LiveMask;
+ const auto &LI = LIS.getInterval(Reg);
+ if (LI.hasSubRanges()) {
+ for (const auto &S : LI.subranges())
+ if (S.liveAt(SI)) {
+ LiveMask |= S.LaneMask;
+ assert(LiveMask < MRI.getMaxLaneMaskForVReg(Reg) ||
+ LiveMask == MRI.getMaxLaneMaskForVReg(Reg));
+ }
+ } else if (LI.liveAt(SI)) {
+ LiveMask = MRI.getMaxLaneMaskForVReg(Reg);
+ }
+ return LiveMask;
+}
+
+GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
+ const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI) {
+ GCNRPTracker::LiveRegSet LiveRegs;
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ auto Reg = TargetRegisterInfo::index2VirtReg(I);
+ if (MRI.reg_nodbg_empty(Reg))
+ continue;
+ auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI);
+ if (LiveMask.any())
+ LiveRegs[Reg] = LiveMask;
+ }
+ return LiveRegs;
+}
+
+void GCNUpwardRPTracker::reset(const MachineInstr &MI) {
+ MRI = &MI.getParent()->getParent()->getRegInfo();
+ LiveRegs = getLiveRegsAfter(MI, LIS);
+ MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
+}
+
+LaneBitmask GCNUpwardRPTracker::getDefRegMask(const MachineOperand &MO) const {
+ assert(MO.isDef() && MO.isReg() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+
+  // We don't rely on the read-undef flag because it isn't set correctly yet
+  // when tracking a tentative schedule. This works correctly nevertheless,
+  // since the use mask has already been tracked using LIS.
+ return MO.getSubReg() == 0 ?
+ MRI->getMaxLaneMaskForVReg(MO.getReg()) :
+ MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg());
+}
+
+LaneBitmask GCNUpwardRPTracker::getUsedRegMask(const MachineOperand &MO) const {
+ assert(MO.isUse() && MO.isReg() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+
+ if (auto SubReg = MO.getSubReg())
+ return MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);
+
+ auto MaxMask = MRI->getMaxLaneMaskForVReg(MO.getReg());
+ if (MaxMask.getAsInteger() == 1) // cannot have subregs
+ return MaxMask;
+
+  // For a tentative schedule LIS isn't updated yet, but the live mask should
+  // remain the same for any schedule. Subreg defs can be reordered, but they
+  // all must dominate their uses anyway.
+ auto SI = LIS.getInstructionIndex(*MO.getParent()).getBaseIndex();
+ return getLiveLaneMask(MO.getReg(), SI, LIS, *MRI);
+}
+
+void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
+ assert(MRI && "call reset first");
+
+ LastTrackedMI = &MI;
+
+ if (MI.isDebugValue())
+ return;
+
+  // Process all defs first to ensure early clobbers are handled correctly;
+  // iterate over operands() to catch implicit defs.
+ for (const auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isDef() ||
+ !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ continue;
+
+ auto Reg = MO.getReg();
+ auto &LiveMask = LiveRegs[Reg];
+ auto PrevMask = LiveMask;
+ LiveMask &= ~getDefRegMask(MO);
+ CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
+ }
+
+ // then all uses
+ for (const auto &MO : MI.uses()) {
+ if (!MO.isReg() || !MO.readsReg() ||
+ !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ continue;
+
+ auto Reg = MO.getReg();
+ auto &LiveMask = LiveRegs[Reg];
+ auto PrevMask = LiveMask;
+ LiveMask |= getUsedRegMask(MO);
+ CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
+ }
+
+ MaxPressure = max(MaxPressure, CurPressure);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
+ const GCNRPTracker::LiveRegSet &TrackedLR,
+ const TargetRegisterInfo *TRI) {
+ for (auto const &P : TrackedLR) {
+ auto I = LISLR.find(P.first);
+ if (I == LISLR.end()) {
+ dbgs() << " " << PrintReg(P.first, TRI)
+ << ":L" << PrintLaneMask(P.second)
+ << " isn't found in LIS reported set\n";
+ }
+ else if (I->second != P.second) {
+ dbgs() << " " << PrintReg(P.first, TRI)
+        << " masks don't match: LIS reported "
+ << PrintLaneMask(I->second)
+ << ", tracked "
+ << PrintLaneMask(P.second)
+ << '\n';
+ }
+ }
+ for (auto const &P : LISLR) {
+ auto I = TrackedLR.find(P.first);
+ if (I == TrackedLR.end()) {
+ dbgs() << " " << PrintReg(P.first, TRI)
+ << ":L" << PrintLaneMask(P.second)
+ << " isn't found in tracked set\n";
+ }
+ }
+}
+
+bool GCNUpwardRPTracker::isValid() const {
+ const auto &SI = LIS.getInstructionIndex(*LastTrackedMI).getBaseIndex();
+ const auto LISLR = llvm::getLiveRegs(SI, LIS, *MRI);
+ const auto TrackedLR = stripEmpty(LiveRegs);
+
+ if (!isEqual(LISLR, TrackedLR)) {
+ dbgs() << "\nGCNUpwardRPTracker error: Tracked and"
+ " LIS reported livesets mismatch:\n";
+ printLivesAt(SI, LIS, *MRI);
+ reportMismatch(LISLR, TrackedLR, MRI->getTargetRegisterInfo());
+ return false;
+ }
+
+ auto LISPressure = getRegPressure(*MRI, LISLR);
+ if (LISPressure != CurPressure) {
+ dbgs() << "GCNUpwardRPTracker error: Pressure sets different\nTracked: ";
+ CurPressure.print(dbgs());
+ dbgs() << "LIS rpt: ";
+ LISPressure.print(dbgs());
+ return false;
+ }
+ return true;
+}
+
+#endif
Added: llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.h?rev=298368&view=auto
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.h (added)
+++ llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.h Tue Mar 21 08:15:46 2017
@@ -0,0 +1,170 @@
+//===---------------------- GCNRegPressure.h -*- C++ -*--------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
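+/// This file declares GCNRegPressure and the GCNRPTracker/GCNUpwardRPTracker
+/// helpers used by the GCN schedulers to measure register pressure.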
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
+#define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
+
+#include "AMDGPUSubtarget.h"
+
+#include <limits>
+
+namespace llvm {
+
+struct GCNRegPressure {
+ enum RegKind {
+ SGPR32,
+ SGPR_TUPLE,
+ VGPR32,
+ VGPR_TUPLE,
+ TOTAL_KINDS
+ };
+
+ GCNRegPressure() {
+ clear();
+ }
+
+ bool empty() const { return getSGRPNum() == 0 && getVGRPNum() == 0; }
+
+ void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); }
+
+ unsigned getSGRPNum() const { return Value[SGPR32]; }
+ unsigned getVGRPNum() const { return Value[VGPR32]; }
+
+ unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; }
+ unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; }
+
+ unsigned getOccupancy(const SISubtarget &ST) const {
+ return std::min(ST.getOccupancyWithNumSGPRs(getSGRPNum()),
+ ST.getOccupancyWithNumVGPRs(getVGRPNum()));
+ }
+
+ void inc(unsigned Reg,
+ LaneBitmask PrevMask,
+ LaneBitmask NewMask,
+ const MachineRegisterInfo &MRI);
+
+ bool higherOccupancy(const SISubtarget &ST, const GCNRegPressure& O) const {
+ return getOccupancy(ST) > O.getOccupancy(ST);
+ }
+
+ bool less(const SISubtarget &ST, const GCNRegPressure& O,
+ unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const;
+
+ bool operator==(const GCNRegPressure &O) const {
+ return std::equal(&Value[0], &Value[TOTAL_KINDS], O.Value);
+ }
+
+ bool operator!=(const GCNRegPressure &O) const {
+ return !(*this == O);
+ }
+
+ void print(raw_ostream &OS, const SISubtarget *ST=nullptr) const;
+ void dump() const { print(dbgs()); }
+
+private:
+ unsigned Value[TOTAL_KINDS];
+
+ static unsigned getRegKind(unsigned Reg, const MachineRegisterInfo &MRI);
+
+ friend GCNRegPressure max(const GCNRegPressure &P1,
+ const GCNRegPressure &P2);
+};
+
+inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) {
+ GCNRegPressure Res;
+ for (unsigned I = 0; I < GCNRegPressure::TOTAL_KINDS; ++I)
+ Res.Value[I] = std::max(P1.Value[I], P2.Value[I]);
+ return Res;
+}
+
+class GCNRPTracker {
+public:
+ typedef DenseMap<unsigned, LaneBitmask> LiveRegSet;
+
+protected:
+ LiveRegSet LiveRegs;
+ GCNRegPressure CurPressure, MaxPressure;
+ const MachineInstr *LastTrackedMI = nullptr;
+ mutable const MachineRegisterInfo *MRI = nullptr;
+ GCNRPTracker() {}
+public:
+ // live regs for the current state
+ const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
+ const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; }
+
+ // returns MaxPressure, resetting it
+ decltype(MaxPressure) moveMaxPressure() {
+ auto Res = MaxPressure;
+ MaxPressure.clear();
+ return Res;
+ }
+ decltype(LiveRegs) moveLiveRegs() {
+ return std::move(LiveRegs);
+ }
+};
+
+class GCNUpwardRPTracker : public GCNRPTracker {
+ const LiveIntervals &LIS;
+ LaneBitmask getDefRegMask(const MachineOperand &MO) const;
+ LaneBitmask getUsedRegMask(const MachineOperand &MO) const;
+public:
+ GCNUpwardRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {}
+  // Reset the tracker to the point just below MI,
+  // filling live regs at this point using LIS.
+ void reset(const MachineInstr &MI);
+
+ // move to the state just above the MI
+ void recede(const MachineInstr &MI);
+
+  // Checks whether the tracker's state after receding MI corresponds
+  // to the one reported by LIS.
+ bool isValid() const;
+};
+
+LaneBitmask getLiveLaneMask(unsigned Reg,
+ SlotIndex SI,
+ const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI);
+
+GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI,
+ const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI);
+
+inline GCNRPTracker::LiveRegSet getLiveRegsAfter(const MachineInstr &MI,
+ const LiveIntervals &LIS) {
+ return getLiveRegs(LIS.getInstructionIndex(MI).getDeadSlot(), LIS,
+ MI.getParent()->getParent()->getRegInfo());
+}
+
+inline GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI,
+ const LiveIntervals &LIS) {
+ return getLiveRegs(LIS.getInstructionIndex(MI).getBaseIndex(), LIS,
+ MI.getParent()->getParent()->getRegInfo());
+}
+
+template <typename Range>
+GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI,
+ Range &&LiveRegs) {
+ GCNRegPressure Res;
+ for (const auto &RM : LiveRegs)
+ Res.inc(RM.first, LaneBitmask::getNone(), RM.second, MRI);
+ return Res;
+}
+
+void printLivesAt(SlotIndex SI,
+ const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI);
+
+} // End namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
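For reference, GCNUpwardRPTracker is driven bottom-to-top over a region, as
GCNIterativeScheduler::getRegionPressure does above. A minimal sketch, assuming
a non-empty region [Begin, BottomMI] within a single basic block and valid
LiveIntervals (measureRegion is a hypothetical helper, not part of this patch):

  // Hypothetical helper: compute the max register pressure over a region by
  // receding the upward tracker from BottomMI up to and including Begin.
  GCNRegPressure measureRegion(MachineBasicBlock::iterator Begin,
                               MachineBasicBlock::iterator BottomMI,
                               const LiveIntervals &LIS) {
    GCNUpwardRPTracker RPT(LIS);
    RPT.reset(*BottomMI);           // seed live regs just below BottomMI via LIS
    for (auto I = BottomMI; I != Begin; --I)
      RPT.recede(*I);               // walk upwards, updating current/max pressure
    RPT.recede(*Begin);
    return RPT.moveMaxPressure();   // max pressure seen over the region
  }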
Modified: llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.cpp?rev=298368&r1=298367&r2=298368&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.cpp Tue Mar 21 08:15:46 2017
@@ -45,8 +45,6 @@ void GCNMaxOccupancySchedStrategy::initi
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
- if (MF != &DAG->MF)
- TargetOccupancy = 0;
MF = &DAG->MF;
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
@@ -531,7 +529,7 @@ void GCNScheduleDAGMILive::finalizeSched
Stage++;
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
- S.TargetOccupancy = MinOccupancy;
+ S.setTargetOccupancy(MinOccupancy);
MachineBasicBlock *MBB = nullptr;
for (auto Region : Regions) {
Modified: llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.h?rev=298368&r1=298367&r2=298368&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.h Tue Mar 21 08:15:46 2017
@@ -55,6 +55,8 @@ public:
SUnit *pickNode(bool &IsTopNode) override;
void initialize(ScheduleDAGMI *DAG) override;
+
+ void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; }
};
class GCNScheduleDAGMILive : public ScheduleDAGMILive {
Modified: llvm/trunk/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll?rev=298368&r1=298367&r2=298368&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll Tue Mar 21 08:15:46 2017
@@ -1,4 +1,6 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck %s
; We expect two-digit VGPR usage here, not three-digit.
; CHECK: NumVgprs: {{[0-9][0-9]$}}
Added: llvm/trunk/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll?rev=298368&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll Tue Mar 21 08:15:46 2017
@@ -0,0 +1,288 @@
+; RUN: llc -march=amdgcn -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
+
+; SI: NumSgprs: {{[1-9]$}}
+; SI: NumVgprs: {{[1-9]$}}
+
+; stores may alias loads
+; VI: NumSgprs: {{[1-5][0-9]$}}
+; VI: NumVgprs: {{[1-3][0-9]$}}
+
+define void @load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) {
+bb:
+ %adr.a.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20004
+ %adr.b.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20252
+ %adr.c.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20508
+ %adr.a.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20772
+ %adr.b.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21020
+ %adr.c.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21276
+ %adr.a.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21540
+ %adr.b.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21788
+ %adr.c.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22044
+ %adr.a.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22308
+ %adr.b.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22556
+ %adr.c.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22812
+ %adr.a.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23076
+ %adr.b.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23324
+ %adr.c.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23580
+ %adr.a.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23844
+ %adr.b.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24092
+ %adr.c.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24348
+ %adr.a.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24612
+ %adr.b.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24860
+ %adr.c.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25116
+ %adr.a.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25380
+ %adr.b.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25628
+ %adr.c.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25884
+ %adr.a.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26148
+ %adr.b.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26396
+ %adr.c.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26652
+ %adr.a.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26916
+ %adr.b.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27164
+ %adr.c.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27420
+ %adr.a.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27684
+ %adr.b.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27932
+ %adr.c.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28188
+ %adr.a.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28452
+ %adr.b.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28700
+ %adr.c.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28956
+ %adr.a.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29220
+ %adr.b.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29468
+ %adr.c.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29724
+ %adr.a.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29988
+ %adr.b.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30236
+ %adr.c.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30492
+ %adr.a.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30756
+ %adr.b.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31004
+ %adr.c.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31260
+ %adr.a.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31524
+ %adr.b.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31772
+ %adr.c.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32028
+ %adr.a.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32292
+ %adr.b.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32540
+ %adr.c.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32796
+ %adr.a.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33060
+ %adr.b.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33308
+ %adr.c.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33564
+ %adr.a.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33828
+ %adr.b.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34076
+ %adr.c.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34332
+ %adr.a.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34596
+ %adr.b.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34844
+ %adr.c.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35100
+ %adr.a.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35364
+ %adr.b.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35612
+ %adr.c.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35868
+ %adr.a.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36132
+ %adr.b.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36380
+ %adr.c.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36636
+ %adr.a.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36900
+ %adr.b.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37148
+ %adr.c.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37404
+ %adr.a.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37668
+ %adr.b.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37916
+ %adr.c.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38172
+ %adr.a.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38436
+ %adr.b.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38684
+ %adr.c.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38940
+ %adr.a.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39204
+ %adr.b.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39452
+ %adr.c.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39708
+ %adr.a.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39972
+ %adr.b.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40220
+ %adr.c.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40476
+ %adr.a.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40740
+ %adr.b.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40988
+ %adr.c.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41244
+ %adr.a.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41508
+ %adr.b.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41756
+ %adr.c.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42012
+ %adr.a.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42276
+ %adr.b.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42524
+ %adr.c.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42780
+ %a.0 = load float, float addrspace(3)* %adr.a.0, align 4
+ %b.0 = load float, float addrspace(3)* %adr.b.0, align 4
+ %c.0 = load float, float addrspace(3)* %adr.c.0, align 4
+ %a.1 = load float, float addrspace(3)* %adr.a.1, align 4
+ %b.1 = load float, float addrspace(3)* %adr.b.1, align 4
+ %c.1 = load float, float addrspace(3)* %adr.c.1, align 4
+ %a.2 = load float, float addrspace(3)* %adr.a.2, align 4
+ %b.2 = load float, float addrspace(3)* %adr.b.2, align 4
+ %c.2 = load float, float addrspace(3)* %adr.c.2, align 4
+ %a.3 = load float, float addrspace(3)* %adr.a.3, align 4
+ %b.3 = load float, float addrspace(3)* %adr.b.3, align 4
+ %c.3 = load float, float addrspace(3)* %adr.c.3, align 4
+ %a.4 = load float, float addrspace(3)* %adr.a.4, align 4
+ %b.4 = load float, float addrspace(3)* %adr.b.4, align 4
+ %c.4 = load float, float addrspace(3)* %adr.c.4, align 4
+ %a.5 = load float, float addrspace(3)* %adr.a.5, align 4
+ %b.5 = load float, float addrspace(3)* %adr.b.5, align 4
+ %c.5 = load float, float addrspace(3)* %adr.c.5, align 4
+ %a.6 = load float, float addrspace(3)* %adr.a.6, align 4
+ %b.6 = load float, float addrspace(3)* %adr.b.6, align 4
+ %c.6 = load float, float addrspace(3)* %adr.c.6, align 4
+ %a.7 = load float, float addrspace(3)* %adr.a.7, align 4
+ %b.7 = load float, float addrspace(3)* %adr.b.7, align 4
+ %c.7 = load float, float addrspace(3)* %adr.c.7, align 4
+ %a.8 = load float, float addrspace(3)* %adr.a.8, align 4
+ %b.8 = load float, float addrspace(3)* %adr.b.8, align 4
+ %c.8 = load float, float addrspace(3)* %adr.c.8, align 4
+ %a.9 = load float, float addrspace(3)* %adr.a.9, align 4
+ %b.9 = load float, float addrspace(3)* %adr.b.9, align 4
+ %c.9 = load float, float addrspace(3)* %adr.c.9, align 4
+ %a.10 = load float, float addrspace(3)* %adr.a.10, align 4
+ %b.10 = load float, float addrspace(3)* %adr.b.10, align 4
+ %c.10 = load float, float addrspace(3)* %adr.c.10, align 4
+ %a.11 = load float, float addrspace(3)* %adr.a.11, align 4
+ %b.11 = load float, float addrspace(3)* %adr.b.11, align 4
+ %c.11 = load float, float addrspace(3)* %adr.c.11, align 4
+ %a.12 = load float, float addrspace(3)* %adr.a.12, align 4
+ %b.12 = load float, float addrspace(3)* %adr.b.12, align 4
+ %c.12 = load float, float addrspace(3)* %adr.c.12, align 4
+ %a.13 = load float, float addrspace(3)* %adr.a.13, align 4
+ %b.13 = load float, float addrspace(3)* %adr.b.13, align 4
+ %c.13 = load float, float addrspace(3)* %adr.c.13, align 4
+ %a.14 = load float, float addrspace(3)* %adr.a.14, align 4
+ %b.14 = load float, float addrspace(3)* %adr.b.14, align 4
+ %c.14 = load float, float addrspace(3)* %adr.c.14, align 4
+ %a.15 = load float, float addrspace(3)* %adr.a.15, align 4
+ %b.15 = load float, float addrspace(3)* %adr.b.15, align 4
+ %c.15 = load float, float addrspace(3)* %adr.c.15, align 4
+ %a.16 = load float, float addrspace(3)* %adr.a.16, align 4
+ %b.16 = load float, float addrspace(3)* %adr.b.16, align 4
+ %c.16 = load float, float addrspace(3)* %adr.c.16, align 4
+ %a.17 = load float, float addrspace(3)* %adr.a.17, align 4
+ %b.17 = load float, float addrspace(3)* %adr.b.17, align 4
+ %c.17 = load float, float addrspace(3)* %adr.c.17, align 4
+ %a.18 = load float, float addrspace(3)* %adr.a.18, align 4
+ %b.18 = load float, float addrspace(3)* %adr.b.18, align 4
+ %c.18 = load float, float addrspace(3)* %adr.c.18, align 4
+ %a.19 = load float, float addrspace(3)* %adr.a.19, align 4
+ %b.19 = load float, float addrspace(3)* %adr.b.19, align 4
+ %c.19 = load float, float addrspace(3)* %adr.c.19, align 4
+ %a.20 = load float, float addrspace(3)* %adr.a.20, align 4
+ %b.20 = load float, float addrspace(3)* %adr.b.20, align 4
+ %c.20 = load float, float addrspace(3)* %adr.c.20, align 4
+ %a.21 = load float, float addrspace(3)* %adr.a.21, align 4
+ %b.21 = load float, float addrspace(3)* %adr.b.21, align 4
+ %c.21 = load float, float addrspace(3)* %adr.c.21, align 4
+ %a.22 = load float, float addrspace(3)* %adr.a.22, align 4
+ %b.22 = load float, float addrspace(3)* %adr.b.22, align 4
+ %c.22 = load float, float addrspace(3)* %adr.c.22, align 4
+ %a.23 = load float, float addrspace(3)* %adr.a.23, align 4
+ %b.23 = load float, float addrspace(3)* %adr.b.23, align 4
+ %c.23 = load float, float addrspace(3)* %adr.c.23, align 4
+ %a.24 = load float, float addrspace(3)* %adr.a.24, align 4
+ %b.24 = load float, float addrspace(3)* %adr.b.24, align 4
+ %c.24 = load float, float addrspace(3)* %adr.c.24, align 4
+ %a.25 = load float, float addrspace(3)* %adr.a.25, align 4
+ %b.25 = load float, float addrspace(3)* %adr.b.25, align 4
+ %c.25 = load float, float addrspace(3)* %adr.c.25, align 4
+ %a.26 = load float, float addrspace(3)* %adr.a.26, align 4
+ %b.26 = load float, float addrspace(3)* %adr.b.26, align 4
+ %c.26 = load float, float addrspace(3)* %adr.c.26, align 4
+ %a.27 = load float, float addrspace(3)* %adr.a.27, align 4
+ %b.27 = load float, float addrspace(3)* %adr.b.27, align 4
+ %c.27 = load float, float addrspace(3)* %adr.c.27, align 4
+ %a.28 = load float, float addrspace(3)* %adr.a.28, align 4
+ %b.28 = load float, float addrspace(3)* %adr.b.28, align 4
+ %c.28 = load float, float addrspace(3)* %adr.c.28, align 4
+ %a.29 = load float, float addrspace(3)* %adr.a.29, align 4
+ %b.29 = load float, float addrspace(3)* %adr.b.29, align 4
+ %c.29 = load float, float addrspace(3)* %adr.c.29, align 4
+ %res.0 = tail call float @llvm.fmuladd.f32(float %a.0, float %b.0, float %c.0)
+ %res.1 = tail call float @llvm.fmuladd.f32(float %a.1, float %b.1, float %c.1)
+ %res.2 = tail call float @llvm.fmuladd.f32(float %a.2, float %b.2, float %c.2)
+ %res.3 = tail call float @llvm.fmuladd.f32(float %a.3, float %b.3, float %c.3)
+ %res.4 = tail call float @llvm.fmuladd.f32(float %a.4, float %b.4, float %c.4)
+ %res.5 = tail call float @llvm.fmuladd.f32(float %a.5, float %b.5, float %c.5)
+ %res.6 = tail call float @llvm.fmuladd.f32(float %a.6, float %b.6, float %c.6)
+ %res.7 = tail call float @llvm.fmuladd.f32(float %a.7, float %b.7, float %c.7)
+ %res.8 = tail call float @llvm.fmuladd.f32(float %a.8, float %b.8, float %c.8)
+ %res.9 = tail call float @llvm.fmuladd.f32(float %a.9, float %b.9, float %c.9)
+ %res.10 = tail call float @llvm.fmuladd.f32(float %a.10, float %b.10, float %c.10)
+ %res.11 = tail call float @llvm.fmuladd.f32(float %a.11, float %b.11, float %c.11)
+ %res.12 = tail call float @llvm.fmuladd.f32(float %a.12, float %b.12, float %c.12)
+ %res.13 = tail call float @llvm.fmuladd.f32(float %a.13, float %b.13, float %c.13)
+ %res.14 = tail call float @llvm.fmuladd.f32(float %a.14, float %b.14, float %c.14)
+ %res.15 = tail call float @llvm.fmuladd.f32(float %a.15, float %b.15, float %c.15)
+ %res.16 = tail call float @llvm.fmuladd.f32(float %a.16, float %b.16, float %c.16)
+ %res.17 = tail call float @llvm.fmuladd.f32(float %a.17, float %b.17, float %c.17)
+ %res.18 = tail call float @llvm.fmuladd.f32(float %a.18, float %b.18, float %c.18)
+ %res.19 = tail call float @llvm.fmuladd.f32(float %a.19, float %b.19, float %c.19)
+ %res.20 = tail call float @llvm.fmuladd.f32(float %a.20, float %b.20, float %c.20)
+ %res.21 = tail call float @llvm.fmuladd.f32(float %a.21, float %b.21, float %c.21)
+ %res.22 = tail call float @llvm.fmuladd.f32(float %a.22, float %b.22, float %c.22)
+ %res.23 = tail call float @llvm.fmuladd.f32(float %a.23, float %b.23, float %c.23)
+ %res.24 = tail call float @llvm.fmuladd.f32(float %a.24, float %b.24, float %c.24)
+ %res.25 = tail call float @llvm.fmuladd.f32(float %a.25, float %b.25, float %c.25)
+ %res.26 = tail call float @llvm.fmuladd.f32(float %a.26, float %b.26, float %c.26)
+ %res.27 = tail call float @llvm.fmuladd.f32(float %a.27, float %b.27, float %c.27)
+ %res.28 = tail call float @llvm.fmuladd.f32(float %a.28, float %b.28, float %c.28)
+ %res.29 = tail call float @llvm.fmuladd.f32(float %a.29, float %b.29, float %c.29)
+ %adr.res.0 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 0
+ %adr.res.1 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 2
+ %adr.res.2 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 4
+ %adr.res.3 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 6
+ %adr.res.4 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 8
+ %adr.res.5 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 10
+ %adr.res.6 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 12
+ %adr.res.7 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 14
+ %adr.res.8 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 16
+ %adr.res.9 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 18
+ %adr.res.10 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 20
+ %adr.res.11 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 22
+ %adr.res.12 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 24
+ %adr.res.13 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 26
+ %adr.res.14 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 28
+ %adr.res.15 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 30
+ %adr.res.16 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 32
+ %adr.res.17 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 34
+ %adr.res.18 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 36
+ %adr.res.19 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 38
+ %adr.res.20 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 40
+ %adr.res.21 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 42
+ %adr.res.22 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 44
+ %adr.res.23 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 46
+ %adr.res.24 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 48
+ %adr.res.25 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 50
+ %adr.res.26 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 52
+ %adr.res.27 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 54
+ %adr.res.28 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 56
+ %adr.res.29 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 58
+ store float %res.0, float addrspace(1)* %adr.res.0, align 4
+ store float %res.1, float addrspace(1)* %adr.res.1, align 4
+ store float %res.2, float addrspace(1)* %adr.res.2, align 4
+ store float %res.3, float addrspace(1)* %adr.res.3, align 4
+ store float %res.4, float addrspace(1)* %adr.res.4, align 4
+ store float %res.5, float addrspace(1)* %adr.res.5, align 4
+ store float %res.6, float addrspace(1)* %adr.res.6, align 4
+ store float %res.7, float addrspace(1)* %adr.res.7, align 4
+ store float %res.8, float addrspace(1)* %adr.res.8, align 4
+ store float %res.9, float addrspace(1)* %adr.res.9, align 4
+ store float %res.10, float addrspace(1)* %adr.res.10, align 4
+ store float %res.11, float addrspace(1)* %adr.res.11, align 4
+ store float %res.12, float addrspace(1)* %adr.res.12, align 4
+ store float %res.13, float addrspace(1)* %adr.res.13, align 4
+ store float %res.14, float addrspace(1)* %adr.res.14, align 4
+ store float %res.15, float addrspace(1)* %adr.res.15, align 4
+ store float %res.16, float addrspace(1)* %adr.res.16, align 4
+ store float %res.17, float addrspace(1)* %adr.res.17, align 4
+ store float %res.18, float addrspace(1)* %adr.res.18, align 4
+ store float %res.19, float addrspace(1)* %adr.res.19, align 4
+ store float %res.20, float addrspace(1)* %adr.res.20, align 4
+ store float %res.21, float addrspace(1)* %adr.res.21, align 4
+ store float %res.22, float addrspace(1)* %adr.res.22, align 4
+ store float %res.23, float addrspace(1)* %adr.res.23, align 4
+ store float %res.24, float addrspace(1)* %adr.res.24, align 4
+ store float %res.25, float addrspace(1)* %adr.res.25, align 4
+ store float %res.26, float addrspace(1)* %adr.res.26, align 4
+ store float %res.27, float addrspace(1)* %adr.res.27, align 4
+ store float %res.28, float addrspace(1)* %adr.res.28, align 4
+ store float %res.29, float addrspace(1)* %adr.res.29, align 4
+ ret void
+}
+declare float @llvm.fmuladd.f32(float, float, float) #0
+attributes #0 = { nounwind readnone }