[llvm] r333289 - [AMDGPU] Add perf hints to functions
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Fri May 25 10:25:12 PDT 2018
Author: rampitec
Date: Fri May 25 10:25:12 2018
New Revision: 333289
URL: http://llvm.org/viewvc/llvm-project?rev=333289&view=rev
Log:
[AMDGPU] Add perf hints to functions
This is an adoption of the HSAIL perfhint pass. Two types of hints are produced:
1. Function is memory bound.
2. Kernel can use wave limiter.
Currently these hints are used in the scheduler. If a function is suspected
to be memory bound we allow occupancy to decrease to 4 waves in the course
of scheduling.
Differential Revision: https://reviews.llvm.org/D46992
Added:
llvm/trunk/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
llvm/trunk/test/CodeGen/AMDGPU/perfhint.ll
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineFunction.h
llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.cpp
llvm/trunk/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPU.h?rev=333289&r1=333288&r2=333289&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPU.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.h Fri May 25 10:25:12 2018
@@ -136,6 +136,9 @@ extern char &AMDGPUSimplifyLibCallsID;
void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
extern char &AMDGPUUseNativeCallsID;
+void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
+extern char &AMDGPUPerfHintAnalysisID;
+
// Passes common to R600 and SI
FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp?rev=333289&r1=333288&r2=333289&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp Fri May 25 10:25:12 2018
@@ -278,11 +278,14 @@ void AMDGPUAsmPrinter::emitCommonFunctio
uint32_t NumVGPR,
uint32_t NumSGPR,
uint64_t ScratchSize,
- uint64_t CodeSize) {
+ uint64_t CodeSize,
+ const AMDGPUMachineFunction *MFI) {
OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
+ OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
+ false);
}
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
@@ -339,7 +342,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunct
Info.NumVGPR,
Info.getTotalNumSGPRs(MF.getSubtarget<SISubtarget>()),
Info.PrivateSegmentSize,
- getFunctionCodeSize(MF));
+ getFunctionCodeSize(MF), MFI);
return false;
}
@@ -347,7 +350,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunct
emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
CurrentProgramInfo.NumSGPR,
CurrentProgramInfo.ScratchSize,
- getFunctionCodeSize(MF));
+ getFunctionCodeSize(MF), MFI);
OutStreamer->emitRawComment(
" FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
@@ -376,6 +379,9 @@ bool AMDGPUAsmPrinter::runOnMachineFunct
" ReservedVGPRCount: " + Twine(CurrentProgramInfo.ReservedVGPRCount),
false);
+ OutStreamer->emitRawComment(
+ " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
+
if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
OutStreamer->emitRawComment(
" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.h?rev=333289&r1=333288&r2=333289&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.h Fri May 25 10:25:12 2018
@@ -29,6 +29,7 @@
namespace llvm {
+class AMDGPUMachineFunction;
class AMDGPUTargetStreamer;
class MCOperand;
class SISubtarget;
@@ -144,7 +145,8 @@ private:
void emitCommonFunctionComments(uint32_t NumVGPR,
uint32_t NumSGPR,
uint64_t ScratchSize,
- uint64_t CodeSize);
+ uint64_t CodeSize,
+ const AMDGPUMachineFunction* MFI);
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM,
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp?rev=333289&r1=333288&r2=333289&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp Fri May 25 10:25:12 2018
@@ -16,6 +16,7 @@
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPUInstrInfo.h"
+#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
@@ -85,6 +86,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AMDGPUArgumentUsageInfo>();
+ AU.addRequired<AMDGPUPerfHintAnalysis>();
AU.addRequired<DivergenceAnalysis>();
SelectionDAGISel::getAnalysisUsage(AU);
}
@@ -242,6 +244,7 @@ public:
INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "isel",
"AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
+INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "isel",
"AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp?rev=333289&r1=333288&r2=333289&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp Fri May 25 10:25:12 2018
@@ -9,6 +9,8 @@
#include "AMDGPUMachineFunction.h"
#include "AMDGPUSubtarget.h"
+#include "AMDGPUPerfHintAnalysis.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
using namespace llvm;
@@ -20,9 +22,19 @@ AMDGPUMachineFunction::AMDGPUMachineFunc
LDSSize(0),
ABIArgOffset(0),
IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())),
- NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
+ NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath),
+ MemoryBound(false),
+ WaveLimiter(false) {
// FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
// except reserved size is not correctly aligned.
+
+ if (auto *Resolver = MF.getMMI().getResolver()) {
+ if (AMDGPUPerfHintAnalysis *PHA = static_cast<AMDGPUPerfHintAnalysis*>(
+ Resolver->getAnalysisIfAvailable(&AMDGPUPerfHintAnalysisID, true))) {
+ MemoryBound = PHA->isMemoryBound(&MF.getFunction());
+ WaveLimiter = PHA->needsWaveLimiter(&MF.getFunction());
+ }
+ }
}
unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineFunction.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineFunction.h?rev=333289&r1=333288&r2=333289&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineFunction.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineFunction.h Fri May 25 10:25:12 2018
@@ -36,6 +36,12 @@ class AMDGPUMachineFunction : public Mac
bool NoSignedZerosFPMath;
+ // Function may be memory bound.
+ bool MemoryBound;
+
+ // Kernel may need limited waves per EU for better performance.
+ bool WaveLimiter;
+
public:
AMDGPUMachineFunction(const MachineFunction &MF);
@@ -78,6 +84,14 @@ public:
return NoSignedZerosFPMath;
}
+ bool isMemoryBound() const {
+ return MemoryBound;
+ }
+
+ bool needsWaveLimiter() const {
+ return WaveLimiter;
+ }
+
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalValue &GV);
};
Added: llvm/trunk/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp?rev=333289&view=auto
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp (added)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp Fri May 25 10:25:12 2018
@@ -0,0 +1,404 @@
+//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Analyzes if a function is potentially memory bound and if a
+/// kernel may benefit from limiting number of waves to reduce cache thrashing.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUPerfHintAnalysis.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-perf-hint"
+
+// Percentage threshold used by isMemBound(): a function is reported as memory
+// bound when MemInstCount * 100 / InstCount exceeds this value.
+static cl::opt<unsigned>
+ MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
+ cl::desc("Function mem bound threshold in %"));
+
+// Percentage threshold used by needLimitWave(): the weighted memory
+// instruction count, scaled by 100 and divided by InstCount, must exceed this
+// value for the wave limiter hint to be set.
+static cl::opt<unsigned>
+ LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
+ cl::desc("Kernel limit wave threshold in %"));
+
+// Multiplier applied to the indirect access instruction count (IAMInstCount)
+// in needLimitWave().
+static cl::opt<unsigned>
+ IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
+ cl::desc("Indirect access memory instruction weight"));
+
+// Multiplier applied to the large stride instruction count (LSMInstCount) in
+// needLimitWave().
+static cl::opt<unsigned>
+ LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
+ cl::desc("Large stride memory access weight"));
+
+// Distance between the constant offsets of two consecutive accesses off the
+// same base above which the later access counts as large stride.
+// NOTE(review): the offset presumably is in bytes - confirm against
+// GetPointerBaseWithConstantOffset.
+static cl::opt<unsigned>
+ LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden,
+ cl::desc("Large stride memory access threshold"));
+
+STATISTIC(NumMemBound, "Number of functions marked as memory bound");
+STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");
+
+char llvm::AMDGPUPerfHintAnalysis::ID = 0;
+char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID;
+
+INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE,
+ "Analysis if a function is memory bound", true, true)
+
+namespace {
+
+/// Worker that walks a function (and its defined callees), records instruction
+/// counts in the analysis' function info map and evaluates the memory-bound
+/// and wave-limiter heuristics on those counts.
+struct AMDGPUPerfHint {
+  friend AMDGPUPerfHintAnalysis;
+
+public:
+  /// \p FIM_ is the map the gathered counts are stored in; \p TLI_ is used to
+  /// query addressing mode legality while counting instructions.
+  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
+                 const TargetLowering *TLI_)
+      : FIM(FIM_), DL(nullptr), TLI(TLI_) {}
+
+  /// Analyze \p F unless the map already holds an entry for it.
+  void runOnFunction(Function &F);
+
+private:
+  /// Description of one memory access: the accessed pointer, the base it was
+  /// decomposed into and the constant offset from that base.
+  struct MemAccessInfo {
+    const Value *V = nullptr;
+    const Value *Base = nullptr;
+    int64_t Offset = 0;
+
+    MemAccessInfo() = default;
+
+    /// True if this access shares its base with \p Reference and lies further
+    /// than the large stride threshold away from it.
+    bool isLargeStride(MemAccessInfo &Reference) const;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    Printable print() const {
+      return Printable([this](raw_ostream &OS) {
+        OS << "Value: " << *V << '\n'
+           << "Base: " << *Base << " Offset: " << Offset << '\n';
+      });
+    }
+#endif
+  };
+
+  MemAccessInfo makeMemAccessInfo(Instruction *) const;
+
+  /// Most recent memory access seen; reset at the start of each basic block.
+  MemAccessInfo LastAccess;
+
+  /// Destination of the per-function counts.
+  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;
+
+  /// Data layout of the module being analyzed; set in runOnFunction().
+  const DataLayout *DL;
+
+  /// Address space numbering of the module; set in runOnFunction().
+  AMDGPUAS AS;
+
+  const TargetLowering *TLI;
+
+  AMDGPUPerfHintAnalysis::FuncInfoMap::iterator visit(const Function &F);
+
+  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
+  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);
+
+  bool isIndirectAccess(const Instruction *Inst) const;
+
+  /// Check whether the memory instruction is a large stride access.
+  /// The intent is to recognize access patterns such as:
+  ///   x = a[i];
+  ///   y = a[i+1000];
+  ///   z = a[i+2000];
+  /// Here the second and the third access are marked as large stride memory
+  /// accesses.
+  bool isLargeStride(const Instruction *Inst);
+
+  bool isGlobalAddr(const Value *V) const;
+  bool isLocalAddr(const Value *V) const;
+  bool isConstantAddr(const Value *V) const;
+};
+
+/// Returns the pointer operand of a load, store, atomic cmpxchg or atomic RMW
+/// instruction, or the raw destination of a memory intrinsic. Any other
+/// instruction yields nullptr.
+static const Value *getMemoryInstrPtr(const Instruction *Inst) {
+  if (const auto *Load = dyn_cast<LoadInst>(Inst))
+    return Load->getPointerOperand();
+
+  if (const auto *Store = dyn_cast<StoreInst>(Inst))
+    return Store->getPointerOperand();
+
+  if (const auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(Inst))
+    return CmpXchg->getPointerOperand();
+
+  if (const auto *RMW = dyn_cast<AtomicRMWInst>(Inst))
+    return RMW->getPointerOperand();
+
+  const auto *MemIntr = dyn_cast<AnyMemIntrinsic>(Inst);
+  return MemIntr ? MemIntr->getRawDest() : nullptr;
+}
+
+/// Returns true if the address of the global (or flat) memory access \p Inst
+/// is itself derived from a value loaded from global, local or constant
+/// memory. The address computation is followed backwards through GEPs, unary
+/// and binary operators, selects, extractelements and PHI nodes.
+bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
+  LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
+  SmallSet<const Value *, 32> Pending;
+  SmallSet<const Value *, 32> Seen;
+
+  // Only accesses through a global (or flat) pointer seed the search.
+  const Value *Addr = getMemoryInstrPtr(Inst);
+  if (Addr && isGlobalAddr(Addr))
+    Pending.insert(Addr);
+
+  while (!Pending.empty()) {
+    const Value *Cur = *Pending.begin();
+    Pending.erase(Cur);
+    // Skip values that were already examined (e.g. reached through a PHI).
+    if (!Seen.insert(Cur).second)
+      continue;
+    LLVM_DEBUG(dbgs() << " check: " << *Cur << '\n');
+
+    // A load feeding the address makes the access indirect if it reads from
+    // one of the tracked address spaces; otherwise the chain ends here.
+    if (const auto *Load = dyn_cast<LoadInst>(Cur)) {
+      const Value *Src = Load->getPointerOperand();
+      if (!isGlobalAddr(Src) && !isLocalAddr(Src) && !isConstantAddr(Src))
+        continue;
+      LLVM_DEBUG(dbgs() << " is IA\n");
+      return true;
+    }
+
+    if (const auto *GEP = dyn_cast<GetElementPtrInst>(Cur)) {
+      Pending.insert(GEP->getPointerOperand());
+      for (auto Idx = GEP->idx_begin(), End = GEP->idx_end(); Idx != End; ++Idx)
+        Pending.insert(Idx->get());
+      continue;
+    }
+
+    if (const auto *Unary = dyn_cast<UnaryInstruction>(Cur)) {
+      Pending.insert(Unary->getOperand(0));
+      continue;
+    }
+
+    if (const auto *BinOp = dyn_cast<BinaryOperator>(Cur)) {
+      Pending.insert(BinOp->getOperand(0));
+      Pending.insert(BinOp->getOperand(1));
+      continue;
+    }
+
+    if (const auto *Sel = dyn_cast<SelectInst>(Cur)) {
+      Pending.insert(Sel->getFalseValue());
+      Pending.insert(Sel->getTrueValue());
+      continue;
+    }
+
+    if (const auto *Extract = dyn_cast<ExtractElementInst>(Cur)) {
+      Pending.insert(Extract->getVectorOperand());
+      continue;
+    }
+
+    if (const auto *Phi = dyn_cast<PHINode>(Cur)) {
+      for (const Value *Incoming : Phi->incoming_values())
+        Pending.insert(Incoming);
+      continue;
+    }
+
+    // Anything else is not followed any further.
+    LLVM_DEBUG(dbgs() << " dropped\n");
+  }
+
+  LLVM_DEBUG(dbgs() << " is not IA\n");
+  return false;
+}
+
+/// Collects instruction statistics for \p F and stores them in FIM. The
+/// statistics of every defined callee are computed recursively and added to
+/// the caller's counts at each call site.
+///
+/// Returns an iterator to the FIM entry of \p F that is valid at the point of
+/// return. If \p F already has an entry (including the partially filled entry
+/// of a function that is still being visited further up the call chain), that
+/// entry is returned unchanged.
+AMDGPUPerfHintAnalysis::FuncInfoMap::iterator
+AMDGPUPerfHint::visit(const Function &F) {
+  auto FIP = FIM.insert(std::make_pair(&F, AMDGPUPerfHintAnalysis::FuncInfo()));
+  if (!FIP.second)
+    return FIP.first;
+
+  // Visiting a callee inserts into FIM, which may rehash the map and
+  // invalidate every reference and iterator into it. Use a pointer that is
+  // re-acquired after each recursive visit instead of a long-lived reference.
+  AMDGPUPerfHintAnalysis::FuncInfo *FI = &FIP.first->second;
+
+  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');
+
+  for (auto &B : F) {
+    // Large stride detection only relates accesses within one basic block.
+    LastAccess = MemAccessInfo();
+    for (auto &I : B) {
+      if (getMemoryInstrPtr(&I)) {
+        if (isIndirectAccess(&I))
+          ++FI->IAMInstCount;
+        if (isLargeStride(&I))
+          ++FI->LSMInstCount;
+        ++FI->MemInstCount;
+        ++FI->InstCount;
+        continue;
+      }
+      CallSite CS(const_cast<Instruction *>(&I));
+      if (CS) {
+        Function *Callee = CS.getCalledFunction();
+        // Indirect calls and calls to declarations count as one instruction.
+        if (!Callee || Callee->isDeclaration()) {
+          ++FI->InstCount;
+          continue;
+        }
+        if (&F == Callee) // Handle immediate recursion
+          continue;
+
+        // The callee walk resets and overwrites LastAccess; keep the caller's
+        // per-block state intact across the call.
+        const MemAccessInfo SavedAccess = LastAccess;
+        auto Loc = visit(*Callee);
+        LastAccess = SavedAccess;
+
+        assert(Loc != FIM.end() && "No func info");
+        const AMDGPUPerfHintAnalysis::FuncInfo CalleeFI = Loc->second;
+
+        // The map may have grown while visiting the callee: look up our own
+        // entry again before updating it.
+        FI = &FIM.find(&F)->second;
+        FI->MemInstCount += CalleeFI.MemInstCount;
+        FI->InstCount += CalleeFI.InstCount;
+        FI->IAMInstCount += CalleeFI.IAMInstCount;
+        FI->LSMInstCount += CalleeFI.LSMInstCount;
+      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+        TargetLoweringBase::AddrMode AM;
+        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
+        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
+        AM.HasBaseReg = !AM.BaseGV;
+        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
+                                       GEP->getPointerAddressSpace()))
+          // Offset will likely be folded into load or store
+          continue;
+        ++FI->InstCount;
+      } else {
+        ++FI->InstCount;
+      }
+    }
+  }
+
+  // FIP.first may be stale after recursive insertions; return a fresh
+  // iterator to the entry of F.
+  return FIM.find(&F);
+}
+
+/// Gathers the statistics of \p F (unless already present in FIM) and updates
+/// the pass statistics for memory bound functions and wave limited kernels.
+void AMDGPUPerfHint::runOnFunction(Function &F) {
+  // Nothing to do if the function already has an entry in the map.
+  if (FIM.count(&F))
+    return;
+
+  const Module &M = *F.getParent();
+  DL = &M.getDataLayout();
+  AS = AMDGPU::getAMDGPUAS(M);
+
+  auto Loc = visit(F);
+  assert(Loc != FIM.end() && "No func info");
+
+  const AMDGPUPerfHintAnalysis::FuncInfo &Info = Loc->second;
+  LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Info.MemInstCount
+                    << '\n'
+                    << " IAMInst: " << Info.IAMInstCount << '\n'
+                    << " LSMInst: " << Info.LSMInstCount << '\n'
+                    << " TotalInst: " << Info.InstCount << '\n');
+
+  if (isMemBound(Info)) {
+    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
+    ++NumMemBound;
+  }
+
+  // The wave limiter hint is only evaluated for entry functions.
+  const bool IsEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv());
+  if (IsEntry && needLimitWave(Info)) {
+    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
+    ++NumLimitWave;
+  }
+}
+
+/// Returns true when more than MemBoundThresh percent of the instructions
+/// counted for the function are memory instructions.
+bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
+  // No instructions counted: avoid dividing by zero, report "not memory bound".
+  if (FI.InstCount == 0)
+    return false;
+
+  // Widen before scaling so the multiplication cannot wrap in 32 bits.
+  const uint64_t Percent = uint64_t(FI.MemInstCount) * 100 / FI.InstCount;
+  return Percent > MemBoundThresh;
+}
+
+/// Returns true when the weighted share of memory instructions exceeds
+/// LimitWaveThresh percent. Indirect accesses are weighted by IAWeight and
+/// large stride accesses by LSWeight on top of the plain memory instruction
+/// count.
+bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
+  // No instructions counted: avoid dividing by zero, report "no limit needed".
+  if (FI.InstCount == 0)
+    return false;
+
+  // With the default weight of 1000, count * weight * 100 wraps a 32-bit
+  // unsigned at roughly 43k weighted instructions, so compute in 64 bits.
+  const uint64_t IndirectWeight = IAWeight;
+  const uint64_t StrideWeight = LSWeight;
+  const uint64_t Weighted = uint64_t(FI.MemInstCount) +
+                            uint64_t(FI.IAMInstCount) * IndirectWeight +
+                            uint64_t(FI.LSMInstCount) * StrideWeight;
+  return Weighted * 100 / FI.InstCount > LimitWaveThresh;
+}
+
+/// True if \p V is a pointer into the global or the flat address space.
+bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
+  const auto *PtrTy = dyn_cast<PointerType>(V->getType());
+  if (!PtrTy)
+    return false;
+
+  // Flat likely points to global too.
+  const unsigned AddrSpace = PtrTy->getAddressSpace();
+  return AddrSpace == AS.GLOBAL_ADDRESS || AddrSpace == AS.FLAT_ADDRESS;
+}
+
+/// True if \p V is a pointer into the local address space.
+bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
+  const auto *PtrTy = dyn_cast<PointerType>(V->getType());
+  return PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS;
+}
+
+/// Compares the access made by \p Inst with the previously recorded access
+/// and reports whether it is a large stride access. The recorded access is
+/// then replaced by this one, provided a base could be determined for it.
+bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
+  LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');
+
+  MemAccessInfo Current = makeMemAccessInfo(const_cast<Instruction *>(Inst));
+  const bool Large = Current.isLargeStride(LastAccess);
+
+  // Accesses without a base do not become the new reference point.
+  if (Current.Base)
+    LastAccess = Current;
+
+  return Large;
+}
+
+/// Builds the access description of memory instruction \p Inst. For a local
+/// address the returned description is left empty (null value and base).
+AMDGPUPerfHint::MemAccessInfo
+AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
+  const Value *Ptr = getMemoryInstrPtr(Inst);
+  LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *Ptr << '\n');
+
+  MemAccessInfo Info;
+  // Do not treat local-addr memory access as large stride.
+  if (!isLocalAddr(Ptr)) {
+    Info.V = Ptr;
+    Info.Base = GetPointerBaseWithConstantOffset(Ptr, Info.Offset, *DL);
+  }
+  return Info;
+}
+
+/// True if \p V is a pointer into the constant or the 32-bit constant address
+/// space.
+bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
+  const auto *PtrTy = dyn_cast<PointerType>(V->getType());
+  if (!PtrTy)
+    return false;
+
+  const unsigned AddrSpace = PtrTy->getAddressSpace();
+  return AddrSpace == AS.CONSTANT_ADDRESS ||
+         AddrSpace == AS.CONSTANT_ADDRESS_32BIT;
+}
+
+bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
+ MemAccessInfo &Reference) const {
+
+ if (!Base || !Reference.Base || Base != Reference.Base)
+ return false;
+
+ uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
+ : Reference.Offset - Offset;
+ bool Result = Diff > LargeStrideThresh;
+ LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
+ << print() << "<=>\n"
+ << Reference.print() << "Result:" << Result << '\n');
+ return Result;
+}
+} // namespace
+
+/// Runs the perf hint worker on \p F. Does nothing when no TargetPassConfig
+/// is available. Always returns false.
+bool AMDGPUPerfHintAnalysis::runOnFunction(Function &F) {
+  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
+    const TargetMachine &TM = TPC->getTM<TargetMachine>();
+    const TargetSubtargetInfo *ST = TM.getSubtargetImpl(F);
+
+    AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());
+    Analyzer.runOnFunction(F);
+  }
+  return false;
+}
+
+/// Returns the memory-bound verdict for \p F, or false if no statistics have
+/// been recorded for it.
+bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
+  const auto Entry = FIM.find(F);
+  return Entry != FIM.end() && AMDGPUPerfHint::isMemBound(Entry->second);
+}
+
+/// Returns the wave-limiter verdict for \p F, or false if no statistics have
+/// been recorded for it.
+bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
+  const auto Entry = FIM.find(F);
+  return Entry != FIM.end() && AMDGPUPerfHint::needLimitWave(Entry->second);
+}
Added: llvm/trunk/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h?rev=333289&view=auto
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h (added)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h Fri May 25 10:25:12 2018
@@ -0,0 +1,55 @@
+//===- AMDGPUPerfHintAnalysis.h - analysis of functions memory traffic ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Analyzes if a function is potentially memory bound and if a
+/// kernel may benefit from limiting number of waves to reduce cache thrashing.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPERFHINTANALYSIS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPERFHINTANALYSIS_H
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+/// Function pass that records per-function instruction counts and answers
+/// whether a function looks memory bound and whether a kernel may benefit
+/// from limiting the number of waves.
+struct AMDGPUPerfHintAnalysis : public FunctionPass {
+  static char ID;
+
+public:
+  AMDGPUPerfHintAnalysis() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  /// The pass only gathers information and preserves all analyses.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+  }
+
+  /// Memory-bound verdict for \p F; false if \p F has not been analyzed.
+  bool isMemoryBound(const Function *F) const;
+
+  /// Wave-limiter verdict for \p F; false if \p F has not been analyzed.
+  bool needsWaveLimiter(const Function *F) const;
+
+  /// Instruction counts gathered for one function.
+  struct FuncInfo {
+    unsigned MemInstCount; // Memory instruction count
+    unsigned InstCount;    // Total instruction count
+    unsigned IAMInstCount; // Indirect access memory instruction count
+    unsigned LSMInstCount; // Large stride memory instruction count
+    FuncInfo() : MemInstCount(0), InstCount(0), IAMInstCount(0),
+                 LSMInstCount(0) {}
+  };
+
+  using FuncInfoMap = ValueMap<const Function *, FuncInfo>;
+
+private:
+  /// Counts of every function analyzed so far.
+  FuncInfoMap FIM;
+};
+} // namespace llvm
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPERFHINTANALYSIS_H
Modified: llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt?rev=333289&r1=333288&r2=333289&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt (original)
+++ llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt Fri May 25 10:25:12 2018
@@ -58,6 +58,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUUnifyDivergentExitNodes.cpp
AMDGPUUnifyMetadata.cpp
AMDGPUInline.cpp
+ AMDGPUPerfHintAnalysis.cpp
AMDILCFGStructurizer.cpp
GCNHazardRecognizer.cpp
GCNIterativeScheduler.cpp
Modified: llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.cpp?rev=333289&r1=333288&r2=333289&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.cpp Fri May 25 10:25:12 2018
@@ -372,13 +372,23 @@ void GCNScheduleDAGMILive::schedule() {
// We could not keep current target occupancy because of the just scheduled
// region. Record new occupancy for next scheduling cycle.
unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
+ // Allow memory bound functions to drop to 4 waves if not limited by an
+ // attribute.
+ unsigned MinMemBoundWaves = std::max(MFI.getMinWavesPerEU(), 4u);
+ if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy &&
+ WavesAfter >= MinMemBoundWaves &&
+ (MFI.isMemoryBound() || MFI.needsWaveLimiter())) {
+ LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
+ << MinMemBoundWaves << " waves\n");
+ NewOccupancy = WavesAfter;
+ }
if (NewOccupancy < MinOccupancy) {
MinOccupancy = NewOccupancy;
LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
<< MinOccupancy << ".\n");
}
- if (WavesAfter >= WavesBefore) {
+ if (WavesAfter >= MinOccupancy) {
Pressure[RegionIdx] = PressureAfter;
return;
}
Modified: llvm/trunk/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll?rev=333289&r1=333288&r2=333289&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll Fri May 25 10:25:12 2018
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
@var = addrspace(1) global float 0.0
@@ -17,9 +17,7 @@
; CHECK: KernargSegmentAlign: 8
; CHECK: WavefrontSize: 64
; CHECK: NumSGPRs: 6
-; GFX700: NumVGPRs: 4
-; GFX803: NumVGPRs: 6
-; GFX900: NumVGPRs: 6
+; CHECK: NumVGPRs: 3
; CHECK: MaxFlatWorkGroupSize: 256
define amdgpu_kernel void @test(
half addrspace(1)* %r,
Added: llvm/trunk/test/CodeGen/AMDGPU/perfhint.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/perfhint.ll?rev=333289&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/perfhint.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/perfhint.ll Fri May 25 10:25:12 2018
@@ -0,0 +1,85 @@
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_membound:
+; MemoryBound: 1
+; WaveLimiterHint : 1
+define amdgpu_kernel void @test_membound(<4 x i32> addrspace(1)* nocapture readonly %arg, <4 x i32> addrspace(1)* nocapture %arg1) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp2 = zext i32 %tmp to i64
+ %tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp2
+ %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 16
+ %tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp2
+ store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp5, align 16
+ %tmp6 = add nuw nsw i64 %tmp2, 1
+ %tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp6
+ %tmp8 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp7, align 16
+ %tmp9 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp6
+ store <4 x i32> %tmp8, <4 x i32> addrspace(1)* %tmp9, align 16
+ %tmp10 = add nuw nsw i64 %tmp2, 2
+ %tmp11 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp10
+ %tmp12 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp11, align 16
+ %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp10
+ store <4 x i32> %tmp12, <4 x i32> addrspace(1)* %tmp13, align 16
+ %tmp14 = add nuw nsw i64 %tmp2, 3
+ %tmp15 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp14
+ %tmp16 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp15, align 16
+ %tmp17 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp14
+ store <4 x i32> %tmp16, <4 x i32> addrspace(1)* %tmp17, align 16
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_large_stride:
+; MemoryBound: 0
+; WaveLimiterHint : 1
+define amdgpu_kernel void @test_large_stride(i32 addrspace(1)* nocapture %arg) {
+bb:
+ %tmp = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 4096
+ %tmp1 = load i32, i32 addrspace(1)* %tmp, align 4
+ %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+ store i32 %tmp1, i32 addrspace(1)* %tmp2, align 4
+ %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 8192
+ %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
+ %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+ store i32 %tmp4, i32 addrspace(1)* %tmp5, align 4
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 12288
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+ store i32 %tmp7, i32 addrspace(1)* %tmp8, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_indirect:
+; MemoryBound: 0
+; WaveLimiterHint : 1
+define amdgpu_kernel void @test_indirect(i32 addrspace(1)* nocapture %arg) {
+bb:
+ %tmp = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+ %tmp1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+ %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+ %tmp3 = bitcast i32 addrspace(1)* %arg to <4 x i32> addrspace(1)*
+ %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 4
+ %tmp5 = extractelement <4 x i32> %tmp4, i32 0
+ %tmp6 = sext i32 %tmp5 to i64
+ %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6
+ %tmp8 = load i32, i32 addrspace(1)* %tmp7, align 4
+ store i32 %tmp8, i32 addrspace(1)* %arg, align 4
+ %tmp9 = extractelement <4 x i32> %tmp4, i32 1
+ %tmp10 = sext i32 %tmp9 to i64
+ %tmp11 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp10
+ %tmp12 = load i32, i32 addrspace(1)* %tmp11, align 4
+ store i32 %tmp12, i32 addrspace(1)* %tmp, align 4
+ %tmp13 = extractelement <4 x i32> %tmp4, i32 2
+ %tmp14 = sext i32 %tmp13 to i64
+ %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp14
+ %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4
+ store i32 %tmp16, i32 addrspace(1)* %tmp1, align 4
+ %tmp17 = extractelement <4 x i32> %tmp4, i32 3
+ %tmp18 = sext i32 %tmp17 to i64
+ %tmp19 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp18
+ %tmp20 = load i32, i32 addrspace(1)* %tmp19, align 4
+ store i32 %tmp20, i32 addrspace(1)* %tmp2, align 4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
More information about the llvm-commits
mailing list