[llvm] r298948 - [AMDGPU] Boost unroll threshold for loops reading local memory
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 28 15:13:51 PDT 2017
Author: rampitec
Date: Tue Mar 28 17:13:51 2017
New Revision: 298948
URL: http://llvm.org/viewvc/llvm-project?rev=298948&view=rev
Log:
[AMDGPU] Boost unroll threshold for loops reading local memory
This is less important than the increased threshold for private memory,
but it still brings performance improvements in a wide range of tests.
Unrolling more for local memory serves three purposes: it allows ds
operations to be combined when an offset becomes static, it saves the
registers otherwise used to hold dynamic offsets, and it allows better
LDS latency hiding.
Differential Revision: https://reviews.llvm.org/D31412
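To make the first two purposes concrete, here is a minimal C++ sketch, not
part of the commit: 'lds' stands for a pointer into local (LDS) memory, and
the trip count and unroll factor are illustrative.

// Each iteration addresses LDS with a value computed from the induction
// variable, so every load keeps its offset in a register and is emitted
// as a separate ds_read.
float sum_rolled(const float *lds) {
  float sum = 0.f;
  for (unsigned i = 0; i < 64; ++i)
    sum += lds[i];
  return sum;
}

// After unrolling, the paired accesses share a base and differ by a known
// constant, so the backend can fold the offsets into immediates and merge
// the pair into one ds_read2_b32, freeing the offset registers and
// exposing more loads for latency hiding.
float sum_unrolled_by_2(const float *lds) {
  float sum = 0.f;
  for (unsigned i = 0; i < 64; i += 2) {
    sum += lds[i];
    sum += lds[i + 1];
  }
  return sum;
}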
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
llvm/trunk/test/CodeGen/AMDGPU/unroll.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp?rev=298948&r1=298947&r2=298948&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp Tue Mar 28 17:13:51 2017
@@ -34,6 +34,11 @@ static cl::opt<unsigned> UnrollThreshold
cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
cl::init(2000), cl::Hidden);
+static cl::opt<unsigned> UnrollThresholdLocal(
+ "amdgpu-unroll-threshold-local",
+ cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
+ cl::init(1000), cl::Hidden);
+
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
TTI::UnrollingPreferences &UP) {
UP.Threshold = 300; // Twice the default.
@@ -44,50 +49,87 @@ void AMDGPUTTIImpl::getUnrollingPreferen
// Maximum alloca size that can fit registers. Reserve 16 registers.
const unsigned MaxAlloca = (256 - 16) * 4;
+ unsigned ThresholdPrivate = UnrollThresholdPrivate;
+ unsigned ThresholdLocal = UnrollThresholdLocal;
+ unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
+ AMDGPUAS ASST = ST->getAMDGPUAS();
for (const BasicBlock *BB : L->getBlocks()) {
const DataLayout &DL = BB->getModule()->getDataLayout();
+ unsigned LocalGEPsSeen = 0;
+
for (const Instruction &I : *BB) {
const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
- if (!GEP || GEP->getAddressSpace() != ST->getAMDGPUAS().PRIVATE_ADDRESS)
+ if (!GEP)
+ continue;
+
+ unsigned AS = GEP->getAddressSpace();
+ unsigned Threshold = 0;
+ if (AS == ASST.PRIVATE_ADDRESS)
+ Threshold = ThresholdPrivate;
+ else if (AS == ASST.LOCAL_ADDRESS)
+ Threshold = ThresholdLocal;
+ else
continue;
- const Value *Ptr = GEP->getPointerOperand();
- const AllocaInst *Alloca =
- dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
- if (Alloca && Alloca->isStaticAlloca()) {
+ if (UP.Threshold >= Threshold)
+ continue;
+
+ if (AS == ASST.PRIVATE_ADDRESS) {
+ const Value *Ptr = GEP->getPointerOperand();
+ const AllocaInst *Alloca =
+ dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
+ if (!Alloca || !Alloca->isStaticAlloca())
+ continue;
Type *Ty = Alloca->getAllocatedType();
unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
if (AllocaSize > MaxAlloca)
continue;
+ } else if (AS == ASST.LOCAL_ADDRESS) {
+ LocalGEPsSeen++;
+ // Inhibit unroll for local memory if we have seen addressing not to
+ // a variable; most likely we will be unable to combine it.
+ // Do not unroll too deep inner loops for local memory to give a chance
+ // to unroll an outer loop for a more important reason.
+ if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
+ (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
+ !isa<Argument>(GEP->getPointerOperand())))
+ continue;
+ }
- // Check if GEP depends on a value defined by this loop itself.
- bool HasLoopDef = false;
- for (const Value *Op : GEP->operands()) {
- const Instruction *Inst = dyn_cast<Instruction>(Op);
- if (!Inst || L->isLoopInvariant(Op))
- continue;
- if (any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
- return SubLoop->contains(Inst); }))
- continue;
- HasLoopDef = true;
- break;
- }
- if (!HasLoopDef)
+ // Check if GEP depends on a value defined by this loop itself.
+ bool HasLoopDef = false;
+ for (const Value *Op : GEP->operands()) {
+ const Instruction *Inst = dyn_cast<Instruction>(Op);
+ if (!Inst || L->isLoopInvariant(Op))
continue;
- // We want to do whatever we can to limit the number of alloca
- // instructions that make it through to the code generator. allocas
- // require us to use indirect addressing, which is slow and prone to
- // compiler bugs. If this loop does an address calculation on an
- // alloca ptr, then we want to use a higher than normal loop unroll
- // threshold. This will give SROA a better chance to eliminate these
- // allocas.
- //
- // Don't use the maximum allowed value here as it will make some
- // programs way too big.
- UP.Threshold = UnrollThresholdPrivate;
- return;
+ if (any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
+ return SubLoop->contains(Inst); }))
+ continue;
+ HasLoopDef = true;
+ break;
}
+ if (!HasLoopDef)
+ continue;
+
+ // We want to do whatever we can to limit the number of alloca
+ // instructions that make it through to the code generator. allocas
+ // require us to use indirect addressing, which is slow and prone to
+ // compiler bugs. If this loop does an address calculation on an
+ // alloca ptr, then we want to use a higher than normal loop unroll
+ // threshold. This will give SROA a better chance to eliminate these
+ // allocas.
+ //
+ // We also want to have more unrolling for local memory to let ds
+ // instructions with different offsets combine.
+ //
+ // Don't use the maximum allowed value here as it will make some
+ // programs way too big.
+ UP.Threshold = Threshold;
+ DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n"
+ << *L << " due to " << *GEP << '\n');
+ if (UP.Threshold == MaxBoost)
+ return;
}
}
}
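For readers skimming the diff, the control flow of the new heuristic can be
restated in a condensed, self-contained form. This is a simplified sketch,
not the actual implementation: each GEP is reduced to a pre-digested
summary, the threshold values are hard-coded to the cl::opt defaults from
the patch, and LocalGEPsSeen is counted across the whole loop for brevity
(the real code counts it per basic block).

#include <algorithm>
#include <vector>

enum class AddrSpace { Private, Local, Other };

struct GEPSummary {
  AddrSpace AS;
  bool BaseIsStaticAllocaInBudget; // private: alloca fits in spare registers
  bool BaseIsGlobalOrArgument;     // local: addresses a known variable
  bool DependsOnThisLoop;          // offset uses values defined by this loop
};

unsigned boostedUnrollThreshold(const std::vector<GEPSummary> &GEPs,
                                unsigned LoopDepth, unsigned Threshold) {
  const unsigned ThresholdPrivate = 2000; // -amdgpu-unroll-threshold-private
  const unsigned ThresholdLocal = 1000;   // -amdgpu-unroll-threshold-local
  const unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  unsigned LocalGEPsSeen = 0;

  for (const GEPSummary &G : GEPs) {
    unsigned Boost = 0;
    if (G.AS == AddrSpace::Private)
      Boost = ThresholdPrivate;
    else if (G.AS == AddrSpace::Local)
      Boost = ThresholdLocal;
    else
      continue;
    if (Threshold >= Boost)
      continue; // already at least this aggressive

    if (G.AS == AddrSpace::Private && !G.BaseIsStaticAllocaInBudget)
      continue; // SROA will not be able to remove the alloca anyway
    if (G.AS == AddrSpace::Local) {
      ++LocalGEPsSeen;
      // A second local GEP, a deep inner loop, or a base that is not a
      // variable all make offset combining unlikely, so do not boost.
      if (LocalGEPsSeen > 1 || LoopDepth > 2 || !G.BaseIsGlobalOrArgument)
        continue;
    }
    if (!G.DependsOnThisLoop)
      continue; // unrolling would not make these offsets static

    Threshold = Boost;
    if (Threshold == MaxBoost)
      break; // nothing larger is possible
  }
  return Threshold;
}

Both defaults remain overridable through the hidden
-amdgpu-unroll-threshold-private and -amdgpu-unroll-threshold-local
options added in the diff above.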
Modified: llvm/trunk/test/CodeGen/AMDGPU/unroll.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/unroll.ll?rev=298948&r1=298947&r2=298948&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/unroll.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/unroll.ll Tue Mar 28 17:13:51 2017
@@ -6,10 +6,10 @@
; private memory. We want to make sure these kinds of loops are always
; unrolled, because private memory is slow.
-; CHECK-LABEL: @test
+; CHECK-LABEL: @private_memory
; CHECK-NOT: alloca
; CHECK: store i32 5, i32 addrspace(1)* %out
-define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @private_memory(i32 addrspace(1)* %out) {
entry:
%0 = alloca [32 x i32]
br label %loop.header
@@ -34,3 +34,33 @@ exit:
store i32 %3, i32 addrspace(1)* %out
ret void
}
+
+; Check that the loop is unrolled for local memory references
+
+; CHECK-LABEL: @local_memory
+; CHECK: getelementptr i32, i32 addrspace(1)* %out, i32 128
+; CHECK-NEXT: store
+; CHECK-NEXT: ret
+define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) {
+entry:
+ br label %loop.header
+
+loop.header:
+ %counter = phi i32 [0, %entry], [%inc, %loop.inc]
+ br label %loop.body
+
+loop.body:
+ %ptr_lds = getelementptr i32, i32 addrspace(3)* %lds, i32 %counter
+ %val = load i32, i32 addrspace(3)* %ptr_lds
+ %ptr_out = getelementptr i32, i32 addrspace(1)* %out, i32 %counter
+ store i32 %val, i32 addrspace(1)* %ptr_out
+ br label %loop.inc
+
+loop.inc:
+ %inc = add i32 %counter, 1
+ %cond = icmp sge i32 %counter, 128
+ br i1 %cond, label %exit, label %loop.header
+
+exit:
+ ret void
+}