[PATCH] D29473: [AMDGPU] Unroll preferences improvements

Thu Feb 2 14:41:06 PST 2017

rampitec created this revision.
Herald added a reviewer: tstellarAMD.
Herald added subscribers: tpr, tony-tye, yaxunl, nhaehnle, wdng, kzhuravl.

Exit the loop analysis early if a suitable private memory access is found.
Do not account for GEPs which are invariant with respect to the loop induction variable.
Do not account for allocas which are too big to fit into the register file anyway.
Add two options for tuning: -amdgpu-unroll-threshold and -amdgpu-unroll-threshold-private.
Fix the AMDGPUTTIImpl::getNumberOfRegisters() query to return the correct number of registers on pre-VI targets.


Repository:
  rL LLVM

https://reviews.llvm.org/D29473

Files:
  lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp


Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================

--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -29,15 +29,27 @@
 
 #define DEBUG_TYPE "AMDGPUtti"
 
+static cl::opt<unsigned> UnrollThreshold(
+  "amdgpu-unroll-threshold",
+  cl::desc("Unroll threshold for AMDGPU"),
+  cl::init(300), cl::Hidden); // Twice the default.
+
+static cl::opt<unsigned> UnrollThresholdPrivate(
+  "amdgpu-unroll-threshold-private",
+  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
+  cl::init(800), cl::Hidden);
+
 
 void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
                                             TTI::UnrollingPreferences &UP) {
-  UP.Threshold = 300; // Twice the default.
+  UP.Threshold = UnrollThreshold;
   UP.MaxCount = UINT_MAX;
   UP.Partial = true;
 
   // TODO: Do we want runtime unrolling?
 
+  // Maximum alloca size that can fit in registers. Reserve 16 registers.
+  const unsigned MaxAlloca = (getNumberOfRegisters(false) - 16) * 4;
   for (const BasicBlock *BB : L->getBlocks()) {
     const DataLayout &DL = BB->getModule()->getDataLayout();
     for (const Instruction &I : *BB) {
@@ -49,6 +61,13 @@
       const AllocaInst *Alloca =
           dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
       if (Alloca) {
+        if (L->hasLoopInvariantOperands(GEP))
+          continue;
+        Type *Ty = Alloca->getAllocatedType();
+        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
+        if (AllocaSize > MaxAlloca)
+          continue;
+
         // We want to do whatever we can to limit the number of alloca
         // instructions that make it through to the code generator.  allocas
         // require us to use indirect addressing, which is slow and prone to
@@ -59,7 +78,8 @@
         //
         // Don't use the maximum allowed value here as it will make some
         // programs way too big.
-        UP.Threshold = 800;
+        UP.Threshold = UnrollThresholdPrivate;
+        return;
       }
     }
   }
@@ -70,8 +90,10 @@
     return 0;
 
   // Number of VGPRs on SI.
-  if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
+  if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
     return 256;
+  else if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
+    return 128;
 
   return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
 }


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D29473.86883.patch
Type: text/x-patch
Size: 2507 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20170202/5f8cfc99/attachment.bin>