[llvm] r299779 - [AMDGPU] Unroll more to eliminate phis and conditions

Fri Apr 7 09:26:29 PDT 2017

Author: rampitec
Date: Fri Apr  7 11:26:28 2017
New Revision: 299779

URL: http://llvm.org/viewvc/llvm-project?rev=299779&view=rev
Log:
[AMDGPU] Unroll more to eliminate phis and conditions

Increase threshold to unroll a loop which contains an "if" statement
whose condition defined by a PHI belonging to the loop. This may help
to eliminate if region and potentially even PHI itself, saving on
both divergence and registers used for the PHI.

Add a small bonus for each of such "if" statements.

Differential Revision: https://reviews.llvm.org/D31693

Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
    llvm/trunk/test/CodeGen/AMDGPU/unroll.ll

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp?rev=299779&r1=299778&r2=299779&view=diff
==============================================================================

--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp Fri Apr  7 11:26:28 2017
@@ -32,13 +32,37 @@ using namespace llvm;
 static cl::opt<unsigned> UnrollThresholdPrivate(
   "amdgpu-unroll-threshold-private",
   cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
-  cl::init(2000), cl::Hidden);
+  cl::init(2500), cl::Hidden);
 
 static cl::opt<unsigned> UnrollThresholdLocal(
   "amdgpu-unroll-threshold-local",
   cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
   cl::init(1000), cl::Hidden);
 
+static cl::opt<unsigned> UnrollThresholdIf(
+  "amdgpu-unroll-threshold-if",
+  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
+  cl::init(150), cl::Hidden);
+
+static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
+                              unsigned Depth = 0) {
+  const Instruction *I = dyn_cast<Instruction>(Cond);
+  if (!I)
+    return false;
+
+  for (const Value *V : I->operand_values()) {
+    if (!L->contains(I))
+      continue;
+    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
+      if (none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
+                  return SubLoop->contains(PHI); }))
+        return true;
+    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
+      return true;
+  }
+  return false;
+}
+
 void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
                                             TTI::UnrollingPreferences &UP) {
   UP.Threshold = 300; // Twice the default.
@@ -57,7 +81,33 @@ void AMDGPUTTIImpl::getUnrollingPreferen
     const DataLayout &DL = BB->getModule()->getDataLayout();
     unsigned LocalGEPsSeen = 0;
 
+    if (any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
+               return SubLoop->contains(BB); }))
+        continue; // Block belongs to an inner loop.
+
     for (const Instruction &I : *BB) {
+
+      // Unroll a loop which contains an "if" statement whose condition
+      // defined by a PHI belonging to the loop. This may help to eliminate
+      // if region and potentially even PHI itself, saving on both divergence
+      // and registers used for the PHI.
+      // Add a small bonus for each of such "if" statements.
+      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
+        if (UP.Threshold < MaxBoost && Br->isConditional()) {
+          if (L->isLoopExiting(Br->getSuccessor(0)) ||
+              L->isLoopExiting(Br->getSuccessor(1)))
+            continue;
+          if (dependsOnLocalPhi(L, Br->getCondition())) {
+            UP.Threshold += UnrollThresholdIf;
+            DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
+                         << " for loop:\n" << *L << " due to " << *Br << '\n');
+            if (UP.Threshold >= MaxBoost)
+              return;
+          }
+        }
+        continue;
+      }
+
       const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
       if (!GEP)
         continue;
@@ -128,7 +178,7 @@ void AMDGPUTTIImpl::getUnrollingPreferen
       UP.Threshold = Threshold;
       DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n"
                    << *L << " due to " << *GEP << '\n');
-      if (UP.Threshold == MaxBoost)
+      if (UP.Threshold >= MaxBoost)
         return;
     }
   }

Modified: llvm/trunk/test/CodeGen/AMDGPU/unroll.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/unroll.ll?rev=299779&r1=299778&r2=299779&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/unroll.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/unroll.ll Fri Apr  7 11:26:28 2017
@@ -64,3 +64,37 @@ loop.inc:
 exit:
   ret void
 }
+
+; Check that a loop with if inside completely unrolled to eliminate phi and if
+
+; CHECK-LABEL: @unroll_for_if
+; CHECK: entry:
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: store
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: store
+; CHECK-NOT: br
+define amdgpu_kernel void @unroll_for_if(i32* %a) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc
+  %i1 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %and = and i32 %i1, 1
+  %tobool = icmp eq i32 %and, 0
+  br i1 %tobool, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %0 = sext i32 %i1 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %0
+  store i32 0, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %inc = add nuw nsw i32 %i1, 1
+  %cmp = icmp ult i32 %inc, 48
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}