[PATCH] R600: Unconditionally unroll loops that contain GEPs with alloca pointers

Wed Dec 11 10:01:45 PST 2013

From: Tom Stellard <thomas.stellard at amd.com>

Implement the getUnrollingPreferences() function for
AMDGPUTargetTransformInfo so that loops that do address calculations
on pointers derived from alloca are unconditionally unrolled.

Unrolling these loops makes it more likely that SROA will be able to
eliminate the allocas, which is a big win for R600 since memory
allocated by alloca (private memory) is really slow.
---
 lib/Target/R600/AMDGPUTargetTransformInfo.cpp | 29 +++++++++++++++++++++
 test/CodeGen/R600/unroll.ll                   | 37 +++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)
 create mode 100644 test/CodeGen/R600/unroll.ll

diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
index 8db319c..2f97b72 100644
--- a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
@@ -18,7 +18,9 @@
 #define DEBUG_TYPE "AMDGPUtti"
 #include "AMDGPU.h"
 #include "AMDGPUTargetMachine.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/CostTable.h"
@@ -73,6 +75,8 @@ public:
 
   virtual bool hasBranchDivergence() const;
 
+  virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const;
+
   /// @}
 };
 
@@ -88,3 +92,28 @@ llvm::createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM) {
 }
 
 bool AMDGPUTTI::hasBranchDivergence() const { return true; }
+
+/// Raise the unroll threshold to UINT_MAX when any instruction in \p L is a
+/// GEP whose underlying object is an alloca; otherwise leave \p UP untouched.
+void AMDGPUTTI::getUnrollingPreferences(Loop *L,
+                                        UnrollingPreferences &UP) const {
+  for (Loop::block_iterator BI = L->block_begin(), BE = L->block_end();
+                                                  BI != BE; ++BI) {
+    BasicBlock *BB = *BI;
+    for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
+                                                      I != E; ++I) {
+      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I);
+      if (!GEP)
+        continue;
+      if (!isa<AllocaInst>(GetUnderlyingObject(GEP->getPointerOperand())))
+        continue;
+      // We want to do whatever we can to limit the number of alloca
+      // instructions that reach the code generator.  allocas require us to
+      // use indirect addressing, which is slow and prone to compiler bugs.
+      // Unrolling such a loop unconditionally usually lets SROA eliminate
+      // the allocas.  One matching GEP is enough, so stop scanning here.
+      UP.Threshold = UINT_MAX;
+      return;
+    }
+  }
+}
diff --git a/test/CodeGen/R600/unroll.ll b/test/CodeGen/R600/unroll.ll
new file mode 100644
index 0000000..e0035ea
--- /dev/null
+++ b/test/CodeGen/R600/unroll.ll
@@ -0,0 +1,37 @@
+; RUN: opt -loop-unroll -simplifycfg -sroa %s -S -o - | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
+target triple = "r600--"
+
+; This test contains a simple loop that initializes an array declared in
+; private memory.  We want to make sure these kinds of loops are always
+; unrolled, because private memory is slow.
+
+; CHECK-LABEL: @test
+; CHECK-NOT: alloca
+; CHECK: store i32 5, i32 addrspace(1)* %out
+define void @test(i32 addrspace(1)* %out) {
+entry:
+  %0 = alloca [32 x i32]
+  br label %loop.header
+
+loop.header:
+  %counter = phi i32 [0, %entry], [%inc, %loop.inc]
+  br label %loop.body
+
+loop.body:
+  %ptr = getelementptr [32 x i32]* %0, i32 0, i32 %counter
+  store i32 %counter, i32* %ptr
+  br label %loop.inc
+
+loop.inc:
+  %inc = add i32 %counter, 1
+  %1 = icmp sge i32 %inc, 32 ; test %inc so the last store hits element 31
+  br i1 %1, label %exit, label %loop.header
+
+exit:
+  %2 = getelementptr [32 x i32]* %0, i32 0, i32 5
+  %3 = load i32* %2
+  store i32 %3, i32 addrspace(1)* %out
+  ret void
+}
-- 
1.8.1.4