[llvm] 689e616 - [AMDGPU] Promote alloca to vector in opt

Thu May 21 13:50:01 PDT 2020

Author: Stanislav Mekhanoshin
Date: 2020-05-21T13:49:51-07:00
New Revision: 689e616ed0b1210517dda65dc3528f8fdb9a5959

URL: https://github.com/llvm/llvm-project/commit/689e616ed0b1210517dda65dc3528f8fdb9a5959
DIFF: https://github.com/llvm/llvm-project/commit/689e616ed0b1210517dda65dc3528f8fdb9a5959.diff

LOG: [AMDGPU] Promote alloca to vector in opt

Promote alloca to vector before SROA and loop unroll. If we manage
to eliminate allocas before unroll we may choose to unroll less.

Differential Revision: https://reviews.llvm.org/D80386

Added: 
    llvm/lib/Target/AMDGPU/sroa-before-unroll.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPU.h
    llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/test/CodeGen/AMDGPU/opt-pipeline.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index b39dd1bc8bae..30221872bbbf 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -195,6 +195,10 @@ FunctionPass *createAMDGPUPromoteAlloca();
 void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
 extern char &AMDGPUPromoteAllocaID;
 
+FunctionPass *createAMDGPUPromoteAllocaToVector();
+void initializeAMDGPUPromoteAllocaToVectorPass(PassRegistry&);
+extern char &AMDGPUPromoteAllocaToVectorID;
+
 Pass *createAMDGPUStructurizeCFGPass();
 FunctionPass *createAMDGPUISelDag(
   TargetMachine *TM = nullptr,

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index ae691b7ecde5..03e927b3cdc4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -128,14 +128,39 @@ class AMDGPUPromoteAlloca : public FunctionPass {
   }
 };
 
+class AMDGPUPromoteAllocaToVector : public FunctionPass {
+public:
+  static char ID;
+
+  AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  StringRef getPassName() const override {
+    return "AMDGPU Promote Alloca to vector";
+  }
+
+  bool handleAlloca(AllocaInst &I);
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+};
+
 } // end anonymous namespace
 
 char AMDGPUPromoteAlloca::ID = 0;
+char AMDGPUPromoteAllocaToVector::ID = 0;
 
 INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
                 "AMDGPU promote alloca to vector or LDS", false, false)
 
+INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
+                "AMDGPU promote alloca to vector", false, false)
+
 char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
+char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
 
 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
   Mod = &M;
@@ -982,6 +1007,43 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
   return true;
 }
 
+bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
+  if (skipFunction(F) || DisablePromoteAllocaToVector)
+    return false;
+
+  bool Changed = false;
+  BasicBlock &EntryBB = *F.begin();
+
+  SmallVector<AllocaInst *, 16> Allocas;
+  for (Instruction &I : EntryBB) {
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+      Allocas.push_back(AI);
+  }
+
+  for (AllocaInst *AI : Allocas) {
+    if (handleAlloca(*AI))
+      Changed = true;
+  }
+
+  return Changed;
+}
+
+bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) {
+  // Array allocations are probably not worth handling, since an allocation of
+  // the array type is the canonical form.
+  if (!I.isStaticAlloca() || I.isArrayAllocation())
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
+
+  Module *Mod = I.getParent()->getParent()->getParent();
+  return tryPromoteAllocaToVector(&I, Mod->getDataLayout());
+}
+
 FunctionPass *llvm::createAMDGPUPromoteAlloca() {
   return new AMDGPUPromoteAlloca();
 }
+
+FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
+  return new AMDGPUPromoteAllocaToVector();
+}

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 1ee7f3480f87..4ec903e37653 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -235,6 +235,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUPostLegalizerCombinerPass(*PR);
   initializeAMDGPUPreLegalizerCombinerPass(*PR);
   initializeAMDGPUPromoteAllocaPass(*PR);
+  initializeAMDGPUPromoteAllocaToVectorPass(*PR);
   initializeAMDGPUCodeGenPreparePass(*PR);
   initializeAMDGPUPropagateAttributesEarlyPass(*PR);
   initializeAMDGPUPropagateAttributesLatePass(*PR);
@@ -470,7 +471,7 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
 
   Builder.addExtension(
     PassManagerBuilder::EP_CGSCCOptimizerLate,
-    [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+    [EnableOpt](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
       // Add infer address spaces pass to the opt pipeline after inlining
       // but before SROA to increase SROA opportunities.
       PM.add(createInferAddressSpacesPass());
@@ -478,6 +479,11 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
       // This should run after inlining to have any chance of doing anything,
       // and before other cleanup optimizations.
       PM.add(createAMDGPULowerKernelAttributesPass());
+
+      // Promote alloca to vector before SROA and loop unroll. If we manage
+      // to eliminate allocas before unroll we may choose to unroll less.
+      if (EnableOpt)
+        PM.add(createAMDGPUPromoteAllocaToVector());
   });
 }
 

diff  --git a/llvm/lib/Target/AMDGPU/sroa-before-unroll.ll b/llvm/lib/Target/AMDGPU/sroa-before-unroll.ll
new file mode 100644
index 000000000000..b71454c720b1
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/sroa-before-unroll.ll
@@ -0,0 +1,47 @@
+; RUN: opt -mtriple=amdgcn-- -O1 -S < %s | FileCheck %s --check-prefixes=FUNC,LOOP
+; RUN: opt -mtriple=amdgcn-- -O1 -S -disable-promote-alloca-to-vector < %s | FileCheck %s --check-prefixes=FUNC,FULL-UNROLL
+
+target datalayout = "A5"
+
+; This test contains a simple loop that initializes an array declared in
+; private memory. This loop would be fully unrolled if we could not SROA
+; the alloca. Check that we successfully eliminate it before the unroll,
+; so that we do not need to fully unroll it.
+
+; FUNC-LABEL: @private_memory
+; LOOP-NOT: alloca
+; LOOP: loop.header:
+; LOOP: br i1 %{{[^,]+}}, label %exit, label %loop.header
+
+; FULL-UNROLL: alloca
+; FULL-UNROLL-COUNT-256: store i32 {{[0-9]+}}, i32 addrspace(5)*
+; FULL-UNROLL-NOT: br
+
+; FUNC: store i32 %{{[^,]+}}, i32 addrspace(1)* %out
+define amdgpu_kernel void @private_memory(i32 addrspace(1)* %out, i32 %n) {
+entry:
+  %alloca = alloca [16 x i32], addrspace(5)
+  br label %loop.header
+
+loop.header:
+  %counter = phi i32 [0, %entry], [%inc, %loop.inc]
+  br label %loop.body
+
+loop.body:
+  %salt = xor i32 %counter, %n
+  %idx = and i32 %salt, 15
+  %ptr = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %idx
+  store i32 %counter, i32 addrspace(5)* %ptr
+  br label %loop.inc
+
+loop.inc:
+  %inc = add i32 %counter, 1
+  %cmp = icmp sge i32 %counter, 255
+  br i1 %cmp, label  %exit, label %loop.header
+
+exit:
+  %gep = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %n
+  %load = load i32, i32 addrspace(5)* %gep
+  store i32 %load, i32 addrspace(1)* %out
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
index 0c8b0a799065..87f3435c23d4 100644
--- a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
@@ -101,6 +101,7 @@
 ; GCN-O1-NEXT:         Infer address spaces
 ; GCN-O1-NEXT:     AMDGPU Kernel Attributes
 ; GCN-O1-NEXT:     FunctionPass Manager
+; GCN-O1-NEXT:       AMDGPU Promote Alloca to vector
 ; GCN-O1-NEXT:       Dominator Tree Construction
 ; GCN-O1-NEXT:       SROA
 ; GCN-O1-NEXT:       Basic Alias Analysis (stateless AA impl)
@@ -401,6 +402,7 @@
 ; GCN-O2-NEXT:         Infer address spaces
 ; GCN-O2-NEXT:     AMDGPU Kernel Attributes
 ; GCN-O2-NEXT:     FunctionPass Manager
+; GCN-O2-NEXT:       AMDGPU Promote Alloca to vector
 ; GCN-O2-NEXT:       Dominator Tree Construction
 ; GCN-O2-NEXT:       SROA
 ; GCN-O2-NEXT:       Basic Alias Analysis (stateless AA impl)
@@ -752,6 +754,7 @@
 ; GCN-O3-NEXT:         Infer address spaces
 ; GCN-O3-NEXT:     AMDGPU Kernel Attributes
 ; GCN-O3-NEXT:     FunctionPass Manager
+; GCN-O3-NEXT:       AMDGPU Promote Alloca to vector
 ; GCN-O3-NEXT:       Dominator Tree Construction
 ; GCN-O3-NEXT:       SROA
 ; GCN-O3-NEXT:       Basic Alias Analysis (stateless AA impl)