[polly] r275783 - GPGPU: Create host control flow
Tobias Grosser via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 18 04:56:40 PDT 2016
Author: grosser
Date: Mon Jul 18 06:56:39 2016
New Revision: 275783
URL: http://llvm.org/viewvc/llvm-project?rev=275783&view=rev
Log:
GPGPU: Create host control flow
Create LLVM-IR for all host-side control flow of a given GPU AST. We implement
this by introducing a new GPUNodeBuilder class derived from IslNodeBuilder. The
IslNodeBuilder will take care of generating all general-purpose ast nodes, but
we provide our own createUser implementation to handle the different GPU
specific user statements. For now, we just skip any user statement and only
generate a host-code skeleton, but in subsequent commits we will add handling of
normal ScopStmt's performing computations, kernel calls, as well as host-device
data transfers. We will also introduce run-time check generation and LICM in
subsequent commits.
Added:
polly/trunk/test/GPGPU/host-control-flow.ll
Modified:
polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
polly/trunk/test/GPGPU/double-parallel-loop.ll
Modified: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp?rev=275783&r1=275782&r2=275783&view=diff
==============================================================================
--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp (original)
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp Mon Jul 18 06:56:39 2016
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "polly/CodeGen/IslNodeBuilder.h"
+#include "polly/CodeGen/Utils.h"
#include "polly/DependenceInfo.h"
#include "polly/LinkAllPasses.h"
#include "polly/Options.h"
@@ -68,6 +69,35 @@ static __isl_give isl_id_to_ast_expr *po
return nullptr;
}
+/// Generate code for a GPU specific isl AST.
+///
+/// The GPUNodeBuilder augments the general existing IslNodeBuilder, which
+/// generates code for general-purpose AST nodes, with special functionality
+/// for generating GPU specific user nodes.
+///
+/// @see GPUNodeBuilder::createUser
+class GPUNodeBuilder : public IslNodeBuilder {
+public:
+ GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, Pass *P,
+ const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
+ DominatorTree &DT, Scop &S)
+ : IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S) {}
+
+private:
+ /// Create code for user-defined AST nodes.
+ ///
+ /// These AST nodes can be of type:
+ ///
+ /// - ScopStmt: A computational statement (TODO)
+ /// - Kernel: A GPU kernel call (TODO)
+ /// - Data-Transfer: A GPU <-> CPU data-transfer (TODO)
+ ///
+ virtual void createUser(__isl_take isl_ast_node *User) {
+ isl_ast_node_free(User);
+ return;
+ }
+};
+
namespace {
class PPCGCodeGeneration : public ScopPass {
public:
@@ -76,6 +106,12 @@ public:
/// The scop that is currently processed.
Scop *S;
+ LoopInfo *LI;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+ const DataLayout *DL;
+ RegionInfo *RI;
+
PPCGCodeGeneration() : ScopPass(ID) {}
/// Construct compilation options for PPCG.
@@ -650,12 +686,58 @@ public:
PPCGScop->options = nullptr;
}
+ /// Generate code for a given GPU AST described by @p Root.
+ ///
+  /// @param Root An isl_ast_node pointing to the root of the GPU AST.
+ void generateCode(__isl_take isl_ast_node *Root) {
+ ScopAnnotator Annotator;
+ Annotator.buildAliasScopes(*S);
+
+ Region *R = &S->getRegion();
+
+ simplifyRegion(R, DT, LI, RI);
+
+ BasicBlock *EnteringBB = R->getEnteringBlock();
+
+ PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator);
+
+ GPUNodeBuilder NodeBuilder(Builder, Annotator, this, *DL, *LI, *SE, *DT,
+ *S);
+
+ // Only build the run-time condition and parameters _after_ having
+ // introduced the conditional branch. This is important as the conditional
+ // branch will guard the original scop from new induction variables that
+ // the SCEVExpander may introduce while code generating the parameters and
+ // which may introduce scalar dependences that prevent us from correctly
+ // code generating this scop.
+ BasicBlock *StartBlock =
+ executeScopConditionally(*S, this, Builder.getTrue());
+
+ // TODO: Handle LICM
+ // TODO: Verify run-time checks
+ auto SplitBlock = StartBlock->getSinglePredecessor();
+ Builder.SetInsertPoint(SplitBlock->getTerminator());
+ NodeBuilder.addParameters(S->getContext());
+ Builder.SetInsertPoint(&*StartBlock->begin());
+ NodeBuilder.create(Root);
+ NodeBuilder.finalizeSCoP(*S);
+ }
+
bool runOnScop(Scop &CurrentScop) override {
S = &CurrentScop;
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ DL = &S->getRegion().getEntry()->getParent()->getParent()->getDataLayout();
+ RI = &getAnalysis<RegionInfoPass>().getRegionInfo();
auto PPCGScop = createPPCGScop();
auto PPCGProg = createPPCGProg(PPCGScop);
auto PPCGGen = generateGPU(PPCGScop, PPCGProg);
+
+ if (PPCGGen->tree)
+ generateCode(isl_ast_node_copy(PPCGGen->tree));
+
freeOptions(PPCGScop);
freePPCGGen(PPCGGen);
gpu_prog_free(PPCGProg);
Modified: polly/trunk/test/GPGPU/double-parallel-loop.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/double-parallel-loop.ll?rev=275783&r1=275782&r2=275783&view=diff
==============================================================================
--- polly/trunk/test/GPGPU/double-parallel-loop.ll (original)
+++ polly/trunk/test/GPGPU/double-parallel-loop.ll Mon Jul 18 06:56:39 2016
@@ -7,6 +7,9 @@
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=CODE %s
+; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
+; RUN: FileCheck %s -check-prefix=IR
+
; REQUIRES: pollyacc
; CHECK: Stmt_bb5
@@ -77,7 +80,14 @@
; CODE-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1)
; CODE-NEXT: Stmt_bb5(32 * b0 + t0, 32 * b1 + t1 + 16 * c3);
+; IR: polly.split_new_and_old:
+; IR-NEXT: br i1 true, label %polly.start, label %bb2
+
+; IR: polly.start:
+; IR-NEXT: br label %polly.exiting
+; IR: polly.exiting:
+; IR-NEXT: br label %polly.merge_new_and_old
; void double_parallel_loop(float A[][1024]) {
; for (long i = 0; i < 1024; i++)
Added: polly/trunk/test/GPGPU/host-control-flow.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/host-control-flow.ll?rev=275783&view=auto
==============================================================================
--- polly/trunk/test/GPGPU/host-control-flow.ll (added)
+++ polly/trunk/test/GPGPU/host-control-flow.ll Mon Jul 18 06:56:39 2016
@@ -0,0 +1,86 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -disable-output \
+; RUN: -polly-acc-dump-code < %s | FileCheck %s -check-prefix=CODE
+
+; RUN: opt %loadPolly -polly-codegen-ppcg \
+; RUN: -S < %s | FileCheck %s -check-prefix=IR
+; void foo(float A[2][100]) {
+; for (long t = 0; t < 100; t++)
+; for (long i = 1; i < 99; i++)
+; A[(t + 1) % 2][i] += A[t % 2][i - 1] + A[t % 2][i] + A[t % 2][i + 1];
+; }
+
+; CODE: # host
+; CODE-NEXT: {
+; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyHostToDevice));
+; CODE-NEXT: for (int c0 = 0; c0 <= 99; c0 += 1)
+; CODE-NEXT: {
+; CODE-NEXT: dim3 k0_dimBlock(32);
+; CODE-NEXT: dim3 k0_dimGrid(4);
+; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (c0);
+; CODE-NEXT: cudaCheckKernel();
+; CODE-NEXT: }
+
+; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyDeviceToHost));
+; CODE-NEXT: }
+
+; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader
+; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
+; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1
+; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 98
+; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @foo([100 x float]* %A) {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.inc18, %entry
+ %t.0 = phi i64 [ 0, %entry ], [ %inc19, %for.inc18 ]
+ %exitcond1 = icmp ne i64 %t.0, 100
+ br i1 %exitcond1, label %for.body, label %for.end20
+
+for.body: ; preds = %for.cond
+ br label %for.cond1
+
+for.cond1: ; preds = %for.inc, %for.body
+ %i.0 = phi i64 [ 1, %for.body ], [ %inc, %for.inc ]
+ %exitcond = icmp ne i64 %i.0, 99
+ br i1 %exitcond, label %for.body3, label %for.end
+
+for.body3: ; preds = %for.cond1
+ %sub = add nsw i64 %i.0, -1
+ %rem = srem i64 %t.0, 2
+ %arrayidx4 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem, i64 %sub
+ %tmp = load float, float* %arrayidx4, align 4
+ %rem5 = srem i64 %t.0, 2
+ %arrayidx7 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem5, i64 %i.0
+ %tmp2 = load float, float* %arrayidx7, align 4
+ %add = fadd float %tmp, %tmp2
+ %add8 = add nuw nsw i64 %i.0, 1
+ %rem9 = srem i64 %t.0, 2
+ %arrayidx11 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem9, i64 %add8
+ %tmp3 = load float, float* %arrayidx11, align 4
+ %add12 = fadd float %add, %tmp3
+ %add13 = add nuw nsw i64 %t.0, 1
+ %rem14 = srem i64 %add13, 2
+ %arrayidx16 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem14, i64 %i.0
+ %tmp4 = load float, float* %arrayidx16, align 4
+ %add17 = fadd float %tmp4, %add12
+ store float %add17, float* %arrayidx16, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body3
+ %inc = add nuw nsw i64 %i.0, 1
+ br label %for.cond1
+
+for.end: ; preds = %for.cond1
+ br label %for.inc18
+
+for.inc18: ; preds = %for.end
+ %inc19 = add nuw nsw i64 %t.0, 1
+ br label %for.cond
+
+for.end20: ; preds = %for.cond
+ ret void
+}
More information about the llvm-commits
mailing list