[polly] r281848 - GPGPU: Dynamically ensure 'sufficient compute'

Tobias Grosser via llvm-commits llvm-commits at lists.llvm.org
Sat Sep 17 23:50:36 PDT 2016


Author: grosser
Date: Sun Sep 18 01:50:35 2016
New Revision: 281848

URL: http://llvm.org/viewvc/llvm-project?rev=281848&view=rev
Log:
GPGPU: Dynamically ensure 'sufficient compute'

Offloading to a GPU is only beneficial if there is a sufficient amount of
compute that can be accelerated. Many kernels have only a very small amount
of dynamic compute, which means GPU acceleration is not beneficial. We
compute at run-time an approximation of how many dynamic instructions will be
executed and fall back to CPU code in case this number is not sufficiently
large. To keep the run-time checking code simple, we over-approximate the
number of instructions executed in each statement by computing the volume of
the rectangular hull of its iteration space.

Modified:
    polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
    polly/trunk/test/GPGPU/double-parallel-loop.ll

Modified: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp?rev=281848&r1=281847&r2=281848&view=diff
==============================================================================
--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp (original)
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp Sun Sep 18 01:50:35 2016
@@ -92,6 +92,11 @@ static cl::opt<std::string>
                 cl::desc("The CUDA version to compile for"), cl::Hidden,
                 cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory));
 
+static cl::opt<int>
+    MinCompute("polly-acc-mincompute",
+               cl::desc("Minimal number of compute statements to run on GPU."),
+               cl::Hidden, cl::init(10 * 512 * 512)); // Default: 2621440.
+
 /// Create the ast expressions for a ScopStmt.
 ///
 /// This function is a callback for to generate the ast expressions for each
@@ -2261,6 +2266,109 @@ public:
     PPCGScop->options = nullptr;
   }
 
+  /// Over-approximate the number of points in a set.
+  ///
+  /// Builds an ast expression for the volume of the rectangular hull of @p Set,
+  /// i.e. the product over all set dimensions of (max - min + 1).
+  ///
+  /// @param Set   The set to count; it is freed before this function returns.
+  /// @param Build The isl ast build object to use for creating the ast
+  ///              expression.
+  ///
+  /// @returns An ast expression approximating the number of points in the set.
+  __isl_give isl_ast_expr *approxPointsInSet(__isl_take isl_set *Set,
+                                             __isl_keep isl_ast_build *Build) {
+    // Expr accumulates the product of the per-dimension sizes, starting at 1.
+    isl_val *One = isl_val_int_from_si(isl_set_get_ctx(Set), 1);
+    auto *Expr = isl_ast_expr_from_val(isl_val_copy(One));
+    // OneAff: the constant 1 over the universe of the set's parameter space.
+    isl_space *Space = isl_set_get_space(Set);
+    Space = isl_space_params(Space);
+    auto *Univ = isl_set_universe(Space);
+    isl_pw_aff *OneAff = isl_pw_aff_val_on_domain(Univ, One);
+
+    for (long i = 0; i < isl_set_dim(Set, isl_dim_set); i++) {
+      isl_pw_aff *Max = isl_set_dim_max(isl_set_copy(Set), i);
+      isl_pw_aff *Min = isl_set_dim_min(isl_set_copy(Set), i);
+      isl_pw_aff *DimSize = isl_pw_aff_sub(Max, Min);
+      DimSize = isl_pw_aff_add(DimSize, isl_pw_aff_copy(OneAff));
+      auto DimSizeExpr = isl_ast_build_expr_from_pw_aff(Build, DimSize);
+      Expr = isl_ast_expr_mul(Expr, DimSizeExpr);
+    }
+
+    isl_set_free(Set);
+    isl_pw_aff_free(OneAff);
+
+    return Expr;
+  }
+
+  /// Approximate the number of dynamic instructions executed by a given
+  /// statement.
+  ///
+  /// @param Stmt  The statement for which to compute the number of dynamic
+  ///              instructions.
+  /// @param Build The isl ast build object to use for creating the ast
+  ///              expression.
+  /// @returns An approximation of the number of dynamic instructions executed
+  ///          by @p Stmt.
+  __isl_give isl_ast_expr *approxDynamicInst(ScopStmt &Stmt,
+                                             __isl_keep isl_ast_build *Build) {
+    auto Iterations = approxPointsInSet(Stmt.getDomain(), Build);
+    // Static count: instructions in the block, or in all blocks of the region.
+    long InstCount = 0;
+
+    if (Stmt.isBlockStmt()) {
+      auto *BB = Stmt.getBasicBlock();
+      InstCount = std::distance(BB->begin(), BB->end());
+    } else {
+      auto *R = Stmt.getRegion();
+
+      for (auto *BB : R->blocks()) {
+        InstCount += std::distance(BB->begin(), BB->end());
+      }
+    }
+    // Dynamic estimate: static instruction count times approximated points.
+    isl_val *InstVal = isl_val_int_from_si(S->getIslCtx(), InstCount);
+    auto *InstExpr = isl_ast_expr_from_val(InstVal);
+    return isl_ast_expr_mul(InstExpr, Iterations);
+  }
+
+  /// Approximate the dynamic instructions executed in a scop.
+  ///
+  /// @param S     The scop for which to approximate dynamic instructions.
+  /// @param Build The isl ast build object to use for creating the ast
+  ///              expression.
+  /// @returns The sum of approxDynamicInst() over all statements in @p S.
+  ///          Despite the name, instructions are counted, not iterations.
+  __isl_give isl_ast_expr *
+  getNumberOfIterations(Scop &S, __isl_keep isl_ast_build *Build) {
+    isl_ast_expr *Instructions;
+
+    isl_val *Zero = isl_val_int_from_si(S.getIslCtx(), 0);
+    Instructions = isl_ast_expr_from_val(Zero);
+    // Starting from zero, add the estimate of every statement in the scop.
+    for (ScopStmt &Stmt : S) {
+      isl_ast_expr *StmtInstructions = approxDynamicInst(Stmt, Build);
+      Instructions = isl_ast_expr_add(Instructions, StmtInstructions);
+    }
+    return Instructions;
+  }
+
+  /// Create a check that ensures sufficient compute in a scop.
+  ///
+  /// @param S     The scop for which to ensure sufficient compute.
+  /// @param Build The isl ast build object to use for creating the ast
+  ///              expression.
+  /// @returns An expression that is true iff the approximated number of
+  ///          dynamic instructions in @p S is at least MinCompute.
+  __isl_give isl_ast_expr *
+  createSufficientComputeCheck(Scop &S, __isl_keep isl_ast_build *Build) {
+    auto Iterations = getNumberOfIterations(S, Build);
+    auto *MinComputeVal = isl_val_int_from_si(S.getIslCtx(), MinCompute);
+    auto *MinComputeExpr = isl_ast_expr_from_val(MinComputeVal);
+    return isl_ast_expr_ge(Iterations, MinComputeExpr);
+  }
+
   /// Generate code for a given GPU AST described by @p Root.
   ///
   /// @param Root An isl_ast_node pointing to the root of the GPU AST.
@@ -2296,6 +2404,8 @@ public:
 
     isl_ast_build *Build = isl_ast_build_alloc(S->getIslCtx());
     isl_ast_expr *Condition = IslAst::buildRunCondition(S, Build);
+    isl_ast_expr *SufficientCompute = createSufficientComputeCheck(*S, Build);
+    Condition = isl_ast_expr_and(Condition, SufficientCompute);
     isl_ast_build_free(Build);
 
     Value *RTC = NodeBuilder.createRTC(Condition);

Modified: polly/trunk/test/GPGPU/double-parallel-loop.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/double-parallel-loop.ll?rev=281848&r1=281847&r2=281848&view=diff
==============================================================================
--- polly/trunk/test/GPGPU/double-parallel-loop.ll (original)
+++ polly/trunk/test/GPGPU/double-parallel-loop.ll Sun Sep 18 01:50:35 2016
@@ -89,7 +89,27 @@
 ; CODE-NEXT:   Stmt_bb5(32 * b0 + t0, 32 * b1 + t1 + 16 * c3);
 
 ; IR: polly.split_new_and_old:
-; IR-NEXT:    br i1 true, label %polly.start, label %bb2
+; IR-NEXT:   %0 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 1024)
+; IR-NEXT:   %.obit = extractvalue { i64, i1 } %0, 1
+; IR-NEXT:   %polly.overflow.state = or i1 false, %.obit
+; IR-NEXT:   %.res = extractvalue { i64, i1 } %0, 0
+; IR-NEXT:   %1 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %.res, i64 1024)
+; IR-NEXT:   %.obit1 = extractvalue { i64, i1 } %1, 1
+; IR-NEXT:   %polly.overflow.state2 = or i1 %polly.overflow.state, %.obit1
+; IR-NEXT:   %.res3 = extractvalue { i64, i1 } %1, 0
+; IR-NEXT:   %2 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 7, i64 %.res3)
+; IR-NEXT:   %.obit4 = extractvalue { i64, i1 } %2, 1
+; IR-NEXT:   %polly.overflow.state5 = or i1 %polly.overflow.state2, %.obit4
+; IR-NEXT:   %.res6 = extractvalue { i64, i1 } %2, 0
+; IR-NEXT:   %3 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 %.res6)
+; IR-NEXT:   %.obit7 = extractvalue { i64, i1 } %3, 1
+; IR-NEXT:   %polly.overflow.state8 = or i1 %polly.overflow.state5, %.obit7
+; IR-NEXT:   %.res9 = extractvalue { i64, i1 } %3, 0
+; IR-NEXT:   %4 = icmp sge i64 %.res9, 2621440
+; IR-NEXT:   %5 = and i1 true, %4
+; IR-NEXT:   %polly.rtc.overflown = xor i1 %polly.overflow.state8, true
+; IR-NEXT:   %polly.rtc.result = and i1 %5, %polly.rtc.overflown
+; IR-NEXT:   br i1 %polly.rtc.result, label %polly.start, label %bb2
 
 ; IR: polly.start:
 ; IR-NEXT: br label %polly.acc.initialize
@@ -105,7 +125,7 @@
 ; IR-NEXT:    [[ParamTyped:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8*
 ; IR-NEXT:    store i8* [[ParamTyped]], i8** [[ParamSlot]]
 ; IR-NEXT:    call i8* @polly_getKernel
-; IR-NEXT:    call void @polly_launchKernel(i8* %5, i32 32, i32 32, i32 32, i32 16, i32 1, i8* %polly_launch_0_params_i8ptr)
+; IR-NEXT:    call void @polly_launchKernel(i8* %11, i32 32, i32 32, i32 32, i32 16, i32 1, i8* %polly_launch_0_params_i8ptr)
 ; IR-NEXT:    call void @polly_freeKernel
 ; IR-NEXT:    [[HostPtr2:%.*]] = bitcast [1024 x float]* %A to i8*
 ; IR-NEXT:    call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_A, i8* [[HostPtr2]], i64 4194304)




More information about the llvm-commits mailing list