[polly] r281848 - GPGPU: Dynamically ensure 'sufficient compute'
Tobias Grosser via llvm-commits
llvm-commits at lists.llvm.org
Sat Sep 17 23:50:36 PDT 2016
Author: grosser
Date: Sun Sep 18 01:50:35 2016
New Revision: 281848
URL: http://llvm.org/viewvc/llvm-project?rev=281848&view=rev
Log:
GPGPU: Dynamically ensure 'sufficient compute'
Offloading to a GPU is only beneficial if there is a sufficient amount of
compute that can be accelerated. Many kernels have only a very small amount
of dynamic compute, which means GPU acceleration is not beneficial. We
compute at run-time an approximation of how many dynamic instructions will be
executed and fall back to CPU code in case this number is not sufficiently
large. To keep the run-time checking code simple, we over-approximate the
number of instructions executed in each statement by computing the volume of
the rectangular hull of its iteration space.
Modified:
polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
polly/trunk/test/GPGPU/double-parallel-loop.ll
Modified: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp?rev=281848&r1=281847&r2=281848&view=diff
==============================================================================
--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp (original)
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp Sun Sep 18 01:50:35 2016
@@ -92,6 +92,11 @@ static cl::opt<std::string>
cl::desc("The CUDA version to compile for"), cl::Hidden,
cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory));
+/// Threshold used by the run-time 'sufficient compute' check.
+///
+/// createSufficientComputeCheck() compares the approximated number of dynamic
+/// instructions of a scop against this value; the GPU code is only selected
+/// when the approximation is greater than or equal to it.
+///
+/// The option is registered with cl::ZeroOrMore and in PollyCategory so that
+/// it behaves like the other polly-acc options declared in this file.
+static cl::opt<int>
+    MinCompute("polly-acc-mincompute",
+               cl::desc("Minimal number of (approximated) dynamic "
+                        "instructions required to run on GPU."),
+               cl::Hidden, cl::init(10 * 512 * 512), cl::ZeroOrMore,
+               cl::cat(PollyCategory));
+
/// Create the ast expressions for a ScopStmt.
///
/// This function is a callback for to generate the ast expressions for each
@@ -2261,6 +2266,109 @@ public:
PPCGScop->options = nullptr;
}
+ /// Over-approximate the number of points contained in a set.
+ ///
+ /// For every set dimension the extent `max - min + 1` is computed, and the
+ /// extents of all dimensions are multiplied. The result is therefore the
+ /// volume of the rectangular hull surrounding the set.
+ ///
+ /// @param Set   The set to count. Ownership is taken; the set is freed
+ ///              before returning.
+ /// @param Build The isl ast build object to use for creating the ast
+ ///              expression.
+ ///
+ /// @returns An ast expression approximating the number of points in the set.
+ __isl_give isl_ast_expr *approxPointsInSet(__isl_take isl_set *Set,
+                                            __isl_keep isl_ast_build *Build) {
+   auto *Ctx = isl_set_get_ctx(Set);
+
+   // The running product starts at one, so a set without set dimensions
+   // yields the constant one.
+   isl_ast_expr *Volume = isl_ast_expr_from_val(isl_val_int_from_si(Ctx, 1));
+
+   // Constant one defined over the parameter space of the set; it is added
+   // to every 'max - min' difference below.
+   isl_set *ParamUniverse =
+       isl_set_universe(isl_space_params(isl_set_get_space(Set)));
+   isl_pw_aff *Unit =
+       isl_pw_aff_val_on_domain(ParamUniverse, isl_val_int_from_si(Ctx, 1));
+
+   const long NumDims = isl_set_dim(Set, isl_dim_set);
+   for (long Dim = 0; Dim < NumDims; ++Dim) {
+     isl_pw_aff *Upper = isl_set_dim_max(isl_set_copy(Set), Dim);
+     isl_pw_aff *Lower = isl_set_dim_min(isl_set_copy(Set), Dim);
+     isl_pw_aff *Extent =
+         isl_pw_aff_add(isl_pw_aff_sub(Upper, Lower), isl_pw_aff_copy(Unit));
+     Volume = isl_ast_expr_mul(Volume,
+                               isl_ast_build_expr_from_pw_aff(Build, Extent));
+   }
+
+   isl_pw_aff_free(Unit);
+   isl_set_free(Set);
+
+   return Volume;
+ }
+
+ /// Approximate the number of dynamic instructions executed by a statement.
+ ///
+ /// The estimate is the static instruction count of the statement (the
+ /// instructions of its basic block, or of all basic blocks of its region)
+ /// multiplied by the approximated number of points in its domain.
+ ///
+ /// @param Stmt  The statement for which to compute the number of dynamic
+ ///              instructions.
+ /// @param Build The isl ast build object to use for creating the ast
+ ///              expression.
+ /// @returns An approximation of the number of dynamic instructions executed
+ ///          by @p Stmt.
+ __isl_give isl_ast_expr *approxDynamicInst(ScopStmt &Stmt,
+                                            __isl_keep isl_ast_build *Build) {
+   isl_ast_expr *Points = approxPointsInSet(Stmt.getDomain(), Build);
+
+   // Count the static instructions that make up the statement.
+   long NumInsts = 0;
+   if (!Stmt.isBlockStmt()) {
+     for (auto *Block : Stmt.getRegion()->blocks())
+       NumInsts += std::distance(Block->begin(), Block->end());
+   } else {
+     auto *Block = Stmt.getBasicBlock();
+     NumInsts = std::distance(Block->begin(), Block->end());
+   }
+
+   isl_ast_expr *PerPoint =
+       isl_ast_expr_from_val(isl_val_int_from_si(S->getIslCtx(), NumInsts));
+   return isl_ast_expr_mul(PerPoint, Points);
+ }
+
+ /// Approximate the dynamic instructions executed in a scop.
+ ///
+ /// Despite its name, this function does not return an iteration count: it
+ /// sums up the per-statement results of approxDynamicInst() for all
+ /// statements of the scop.
+ ///
+ /// @param S     The scop for which to approximate dynamic instructions.
+ /// @param Build The isl ast build object to use for creating the ast
+ ///              expression.
+ /// @returns An approximation of the number of dynamic instructions executed
+ ///          in @p S.
+ __isl_give isl_ast_expr *
+ getNumberOfIterations(Scop &S, __isl_keep isl_ast_build *Build) {
+   // Start the sum at zero and accumulate one term per statement.
+   isl_ast_expr *Total =
+       isl_ast_expr_from_val(isl_val_int_from_si(S.getIslCtx(), 0));
+
+   for (ScopStmt &Stmt : S)
+     Total = isl_ast_expr_add(Total, approxDynamicInst(Stmt, Build));
+
+   return Total;
+ }
+
+ /// Create a check that ensures sufficient compute in a scop.
+ ///
+ /// The check compares the approximated number of dynamic instructions of
+ /// the scop with the value of the MinCompute option.
+ ///
+ /// @param S     The scop for which to ensure sufficient compute.
+ /// @param Build The isl ast build object to use for creating the ast
+ ///              expression.
+ /// @returns An expression that evaluates to TRUE if the approximation is
+ ///          greater than or equal to MinCompute, and to FALSE otherwise.
+ __isl_give isl_ast_expr *
+ createSufficientComputeCheck(Scop &S, __isl_keep isl_ast_build *Build) {
+   isl_ast_expr *Estimate = getNumberOfIterations(S, Build);
+   isl_ast_expr *Threshold =
+       isl_ast_expr_from_val(isl_val_int_from_si(S.getIslCtx(), MinCompute));
+   return isl_ast_expr_ge(Estimate, Threshold);
+ }
+
/// Generate code for a given GPU AST described by @p Root.
///
/// @param Root An isl_ast_node pointing to the root of the GPU AST.
@@ -2296,6 +2404,8 @@ public:
isl_ast_build *Build = isl_ast_build_alloc(S->getIslCtx());
isl_ast_expr *Condition = IslAst::buildRunCondition(S, Build);
+ isl_ast_expr *SufficientCompute = createSufficientComputeCheck(*S, Build);
+ Condition = isl_ast_expr_and(Condition, SufficientCompute);
isl_ast_build_free(Build);
Value *RTC = NodeBuilder.createRTC(Condition);
Modified: polly/trunk/test/GPGPU/double-parallel-loop.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/double-parallel-loop.ll?rev=281848&r1=281847&r2=281848&view=diff
==============================================================================
--- polly/trunk/test/GPGPU/double-parallel-loop.ll (original)
+++ polly/trunk/test/GPGPU/double-parallel-loop.ll Sun Sep 18 01:50:35 2016
@@ -89,7 +89,27 @@
; CODE-NEXT: Stmt_bb5(32 * b0 + t0, 32 * b1 + t1 + 16 * c3);
; IR: polly.split_new_and_old:
-; IR-NEXT: br i1 true, label %polly.start, label %bb2
+; IR-NEXT: %0 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 1024)
+; IR-NEXT: %.obit = extractvalue { i64, i1 } %0, 1
+; IR-NEXT: %polly.overflow.state = or i1 false, %.obit
+; IR-NEXT: %.res = extractvalue { i64, i1 } %0, 0
+; IR-NEXT: %1 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %.res, i64 1024)
+; IR-NEXT: %.obit1 = extractvalue { i64, i1 } %1, 1
+; IR-NEXT: %polly.overflow.state2 = or i1 %polly.overflow.state, %.obit1
+; IR-NEXT: %.res3 = extractvalue { i64, i1 } %1, 0
+; IR-NEXT: %2 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 7, i64 %.res3)
+; IR-NEXT: %.obit4 = extractvalue { i64, i1 } %2, 1
+; IR-NEXT: %polly.overflow.state5 = or i1 %polly.overflow.state2, %.obit4
+; IR-NEXT: %.res6 = extractvalue { i64, i1 } %2, 0
+; IR-NEXT: %3 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 %.res6)
+; IR-NEXT: %.obit7 = extractvalue { i64, i1 } %3, 1
+; IR-NEXT: %polly.overflow.state8 = or i1 %polly.overflow.state5, %.obit7
+; IR-NEXT: %.res9 = extractvalue { i64, i1 } %3, 0
+; IR-NEXT: %4 = icmp sge i64 %.res9, 2621440
+; IR-NEXT: %5 = and i1 true, %4
+; IR-NEXT: %polly.rtc.overflown = xor i1 %polly.overflow.state8, true
+; IR-NEXT: %polly.rtc.result = and i1 %5, %polly.rtc.overflown
+; IR-NEXT: br i1 %polly.rtc.result, label %polly.start, label %bb2
; IR: polly.start:
; IR-NEXT: br label %polly.acc.initialize
@@ -105,7 +125,7 @@
; IR-NEXT: [[ParamTyped:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8*
; IR-NEXT: store i8* [[ParamTyped]], i8** [[ParamSlot]]
; IR-NEXT: call i8* @polly_getKernel
-; IR-NEXT: call void @polly_launchKernel(i8* %5, i32 32, i32 32, i32 32, i32 16, i32 1, i8* %polly_launch_0_params_i8ptr)
+; IR-NEXT: call void @polly_launchKernel(i8* %11, i32 32, i32 32, i32 32, i32 16, i32 1, i8* %polly_launch_0_params_i8ptr)
; IR-NEXT: call void @polly_freeKernel
; IR-NEXT: [[HostPtr2:%.*]] = bitcast [1024 x float]* %A to i8*
; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_A, i8* [[HostPtr2]], i64 4194304)
More information about the llvm-commits
mailing list