[polly] r306284 - [PPCGCodeGeneration] Allow intrinsics within kernels.
Siddharth Bhat via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 26 06:12:06 PDT 2017
Author: bollu
Date: Mon Jun 26 06:12:06 2017
New Revision: 306284
URL: http://llvm.org/viewvc/llvm-project?rev=306284&view=rev
Log:
[PPCGCodeGeneration] Allow intrinsics within kernels.
- In D33414, if any function call was found within a kernel, we would bail out.
- This is an over-approximation: this patch relaxes the check to allow the
  `llvm.sqrt.*` family of intrinsics.
- This introduces an additional step when creating a separate llvm::Module
  for a kernel (GPUModule). We now copy function declarations from the
  original module to the new module.
- We also populate IslNodeBuilder::ValueMap so that references to functions
  in the original module are replaced with references to their counterparts
  in the new module (GPUModule); a condensed sketch of this step follows
  below.
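In essence, the new setup step amounts to the following (a condensed
sketch of the patch's setupKernelSubtreeFunctions; illustrative, not the
verbatim implementation):

  // For each function the kernel references (all of which satisfy
  // isValidFunctionInKernel), ensure GPUModule has a matching external
  // declaration, and record the old->new mapping so BlockGenerator
  // rewrites references to target the declaration in GPUModule.
  for (Function *Fn : SubtreeFunctions) {
    Function *Clone = GPUModule->getFunction(Fn->getName());
    if (!Clone)
      Clone = Function::Create(Fn->getFunctionType(),
                               GlobalValue::ExternalLinkage,
                               Fn->getName(), GPUModule.get());
    ValueMap[Fn] = Clone;
  }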
Differential Revision: https://reviews.llvm.org/D34145
Added:
polly/trunk/test/GPGPU/intrinsic-copied-into-kernel.ll
Modified:
polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
Modified: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp?rev=306284&r1=306283&r2=306284&view=diff
==============================================================================
--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp (original)
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp Mon Jun 26 06:12:06 2017
@@ -255,8 +255,12 @@ private:
///
/// @param Kernel The kernel to scan for llvm::Values
///
- /// @returns A set of values referenced by the kernel.
- SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel);
+ /// @returns A pair, whose first element contains the set of values
+ /// referenced by the kernel, and whose second element contains the
+ /// set of functions referenced by the kernel. All functions in the
+ /// second set satisfy isValidFunctionInKernel.
+ std::pair<SetVector<Value *>, SetVector<Function *>>
+ getReferencesInKernel(ppcg_kernel *Kernel);
/// Compute the sizes of the execution grid for a given kernel.
///
@@ -365,8 +369,11 @@ private:
///
/// @param Kernel The kernel to generate code for.
/// @param SubtreeValues The set of llvm::Values referenced by this kernel.
+ /// @param SubtreeFunctions The set of llvm::Functions referenced by this
+ /// kernel.
void createKernelFunction(ppcg_kernel *Kernel,
- SetVector<Value *> &SubtreeValues);
+ SetVector<Value *> &SubtreeValues,
+ SetVector<Function *> &SubtreeFunctions);
/// Create the declaration of a kernel function.
///
@@ -389,6 +396,25 @@ private:
/// @param The kernel to generate the intrinsic functions for.
void insertKernelIntrinsics(ppcg_kernel *Kernel);
+ /// Set up the creation of functions referenced by the GPU kernel.
+ ///
+ /// 1. Create new function declarations in GPUModule that match the
+ /// signatures of the functions in SubtreeFunctions.
+ ///
+ /// 2. Populate IslNodeBuilder::ValueMap with mappings from
+ /// old functions (that come from the original module) to new functions
+ /// (that are created within GPUModule). That way, we generate references
+ /// to the correct function (in GPUModule) in BlockGenerator.
+ ///
+ /// @see IslNodeBuilder::ValueMap
+ /// @see BlockGenerator::GlobalMap
+ /// @see BlockGenerator::getNewValue
+ /// @see GPUNodeBuilder::getReferencesInKernel.
+ ///
+ /// @param SubtreeFunctions The set of llvm::Functions referenced by
+ /// this kernel.
+ void setupKernelSubtreeFunctions(SetVector<Function *> SubtreeFunctions);
+
/// Create a global-to-shared or shared-to-global copy statement.
///
/// @param CopyStmt The copy statement to generate code for
@@ -1109,7 +1135,40 @@ isl_bool collectReferencesInGPUStmt(__is
return isl_bool_true;
}
-SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
+/// Check if F is a function that we can code-generate in a GPU kernel.
+static bool isValidFunctionInKernel(llvm::Function *F) {
+ assert(F && "F is an invalid pointer");
+ // We string-compare against the function name to allow all variants
+ // of the "llvm.sqrt.*" intrinsic.
+ return F->isIntrinsic() && F->getName().startswith("llvm.sqrt");
+}
+
+/// Do not take a `Function` as a subtree value.
+///
+/// We take references to all subtree values and pass them along to the
+/// kernel from the host. Taking the address of a function and trying to
+/// pass it along is nonsensical. Only allow `Value`s that are not
+/// `Function`s.
+static bool isValidSubtreeValue(llvm::Value *V) { return !isa<Function>(V); }
+
+/// Return `Function`s from `RawSubtreeValues`.
+static SetVector<Function *>
+getFunctionsFromRawSubtreeValues(SetVector<Value *> RawSubtreeValues) {
+ SetVector<Function *> SubtreeFunctions;
+ for (Value *It : RawSubtreeValues) {
+ Function *F = dyn_cast<Function>(It);
+ if (F) {
+ assert(isValidFunctionInKernel(F) && "Code should have bailed out by "
+ "this point if an invalid function "
+ "were present in a kernel.");
+ SubtreeFunctions.insert(F);
+ }
+ }
+ return SubtreeFunctions;
+}
+
+std::pair<SetVector<Value *>, SetVector<Function *>>
+GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
SetVector<Value *> SubtreeValues;
SetVector<const SCEV *> SCEVs;
SetVector<const Loop *> Loops;
@@ -1146,7 +1205,19 @@ SetVector<Value *> GPUNodeBuilder::getRe
isl_id_free(Id);
}
- return SubtreeValues;
+ // Note: { ValidSubtreeValues, ValidSubtreeFunctions } partitions
+ // SubtreeValues. This is important: we should not lose any
+ // SubtreeValues in the process of constructing the
+ // ValidSubtree{Values, Functions} sets, nor should those two sets
+ // have any element in common.
+ auto ValidSubtreeValuesIt =
+ make_filter_range(SubtreeValues, isValidSubtreeValue);
+ SetVector<Value *> ValidSubtreeValues(ValidSubtreeValuesIt.begin(),
+ ValidSubtreeValuesIt.end());
+ SetVector<Function *> ValidSubtreeFunctions(
+ getFunctionsFromRawSubtreeValues(SubtreeValues));
+
+ return std::make_pair(ValidSubtreeValues, ValidSubtreeFunctions);
}
void GPUNodeBuilder::clearDominators(Function *F) {
@@ -1353,6 +1424,21 @@ GPUNodeBuilder::createLaunchParameters(p
Launch + "_params_i8ptr", Location);
}
+void GPUNodeBuilder::setupKernelSubtreeFunctions(
+ SetVector<Function *> SubtreeFunctions) {
+ for (auto Fn : SubtreeFunctions) {
+ const std::string ClonedFnName = Fn->getName();
+ Function *Clone = GPUModule->getFunction(ClonedFnName);
+ if (!Clone)
+ Clone =
+ Function::Create(Fn->getFunctionType(), GlobalValue::ExternalLinkage,
+ ClonedFnName, GPUModule.get());
+ assert(Clone && "Expected cloned function to be initialized.");
+ assert(ValueMap.find(Fn) == ValueMap.end() &&
+ "Fn already present in ValueMap");
+ ValueMap[Fn] = Clone;
+ }
+}
void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
@@ -1369,7 +1455,9 @@ void GPUNodeBuilder::createKernel(__isl_
Value *BlockDimX, *BlockDimY, *BlockDimZ;
std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
- SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel);
+ SetVector<Value *> SubtreeValues;
+ SetVector<Function *> SubtreeFunctions;
+ std::tie(SubtreeValues, SubtreeFunctions) = getReferencesInKernel(Kernel);
assert(Kernel->tree && "Device AST of kernel node is empty");
@@ -1393,7 +1481,8 @@ void GPUNodeBuilder::createKernel(__isl_
SubtreeValues.insert(V);
}
- createKernelFunction(Kernel, SubtreeValues);
+ createKernelFunction(Kernel, SubtreeValues, SubtreeFunctions);
+ setupKernelSubtreeFunctions(SubtreeFunctions);
create(isl_ast_node_copy(Kernel->tree));
@@ -1721,8 +1810,9 @@ void GPUNodeBuilder::createKernelVariabl
}
}
-void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
- SetVector<Value *> &SubtreeValues) {
+void GPUNodeBuilder::createKernelFunction(
+ ppcg_kernel *Kernel, SetVector<Value *> &SubtreeValues,
+ SetVector<Function *> &SubtreeFunctions) {
std::string Identifier = "kernel_" + std::to_string(Kernel->id);
GPUModule.reset(new Module(Identifier, Builder.getContext()));
@@ -2611,9 +2701,18 @@ public:
return isl_ast_expr_ge(Iterations, MinComputeExpr);
}
- /// Check whether the Block contains any Function value.
- bool ContainsFnPtrValInBlock(const BasicBlock *BB) {
- for (const Instruction &Inst : *BB)
+ /// Check if the basic block contains a function we cannot codegen for GPU
+ /// kernels.
+ ///
+ /// If this basic block does something with a `Function` other than calling
+ /// a function that we support in a kernel, return true.
+ bool containsInvalidKernelFunctionInBlock(const BasicBlock *BB) {
+ for (const Instruction &Inst : *BB) {
+ const CallInst *Call = dyn_cast<CallInst>(&Inst);
+ // getCalledFunction() is null for indirect calls; those fall through
+ // to the operand check below and are rejected there.
+ if (Call && Call->getCalledFunction() &&
+ isValidFunctionInKernel(Call->getCalledFunction())) {
+ continue;
+ }
+
for (Value *SrcVal : Inst.operands()) {
PointerType *p = dyn_cast<PointerType>(SrcVal->getType());
if (!p)
@@ -2621,20 +2720,21 @@ public:
if (isa<FunctionType>(p->getElementType()))
return true;
}
+ }
return false;
}
- /// Return whether the Scop S has functions.
- bool ContainsFnPtr(const Scop &S) {
+ /// Return whether the Scop S uses functions in a way that we do not support.
+ bool containsInvalidKernelFunction(const Scop &S) {
for (auto &Stmt : S) {
if (Stmt.isBlockStmt()) {
- if (ContainsFnPtrValInBlock(Stmt.getBasicBlock()))
+ if (containsInvalidKernelFunctionInBlock(Stmt.getBasicBlock()))
return true;
} else {
assert(Stmt.isRegionStmt() &&
"Stmt was neither block nor region statement");
for (const BasicBlock *BB : Stmt.getRegion()->blocks())
- if (ContainsFnPtrValInBlock(BB))
+ if (containsInvalidKernelFunctionInBlock(BB))
return true;
}
}
@@ -2708,13 +2808,18 @@ public:
DL = &S->getRegion().getEntry()->getModule()->getDataLayout();
RI = &getAnalysis<RegionInfoPass>().getRegionInfo();
- // We currently do not support functions inside kernels, as code
- // generation will need to offload function calls to the kernel.
- // This may lead to a kernel trying to call a function on the host.
+ // We currently do not support functions other than intrinsics inside
+ // kernels, as code generation will need to offload function calls to the
+ // kernel. This may lead to a kernel trying to call a function on the host.
// This also allows us to prevent codegen from trying to take the
// address of an intrinsic function to send to the kernel.
- if (ContainsFnPtr(CurrentScop))
+ if (containsInvalidKernelFunction(CurrentScop)) {
+ DEBUG(dbgs() << "Scop contains a function that cannot be materialized "
+ "in a GPU kernel. Bailing out.\n");
return false;
+ }
auto PPCGScop = createPPCGScop();
auto PPCGProg = createPPCGProg(PPCGScop);
Added: polly/trunk/test/GPGPU/intrinsic-copied-into-kernel.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/intrinsic-copied-into-kernel.ll?rev=306284&view=auto
==============================================================================
--- polly/trunk/test/GPGPU/intrinsic-copied-into-kernel.ll (added)
+++ polly/trunk/test/GPGPU/intrinsic-copied-into-kernel.ll Mon Jun 26 06:12:06 2017
@@ -0,0 +1,66 @@
+; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadPolly -analyze -polly-codegen-ppcg -polly-acc-dump-kernel-ir < %s | FileCheck %s --check-prefix=KERNEL-IR
+; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s --check-prefix=HOST-IR
+
+; Test that we recognize and codegen a kernel that has intrinsics.
+
+; REQUIRES: pollyacc
+
+; Check that we model the kernel as a scop.
+; SCOP: Function: f
+; SCOP-NEXT: Region: %entry.split---%for.end
+
+; Check that the intrinsic call is present in the kernel IR.
+; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_)
+; KERNEL-IR: declare float @llvm.sqrt.f32(float) #2
+
+; Check that the kernel launch is generated in the host IR.
+; The declare would not be generated unless a call to a kernel exists.
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+
+
+; void f(float *A, float *B, int N) {
+; for(int i = 0; i < N; i++) {
+; B[i] = sqrt(A[i]);
+; }
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(float* %A, float* %B, i32 %N) {
+entry:
+ br label %entry.split
+
+entry.split: ; preds = %entry
+ %cmp1 = icmp sgt i32 %N, 0
+ br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry.split
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+ %A.arr.i = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ %A.arr.i.val = load float, float* %A.arr.i, align 4
+ ; Call to intrinsic that should be part of the kernel.
+ %sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val)
+ %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
+ store float %sqrt, float* %B.arr.i, align 4
+
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %wide.trip.count = zext i32 %N to i64
+ %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.sqrt.f32(float) #0
+
+attributes #0 = { nounwind readnone }
+