[polly] r306284 - [PPCGCodeGeneration] Allow intrinsics within kernels.

Siddharth Bhat via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 26 06:12:06 PDT 2017


Author: bollu
Date: Mon Jun 26 06:12:06 2017
New Revision: 306284

URL: http://llvm.org/viewvc/llvm-project?rev=306284&view=rev
Log:
[PPCGCodeGeneration] Allow intrinsics within kernels.

- In D33414, if any function call was found within a kernel, we would bail out.

- This is an over-approximation. This patch changes this by allowing the
  `llvm.sqrt.*` family of intrinsics.

- This introduces an additional step when creating a separate llvm::Module
  for a kernel (GPUModule). We now copy function declarations from the
  original module to the new module.

- We also populate IslNodeBuilder::ValueMap so that references to functions
  in the old module are replaced with references to the corresponding
  functions in the new module (GPUModule).

Differential Revision: https://reviews.llvm.org/D34145

Added:
    polly/trunk/test/GPGPU/intrinsic-copied-into-kernel.ll
Modified:
    polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp

Modified: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp?rev=306284&r1=306283&r2=306284&view=diff
==============================================================================
--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp (original)
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp Mon Jun 26 06:12:06 2017
@@ -255,8 +255,12 @@ private:
   ///
   /// @param Kernel The kernel to scan for llvm::Values
   ///
-  /// @returns A set of values referenced by the kernel.
-  SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel);
+  /// @returns A pair, whose first element contains the set of values
+  ///          referenced by the kernel, and whose second element contains the
+  ///          set of functions referenced by the kernel. All functions in the
+  ///          second set satisfy isValidFunctionInKernel.
+  std::pair<SetVector<Value *>, SetVector<Function *>>
+  getReferencesInKernel(ppcg_kernel *Kernel);
 
   /// Compute the sizes of the execution grid for a given kernel.
   ///
@@ -365,8 +369,11 @@ private:
   ///
   /// @param Kernel The kernel to generate code for.
   /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
+  /// @param SubtreeFunctions The set of llvm::Functions referenced by this
+  ///                         kernel.
   void createKernelFunction(ppcg_kernel *Kernel,
-                            SetVector<Value *> &SubtreeValues);
+                            SetVector<Value *> &SubtreeValues,
+                            SetVector<Function *> &SubtreeFunctions);
 
   /// Create the declaration of a kernel function.
   ///
@@ -389,6 +396,25 @@ private:
   /// @param The kernel to generate the intrinsic functions for.
   void insertKernelIntrinsics(ppcg_kernel *Kernel);
 
+  /// Setup the creation of functions referenced by the GPU kernel.
+  ///
+  /// 1. Create new function declarations in GPUModule which are the same as
+  /// SubtreeFunctions.
+  ///
+  /// 2. Populate IslNodeBuilder::ValueMap with mappings from
+  /// old functions (that come from the original module) to new functions
+  /// (that are created within GPUModule). That way, we generate references
+  /// to the correct function (in GPUModule) in BlockGenerator.
+  ///
+  /// @see IslNodeBuilder::ValueMap
+  /// @see BlockGenerator::GlobalMap
+  /// @see BlockGenerator::getNewValue
+  /// @see GPUNodeBuilder::getReferencesInKernel.
+  ///
+  /// @param SubtreeFunctions The set of llvm::Functions referenced by
+  ///                         this kernel.
+  void setupKernelSubtreeFunctions(SetVector<Function *> SubtreeFunctions);
+
   /// Create a global-to-shared or shared-to-global copy statement.
   ///
   /// @param CopyStmt The copy statement to generate code for
@@ -1109,7 +1135,40 @@ isl_bool collectReferencesInGPUStmt(__is
   return isl_bool_true;
 }
 
-SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
+/// Check if F is a function that we can code-generate in a GPU kernel.
+/// A null F (e.g. the callee of an indirect call) is never valid.
+static bool isValidFunctionInKernel(llvm::Function *F) {
+  // We string compare against the name of the function to allow
+  // all variants of the intrinsic "llvm.sqrt.*"
+  return F && F->isIntrinsic() && F->getName().startswith("llvm.sqrt");
+}
+
+/// Return true if `V` may be passed to the kernel as a subtree value.
+///
+/// We try to take the reference of all subtree values and pass them along
+/// to the kernel from the host. Taking the address of a function and
+/// trying to pass it along is nonsensical, so only `Value`s that are not
+/// `Function`s are accepted.
+static bool isValidSubtreeValue(llvm::Value *V) { return !isa<Function>(V); }
+
+/// Return the `Function`s contained in `RawSubtreeValues`.
+static SetVector<Function *>
+getFunctionsFromRawSubtreeValues(const SetVector<Value *> &RawSubtreeValues) {
+  SetVector<Function *> SubtreeFunctions;
+  for (Value *V : RawSubtreeValues) {
+    auto *F = dyn_cast<Function>(V);
+    if (!F)
+      continue;
+    assert(isValidFunctionInKernel(F) &&
+           "Code should have bailed out by this point if an invalid "
+           "function were present in a kernel.");
+    SubtreeFunctions.insert(F);
+  }
+  return SubtreeFunctions;
+}
+
+std::pair<SetVector<Value *>, SetVector<Function *>>
+GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
   SetVector<Value *> SubtreeValues;
   SetVector<const SCEV *> SCEVs;
   SetVector<const Loop *> Loops;
@@ -1146,7 +1205,19 @@ SetVector<Value *> GPUNodeBuilder::getRe
     isl_id_free(Id);
   }
 
-  return SubtreeValues;
+  // Note: { ValidSubtreeValues, ValidSubtreeFunctions } partitions
+  // SubtreeValues. This is important, because we should not lose any
+  // SubtreeValues in the process of constructing the
+  // ValidSubtree{Values, Functions} sets. Nor should the sets
+  // ValidSubtree{Values, Functions} have any common element.
+  auto ValidSubtreeValuesIt =
+      make_filter_range(SubtreeValues, isValidSubtreeValue);
+  SetVector<Value *> ValidSubtreeValues(ValidSubtreeValuesIt.begin(),
+                                        ValidSubtreeValuesIt.end());
+  SetVector<Function *> ValidSubtreeFunctions(
+      getFunctionsFromRawSubtreeValues(SubtreeValues));
+
+  return std::make_pair(ValidSubtreeValues, ValidSubtreeFunctions);
 }
 
 void GPUNodeBuilder::clearDominators(Function *F) {
@@ -1353,6 +1424,21 @@ GPUNodeBuilder::createLaunchParameters(p
                          Launch + "_params_i8ptr", Location);
 }
 
+void GPUNodeBuilder::setupKernelSubtreeFunctions(
+    SetVector<Function *> SubtreeFunctions) {
+  // Declare every referenced function in GPUModule and remap it in ValueMap.
+  for (Function *HostFn : SubtreeFunctions) {
+    Function *DeviceFn = GPUModule->getFunction(HostFn->getName());
+    if (!DeviceFn)
+      DeviceFn = Function::Create(HostFn->getFunctionType(),
+                                  GlobalValue::ExternalLinkage,
+                                  HostFn->getName(), GPUModule.get());
+    assert(DeviceFn && "Expected cloned function to be initialized.");
+    assert(ValueMap.find(HostFn) == ValueMap.end() &&
+           "Fn already present in ValueMap");
+    ValueMap[HostFn] = DeviceFn;
+  }
+}
 void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
   isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
   ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
@@ -1369,7 +1455,9 @@ void GPUNodeBuilder::createKernel(__isl_
   Value *BlockDimX, *BlockDimY, *BlockDimZ;
   std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
 
-  SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel);
+  SetVector<Value *> SubtreeValues;
+  SetVector<Function *> SubtreeFunctions;
+  std::tie(SubtreeValues, SubtreeFunctions) = getReferencesInKernel(Kernel);
 
   assert(Kernel->tree && "Device AST of kernel node is empty");
 
@@ -1393,7 +1481,8 @@ void GPUNodeBuilder::createKernel(__isl_
     SubtreeValues.insert(V);
   }
 
-  createKernelFunction(Kernel, SubtreeValues);
+  createKernelFunction(Kernel, SubtreeValues, SubtreeFunctions);
+  setupKernelSubtreeFunctions(SubtreeFunctions);
 
   create(isl_ast_node_copy(Kernel->tree));
 
@@ -1721,8 +1810,9 @@ void GPUNodeBuilder::createKernelVariabl
   }
 }
 
-void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
-                                          SetVector<Value *> &SubtreeValues) {
+void GPUNodeBuilder::createKernelFunction(
+    ppcg_kernel *Kernel, SetVector<Value *> &SubtreeValues,
+    SetVector<Function *> &SubtreeFunctions) {
   std::string Identifier = "kernel_" + std::to_string(Kernel->id);
   GPUModule.reset(new Module(Identifier, Builder.getContext()));
 
@@ -2611,9 +2701,18 @@ public:
     return isl_ast_expr_ge(Iterations, MinComputeExpr);
   }
 
-  /// Check whether the Block contains any Function value.
-  bool ContainsFnPtrValInBlock(const BasicBlock *BB) {
-    for (const Instruction &Inst : *BB)
+  /// Check if the basic block contains a function we cannot codegen for GPU
+  /// kernels.
+  ///
+  /// If this basic block does something with a `Function` other than calling
+  /// a function that we support in a kernel, return true.
+  bool containsInvalidKernelFunctionInBllock(const BasicBlock *BB) {
+    for (const Instruction &Inst : *BB) {
+      const CallInst *Call = dyn_cast<CallInst>(&Inst);
+      if (Call && isValidFunctionInKernel(Call->getCalledFunction())) {
+        continue;
+      }
+
       for (Value *SrcVal : Inst.operands()) {
         PointerType *p = dyn_cast<PointerType>(SrcVal->getType());
         if (!p)
@@ -2621,20 +2720,21 @@ public:
         if (isa<FunctionType>(p->getElementType()))
           return true;
       }
+    }
     return false;
   }
 
-  /// Return whether the Scop S has functions.
-  bool ContainsFnPtr(const Scop &S) {
+  /// Return whether the Scop S uses functions in a way that we do not support.
+  bool containsInvalidKernelFunction(const Scop &S) {
     for (auto &Stmt : S) {
       if (Stmt.isBlockStmt()) {
-        if (ContainsFnPtrValInBlock(Stmt.getBasicBlock()))
+        if (containsInvalidKernelFunctionInBllock(Stmt.getBasicBlock()))
           return true;
       } else {
         assert(Stmt.isRegionStmt() &&
                "Stmt was neither block nor region statement");
         for (const BasicBlock *BB : Stmt.getRegion()->blocks())
-          if (ContainsFnPtrValInBlock(BB))
+          if (containsInvalidKernelFunctionInBllock(BB))
             return true;
       }
     }
@@ -2708,13 +2808,18 @@ public:
     DL = &S->getRegion().getEntry()->getModule()->getDataLayout();
     RI = &getAnalysis<RegionInfoPass>().getRegionInfo();
 
-    // We currently do not support functions inside kernels, as code
-    // generation will need to offload function calls to the kernel.
-    // This may lead to a kernel trying to call a function on the host.
+    // We currently do not support functions other than intrinsics inside
+    // kernels, as code generation will need to offload function calls to the
+    // kernel. This may lead to a kernel trying to call a function on the host.
     // This also allows us to prevent codegen from trying to take the
     // address of an intrinsic function to send to the kernel.
-    if (ContainsFnPtr(CurrentScop))
+    if (containsInvalidKernelFunction(CurrentScop)) {
+      DEBUG(
+          dbgs()
+              << "Scop contains function which cannot be materialised in a GPU "
+                 "kernel. Bailing out.\n";);
       return false;
+    }
 
     auto PPCGScop = createPPCGScop();
     auto PPCGProg = createPPCGProg(PPCGScop);

Added: polly/trunk/test/GPGPU/intrinsic-copied-into-kernel.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/intrinsic-copied-into-kernel.ll?rev=306284&view=auto
==============================================================================
--- polly/trunk/test/GPGPU/intrinsic-copied-into-kernel.ll (added)
+++ polly/trunk/test/GPGPU/intrinsic-copied-into-kernel.ll Mon Jun 26 06:12:06 2017
@@ -0,0 +1,66 @@
+; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadPolly -analyze -polly-codegen-ppcg -polly-acc-dump-kernel-ir < %s | FileCheck %s --check-prefix=KERNEL-IR
+; RUN: opt %loadPolly -S -polly-codegen-ppcg  < %s | FileCheck %s --check-prefix=HOST-IR
+
+; Test that we do recognise and codegen a kernel that has intrinsics.
+
+; REQUIRES: pollyacc
+
+; Check that we model the kernel as a scop.
+; SCOP:      Function: f
+; SCOP-NEXT:       Region: %entry.split---%for.end
+
+; Check that the intrinsic call is present in the kernel IR.
+; KERNEL-IR:   %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_)
+; KERNEL-IR:   declare float @llvm.sqrt.f32(float) #2
+
+; Check that kernel launch is generated in host IR.
+; The declare would not be generated unless a call to a kernel exists.
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+
+
+; void f(float *A, float *B, int N) {
+;   for(int i = 0; i < N; i++) {
+;       B[i] = sqrt(A[i]);
+;   }
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(float* %A, float* %B, i32 %N) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  %cmp1 = icmp sgt i32 %N, 0
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry.split
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %A.arr.i = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %A.arr.i.val = load float, float* %A.arr.i, align 4
+  ; Call to the llvm.sqrt intrinsic; the KERNEL-IR check expects it in the kernel.
+  %sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val)
+  %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  store float %sqrt, float* %B.arr.i, align 4
+
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %wide.trip.count = zext i32 %N to i64
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry.split
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.sqrt.f32(float) #0
+
+attributes #0 = { nounwind readnone }
+




More information about the llvm-commits mailing list