[polly] r277799 - GPGPU: Add cuda annotations to specify maximal number of threads per block

Tobias Grosser via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 4 23:47:44 PDT 2016


Author: grosser
Date: Fri Aug  5 01:47:43 2016
New Revision: 277799

URL: http://llvm.org/viewvc/llvm-project?rev=277799&view=rev
Log:
GPGPU: Add cuda annotations to specify maximal number of threads per block

These annotations ensure that the NVIDIA PTX assembler limits the number of
registers used such that we can be certain the resulting kernel can be executed
with the number of threads per block that we plan to use.

Added:
    polly/trunk/test/GPGPU/cuda-annotations.ll
Modified:
    polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp

Modified: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp?rev=277799&r1=277798&r2=277799&view=diff
==============================================================================
--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp (original)
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp Fri Aug  5 01:47:43 2016
@@ -249,6 +249,21 @@ private:
   /// @param FN            The function into which to generate the variables.
   void createKernelVariables(ppcg_kernel *Kernel, Function *FN);
 
+  /// Add CUDA annotations to module.
+  ///
+  /// Add a set of CUDA annotations that declares the maximal block dimensions
+  /// that will be used to execute the CUDA kernel. This allows the NVIDIA
+  /// PTX compiler to bound the number of allocated registers to ensure the
+  /// resulting kernel is known to run with up to as many block dimensions
+  /// as specified here.
+  ///
+  /// @param M         The module to add the annotations to.
+  /// @param BlockDimX The size of block dimension X.
+  /// @param BlockDimY The size of block dimension Y.
+  /// @param BlockDimZ The size of block dimension Z.
+  void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY,
+                          Value *BlockDimZ);
+
   /// Create GPU kernel.
   ///
   /// Code generate the kernel described by @p KernelStmt.
@@ -448,6 +463,27 @@ void GPUNodeBuilder::allocateDeviceArray
   isl_ast_build_free(Build);
 }
 
+void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX,
+                                        Value *BlockDimY, Value *BlockDimZ) {
+  auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations");
+
+  for (auto &F : *M) {
+    if (F.getCallingConv() != CallingConv::PTX_Kernel)
+      continue;
+
+    Value *V[] = {BlockDimX, BlockDimY, BlockDimZ};
+
+    Metadata *Elements[] = {
+        ValueAsMetadata::get(&F),   MDString::get(M->getContext(), "maxntidx"),
+        ValueAsMetadata::get(V[0]), MDString::get(M->getContext(), "maxntidy"),
+        ValueAsMetadata::get(V[1]), MDString::get(M->getContext(), "maxntidz"),
+        ValueAsMetadata::get(V[2]),
+    };
+    MDNode *Node = MDNode::get(M->getContext(), Elements);
+    AnnotationNode->addOperand(Node);
+  }
+}
+
 void GPUNodeBuilder::freeDeviceArrays() {
   for (auto &Array : DeviceAllocations)
     createCallFreeDeviceMemory(Array.second);
@@ -1021,6 +1057,9 @@ void GPUNodeBuilder::createKernel(__isl_
   isl_id_free(Id);
   isl_ast_node_free(KernelStmt);
 
+  Value *BlockDimX, *BlockDimY, *BlockDimZ;
+  std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
+
   SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel);
 
   assert(Kernel->tree && "Device AST of kernel node is empty");
@@ -1048,6 +1087,7 @@ void GPUNodeBuilder::createKernel(__isl_
   create(isl_ast_node_copy(Kernel->tree));
 
   Function *F = Builder.GetInsertBlock()->getParent();
+  addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
   clearDominators(F);
   clearScalarEvolution(F);
   clearLoops(F);
@@ -1076,9 +1116,6 @@ void GPUNodeBuilder::createKernel(__isl_
   Value *GridDimX, *GridDimY;
   std::tie(GridDimX, GridDimY) = getGridSizes(Kernel);
 
-  Value *BlockDimX, *BlockDimY, *BlockDimZ;
-  std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
-
   createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
                          BlockDimZ, Parameters);
   createCallFreeKernel(GPUKernel);

Added: polly/trunk/test/GPGPU/cuda-annotations.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/cuda-annotations.ll?rev=277799&view=auto
==============================================================================
--- polly/trunk/test/GPGPU/cuda-annotations.ll (added)
+++ polly/trunk/test/GPGPU/cuda-annotations.ll Fri Aug  5 01:47:43 2016
@@ -0,0 +1,35 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
+; RUN: -disable-output < %s | \
+; RUN: FileCheck -check-prefix=KERNEL %s
+
+; KERNEL: define ptx_kernel void @kernel_0(i8* %MemRef_A, i64 %n) #0 {
+
+; KERNEL: !nvvm.annotations = !{!0}
+
+; KERNEL: !0 = !{void (i8*, i64)* @kernel_0, !"maxntidx", i32 32, !"maxntidy", i32 1, !"maxntidz", i32 1}
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @foo(i64* %A, i64 %n) {
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb6, %bb
+  %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ]
+  %tmp = icmp slt i64 %i.0, %n
+  br i1 %tmp, label %bb2, label %bb8
+
+bb2:                                              ; preds = %bb1
+  %tmp3 = getelementptr inbounds i64, i64* %A, i64 %i.0
+  %tmp4 = load i64, i64* %tmp3, align 8
+  %tmp5 = add nsw i64 %tmp4, 100
+  store i64 %tmp5, i64* %tmp3, align 8
+  br label %bb6
+
+bb6:                                              ; preds = %bb2
+  %tmp7 = add nuw nsw i64 %i.0, 1
+  br label %bb1
+
+bb8:                                              ; preds = %bb1
+  ret void
+}




More information about the llvm-commits mailing list