[polly] r310196 - [GPGPU] Make sure managed arrays are prepared at the beginning of the scop

Tobias Grosser via llvm-commits llvm-commits at lists.llvm.org
Sun Aug 6 04:10:38 PDT 2017


Author: grosser
Date: Sun Aug  6 04:10:38 2017
New Revision: 310196

URL: http://llvm.org/viewvc/llvm-project?rev=310196&view=rev
Log:
[GPGPU] Make sure managed arrays are prepared at the beginning of the scop

Summary:
This resolves several "instruction does not dominate use" errors: we used to
prepare the managed arrays at the location of the first kernel launch, which
did not necessarily dominate all other kernel calls.
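
For illustration, a minimal, hypothetical IR sketch of the failure mode (the
launch-call signature is simplified here; the real runtime call takes more
parameters): when the pointer preparation was emitted next to the first
kernel launch, a sibling branch launching another kernel used a value whose
definition did not dominate it, and the verifier rejected the module with
"Instruction does not dominate all uses!".

  define void @scop(i1 %cond, i32* %A) {
  entry:
    br i1 %cond, label %launch0, label %launch1

  launch0:                          ; preparation used to be emitted here
    %host.ptr = bitcast i32* %A to i8*
    call void @polly_launchKernel(i8* %host.ptr)
    br label %exit

  launch1:                          ; this use is not dominated by the def
    call void @polly_launchKernel(i8* %host.ptr)
    br label %exit

  exit:
    ret void
  }

  declare void @polly_launchKernel(i8*)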

Reviewers: Meinersbur, bollu, singam-sanjay

Subscribers: nemanjai, pollydev, llvm-commits, kbarton

Differential Revision: https://reviews.llvm.org/D36372

Added:
    polly/trunk/test/GPGPU/managed-pointers-preparation.ll
Modified:
    polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
    polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll

Modified: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp?rev=310196&r1=310195&r2=310196&view=diff
==============================================================================
--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp (original)
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp Sun Aug  6 04:10:38 2017
@@ -441,13 +441,10 @@ private:
   /// @returns A tuple with grid sizes for X and Y dimension
   std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel);
 
-  /// Creates a array that can be sent to the kernel on the device using a
-  /// host pointer. This is required for managed memory, when we directly send
-  /// host pointers to the device.
+  /// Get the managed array pointer for sending host pointers to the device.
   /// \note
   /// This is to be used only with managed memory
-  Value *getOrCreateManagedDeviceArray(gpu_array_info *Array,
-                                       ScopArrayInfo *ArrayInfo);
+  Value *getManagedDeviceArray(gpu_array_info *Array, ScopArrayInfo *ArrayInfo);
 
   /// Compute the sizes of the thread blocks for a given kernel.
   ///
@@ -652,6 +649,9 @@ private:
   /// Create code that allocates memory to store arrays on device.
   void allocateDeviceArrays();
 
+  /// Create code to prepare the managed device pointers.
+  void prepareManagedDeviceArrays();
+
   /// Free all allocated device arrays.
   void freeDeviceArrays();
 
@@ -747,6 +747,8 @@ void GPUNodeBuilder::initializeAfterRTH(
 
   if (!ManagedMemory)
     allocateDeviceArrays();
+  else
+    prepareManagedDeviceArrays();
 }
 
 void GPUNodeBuilder::finalize() {
@@ -783,6 +785,32 @@ void GPUNodeBuilder::allocateDeviceArray
   isl_ast_build_free(Build);
 }
 
+void GPUNodeBuilder::prepareManagedDeviceArrays() {
+  assert(ManagedMemory &&
+         "Device array most only be prepared in managed-memory mode");
+  for (int i = 0; i < Prog->n_array; ++i) {
+    gpu_array_info *Array = &Prog->array[i];
+    ScopArrayInfo *ScopArray = (ScopArrayInfo *)Array->user;
+    Value *HostPtr;
+
+    if (gpu_array_is_scalar(Array))
+      HostPtr = BlockGen.getOrCreateAlloca(ScopArray);
+    else
+      HostPtr = ScopArray->getBasePtr();
+    HostPtr = getLatestValue(HostPtr);
+
+    Value *Offset = getArrayOffset(Array);
+    if (Offset) {
+      HostPtr = Builder.CreatePointerCast(
+          HostPtr, ScopArray->getElementType()->getPointerTo());
+      HostPtr = Builder.CreateGEP(HostPtr, Offset);
+    }
+
+    HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
+    DeviceAllocations[ScopArray] = HostPtr;
+  }
+}
+
 void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX,
                                         Value *BlockDimY, Value *BlockDimZ) {
   auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations");
@@ -1102,35 +1130,16 @@ Value *GPUNodeBuilder::getArrayOffset(gp
   return ExprBuilder.create(Result.release());
 }
 
-Value *GPUNodeBuilder::getOrCreateManagedDeviceArray(gpu_array_info *Array,
-                                                     ScopArrayInfo *ArrayInfo) {
-
+Value *GPUNodeBuilder::getManagedDeviceArray(gpu_array_info *Array,
+                                             ScopArrayInfo *ArrayInfo) {
   assert(ManagedMemory && "Only used when you wish to get a host "
                           "pointer for sending data to the kernel, "
                           "with managed memory");
   std::map<ScopArrayInfo *, Value *>::iterator it;
-  if ((it = DeviceAllocations.find(ArrayInfo)) != DeviceAllocations.end()) {
-    return it->second;
-  } else {
-    Value *HostPtr;
-
-    if (gpu_array_is_scalar(Array))
-      HostPtr = BlockGen.getOrCreateAlloca(ArrayInfo);
-    else
-      HostPtr = ArrayInfo->getBasePtr();
-    HostPtr = getLatestValue(HostPtr);
-
-    Value *Offset = getArrayOffset(Array);
-    if (Offset) {
-      HostPtr = Builder.CreatePointerCast(
-          HostPtr, ArrayInfo->getElementType()->getPointerTo());
-      HostPtr = Builder.CreateGEP(HostPtr, Offset);
-    }
-
-    HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
-    DeviceAllocations[ArrayInfo] = HostPtr;
-    return HostPtr;
-  }
+  it = DeviceAllocations.find(ArrayInfo);
+  assert(it != DeviceAllocations.end() &&
+         "Device array expected to be available");
+  return it->second;
 }
 
 void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
@@ -1577,8 +1586,8 @@ GPUNodeBuilder::createLaunchParameters(p
 
     Value *DevArray = nullptr;
     if (ManagedMemory) {
-      DevArray = getOrCreateManagedDeviceArray(
-          &Prog->array[i], const_cast<ScopArrayInfo *>(SAI));
+      DevArray = getManagedDeviceArray(&Prog->array[i],
+                                       const_cast<ScopArrayInfo *>(SAI));
     } else {
       DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];
       DevArray = createCallGetDevicePtr(DevArray);

Modified: polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll?rev=310196&r1=310195&r2=310196&view=diff
==============================================================================
--- polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll (original)
+++ polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll Sun Aug  6 04:10:38 2017
@@ -37,13 +37,13 @@
 
 ; CHECK:       %13 = call i8* @polly_initContextCUDA()
 ; CHECK-NEXT:  %14 = bitcast i32* %A to i8*
-; CHECK-NEXT:  %15 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 0
+; CHECK-NEXT:  %15 = bitcast i32* %R to i8*
+; CHECK-NEXT:  %16 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 0
 ; CHECK-NEXT:  store i8* %14, i8** %polly_launch_0_param_0
-; CHECK-NEXT:  %16 = bitcast i8** %polly_launch_0_param_0 to i8*
-; CHECK-NEXT:  store i8* %16, i8** %15
-; CHECK-NEXT:  %17 = bitcast i32* %R to i8*
+; CHECK-NEXT:  %17 = bitcast i8** %polly_launch_0_param_0 to i8*
+; CHECK-NEXT:  store i8* %17, i8** %16
 ; CHECK-NEXT:  %18 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 1
-; CHECK-NEXT:  store i8* %17, i8** %polly_launch_0_param_1
+; CHECK-NEXT:  store i8* %15, i8** %polly_launch_0_param_1
 ; CHECK-NEXT:  %19 = bitcast i8** %polly_launch_0_param_1 to i8*
 ; CHECK-NEXT:  store i8* %19, i8** %18
 ; CHECK-NEXT:  store i32 4, i32* %polly_launch_0_param_size_0

Added: polly/trunk/test/GPGPU/managed-pointers-preparation.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/managed-pointers-preparation.ll?rev=310196&view=auto
==============================================================================
--- polly/trunk/test/GPGPU/managed-pointers-preparation.ll (added)
+++ polly/trunk/test/GPGPU/managed-pointers-preparation.ll Sun Aug  6 04:10:38 2017
@@ -0,0 +1,105 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
+; RUN: -polly-invariant-load-hoisting \
+; RUN: -S -polly-acc-codegen-managed-memory < %s | FileCheck %s
+
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
+; RUN: -polly-invariant-load-hoisting \
+; RUN: -S -polly-acc-codegen-managed-memory -disable-output \
+; RUN: -polly-acc-dump-code < %s | FileCheck %s -check-prefix=CODE
+
+; CHECK: @polly_launchKernel
+; CHECK: @polly_launchKernel
+; CHECK: @polly_launchKernel
+; CHECK: @polly_launchKernel
+; CHECK: @polly_launchKernel
+; CHECK-NOT: @polly_launchKernel
+
+
+; CODE:  if (p_0_loaded_from___data_runcontrol_MOD_lmulti_layer == 0) {
+; CODE-NEXT:    {
+; CODE-NEXT:      dim3 k0_dimBlock;
+; CODE-NEXT:      dim3 k0_dimGrid;
+; CODE-NEXT:      kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef__pn__phi, p_0_loaded_from___data_runcontrol_MOD_lmulti_layer);
+; CODE-NEXT:      cudaCheckKernel();
+; CODE-NEXT:    }
+
+; CODE:  } else {
+; CODE-NEXT:    {
+; CODE-NEXT:      dim3 k1_dimBlock;
+; CODE-NEXT:      dim3 k1_dimGrid;
+; CODE-NEXT:      kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef__pn__phi, p_0_loaded_from___data_runcontrol_MOD_lmulti_layer);
+; CODE-NEXT:      cudaCheckKernel();
+; CODE-NEXT:    }
+
+; Check that this program is code generated correctly and does not result in
+; 'instruction does not dominate use' errors. Such errors were previously
+; generated because the preparation of the managed memory pointers was
+; performed right before kernel0, which does not dominate all other kernels.
+; Now the preparation is performed at the very beginning of the scop.
+
+source_filename = "bugpoint-output-c78f41e.bc"
+target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at __data_radiation_MOD_rad_csalbw = external global [10 x double], align 32
+ at __data_radiation_MOD_coai = external global [168 x double], align 32
+ at __data_runcontrol_MOD_lmulti_layer = external global i32
+
+; Function Attrs: nounwind uwtable
+define void @__radiation_interface_MOD_radiation_init() #0 {
+entry:
+  br label %"94"
+
+"94":                                             ; preds = %"97", %entry
+  br label %"95"
+
+"95":                                             ; preds = %"95", %"94"
+  br i1 undef, label %"97", label %"95"
+
+"97":                                             ; preds = %"95"
+  br i1 undef, label %"99", label %"94"
+
+"99":                                             ; preds = %"97"
+  br label %"102"
+
+"102":                                            ; preds = %"102", %"99"
+  %indvars.iv17 = phi i64 [ %indvars.iv.next18, %"102" ], [ 1, %"99" ]
+  %0 = getelementptr [168 x double], [168 x double]* @__data_radiation_MOD_coai, i64 0, i64 0
+  store double 1.000000e+00, double* %0, align 8
+  %1 = icmp eq i64 %indvars.iv17, 3
+  %indvars.iv.next18 = add nuw nsw i64 %indvars.iv17, 1
+  br i1 %1, label %"110", label %"102"
+
+"110":                                            ; preds = %"102"
+  %2 = load i32, i32* @__data_runcontrol_MOD_lmulti_layer, align 4, !range !0
+  %3 = icmp eq i32 %2, 0
+  br i1 %3, label %"112", label %"111"
+
+"111":                                            ; preds = %"110"
+  br label %"115"
+
+"112":                                            ; preds = %"110"
+  br label %"115"
+
+"115":                                            ; preds = %"112", %"111"
+  %.pn = phi double [ undef, %"112" ], [ undef, %"111" ]
+  %4 = fdiv double 1.000000e+00, %.pn
+  br label %"116"
+
+"116":                                            ; preds = %"116", %"115"
+  %indvars.iv = phi i64 [ %indvars.iv.next, %"116" ], [ 1, %"115" ]
+  %5 = add nsw i64 %indvars.iv, -1
+  %6 = fmul double %4, undef
+  %7 = getelementptr [10 x double], [10 x double]* @__data_radiation_MOD_rad_csalbw, i64 0, i64 %5
+  store double %6, double* %7, align 8
+  %8 = icmp eq i64 %indvars.iv, 10
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br i1 %8, label %return, label %"116"
+
+return:                                           ; preds = %"116"
+  ret void
+}
+
+attributes #0 = { nounwind uwtable }
+
+!0 = !{i32 0, i32 2}
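
The new test exercises exactly this shape: two kernels in the branches of a
conditional on lmulti_layer. Sketched in the same simplified, hypothetical IR
as above, the structure the fix produces hoists the preparation into the
block that starts the scop, which dominates both launches:

  define void @scop(i1 %lmulti_layer, i32* %A) {
  polly.start:                      ; preparation happens once, at scop entry
    %host.ptr = bitcast i32* %A to i8*
    br i1 %lmulti_layer, label %launch0, label %launch1

  launch0:                          ; kernel0 launch, dominated by polly.start
    call void @polly_launchKernel(i8* %host.ptr)
    br label %exit

  launch1:                          ; kernel1 launch, equally dominated
    call void @polly_launchKernel(i8* %host.ptr)
    br label %exit

  exit:
    ret void
  }

  declare void @polly_launchKernel(i8*)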
