[polly] r301640 - [Polly] [PPCGCodeGeneration] Add managed memory support to GPU code

Siddharth Bhat via llvm-commits llvm-commits at lists.llvm.org
Fri Apr 28 04:16:31 PDT 2017


Author: bollu
Date: Fri Apr 28 06:16:30 2017
New Revision: 301640

URL: http://llvm.org/viewvc/llvm-project?rev=301640&view=rev
Log:
[Polly] [PPCGCodeGeneration] Add managed memory support to GPU code
generation.

This needs changes to GPURuntime to expose synchronization between host
and device.

1. Needs better function naming; I want a better name than
"getOrCreateManagedDeviceArray".

2. DeviceAllocations is used by both the managed memory and the
non-managed memory path. This exploits the fact that the two code paths
are never run together. I'm not sure if this is the best design decision

Reviewed by: PhilippSchaad

Tags: #polly

Differential Revision: https://reviews.llvm.org/D32215

Added:
    polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll
Modified:
    polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
    polly/trunk/tools/GPURuntime/GPUJIT.c
    polly/trunk/tools/GPURuntime/GPUJIT.h

Modified: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp?rev=301640&r1=301639&r2=301640&view=diff
==============================================================================
--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp (original)
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp Fri Apr 28 06:16:30 2017
@@ -86,6 +86,13 @@ static cl::opt<bool> PrivateMemory("poll
                                    cl::init(false), cl::ZeroOrMore,
                                    cl::cat(PollyCategory));
 
+static cl::opt<bool> ManagedMemory("polly-acc-codegen-managed-memory",
+                                   cl::desc("Generate Host kernel code assuming"
+                                            " that all memory has been"
+                                            " declared as managed memory"),
+                                   cl::Hidden, cl::init(false), cl::ZeroOrMore,
+                                   cl::cat(PollyCategory));
+
 static cl::opt<std::string>
     CudaVersion("polly-acc-cuda-version",
                 cl::desc("The CUDA version to compile for"), cl::Hidden,
@@ -243,6 +250,14 @@ private:
   /// @returns A tuple with grid sizes for X and Y dimension
   std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel);
 
+  /// Creates an array that can be sent to the kernel on the device using a
+  /// host pointer. This is required for managed memory, when we directly send
+  /// host pointers to the device.
+  /// \note
+  /// This is to be used only with managed memory
+  Value *getOrCreateManagedDeviceArray(gpu_array_info *Array,
+                                       ScopArrayInfo *ArrayInfo);
+
   /// Compute the sizes of the thread blocks for a given kernel.
   ///
   /// @param Kernel The kernel to compute thread block sizes for.
@@ -449,6 +464,11 @@ private:
   void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
                                       Value *Size);
 
+  /// Create a call to synchronize Host & Device.
+  /// \note
+  /// This is to be used only with managed memory.
+  void createCallSynchronizeDevice();
+
   /// Create a call to get a kernel from an assembly string.
   ///
   /// @param Buffer The string describing the kernel.
@@ -485,16 +505,22 @@ void GPUNodeBuilder::initializeAfterRTH(
   Builder.SetInsertPoint(&NewBB->front());
 
   GPUContext = createCallInitContext();
-  allocateDeviceArrays();
+
+  if (!ManagedMemory)
+    allocateDeviceArrays();
 }
 
 void GPUNodeBuilder::finalize() {
-  freeDeviceArrays();
+  if (!ManagedMemory)
+    freeDeviceArrays();
+
   createCallFreeContext(GPUContext);
   IslNodeBuilder::finalize();
 }
 
 void GPUNodeBuilder::allocateDeviceArrays() {
+  assert(!ManagedMemory && "Managed memory will directly send host pointers "
+                           "to the kernel. There is no need for device arrays");
   isl_ast_build *Build = isl_ast_build_from_context(S.getContext());
 
   for (int i = 0; i < Prog->n_array; ++i) {
@@ -540,6 +566,7 @@ void GPUNodeBuilder::addCUDAAnnotations(
 }
 
 void GPUNodeBuilder::freeDeviceArrays() {
+  assert(!ManagedMemory && "Managed memory does not use device arrays");
   for (auto &Array : DeviceAllocations)
     createCallFreeDeviceMemory(Array.second);
 }
@@ -624,6 +651,8 @@ void GPUNodeBuilder::createCallFreeKerne
 }
 
 void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
+  assert(!ManagedMemory && "Managed memory does not allocate or free memory "
+                           "for device");
   const char *Name = "polly_freeDeviceMemory";
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Function *F = M->getFunction(Name);
@@ -641,6 +670,8 @@ void GPUNodeBuilder::createCallFreeDevic
 }
 
 Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) {
+  assert(!ManagedMemory && "Managed memory does not allocate or free memory "
+                           "for device");
   const char *Name = "polly_allocateMemoryForDevice";
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Function *F = M->getFunction(Name);
@@ -660,6 +691,8 @@ Value *GPUNodeBuilder::createCallAllocat
 void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData,
                                                     Value *DeviceData,
                                                     Value *Size) {
+  assert(!ManagedMemory && "Managed memory does not transfer memory between "
+                           "device and host");
   const char *Name = "polly_copyFromHostToDevice";
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Function *F = M->getFunction(Name);
@@ -681,6 +714,8 @@ void GPUNodeBuilder::createCallCopyFromH
 void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData,
                                                     Value *HostData,
                                                     Value *Size) {
+  assert(!ManagedMemory && "Managed memory does not transfer memory between "
+                           "device and host");
   const char *Name = "polly_copyFromDeviceToHost";
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Function *F = M->getFunction(Name);
@@ -699,6 +734,23 @@ void GPUNodeBuilder::createCallCopyFromD
   Builder.CreateCall(F, {DeviceData, HostData, Size});
 }
 
+void GPUNodeBuilder::createCallSynchronizeDevice() {
+  assert(ManagedMemory && "explicit synchronization is only necessary for "
+                          "managed memory");
+  const char *Name = "polly_synchronizeDevice";
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Function *F = M->getFunction(Name);
+
+  // If F is not available, declare it.
+  if (!F) {
+    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
+    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
+    F = Function::Create(Ty, Linkage, Name, M);
+  }
+
+  Builder.CreateCall(F);
+}
+
 Value *GPUNodeBuilder::createCallInitContext() {
   const char *Name = "polly_initContext";
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
@@ -805,8 +857,39 @@ Value *GPUNodeBuilder::getArrayOffset(gp
   return ResultValue;
 }
 
+Value *GPUNodeBuilder::getOrCreateManagedDeviceArray(gpu_array_info *Array,
+                                                     ScopArrayInfo *ArrayInfo) {
+
+  assert(ManagedMemory && "Only used when you wish to get a host "
+                          "pointer for sending data to the kernel, "
+                          "with managed memory");
+  std::map<ScopArrayInfo *, Value *>::iterator it;
+  if ((it = DeviceAllocations.find(ArrayInfo)) != DeviceAllocations.end()) {
+    return it->second;
+  } else {
+    Value *HostPtr;
+
+    if (gpu_array_is_scalar(Array))
+      HostPtr = BlockGen.getOrCreateAlloca(ArrayInfo);
+    else
+      HostPtr = ArrayInfo->getBasePtr();
+
+    Value *Offset = getArrayOffset(Array);
+    if (Offset) {
+      HostPtr = Builder.CreatePointerCast(
+          HostPtr, ArrayInfo->getElementType()->getPointerTo());
+      HostPtr = Builder.CreateGEP(HostPtr, Offset);
+    }
+
+    HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
+    DeviceAllocations[ArrayInfo] = HostPtr;
+    return HostPtr;
+  }
+}
+
 void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
                                         enum DataDirection Direction) {
+  assert(!ManagedMemory && "Managed memory needs no data transfers");
   isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt);
   isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0);
   isl_id *Id = isl_ast_expr_get_id(Arg);
@@ -864,13 +947,22 @@ void GPUNodeBuilder::createUser(__isl_ta
   }
 
   if (isPrefix(Str, "to_device")) {
-    createDataTransfer(UserStmt, HOST_TO_DEVICE);
+    if (!ManagedMemory)
+      createDataTransfer(UserStmt, HOST_TO_DEVICE);
+    else
+      isl_ast_node_free(UserStmt);
+
     isl_ast_expr_free(Expr);
     return;
   }
 
   if (isPrefix(Str, "from_device")) {
-    createDataTransfer(UserStmt, DEVICE_TO_HOST);
+    if (!ManagedMemory) {
+      createDataTransfer(UserStmt, DEVICE_TO_HOST);
+    } else {
+      createCallSynchronizeDevice();
+      isl_ast_node_free(UserStmt);
+    }
     isl_ast_expr_free(Expr);
     return;
   }
@@ -1096,9 +1188,16 @@ GPUNodeBuilder::createLaunchParameters(p
     isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
     const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
 
-    Value *DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];
-    DevArray = createCallGetDevicePtr(DevArray);
-
+    Value *DevArray = nullptr;
+    if (ManagedMemory) {
+      DevArray = getOrCreateManagedDeviceArray(
+          &Prog->array[i], const_cast<ScopArrayInfo *>(SAI));
+    } else {
+      DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];
+      DevArray = createCallGetDevicePtr(DevArray);
+    }
+    assert(DevArray != nullptr && "Array to be offloaded to device not "
+                                  "initialized");
     Value *Offset = getArrayOffset(&Prog->array[i]);
 
     if (Offset) {
@@ -1111,7 +1210,14 @@ GPUNodeBuilder::createLaunchParameters(p
         Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
 
     if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
-      Value *ValPtr = BlockGen.getOrCreateAlloca(SAI);
+      Value *ValPtr = nullptr;
+      if (ManagedMemory)
+        ValPtr = DevArray;
+      else
+        ValPtr = BlockGen.getOrCreateAlloca(SAI);
+
+      assert(ValPtr != nullptr && "ValPtr that should point to a valid object"
+                                  " to be stored into Parameters");
       Value *ValPtrCast =
           Builder.CreatePointerCast(ValPtr, Builder.getInt8PtrTy());
       Builder.CreateStore(ValPtrCast, Slot);

Added: polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll?rev=301640&view=auto
==============================================================================
--- polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll (added)
+++ polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll Fri Apr 28 06:16:30 2017
@@ -0,0 +1,118 @@
+; RUN: opt %loadPolly -S  -polly-process-unprofitable -polly-acc-mincompute=0 -polly-target=gpu  -polly-codegen-ppcg -polly-acc-codegen-managed-memory < %s | \
+; RUN: FileCheck %s
+
+; REQUIRES: pollyacc
+
+;
+;    #include <cuda_runtime.h>
+;
+;    static const int N = 45;
+;
+;    void copy(int *R, int *A) {
+;      for (int i = 0; i < N; i++) {
+;        R[i] = A[i] * 10;
+;      }
+;    }
+;
+;    int main() {
+;      int *A, *R;
+;
+;      cudaMallocManaged((void **)(&A), sizeof(int) * N, cudaMemAttachGlobal);
+;      cudaMallocManaged((void **)(&R), sizeof(int) * N, cudaMemAttachGlobal);
+;
+;      for (int i = 0; i < N; i++) {
+;        A[i] = i;
+;        R[i] = 0;
+;      }
+;      copy(R, A);
+;
+;      return 0;
+;    }
+;
+
+; CHECK-NOT: polly_copyFromHostToDevice
+; CHECK-NOT: polly_copyFromDeviceToHost
+; CHECK-NOT: polly_freeDeviceMemory
+; CHECK-NOT: polly_allocateMemoryForDevice
+
+; CHECK:       %13 = call i8* @polly_initContext()
+; CHECK-NEXT:  %14 = bitcast i32* %A to i8*
+; CHECK-NEXT:  %15 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
+; CHECK-NEXT:  store i8* %14, i8** %polly_launch_0_param_0
+; CHECK-NEXT:  %16 = bitcast i8** %polly_launch_0_param_0 to i8*
+; CHECK-NEXT:  store i8* %16, i8** %15
+; CHECK-NEXT:  %17 = bitcast i32* %R to i8*
+; CHECK-NEXT:  %18 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
+; CHECK-NEXT:  store i8* %17, i8** %polly_launch_0_param_1
+; CHECK-NEXT:  %19 = bitcast i8** %polly_launch_0_param_1 to i8*
+; CHECK-NEXT:  store i8* %19, i8** %18
+; CHECK-NEXT:  %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([820 x i8], [820 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))
+; CHECK-NEXT:  call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
+; CHECK-NEXT:  call void @polly_freeKernel(i8* %20)
+; CHECK-NEXT:  call void @polly_synchronizeDevice()
+; CHECK-NEXT:  call void @polly_freeContext(i8* %13)
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @copy(i32* %R, i32* %A) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %exitcond = icmp ne i64 %indvars.iv, 45
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %tmp = load i32, i32* %arrayidx, align 4
+  %mul = mul nsw i32 %tmp, 10
+  %arrayidx2 = getelementptr inbounds i32, i32* %R, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+define i32 @main() {
+entry:
+  %A = alloca i32*, align 8
+  %R = alloca i32*, align 8
+  %tmp = bitcast i32** %A to i8**
+  %call = call i32 @cudaMallocManaged(i8** nonnull %tmp, i64 180, i32 1) #2
+  %tmp1 = bitcast i32** %R to i8**
+  %call1 = call i32 @cudaMallocManaged(i8** nonnull %tmp1, i64 180, i32 1) #2
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %exitcond = icmp ne i64 %indvars.iv, 45
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %tmp2 = load i32*, i32** %A, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %tmp2, i64 %indvars.iv
+  %tmp3 = trunc i64 %indvars.iv to i32
+  store i32 %tmp3, i32* %arrayidx, align 4
+  %tmp4 = load i32*, i32** %R, align 8
+  %arrayidx3 = getelementptr inbounds i32, i32* %tmp4, i64 %indvars.iv
+  store i32 0, i32* %arrayidx3, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %tmp5 = load i32*, i32** %R, align 8
+  %tmp6 = load i32*, i32** %A, align 8
+  call void @copy(i32* %tmp5, i32* %tmp6)
+  ret i32 0
+}
+
+declare i32 @cudaMallocManaged(i8**, i64, i32) #1

Modified: polly/trunk/tools/GPURuntime/GPUJIT.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/tools/GPURuntime/GPUJIT.c?rev=301640&r1=301639&r2=301640&view=diff
==============================================================================
--- polly/trunk/tools/GPURuntime/GPUJIT.c (original)
+++ polly/trunk/tools/GPURuntime/GPUJIT.c Fri Apr 28 06:16:30 2017
@@ -130,6 +130,9 @@ static CuLinkCompleteFcnTy *CuLinkComple
 typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state);
 static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
 
+typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy();
+static CuCtxSynchronizeFcnTy *CuCtxSynchronizeFcnPtr;
+
 /* Type-defines of function pointer to CUDA runtime APIs. */
 typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
 static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
@@ -233,6 +236,9 @@ static int initialDeviceAPIs() {
   CuLinkDestroyFcnPtr =
       (CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy");
 
+  CuCtxSynchronizeFcnPtr =
+      (CuCtxSynchronizeFcnTy *)getAPIHandle(HandleCuda, "cuCtxSynchronize");
+
   /* Get function pointer to CUDA Runtime APIs. */
   CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle(
       HandleCudaRT, "cudaThreadSynchronize");
@@ -436,6 +442,13 @@ void polly_copyFromDeviceToHost(PollyGPU
     exit(-1);
   }
 }
+void polly_synchronizeDevice() {
+  dump_function();
+  if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
+    fprintf(stdout, "Synchronizing device and host memory failed.\n");
+    exit(-1);
+  }
+}
 
 void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
                         unsigned int GridDimY, unsigned int BlockDimX,

Modified: polly/trunk/tools/GPURuntime/GPUJIT.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/tools/GPURuntime/GPUJIT.h?rev=301640&r1=301639&r2=301640&view=diff
==============================================================================
--- polly/trunk/tools/GPURuntime/GPUJIT.h (original)
+++ polly/trunk/tools/GPURuntime/GPUJIT.h Fri Apr 28 06:16:30 2017
@@ -88,6 +88,7 @@ void polly_copyFromHostToDevice(void *Ho
                                 long MemSize);
 void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
                                 long MemSize);
+void polly_synchronizeDevice();
 void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
                         unsigned int GridDimY, unsigned int BlockSizeX,
                         unsigned int BlockSizeY, unsigned int BlockSizeZ,




More information about the llvm-commits mailing list