[polly] r301640 - [Polly] [PPCGCodeGeneration] Add managed memory support to GPU code
Siddharth Bhat via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 28 04:16:31 PDT 2017
Author: bollu
Date: Fri Apr 28 06:16:30 2017
New Revision: 301640
URL: http://llvm.org/viewvc/llvm-project?rev=301640&view=rev
Log:
[Polly] [PPCGCodeGeneration] Add managed memory support to GPU code
generation.
This needs changes to GPURuntime to expose synchronization between host
and device.
1. Needs better function naming, I want a better name than
"getOrCreateManagedDeviceArray"
2. DeviceAllocations is used by both the managed memory and the
non-managed memory path. This exploits the fact that the two code paths
are never run together. I'm not sure if this is the best design decision.
Reviewed by: PhilippSchaad
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32215
Added:
polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll
Modified:
polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
polly/trunk/tools/GPURuntime/GPUJIT.c
polly/trunk/tools/GPURuntime/GPUJIT.h
Modified: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp?rev=301640&r1=301639&r2=301640&view=diff
==============================================================================
--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp (original)
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp Fri Apr 28 06:16:30 2017
@@ -86,6 +86,13 @@ static cl::opt<bool> PrivateMemory("poll
cl::init(false), cl::ZeroOrMore,
cl::cat(PollyCategory));
+static cl::opt<bool> ManagedMemory("polly-acc-codegen-managed-memory",
+ cl::desc("Generate Host kernel code assuming"
+ " that all memory has been"
+ " declared as managed memory"),
+ cl::Hidden, cl::init(false), cl::ZeroOrMore,
+ cl::cat(PollyCategory));
+
static cl::opt<std::string>
CudaVersion("polly-acc-cuda-version",
cl::desc("The CUDA version to compile for"), cl::Hidden,
@@ -243,6 +250,14 @@ private:
/// @returns A tuple with grid sizes for X and Y dimension
std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel);
+ /// Creates an array that can be sent to the kernel on the device using a
+ /// host pointer. This is required for managed memory, when we directly send
+ /// host pointers to the device.
+ /// \note
+ /// This is to be used only with managed memory
+ Value *getOrCreateManagedDeviceArray(gpu_array_info *Array,
+ ScopArrayInfo *ArrayInfo);
+
/// Compute the sizes of the thread blocks for a given kernel.
///
/// @param Kernel The kernel to compute thread block sizes for.
@@ -449,6 +464,11 @@ private:
void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
Value *Size);
+ /// Create a call to synchronize Host & Device.
+ /// \note
+ /// This is to be used only with managed memory.
+ void createCallSynchronizeDevice();
+
/// Create a call to get a kernel from an assembly string.
///
/// @param Buffer The string describing the kernel.
@@ -485,16 +505,22 @@ void GPUNodeBuilder::initializeAfterRTH(
Builder.SetInsertPoint(&NewBB->front());
GPUContext = createCallInitContext();
- allocateDeviceArrays();
+
+ if (!ManagedMemory)
+ allocateDeviceArrays();
}
void GPUNodeBuilder::finalize() {
- freeDeviceArrays();
+ if (!ManagedMemory)
+ freeDeviceArrays();
+
createCallFreeContext(GPUContext);
IslNodeBuilder::finalize();
}
void GPUNodeBuilder::allocateDeviceArrays() {
+ assert(!ManagedMemory && "Managed memory will directly send host pointers "
+ "to the kernel. There is no need for device arrays");
isl_ast_build *Build = isl_ast_build_from_context(S.getContext());
for (int i = 0; i < Prog->n_array; ++i) {
@@ -540,6 +566,7 @@ void GPUNodeBuilder::addCUDAAnnotations(
}
void GPUNodeBuilder::freeDeviceArrays() {
+ assert(!ManagedMemory && "Managed memory does not use device arrays");
for (auto &Array : DeviceAllocations)
createCallFreeDeviceMemory(Array.second);
}
@@ -624,6 +651,8 @@ void GPUNodeBuilder::createCallFreeKerne
}
void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
+ assert(!ManagedMemory && "Managed memory does not allocate or free memory "
+ "for device");
const char *Name = "polly_freeDeviceMemory";
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Function *F = M->getFunction(Name);
@@ -641,6 +670,8 @@ void GPUNodeBuilder::createCallFreeDevic
}
Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) {
+ assert(!ManagedMemory && "Managed memory does not allocate or free memory "
+ "for device");
const char *Name = "polly_allocateMemoryForDevice";
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Function *F = M->getFunction(Name);
@@ -660,6 +691,8 @@ Value *GPUNodeBuilder::createCallAllocat
void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData,
Value *DeviceData,
Value *Size) {
+ assert(!ManagedMemory && "Managed memory does not transfer memory between "
+ "device and host");
const char *Name = "polly_copyFromHostToDevice";
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Function *F = M->getFunction(Name);
@@ -681,6 +714,8 @@ void GPUNodeBuilder::createCallCopyFromH
void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData,
Value *HostData,
Value *Size) {
+ assert(!ManagedMemory && "Managed memory does not transfer memory between "
+ "device and host");
const char *Name = "polly_copyFromDeviceToHost";
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Function *F = M->getFunction(Name);
@@ -699,6 +734,23 @@ void GPUNodeBuilder::createCallCopyFromD
Builder.CreateCall(F, {DeviceData, HostData, Size});
}
+void GPUNodeBuilder::createCallSynchronizeDevice() {
+ assert(ManagedMemory && "explicit synchronization is only necessary for "
+ "managed memory");
+ const char *Name = "polly_synchronizeDevice";
+ Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+ Function *F = M->getFunction(Name);
+
+ // If F is not available, declare it.
+ if (!F) {
+ GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
+ FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
+ F = Function::Create(Ty, Linkage, Name, M);
+ }
+
+ Builder.CreateCall(F);
+}
+
Value *GPUNodeBuilder::createCallInitContext() {
const char *Name = "polly_initContext";
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
@@ -805,8 +857,39 @@ Value *GPUNodeBuilder::getArrayOffset(gp
return ResultValue;
}
+Value *GPUNodeBuilder::getOrCreateManagedDeviceArray(gpu_array_info *Array,
+ ScopArrayInfo *ArrayInfo) {
+
+ assert(ManagedMemory && "Only used when you wish to get a host "
+ "pointer for sending data to the kernel, "
+ "with managed memory");
+ std::map<ScopArrayInfo *, Value *>::iterator it;
+ if ((it = DeviceAllocations.find(ArrayInfo)) != DeviceAllocations.end()) {
+ return it->second;
+ } else {
+ Value *HostPtr;
+
+ if (gpu_array_is_scalar(Array))
+ HostPtr = BlockGen.getOrCreateAlloca(ArrayInfo);
+ else
+ HostPtr = ArrayInfo->getBasePtr();
+
+ Value *Offset = getArrayOffset(Array);
+ if (Offset) {
+ HostPtr = Builder.CreatePointerCast(
+ HostPtr, ArrayInfo->getElementType()->getPointerTo());
+ HostPtr = Builder.CreateGEP(HostPtr, Offset);
+ }
+
+ HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
+ DeviceAllocations[ArrayInfo] = HostPtr;
+ return HostPtr;
+ }
+}
+
void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
enum DataDirection Direction) {
+ assert(!ManagedMemory && "Managed memory needs no data transfers");
isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt);
isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0);
isl_id *Id = isl_ast_expr_get_id(Arg);
@@ -864,13 +947,22 @@ void GPUNodeBuilder::createUser(__isl_ta
}
if (isPrefix(Str, "to_device")) {
- createDataTransfer(UserStmt, HOST_TO_DEVICE);
+ if (!ManagedMemory)
+ createDataTransfer(UserStmt, HOST_TO_DEVICE);
+ else
+ isl_ast_node_free(UserStmt);
+
isl_ast_expr_free(Expr);
return;
}
if (isPrefix(Str, "from_device")) {
- createDataTransfer(UserStmt, DEVICE_TO_HOST);
+ if (!ManagedMemory) {
+ createDataTransfer(UserStmt, DEVICE_TO_HOST);
+ } else {
+ createCallSynchronizeDevice();
+ isl_ast_node_free(UserStmt);
+ }
isl_ast_expr_free(Expr);
return;
}
@@ -1096,9 +1188,16 @@ GPUNodeBuilder::createLaunchParameters(p
isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
- Value *DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];
- DevArray = createCallGetDevicePtr(DevArray);
-
+ Value *DevArray = nullptr;
+ if (ManagedMemory) {
+ DevArray = getOrCreateManagedDeviceArray(
+ &Prog->array[i], const_cast<ScopArrayInfo *>(SAI));
+ } else {
+ DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];
+ DevArray = createCallGetDevicePtr(DevArray);
+ }
+ assert(DevArray != nullptr && "Array to be offloaded to device not "
+ "initialized");
Value *Offset = getArrayOffset(&Prog->array[i]);
if (Offset) {
@@ -1111,7 +1210,14 @@ GPUNodeBuilder::createLaunchParameters(p
Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
- Value *ValPtr = BlockGen.getOrCreateAlloca(SAI);
+ Value *ValPtr = nullptr;
+ if (ManagedMemory)
+ ValPtr = DevArray;
+ else
+ ValPtr = BlockGen.getOrCreateAlloca(SAI);
+
+ assert(ValPtr != nullptr && "ValPtr that should point to a valid object"
+ " to be stored into Parameters");
Value *ValPtrCast =
Builder.CreatePointerCast(ValPtr, Builder.getInt8PtrTy());
Builder.CreateStore(ValPtrCast, Slot);
Added: polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll?rev=301640&view=auto
==============================================================================
--- polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll (added)
+++ polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll Fri Apr 28 06:16:30 2017
@@ -0,0 +1,118 @@
+; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 -polly-target=gpu -polly-codegen-ppcg -polly-acc-codegen-managed-memory < %s | \
+; RUN: FileCheck %s
+
+; REQUIRES: pollyacc
+
+;
+; #include <cuda_runtime.h>
+;
+; static const int N = 45;
+;
+; void copy(int *R, int *A) {
+; for (int i = 0; i < N; i++) {
+; R[i] = A[i] * 10;
+; }
+; }
+;
+; int main() {
+; int *A, *R;
+;
+; cudaMallocManaged((void **)(&A), sizeof(int) * N, cudaMemAttachGlobal);
+; cudaMallocManaged((void **)(&R), sizeof(int) * N, cudaMemAttachGlobal);
+;
+; for (int i = 0; i < N; i++) {
+; A[i] = i;
+; R[i] = 0;
+; }
+; copy(R, A);
+;
+; return 0;
+; }
+;
+
+; CHECK-NOT: polly_copyFromHostToDevice
+; CHECK-NOT: polly_copyFromDeviceToHost
+; CHECK-NOT: polly_freeDeviceMemory
+; CHECK-NOT: polly_allocateMemoryForDevice
+
+; CHECK: %13 = call i8* @polly_initContext()
+; CHECK-NEXT: %14 = bitcast i32* %A to i8*
+; CHECK-NEXT: %15 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
+; CHECK-NEXT: store i8* %14, i8** %polly_launch_0_param_0
+; CHECK-NEXT: %16 = bitcast i8** %polly_launch_0_param_0 to i8*
+; CHECK-NEXT: store i8* %16, i8** %15
+; CHECK-NEXT: %17 = bitcast i32* %R to i8*
+; CHECK-NEXT: %18 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
+; CHECK-NEXT: store i8* %17, i8** %polly_launch_0_param_1
+; CHECK-NEXT: %19 = bitcast i8** %polly_launch_0_param_1 to i8*
+; CHECK-NEXT: store i8* %19, i8** %18
+; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([820 x i8], [820 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))
+; CHECK-NEXT: call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
+; CHECK-NEXT: call void @polly_freeKernel(i8* %20)
+; CHECK-NEXT: call void @polly_synchronizeDevice()
+; CHECK-NEXT: call void @polly_freeContext(i8* %13)
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @copy(i32* %R, i32* %A) {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+ %exitcond = icmp ne i64 %indvars.iv, 45
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %tmp = load i32, i32* %arrayidx, align 4
+ %mul = mul nsw i32 %tmp, 10
+ %arrayidx2 = getelementptr inbounds i32, i32* %R, i64 %indvars.iv
+ store i32 %mul, i32* %arrayidx2, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+define i32 @main() {
+entry:
+ %A = alloca i32*, align 8
+ %R = alloca i32*, align 8
+ %tmp = bitcast i32** %A to i8**
+ %call = call i32 @cudaMallocManaged(i8** nonnull %tmp, i64 180, i32 1) #2
+ %tmp1 = bitcast i32** %R to i8**
+ %call1 = call i32 @cudaMallocManaged(i8** nonnull %tmp1, i64 180, i32 1) #2
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+ %exitcond = icmp ne i64 %indvars.iv, 45
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %tmp2 = load i32*, i32** %A, align 8
+ %arrayidx = getelementptr inbounds i32, i32* %tmp2, i64 %indvars.iv
+ %tmp3 = trunc i64 %indvars.iv to i32
+ store i32 %tmp3, i32* %arrayidx, align 4
+ %tmp4 = load i32*, i32** %R, align 8
+ %arrayidx3 = getelementptr inbounds i32, i32* %tmp4, i64 %indvars.iv
+ store i32 0, i32* %arrayidx3, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %tmp5 = load i32*, i32** %R, align 8
+ %tmp6 = load i32*, i32** %A, align 8
+ call void @copy(i32* %tmp5, i32* %tmp6)
+ ret i32 0
+}
+
+declare i32 @cudaMallocManaged(i8**, i64, i32) #1
Modified: polly/trunk/tools/GPURuntime/GPUJIT.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/tools/GPURuntime/GPUJIT.c?rev=301640&r1=301639&r2=301640&view=diff
==============================================================================
--- polly/trunk/tools/GPURuntime/GPUJIT.c (original)
+++ polly/trunk/tools/GPURuntime/GPUJIT.c Fri Apr 28 06:16:30 2017
@@ -130,6 +130,9 @@ static CuLinkCompleteFcnTy *CuLinkComple
typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state);
static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
+typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy();
+static CuCtxSynchronizeFcnTy *CuCtxSynchronizeFcnPtr;
+
/* Type-defines of function pointers to CUDA runtime APIs. */
typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
@@ -233,6 +236,9 @@ static int initialDeviceAPIs() {
CuLinkDestroyFcnPtr =
(CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy");
+ CuCtxSynchronizeFcnPtr =
+ (CuCtxSynchronizeFcnTy *)getAPIHandle(HandleCuda, "cuCtxSynchronize");
+
/* Get function pointer to CUDA Runtime APIs. */
CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle(
HandleCudaRT, "cudaThreadSynchronize");
@@ -436,6 +442,13 @@ void polly_copyFromDeviceToHost(PollyGPU
exit(-1);
}
}
+void polly_synchronizeDevice() {
+ dump_function();
+ if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
+ fprintf(stdout, "Synchronizing device and host memory failed.\n");
+ exit(-1);
+ }
+}
void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
unsigned int GridDimY, unsigned int BlockDimX,
Modified: polly/trunk/tools/GPURuntime/GPUJIT.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/tools/GPURuntime/GPUJIT.h?rev=301640&r1=301639&r2=301640&view=diff
==============================================================================
--- polly/trunk/tools/GPURuntime/GPUJIT.h (original)
+++ polly/trunk/tools/GPURuntime/GPUJIT.h Fri Apr 28 06:16:30 2017
@@ -88,6 +88,7 @@ void polly_copyFromHostToDevice(void *Ho
long MemSize);
void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
long MemSize);
+void polly_synchronizeDevice();
void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
unsigned int GridDimY, unsigned int BlockSizeX,
unsigned int BlockSizeY, unsigned int BlockSizeZ,
More information about the llvm-commits
mailing list