[polly] r276645 - GPGPU: Load GPU kernels
Tobias Grosser via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 25 09:31:22 PDT 2016
Author: grosser
Date: Mon Jul 25 11:31:21 2016
New Revision: 276645
URL: http://llvm.org/viewvc/llvm-project?rev=276645&view=rev
Log:
GPGPU: Load GPU kernels
We embed the PTX code into the host IR as a global variable and compile it
at run-time into a GPU kernel.
Modified:
polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
polly/trunk/test/GPGPU/double-parallel-loop.ll
polly/trunk/test/GPGPU/host-control-flow.ll
polly/trunk/tools/GPURuntime/GPUJIT.c
polly/trunk/tools/GPURuntime/GPUJIT.h
Modified: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp?rev=276645&r1=276644&r2=276645&view=diff
==============================================================================
--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp (original)
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp Mon Jul 25 11:31:21 2016
@@ -281,7 +281,9 @@ private:
///
/// Free the LLVM-IR module corresponding to the kernel and -- if requested --
/// dump its IR to stderr.
- void finalizeKernelFunction();
+ ///
+ /// @returns The Assembly string of the kernel.
+ std::string finalizeKernelFunction();
/// Create code that allocates memory to store arrays on device.
void allocateDeviceArrays();
@@ -324,6 +326,19 @@ private:
/// @param HostPtr A host pointer specifying the location to copy to.
void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
Value *Size);
+
+ /// Create a call to get a kernel from an assembly string.
+ ///
+ /// @param Buffer The string describing the kernel.
+ /// @param Entry The name of the kernel function to call.
+ ///
+ /// @returns A pointer to a kernel object
+ Value *createCallGetKernel(Value *Buffer, Value *Entry);
+
+ /// Create a call to free a GPU kernel.
+ ///
+ /// @param GPUKernel The kernel to free.
+ void createCallFreeKernel(Value *GPUKernel);
};
void GPUNodeBuilder::initializeAfterRTH() {
@@ -360,6 +375,41 @@ void GPUNodeBuilder::freeDeviceArrays()
createCallFreeDeviceMemory(Array.second);
}
+Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) {
+  Module *HostModule = Builder.GetInsertBlock()->getParent()->getParent();
+  const char *RuntimeFn = "polly_getKernel";
+
+  // Reuse an existing declaration of the runtime function; otherwise declare
+  // it in the host module as an external function i8* (i8*, i8*).
+  Function *Callee = HostModule->getFunction(RuntimeFn);
+  if (!Callee) {
+    Type *I8Ptr = Builder.getInt8PtrTy();
+    std::vector<Type *> ParamTypes(2, I8Ptr);
+    FunctionType *Signature = FunctionType::get(I8Ptr, ParamTypes, false);
+    Callee = Function::Create(Signature, Function::ExternalLinkage, RuntimeFn,
+                              HostModule);
+  }
+
+  // Emit the call at the current insert point and hand back its result.
+  return Builder.CreateCall(Callee, {Buffer, Entry});
+}
+
+void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) {
+  Module *HostModule = Builder.GetInsertBlock()->getParent()->getParent();
+  const char *RuntimeFn = "polly_freeKernel";
+
+  // Look the runtime function up in the host module, declaring it as an
+  // external function void (i8*) when it is not there yet.
+  Function *Callee = HostModule->getFunction(RuntimeFn);
+  if (!Callee) {
+    std::vector<Type *> ParamTypes(1, Builder.getInt8PtrTy());
+    FunctionType *Signature =
+        FunctionType::get(Builder.getVoidTy(), ParamTypes, false);
+    Callee = Function::Create(Signature, Function::ExternalLinkage, RuntimeFn,
+                              HostModule);
+  }
+
+  // Emit the call at the current insert point; there is no result to return.
+  Builder.CreateCall(Callee, {GPUKernel});
+}
+
void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
const char *Name = "polly_freeDeviceMemory";
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
@@ -755,7 +805,12 @@ void GPUNodeBuilder::createKernel(__isl_
S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array);
LocalArrays.clear();
- finalizeKernelFunction();
+ std::string ASMString = finalizeKernelFunction();
+ std::string Name = "kernel_" + std::to_string(Kernel->id);
+ Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
+ Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
+ Value *GPUKernel = createCallGetKernel(KernelString, NameString);
+ createCallFreeKernel(GPUKernel);
}
/// Compute the DataLayout string for the NVPTX backend.
@@ -943,7 +998,7 @@ std::string GPUNodeBuilder::createKernel
return ASMStream.str();
}
-void GPUNodeBuilder::finalizeKernelFunction() {
+std::string GPUNodeBuilder::finalizeKernelFunction() {
// Verify module.
llvm::legacy::PassManager Passes;
Passes.add(createVerifierPass());
@@ -967,6 +1022,8 @@ void GPUNodeBuilder::finalizeKernelFunct
GPUModule.release();
KernelIDs.clear();
+
+ return Assembly;
}
namespace {
Modified: polly/trunk/test/GPGPU/double-parallel-loop.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/double-parallel-loop.ll?rev=276645&r1=276644&r2=276645&view=diff
==============================================================================
--- polly/trunk/test/GPGPU/double-parallel-loop.ll (original)
+++ polly/trunk/test/GPGPU/double-parallel-loop.ll Mon Jul 25 11:31:21 2016
@@ -96,6 +96,8 @@
; IR-NEXT: %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 4194304)
; IR-NEXT: [[HostPtr:%.*]] = bitcast [1024 x float]* %A to i8*
; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[HostPtr]], i8* %p_dev_array_MemRef_A, i64 4194304)
+; IR-NEXT: call i8* @polly_getKernel
+; IR-NEXT: call void @polly_freeKernel
; IR-NEXT: [[HostPtr2:%.*]] = bitcast [1024 x float]* %A to i8*
; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_A, i8* [[HostPtr2]], i64 4194304)
; IR-NEXT: call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef_A)
Modified: polly/trunk/test/GPGPU/host-control-flow.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/host-control-flow.ll?rev=276645&r1=276644&r2=276645&view=diff
==============================================================================
--- polly/trunk/test/GPGPU/host-control-flow.ll (original)
+++ polly/trunk/test/GPGPU/host-control-flow.ll Mon Jul 25 11:31:21 2016
@@ -30,6 +30,8 @@
; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader
; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
+; IR-NEXT: call i8* @polly_getKernel
+; IR-NEXT: call void @polly_freeKernel
; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1
; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 98
; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
Modified: polly/trunk/tools/GPURuntime/GPUJIT.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/tools/GPURuntime/GPUJIT.c?rev=276645&r1=276644&r2=276645&view=diff
==============================================================================
--- polly/trunk/tools/GPURuntime/GPUJIT.c (original)
+++ polly/trunk/tools/GPURuntime/GPUJIT.c Mon Jul 25 11:31:21 2016
@@ -17,6 +17,7 @@
#include <dlfcn.h>
#include <stdarg.h>
#include <stdio.h>
+#include <string.h>
static int DebugMode;
@@ -36,12 +37,9 @@ struct PollyGPUContextT {
CUcontext Cuda;
};
-struct PollyGPUModuleT {
- CUmodule Cuda;
-};
-
struct PollyGPUFunctionT {
CUfunction Cuda;
+ CUmodule CudaModule;
};
struct PollyGPUDevicePtrT {
@@ -101,6 +99,10 @@ typedef CUresult CUDAAPI CuModuleLoadDat
void **);
static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;
+typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *module,
+ const void *image);
+static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;
+
typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
const char *);
static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr;
@@ -111,6 +113,27 @@ static CuDeviceComputeCapabilityFcnTy *C
typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;
+typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState state,
+ CUjitInputType type, void *data,
+ size_t size, const char *name,
+ unsigned int numOptions,
+ CUjit_option *options,
+ void **optionValues);
+static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;
+
+typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int numOptions,
+ CUjit_option *options,
+ void **optionValues,
+ CUlinkState *stateOut);
+static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;
+
+typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState state, void **cubinOut,
+ size_t *sizeOut);
+static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;
+
+typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state);
+static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
+
/* Type-defines of function pointer to CUDA runtime APIs. */
typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
@@ -198,6 +221,9 @@ static int initialDeviceAPIs() {
CuModuleLoadDataExFcnPtr =
(CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx");
+ CuModuleLoadDataFcnPtr =
+ (CuModuleLoadDataFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadData");
+
CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle(
HandleCuda, "cuModuleGetFunction");
@@ -208,6 +234,18 @@ static int initialDeviceAPIs() {
CuDeviceGetNameFcnPtr =
(CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName");
+ CuLinkAddDataFcnPtr =
+ (CuLinkAddDataFcnTy *)getAPIHandle(HandleCuda, "cuLinkAddData");
+
+ CuLinkCreateFcnPtr =
+ (CuLinkCreateFcnTy *)getAPIHandle(HandleCuda, "cuLinkCreate");
+
+ CuLinkCompleteFcnPtr =
+ (CuLinkCompleteFcnTy *)getAPIHandle(HandleCuda, "cuLinkComplete");
+
+ CuLinkDestroyFcnPtr =
+ (CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy");
+
/* Get function pointer to CUDA Runtime APIs. */
CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle(
HandleCudaRT, "cudaThreadSynchronize");
@@ -262,38 +300,93 @@ PollyGPUContext *polly_initContext() {
return Context;
}
-void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module) {
+/* Build a GPU kernel from PTX text.
+ *
+ * PTXBuffer is read as a NUL-terminated string (its length is taken with
+ * strlen) and handed to the CUDA linker as PTX input. The linked image is
+ * loaded as a module and KernelName is looked up in it. Returns a malloc'ed
+ * PollyGPUFunction holding both the module and the function handle. Every
+ * failure prints a message to stdout and calls exit(-1).
+ */
+PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
+ const char *KernelName) {
dump_function();
- *Module = malloc(sizeof(PollyGPUModule));
- if (*Module == 0) {
- fprintf(stdout, "Allocate memory for Polly GPU module failed.\n");
+ PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
+
+ if (Function == 0) {
+ fprintf(stdout, "Allocate memory for Polly GPU function failed.\n");
exit(-1);
}
- if (CuModuleLoadDataExFcnPtr(&((*Module)->Cuda), PTXBuffer, 0, 0, 0) !=
- CUDA_SUCCESS) {
- fprintf(stdout, "Loading ptx assembly text failed.\n");
+ CUresult Res;
+ CUlinkState LState;
+ CUjit_option Options[6];
+ void *OptionVals[6];
+ float Walltime = 0;
+ unsigned long LogSize = 8192;
+ char ErrorLog[8192], InfoLog[8192];
+ void *CuOut;
+ size_t OutSize;
+
+ // Setup linker options
+ // Return walltime from JIT compilation
+ Options[0] = CU_JIT_WALL_TIME;
+ OptionVals[0] = (void *)&Walltime;
+ // Pass a buffer for info messages
+ Options[1] = CU_JIT_INFO_LOG_BUFFER;
+ OptionVals[1] = (void *)InfoLog;
+ // Pass the size of the info buffer
+ Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
+ OptionVals[2] = (void *)LogSize;
+ // Pass a buffer for error message
+ Options[3] = CU_JIT_ERROR_LOG_BUFFER;
+ OptionVals[3] = (void *)ErrorLog;
+ // Pass the size of the error buffer
+ Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
+ OptionVals[4] = (void *)LogSize;
+ // Make the linker verbose
+ Options[5] = CU_JIT_LOG_VERBOSE;
+ OptionVals[5] = (void *)1;
+
+ // Clear both log buffers: they are printed with %s on the failure paths
+ // below and must hold a valid (possibly empty) string by then.
+ memset(ErrorLog, 0, sizeof(ErrorLog));
+ memset(InfoLog, 0, sizeof(InfoLog));
+
+ // Do not continue with an unusable link state if creation fails.
+ Res = CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
+ if (Res != CUDA_SUCCESS) {
+ fprintf(stdout, "Creating the PTX linker failed.\n%s\n", ErrorLog);
+ exit(-1);
+ }
+
+ Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)PTXBuffer,
+ strlen(PTXBuffer) + 1, 0, 0, 0, 0);
+ if (Res != CUDA_SUCCESS) {
+ fprintf(stdout, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
exit(-1);
}
-}
-void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module,
- PollyGPUFunction **Kernel) {
- dump_function();
+ Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
+ if (Res != CUDA_SUCCESS) {
+ fprintf(stdout, "Complete ptx linker step failed.\n");
+ fprintf(stdout, "\n%s\n", ErrorLog);
+ exit(-1);
+ }
- *Kernel = malloc(sizeof(PollyGPUFunction));
- if (*Kernel == 0) {
- fprintf(stdout, "Allocate memory for Polly GPU kernel failed.\n");
+ debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
+ InfoLog);
+
+ Res = CuModuleLoadDataFcnPtr(&(Function->CudaModule), CuOut);
+ if (Res != CUDA_SUCCESS) {
+ fprintf(stdout, "Loading ptx assembly text failed.\n");
exit(-1);
}
- /* Locate the kernel entry point. */
- if (CuModuleGetFunctionFcnPtr(&((*Kernel)->Cuda), Module->Cuda, KernelName) !=
- CUDA_SUCCESS) {
+ Res = CuModuleGetFunctionFcnPtr(&(Function->Cuda), Function->CudaModule,
+ KernelName);
+ if (Res != CUDA_SUCCESS) {
fprintf(stdout, "Loading kernel function failed.\n");
exit(-1);
}
+
+ CuLinkDestroyFcnPtr(LState);
+
+ return Function;
+}
+
+/* Release a kernel handle: unload its CUDA module, if one is set, and free
+ * the handle itself. A NULL Kernel is accepted and ignored.
+ */
+void polly_freeKernel(PollyGPUFunction *Kernel) {
+  dump_function();
+
+  /* Check the handle before reading any of its fields. */
+  if (!Kernel)
+    return;
+
+  if (Kernel->CudaModule)
+    CuModuleUnloadFcnPtr(Kernel->CudaModule);
+
+  free(Kernel);
}
void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
Modified: polly/trunk/tools/GPURuntime/GPUJIT.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/tools/GPURuntime/GPUJIT.h?rev=276645&r1=276644&r2=276645&view=diff
==============================================================================
--- polly/trunk/tools/GPURuntime/GPUJIT.h (original)
+++ polly/trunk/tools/GPURuntime/GPUJIT.h Mon Jul 25 11:31:21 2016
@@ -44,7 +44,6 @@
* const char *Entry = "_Z8myKernelPi";
*
* int main() {
- * PollyGPUModule *Module;
* PollyGPUFunction *Kernel;
* PollyGPUContext *Context;
* PollyGPUDevicePtr *DevArray;
@@ -58,11 +57,11 @@
* MemSize = 256*64*sizeof(int);
* Context = polly_initContext();
* DevArray = polly_allocateMemoryForDevice(MemSize);
- * polly_getPTXModule(KernelString, &Module);
- * polly_getPTXKernelEntry(Entry, Module, &Kernel);
+ * Kernel = polly_getKernel(KernelString, Entry);
* polly_setKernelParameters(Kernel, BlockWidth, BlockHeight, DevData);
* polly_launchKernel(Kernel, GridWidth, GridHeight);
* polly_copyFromDeviceToHost(HostData, DevData, MemSize);
+ * polly_freeKernel(Kernel);
* polly_freeDeviceMemory(DevArray);
* polly_freeContext(Context);
* }
@@ -70,14 +69,13 @@
*/
typedef struct PollyGPUContextT PollyGPUContext;
-typedef struct PollyGPUModuleT PollyGPUModule;
typedef struct PollyGPUFunctionT PollyGPUFunction;
typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr;
PollyGPUContext *polly_initContext();
-void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module);
-void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module,
- PollyGPUFunction **Kernel);
+PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
+ const char *KernelName);
+void polly_freeKernel(PollyGPUFunction *Kernel);
void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
long MemSize);
void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
More information about the llvm-commits
mailing list