[polly] r302379 - [Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen

Siddharth Bhat via llvm-commits llvm-commits at lists.llvm.org
Sun May 7 14:03:46 PDT 2017


Author: bollu
Date: Sun May  7 16:03:46 2017
New Revision: 302379

URL: http://llvm.org/viewvc/llvm-project?rev=302379&view=rev
Log:
[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen

Summary:
When compiling for GPU, one can now choose to compile for OpenCL or CUDA,
with the corresponding polly-gpu-runtime flag (libopencl / libcudart). The
GPURuntime library (GPUJIT) has been extended with the OpenCL Runtime library
for that purpose, correctly choosing the corresponding library calls to the
option chosen when compiling (via different initialization calls).

Additionally, a specific GPU Target architecture can now be chosen with -polly-gpu-arch (only nvptx64 implemented thus far).

Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay

Reviewed By: grosser, Meinersbur

Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia

Tags: #polly

Differential Revision: https://reviews.llvm.org/D32431

Added:
    polly/trunk/include/polly/CodeGen/PPCGCodeGeneration.h
Modified:
    polly/trunk/CMakeLists.txt
    polly/trunk/include/polly/LinkAllPasses.h
    polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
    polly/trunk/lib/Support/RegisterPasses.cpp
    polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll
    polly/trunk/test/GPGPU/size-cast.ll
    polly/trunk/tools/CMakeLists.txt
    polly/trunk/tools/GPURuntime/GPUJIT.c
    polly/trunk/tools/GPURuntime/GPUJIT.h

Modified: polly/trunk/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/CMakeLists.txt?rev=302379&r1=302378&r2=302379&view=diff
==============================================================================
--- polly/trunk/CMakeLists.txt (original)
+++ polly/trunk/CMakeLists.txt Sun May  7 16:03:46 2017
@@ -152,9 +152,10 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TR
 
 option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature" OFF)
 if (POLLY_ENABLE_GPGPU_CODEGEN)
-  # Do not require CUDA, as GPU code generation test cases can be run without
-  # a cuda library.
+  # Do not require CUDA/OpenCL, as GPU code generation test cases can be run
+  # without a CUDA/OpenCL library.
   FIND_PACKAGE(CUDA)
+  FIND_PACKAGE(OpenCL)
   set(GPU_CODEGEN TRUE)
 else(POLLY_ENABLE_GPGPU_CODEGEN)
   set(GPU_CODEGEN FALSE)
@@ -163,8 +164,13 @@ endif(POLLY_ENABLE_GPGPU_CODEGEN)
 
 # Support GPGPU code generation if the library is available.
 if (CUDALIB_FOUND)
+  add_definitions(-DHAS_LIBCUDART)
   INCLUDE_DIRECTORIES( ${CUDALIB_INCLUDE_DIR} )
 endif(CUDALIB_FOUND)
+if (OpenCL_FOUND)
+  add_definitions(-DHAS_LIBOPENCL)
+  INCLUDE_DIRECTORIES( ${OpenCL_INCLUDE_DIR} )
+endif(OpenCL_FOUND)
 
 option(POLLY_BUNDLED_ISL "Use the bundled version of libisl included in Polly" ON)
 if (NOT POLLY_BUNDLED_ISL)

Added: polly/trunk/include/polly/CodeGen/PPCGCodeGeneration.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/CodeGen/PPCGCodeGeneration.h?rev=302379&view=auto
==============================================================================
--- polly/trunk/include/polly/CodeGen/PPCGCodeGeneration.h (added)
+++ polly/trunk/include/polly/CodeGen/PPCGCodeGeneration.h Sun May  7 16:03:46 2017
@@ -0,0 +1,24 @@
+//===--- polly/PPCGCodeGeneration.h - Polly Accelerator Code Generation. --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Take a scop created by ScopInfo and map it to GPU code using the ppcg
+// GPU mapping strategy.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POLLY_PPCGCODEGENERATION_H
+#define POLLY_PPCGCODEGENERATION_H
+
+/// The GPU Architecture to target.
+enum GPUArch { NVPTX64 };
+
+/// The GPU Runtime implementation to use.
+enum GPURuntime { CUDA, OpenCL };
+
+#endif // POLLY_PPCGCODEGENERATION_H

Modified: polly/trunk/include/polly/LinkAllPasses.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/LinkAllPasses.h?rev=302379&r1=302378&r2=302379&view=diff
==============================================================================
--- polly/trunk/include/polly/LinkAllPasses.h (original)
+++ polly/trunk/include/polly/LinkAllPasses.h Sun May  7 16:03:46 2017
@@ -15,6 +15,7 @@
 #ifndef POLLY_LINKALLPASSES_H
 #define POLLY_LINKALLPASSES_H
 
+#include "polly/CodeGen/PPCGCodeGeneration.h"
 #include "polly/Config/config.h"
 #include "polly/PruneUnprofitable.h"
 #include "polly/Simplify.h"
@@ -48,7 +49,8 @@ llvm::Pass *createScopInfoWrapperPassPas
 llvm::Pass *createIslAstInfoPass();
 llvm::Pass *createCodeGenerationPass();
 #ifdef GPU_CODEGEN
-llvm::Pass *createPPCGCodeGenerationPass();
+llvm::Pass *createPPCGCodeGenerationPass(GPUArch Arch = GPUArch::NVPTX64,
+                                         GPURuntime Runtime = GPURuntime::CUDA);
 #endif
 llvm::Pass *createIslScheduleOptimizerPass();
 llvm::Pass *createFlattenSchedulePass();

Modified: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp?rev=302379&r1=302378&r2=302379&view=diff
==============================================================================
--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp (original)
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp Sun May  7 16:03:46 2017
@@ -12,6 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "polly/CodeGen/PPCGCodeGeneration.h"
 #include "polly/CodeGen/IslAst.h"
 #include "polly/CodeGen/IslNodeBuilder.h"
 #include "polly/CodeGen/Utils.h"
@@ -153,9 +154,9 @@ public:
   GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator,
                  const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
                  DominatorTree &DT, Scop &S, BasicBlock *StartBlock,
-                 gpu_prog *Prog)
+                 gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch)
       : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock),
-        Prog(Prog) {
+        Prog(Prog), Runtime(Runtime), Arch(Arch) {
     getExprBuilder().setIDToSAI(&IDToSAI);
   }
 
@@ -201,6 +202,12 @@ private:
   /// The GPU program we generate code for.
   gpu_prog *Prog;
 
+  /// The GPU Runtime implementation to use (OpenCL or CUDA).
+  GPURuntime Runtime;
+
+  /// The GPU Architecture to target.
+  GPUArch Arch;
+
   /// Class to free isl_ids.
   class IslIdDeleter {
   public:
@@ -752,7 +759,17 @@ void GPUNodeBuilder::createCallSynchroni
 }
 
 Value *GPUNodeBuilder::createCallInitContext() {
-  const char *Name = "polly_initContext";
+  const char *Name;
+
+  switch (Runtime) {
+  case GPURuntime::CUDA:
+    Name = "polly_initContextCUDA";
+    break;
+  case GPURuntime::OpenCL:
+    Name = "polly_initContextCL";
+    break;
+  }
+
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Function *F = M->getFunction(Name);
 
@@ -1028,7 +1045,15 @@ void GPUNodeBuilder::createScopStmt(isl_
 
 void GPUNodeBuilder::createKernelSync() {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
-  auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
+
+  Function *Sync;
+
+  switch (Arch) {
+  case GPUArch::NVPTX64:
+    Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
+    break;
+  }
+
   Builder.CreateCall(Sync, {});
 }
 
@@ -1434,7 +1459,12 @@ GPUNodeBuilder::createKernelFunctionDecl
   auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
   auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                               GPUModule.get());
-  FN->setCallingConv(CallingConv::PTX_Kernel);
+
+  switch (Arch) {
+  case GPUArch::NVPTX64:
+    FN->setCallingConv(CallingConv::PTX_Kernel);
+    break;
+  }
 
   auto Arg = FN->arg_begin();
   for (long i = 0; i < Kernel->n_array; i++) {
@@ -1495,12 +1525,19 @@ GPUNodeBuilder::createKernelFunctionDecl
 }
 
 void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
-  Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
-                                   Intrinsic::nvvm_read_ptx_sreg_ctaid_y};
+  Intrinsic::ID IntrinsicsBID[2];
+  Intrinsic::ID IntrinsicsTID[3];
 
-  Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x,
-                                   Intrinsic::nvvm_read_ptx_sreg_tid_y,
-                                   Intrinsic::nvvm_read_ptx_sreg_tid_z};
+  switch (Arch) {
+  case GPUArch::NVPTX64:
+    IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x;
+    IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y;
+
+    IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x;
+    IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y;
+    IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z;
+    break;
+  }
 
   auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable {
     std::string Name = isl_id_get_name(Id);
@@ -1649,11 +1686,18 @@ void GPUNodeBuilder::createKernelVariabl
 
 void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
                                           SetVector<Value *> &SubtreeValues) {
-
   std::string Identifier = "kernel_" + std::to_string(Kernel->id);
   GPUModule.reset(new Module(Identifier, Builder.getContext()));
-  GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
-  GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
+
+  switch (Arch) {
+  case GPUArch::NVPTX64:
+    if (Runtime == GPURuntime::CUDA)
+      GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
+    else if (Runtime == GPURuntime::OpenCL)
+      GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
+    GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
+    break;
+  }
 
   Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);
 
@@ -1674,7 +1718,21 @@ void GPUNodeBuilder::createKernelFunctio
 }
 
 std::string GPUNodeBuilder::createKernelASM() {
-  llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda"));
+  llvm::Triple GPUTriple;
+
+  switch (Arch) {
+  case GPUArch::NVPTX64:
+    switch (Runtime) {
+    case GPURuntime::CUDA:
+      GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda"));
+      break;
+    case GPURuntime::OpenCL:
+      GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl"));
+      break;
+    }
+    break;
+  }
+
   std::string ErrMsg;
   auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);
 
@@ -1685,9 +1743,17 @@ std::string GPUNodeBuilder::createKernel
 
   TargetOptions Options;
   Options.UnsafeFPMath = FastMath;
-  std::unique_ptr<TargetMachine> TargetM(
-      GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "",
-                                     Options, Optional<Reloc::Model>()));
+
+  std::string subtarget;
+
+  switch (Arch) {
+  case GPUArch::NVPTX64:
+    subtarget = CudaVersion;
+    break;
+  }
+
+  std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
+      GPUTriple.getTriple(), subtarget, "", Options, Optional<Reloc::Model>()));
 
   SmallString<0> ASMString;
   raw_svector_ostream ASMStream(ASMString);
@@ -1739,6 +1805,10 @@ class PPCGCodeGeneration : public ScopPa
 public:
   static char ID;
 
+  GPURuntime Runtime = GPURuntime::CUDA;
+
+  GPUArch Architecture = GPUArch::NVPTX64;
+
   /// The scop that is currently processed.
   Scop *S;
 
@@ -2522,7 +2592,7 @@ public:
         executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI);
 
     GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S,
-                               StartBlock, Prog);
+                               StartBlock, Prog, Runtime, Architecture);
 
     // TODO: Handle LICM
     auto SplitBlock = StartBlock->getSinglePredecessor();
@@ -2610,7 +2680,12 @@ public:
 
 char PPCGCodeGeneration::ID = 1;
 
-Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); }
+Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) {
+  PPCGCodeGeneration *generator = new PPCGCodeGeneration();
+  generator->Runtime = Runtime;
+  generator->Architecture = Arch;
+  return generator;
+}
 
 INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
                       "Polly - Apply PPCG translation to SCOP", false, false)

Modified: polly/trunk/lib/Support/RegisterPasses.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/Support/RegisterPasses.cpp?rev=302379&r1=302378&r2=302379&view=diff
==============================================================================
--- polly/trunk/lib/Support/RegisterPasses.cpp (original)
+++ polly/trunk/lib/Support/RegisterPasses.cpp Sun May  7 16:03:46 2017
@@ -23,6 +23,7 @@
 #include "polly/Canonicalization.h"
 #include "polly/CodeGen/CodeGeneration.h"
 #include "polly/CodeGen/CodegenCleanup.h"
+#include "polly/CodeGen/PPCGCodeGeneration.h"
 #include "polly/DeLICM.h"
 #include "polly/DependenceInfo.h"
 #include "polly/FlattenSchedule.h"
@@ -101,6 +102,23 @@ static cl::opt<TargetChoice>
                           ),
            cl::init(TARGET_CPU), cl::ZeroOrMore, cl::cat(PollyCategory));
 
+#ifdef GPU_CODEGEN
+static cl::opt<GPURuntime> GPURuntimeChoice(
+    "polly-gpu-runtime", cl::desc("The GPU Runtime API to target"),
+    cl::values(clEnumValN(GPURuntime::CUDA, "libcudart",
+                          "use the CUDA Runtime API"),
+               clEnumValN(GPURuntime::OpenCL, "libopencl",
+                          "use the OpenCL Runtime API")),
+    cl::init(GPURuntime::CUDA), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<GPUArch>
+    GPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"),
+                  cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64",
+                                        "target NVIDIA 64-bit architecture")),
+                  cl::init(GPUArch::NVPTX64), cl::ZeroOrMore,
+                  cl::cat(PollyCategory));
+#endif
+
 VectorizerChoice polly::PollyVectorizerChoice;
 static cl::opt<polly::VectorizerChoice, true> Vectorizer(
     "polly-vectorizer", cl::desc("Select the vectorization strategy"),
@@ -309,7 +327,8 @@ void registerPollyPasses(llvm::legacy::P
 
   if (Target == TARGET_GPU) {
 #ifdef GPU_CODEGEN
-    PM.add(polly::createPPCGCodeGenerationPass());
+    PM.add(
+        polly::createPPCGCodeGenerationPass(GPUArchChoice, GPURuntimeChoice));
 #endif
   } else {
     switch (CodeGeneration) {

Modified: polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll?rev=302379&r1=302378&r2=302379&view=diff
==============================================================================
--- polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll (original)
+++ polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll Sun May  7 16:03:46 2017
@@ -35,7 +35,7 @@
 ; CHECK-NOT: polly_freeDeviceMemory
 ; CHECK-NOT: polly_allocateMemoryForDevice
 
-; CHECK:       %13 = call i8* @polly_initContext()
+; CHECK:       %13 = call i8* @polly_initContextCUDA()
 ; CHECK-NEXT:  %14 = bitcast i32* %A to i8*
 ; CHECK-NEXT:  %15 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
 ; CHECK-NEXT:  store i8* %14, i8** %polly_launch_0_param_0
@@ -46,7 +46,7 @@
 ; CHECK-NEXT:  store i8* %17, i8** %polly_launch_0_param_1
 ; CHECK-NEXT:  %19 = bitcast i8** %polly_launch_0_param_1 to i8*
 ; CHECK-NEXT:  store i8* %19, i8** %18
-; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))
+; CHECK-NEXT:  %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))
 ; CHECK-NEXT:  call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
 ; CHECK-NEXT:  call void @polly_freeKernel(i8* %20)
 ; CHECK-NEXT:  call void @polly_synchronizeDevice()

Modified: polly/trunk/test/GPGPU/size-cast.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/size-cast.ll?rev=302379&r1=302378&r2=302379&view=diff
==============================================================================
--- polly/trunk/test/GPGPU/size-cast.ll (original)
+++ polly/trunk/test/GPGPU/size-cast.ll Sun May  7 16:03:46 2017
@@ -29,7 +29,7 @@
 ; CODE-NEXT:   if (arg >= 32 * b0 + t0 + 1048576 * c0 + 1)
 ; CODE-NEXT:     Stmt_bb6(0, 32 * b0 + t0 + 1048576 * c0);
 
-; IR:        call i8* @polly_initContext()
+; IR:        call i8* @polly_initContextCUDA()
 ; IR-NEXT:   sext i32 %arg to i64
 ; IR-NEXT:   mul i64
 ; IR-NEXT:   @polly_allocateMemoryForDevice

Modified: polly/trunk/tools/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/tools/CMakeLists.txt?rev=302379&r1=302378&r2=302379&view=diff
==============================================================================
--- polly/trunk/tools/CMakeLists.txt (original)
+++ polly/trunk/tools/CMakeLists.txt Sun May  7 16:03:46 2017
@@ -1,5 +1,5 @@
-if (CUDALIB_FOUND)
+if (CUDALIB_FOUND OR OpenCL_FOUND)
   add_subdirectory(GPURuntime)
-endif (CUDALIB_FOUND)
+endif (CUDALIB_FOUND OR OpenCL_FOUND)
 
 set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} PARENT_SCOPE)

Modified: polly/trunk/tools/GPURuntime/GPUJIT.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/tools/GPURuntime/GPUJIT.c?rev=302379&r1=302378&r2=302379&view=diff
==============================================================================
--- polly/trunk/tools/GPURuntime/GPUJIT.c (original)
+++ polly/trunk/tools/GPURuntime/GPUJIT.c Sun May  7 16:03:46 2017
@@ -12,8 +12,20 @@
 /******************************************************************************/
 
 #include "GPUJIT.h"
+
+#ifdef HAS_LIBCUDART
 #include <cuda.h>
 #include <cuda_runtime.h>
+#endif /* HAS_LIBCUDART */
+
+#ifdef HAS_LIBOPENCL
+#ifdef __APPLE__
+#include <OpenCL/opencl.h>
+#else
+#include <CL/cl.h>
+#endif
+#endif /* HAS_LIBOPENCL */
+
 #include <dlfcn.h>
 #include <stdarg.h>
 #include <stdio.h>
@@ -22,6 +34,8 @@
 static int DebugMode;
 static int CacheMode;
 
+static PollyGPURuntime Runtime = RUNTIME_NONE;
+
 static void debug_print(const char *format, ...) {
   if (!DebugMode)
     return;
@@ -33,18 +47,853 @@ static void debug_print(const char *form
 }
 #define dump_function() debug_print("-> %s\n", __func__)
 
-/* Define Polly's GPGPU data types. */
+#define KERNEL_CACHE_SIZE 10
+
+static void err_runtime() {
+  fprintf(stderr, "Runtime not correctly initialized.\n");
+  exit(-1);
+}
+
 struct PollyGPUContextT {
-  CUcontext Cuda;
+  void *Context;
 };
 
 struct PollyGPUFunctionT {
+  void *Kernel;
+};
+
+struct PollyGPUDevicePtrT {
+  void *DevicePtr;
+};
+
+/******************************************************************************/
+/*                                  OpenCL                                    */
+/******************************************************************************/
+#ifdef HAS_LIBOPENCL
+
+struct OpenCLContextT {
+  cl_context Context;
+  cl_command_queue CommandQueue;
+};
+
+struct OpenCLKernelT {
+  cl_kernel Kernel;
+  cl_program Program;
+  const char *BinaryString;
+};
+
+struct OpenCLDevicePtrT {
+  cl_mem MemObj;
+};
+
+/* Dynamic library handles for the OpenCL runtime library. */
+static void *HandleOpenCL;
+
+/* Type-defines of function pointer to OpenCL Runtime API. */
+typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries,
+                                     cl_platform_id *Platforms,
+                                     cl_uint *NumPlatforms);
+static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr;
+
+typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id Platform,
+                                   cl_device_type DeviceType,
+                                   cl_uint NumEntries, cl_device_id *Devices,
+                                   cl_uint *NumDevices);
+static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr;
+
+typedef cl_int clGetDeviceInfoFcnTy(cl_device_id Device,
+                                    cl_device_info ParamName,
+                                    size_t ParamValueSize, void *ParamValue,
+                                    size_t *ParamValueSizeRet);
+static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr;
+
+typedef cl_int clGetKernelInfoFcnTy(cl_kernel Kernel, cl_kernel_info ParamName,
+                                    size_t ParamValueSize, void *ParamValue,
+                                    size_t *ParamValueSizeRet);
+static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr;
+
+typedef cl_context clCreateContextFcnTy(
+    const cl_context_properties *Properties, cl_uint NumDevices,
+    const cl_device_id *Devices,
+    void CL_CALLBACK *pfn_notify(const char *Errinfo, const void *PrivateInfo,
+                                 size_t CB, void *UserData),
+    void *UserData, cl_int *ErrcodeRet);
+static clCreateContextFcnTy *clCreateContextFcnPtr;
+
+typedef cl_command_queue
+clCreateCommandQueueFcnTy(cl_context Context, cl_device_id Device,
+                          cl_command_queue_properties Properties,
+                          cl_int *ErrcodeRet);
+static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr;
+
+typedef cl_mem clCreateBufferFcnTy(cl_context Context, cl_mem_flags Flags,
+                                   size_t Size, void *HostPtr,
+                                   cl_int *ErrcodeRet);
+static clCreateBufferFcnTy *clCreateBufferFcnPtr;
+
+typedef cl_int
+clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer,
+                          cl_bool BlockingWrite, size_t Offset, size_t Size,
+                          const void *Ptr, cl_uint NumEventsInWaitList,
+                          const cl_event *EventWaitList, cl_event *Event);
+static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr;
+
+typedef cl_program clCreateProgramWithBinaryFcnTy(
+    cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList,
+    const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus,
+    cl_int *ErrcodeRet);
+static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr;
+
+typedef cl_int clBuildProgramFcnTy(
+    cl_program Program, cl_uint NumDevices, const cl_device_id *DeviceList,
+    const char *Options,
+    void(CL_CALLBACK *pfn_notify)(cl_program Program, void *UserData),
+    void *UserData);
+static clBuildProgramFcnTy *clBuildProgramFcnPtr;
+
+typedef cl_kernel clCreateKernelFcnTy(cl_program Program,
+                                      const char *KernelName,
+                                      cl_int *ErrcodeRet);
+static clCreateKernelFcnTy *clCreateKernelFcnPtr;
+
+typedef cl_int clSetKernelArgFcnTy(cl_kernel Kernel, cl_uint ArgIndex,
+                                   size_t ArgSize, const void *ArgValue);
+static clSetKernelArgFcnTy *clSetKernelArgFcnPtr;
+
+typedef cl_int clEnqueueNDRangeKernelFcnTy(
+    cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint WorkDim,
+    const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
+    const size_t *LocalWorkSize, cl_uint NumEventsInWaitList,
+    const cl_event *EventWaitList, cl_event *Event);
+static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr;
+
+typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue,
+                                        cl_mem Buffer, cl_bool BlockingRead,
+                                        size_t Offset, size_t Size, void *Ptr,
+                                        cl_uint NumEventsInWaitList,
+                                        const cl_event *EventWaitList,
+                                        cl_event *Event);
+static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr;
+
+typedef cl_int clFlushFcnTy(cl_command_queue CommandQueue);
+static clFlushFcnTy *clFlushFcnPtr;
+
+typedef cl_int clFinishFcnTy(cl_command_queue CommandQueue);
+static clFinishFcnTy *clFinishFcnPtr;
+
+typedef cl_int clReleaseKernelFcnTy(cl_kernel Kernel);
+static clReleaseKernelFcnTy *clReleaseKernelFcnPtr;
+
+typedef cl_int clReleaseProgramFcnTy(cl_program Program);
+static clReleaseProgramFcnTy *clReleaseProgramFcnPtr;
+
+typedef cl_int clReleaseMemObjectFcnTy(cl_mem Memobject);
+static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr;
+
+typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue);
+static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr;
+
+typedef cl_int clReleaseContextFcnTy(cl_context Context);
+static clReleaseContextFcnTy *clReleaseContextFcnPtr;
+
+static void *getAPIHandleCL(void *Handle, const char *FuncName) {
+  char *Err;
+  void *FuncPtr;
+  dlerror();
+  FuncPtr = dlsym(Handle, FuncName);
+  if ((Err = dlerror()) != 0) {
+    fprintf(stderr, "Load OpenCL Runtime API failed: %s. \n", Err);
+    return 0;
+  }
+  return FuncPtr;
+}
+
+static int initialDeviceAPILibrariesCL() {
+  HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY);
+  if (!HandleOpenCL) {
+    fprintf(stderr, "Cannot open library: %s. \n", dlerror());
+    return 0;
+  }
+  return 1;
+}
+
+static int initialDeviceAPIsCL() {
+  if (initialDeviceAPILibrariesCL() == 0)
+    return 0;
+
+  /* Get function pointer to OpenCL Runtime API.
+   *
+   * Note that compilers conforming to the ISO C standard are required to
+   * generate a warning if a conversion from a void * pointer to a function
+   * pointer is attempted as in the following statements. The warning
+   * of this kind of cast may not be emitted by clang and new versions of gcc
+   * as it is valid on POSIX 2008.
+   */
+  clGetPlatformIDsFcnPtr =
+      (clGetPlatformIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetPlatformIDs");
+
+  clGetDeviceIDsFcnPtr =
+      (clGetDeviceIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceIDs");
+
+  clGetDeviceInfoFcnPtr =
+      (clGetDeviceInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceInfo");
+
+  clGetKernelInfoFcnPtr =
+      (clGetKernelInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetKernelInfo");
+
+  clCreateContextFcnPtr =
+      (clCreateContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateContext");
+
+  clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL(
+      HandleOpenCL, "clCreateCommandQueue");
+
+  clCreateBufferFcnPtr =
+      (clCreateBufferFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateBuffer");
+
+  clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL(
+      HandleOpenCL, "clEnqueueWriteBuffer");
+
+  clCreateProgramWithBinaryFcnPtr =
+      (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL(
+          HandleOpenCL, "clCreateProgramWithBinary");
+
+  clBuildProgramFcnPtr =
+      (clBuildProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clBuildProgram");
+
+  clCreateKernelFcnPtr =
+      (clCreateKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateKernel");
+
+  clSetKernelArgFcnPtr =
+      (clSetKernelArgFcnTy *)getAPIHandleCL(HandleOpenCL, "clSetKernelArg");
+
+  clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL(
+      HandleOpenCL, "clEnqueueNDRangeKernel");
+
+  clEnqueueReadBufferFcnPtr = (clEnqueueReadBufferFcnTy *)getAPIHandleCL(
+      HandleOpenCL, "clEnqueueReadBuffer");
+
+  clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(HandleOpenCL, "clFlush");
+
+  clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(HandleOpenCL, "clFinish");
+
+  clReleaseKernelFcnPtr =
+      (clReleaseKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseKernel");
+
+  clReleaseProgramFcnPtr =
+      (clReleaseProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseProgram");
+
+  clReleaseMemObjectFcnPtr = (clReleaseMemObjectFcnTy *)getAPIHandleCL(
+      HandleOpenCL, "clReleaseMemObject");
+
+  clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL(
+      HandleOpenCL, "clReleaseCommandQueue");
+
+  clReleaseContextFcnPtr =
+      (clReleaseContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseContext");
+
+  return 1;
+}
+
+/* Context and Device. */
+static PollyGPUContext *GlobalContext = NULL;
+static cl_device_id GlobalDeviceID = NULL;
+
+/* Fd-Decl: Print out OpenCL Error codes to human readable strings. */
+static void printOpenCLError(int Error);
+
+static void checkOpenCLError(int Ret, const char *format, ...) {
+  if (Ret == CL_SUCCESS)
+    return;
+
+  printOpenCLError(Ret);
+  va_list args;
+  va_start(args, format);
+  vfprintf(stderr, format, args);
+  va_end(args);
+  exit(-1);
+}
+
+static PollyGPUContext *initContextCL() {
+  dump_function();
+
+  PollyGPUContext *Context;
+
+  cl_platform_id PlatformID = NULL;
+  cl_device_id DeviceID = NULL;
+  cl_uint NumDevicesRet;
+  cl_int Ret;
+
+  char DeviceRevision[256];
+  char DeviceName[256];
+  size_t DeviceRevisionRetSize, DeviceNameRetSize;
+
+  static __thread PollyGPUContext *CurrentContext = NULL;
+
+  if (CurrentContext)
+    return CurrentContext;
+
+  /* Get API handles. */
+  if (initialDeviceAPIsCL() == 0) {
+    fprintf(stderr, "Getting the \"handle\" for the OpenCL Runtime failed.\n");
+    exit(-1);
+  }
+
+  /* Get number of devices that support OpenCL. */
+  static const int NumberOfPlatforms = 1;
+  Ret = clGetPlatformIDsFcnPtr(NumberOfPlatforms, &PlatformID, NULL);
+  checkOpenCLError(Ret, "Failed to get platform IDs.\n");
+  // TODO: Extend to CL_DEVICE_TYPE_ALL?
+  static const int NumberOfDevices = 1;
+  Ret = clGetDeviceIDsFcnPtr(PlatformID, CL_DEVICE_TYPE_GPU, NumberOfDevices,
+                             &DeviceID, &NumDevicesRet);
+  checkOpenCLError(Ret, "Failed to get device IDs.\n");
+
+  GlobalDeviceID = DeviceID;
+  if (NumDevicesRet == 0) {
+    fprintf(stderr, "There is no device supporting OpenCL.\n");
+    exit(-1);
+  }
+
+  /* Get device revision. */
+  Ret =
+      clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_VERSION, sizeof(DeviceRevision),
+                            DeviceRevision, &DeviceRevisionRetSize);
+  checkOpenCLError(Ret, "Failed to fetch device revision.\n");
+
+  /* Get device name. */
+  Ret = clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_NAME, sizeof(DeviceName),
+                              DeviceName, &DeviceNameRetSize);
+  checkOpenCLError(Ret, "Failed to fetch device name.\n");
+
+  debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);
+
+  /* Create context on the device. */
+  Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
+  if (Context == 0) {
+    fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
+    exit(-1);
+  }
+  Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext));
+  if (Context->Context == 0) {
+    fprintf(stderr, "Allocate memory for Polly OpenCL context failed.\n");
+    exit(-1);
+  }
+  ((OpenCLContext *)Context->Context)->Context =
+      clCreateContextFcnPtr(NULL, NumDevicesRet, &DeviceID, NULL, NULL, &Ret);
+  checkOpenCLError(Ret, "Failed to create context.\n");
+
+  static const int ExtraProperties = 0;
+  ((OpenCLContext *)Context->Context)->CommandQueue =
+      clCreateCommandQueueFcnPtr(((OpenCLContext *)Context->Context)->Context,
+                                 DeviceID, ExtraProperties, &Ret);
+  checkOpenCLError(Ret, "Failed to create command queue.\n");
+
+  if (CacheMode)
+    CurrentContext = Context;
+
+  GlobalContext = Context;
+  return Context;
+}
+
+static void freeKernelCL(PollyGPUFunction *Kernel) {
+  dump_function();
+
+  if (CacheMode)
+    return;
+
+  if (!GlobalContext) {
+    fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
+    exit(-1);
+  }
+
+  cl_int Ret;
+  Ret = clFlushFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
+  checkOpenCLError(Ret, "Failed to flush command queue.\n");
+  Ret = clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue);
+  checkOpenCLError(Ret, "Failed to finish command queue.\n");
+
+  if (((OpenCLKernel *)Kernel->Kernel)->Kernel) {
+    cl_int Ret =
+        clReleaseKernelFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Kernel);
+    checkOpenCLError(Ret, "Failed to release kernel.\n");
+  }
+
+  if (((OpenCLKernel *)Kernel->Kernel)->Program) {
+    cl_int Ret =
+        clReleaseProgramFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Program);
+    checkOpenCLError(Ret, "Failed to release program.\n");
+  }
+
+  if (Kernel->Kernel)
+    free((OpenCLKernel *)Kernel->Kernel);
+
+  if (Kernel)
+    free(Kernel);
+}
+
+static PollyGPUFunction *getKernelCL(const char *BinaryBuffer,
+                                     const char *KernelName) {
+  dump_function();
+
+  if (!GlobalContext) {
+    fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
+    exit(-1);
+  }
+
+  static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
+  static __thread int NextCacheItem = 0;
+
+  for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
+    // We exploit here the property that all Polly-ACC kernels are allocated
+    // as global constants, hence a pointer comparision is sufficient to
+    // determin equality.
+    if (KernelCache[i] &&
+        ((OpenCLKernel *)KernelCache[i]->Kernel)->BinaryString ==
+            BinaryBuffer) {
+      debug_print("  -> using cached kernel\n");
+      return KernelCache[i];
+    }
+  }
+
+  PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
+  if (Function == 0) {
+    fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
+    exit(-1);
+  }
+  Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel));
+  if (Function->Kernel == 0) {
+    fprintf(stderr, "Allocate memory for Polly OpenCL kernel failed.\n");
+    exit(-1);
+  }
+
+  if (!GlobalDeviceID) {
+    fprintf(stderr, "GPGPU-code generation not initialized correctly.\n");
+    exit(-1);
+  }
+
+  cl_int Ret;
+  size_t BinarySize = strlen(BinaryBuffer);
+  ((OpenCLKernel *)Function->Kernel)->Program = clCreateProgramWithBinaryFcnPtr(
+      ((OpenCLContext *)GlobalContext->Context)->Context, 1, &GlobalDeviceID,
+      (const size_t *)&BinarySize, (const unsigned char **)&BinaryBuffer, NULL,
+      &Ret);
+  checkOpenCLError(Ret, "Failed to create program from binary.\n");
+
+  Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1,
+                             &GlobalDeviceID, NULL, NULL, NULL);
+  checkOpenCLError(Ret, "Failed to build program.\n");
+
+  ((OpenCLKernel *)Function->Kernel)->Kernel = clCreateKernelFcnPtr(
+      ((OpenCLKernel *)Function->Kernel)->Program, KernelName, &Ret);
+  checkOpenCLError(Ret, "Failed to create kernel.\n");
+
+  ((OpenCLKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
+
+  if (CacheMode) {
+    if (KernelCache[NextCacheItem])
+      freeKernelCL(KernelCache[NextCacheItem]);
+
+    KernelCache[NextCacheItem] = Function;
+
+    NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
+  }
+
+  return Function;
+}
+
+static void copyFromHostToDeviceCL(void *HostData, PollyGPUDevicePtr *DevData,
+                                   long MemSize) {
+  dump_function();
+
+  if (!GlobalContext) {
+    fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
+    exit(-1);
+  }
+
+  cl_int Ret;
+  Ret = clEnqueueWriteBufferFcnPtr(
+      ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
+      ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
+      HostData, 0, NULL, NULL);
+  checkOpenCLError(Ret, "Copying data from host memory to device failed.\n");
+}
+
+static void copyFromDeviceToHostCL(PollyGPUDevicePtr *DevData, void *HostData,
+                                   long MemSize) {
+  dump_function();
+
+  if (!GlobalContext) {
+    fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
+    exit(-1);
+  }
+
+  cl_int Ret;
+  Ret = clEnqueueReadBufferFcnPtr(
+      ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
+      ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0, MemSize,
+      HostData, 0, NULL, NULL);
+  checkOpenCLError(Ret, "Copying results from device to host memory failed.\n");
+}
+
+static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX,
+                           unsigned int GridDimY, unsigned int BlockDimX,
+                           unsigned int BlockDimY, unsigned int BlockDimZ,
+                           void **Parameters) {
+  dump_function();
+
+  cl_int Ret;
+  cl_uint NumArgs;
+
+  if (!GlobalContext) {
+    fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
+    exit(-1);
+  }
+
+  OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel;
+  Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS,
+                              sizeof(cl_uint), &NumArgs, NULL);
+  checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n");
+
+  // TODO: Pass the size of the kernel arguments in to launchKernelCL, along
+  // with the arguments themselves. This is a dirty workaround that can be
+  // broken.
+  for (cl_uint i = 0; i < NumArgs; i++) {
+    Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 8, (void *)Parameters[i]);
+    if (Ret == CL_INVALID_ARG_SIZE) {
+      Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 4, (void *)Parameters[i]);
+      if (Ret == CL_INVALID_ARG_SIZE) {
+        Ret =
+            clSetKernelArgFcnPtr(CLKernel->Kernel, i, 2, (void *)Parameters[i]);
+        if (Ret == CL_INVALID_ARG_SIZE) {
+          Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 1,
+                                     (void *)Parameters[i]);
+          checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i);
+        }
+      }
+    }
+    if (Ret != CL_SUCCESS && Ret != CL_INVALID_ARG_SIZE) {
+      fprintf(stderr, "Failed to set Kernel argument.\n");
+      printOpenCLError(Ret);
+      exit(-1);
+    }
+  }
+
+  unsigned int GridDimZ = 1;
+  size_t GlobalWorkSize[3] = {BlockDimX * GridDimX, BlockDimY * GridDimY,
+                              BlockDimZ * GridDimZ};
+  size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ};
+
+  static const int WorkDim = 3;
+  OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context;
+  Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, CLKernel->Kernel,
+                                     WorkDim, NULL, GlobalWorkSize,
+                                     LocalWorkSize, 0, NULL, NULL);
+  checkOpenCLError(Ret, "Launching OpenCL kernel failed.\n");
+}
+
+static void freeDeviceMemoryCL(PollyGPUDevicePtr *Allocation) {
+  dump_function();
+
+  OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
+  cl_int Ret = clReleaseMemObjectFcnPtr((cl_mem)DevPtr->MemObj);
+  checkOpenCLError(Ret, "Failed to free device memory.\n");
+
+  free(DevPtr);
+  free(Allocation);
+}
+
+static PollyGPUDevicePtr *allocateMemoryForDeviceCL(long MemSize) {
+  dump_function();
+
+  if (!GlobalContext) {
+    fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
+    exit(-1);
+  }
+
+  PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
+  if (DevData == 0) {
+    fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
+    exit(-1);
+  }
+  DevData->DevicePtr = (OpenCLDevicePtr *)malloc(sizeof(OpenCLDevicePtr));
+  if (DevData->DevicePtr == 0) {
+    fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
+    exit(-1);
+  }
+
+  cl_int Ret;
+  ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj =
+      clCreateBufferFcnPtr(((OpenCLContext *)GlobalContext->Context)->Context,
+                           CL_MEM_READ_WRITE, MemSize, NULL, &Ret);
+  checkOpenCLError(Ret,
+                   "Allocate memory for GPU device memory pointer failed.\n");
+
+  return DevData;
+}
+
+static void *getDevicePtrCL(PollyGPUDevicePtr *Allocation) {
+  dump_function();
+
+  OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
+  return (void *)DevPtr->MemObj;
+}
+
+static void synchronizeDeviceCL() {
+  dump_function();
+
+  if (!GlobalContext) {
+    fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
+    exit(-1);
+  }
+
+  if (clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue) !=
+      CL_SUCCESS) {
+    fprintf(stderr, "Synchronizing device and host memory failed.\n");
+    exit(-1);
+  }
+}
+
+static void freeContextCL(PollyGPUContext *Context) {
+  dump_function();
+
+  cl_int Ret;
+
+  GlobalContext = NULL;
+
+  OpenCLContext *Ctx = (OpenCLContext *)Context->Context;
+  if (Ctx->CommandQueue) {
+    Ret = clReleaseCommandQueueFcnPtr(Ctx->CommandQueue);
+    checkOpenCLError(Ret, "Could not release command queue.\n");
+  }
+
+  if (Ctx->Context) {
+    Ret = clReleaseContextFcnPtr(Ctx->Context);
+    checkOpenCLError(Ret, "Could not release context.\n");
+  }
+
+  free(Ctx);
+  free(Context);
+}
+
+static void printOpenCLError(int Error) {
+
+  switch (Error) {
+  case CL_SUCCESS:
+    // Success, don't print an error.
+    break;
+
+  // JIT/Runtime errors.
+  case CL_DEVICE_NOT_FOUND:
+    fprintf(stderr, "Device not found.\n");
+    break;
+  case CL_DEVICE_NOT_AVAILABLE:
+    fprintf(stderr, "Device not available.\n");
+    break;
+  case CL_COMPILER_NOT_AVAILABLE:
+    fprintf(stderr, "Compiler not available.\n");
+    break;
+  case CL_MEM_OBJECT_ALLOCATION_FAILURE:
+    fprintf(stderr, "Mem object allocation failure.\n");
+    break;
+  case CL_OUT_OF_RESOURCES:
+    fprintf(stderr, "Out of resources.\n");
+    break;
+  case CL_OUT_OF_HOST_MEMORY:
+    fprintf(stderr, "Out of host memory.\n");
+    break;
+  case CL_PROFILING_INFO_NOT_AVAILABLE:
+    fprintf(stderr, "Profiling info not available.\n");
+    break;
+  case CL_MEM_COPY_OVERLAP:
+    fprintf(stderr, "Mem copy overlap.\n");
+    break;
+  case CL_IMAGE_FORMAT_MISMATCH:
+    fprintf(stderr, "Image format mismatch.\n");
+    break;
+  case CL_IMAGE_FORMAT_NOT_SUPPORTED:
+    fprintf(stderr, "Image format not supported.\n");
+    break;
+  case CL_BUILD_PROGRAM_FAILURE:
+    fprintf(stderr, "Build program failure.\n");
+    break;
+  case CL_MAP_FAILURE:
+    fprintf(stderr, "Map failure.\n");
+    break;
+  case CL_MISALIGNED_SUB_BUFFER_OFFSET:
+    fprintf(stderr, "Misaligned sub buffer offset.\n");
+    break;
+  case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
+    fprintf(stderr, "Exec status error for events in wait list.\n");
+    break;
+  case CL_COMPILE_PROGRAM_FAILURE:
+    fprintf(stderr, "Compile program failure.\n");
+    break;
+  case CL_LINKER_NOT_AVAILABLE:
+    fprintf(stderr, "Linker not available.\n");
+    break;
+  case CL_LINK_PROGRAM_FAILURE:
+    fprintf(stderr, "Link program failure.\n");
+    break;
+  case CL_DEVICE_PARTITION_FAILED:
+    fprintf(stderr, "Device partition failed.\n");
+    break;
+  case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
+    fprintf(stderr, "Kernel arg info not available.\n");
+    break;
+
+  // Compiler errors.
+  case CL_INVALID_VALUE:
+    fprintf(stderr, "Invalid value.\n");
+    break;
+  case CL_INVALID_DEVICE_TYPE:
+    fprintf(stderr, "Invalid device type.\n");
+    break;
+  case CL_INVALID_PLATFORM:
+    fprintf(stderr, "Invalid platform.\n");
+    break;
+  case CL_INVALID_DEVICE:
+    fprintf(stderr, "Invalid device.\n");
+    break;
+  case CL_INVALID_CONTEXT:
+    fprintf(stderr, "Invalid context.\n");
+    break;
+  case CL_INVALID_QUEUE_PROPERTIES:
+    fprintf(stderr, "Invalid queue properties.\n");
+    break;
+  case CL_INVALID_COMMAND_QUEUE:
+    fprintf(stderr, "Invalid command queue.\n");
+    break;
+  case CL_INVALID_HOST_PTR:
+    fprintf(stderr, "Invalid host pointer.\n");
+    break;
+  case CL_INVALID_MEM_OBJECT:
+    fprintf(stderr, "Invalid memory object.\n");
+    break;
+  case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
+    fprintf(stderr, "Invalid image format descriptor.\n");
+    break;
+  case CL_INVALID_IMAGE_SIZE:
+    fprintf(stderr, "Invalid image size.\n");
+    break;
+  case CL_INVALID_SAMPLER:
+    fprintf(stderr, "Invalid sampler.\n");
+    break;
+  case CL_INVALID_BINARY:
+    fprintf(stderr, "Invalid binary.\n");
+    break;
+  case CL_INVALID_BUILD_OPTIONS:
+    fprintf(stderr, "Invalid build options.\n");
+    break;
+  case CL_INVALID_PROGRAM:
+    fprintf(stderr, "Invalid program.\n");
+    break;
+  case CL_INVALID_PROGRAM_EXECUTABLE:
+    fprintf(stderr, "Invalid program executable.\n");
+    break;
+  case CL_INVALID_KERNEL_NAME:
+    fprintf(stderr, "Invalid kernel name.\n");
+    break;
+  case CL_INVALID_KERNEL_DEFINITION:
+    fprintf(stderr, "Invalid kernel definition.\n");
+    break;
+  case CL_INVALID_KERNEL:
+    fprintf(stderr, "Invalid kernel.\n");
+    break;
+  case CL_INVALID_ARG_INDEX:
+    fprintf(stderr, "Invalid arg index.\n");
+    break;
+  case CL_INVALID_ARG_VALUE:
+    fprintf(stderr, "Invalid arg value.\n");
+    break;
+  case CL_INVALID_ARG_SIZE:
+    fprintf(stderr, "Invalid arg size.\n");
+    break;
+  case CL_INVALID_KERNEL_ARGS:
+    fprintf(stderr, "Invalid kernel args.\n");
+    break;
+  case CL_INVALID_WORK_DIMENSION:
+    fprintf(stderr, "Invalid work dimension.\n");
+    break;
+  case CL_INVALID_WORK_GROUP_SIZE:
+    fprintf(stderr, "Invalid work group size.\n");
+    break;
+  case CL_INVALID_WORK_ITEM_SIZE:
+    fprintf(stderr, "Invalid work item size.\n");
+    break;
+  case CL_INVALID_GLOBAL_OFFSET:
+    fprintf(stderr, "Invalid global offset.\n");
+    break;
+  case CL_INVALID_EVENT_WAIT_LIST:
+    fprintf(stderr, "Invalid event wait list.\n");
+    break;
+  case CL_INVALID_EVENT:
+    fprintf(stderr, "Invalid event.\n");
+    break;
+  case CL_INVALID_OPERATION:
+    fprintf(stderr, "Invalid operation.\n");
+    break;
+  case CL_INVALID_GL_OBJECT:
+    fprintf(stderr, "Invalid GL object.\n");
+    break;
+  case CL_INVALID_BUFFER_SIZE:
+    fprintf(stderr, "Invalid buffer size.\n");
+    break;
+  case CL_INVALID_MIP_LEVEL:
+    fprintf(stderr, "Invalid mip level.\n");
+    break;
+  case CL_INVALID_GLOBAL_WORK_SIZE:
+    fprintf(stderr, "Invalid global work size.\n");
+    break;
+  case CL_INVALID_PROPERTY:
+    fprintf(stderr, "Invalid property.\n");
+    break;
+  case CL_INVALID_IMAGE_DESCRIPTOR:
+    fprintf(stderr, "Invalid image descriptor.\n");
+    break;
+  case CL_INVALID_COMPILER_OPTIONS:
+    fprintf(stderr, "Invalid compiler options.\n");
+    break;
+  case CL_INVALID_LINKER_OPTIONS:
+    fprintf(stderr, "Invalid linker options.\n");
+    break;
+  case CL_INVALID_DEVICE_PARTITION_COUNT:
+    fprintf(stderr, "Invalid device partition count.\n");
+    break;
+  case CL_INVALID_PIPE_SIZE:
+    fprintf(stderr, "Invalid pipe size.\n");
+    break;
+  case CL_INVALID_DEVICE_QUEUE:
+    fprintf(stderr, "Invalid device queue.\n");
+    break;
+
+  // NVIDIA specific error.
+  case -9999:
+    fprintf(stderr, "NVIDIA invalid read or write buffer.\n");
+    break;
+
+  default:
+    fprintf(stderr, "Unknown error code!\n");
+    break;
+  }
+}
+
+#endif /* HAS_LIBOPENCL */
+/******************************************************************************/
+/*                                   CUDA                                     */
+/******************************************************************************/
+#ifdef HAS_LIBCUDART
+
+struct CUDAContextT {
+  CUcontext Cuda;
+};
+
+struct CUDAKernelT {
   CUfunction Cuda;
   CUmodule CudaModule;
-  const char *PTXString;
+  const char *BinaryString;
 };
 
-struct PollyGPUDevicePtrT {
+struct CUDADevicePtrT {
   CUdeviceptr Cuda;
 };
 
@@ -58,7 +907,7 @@ static CuMemAllocFcnTy *CuMemAllocFcnPtr
 
 typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
     CUfunction F, unsigned int GridDimX, unsigned int GridDimY,
-    unsigned int GridDimZ, unsigned int BlockDimX, unsigned int BlockDimY,
+    unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY,
     unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream HStream,
     void **KernelParams, void **Extra);
 static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr;
@@ -137,7 +986,7 @@ static CuCtxSynchronizeFcnTy *CuCtxSynch
 typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
 static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
 
-static void *getAPIHandle(void *Handle, const char *FuncName) {
+static void *getAPIHandleCUDA(void *Handle, const char *FuncName) {
   char *Err;
   void *FuncPtr;
   dlerror();
@@ -149,24 +998,24 @@ static void *getAPIHandle(void *Handle,
   return FuncPtr;
 }
 
-static int initialDeviceAPILibraries() {
+static int initialDeviceAPILibrariesCUDA() {
   HandleCuda = dlopen("libcuda.so", RTLD_LAZY);
   if (!HandleCuda) {
-    printf("Cannot open library: %s. \n", dlerror());
+    fprintf(stderr, "Cannot open library: %s. \n", dlerror());
     return 0;
   }
 
   HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY);
   if (!HandleCudaRT) {
-    printf("Cannot open library: %s. \n", dlerror());
+    fprintf(stderr, "Cannot open library: %s. \n", dlerror());
     return 0;
   }
 
   return 1;
 }
 
-static int initialDeviceAPIs() {
-  if (initialDeviceAPILibraries() == 0)
+static int initialDeviceAPIsCUDA() {
+  if (initialDeviceAPILibrariesCUDA() == 0)
     return 0;
 
   /* Get function pointer to CUDA Driver APIs.
@@ -178,77 +1027,76 @@ static int initialDeviceAPIs() {
    * as it is valid on POSIX 2008.
    */
   CuLaunchKernelFcnPtr =
-      (CuLaunchKernelFcnTy *)getAPIHandle(HandleCuda, "cuLaunchKernel");
+      (CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel");
 
   CuMemAllocFcnPtr =
-      (CuMemAllocFcnTy *)getAPIHandle(HandleCuda, "cuMemAlloc_v2");
+      (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2");
 
-  CuMemFreeFcnPtr = (CuMemFreeFcnTy *)getAPIHandle(HandleCuda, "cuMemFree_v2");
+  CuMemFreeFcnPtr =
+      (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2");
 
   CuMemcpyDtoHFcnPtr =
-      (CuMemcpyDtoHFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyDtoH_v2");
+      (CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2");
 
   CuMemcpyHtoDFcnPtr =
-      (CuMemcpyHtoDFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyHtoD_v2");
+      (CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2");
 
   CuModuleUnloadFcnPtr =
-      (CuModuleUnloadFcnTy *)getAPIHandle(HandleCuda, "cuModuleUnload");
+      (CuModuleUnloadFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleUnload");
 
   CuCtxDestroyFcnPtr =
-      (CuCtxDestroyFcnTy *)getAPIHandle(HandleCuda, "cuCtxDestroy");
+      (CuCtxDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxDestroy");
 
-  CuInitFcnPtr = (CuInitFcnTy *)getAPIHandle(HandleCuda, "cuInit");
+  CuInitFcnPtr = (CuInitFcnTy *)getAPIHandleCUDA(HandleCuda, "cuInit");
 
   CuDeviceGetCountFcnPtr =
-      (CuDeviceGetCountFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetCount");
+      (CuDeviceGetCountFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetCount");
 
   CuDeviceGetFcnPtr =
-      (CuDeviceGetFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGet");
+      (CuDeviceGetFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGet");
 
   CuCtxCreateFcnPtr =
-      (CuCtxCreateFcnTy *)getAPIHandle(HandleCuda, "cuCtxCreate_v2");
+      (CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxCreate_v2");
 
-  CuModuleLoadDataExFcnPtr =
-      (CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx");
+  CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandleCUDA(
+      HandleCuda, "cuModuleLoadDataEx");
 
   CuModuleLoadDataFcnPtr =
-      (CuModuleLoadDataFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadData");
+      (CuModuleLoadDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleLoadData");
 
-  CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle(
+  CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandleCUDA(
       HandleCuda, "cuModuleGetFunction");
 
   CuDeviceComputeCapabilityFcnPtr =
-      (CuDeviceComputeCapabilityFcnTy *)getAPIHandle(
+      (CuDeviceComputeCapabilityFcnTy *)getAPIHandleCUDA(
           HandleCuda, "cuDeviceComputeCapability");
 
   CuDeviceGetNameFcnPtr =
-      (CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName");
+      (CuDeviceGetNameFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetName");
 
   CuLinkAddDataFcnPtr =
-      (CuLinkAddDataFcnTy *)getAPIHandle(HandleCuda, "cuLinkAddData");
+      (CuLinkAddDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkAddData");
 
   CuLinkCreateFcnPtr =
-      (CuLinkCreateFcnTy *)getAPIHandle(HandleCuda, "cuLinkCreate");
+      (CuLinkCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkCreate");
 
   CuLinkCompleteFcnPtr =
-      (CuLinkCompleteFcnTy *)getAPIHandle(HandleCuda, "cuLinkComplete");
+      (CuLinkCompleteFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkComplete");
 
   CuLinkDestroyFcnPtr =
-      (CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy");
+      (CuLinkDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkDestroy");
 
   CuCtxSynchronizeFcnPtr =
-      (CuCtxSynchronizeFcnTy *)getAPIHandle(HandleCuda, "cuCtxSynchronize");
+      (CuCtxSynchronizeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxSynchronize");
 
   /* Get function pointer to CUDA Runtime APIs. */
-  CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle(
+  CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandleCUDA(
       HandleCudaRT, "cudaThreadSynchronize");
 
   return 1;
 }
 
-PollyGPUContext *polly_initContext() {
-  DebugMode = getenv("POLLY_DEBUG") != 0;
-
+static PollyGPUContext *initContextCUDA() {
   dump_function();
   PollyGPUContext *Context;
   CUdevice Device;
@@ -263,7 +1111,7 @@ PollyGPUContext *polly_initContext() {
     return CurrentContext;
 
   /* Get API handles. */
-  if (initialDeviceAPIs() == 0) {
+  if (initialDeviceAPIsCUDA() == 0) {
     fprintf(stderr, "Getting the \"handle\" for the CUDA driver API failed.\n");
     exit(-1);
   }
@@ -293,9 +1141,12 @@ PollyGPUContext *polly_initContext() {
     fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
     exit(-1);
   }
-  CuCtxCreateFcnPtr(&(Context->Cuda), 0, Device);
-
-  CacheMode = getenv("POLLY_NOCACHE") == 0;
+  Context->Context = malloc(sizeof(CUDAContext));
+  if (Context->Context == 0) {
+    fprintf(stderr, "Allocate memory for Polly CUDA context failed.\n");
+    exit(-1);
+  }
+  CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0, Device);
 
   if (CacheMode)
     CurrentContext = Context;
@@ -303,18 +1154,24 @@ PollyGPUContext *polly_initContext() {
   return Context;
 }
 
-static void freeKernel(PollyGPUFunction *Kernel) {
-  if (Kernel->CudaModule)
-    CuModuleUnloadFcnPtr(Kernel->CudaModule);
+static void freeKernelCUDA(PollyGPUFunction *Kernel) {
+  dump_function();
+
+  if (CacheMode)
+    return;
+
+  if (((CUDAKernel *)Kernel->Kernel)->CudaModule)
+    CuModuleUnloadFcnPtr(((CUDAKernel *)Kernel->Kernel)->CudaModule);
+
+  if (Kernel->Kernel)
+    free((CUDAKernel *)Kernel->Kernel);
 
   if (Kernel)
     free(Kernel);
 }
 
-#define KERNEL_CACHE_SIZE 10
-
-PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
-                                  const char *KernelName) {
+static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer,
+                                       const char *KernelName) {
   dump_function();
 
   static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
@@ -324,18 +1181,23 @@ PollyGPUFunction *polly_getKernel(const
     // We exploit here the property that all Polly-ACC kernels are allocated
     // as global constants, hence a pointer comparision is sufficient to
     // determin equality.
-    if (KernelCache[i] && KernelCache[i]->PTXString == PTXBuffer) {
+    if (KernelCache[i] &&
+        ((CUDAKernel *)KernelCache[i]->Kernel)->BinaryString == BinaryBuffer) {
       debug_print("  -> using cached kernel\n");
       return KernelCache[i];
     }
   }
 
   PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
-
   if (Function == 0) {
     fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
     exit(-1);
   }
+  Function->Kernel = (CUDAKernel *)malloc(sizeof(CUDAKernel));
+  if (Function->Kernel == 0) {
+    fprintf(stderr, "Allocate memory for Polly CUDA function failed.\n");
+    exit(-1);
+  }
 
   CUresult Res;
   CUlinkState LState;
@@ -370,8 +1232,8 @@ PollyGPUFunction *polly_getKernel(const
   memset(ErrorLog, 0, sizeof(ErrorLog));
 
   CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
-  Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)PTXBuffer,
-                            strlen(PTXBuffer) + 1, 0, 0, 0, 0);
+  Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)BinaryBuffer,
+                            strlen(BinaryBuffer) + 1, 0, 0, 0, 0);
   if (Res != CUDA_SUCCESS) {
     fprintf(stderr, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
     exit(-1);
@@ -387,13 +1249,15 @@ PollyGPUFunction *polly_getKernel(const
   debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
               InfoLog);
 
-  Res = CuModuleLoadDataFcnPtr(&(Function->CudaModule), CuOut);
+  Res = CuModuleLoadDataFcnPtr(&(((CUDAKernel *)Function->Kernel)->CudaModule),
+                               CuOut);
   if (Res != CUDA_SUCCESS) {
     fprintf(stderr, "Loading ptx assembly text failed.\n");
     exit(-1);
   }
 
-  Res = CuModuleGetFunctionFcnPtr(&(Function->Cuda), Function->CudaModule,
+  Res = CuModuleGetFunctionFcnPtr(&(((CUDAKernel *)Function->Kernel)->Cuda),
+                                  ((CUDAKernel *)Function->Kernel)->CudaModule,
                                   KernelName);
   if (Res != CUDA_SUCCESS) {
     fprintf(stderr, "Loading kernel function failed.\n");
@@ -402,11 +1266,11 @@ PollyGPUFunction *polly_getKernel(const
 
   CuLinkDestroyFcnPtr(LState);
 
-  Function->PTXString = PTXBuffer;
+  ((CUDAKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
 
   if (CacheMode) {
     if (KernelCache[NextCacheItem])
-      freeKernel(KernelCache[NextCacheItem]);
+      freeKernelCUDA(KernelCache[NextCacheItem]);
 
     KernelCache[NextCacheItem] = Function;
 
@@ -416,44 +1280,37 @@ PollyGPUFunction *polly_getKernel(const
   return Function;
 }
 
-void polly_freeKernel(PollyGPUFunction *Kernel) {
+static void synchronizeDeviceCUDA() {
   dump_function();
-
-  if (CacheMode)
-    return;
-
-  freeKernel(Kernel);
+  if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
+    fprintf(stderr, "Synchronizing device and host memory failed.\n");
+    exit(-1);
+  }
 }
 
-void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
-                                long MemSize) {
+static void copyFromHostToDeviceCUDA(void *HostData, PollyGPUDevicePtr *DevData,
+                                     long MemSize) {
   dump_function();
 
-  CUdeviceptr CuDevData = DevData->Cuda;
+  CUdeviceptr CuDevData = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda;
   CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize);
 }
 
-void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
-                                long MemSize) {
+static void copyFromDeviceToHostCUDA(PollyGPUDevicePtr *DevData, void *HostData,
+                                     long MemSize) {
   dump_function();
 
-  if (CuMemcpyDtoHFcnPtr(HostData, DevData->Cuda, MemSize) != CUDA_SUCCESS) {
+  if (CuMemcpyDtoHFcnPtr(HostData, ((CUDADevicePtr *)DevData->DevicePtr)->Cuda,
+                         MemSize) != CUDA_SUCCESS) {
     fprintf(stderr, "Copying results from device to host memory failed.\n");
     exit(-1);
   }
 }
-void polly_synchronizeDevice() {
-  dump_function();
-  if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
-    fprintf(stderr, "Synchronizing device and host memory failed.\n");
-    exit(-1);
-  }
-}
 
-void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
-                        unsigned int GridDimY, unsigned int BlockDimX,
-                        unsigned int BlockDimY, unsigned int BlockDimZ,
-                        void **Parameters) {
+static void launchKernelCUDA(PollyGPUFunction *Kernel, unsigned int GridDimX,
+                             unsigned int GridDimY, unsigned int BlockDimX,
+                             unsigned int BlockDimY, unsigned int BlockDimZ,
+                             void **Parameters) {
   dump_function();
 
   unsigned GridDimZ = 1;
@@ -462,32 +1319,40 @@ void polly_launchKernel(PollyGPUFunction
   void **Extra = 0;
 
   CUresult Res;
-  Res = CuLaunchKernelFcnPtr(Kernel->Cuda, GridDimX, GridDimY, GridDimZ,
-                             BlockDimX, BlockDimY, BlockDimZ, SharedMemBytes,
-                             Stream, Parameters, Extra);
+  Res =
+      CuLaunchKernelFcnPtr(((CUDAKernel *)Kernel->Kernel)->Cuda, GridDimX,
+                           GridDimY, GridDimZ, BlockDimX, BlockDimY, BlockDimZ,
+                           SharedMemBytes, Stream, Parameters, Extra);
   if (Res != CUDA_SUCCESS) {
     fprintf(stderr, "Launching CUDA kernel failed.\n");
     exit(-1);
   }
 }
 
-void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
+static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) {
   dump_function();
-  CuMemFreeFcnPtr((CUdeviceptr)Allocation->Cuda);
+  CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
+  CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda);
+  free(DevPtr);
   free(Allocation);
 }
 
-PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
+static PollyGPUDevicePtr *allocateMemoryForDeviceCUDA(long MemSize) {
   dump_function();
 
   PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
-
   if (DevData == 0) {
     fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
     exit(-1);
   }
+  DevData->DevicePtr = (CUDADevicePtr *)malloc(sizeof(CUDADevicePtr));
+  if (DevData->DevicePtr == 0) {
+    fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
+    exit(-1);
+  }
 
-  CUresult Res = CuMemAllocFcnPtr(&(DevData->Cuda), MemSize);
+  CUresult Res =
+      CuMemAllocFcnPtr(&(((CUDADevicePtr *)DevData->DevicePtr)->Cuda), MemSize);
 
   if (Res != CUDA_SUCCESS) {
     fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
@@ -497,10 +1362,247 @@ PollyGPUDevicePtr *polly_allocateMemoryF
   return DevData;
 }
 
+static void *getDevicePtrCUDA(PollyGPUDevicePtr *Allocation) {
+  dump_function();
+
+  CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
+  return (void *)DevPtr->Cuda;
+}
+
+static void freeContextCUDA(PollyGPUContext *Context) {
+  dump_function();
+
+  CUDAContext *Ctx = (CUDAContext *)Context->Context;
+  if (Ctx->Cuda) {
+    CuCtxDestroyFcnPtr(Ctx->Cuda);
+    free(Ctx);
+    free(Context);
+  }
+
+  dlclose(HandleCuda);
+  dlclose(HandleCudaRT);
+}
+
+#endif /* HAS_LIBCUDART */
+/******************************************************************************/
+/*                                    API                                     */
+/******************************************************************************/
+
+PollyGPUContext *polly_initContext() {
+  DebugMode = getenv("POLLY_DEBUG") != 0;
+  CacheMode = getenv("POLLY_NOCACHE") == 0;
+
+  dump_function();
+
+  PollyGPUContext *Context;
+
+  switch (Runtime) {
+#ifdef HAS_LIBCUDART
+  case RUNTIME_CUDA:
+    Context = initContextCUDA();
+    break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+  case RUNTIME_CL:
+    Context = initContextCL();
+    break;
+#endif /* HAS_LIBOPENCL */
+  default:
+    err_runtime();
+  }
+
+  return Context;
+}
+
+void polly_freeKernel(PollyGPUFunction *Kernel) {
+  dump_function();
+
+  switch (Runtime) {
+#ifdef HAS_LIBCUDART
+  case RUNTIME_CUDA:
+    freeKernelCUDA(Kernel);
+    break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+  case RUNTIME_CL:
+    freeKernelCL(Kernel);
+    break;
+#endif /* HAS_LIBOPENCL */
+  default:
+    err_runtime();
+  }
+}
+
+PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
+                                  const char *KernelName) {
+  dump_function();
+
+  PollyGPUFunction *Function;
+
+  switch (Runtime) {
+#ifdef HAS_LIBCUDART
+  case RUNTIME_CUDA:
+    Function = getKernelCUDA(BinaryBuffer, KernelName);
+    break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+  case RUNTIME_CL:
+    Function = getKernelCL(BinaryBuffer, KernelName);
+    break;
+#endif /* HAS_LIBOPENCL */
+  default:
+    err_runtime();
+  }
+
+  return Function;
+}
+
+void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
+                                long MemSize) {
+  dump_function();
+
+  switch (Runtime) {
+#ifdef HAS_LIBCUDART
+  case RUNTIME_CUDA:
+    copyFromHostToDeviceCUDA(HostData, DevData, MemSize);
+    break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+  case RUNTIME_CL:
+    copyFromHostToDeviceCL(HostData, DevData, MemSize);
+    break;
+#endif /* HAS_LIBOPENCL */
+  default:
+    err_runtime();
+  }
+}
+
+void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
+                                long MemSize) {
+  dump_function();
+
+  switch (Runtime) {
+#ifdef HAS_LIBCUDART
+  case RUNTIME_CUDA:
+    copyFromDeviceToHostCUDA(DevData, HostData, MemSize);
+    break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+  case RUNTIME_CL:
+    copyFromDeviceToHostCL(DevData, HostData, MemSize);
+    break;
+#endif /* HAS_LIBOPENCL */
+  default:
+    err_runtime();
+  }
+}
+
+void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
+                        unsigned int GridDimY, unsigned int BlockDimX,
+                        unsigned int BlockDimY, unsigned int BlockDimZ,
+                        void **Parameters) {
+  dump_function();
+
+  switch (Runtime) {
+#ifdef HAS_LIBCUDART
+  case RUNTIME_CUDA:
+    launchKernelCUDA(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
+                     BlockDimZ, Parameters);
+    break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+  case RUNTIME_CL:
+    launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ,
+                   Parameters);
+    break;
+#endif /* HAS_LIBOPENCL */
+  default:
+    err_runtime();
+  }
+}
+
+void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
+  dump_function();
+
+  switch (Runtime) {
+#ifdef HAS_LIBCUDART
+  case RUNTIME_CUDA:
+    freeDeviceMemoryCUDA(Allocation);
+    break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+  case RUNTIME_CL:
+    freeDeviceMemoryCL(Allocation);
+    break;
+#endif /* HAS_LIBOPENCL */
+  default:
+    err_runtime();
+  }
+}
+
+PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
+  dump_function();
+
+  PollyGPUDevicePtr *DevData;
+
+  switch (Runtime) {
+#ifdef HAS_LIBCUDART
+  case RUNTIME_CUDA:
+    DevData = allocateMemoryForDeviceCUDA(MemSize);
+    break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+  case RUNTIME_CL:
+    DevData = allocateMemoryForDeviceCL(MemSize);
+    break;
+#endif /* HAS_LIBOPENCL */
+  default:
+    err_runtime();
+  }
+
+  return DevData;
+}
+
 void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) {
   dump_function();
 
-  return (void *)Allocation->Cuda;
+  void *DevPtr;
+
+  switch (Runtime) {
+#ifdef HAS_LIBCUDART
+  case RUNTIME_CUDA:
+    DevPtr = getDevicePtrCUDA(Allocation);
+    break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+  case RUNTIME_CL:
+    DevPtr = getDevicePtrCL(Allocation);
+    break;
+#endif /* HAS_LIBOPENCL */
+  default:
+    err_runtime();
+  }
+
+  return DevPtr;
+}
+
+void polly_synchronizeDevice() {
+  dump_function();
+
+  switch (Runtime) {
+#ifdef HAS_LIBCUDART
+  case RUNTIME_CUDA:
+    synchronizeDeviceCUDA();
+    break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+  case RUNTIME_CL:
+    synchronizeDeviceCL();
+    break;
+#endif /* HAS_LIBOPENCL */
+  default:
+    err_runtime();
+  }
 }
 
 void polly_freeContext(PollyGPUContext *Context) {
@@ -509,11 +1611,40 @@ void polly_freeContext(PollyGPUContext *
   if (CacheMode)
     return;
 
-  if (Context->Cuda) {
-    CuCtxDestroyFcnPtr(Context->Cuda);
-    free(Context);
+  switch (Runtime) {
+#ifdef HAS_LIBCUDART
+  case RUNTIME_CUDA:
+    freeContextCUDA(Context);
+    break;
+#endif /* HAS_LIBCUDART */
+#ifdef HAS_LIBOPENCL
+  case RUNTIME_CL:
+    freeContextCL(Context);
+    break;
+#endif /* HAS_LIBOPENCL */
+  default:
+    err_runtime();
   }
+}
 
-  dlclose(HandleCuda);
-  dlclose(HandleCudaRT);
+/* Initialize GPUJIT with CUDA as runtime library. */
+PollyGPUContext *polly_initContextCUDA() {
+#ifdef HAS_LIBCUDART
+  Runtime = RUNTIME_CUDA;
+  return polly_initContext();
+#else
+  fprintf(stderr, "GPU Runtime was built without CUDA support.\n");
+  exit(-1);
+#endif /* HAS_LIBCUDART */
+}
+
+/* Initialize GPUJIT with OpenCL as runtime library. */
+PollyGPUContext *polly_initContextCL() {
+#ifdef HAS_LIBOPENCL
+  Runtime = RUNTIME_CL;
+  return polly_initContext();
+#else
+  fprintf(stderr, "GPU Runtime was built without OpenCL support.\n");
+  exit(-1);
+#endif /* HAS_LIBOPENCL */
 }

Modified: polly/trunk/tools/GPURuntime/GPUJIT.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/tools/GPURuntime/GPUJIT.h?rev=302379&r1=302378&r2=302379&view=diff
==============================================================================
--- polly/trunk/tools/GPURuntime/GPUJIT.h (original)
+++ polly/trunk/tools/GPURuntime/GPUJIT.h Sun May  7 16:03:46 2017
@@ -76,12 +76,27 @@
  *
  */
 
+typedef enum PollyGPURuntimeT {
+  RUNTIME_NONE,
+  RUNTIME_CUDA,
+  RUNTIME_CL
+} PollyGPURuntime;
+
 typedef struct PollyGPUContextT PollyGPUContext;
 typedef struct PollyGPUFunctionT PollyGPUFunction;
 typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr;
 
-PollyGPUContext *polly_initContext();
-PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
+typedef struct OpenCLContextT OpenCLContext;
+typedef struct OpenCLKernelT OpenCLKernel;
+typedef struct OpenCLDevicePtrT OpenCLDevicePtr;
+
+typedef struct CUDAContextT CUDAContext;
+typedef struct CUDAKernelT CUDAKernel;
+typedef struct CUDADevicePtrT CUDADevicePtr;
+
+PollyGPUContext *polly_initContextCUDA();
+PollyGPUContext *polly_initContextCL();
+PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
                                   const char *KernelName);
 void polly_freeKernel(PollyGPUFunction *Kernel);
 void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,




More information about the llvm-commits mailing list