[polly] r302217 - Revert "[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen"

Tobias Grosser via llvm-commits llvm-commits at lists.llvm.org
Fri May 5 02:41:33 PDT 2017


On Fri, May 5, 2017, at 11:02 AM, Siddharth Bhat via llvm-commits wrote:
> Author: bollu
> Date: Fri May  5 04:02:08 2017
> New Revision: 302217
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=302217&view=rev
> Log:
> Revert "[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU
> CodeGen"
> 
> This reverts commit 17a84e414adb51ee375d14836d4c2a817b191933.

Hi siddharth,

please use SVN ids in your messages.

Best,
Tobias

> 
> Patches should have been submitted in the order of:
> 
> 1. D32852
> 2. D32854
> 3. D32431
> 
> I mistakenly pushed D32431(3) first. Reverting to push in the correct
> order.
> 
> Removed:
>     polly/trunk/include/polly/CodeGen/PPCGCodeGeneration.h
> Modified:
>     polly/trunk/CMakeLists.txt
>     polly/trunk/include/polly/LinkAllPasses.h
>     polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
>     polly/trunk/lib/Support/RegisterPasses.cpp
>     polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll
>     polly/trunk/test/GPGPU/size-cast.ll
>     polly/trunk/tools/CMakeLists.txt
>     polly/trunk/tools/GPURuntime/GPUJIT.c
>     polly/trunk/tools/GPURuntime/GPUJIT.h
> 
> Modified: polly/trunk/CMakeLists.txt
> URL:
> http://llvm.org/viewvc/llvm-project/polly/trunk/CMakeLists.txt?rev=302217&r1=302216&r2=302217&view=diff
> ==============================================================================
> --- polly/trunk/CMakeLists.txt (original)
> +++ polly/trunk/CMakeLists.txt Fri May  5 04:02:08 2017
> @@ -152,10 +152,9 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TR
>  
>  option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature"
>  OFF)
>  if (POLLY_ENABLE_GPGPU_CODEGEN)
> -  # Do not require CUDA/OpenCL, as GPU code generation test cases can be
> run
> -  # without a CUDA/OpenCL library.
> +  # Do not require CUDA, as GPU code generation test cases can be run
> without
> +  # a cuda library.
>    FIND_PACKAGE(CUDA)
> -  FIND_PACKAGE(OpenCL)
>    set(GPU_CODEGEN TRUE)
>  else(POLLY_ENABLE_GPGPU_CODEGEN)
>    set(GPU_CODEGEN FALSE)
> @@ -164,13 +163,8 @@ endif(POLLY_ENABLE_GPGPU_CODEGEN)
>  
>  # Support GPGPU code generation if the library is available.
>  if (CUDALIB_FOUND)
> -  add_definitions(-DHAS_LIBCUDART)
>    INCLUDE_DIRECTORIES( ${CUDALIB_INCLUDE_DIR} )
>  endif(CUDALIB_FOUND)
> -if (OpenCL_FOUND)
> -  add_definitions(-DHAS_LIBOPENCL)
> -  INCLUDE_DIRECTORIES( ${OpenCL_INCLUDE_DIR} )
> -endif(OpenCL_FOUND)
>  
>  option(POLLY_BUNDLED_ISL "Use the bundled version of libisl included in
>  Polly" ON)
>  if (NOT POLLY_BUNDLED_ISL)
> 
> Removed: polly/trunk/include/polly/CodeGen/PPCGCodeGeneration.h
> URL:
> http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/CodeGen/PPCGCodeGeneration.h?rev=302216&view=auto
> ==============================================================================
> --- polly/trunk/include/polly/CodeGen/PPCGCodeGeneration.h (original)
> +++ polly/trunk/include/polly/CodeGen/PPCGCodeGeneration.h (removed)
> @@ -1,24 +0,0 @@
> -//===--- polly/PPCGCodeGeneration.h - Polly Accelerator Code Generation.
> --===//
> -//
> -//                     The LLVM Compiler Infrastructure
> -//
> -// This file is distributed under the University of Illinois Open Source
> -// License. See LICENSE.TXT for details.
> -//
> -//===----------------------------------------------------------------------===//
> -//
> -// Take a scop created by ScopInfo and map it to GPU code using the ppcg
> -// GPU mapping strategy.
> -//
> -//===----------------------------------------------------------------------===//
> -
> -#ifndef POLLY_PPCGCODEGENERATION_H
> -#define POLLY_PPCGCODEGENERATION_H
> -
> -/// The GPU Architecture to target.
> -enum GPUArch { NVPTX64 };
> -
> -/// The GPU Runtime implementation to use.
> -enum GPURuntime { CUDA, OpenCL };
> -
> -#endif // POLLY_PPCGCODEGENERATION_H
> 
> Modified: polly/trunk/include/polly/LinkAllPasses.h
> URL:
> http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/LinkAllPasses.h?rev=302217&r1=302216&r2=302217&view=diff
> ==============================================================================
> --- polly/trunk/include/polly/LinkAllPasses.h (original)
> +++ polly/trunk/include/polly/LinkAllPasses.h Fri May  5 04:02:08 2017
> @@ -15,7 +15,6 @@
>  #ifndef POLLY_LINKALLPASSES_H
>  #define POLLY_LINKALLPASSES_H
>  
> -#include "polly/CodeGen/PPCGCodeGeneration.h"
>  #include "polly/Config/config.h"
>  #include "polly/PruneUnprofitable.h"
>  #include "polly/Simplify.h"
> @@ -49,8 +48,7 @@ llvm::Pass *createScopInfoWrapperPassPas
>  llvm::Pass *createIslAstInfoPass();
>  llvm::Pass *createCodeGenerationPass();
>  #ifdef GPU_CODEGEN
> -llvm::Pass *createPPCGCodeGenerationPass(GPUArch Arch =
> GPUArch::NVPTX64,
> -                                         GPURuntime Runtime =
> GPURuntime::CUDA);
> +llvm::Pass *createPPCGCodeGenerationPass();
>  #endif
>  llvm::Pass *createIslScheduleOptimizerPass();
>  llvm::Pass *createFlattenSchedulePass();
> 
> Modified: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp?rev=302217&r1=302216&r2=302217&view=diff
> ==============================================================================
> --- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp (original)
> +++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp Fri May  5 04:02:08
> 2017
> @@ -12,7 +12,6 @@
>  //
>  //===----------------------------------------------------------------------===//
>  
> -#include "polly/CodeGen/PPCGCodeGeneration.h"
>  #include "polly/CodeGen/IslAst.h"
>  #include "polly/CodeGen/IslNodeBuilder.h"
>  #include "polly/CodeGen/Utils.h"
> @@ -154,9 +153,9 @@ public:
>    GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator,
>                   const DataLayout &DL, LoopInfo &LI, ScalarEvolution
>                   &SE,
>                   DominatorTree &DT, Scop &S, BasicBlock *StartBlock,
> -                 gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch)
> +                 gpu_prog *Prog)
>        : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S,
>        StartBlock),
> -        Prog(Prog), Runtime(Runtime), Arch(Arch) {
> +        Prog(Prog) {
>      getExprBuilder().setIDToSAI(&IDToSAI);
>    }
>  
> @@ -202,12 +201,6 @@ private:
>    /// The GPU program we generate code for.
>    gpu_prog *Prog;
>  
> -  /// The GPU Runtime implementation to use (OpenCL or CUDA).
> -  GPURuntime Runtime;
> -
> -  /// The GPU Architecture to target.
> -  GPUArch Arch;
> -
>    /// Class to free isl_ids.
>    class IslIdDeleter {
>    public:
> @@ -759,17 +752,7 @@ void GPUNodeBuilder::createCallSynchroni
>  }
>  
>  Value *GPUNodeBuilder::createCallInitContext() {
> -  const char *Name;
> -
> -  switch (Runtime) {
> -  case GPURuntime::CUDA:
> -    Name = "polly_initContextCUDA";
> -    break;
> -  case GPURuntime::OpenCL:
> -    Name = "polly_initContextCL";
> -    break;
> -  }
> -
> +  const char *Name = "polly_initContext";
>    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
>    Function *F = M->getFunction(Name);
>  
> @@ -1045,15 +1028,7 @@ void GPUNodeBuilder::createScopStmt(isl_
>  
>  void GPUNodeBuilder::createKernelSync() {
>    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
> -
> -  Function *Sync;
> -
> -  switch (Arch) {
> -  case GPUArch::NVPTX64:
> -    Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
> -    break;
> -  }
> -
> +  auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
>    Builder.CreateCall(Sync, {});
>  }
>  
> @@ -1459,12 +1434,7 @@ GPUNodeBuilder::createKernelFunctionDecl
>    auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
>    auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
>                                GPUModule.get());
> -
> -  switch (Arch) {
> -  case GPUArch::NVPTX64:
> -    FN->setCallingConv(CallingConv::PTX_Kernel);
> -    break;
> -  }
> +  FN->setCallingConv(CallingConv::PTX_Kernel);
>  
>    auto Arg = FN->arg_begin();
>    for (long i = 0; i < Kernel->n_array; i++) {
> @@ -1525,19 +1495,12 @@ GPUNodeBuilder::createKernelFunctionDecl
>  }
>  
>  void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
> -  Intrinsic::ID IntrinsicsBID[2];
> -  Intrinsic::ID IntrinsicsTID[3];
> +  Intrinsic::ID IntrinsicsBID[] =
> {Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
> +                                  
> Intrinsic::nvvm_read_ptx_sreg_ctaid_y};
>  
> -  switch (Arch) {
> -  case GPUArch::NVPTX64:
> -    IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x;
> -    IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y;
> -
> -    IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x;
> -    IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y;
> -    IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z;
> -    break;
> -  }
> +  Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x,
> +                                   Intrinsic::nvvm_read_ptx_sreg_tid_y,
> +                                   Intrinsic::nvvm_read_ptx_sreg_tid_z};
>  
>    auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable
>    {
>      std::string Name = isl_id_get_name(Id);
> @@ -1686,18 +1649,11 @@ void GPUNodeBuilder::createKernelVariabl
>  
>  void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
>                                            SetVector<Value *>
>                                            &SubtreeValues) {
> +
>    std::string Identifier = "kernel_" + std::to_string(Kernel->id);
>    GPUModule.reset(new Module(Identifier, Builder.getContext()));
> -
> -  switch (Arch) {
> -  case GPUArch::NVPTX64:
> -    if (Runtime == GPURuntime::CUDA)
> -     
> GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
> -    else if (Runtime == GPURuntime::OpenCL)
> -     
> GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
> -    GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit
> */));
> -    break;
> -  }
> +  GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
> +  GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
>  
>    Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);
>  
> @@ -1718,21 +1674,7 @@ void GPUNodeBuilder::createKernelFunctio
>  }
>  
>  std::string GPUNodeBuilder::createKernelASM() {
> -  llvm::Triple GPUTriple;
> -
> -  switch (Arch) {
> -  case GPUArch::NVPTX64:
> -    switch (Runtime) {
> -    case GPURuntime::CUDA:
> -      GPUTriple =
> llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda"));
> -      break;
> -    case GPURuntime::OpenCL:
> -      GPUTriple =
> llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl"));
> -      break;
> -    }
> -    break;
> -  }
> -
> +  llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda"));
>    std::string ErrMsg;
>    auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(),
>    ErrMsg);
>  
> @@ -1743,17 +1685,9 @@ std::string GPUNodeBuilder::createKernel
>  
>    TargetOptions Options;
>    Options.UnsafeFPMath = FastMath;
> -
> -  std::string subtarget;
> -
> -  switch (Arch) {
> -  case GPUArch::NVPTX64:
> -    subtarget = CudaVersion;
> -    break;
> -  }
> -
> -  std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
> -      GPUTriple.getTriple(), subtarget, "", Options,
> Optional<Reloc::Model>()));
> +  std::unique_ptr<TargetMachine> TargetM(
> +      GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion,
> "",
> +                                     Options,
> Optional<Reloc::Model>()));
>  
>    SmallString<0> ASMString;
>    raw_svector_ostream ASMStream(ASMString);
> @@ -1805,10 +1739,6 @@ class PPCGCodeGeneration : public ScopPa
>  public:
>    static char ID;
>  
> -  GPURuntime Runtime = GPURuntime::CUDA;
> -
> -  GPUArch Architecture = GPUArch::NVPTX64;
> -
>    /// The scop that is currently processed.
>    Scop *S;
>  
> @@ -2592,7 +2522,7 @@ public:
>          executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI);
>  
>      GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT,
>      *S,
> -                               StartBlock, Prog, Runtime, Architecture);
> +                               StartBlock, Prog);
>  
>      // TODO: Handle LICM
>      auto SplitBlock = StartBlock->getSinglePredecessor();
> @@ -2680,12 +2610,7 @@ public:
>  
>  char PPCGCodeGeneration::ID = 1;
>  
> -Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime
> Runtime) {
> -  PPCGCodeGeneration *generator = new PPCGCodeGeneration();
> -  generator->Runtime = Runtime;
> -  generator->Architecture = Arch;
> -  return generator;
> -}
> +Pass *polly::createPPCGCodeGenerationPass() { return new
> PPCGCodeGeneration(); }
>  
>  INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
>                        "Polly - Apply PPCG translation to SCOP", false,
>                        false)
> 
> Modified: polly/trunk/lib/Support/RegisterPasses.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/polly/trunk/lib/Support/RegisterPasses.cpp?rev=302217&r1=302216&r2=302217&view=diff
> ==============================================================================
> --- polly/trunk/lib/Support/RegisterPasses.cpp (original)
> +++ polly/trunk/lib/Support/RegisterPasses.cpp Fri May  5 04:02:08 2017
> @@ -23,7 +23,6 @@
>  #include "polly/Canonicalization.h"
>  #include "polly/CodeGen/CodeGeneration.h"
>  #include "polly/CodeGen/CodegenCleanup.h"
> -#include "polly/CodeGen/PPCGCodeGeneration.h"
>  #include "polly/DeLICM.h"
>  #include "polly/DependenceInfo.h"
>  #include "polly/FlattenSchedule.h"
> @@ -102,23 +101,6 @@ static cl::opt<TargetChoice>
>                            ),
>             cl::init(TARGET_CPU), cl::ZeroOrMore,
>             cl::cat(PollyCategory));
>  
> -#ifdef GPU_CODEGEN
> -static cl::opt<GPURuntime> GPURuntimeChoice(
> -    "polly-gpu-runtime", cl::desc("The GPU Runtime API to target"),
> -    cl::values(clEnumValN(GPURuntime::CUDA, "libcudart",
> -                          "use the CUDA Runtime API"),
> -               clEnumValN(GPURuntime::OpenCL, "libopencl",
> -                          "use the OpenCL Runtime API")),
> -    cl::init(GPURuntime::CUDA), cl::ZeroOrMore, cl::cat(PollyCategory));
> -
> -static cl::opt<GPUArch>
> -    GPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to
> target"),
> -                  cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64",
> -                                        "target NVIDIA 64-bit
> architecture")),
> -                  cl::init(GPUArch::NVPTX64), cl::ZeroOrMore,
> -                  cl::cat(PollyCategory));
> -#endif
> -
>  VectorizerChoice polly::PollyVectorizerChoice;
>  static cl::opt<polly::VectorizerChoice, true> Vectorizer(
>      "polly-vectorizer", cl::desc("Select the vectorization strategy"),
> @@ -327,8 +309,7 @@ void registerPollyPasses(llvm::legacy::P
>  
>    if (Target == TARGET_GPU) {
>  #ifdef GPU_CODEGEN
> -    PM.add(
> -        polly::createPPCGCodeGenerationPass(GPUArchChoice,
> GPURuntimeChoice));
> +    PM.add(polly::createPPCGCodeGenerationPass());
>  #endif
>    } else {
>      switch (CodeGeneration) {
> 
> Modified: polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll
> URL:
> http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll?rev=302217&r1=302216&r2=302217&view=diff
> ==============================================================================
> --- polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll (original)
> +++ polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll Fri May  5
> 04:02:08 2017
> @@ -35,7 +35,7 @@
>  ; CHECK-NOT: polly_freeDeviceMemory
>  ; CHECK-NOT: polly_allocateMemoryForDevice
>  
> -; CHECK:       %13 = call i8* @polly_initContextCUDA()
> +; CHECK:       %13 = call i8* @polly_initContext()
>  ; CHECK-NEXT:  %14 = bitcast i32* %A to i8*
>  ; CHECK-NEXT:  %15 = getelementptr [2 x i8*], [2 x i8*]*
>  %polly_launch_0_params, i64 0, i64 0
>  ; CHECK-NEXT:  store i8* %14, i8** %polly_launch_0_param_0
> @@ -46,7 +46,7 @@
>  ; CHECK-NEXT:  store i8* %17, i8** %polly_launch_0_param_1
>  ; CHECK-NEXT:  %19 = bitcast i8** %polly_launch_0_param_1 to i8*
>  ; CHECK-NEXT:  store i8* %19, i8** %18
> -; CHECK-NEXT:  %20 = call i8* @polly_getKernel(i8* getelementptr
> inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8*
> getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32
> 0))
> +; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds
> ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr
> inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))
>  ; CHECK-NEXT:  call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32
>  32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
>  ; CHECK-NEXT:  call void @polly_freeKernel(i8* %20)
>  ; CHECK-NEXT:  call void @polly_synchronizeDevice()
> 
> Modified: polly/trunk/test/GPGPU/size-cast.ll
> URL:
> http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/size-cast.ll?rev=302217&r1=302216&r2=302217&view=diff
> ==============================================================================
> --- polly/trunk/test/GPGPU/size-cast.ll (original)
> +++ polly/trunk/test/GPGPU/size-cast.ll Fri May  5 04:02:08 2017
> @@ -29,7 +29,7 @@
>  ; CODE-NEXT:   if (arg >= 32 * b0 + t0 + 1048576 * c0 + 1)
>  ; CODE-NEXT:     Stmt_bb6(0, 32 * b0 + t0 + 1048576 * c0);
>  
> -; IR:        call i8* @polly_initContextCUDA()
> +; IR:        call i8* @polly_initContext()
>  ; IR-NEXT:   sext i32 %arg to i64
>  ; IR-NEXT:   mul i64
>  ; IR-NEXT:   @polly_allocateMemoryForDevice
> 
> Modified: polly/trunk/tools/CMakeLists.txt
> URL:
> http://llvm.org/viewvc/llvm-project/polly/trunk/tools/CMakeLists.txt?rev=302217&r1=302216&r2=302217&view=diff
> ==============================================================================
> --- polly/trunk/tools/CMakeLists.txt (original)
> +++ polly/trunk/tools/CMakeLists.txt Fri May  5 04:02:08 2017
> @@ -1,5 +1,5 @@
> -if (CUDALIB_FOUND OR OpenCL_FOUND)
> +if (CUDALIB_FOUND)
>    add_subdirectory(GPURuntime)
> -endif (CUDALIB_FOUND OR OpenCL_FOUND)
> +endif (CUDALIB_FOUND)
>  
>  set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} PARENT_SCOPE)
> 
> Modified: polly/trunk/tools/GPURuntime/GPUJIT.c
> URL:
> http://llvm.org/viewvc/llvm-project/polly/trunk/tools/GPURuntime/GPUJIT.c?rev=302217&r1=302216&r2=302217&view=diff
> ==============================================================================
> --- polly/trunk/tools/GPURuntime/GPUJIT.c (original)
> +++ polly/trunk/tools/GPURuntime/GPUJIT.c Fri May  5 04:02:08 2017
> @@ -12,20 +12,8 @@
>  /******************************************************************************/
>  
>  #include "GPUJIT.h"
> -
> -#ifdef HAS_LIBCUDART
>  #include <cuda.h>
>  #include <cuda_runtime.h>
> -#endif /* HAS_LIBCUDART */
> -
> -#ifdef HAS_LIBOPENCL
> -#ifdef __APPLE__
> -#include <OpenCL/opencl.h>
> -#else
> -#include <CL/cl.h>
> -#endif
> -#endif /* HAS_LIBOPENCL */
> -
>  #include <dlfcn.h>
>  #include <stdarg.h>
>  #include <stdio.h>
> @@ -34,8 +22,6 @@
>  static int DebugMode;
>  static int CacheMode;
>  
> -static PollyGPURuntime Runtime = RUNTIME_NONE;
> -
>  static void debug_print(const char *format, ...) {
>    if (!DebugMode)
>      return;
> @@ -47,853 +33,18 @@ static void debug_print(const char *form
>  }
>  #define dump_function() debug_print("-> %s\n", __func__)
>  
> -#define KERNEL_CACHE_SIZE 10
> -
> -static void err_runtime() {
> -  fprintf(stderr, "Runtime not correctly initialized.\n");
> -  exit(-1);
> -}
> -
> +/* Define Polly's GPGPU data types. */
>  struct PollyGPUContextT {
> -  void *Context;
> -};
> -
> -struct PollyGPUFunctionT {
> -  void *Kernel;
> -};
> -
> -struct PollyGPUDevicePtrT {
> -  void *DevicePtr;
> -};
> -
> -/******************************************************************************/
> -/*                                  OpenCL                              
>      */
> -/******************************************************************************/
> -#ifdef HAS_LIBOPENCL
> -
> -struct OpenCLContextT {
> -  cl_context Context;
> -  cl_command_queue CommandQueue;
> -};
> -
> -struct OpenCLKernelT {
> -  cl_kernel Kernel;
> -  cl_program Program;
> -  const char *BinaryString;
> -};
> -
> -struct OpenCLDevicePtrT {
> -  cl_mem MemObj;
> -};
> -
> -/* Dynamic library handles for the OpenCL runtime library. */
> -static void *HandleOpenCL;
> -
> -/* Type-defines of function pointer to OpenCL Runtime API. */
> -typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries,
> -                                     cl_platform_id *Platforms,
> -                                     cl_uint *NumPlatforms);
> -static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr;
> -
> -typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id Platform,
> -                                   cl_device_type DeviceType,
> -                                   cl_uint NumEntries, cl_device_id
> *Devices,
> -                                   cl_uint *NumDevices);
> -static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr;
> -
> -typedef cl_int clGetDeviceInfoFcnTy(cl_device_id Device,
> -                                    cl_device_info ParamName,
> -                                    size_t ParamValueSize, void
> *ParamValue,
> -                                    size_t *ParamValueSizeRet);
> -static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr;
> -
> -typedef cl_int clGetKernelInfoFcnTy(cl_kernel Kernel, cl_kernel_info
> ParamName,
> -                                    size_t ParamValueSize, void
> *ParamValue,
> -                                    size_t *ParamValueSizeRet);
> -static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr;
> -
> -typedef cl_context clCreateContextFcnTy(
> -    const cl_context_properties *Properties, cl_uint NumDevices,
> -    const cl_device_id *Devices,
> -    void CL_CALLBACK *pfn_notify(const char *Errinfo, const void
> *PrivateInfo,
> -                                 size_t CB, void *UserData),
> -    void *UserData, cl_int *ErrcodeRet);
> -static clCreateContextFcnTy *clCreateContextFcnPtr;
> -
> -typedef cl_command_queue
> -clCreateCommandQueueFcnTy(cl_context Context, cl_device_id Device,
> -                          cl_command_queue_properties Properties,
> -                          cl_int *ErrcodeRet);
> -static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr;
> -
> -typedef cl_mem clCreateBufferFcnTy(cl_context Context, cl_mem_flags
> Flags,
> -                                   size_t Size, void *HostPtr,
> -                                   cl_int *ErrcodeRet);
> -static clCreateBufferFcnTy *clCreateBufferFcnPtr;
> -
> -typedef cl_int
> -clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer,
> -                          cl_bool BlockingWrite, size_t Offset, size_t
> Size,
> -                          const void *Ptr, cl_uint NumEventsInWaitList,
> -                          const cl_event *EventWaitList, cl_event
> *Event);
> -static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr;
> -
> -typedef cl_program clCreateProgramWithBinaryFcnTy(
> -    cl_context Context, cl_uint NumDevices, const cl_device_id
> *DeviceList,
> -    const size_t *Lengths, const unsigned char **Binaries, cl_int
> *BinaryStatus,
> -    cl_int *ErrcodeRet);
> -static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr;
> -
> -typedef cl_int clBuildProgramFcnTy(
> -    cl_program Program, cl_uint NumDevices, const cl_device_id
> *DeviceList,
> -    const char *Options,
> -    void(CL_CALLBACK *pfn_notify)(cl_program Program, void *UserData),
> -    void *UserData);
> -static clBuildProgramFcnTy *clBuildProgramFcnPtr;
> -
> -typedef cl_kernel clCreateKernelFcnTy(cl_program Program,
> -                                      const char *KernelName,
> -                                      cl_int *ErrcodeRet);
> -static clCreateKernelFcnTy *clCreateKernelFcnPtr;
> -
> -typedef cl_int clSetKernelArgFcnTy(cl_kernel Kernel, cl_uint ArgIndex,
> -                                   size_t ArgSize, const void
> *ArgValue);
> -static clSetKernelArgFcnTy *clSetKernelArgFcnPtr;
> -
> -typedef cl_int clEnqueueNDRangeKernelFcnTy(
> -    cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint WorkDim,
> -    const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
> -    const size_t *LocalWorkSize, cl_uint NumEventsInWaitList,
> -    const cl_event *EventWaitList, cl_event *Event);
> -static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr;
> -
> -typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue,
> -                                        cl_mem Buffer, cl_bool
> BlockingRead,
> -                                        size_t Offset, size_t Size, void
> *Ptr,
> -                                        cl_uint NumEventsInWaitList,
> -                                        const cl_event *EventWaitList,
> -                                        cl_event *Event);
> -static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr;
> -
> -typedef cl_int clFlushFcnTy(cl_command_queue CommandQueue);
> -static clFlushFcnTy *clFlushFcnPtr;
> -
> -typedef cl_int clFinishFcnTy(cl_command_queue CommandQueue);
> -static clFinishFcnTy *clFinishFcnPtr;
> -
> -typedef cl_int clReleaseKernelFcnTy(cl_kernel Kernel);
> -static clReleaseKernelFcnTy *clReleaseKernelFcnPtr;
> -
> -typedef cl_int clReleaseProgramFcnTy(cl_program Program);
> -static clReleaseProgramFcnTy *clReleaseProgramFcnPtr;
> -
> -typedef cl_int clReleaseMemObjectFcnTy(cl_mem Memobject);
> -static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr;
> -
> -typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue
> CommandQueue);
> -static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr;
> -
> -typedef cl_int clReleaseContextFcnTy(cl_context Context);
> -static clReleaseContextFcnTy *clReleaseContextFcnPtr;
> -
> -static void *getAPIHandleCL(void *Handle, const char *FuncName) {
> -  char *Err;
> -  void *FuncPtr;
> -  dlerror();
> -  FuncPtr = dlsym(Handle, FuncName);
> -  if ((Err = dlerror()) != 0) {
> -    fprintf(stderr, "Load OpenCL Runtime API failed: %s. \n", Err);
> -    return 0;
> -  }
> -  return FuncPtr;
> -}
> -
> -static int initialDeviceAPILibrariesCL() {
> -  HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY);
> -  if (!HandleOpenCL) {
> -    fprintf(stderr, "Cannot open library: %s. \n", dlerror());
> -    return 0;
> -  }
> -  return 1;
> -}
> -
> -static int initialDeviceAPIsCL() {
> -  if (initialDeviceAPILibrariesCL() == 0)
> -    return 0;
> -
> -  /* Get function pointer to OpenCL Runtime API.
> -   *
> -   * Note that compilers conforming to the ISO C standard are required
> to
> -   * generate a warning if a conversion from a void * pointer to a
> function
> -   * pointer is attempted as in the following statements. The warning
> -   * of this kind of cast may not be emitted by clang and new versions
> of gcc
> -   * as it is valid on POSIX 2008.
> -   */
> -  clGetPlatformIDsFcnPtr =
> -      (clGetPlatformIDsFcnTy *)getAPIHandleCL(HandleOpenCL,
> "clGetPlatformIDs");
> -
> -  clGetDeviceIDsFcnPtr =
> -      (clGetDeviceIDsFcnTy *)getAPIHandleCL(HandleOpenCL,
> "clGetDeviceIDs");
> -
> -  clGetDeviceInfoFcnPtr =
> -      (clGetDeviceInfoFcnTy *)getAPIHandleCL(HandleOpenCL,
> "clGetDeviceInfo");
> -
> -  clGetKernelInfoFcnPtr =
> -      (clGetKernelInfoFcnTy *)getAPIHandleCL(HandleOpenCL,
> "clGetKernelInfo");
> -
> -  clCreateContextFcnPtr =
> -      (clCreateContextFcnTy *)getAPIHandleCL(HandleOpenCL,
> "clCreateContext");
> -
> -  clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy
> *)getAPIHandleCL(
> -      HandleOpenCL, "clCreateCommandQueue");
> -
> -  clCreateBufferFcnPtr =
> -      (clCreateBufferFcnTy *)getAPIHandleCL(HandleOpenCL,
> "clCreateBuffer");
> -
> -  clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy
> *)getAPIHandleCL(
> -      HandleOpenCL, "clEnqueueWriteBuffer");
> -
> -  clCreateProgramWithBinaryFcnPtr =
> -      (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL(
> -          HandleOpenCL, "clCreateProgramWithBinary");
> -
> -  clBuildProgramFcnPtr =
> -      (clBuildProgramFcnTy *)getAPIHandleCL(HandleOpenCL,
> "clBuildProgram");
> -
> -  clCreateKernelFcnPtr =
> -      (clCreateKernelFcnTy *)getAPIHandleCL(HandleOpenCL,
> "clCreateKernel");
> -
> -  clSetKernelArgFcnPtr =
> -      (clSetKernelArgFcnTy *)getAPIHandleCL(HandleOpenCL,
> "clSetKernelArg");
> -
> -  clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy
> *)getAPIHandleCL(
> -      HandleOpenCL, "clEnqueueNDRangeKernel");
> -
> -  clEnqueueReadBufferFcnPtr = (clEnqueueReadBufferFcnTy
> *)getAPIHandleCL(
> -      HandleOpenCL, "clEnqueueReadBuffer");
> -
> -  clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(HandleOpenCL,
> "clFlush");
> -
> -  clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(HandleOpenCL,
> "clFinish");
> -
> -  clReleaseKernelFcnPtr =
> -      (clReleaseKernelFcnTy *)getAPIHandleCL(HandleOpenCL,
> "clReleaseKernel");
> -
> -  clReleaseProgramFcnPtr =
> -      (clReleaseProgramFcnTy *)getAPIHandleCL(HandleOpenCL,
> "clReleaseProgram");
> -
> -  clReleaseMemObjectFcnPtr = (clReleaseMemObjectFcnTy *)getAPIHandleCL(
> -      HandleOpenCL, "clReleaseMemObject");
> -
> -  clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy
> *)getAPIHandleCL(
> -      HandleOpenCL, "clReleaseCommandQueue");
> -
> -  clReleaseContextFcnPtr =
> -      (clReleaseContextFcnTy *)getAPIHandleCL(HandleOpenCL,
> "clReleaseContext");
> -
> -  return 1;
> -}
> -
> -/* Context and Device. */
> -static PollyGPUContext *GlobalContext = NULL;
> -static cl_device_id GlobalDeviceID = NULL;
> -
> -/* Fd-Decl: Print out OpenCL Error codes to human readable strings. */
> -static void printOpenCLError(int Error);
> -
> -static void checkOpenCLError(int Ret, const char *format, ...) {
> -  if (Ret == CL_SUCCESS)
> -    return;
> -
> -  printOpenCLError(Ret);
> -  va_list args;
> -  va_start(args, format);
> -  vfprintf(stderr, format, args);
> -  va_end(args);
> -  exit(-1);
> -}
> -
> -static PollyGPUContext *initContextCL() {
> -  dump_function();
> -
> -  PollyGPUContext *Context;
> -
> -  cl_platform_id PlatformID = NULL;
> -  cl_device_id DeviceID = NULL;
> -  cl_uint NumDevicesRet;
> -  cl_int Ret;
> -
> -  char DeviceRevision[256];
> -  char DeviceName[256];
> -  size_t DeviceRevisionRetSize, DeviceNameRetSize;
> -
> -  static __thread PollyGPUContext *CurrentContext = NULL;
> -
> -  if (CurrentContext)
> -    return CurrentContext;
> -
> -  /* Get API handles. */
> -  if (initialDeviceAPIsCL() == 0) {
> -    fprintf(stderr, "Getting the \"handle\" for the OpenCL Runtime
> failed.\n");
> -    exit(-1);
> -  }
> -
> -  /* Get number of devices that support OpenCL. */
> -  static const int NumberOfPlatforms = 1;
> -  Ret = clGetPlatformIDsFcnPtr(NumberOfPlatforms, &PlatformID, NULL);
> -  checkOpenCLError(Ret, "Failed to get platform IDs.\n");
> -  // TODO: Extend to CL_DEVICE_TYPE_ALL?
> -  static const int NumberOfDevices = 1;
> -  Ret = clGetDeviceIDsFcnPtr(PlatformID, CL_DEVICE_TYPE_GPU,
> NumberOfDevices,
> -                             &DeviceID, &NumDevicesRet);
> -  checkOpenCLError(Ret, "Failed to get device IDs.\n");
> -
> -  GlobalDeviceID = DeviceID;
> -  if (NumDevicesRet == 0) {
> -    fprintf(stderr, "There is no device supporting OpenCL.\n");
> -    exit(-1);
> -  }
> -
> -  /* Get device revision. */
> -  Ret =
> -      clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_VERSION,
> sizeof(DeviceRevision),
> -                            DeviceRevision, &DeviceRevisionRetSize);
> -  checkOpenCLError(Ret, "Failed to fetch device revision.\n");
> -
> -  /* Get device name. */
> -  Ret = clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_NAME,
> sizeof(DeviceName),
> -                              DeviceName, &DeviceNameRetSize);
> -  checkOpenCLError(Ret, "Failed to fetch device name.\n");
> -
> -  debug_print("> Running on GPU device %d : %s.\n", DeviceID,
> DeviceName);
> -
> -  /* Create context on the device. */
> -  Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
> -  if (Context == 0) {
> -    fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
> -    exit(-1);
> -  }
> -  Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext));
> -  if (Context->Context == 0) {
> -    fprintf(stderr, "Allocate memory for Polly OpenCL context
> failed.\n");
> -    exit(-1);
> -  }
> -  ((OpenCLContext *)Context->Context)->Context =
> -      clCreateContextFcnPtr(NULL, NumDevicesRet, &DeviceID, NULL, NULL,
> &Ret);
> -  checkOpenCLError(Ret, "Failed to create context.\n");
> -
> -  static const int ExtraProperties = 0;
> -  ((OpenCLContext *)Context->Context)->CommandQueue =
> -      clCreateCommandQueueFcnPtr(((OpenCLContext
> *)Context->Context)->Context,
> -                                 DeviceID, ExtraProperties, &Ret);
> -  checkOpenCLError(Ret, "Failed to create command queue.\n");
> -
> -  if (CacheMode)
> -    CurrentContext = Context;
> -
> -  GlobalContext = Context;
> -  return Context;
> -}
> -
> -static void freeKernelCL(PollyGPUFunction *Kernel) {
> -  dump_function();
> -
> -  if (CacheMode)
> -    return;
> -
> -  if (!GlobalContext) {
> -    fprintf(stderr, "GPGPU-code generation not correctly
> initialized.\n");
> -    exit(-1);
> -  }
> -
> -  cl_int Ret;
> -  Ret = clFlushFcnPtr(((OpenCLContext
> *)GlobalContext->Context)->CommandQueue);
> -  checkOpenCLError(Ret, "Failed to flush command queue.\n");
> -  Ret = clFinishFcnPtr(((OpenCLContext
> *)GlobalContext->Context)->CommandQueue);
> -  checkOpenCLError(Ret, "Failed to finish command queue.\n");
> -
> -  if (((OpenCLKernel *)Kernel->Kernel)->Kernel) {
> -    cl_int Ret =
> -        clReleaseKernelFcnPtr(((OpenCLKernel *)Kernel->Kernel)->Kernel);
> -    checkOpenCLError(Ret, "Failed to release kernel.\n");
> -  }
> -
> -  if (((OpenCLKernel *)Kernel->Kernel)->Program) {
> -    cl_int Ret =
> -        clReleaseProgramFcnPtr(((OpenCLKernel
> *)Kernel->Kernel)->Program);
> -    checkOpenCLError(Ret, "Failed to release program.\n");
> -  }
> -
> -  if (Kernel->Kernel)
> -    free((OpenCLKernel *)Kernel->Kernel);
> -
> -  if (Kernel)
> -    free(Kernel);
> -}
> -
> -static PollyGPUFunction *getKernelCL(const char *BinaryBuffer,
> -                                     const char *KernelName) {
> -  dump_function();
> -
> -  if (!GlobalContext) {
> -    fprintf(stderr, "GPGPU-code generation not correctly
> initialized.\n");
> -    exit(-1);
> -  }
> -
> -  static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
> -  static __thread int NextCacheItem = 0;
> -
> -  for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
> -    // We exploit here the property that all Polly-ACC kernels are
> allocated
> -    // as global constants, hence a pointer comparision is sufficient to
> -    // determin equality.
> -    if (KernelCache[i] &&
> -        ((OpenCLKernel *)KernelCache[i]->Kernel)->BinaryString ==
> -            BinaryBuffer) {
> -      debug_print("  -> using cached kernel\n");
> -      return KernelCache[i];
> -    }
> -  }
> -
> -  PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
> -  if (Function == 0) {
> -    fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
> -    exit(-1);
> -  }
> -  Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel));
> -  if (Function->Kernel == 0) {
> -    fprintf(stderr, "Allocate memory for Polly OpenCL kernel
> failed.\n");
> -    exit(-1);
> -  }
> -
> -  if (!GlobalDeviceID) {
> -    fprintf(stderr, "GPGPU-code generation not initialized
> correctly.\n");
> -    exit(-1);
> -  }
> -
> -  cl_int Ret;
> -  size_t BinarySize = strlen(BinaryBuffer);
> -  ((OpenCLKernel *)Function->Kernel)->Program =
> clCreateProgramWithBinaryFcnPtr(
> -      ((OpenCLContext *)GlobalContext->Context)->Context, 1,
> &GlobalDeviceID,
> -      (const size_t *)&BinarySize, (const unsigned char
> **)&BinaryBuffer, NULL,
> -      &Ret);
> -  checkOpenCLError(Ret, "Failed to create program from binary.\n");
> -
> -  Ret = clBuildProgramFcnPtr(((OpenCLKernel
> *)Function->Kernel)->Program, 1,
> -                             &GlobalDeviceID, NULL, NULL, NULL);
> -  checkOpenCLError(Ret, "Failed to build program.\n");
> -
> -  ((OpenCLKernel *)Function->Kernel)->Kernel = clCreateKernelFcnPtr(
> -      ((OpenCLKernel *)Function->Kernel)->Program, KernelName, &Ret);
> -  checkOpenCLError(Ret, "Failed to create kernel.\n");
> -
> -  ((OpenCLKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
> -
> -  if (CacheMode) {
> -    if (KernelCache[NextCacheItem])
> -      freeKernelCL(KernelCache[NextCacheItem]);
> -
> -    KernelCache[NextCacheItem] = Function;
> -
> -    NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
> -  }
> -
> -  return Function;
> -}
> -
> -static void copyFromHostToDeviceCL(void *HostData, PollyGPUDevicePtr
> *DevData,
> -                                   long MemSize) {
> -  dump_function();
> -
> -  if (!GlobalContext) {
> -    fprintf(stderr, "GPGPU-code generation not correctly
> initialized.\n");
> -    exit(-1);
> -  }
> -
> -  cl_int Ret;
> -  Ret = clEnqueueWriteBufferFcnPtr(
> -      ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
> -      ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0,
> MemSize,
> -      HostData, 0, NULL, NULL);
> -  checkOpenCLError(Ret, "Copying data from host memory to device
> failed.\n");
> -}
> -
> -static void copyFromDeviceToHostCL(PollyGPUDevicePtr *DevData, void
> *HostData,
> -                                   long MemSize) {
> -  dump_function();
> -
> -  if (!GlobalContext) {
> -    fprintf(stderr, "GPGPU-code generation not correctly
> initialized.\n");
> -    exit(-1);
> -  }
> -
> -  cl_int Ret;
> -  Ret = clEnqueueReadBufferFcnPtr(
> -      ((OpenCLContext *)GlobalContext->Context)->CommandQueue,
> -      ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj, CL_TRUE, 0,
> MemSize,
> -      HostData, 0, NULL, NULL);
> -  checkOpenCLError(Ret, "Copying results from device to host memory
> failed.\n");
> -}
> -
> -static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int
> GridDimX,
> -                           unsigned int GridDimY, unsigned int
> BlockDimX,
> -                           unsigned int BlockDimY, unsigned int
> BlockDimZ,
> -                           void **Parameters) {
> -  dump_function();
> -
> -  cl_int Ret;
> -  cl_uint NumArgs;
> -
> -  if (!GlobalContext) {
> -    fprintf(stderr, "GPGPU-code generation not correctly
> initialized.\n");
> -    exit(-1);
> -  }
> -
> -  OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel;
> -  Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS,
> -                              sizeof(cl_uint), &NumArgs, NULL);
> -  checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n");
> -
> -  // TODO: Pass the size of the kernel arguments in to launchKernelCL,
> along
> -  // with the arguments themselves. This is a dirty workaround that can
> be
> -  // broken.
> -  for (cl_uint i = 0; i < NumArgs; i++) {
> -    Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 8, (void
> *)Parameters[i]);
> -    if (Ret == CL_INVALID_ARG_SIZE) {
> -      Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 4, (void
> *)Parameters[i]);
> -      if (Ret == CL_INVALID_ARG_SIZE) {
> -        Ret =
> -            clSetKernelArgFcnPtr(CLKernel->Kernel, i, 2, (void
> *)Parameters[i]);
> -        if (Ret == CL_INVALID_ARG_SIZE) {
> -          Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 1,
> -                                     (void *)Parameters[i]);
> -          checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n",
> i);
> -        }
> -      }
> -    }
> -    if (Ret != CL_SUCCESS && Ret != CL_INVALID_ARG_SIZE) {
> -      fprintf(stderr, "Failed to set Kernel argument.\n");
> -      printOpenCLError(Ret);
> -      exit(-1);
> -    }
> -  }
> -
> -  unsigned int GridDimZ = 1;
> -  size_t GlobalWorkSize[3] = {BlockDimX * GridDimX, BlockDimY *
> GridDimY,
> -                              BlockDimZ * GridDimZ};
> -  size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ};
> -
> -  static const int WorkDim = 3;
> -  OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context;
> -  Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue,
> CLKernel->Kernel,
> -                                     WorkDim, NULL, GlobalWorkSize,
> -                                     LocalWorkSize, 0, NULL, NULL);
> -  checkOpenCLError(Ret, "Launching OpenCL kernel failed.\n");
> -}
> -
> -static void freeDeviceMemoryCL(PollyGPUDevicePtr *Allocation) {
> -  dump_function();
> -
> -  OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
> -  cl_int Ret = clReleaseMemObjectFcnPtr((cl_mem)DevPtr->MemObj);
> -  checkOpenCLError(Ret, "Failed to free device memory.\n");
> -
> -  free(DevPtr);
> -  free(Allocation);
> -}
> -
> -static PollyGPUDevicePtr *allocateMemoryForDeviceCL(long MemSize) {
> -  dump_function();
> -
> -  if (!GlobalContext) {
> -    fprintf(stderr, "GPGPU-code generation not correctly
> initialized.\n");
> -    exit(-1);
> -  }
> -
> -  PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
> -  if (DevData == 0) {
> -    fprintf(stderr, "Allocate memory for GPU device memory pointer
> failed.\n");
> -    exit(-1);
> -  }
> -  DevData->DevicePtr = (OpenCLDevicePtr
> *)malloc(sizeof(OpenCLDevicePtr));
> -  if (DevData->DevicePtr == 0) {
> -    fprintf(stderr, "Allocate memory for GPU device memory pointer
> failed.\n");
> -    exit(-1);
> -  }
> -
> -  cl_int Ret;
> -  ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj =
> -      clCreateBufferFcnPtr(((OpenCLContext
> *)GlobalContext->Context)->Context,
> -                           CL_MEM_READ_WRITE, MemSize, NULL, &Ret);
> -  checkOpenCLError(Ret,
> -                   "Allocate memory for GPU device memory pointer
> failed.\n");
> -
> -  return DevData;
> -}
> -
> -static void *getDevicePtrCL(PollyGPUDevicePtr *Allocation) {
> -  dump_function();
> -
> -  OpenCLDevicePtr *DevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;
> -  return (void *)DevPtr->MemObj;
> -}
> -
> -static void synchronizeDeviceCL() {
> -  dump_function();
> -
> -  if (!GlobalContext) {
> -    fprintf(stderr, "GPGPU-code generation not correctly
> initialized.\n");
> -    exit(-1);
> -  }
> -
> -  if (clFinishFcnPtr(((OpenCLContext
> *)GlobalContext->Context)->CommandQueue) !=
> -      CL_SUCCESS) {
> -    fprintf(stderr, "Synchronizing device and host memory failed.\n");
> -    exit(-1);
> -  }
> -}
> -
> -static void freeContextCL(PollyGPUContext *Context) {
> -  dump_function();
> -
> -  cl_int Ret;
> -
> -  GlobalContext = NULL;
> -
> -  OpenCLContext *Ctx = (OpenCLContext *)Context->Context;
> -  if (Ctx->CommandQueue) {
> -    Ret = clReleaseCommandQueueFcnPtr(Ctx->CommandQueue);
> -    checkOpenCLError(Ret, "Could not release command queue.\n");
> -  }
> -
> -  if (Ctx->Context) {
> -    Ret = clReleaseContextFcnPtr(Ctx->Context);
> -    checkOpenCLError(Ret, "Could not release context.\n");
> -  }
> -
> -  free(Ctx);
> -  free(Context);
> -}
> -
> -static void printOpenCLError(int Error) {
> -
> -  switch (Error) {
> -  case CL_SUCCESS:
> -    // Success, don't print an error.
> -    break;
> -
> -  // JIT/Runtime errors.
> -  case CL_DEVICE_NOT_FOUND:
> -    fprintf(stderr, "Device not found.\n");
> -    break;
> -  case CL_DEVICE_NOT_AVAILABLE:
> -    fprintf(stderr, "Device not available.\n");
> -    break;
> -  case CL_COMPILER_NOT_AVAILABLE:
> -    fprintf(stderr, "Compiler not available.\n");
> -    break;
> -  case CL_MEM_OBJECT_ALLOCATION_FAILURE:
> -    fprintf(stderr, "Mem object allocation failure.\n");
> -    break;
> -  case CL_OUT_OF_RESOURCES:
> -    fprintf(stderr, "Out of resources.\n");
> -    break;
> -  case CL_OUT_OF_HOST_MEMORY:
> -    fprintf(stderr, "Out of host memory.\n");
> -    break;
> -  case CL_PROFILING_INFO_NOT_AVAILABLE:
> -    fprintf(stderr, "Profiling info not available.\n");
> -    break;
> -  case CL_MEM_COPY_OVERLAP:
> -    fprintf(stderr, "Mem copy overlap.\n");
> -    break;
> -  case CL_IMAGE_FORMAT_MISMATCH:
> -    fprintf(stderr, "Image format mismatch.\n");
> -    break;
> -  case CL_IMAGE_FORMAT_NOT_SUPPORTED:
> -    fprintf(stderr, "Image format not supported.\n");
> -    break;
> -  case CL_BUILD_PROGRAM_FAILURE:
> -    fprintf(stderr, "Build program failure.\n");
> -    break;
> -  case CL_MAP_FAILURE:
> -    fprintf(stderr, "Map failure.\n");
> -    break;
> -  case CL_MISALIGNED_SUB_BUFFER_OFFSET:
> -    fprintf(stderr, "Misaligned sub buffer offset.\n");
> -    break;
> -  case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
> -    fprintf(stderr, "Exec status error for events in wait list.\n");
> -    break;
> -  case CL_COMPILE_PROGRAM_FAILURE:
> -    fprintf(stderr, "Compile program failure.\n");
> -    break;
> -  case CL_LINKER_NOT_AVAILABLE:
> -    fprintf(stderr, "Linker not available.\n");
> -    break;
> -  case CL_LINK_PROGRAM_FAILURE:
> -    fprintf(stderr, "Link program failure.\n");
> -    break;
> -  case CL_DEVICE_PARTITION_FAILED:
> -    fprintf(stderr, "Device partition failed.\n");
> -    break;
> -  case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
> -    fprintf(stderr, "Kernel arg info not available.\n");
> -    break;
> -
> -  // Compiler errors.
> -  case CL_INVALID_VALUE:
> -    fprintf(stderr, "Invalid value.\n");
> -    break;
> -  case CL_INVALID_DEVICE_TYPE:
> -    fprintf(stderr, "Invalid device type.\n");
> -    break;
> -  case CL_INVALID_PLATFORM:
> -    fprintf(stderr, "Invalid platform.\n");
> -    break;
> -  case CL_INVALID_DEVICE:
> -    fprintf(stderr, "Invalid device.\n");
> -    break;
> -  case CL_INVALID_CONTEXT:
> -    fprintf(stderr, "Invalid context.\n");
> -    break;
> -  case CL_INVALID_QUEUE_PROPERTIES:
> -    fprintf(stderr, "Invalid queue properties.\n");
> -    break;
> -  case CL_INVALID_COMMAND_QUEUE:
> -    fprintf(stderr, "Invalid command queue.\n");
> -    break;
> -  case CL_INVALID_HOST_PTR:
> -    fprintf(stderr, "Invalid host pointer.\n");
> -    break;
> -  case CL_INVALID_MEM_OBJECT:
> -    fprintf(stderr, "Invalid memory object.\n");
> -    break;
> -  case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
> -    fprintf(stderr, "Invalid image format descriptor.\n");
> -    break;
> -  case CL_INVALID_IMAGE_SIZE:
> -    fprintf(stderr, "Invalid image size.\n");
> -    break;
> -  case CL_INVALID_SAMPLER:
> -    fprintf(stderr, "Invalid sampler.\n");
> -    break;
> -  case CL_INVALID_BINARY:
> -    fprintf(stderr, "Invalid binary.\n");
> -    break;
> -  case CL_INVALID_BUILD_OPTIONS:
> -    fprintf(stderr, "Invalid build options.\n");
> -    break;
> -  case CL_INVALID_PROGRAM:
> -    fprintf(stderr, "Invalid program.\n");
> -    break;
> -  case CL_INVALID_PROGRAM_EXECUTABLE:
> -    fprintf(stderr, "Invalid program executable.\n");
> -    break;
> -  case CL_INVALID_KERNEL_NAME:
> -    fprintf(stderr, "Invalid kernel name.\n");
> -    break;
> -  case CL_INVALID_KERNEL_DEFINITION:
> -    fprintf(stderr, "Invalid kernel definition.\n");
> -    break;
> -  case CL_INVALID_KERNEL:
> -    fprintf(stderr, "Invalid kernel.\n");
> -    break;
> -  case CL_INVALID_ARG_INDEX:
> -    fprintf(stderr, "Invalid arg index.\n");
> -    break;
> -  case CL_INVALID_ARG_VALUE:
> -    fprintf(stderr, "Invalid arg value.\n");
> -    break;
> -  case CL_INVALID_ARG_SIZE:
> -    fprintf(stderr, "Invalid arg size.\n");
> -    break;
> -  case CL_INVALID_KERNEL_ARGS:
> -    fprintf(stderr, "Invalid kernel args.\n");
> -    break;
> -  case CL_INVALID_WORK_DIMENSION:
> -    fprintf(stderr, "Invalid work dimension.\n");
> -    break;
> -  case CL_INVALID_WORK_GROUP_SIZE:
> -    fprintf(stderr, "Invalid work group size.\n");
> -    break;
> -  case CL_INVALID_WORK_ITEM_SIZE:
> -    fprintf(stderr, "Invalid work item size.\n");
> -    break;
> -  case CL_INVALID_GLOBAL_OFFSET:
> -    fprintf(stderr, "Invalid global offset.\n");
> -    break;
> -  case CL_INVALID_EVENT_WAIT_LIST:
> -    fprintf(stderr, "Invalid event wait list.\n");
> -    break;
> -  case CL_INVALID_EVENT:
> -    fprintf(stderr, "Invalid event.\n");
> -    break;
> -  case CL_INVALID_OPERATION:
> -    fprintf(stderr, "Invalid operation.\n");
> -    break;
> -  case CL_INVALID_GL_OBJECT:
> -    fprintf(stderr, "Invalid GL object.\n");
> -    break;
> -  case CL_INVALID_BUFFER_SIZE:
> -    fprintf(stderr, "Invalid buffer size.\n");
> -    break;
> -  case CL_INVALID_MIP_LEVEL:
> -    fprintf(stderr, "Invalid mip level.\n");
> -    break;
> -  case CL_INVALID_GLOBAL_WORK_SIZE:
> -    fprintf(stderr, "Invalid global work size.\n");
> -    break;
> -  case CL_INVALID_PROPERTY:
> -    fprintf(stderr, "Invalid property.\n");
> -    break;
> -  case CL_INVALID_IMAGE_DESCRIPTOR:
> -    fprintf(stderr, "Invalid image descriptor.\n");
> -    break;
> -  case CL_INVALID_COMPILER_OPTIONS:
> -    fprintf(stderr, "Invalid compiler options.\n");
> -    break;
> -  case CL_INVALID_LINKER_OPTIONS:
> -    fprintf(stderr, "Invalid linker options.\n");
> -    break;
> -  case CL_INVALID_DEVICE_PARTITION_COUNT:
> -    fprintf(stderr, "Invalid device partition count.\n");
> -    break;
> -  case CL_INVALID_PIPE_SIZE:
> -    fprintf(stderr, "Invalid pipe size.\n");
> -    break;
> -  case CL_INVALID_DEVICE_QUEUE:
> -    fprintf(stderr, "Invalid device queue.\n");
> -    break;
> -
> -  // NVIDIA specific error.
> -  case -9999:
> -    fprintf(stderr, "NVIDIA invalid read or write buffer.\n");
> -    break;
> -
> -  default:
> -    fprintf(stderr, "Unknown error code!\n");
> -    break;
> -  }
> -}
> -
> -#endif /* HAS_LIBOPENCL */
> -/******************************************************************************/
> -/*                                   CUDA                               
>      */
> -/******************************************************************************/
> -#ifdef HAS_LIBCUDART
> -
> -struct CUDAContextT {
>    CUcontext Cuda;
>  };
>  
> -struct CUDAKernelT {
> +struct PollyGPUFunctionT {
>    CUfunction Cuda;
>    CUmodule CudaModule;
> -  const char *BinaryString;
> +  const char *PTXString;
>  };
>  
> -struct CUDADevicePtrT {
> +struct PollyGPUDevicePtrT {
>    CUdeviceptr Cuda;
>  };
>  
> @@ -906,10 +57,10 @@ typedef CUresult CUDAAPI CuMemAllocFcnTy
>  static CuMemAllocFcnTy *CuMemAllocFcnPtr;
>  
>  typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
> -    CUfunction F, unsigned int GridDimX, unsigned int GridDimY,
> -    unsigned int gridDimZ, unsigned int blockDimX, unsigned int
> BlockDimY,
> -    unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream
> HStream,
> -    void **KernelParams, void **Extra);
> +    CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
> +    unsigned int gridDimZ, unsigned int blockDimX, unsigned int
> blockDimY,
> +    unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream
> hStream,
> +    void **kernelParams, void **extra);
>  static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr;
>  
>  typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t);
> @@ -944,8 +95,8 @@ typedef CUresult CUDAAPI CuModuleLoadDat
>                                                   void **);
>  static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;
>  
> -typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *Module,
> -                                               const void *Image);
> +typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *module,
> +                                               const void *image);
>  static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;
>  
>  typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *,
>  CUmodule,
> @@ -958,25 +109,25 @@ static CuDeviceComputeCapabilityFcnTy *C
>  typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
>  static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;
>  
> -typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState State,
> -                                            CUjitInputType Type, void
> *Data,
> -                                            size_t Size, const char
> *Name,
> -                                            unsigned int NumOptions,
> -                                            CUjit_option *Options,
> -                                            void **OptionValues);
> +typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState state,
> +                                            CUjitInputType type, void
> *data,
> +                                            size_t size, const char
> *name,
> +                                            unsigned int numOptions,
> +                                            CUjit_option *options,
> +                                            void **optionValues);
>  static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;
>  
> -typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int NumOptions,
> -                                           CUjit_option *Options,
> -                                           void **OptionValues,
> -                                           CUlinkState *StateOut);
> +typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int numOptions,
> +                                           CUjit_option *options,
> +                                           void **optionValues,
> +                                           CUlinkState *stateOut);
>  static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;
>  
> -typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState State, void
> **CubinOut,
> -                                             size_t *SizeOut);
> +typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState state, void
> **cubinOut,
> +                                             size_t *sizeOut);
>  static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;
>  
> -typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState State);
> +typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state);
>  static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
>  
>  typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy();
> @@ -986,36 +137,36 @@ static CuCtxSynchronizeFcnTy *CuCtxSynch
>  typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
>  static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
>  
> -static void *getAPIHandleCUDA(void *Handle, const char *FuncName) {
> +static void *getAPIHandle(void *Handle, const char *FuncName) {
>    char *Err;
>    void *FuncPtr;
>    dlerror();
>    FuncPtr = dlsym(Handle, FuncName);
>    if ((Err = dlerror()) != 0) {
> -    fprintf(stderr, "Load CUDA driver API failed: %s. \n", Err);
> +    fprintf(stdout, "Load CUDA driver API failed: %s. \n", Err);
>      return 0;
>    }
>    return FuncPtr;
>  }
>  
> -static int initialDeviceAPILibrariesCUDA() {
> +static int initialDeviceAPILibraries() {
>    HandleCuda = dlopen("libcuda.so", RTLD_LAZY);
>    if (!HandleCuda) {
> -    fprintf(stderr, "Cannot open library: %s. \n", dlerror());
> +    printf("Cannot open library: %s. \n", dlerror());
>      return 0;
>    }
>  
>    HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY);
>    if (!HandleCudaRT) {
> -    fprintf(stderr, "Cannot open library: %s. \n", dlerror());
> +    printf("Cannot open library: %s. \n", dlerror());
>      return 0;
>    }
>  
>    return 1;
>  }
>  
> -static int initialDeviceAPIsCUDA() {
> -  if (initialDeviceAPILibrariesCUDA() == 0)
> +static int initialDeviceAPIs() {
> +  if (initialDeviceAPILibraries() == 0)
>      return 0;
>  
>    /* Get function pointer to CUDA Driver APIs.
> @@ -1027,76 +178,77 @@ static int initialDeviceAPIsCUDA() {
>     * as it is valid on POSIX 2008.
>     */
>    CuLaunchKernelFcnPtr =
> -      (CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda,
> "cuLaunchKernel");
> +      (CuLaunchKernelFcnTy *)getAPIHandle(HandleCuda, "cuLaunchKernel");
>  
>    CuMemAllocFcnPtr =
> -      (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2");
> +      (CuMemAllocFcnTy *)getAPIHandle(HandleCuda, "cuMemAlloc_v2");
>  
> -  CuMemFreeFcnPtr =
> -      (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2");
> +  CuMemFreeFcnPtr = (CuMemFreeFcnTy *)getAPIHandle(HandleCuda,
> "cuMemFree_v2");
>  
>    CuMemcpyDtoHFcnPtr =
> -      (CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda,
> "cuMemcpyDtoH_v2");
> +      (CuMemcpyDtoHFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyDtoH_v2");
>  
>    CuMemcpyHtoDFcnPtr =
> -      (CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda,
> "cuMemcpyHtoD_v2");
> +      (CuMemcpyHtoDFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyHtoD_v2");
>  
>    CuModuleUnloadFcnPtr =
> -      (CuModuleUnloadFcnTy *)getAPIHandleCUDA(HandleCuda,
> "cuModuleUnload");
> +      (CuModuleUnloadFcnTy *)getAPIHandle(HandleCuda, "cuModuleUnload");
>  
>    CuCtxDestroyFcnPtr =
> -      (CuCtxDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxDestroy");
> +      (CuCtxDestroyFcnTy *)getAPIHandle(HandleCuda, "cuCtxDestroy");
>  
> -  CuInitFcnPtr = (CuInitFcnTy *)getAPIHandleCUDA(HandleCuda, "cuInit");
> +  CuInitFcnPtr = (CuInitFcnTy *)getAPIHandle(HandleCuda, "cuInit");
>  
>    CuDeviceGetCountFcnPtr =
> -      (CuDeviceGetCountFcnTy *)getAPIHandleCUDA(HandleCuda,
> "cuDeviceGetCount");
> +      (CuDeviceGetCountFcnTy *)getAPIHandle(HandleCuda,
> "cuDeviceGetCount");
>  
>    CuDeviceGetFcnPtr =
> -      (CuDeviceGetFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGet");
> +      (CuDeviceGetFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGet");
>  
>    CuCtxCreateFcnPtr =
> -      (CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda,
> "cuCtxCreate_v2");
> +      (CuCtxCreateFcnTy *)getAPIHandle(HandleCuda, "cuCtxCreate_v2");
>  
> -  CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy
> *)getAPIHandleCUDA(
> -      HandleCuda, "cuModuleLoadDataEx");
> +  CuModuleLoadDataExFcnPtr =
> +      (CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda,
> "cuModuleLoadDataEx");
>  
>    CuModuleLoadDataFcnPtr =
> -      (CuModuleLoadDataFcnTy *)getAPIHandleCUDA(HandleCuda,
> "cuModuleLoadData");
> +      (CuModuleLoadDataFcnTy *)getAPIHandle(HandleCuda,
> "cuModuleLoadData");
>  
> -  CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy
> *)getAPIHandleCUDA(
> +  CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle(
>        HandleCuda, "cuModuleGetFunction");
>  
>    CuDeviceComputeCapabilityFcnPtr =
> -      (CuDeviceComputeCapabilityFcnTy *)getAPIHandleCUDA(
> +      (CuDeviceComputeCapabilityFcnTy *)getAPIHandle(
>            HandleCuda, "cuDeviceComputeCapability");
>  
>    CuDeviceGetNameFcnPtr =
> -      (CuDeviceGetNameFcnTy *)getAPIHandleCUDA(HandleCuda,
> "cuDeviceGetName");
> +      (CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda,
> "cuDeviceGetName");
>  
>    CuLinkAddDataFcnPtr =
> -      (CuLinkAddDataFcnTy *)getAPIHandleCUDA(HandleCuda,
> "cuLinkAddData");
> +      (CuLinkAddDataFcnTy *)getAPIHandle(HandleCuda, "cuLinkAddData");
>  
>    CuLinkCreateFcnPtr =
> -      (CuLinkCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkCreate");
> +      (CuLinkCreateFcnTy *)getAPIHandle(HandleCuda, "cuLinkCreate");
>  
>    CuLinkCompleteFcnPtr =
> -      (CuLinkCompleteFcnTy *)getAPIHandleCUDA(HandleCuda,
> "cuLinkComplete");
> +      (CuLinkCompleteFcnTy *)getAPIHandle(HandleCuda, "cuLinkComplete");
>  
>    CuLinkDestroyFcnPtr =
> -      (CuLinkDestroyFcnTy *)getAPIHandleCUDA(HandleCuda,
> "cuLinkDestroy");
> +      (CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy");
>  
>    CuCtxSynchronizeFcnPtr =
> -      (CuCtxSynchronizeFcnTy *)getAPIHandleCUDA(HandleCuda,
> "cuCtxSynchronize");
> +      (CuCtxSynchronizeFcnTy *)getAPIHandle(HandleCuda,
> "cuCtxSynchronize");
>  
>    /* Get function pointer to CUDA Runtime APIs. */
> -  CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy
> *)getAPIHandleCUDA(
> +  CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy
> *)getAPIHandle(
>        HandleCudaRT, "cudaThreadSynchronize");
>  
>    return 1;
>  }
>  
> -static PollyGPUContext *initContextCUDA() {
> +PollyGPUContext *polly_initContext() {
> +  DebugMode = getenv("POLLY_DEBUG") != 0;
> +
>    dump_function();
>    PollyGPUContext *Context;
>    CUdevice Device;
> @@ -1111,20 +263,20 @@ static PollyGPUContext *initContextCUDA(
>      return CurrentContext;
>  
>    /* Get API handles. */
> -  if (initialDeviceAPIsCUDA() == 0) {
> -    fprintf(stderr, "Getting the \"handle\" for the CUDA driver API
> failed.\n");
> +  if (initialDeviceAPIs() == 0) {
> +    fprintf(stdout, "Getting the \"handle\" for the CUDA driver API
> failed.\n");
>      exit(-1);
>    }
>  
>    if (CuInitFcnPtr(0) != CUDA_SUCCESS) {
> -    fprintf(stderr, "Initializing the CUDA driver API failed.\n");
> +    fprintf(stdout, "Initializing the CUDA driver API failed.\n");
>      exit(-1);
>    }
>  
>    /* Get number of devices that supports CUDA. */
>    CuDeviceGetCountFcnPtr(&DeviceCount);
>    if (DeviceCount == 0) {
> -    fprintf(stderr, "There is no device supporting CUDA.\n");
> +    fprintf(stdout, "There is no device supporting CUDA.\n");
>      exit(-1);
>    }
>  
> @@ -1138,15 +290,12 @@ static PollyGPUContext *initContextCUDA(
>    /* Create context on the device. */
>    Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
>    if (Context == 0) {
> -    fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
> +    fprintf(stdout, "Allocate memory for Polly GPU context failed.\n");
>      exit(-1);
>    }
> -  Context->Context = malloc(sizeof(CUDAContext));
> -  if (Context->Context == 0) {
> -    fprintf(stderr, "Allocate memory for Polly CUDA context failed.\n");
> -    exit(-1);
> -  }
> -  CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0,
> Device);
> +  CuCtxCreateFcnPtr(&(Context->Cuda), 0, Device);
> +
> +  CacheMode = getenv("POLLY_NOCACHE") == 0;
>  
>    if (CacheMode)
>      CurrentContext = Context;
> @@ -1154,24 +303,18 @@ static PollyGPUContext *initContextCUDA(
>    return Context;
>  }
>  
> -static void freeKernelCUDA(PollyGPUFunction *Kernel) {
> -  dump_function();
> -
> -  if (CacheMode)
> -    return;
> -
> -  if (((CUDAKernel *)Kernel->Kernel)->CudaModule)
> -    CuModuleUnloadFcnPtr(((CUDAKernel *)Kernel->Kernel)->CudaModule);
> -
> -  if (Kernel->Kernel)
> -    free((CUDAKernel *)Kernel->Kernel);
> +static void freeKernel(PollyGPUFunction *Kernel) {
> +  if (Kernel->CudaModule)
> +    CuModuleUnloadFcnPtr(Kernel->CudaModule);
>  
>    if (Kernel)
>      free(Kernel);
>  }
>  
> -static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer,
> -                                       const char *KernelName) {
> +#define KERNEL_CACHE_SIZE 10
> +
> +PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
> +                                  const char *KernelName) {
>    dump_function();
>  
>    static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
> @@ -1181,21 +324,16 @@ static PollyGPUFunction *getKernelCUDA(c
>      // We exploit here the property that all Polly-ACC kernels are
>      allocated
>      // as global constants, hence a pointer comparision is sufficient to
>      // determin equality.
> -    if (KernelCache[i] &&
> -        ((CUDAKernel *)KernelCache[i]->Kernel)->BinaryString ==
> BinaryBuffer) {
> +    if (KernelCache[i] && KernelCache[i]->PTXString == PTXBuffer) {
>        debug_print("  -> using cached kernel\n");
>        return KernelCache[i];
>      }
>    }
>  
>    PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
> +
>    if (Function == 0) {
> -    fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
> -    exit(-1);
> -  }
> -  Function->Kernel = (CUDAKernel *)malloc(sizeof(CUDAKernel));
> -  if (Function->Kernel == 0) {
> -    fprintf(stderr, "Allocate memory for Polly CUDA function
> failed.\n");
> +    fprintf(stdout, "Allocate memory for Polly GPU function failed.\n");
>      exit(-1);
>    }
>  
> @@ -1232,45 +370,43 @@ static PollyGPUFunction *getKernelCUDA(c
>    memset(ErrorLog, 0, sizeof(ErrorLog));
>  
>    CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
> -  Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void
> *)BinaryBuffer,
> -                            strlen(BinaryBuffer) + 1, 0, 0, 0, 0);
> +  Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)PTXBuffer,
> +                            strlen(PTXBuffer) + 1, 0, 0, 0, 0);
>    if (Res != CUDA_SUCCESS) {
> -    fprintf(stderr, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
> +    fprintf(stdout, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
>      exit(-1);
>    }
>  
>    Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
>    if (Res != CUDA_SUCCESS) {
> -    fprintf(stderr, "Complete ptx linker step failed.\n");
> -    fprintf(stderr, "\n%s\n", ErrorLog);
> +    fprintf(stdout, "Complete ptx linker step failed.\n");
> +    fprintf(stdout, "\n%s\n", ErrorLog);
>      exit(-1);
>    }
>  
>    debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n",
>    Walltime,
>                InfoLog);
>  
> -  Res = CuModuleLoadDataFcnPtr(&(((CUDAKernel
> *)Function->Kernel)->CudaModule),
> -                               CuOut);
> +  Res = CuModuleLoadDataFcnPtr(&(Function->CudaModule), CuOut);
>    if (Res != CUDA_SUCCESS) {
> -    fprintf(stderr, "Loading ptx assembly text failed.\n");
> +    fprintf(stdout, "Loading ptx assembly text failed.\n");
>      exit(-1);
>    }
>  
> -  Res = CuModuleGetFunctionFcnPtr(&(((CUDAKernel
> *)Function->Kernel)->Cuda),
> -                                  ((CUDAKernel
> *)Function->Kernel)->CudaModule,
> +  Res = CuModuleGetFunctionFcnPtr(&(Function->Cuda),
> Function->CudaModule,
>                                    KernelName);
>    if (Res != CUDA_SUCCESS) {
> -    fprintf(stderr, "Loading kernel function failed.\n");
> +    fprintf(stdout, "Loading kernel function failed.\n");
>      exit(-1);
>    }
>  
>    CuLinkDestroyFcnPtr(LState);
>  
> -  ((CUDAKernel *)Function->Kernel)->BinaryString = BinaryBuffer;
> +  Function->PTXString = PTXBuffer;
>  
>    if (CacheMode) {
>      if (KernelCache[NextCacheItem])
> -      freeKernelCUDA(KernelCache[NextCacheItem]);
> +      freeKernel(KernelCache[NextCacheItem]);
>  
>      KernelCache[NextCacheItem] = Function;
>  
> @@ -1280,37 +416,44 @@ static PollyGPUFunction *getKernelCUDA(c
>    return Function;
>  }
>  
> -static void synchronizeDeviceCUDA() {
> +void polly_freeKernel(PollyGPUFunction *Kernel) {
>    dump_function();
> -  if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
> -    fprintf(stderr, "Synchronizing device and host memory failed.\n");
> -    exit(-1);
> -  }
> +
> +  if (CacheMode)
> +    return;
> +
> +  freeKernel(Kernel);
>  }
>  
> -static void copyFromHostToDeviceCUDA(void *HostData, PollyGPUDevicePtr
> *DevData,
> -                                     long MemSize) {
> +void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr
> *DevData,
> +                                long MemSize) {
>    dump_function();
>  
> -  CUdeviceptr CuDevData = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda;
> +  CUdeviceptr CuDevData = DevData->Cuda;
>    CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize);
>  }
>  
> -static void copyFromDeviceToHostCUDA(PollyGPUDevicePtr *DevData, void
> *HostData,
> -                                     long MemSize) {
> +void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void
> *HostData,
> +                                long MemSize) {
>    dump_function();
>  
> -  if (CuMemcpyDtoHFcnPtr(HostData, ((CUDADevicePtr
> *)DevData->DevicePtr)->Cuda,
> -                         MemSize) != CUDA_SUCCESS) {
> -    fprintf(stderr, "Copying results from device to host memory
> failed.\n");
> +  if (CuMemcpyDtoHFcnPtr(HostData, DevData->Cuda, MemSize) !=
> CUDA_SUCCESS) {
> +    fprintf(stdout, "Copying results from device to host memory
> failed.\n");
> +    exit(-1);
> +  }
> +}
> +void polly_synchronizeDevice() {
> +  dump_function();
> +  if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
> +    fprintf(stdout, "Synchronizing device and host memory failed.\n");
>      exit(-1);
>    }
>  }
>  
> -static void launchKernelCUDA(PollyGPUFunction *Kernel, unsigned int
> GridDimX,
> -                             unsigned int GridDimY, unsigned int
> BlockDimX,
> -                             unsigned int BlockDimY, unsigned int
> BlockDimZ,
> -                             void **Parameters) {
> +void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
> +                        unsigned int GridDimY, unsigned int BlockDimX,
> +                        unsigned int BlockDimY, unsigned int BlockDimZ,
> +                        void **Parameters) {
>    dump_function();
>  
>    unsigned GridDimZ = 1;
> @@ -1319,290 +462,45 @@ static void launchKernelCUDA(PollyGPUFun
>    void **Extra = 0;
>  
>    CUresult Res;
> -  Res =
> -      CuLaunchKernelFcnPtr(((CUDAKernel *)Kernel->Kernel)->Cuda,
> GridDimX,
> -                           GridDimY, GridDimZ, BlockDimX, BlockDimY,
> BlockDimZ,
> -                           SharedMemBytes, Stream, Parameters, Extra);
> +  Res = CuLaunchKernelFcnPtr(Kernel->Cuda, GridDimX, GridDimY, GridDimZ,
> +                             BlockDimX, BlockDimY, BlockDimZ,
> SharedMemBytes,
> +                             Stream, Parameters, Extra);
>    if (Res != CUDA_SUCCESS) {
> -    fprintf(stderr, "Launching CUDA kernel failed.\n");
> +    fprintf(stdout, "Launching CUDA kernel failed.\n");
>      exit(-1);
>    }
>  }
>  
> -static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) {
> +void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
>    dump_function();
> -  CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
> -  CuMemFreeFcnPtr((CUdeviceptr)DevPtr->Cuda);
> -  free(DevPtr);
> +  CuMemFreeFcnPtr((CUdeviceptr)Allocation->Cuda);
>    free(Allocation);
>  }
>  
> -static PollyGPUDevicePtr *allocateMemoryForDeviceCUDA(long MemSize) {
> +PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
>    dump_function();
>  
>    PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
> +
>    if (DevData == 0) {
> -    fprintf(stderr, "Allocate memory for GPU device memory pointer
> failed.\n");
> -    exit(-1);
> -  }
> -  DevData->DevicePtr = (CUDADevicePtr *)malloc(sizeof(CUDADevicePtr));
> -  if (DevData->DevicePtr == 0) {
> -    fprintf(stderr, "Allocate memory for GPU device memory pointer
> failed.\n");
> +    fprintf(stdout, "Allocate memory for GPU device memory pointer
> failed.\n");
>      exit(-1);
>    }
>  
> -  CUresult Res =
> -      CuMemAllocFcnPtr(&(((CUDADevicePtr *)DevData->DevicePtr)->Cuda),
> MemSize);
> +  CUresult Res = CuMemAllocFcnPtr(&(DevData->Cuda), MemSize);
>  
>    if (Res != CUDA_SUCCESS) {
> -    fprintf(stderr, "Allocate memory for GPU device memory pointer
> failed.\n");
> +    fprintf(stdout, "Allocate memory for GPU device memory pointer
> failed.\n");
>      exit(-1);
>    }
>  
>    return DevData;
>  }
>  
> -static void *getDevicePtrCUDA(PollyGPUDevicePtr *Allocation) {
> -  dump_function();
> -
> -  CUDADevicePtr *DevPtr = (CUDADevicePtr *)Allocation->DevicePtr;
> -  return (void *)DevPtr->Cuda;
> -}
> -
> -static void freeContextCUDA(PollyGPUContext *Context) {
> -  dump_function();
> -
> -  CUDAContext *Ctx = (CUDAContext *)Context->Context;
> -  if (Ctx->Cuda) {
> -    CuCtxDestroyFcnPtr(Ctx->Cuda);
> -    free(Ctx);
> -    free(Context);
> -  }
> -
> -  dlclose(HandleCuda);
> -  dlclose(HandleCudaRT);
> -}
> -
> -#endif /* HAS_LIBCUDART */
> -/******************************************************************************/
> -/*                                    API                               
>      */
> -/******************************************************************************/
> -
> -PollyGPUContext *polly_initContext() {
> -  DebugMode = getenv("POLLY_DEBUG") != 0;
> -  CacheMode = getenv("POLLY_NOCACHE") == 0;
> -
> -  dump_function();
> -
> -  PollyGPUContext *Context;
> -
> -  switch (Runtime) {
> -#ifdef HAS_LIBCUDART
> -  case RUNTIME_CUDA:
> -    Context = initContextCUDA();
> -    break;
> -#endif /* HAS_LIBCUDART */
> -#ifdef HAS_LIBOPENCL
> -  case RUNTIME_CL:
> -    Context = initContextCL();
> -    break;
> -#endif /* HAS_LIBOPENCL */
> -  default:
> -    err_runtime();
> -  }
> -
> -  return Context;
> -}
> -
> -void polly_freeKernel(PollyGPUFunction *Kernel) {
> -  dump_function();
> -
> -  switch (Runtime) {
> -#ifdef HAS_LIBCUDART
> -  case RUNTIME_CUDA:
> -    freeKernelCUDA(Kernel);
> -    break;
> -#endif /* HAS_LIBCUDART */
> -#ifdef HAS_LIBOPENCL
> -  case RUNTIME_CL:
> -    freeKernelCL(Kernel);
> -    break;
> -#endif /* HAS_LIBOPENCL */
> -  default:
> -    err_runtime();
> -  }
> -}
> -
> -PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
> -                                  const char *KernelName) {
> -  dump_function();
> -
> -  PollyGPUFunction *Function;
> -
> -  switch (Runtime) {
> -#ifdef HAS_LIBCUDART
> -  case RUNTIME_CUDA:
> -    Function = getKernelCUDA(BinaryBuffer, KernelName);
> -    break;
> -#endif /* HAS_LIBCUDART */
> -#ifdef HAS_LIBOPENCL
> -  case RUNTIME_CL:
> -    Function = getKernelCL(BinaryBuffer, KernelName);
> -    break;
> -#endif /* HAS_LIBOPENCL */
> -  default:
> -    err_runtime();
> -  }
> -
> -  return Function;
> -}
> -
> -void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr
> *DevData,
> -                                long MemSize) {
> -  dump_function();
> -
> -  switch (Runtime) {
> -#ifdef HAS_LIBCUDART
> -  case RUNTIME_CUDA:
> -    copyFromHostToDeviceCUDA(HostData, DevData, MemSize);
> -    break;
> -#endif /* HAS_LIBCUDART */
> -#ifdef HAS_LIBOPENCL
> -  case RUNTIME_CL:
> -    copyFromHostToDeviceCL(HostData, DevData, MemSize);
> -    break;
> -#endif /* HAS_LIBOPENCL */
> -  default:
> -    err_runtime();
> -  }
> -}
> -
> -void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void
> *HostData,
> -                                long MemSize) {
> -  dump_function();
> -
> -  switch (Runtime) {
> -#ifdef HAS_LIBCUDART
> -  case RUNTIME_CUDA:
> -    copyFromDeviceToHostCUDA(DevData, HostData, MemSize);
> -    break;
> -#endif /* HAS_LIBCUDART */
> -#ifdef HAS_LIBOPENCL
> -  case RUNTIME_CL:
> -    copyFromDeviceToHostCL(DevData, HostData, MemSize);
> -    break;
> -#endif /* HAS_LIBOPENCL */
> -  default:
> -    err_runtime();
> -  }
> -}
> -
> -void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
> -                        unsigned int GridDimY, unsigned int BlockDimX,
> -                        unsigned int BlockDimY, unsigned int BlockDimZ,
> -                        void **Parameters) {
> -  dump_function();
> -
> -  switch (Runtime) {
> -#ifdef HAS_LIBCUDART
> -  case RUNTIME_CUDA:
> -    launchKernelCUDA(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
> -                     BlockDimZ, Parameters);
> -    break;
> -#endif /* HAS_LIBCUDART */
> -#ifdef HAS_LIBOPENCL
> -  case RUNTIME_CL:
> -    launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
> BlockDimZ,
> -                   Parameters);
> -    break;
> -#endif /* HAS_LIBOPENCL */
> -  default:
> -    err_runtime();
> -  }
> -}
> -
> -void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
> -  dump_function();
> -
> -  switch (Runtime) {
> -#ifdef HAS_LIBCUDART
> -  case RUNTIME_CUDA:
> -    freeDeviceMemoryCUDA(Allocation);
> -    break;
> -#endif /* HAS_LIBCUDART */
> -#ifdef HAS_LIBOPENCL
> -  case RUNTIME_CL:
> -    freeDeviceMemoryCL(Allocation);
> -    break;
> -#endif /* HAS_LIBOPENCL */
> -  default:
> -    err_runtime();
> -  }
> -}
> -
> -PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
> -  dump_function();
> -
> -  PollyGPUDevicePtr *DevData;
> -
> -  switch (Runtime) {
> -#ifdef HAS_LIBCUDART
> -  case RUNTIME_CUDA:
> -    DevData = allocateMemoryForDeviceCUDA(MemSize);
> -    break;
> -#endif /* HAS_LIBCUDART */
> -#ifdef HAS_LIBOPENCL
> -  case RUNTIME_CL:
> -    DevData = allocateMemoryForDeviceCL(MemSize);
> -    break;
> -#endif /* HAS_LIBOPENCL */
> -  default:
> -    err_runtime();
> -  }
> -
> -  return DevData;
> -}
> -
>  void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) {
>    dump_function();
>  
> -  void *DevPtr;
> -
> -  switch (Runtime) {
> -#ifdef HAS_LIBCUDART
> -  case RUNTIME_CUDA:
> -    DevPtr = getDevicePtrCUDA(Allocation);
> -    break;
> -#endif /* HAS_LIBCUDART */
> -#ifdef HAS_LIBOPENCL
> -  case RUNTIME_CL:
> -    DevPtr = getDevicePtrCL(Allocation);
> -    break;
> -#endif /* HAS_LIBOPENCL */
> -  default:
> -    err_runtime();
> -  }
> -
> -  return DevPtr;
> -}
> -
> -void polly_synchronizeDevice() {
> -  dump_function();
> -
> -  switch (Runtime) {
> -#ifdef HAS_LIBCUDART
> -  case RUNTIME_CUDA:
> -    synchronizeDeviceCUDA();
> -    break;
> -#endif /* HAS_LIBCUDART */
> -#ifdef HAS_LIBOPENCL
> -  case RUNTIME_CL:
> -    synchronizeDeviceCL();
> -    break;
> -#endif /* HAS_LIBOPENCL */
> -  default:
> -    err_runtime();
> -  }
> +  return (void *)Allocation->Cuda;
>  }
>  
>  void polly_freeContext(PollyGPUContext *Context) {
> @@ -1611,40 +509,11 @@ void polly_freeContext(PollyGPUContext *
>    if (CacheMode)
>      return;
>  
> -  switch (Runtime) {
> -#ifdef HAS_LIBCUDART
> -  case RUNTIME_CUDA:
> -    freeContextCUDA(Context);
> -    break;
> -#endif /* HAS_LIBCUDART */
> -#ifdef HAS_LIBOPENCL
> -  case RUNTIME_CL:
> -    freeContextCL(Context);
> -    break;
> -#endif /* HAS_LIBOPENCL */
> -  default:
> -    err_runtime();
> +  if (Context->Cuda) {
> +    CuCtxDestroyFcnPtr(Context->Cuda);
> +    free(Context);
>    }
> -}
> -
> -/* Initialize GPUJIT with CUDA as runtime library. */
> -PollyGPUContext *polly_initContextCUDA() {
> -#ifdef HAS_LIBCUDART
> -  Runtime = RUNTIME_CUDA;
> -  return polly_initContext();
> -#else
> -  fprintf(stderr, "GPU Runtime was built without CUDA support.\n");
> -  exit(-1);
> -#endif /* HAS_LIBCUDART */
> -}
>  
> -/* Initialize GPUJIT with OpenCL as runtime library. */
> -PollyGPUContext *polly_initContextCL() {
> -#ifdef HAS_LIBOPENCL
> -  Runtime = RUNTIME_CL;
> -  return polly_initContext();
> -#else
> -  fprintf(stderr, "GPU Runtime was built without OpenCL support.\n");
> -  exit(-1);
> -#endif /* HAS_LIBOPENCL */
> +  dlclose(HandleCuda);
> +  dlclose(HandleCudaRT);
>  }
> 
> Modified: polly/trunk/tools/GPURuntime/GPUJIT.h
> URL:
> http://llvm.org/viewvc/llvm-project/polly/trunk/tools/GPURuntime/GPUJIT.h?rev=302217&r1=302216&r2=302217&view=diff
> ==============================================================================
> --- polly/trunk/tools/GPURuntime/GPUJIT.h (original)
> +++ polly/trunk/tools/GPURuntime/GPUJIT.h Fri May  5 04:02:08 2017
> @@ -76,27 +76,12 @@
>   *
>   */
>  
> -typedef enum PollyGPURuntimeT {
> -  RUNTIME_NONE,
> -  RUNTIME_CUDA,
> -  RUNTIME_CL
> -} PollyGPURuntime;
> -
>  typedef struct PollyGPUContextT PollyGPUContext;
>  typedef struct PollyGPUFunctionT PollyGPUFunction;
>  typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr;
>  
> -typedef struct OpenCLContextT OpenCLContext;
> -typedef struct OpenCLKernelT OpenCLKernel;
> -typedef struct OpenCLDevicePtrT OpenCLDevicePtr;
> -
> -typedef struct CUDAContextT CUDAContext;
> -typedef struct CUDAKernelT CUDAKernel;
> -typedef struct CUDADevicePtrT CUDADevicePtr;
> -
> -PollyGPUContext *polly_initContextCUDA();
> -PollyGPUContext *polly_initContextCL();
> -PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
> +PollyGPUContext *polly_initContext();
> +PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
>                                    const char *KernelName);
>  void polly_freeKernel(PollyGPUFunction *Kernel);
>  void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr
>  *DevData,
> 
> 
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits


More information about the llvm-commits mailing list