[polly] r308751 - [Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel

Fri Jul 21 09:11:06 PDT 2017

Author: phschaad
Date: Fri Jul 21 09:11:06 2017
New Revision: 308751

URL: http://llvm.org/viewvc/llvm-project?rev=308751&view=rev
Log:
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel

Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).

Reviewers: bollu, grosser, Meinersbur, singam-sanjay

Reviewed By: grosser, singam-sanjay

Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton

Tags: #polly

Differential Revision: https://reviews.llvm.org/D35185

Added:
    polly/trunk/test/GPGPU/spir-codegen.ll
Modified:
    polly/trunk/include/polly/CodeGen/PPCGCodeGeneration.h
    polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
    polly/trunk/lib/Support/RegisterPasses.cpp
    polly/trunk/tools/GPURuntime/GPUJIT.c

Modified: polly/trunk/include/polly/CodeGen/PPCGCodeGeneration.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/CodeGen/PPCGCodeGeneration.h?rev=308751&r1=308750&r2=308751&view=diff
==============================================================================

--- polly/trunk/include/polly/CodeGen/PPCGCodeGeneration.h (original)
+++ polly/trunk/include/polly/CodeGen/PPCGCodeGeneration.h Fri Jul 21 09:11:06 2017
@@ -16,7 +16,7 @@
 #define POLLY_PPCGCODEGENERATION_H
 
 /// The GPU Architecture to target.
-enum GPUArch { NVPTX64 };
+enum GPUArch { NVPTX64, SPIR32, SPIR64 };
 
 /// The GPU Runtime implementation to use.
 enum GPURuntime { CUDA, OpenCL };

Modified: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp?rev=308751&r1=308750&r2=308751&view=diff
==============================================================================
--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp (original)
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp Fri Jul 21 09:11:06 2017
@@ -545,6 +545,11 @@ private:
   /// @param The kernel to generate the intrinsic functions for.
   void insertKernelIntrinsics(ppcg_kernel *Kernel);
 
+  /// Insert function calls to retrieve the SPIR group/local ids.
+  ///
+  /// @param The kernel to generate the function calls for.
+  void insertKernelCallsSPIR(ppcg_kernel *Kernel);
+
   /// Setup the creation of functions referenced by the GPU kernel.
   ///
   /// 1. Create new function declarations in GPUModule which are the same as
@@ -1254,10 +1259,24 @@ void GPUNodeBuilder::createScopStmt(isl_
 
 void GPUNodeBuilder::createKernelSync() {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  const char *SpirName = "__gen_ocl_barrier_global";
 
   Function *Sync;
 
   switch (Arch) {
+  case GPUArch::SPIR64:
+  case GPUArch::SPIR32:
+    Sync = M->getFunction(SpirName);
+
+    // If Sync is not available, declare it.
+    if (!Sync) {
+      GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
+      std::vector<Type *> Args;
+      FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
+      Sync = Function::Create(Ty, Linkage, SpirName, M);
+      Sync->setCallingConv(CallingConv::SPIR_FUNC);
+    }
+    break;
   case GPUArch::NVPTX64:
     Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
     break;
@@ -1668,7 +1687,8 @@ void GPUNodeBuilder::createKernel(__isl_
 
   finalizeKernelArguments(Kernel);
   Function *F = Builder.GetInsertBlock()->getParent();
-  addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
+  if (Arch == GPUArch::NVPTX64)
+    addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
   clearDominators(F);
   clearScalarEvolution(F);
   clearLoops(F);
@@ -1725,12 +1745,35 @@ static std::string computeNVPTXDataLayou
   return Ret;
 }
 
+/// Compute the DataLayout string for a SPIR kernel.
+///
+/// @param is64Bit Are we looking for a 64 bit architecture?
+static std::string computeSPIRDataLayout(bool is64Bit) {
+  std::string Ret = "";
+
+  if (!is64Bit) {
+    Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
+           "64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
+           "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
+           "256:256-v256:256:256-v512:512:512-v1024:1024:1024";
+  } else {
+    Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
+           "64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
+           "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
+           "256:256-v256:256:256-v512:512:512-v1024:1024:1024";
+  }
+
+  return Ret;
+}
+
 Function *
 GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
                                          SetVector<Value *> &SubtreeValues) {
   std::vector<Type *> Args;
   std::string Identifier = getKernelFuncName(Kernel->id);
 
+  std::vector<Metadata *> MemoryType;
+
   for (long i = 0; i < Prog->n_array; i++) {
     if (!ppcg_kernel_requires_array_argument(Kernel, i))
       continue;
@@ -1739,16 +1782,23 @@ GPUNodeBuilder::createKernelFunctionDecl
       isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
       const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
       Args.push_back(SAI->getElementType());
+      MemoryType.push_back(
+          ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
     } else {
       static const int UseGlobalMemory = 1;
       Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory));
+      MemoryType.push_back(
+          ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 1)));
     }
   }
 
   int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);
 
-  for (long i = 0; i < NumHostIters; i++)
+  for (long i = 0; i < NumHostIters; i++) {
     Args.push_back(Builder.getInt64Ty());
+    MemoryType.push_back(
+        ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
+  }
 
   int NumVars = isl_space_dim(Kernel->space, isl_dim_param);
 
@@ -1757,19 +1807,49 @@ GPUNodeBuilder::createKernelFunctionDecl
     Value *Val = IDToValue[Id];
     isl_id_free(Id);
     Args.push_back(Val->getType());
+    MemoryType.push_back(
+        ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
   }
 
-  for (auto *V : SubtreeValues)
+  for (auto *V : SubtreeValues) {
     Args.push_back(V->getType());
+    MemoryType.push_back(
+        ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
+  }
 
   auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
   auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                               GPUModule.get());
 
+  std::vector<Metadata *> EmptyStrings;
+
+  for (unsigned int i = 0; i < MemoryType.size(); i++) {
+    EmptyStrings.push_back(MDString::get(FN->getContext(), ""));
+  }
+
+  if (Arch == GPUArch::SPIR32 || Arch == GPUArch::SPIR64) {
+    FN->setMetadata("kernel_arg_addr_space",
+                    MDNode::get(FN->getContext(), MemoryType));
+    FN->setMetadata("kernel_arg_name",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+    FN->setMetadata("kernel_arg_access_qual",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+    FN->setMetadata("kernel_arg_type",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+    FN->setMetadata("kernel_arg_type_qual",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+    FN->setMetadata("kernel_arg_base_type",
+                    MDNode::get(FN->getContext(), EmptyStrings));
+  }
+
   switch (Arch) {
   case GPUArch::NVPTX64:
     FN->setCallingConv(CallingConv::PTX_Kernel);
     break;
+  case GPUArch::SPIR32:
+  case GPUArch::SPIR64:
+    FN->setCallingConv(CallingConv::SPIR_KERNEL);
+    break;
   }
 
   auto Arg = FN->arg_begin();
@@ -1835,6 +1915,9 @@ void GPUNodeBuilder::insertKernelIntrins
   Intrinsic::ID IntrinsicsTID[3];
 
   switch (Arch) {
+  case GPUArch::SPIR64:
+  case GPUArch::SPIR32:
+    llvm_unreachable("Cannot generate NVVM intrinsics for SPIR");
   case GPUArch::NVPTX64:
     IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x;
     IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y;
@@ -1866,6 +1949,41 @@ void GPUNodeBuilder::insertKernelIntrins
   }
 }
 
+void GPUNodeBuilder::insertKernelCallsSPIR(ppcg_kernel *Kernel) {
+  const char *GroupName[3] = {"__gen_ocl_get_group_id0",
+                              "__gen_ocl_get_group_id1",
+                              "__gen_ocl_get_group_id2"};
+
+  const char *LocalName[3] = {"__gen_ocl_get_local_id0",
+                              "__gen_ocl_get_local_id1",
+                              "__gen_ocl_get_local_id2"};
+
+  auto createFunc = [this](const char *Name, __isl_take isl_id *Id) mutable {
+    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+    Function *FN = M->getFunction(Name);
+
+    // If FN is not available, declare it.
+    if (!FN) {
+      GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
+      std::vector<Type *> Args;
+      FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Args, false);
+      FN = Function::Create(Ty, Linkage, Name, M);
+      FN->setCallingConv(CallingConv::SPIR_FUNC);
+    }
+
+    Value *Val = Builder.CreateCall(FN, {});
+    Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name);
+    IDToValue[Id] = Val;
+    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
+  };
+
+  for (int i = 0; i < Kernel->n_grid; ++i)
+    createFunc(GroupName[i], isl_id_list_get_id(Kernel->block_ids, i));
+
+  for (int i = 0; i < Kernel->n_block; ++i)
+    createFunc(LocalName[i], isl_id_list_get_id(Kernel->thread_ids, i));
+}
+
 void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) {
   auto Arg = FN->arg_begin();
   for (long i = 0; i < Kernel->n_array; i++) {
@@ -2004,6 +2122,14 @@ void GPUNodeBuilder::createKernelFunctio
       GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
     GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
     break;
+  case GPUArch::SPIR32:
+    GPUModule->setTargetTriple(Triple::normalize("spir-unknown-unknown"));
+    GPUModule->setDataLayout(computeSPIRDataLayout(false /* is64Bit */));
+    break;
+  case GPUArch::SPIR64:
+    GPUModule->setTargetTriple(Triple::normalize("spir64-unknown-unknown"));
+    GPUModule->setDataLayout(computeSPIRDataLayout(true /* is64Bit */));
+    break;
   }
 
   Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);
@@ -2021,7 +2147,16 @@ void GPUNodeBuilder::createKernelFunctio
 
   prepareKernelArguments(Kernel, FN);
   createKernelVariables(Kernel, FN);
-  insertKernelIntrinsics(Kernel);
+
+  switch (Arch) {
+  case GPUArch::NVPTX64:
+    insertKernelIntrinsics(Kernel);
+    break;
+  case GPUArch::SPIR32:
+  case GPUArch::SPIR64:
+    insertKernelCallsSPIR(Kernel);
+    break;
+  }
 }
 
 std::string GPUNodeBuilder::createKernelASM() {
@@ -2038,6 +2173,13 @@ std::string GPUNodeBuilder::createKernel
       break;
     }
     break;
+  case GPUArch::SPIR64:
+  case GPUArch::SPIR32:
+    std::string SPIRAssembly;
+    raw_string_ostream IROstream(SPIRAssembly);
+    IROstream << *GPUModule;
+    IROstream.flush();
+    return SPIRAssembly;
   }
 
   std::string ErrMsg;
@@ -2057,6 +2199,9 @@ std::string GPUNodeBuilder::createKernel
   case GPUArch::NVPTX64:
     subtarget = CudaVersion;
     break;
+  case GPUArch::SPIR32:
+  case GPUArch::SPIR64:
+    llvm_unreachable("No subtarget for SPIR architecture");
   }
 
   std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
@@ -2097,13 +2242,15 @@ std::string GPUNodeBuilder::finalizeKern
   if (DumpKernelIR)
     outs() << *GPUModule << "\n";
 
-  // Optimize module.
-  llvm::legacy::PassManager OptPasses;
-  PassManagerBuilder PassBuilder;
-  PassBuilder.OptLevel = 3;
-  PassBuilder.SizeLevel = 0;
-  PassBuilder.populateModulePassManager(OptPasses);
-  OptPasses.run(*GPUModule);
+  if (Arch != GPUArch::SPIR32 && Arch != GPUArch::SPIR64) {
+    // Optimize module.
+    llvm::legacy::PassManager OptPasses;
+    PassManagerBuilder PassBuilder;
+    PassBuilder.OptLevel = 3;
+    PassBuilder.SizeLevel = 0;
+    PassBuilder.populateModulePassManager(OptPasses);
+    OptPasses.run(*GPUModule);
+  }
 
   std::string Assembly = createKernelASM();
 

Modified: polly/trunk/lib/Support/RegisterPasses.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/Support/RegisterPasses.cpp?rev=308751&r1=308750&r2=308751&view=diff
==============================================================================
--- polly/trunk/lib/Support/RegisterPasses.cpp (original)
+++ polly/trunk/lib/Support/RegisterPasses.cpp Fri Jul 21 09:11:06 2017
@@ -117,7 +117,11 @@ static cl::opt<GPURuntime> GPURuntimeCho
 static cl::opt<GPUArch>
     GPUArchChoice("polly-gpu-arch", cl::desc("The GPU Architecture to target"),
                   cl::values(clEnumValN(GPUArch::NVPTX64, "nvptx64",
-                                        "target NVIDIA 64-bit architecture")),
+                                        "target NVIDIA 64-bit architecture"),
+                             clEnumValN(GPUArch::SPIR32, "spir32",
+                                        "target SPIR 32-bit architecture"),
+                             clEnumValN(GPUArch::SPIR64, "spir64",
+                                        "target SPIR 64-bit architecture")),
                   cl::init(GPUArch::NVPTX64), cl::ZeroOrMore,
                   cl::cat(PollyCategory));
 #endif

Added: polly/trunk/test/GPGPU/spir-codegen.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/spir-codegen.ll?rev=308751&view=auto
==============================================================================
--- polly/trunk/test/GPGPU/spir-codegen.ll (added)
+++ polly/trunk/test/GPGPU/spir-codegen.ll Fri Jul 21 09:11:06 2017
@@ -0,0 +1,118 @@
+; RUN: opt -O3 -polly -polly-target=gpu \
+; RUN: -polly-gpu-arch=spir32 \
+; RUN: -polly-acc-dump-kernel-ir -polly-process-unprofitable -disable-output < %s | \
+; RUN: FileCheck %s
+
+; REQUIRES: pollyacc
+
+; CHECK:      target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+; CHECK-NEXT: target triple = "spir-unknown-unknown"
+
+; CHECK-LABEL: define spir_kernel void @FUNC_double_parallel_loop_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef0) #0 !kernel_arg_addr_space !0 !kernel_arg_name !1 !kernel_arg_access_qual !1 !kernel_arg_type !1 !kernel_arg_type_qual !1 !kernel_arg_base_type !1 {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   %0 = call i32 @__gen_ocl_get_group_id0()
+; CHECK-NEXT:   %__gen_ocl_get_group_id0 = zext i32 %0 to i64
+; CHECK-NEXT:   %1 = call i32 @__gen_ocl_get_group_id1()
+; CHECK-NEXT:   %__gen_ocl_get_group_id1 = zext i32 %1 to i64
+; CHECK-NEXT:   %2 = call i32 @__gen_ocl_get_local_id0()
+; CHECK-NEXT:   %__gen_ocl_get_local_id0 = zext i32 %2 to i64
+; CHECK-NEXT:   %3 = call i32 @__gen_ocl_get_local_id1()
+; CHECK-NEXT:   %__gen_ocl_get_local_id1 = zext i32 %3 to i64
+; CHECK-NEXT:   br label %polly.loop_preheader
+
+; CHECK-LABEL: polly.loop_exit:                                  ; preds = %polly.stmt.bb5
+; CHECK-NEXT:   ret void
+
+; CHECK-LABEL: polly.loop_header:                                ; preds = %polly.stmt.bb5, %polly.loop_preheader
+; CHECK-NEXT:   %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ]
+; CHECK-NEXT:   %4 = mul nsw i64 32, %__gen_ocl_get_group_id0
+; CHECK-NEXT:   %5 = add nsw i64 %4, %__gen_ocl_get_local_id0
+; CHECK-NEXT:   %6 = mul nsw i64 32, %__gen_ocl_get_group_id1
+; CHECK-NEXT:   %7 = add nsw i64 %6, %__gen_ocl_get_local_id1
+; CHECK-NEXT:   %8 = mul nsw i64 16, %polly.indvar
+; CHECK-NEXT:   %9 = add nsw i64 %7, %8
+; CHECK-NEXT:   br label %polly.stmt.bb5
+
+; CHECK-LABEL: polly.stmt.bb5:                                   ; preds = %polly.loop_header
+; CHECK-NEXT:   %10 = mul i64 %5, %9
+; CHECK-NEXT:   %p_tmp6 = sitofp i64 %10 to float
+; CHECK-NEXT:   %polly.access.cast.MemRef0 = bitcast i8 addrspace(1)* %MemRef0 to float addrspace(1)*
+; CHECK-NEXT:   %11 = mul nsw i64 32, %__gen_ocl_get_group_id0
+; CHECK-NEXT:   %12 = add nsw i64 %11, %__gen_ocl_get_local_id0
+; CHECK-NEXT:   %polly.access.mul.MemRef0 = mul nsw i64 %12, 1024
+; CHECK-NEXT:   %13 = mul nsw i64 32, %__gen_ocl_get_group_id1
+; CHECK-NEXT:   %14 = add nsw i64 %13, %__gen_ocl_get_local_id1
+; CHECK-NEXT:   %15 = mul nsw i64 16, %polly.indvar
+; CHECK-NEXT:   %16 = add nsw i64 %14, %15
+; CHECK-NEXT:   %polly.access.add.MemRef0 = add nsw i64 %polly.access.mul.MemRef0, %16
+; CHECK-NEXT:   %polly.access.MemRef0 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef0, i64 %polly.access.add.MemRef0
+; CHECK-NEXT:   %tmp8_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef0, align 4
+; CHECK-NEXT:   %p_tmp9 = fadd float %tmp8_p_scalar_, %p_tmp6
+; CHECK-NEXT:   %polly.access.cast.MemRef01 = bitcast i8 addrspace(1)* %MemRef0 to float addrspace(1)*
+; CHECK-NEXT:   %17 = mul nsw i64 32, %__gen_ocl_get_group_id0
+; CHECK-NEXT:   %18 = add nsw i64 %17, %__gen_ocl_get_local_id0
+; CHECK-NEXT:   %polly.access.mul.MemRef02 = mul nsw i64 %18, 1024
+; CHECK-NEXT:   %19 = mul nsw i64 32, %__gen_ocl_get_group_id1
+; CHECK-NEXT:   %20 = add nsw i64 %19, %__gen_ocl_get_local_id1
+; CHECK-NEXT:   %21 = mul nsw i64 16, %polly.indvar
+; CHECK-NEXT:   %22 = add nsw i64 %20, %21
+; CHECK-NEXT:   %polly.access.add.MemRef03 = add nsw i64 %polly.access.mul.MemRef02, %22
+; CHECK-NEXT:   %polly.access.MemRef04 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef01, i64 %polly.access.add.MemRef03
+; CHECK-NEXT:   store float %p_tmp9, float addrspace(1)* %polly.access.MemRef04, align 4
+; CHECK-NEXT:   %polly.indvar_next = add nsw i64 %polly.indvar, 1
+; CHECK-NEXT:   %polly.loop_cond = icmp sle i64 %polly.indvar_next, 1
+; CHECK-NEXT:   br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
+
+; CHECK-LABEL: polly.loop_preheader:                             ; preds = %entry
+; CHECK-NEXT:   br label %polly.loop_header
+
+; CHECK: attributes #0 = { "polly.skip.fn" }
+
+;    void double_parallel_loop(float A[][1024]) {
+;      for (long i = 0; i < 1024; i++)
+;        for (long j = 0; j < 1024; j++)
+;          A[i][j] += i * j;
+;    }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @double_parallel_loop([1024 x float]* %A) {
+bb:
+  br label %bb2
+
+bb2:                                              ; preds = %bb13, %bb
+  %i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ]
+  %exitcond1 = icmp ne i64 %i.0, 1024
+  br i1 %exitcond1, label %bb3, label %bb15
+
+bb3:                                              ; preds = %bb2
+  br label %bb4
+
+bb4:                                              ; preds = %bb10, %bb3
+  %j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ]
+  %exitcond = icmp ne i64 %j.0, 1024
+  br i1 %exitcond, label %bb5, label %bb12
+
+bb5:                                              ; preds = %bb4
+  %tmp = mul nuw nsw i64 %i.0, %j.0
+  %tmp6 = sitofp i64 %tmp to float
+  %tmp7 = getelementptr inbounds [1024 x float], [1024 x float]* %A, i64 %i.0, i64 %j.0
+  %tmp8 = load float, float* %tmp7, align 4
+  %tmp9 = fadd float %tmp8, %tmp6
+  store float %tmp9, float* %tmp7, align 4
+  br label %bb10
+
+bb10:                                             ; preds = %bb5
+  %tmp11 = add nuw nsw i64 %j.0, 1
+  br label %bb4
+
+bb12:                                             ; preds = %bb4
+  br label %bb13
+
+bb13:                                             ; preds = %bb12
+  %tmp14 = add nuw nsw i64 %i.0, 1
+  br label %bb2
+
+bb15:                                             ; preds = %bb2
+  ret void
+}

Modified: polly/trunk/tools/GPURuntime/GPUJIT.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/tools/GPURuntime/GPUJIT.c?rev=308751&r1=308750&r2=308751&view=diff
==============================================================================
--- polly/trunk/tools/GPURuntime/GPUJIT.c (original)
+++ polly/trunk/tools/GPURuntime/GPUJIT.c Fri Jul 21 09:11:06 2017
@@ -23,13 +23,14 @@
 #include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
-#endif
+#endif /* __APPLE__ */
 #endif /* HAS_LIBOPENCL */
 
 #include <dlfcn.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <string.h>
+#include <unistd.h>
 
 static int DebugMode;
 static int CacheMode;
@@ -89,6 +90,7 @@ struct OpenCLDevicePtrT {
 
 /* Dynamic library handles for the OpenCL runtime library. */
 static void *HandleOpenCL;
+static void *HandleOpenCLBeignet;
 
 /* Type-defines of function pointer to OpenCL Runtime API. */
 typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries,
@@ -139,6 +141,12 @@ clEnqueueWriteBufferFcnTy(cl_command_que
                           const cl_event *EventWaitList, cl_event *Event);
 static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr;
 
+typedef cl_program
+clCreateProgramWithLLVMIntelFcnTy(cl_context Context, cl_uint NumDevices,
+                                  const cl_device_id *DeviceList,
+                                  const char *Filename, cl_int *ErrcodeRet);
+static clCreateProgramWithLLVMIntelFcnTy *clCreateProgramWithLLVMIntelFcnPtr;
+
 typedef cl_program clCreateProgramWithBinaryFcnTy(
     cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList,
     const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus,
@@ -210,6 +218,7 @@ static void *getAPIHandleCL(void *Handle
 }
 
 static int initialDeviceAPILibrariesCL() {
+  HandleOpenCLBeignet = dlopen("/usr/local/lib/beignet/libcl.so", RTLD_LAZY);
   HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY);
   if (!HandleOpenCL) {
     fprintf(stderr, "Cannot open library: %s. \n", dlerror());
@@ -237,67 +246,79 @@ static int initialDeviceAPIsCL() {
   if (initialDeviceAPILibrariesCL() == 0)
     return 0;
 
+  // FIXME: We are now always selecting the Intel Beignet driver if it is
+  // available on the system, instead of a possible NVIDIA or AMD OpenCL
+  // API. This selection should occurr based on the target architecture
+  // chosen when compiling.
+  void *Handle =
+      (HandleOpenCLBeignet != NULL ? HandleOpenCLBeignet : HandleOpenCL);
+
   clGetPlatformIDsFcnPtr =
-      (clGetPlatformIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetPlatformIDs");
+      (clGetPlatformIDsFcnTy *)getAPIHandleCL(Handle, "clGetPlatformIDs");
 
   clGetDeviceIDsFcnPtr =
-      (clGetDeviceIDsFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceIDs");
+      (clGetDeviceIDsFcnTy *)getAPIHandleCL(Handle, "clGetDeviceIDs");
 
   clGetDeviceInfoFcnPtr =
-      (clGetDeviceInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetDeviceInfo");
+      (clGetDeviceInfoFcnTy *)getAPIHandleCL(Handle, "clGetDeviceInfo");
 
   clGetKernelInfoFcnPtr =
-      (clGetKernelInfoFcnTy *)getAPIHandleCL(HandleOpenCL, "clGetKernelInfo");
+      (clGetKernelInfoFcnTy *)getAPIHandleCL(Handle, "clGetKernelInfo");
 
   clCreateContextFcnPtr =
-      (clCreateContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateContext");
+      (clCreateContextFcnTy *)getAPIHandleCL(Handle, "clCreateContext");
 
   clCreateCommandQueueFcnPtr = (clCreateCommandQueueFcnTy *)getAPIHandleCL(
-      HandleOpenCL, "clCreateCommandQueue");
+      Handle, "clCreateCommandQueue");
 
   clCreateBufferFcnPtr =
-      (clCreateBufferFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateBuffer");
+      (clCreateBufferFcnTy *)getAPIHandleCL(Handle, "clCreateBuffer");
 
   clEnqueueWriteBufferFcnPtr = (clEnqueueWriteBufferFcnTy *)getAPIHandleCL(
-      HandleOpenCL, "clEnqueueWriteBuffer");
+      Handle, "clEnqueueWriteBuffer");
+
+  if (HandleOpenCLBeignet)
+    clCreateProgramWithLLVMIntelFcnPtr =
+        (clCreateProgramWithLLVMIntelFcnTy *)getAPIHandleCL(
+            Handle, "clCreateProgramWithLLVMIntel");
 
   clCreateProgramWithBinaryFcnPtr =
       (clCreateProgramWithBinaryFcnTy *)getAPIHandleCL(
-          HandleOpenCL, "clCreateProgramWithBinary");
+          Handle, "clCreateProgramWithBinary");
 
   clBuildProgramFcnPtr =
-      (clBuildProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clBuildProgram");
+      (clBuildProgramFcnTy *)getAPIHandleCL(Handle, "clBuildProgram");
 
   clCreateKernelFcnPtr =
-      (clCreateKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clCreateKernel");
+      (clCreateKernelFcnTy *)getAPIHandleCL(Handle, "clCreateKernel");
 
   clSetKernelArgFcnPtr =
-      (clSetKernelArgFcnTy *)getAPIHandleCL(HandleOpenCL, "clSetKernelArg");
+      (clSetKernelArgFcnTy *)getAPIHandleCL(Handle, "clSetKernelArg");
 
   clEnqueueNDRangeKernelFcnPtr = (clEnqueueNDRangeKernelFcnTy *)getAPIHandleCL(
-      HandleOpenCL, "clEnqueueNDRangeKernel");
+      Handle, "clEnqueueNDRangeKernel");
 
-  clEnqueueReadBufferFcnPtr = (clEnqueueReadBufferFcnTy *)getAPIHandleCL(
-      HandleOpenCL, "clEnqueueReadBuffer");
+  clEnqueueReadBufferFcnPtr =
+      (clEnqueueReadBufferFcnTy *)getAPIHandleCL(Handle, "clEnqueueReadBuffer");
 
-  clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(HandleOpenCL, "clFlush");
+  clFlushFcnPtr = (clFlushFcnTy *)getAPIHandleCL(Handle, "clFlush");
 
-  clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(HandleOpenCL, "clFinish");
+  clFinishFcnPtr = (clFinishFcnTy *)getAPIHandleCL(Handle, "clFinish");
 
   clReleaseKernelFcnPtr =
-      (clReleaseKernelFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseKernel");
+      (clReleaseKernelFcnTy *)getAPIHandleCL(Handle, "clReleaseKernel");
 
   clReleaseProgramFcnPtr =
-      (clReleaseProgramFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseProgram");
+      (clReleaseProgramFcnTy *)getAPIHandleCL(Handle, "clReleaseProgram");
 
-  clReleaseMemObjectFcnPtr = (clReleaseMemObjectFcnTy *)getAPIHandleCL(
-      HandleOpenCL, "clReleaseMemObject");
+  clReleaseMemObjectFcnPtr =
+      (clReleaseMemObjectFcnTy *)getAPIHandleCL(Handle, "clReleaseMemObject");
 
   clReleaseCommandQueueFcnPtr = (clReleaseCommandQueueFcnTy *)getAPIHandleCL(
-      HandleOpenCL, "clReleaseCommandQueue");
+      Handle, "clReleaseCommandQueue");
 
   clReleaseContextFcnPtr =
-      (clReleaseContextFcnTy *)getAPIHandleCL(HandleOpenCL, "clReleaseContext");
+      (clReleaseContextFcnTy *)getAPIHandleCL(Handle, "clReleaseContext");
 
   return 1;
 }
@@ -481,12 +502,32 @@ static PollyGPUFunction *getKernelCL(con
   }
 
   cl_int Ret;
-  size_t BinarySize = strlen(BinaryBuffer);
-  ((OpenCLKernel *)Function->Kernel)->Program = clCreateProgramWithBinaryFcnPtr(
-      ((OpenCLContext *)GlobalContext->Context)->Context, 1, &GlobalDeviceID,
-      (const size_t *)&BinarySize, (const unsigned char **)&BinaryBuffer, NULL,
-      &Ret);
-  checkOpenCLError(Ret, "Failed to create program from binary.\n");
+
+  if (HandleOpenCLBeignet) {
+    // TODO: This is a workaround, since clCreateProgramWithLLVMIntel only
+    // accepts a filename to a valid llvm-ir file as an argument, instead
+    // of accepting the BinaryBuffer directly.
+    FILE *fp = fopen("kernel.ll", "wb");
+    if (fp != NULL) {
+      fputs(BinaryBuffer, fp);
+      fclose(fp);
+    }
+
+    ((OpenCLKernel *)Function->Kernel)->Program =
+        clCreateProgramWithLLVMIntelFcnPtr(
+            ((OpenCLContext *)GlobalContext->Context)->Context, 1,
+            &GlobalDeviceID, "kernel.ll", &Ret);
+    checkOpenCLError(Ret, "Failed to create program from llvm.\n");
+    unlink("kernel.ll");
+  } else {
+    size_t BinarySize = strlen(BinaryBuffer);
+    ((OpenCLKernel *)Function->Kernel)->Program =
+        clCreateProgramWithBinaryFcnPtr(
+            ((OpenCLContext *)GlobalContext->Context)->Context, 1,
+            &GlobalDeviceID, (const size_t *)&BinarySize,
+            (const unsigned char **)&BinaryBuffer, NULL, &Ret);
+    checkOpenCLError(Ret, "Failed to create program from binary.\n");
+  }
 
   Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1,
                              &GlobalDeviceID, NULL, NULL, NULL);