[llvm-commits] [polly] r161239 - in /polly/trunk: ./ autoconf/ include/polly/ include/polly/CodeGen/ include/polly/Config/ lib/CodeGen/ test/ test/CodeGen/GPGPU/

Tobias Grosser grosser at fim.uni-passau.de
Fri Aug 3 05:50:07 PDT 2012


Author: grosser
Date: Fri Aug  3 07:50:07 2012
New Revision: 161239

URL: http://llvm.org/viewvc/llvm-project?rev=161239&view=rev
Log:
Add preliminary implementation for GPGPU code generation.

Translate the selected parallel loop body into a ptx string and run it with the
cuda driver API. We limit this preliminary implementation to target the
following special test cases:

  - Support only 2-dimensional parallel loops, optionally containing a single
    innermost non-parallel loop.
  - Support write memory access to only one array in a SCoP.

The patch was committed with smaller changes to the build system:

There is now a flag to enable GPU code generation explicitly. This was required
because the llvm.codegen() patch must be applied to the LLVM sources for this
feature to compile correctly. Also, enabling GPU code generation does not
require CUDA. This requirement was removed to allow 'make polly-test' runs,
even without an installed CUDA runtime.

Contributed by:  Yabin Hu  <yabin.hwu at gmail.com>

Added:
    polly/trunk/include/polly/CodeGen/PTXGenerator.h
    polly/trunk/lib/CodeGen/PTXGenerator.cpp
    polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.c
    polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.ll
    polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.c
    polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll
    polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop
    polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu
    polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop
    polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu
    polly/trunk/test/CodeGen/GPGPU/lit.local.cfg
Modified:
    polly/trunk/CMakeLists.txt
    polly/trunk/autoconf/configure.ac
    polly/trunk/configure
    polly/trunk/include/polly/Config/config.h.cmake
    polly/trunk/include/polly/Config/config.h.in
    polly/trunk/include/polly/ScopInfo.h
    polly/trunk/lib/CodeGen/CMakeLists.txt
    polly/trunk/lib/CodeGen/CodeGeneration.cpp
    polly/trunk/lib/CodeGen/Makefile
    polly/trunk/test/lit.site.cfg.in

Modified: polly/trunk/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/CMakeLists.txt?rev=161239&r1=161238&r2=161239&view=diff
==============================================================================
--- polly/trunk/CMakeLists.txt (original)
+++ polly/trunk/CMakeLists.txt Fri Aug  3 07:50:07 2012
@@ -75,7 +75,14 @@
 FIND_PACKAGE(Isl REQUIRED)
 FIND_PACKAGE(Gmp REQUIRED)
 FIND_PACKAGE(Pluto)
-FIND_PACKAGE(CUDA)
+
+option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature" OFF)
+if (POLLY_ENABLE_GPGPU_CODEGEN)
+  # Do not require CUDA, as GPU code generation test cases can be run without
+  # a cuda library.
+  FIND_PACKAGE(CUDA)
+  set(GPU_CODEGEN TRUE)
+endif(POLLY_ENABLE_GPGPU_CODEGEN)
 
 option(POLLY_ENABLE_OPENSCOP "Enable Openscop library for scop import/export" ON)
 if (POLLY_ENABLE_OPENSCOP)

Modified: polly/trunk/autoconf/configure.ac
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/autoconf/configure.ac?rev=161239&r1=161238&r2=161239&view=diff
==============================================================================
--- polly/trunk/autoconf/configure.ac (original)
+++ polly/trunk/autoconf/configure.ac Fri Aug  3 07:50:07 2012
@@ -120,7 +120,20 @@
 AC_SUBST(scoplib_rpath)
 
 dnl Check if CUDA lib there
+dnl Let the user enable GPU code generation explicitly (disabled by default).
+AC_ARG_ENABLE(polly_gpu_codegen,
+              AS_HELP_STRING([--enable-polly-gpu-codegen],
+                             [Enable GPU code generation in Polly(default is NO)]),,
+                             enableval=default)
+case "$enableval" in
+  yes) AC_DEFINE([GPU_CODEGEN],[1], [Define if gpu codegen is enabled]) ;;
+  no)  AC_DEFINE([GPU_CODEGEN],[0], [Define if gpu codegen is enabled]) ;;
+  default) AC_DEFINE([GPU_CODEGEN],[0],  [Define if gpu codegen is enabled]) ;;
+  *) AC_MSG_ERROR([Invalid setting for --enable-polly-gpu-codegen. Use "yes" or "no"]) ;;
+esac
+
 find_lib_and_headers([cuda], [cuda.h], [cuda])
+
 AS_IF([test "x$cuda_found" = "xyes"],
   [AC_DEFINE([CUDALIB_FOUND],[1],[Define if cudalib found])])
 

Modified: polly/trunk/configure
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/configure?rev=161239&r1=161238&r2=161239&view=diff
==============================================================================
--- polly/trunk/configure (original)
+++ polly/trunk/configure Fri Aug  3 07:50:07 2012
@@ -654,6 +654,7 @@
 with_pluto
 with_openscop
 with_scoplib
+enable_polly_gpu_codegen
 with_cuda
 '
       ac_precious_vars='build_alias
@@ -1272,6 +1273,13 @@
    esac
   cat <<\_ACEOF
 
+Optional Features:
+  --disable-option-checking  ignore unrecognized --enable/--with options
+  --disable-FEATURE       do not include FEATURE (same as --enable-FEATURE=no)
+  --enable-FEATURE[=ARG]  include FEATURE [ARG=yes]
+  --enable-polly-gpu-codegen
+                          Enable GPU code generation in Polly(default is NO)
+
 Optional Packages:
   --with-PACKAGE[=ARG]    use PACKAGE [ARG=yes]
   --without-PACKAGE       do not use PACKAGE (same as --with-PACKAGE=no)
@@ -3002,6 +3010,26 @@
 
 
 
+# Check whether --enable-polly_gpu_codegen was given.
+if test "${enable_polly_gpu_codegen+set}" = set; then :
+  enableval=$enable_polly_gpu_codegen;
+else
+  enableval=default
+fi
+
+case "$enableval" in
+  yes)
+$as_echo "#define GPU_CODEGEN 1" >>confdefs.h
+ ;;
+  no)
+$as_echo "#define GPU_CODEGEN 0" >>confdefs.h
+ ;;
+  default)
+$as_echo "#define GPU_CODEGEN 0" >>confdefs.h
+ ;;
+  *) as_fn_error $? "Invalid setting for --enable-polly-gpu-codegen. Use \"yes\" or \"no\"" "$LINENO" 5 ;;
+esac
+
 
   ac_ext=cpp
 ac_cpp='$CXXCPP $CPPFLAGS'
@@ -3081,6 +3109,7 @@
 ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
 
+
 if test "x$cuda_found" = "xyes"; then :
 
 $as_echo "#define CUDALIB_FOUND 1" >>confdefs.h

Added: polly/trunk/include/polly/CodeGen/PTXGenerator.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/CodeGen/PTXGenerator.h?rev=161239&view=auto
==============================================================================
--- polly/trunk/include/polly/CodeGen/PTXGenerator.h (added)
+++ polly/trunk/include/polly/CodeGen/PTXGenerator.h Fri Aug  3 07:50:07 2012
@@ -0,0 +1,197 @@
+//===- PTXGenerator.h - IR helper to create GPGPU LLVM-IR -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains functions to create GPGPU parallel loops as LLVM-IR.
+//
+//===----------------------------------------------------------------------===//
+#ifndef POLLY_CODEGEN_PTXGENERATOR_H
+#define POLLY_CODEGEN_PTXGENERATOR_H
+
+#include "polly/Config/config.h"
+
+#ifdef GPU_CODEGEN
+#include "llvm/IRBuilder.h"
+#include "llvm/ADT/SetVector.h"
+
+#include <map>
+
+namespace llvm {
+  class Value;
+  class Pass;
+  class BasicBlock;
+}
+
+namespace polly {
+using namespace llvm;
+
+class PTXGenerator {
+public:
+  typedef std::map<Value*, Value*> ValueToValueMapTy;
+
+  PTXGenerator(IRBuilder<> &Builder, Pass *P, const std::string &Triple);
+
+  /// @brief Create a GPGPU parallel loop.
+  ///
+  /// @param UsedValues   A set of LLVM-IR Values that should be available to
+  ///                     the new loop body.
+  /// @param OriginalIVS  The new values of the original induction variables.
+  /// @param VMap         This map is filled by createParallelLoop(). It
+  ///                     maps the values in UsedValues to Values through which
+  ///                     their content is available within the loop body.
+  /// @param LoopBody     A pointer to an iterator that is set to point to the
+  ///                     body of the created loop. It should be used to insert
+  ///                     instructions that form the actual loop body.
+  void startGeneration(SetVector<Value*> &UsedValues,
+                       SetVector<Value*> &OriginalIVS, ValueToValueMapTy &VMap,
+                       BasicBlock::iterator *LoopBody);
+
+  /// @brief Execute the post-operations to build a GPGPU parallel loop.
+  ///
+  void finishGeneration(Function *SubFunction);
+
+  /// @brief Set the parameters for launching PTX kernel.
+  ///
+  /// @param GridW    A value of the width of a GPU grid.
+  /// @param GridH    A value of the height of a GPU grid.
+  /// @param BlockW   A value of the width of a GPU block.
+  /// @param BlockH   A value of the height of a GPU block.
+  void setLaunchingParameters(int GridW, int GridH, int BlockW, int BlockH) {
+    GridWidth = GridW;
+    GridHeight = GridH;
+    BlockWidth = BlockW;
+    BlockHeight = BlockH;
+  }
+
+  /// @brief Set the size of the output array.
+  ///
+  /// This size is used to allocate memory on the device and the host.
+  ///
+  /// @param Bytes        Output array size in bytes.
+  void setOutputBytes(unsigned Bytes) {
+    OutputBytes = Bytes;
+  }
+
+private:
+  IRBuilder<> &Builder;
+  Pass *P;
+
+  /// @brief The target triple of the device.
+  const std::string &GPUTriple;
+
+  ///@brief Parameters used for launching PTX kernel.
+  int GridWidth, GridHeight, BlockWidth, BlockHeight;
+
+  /// @brief Size of the output array in bytes.
+  unsigned OutputBytes;
+
+  /// @brief Polly's GPU data types.
+  StructType *ContextTy, *ModuleTy, *KernelTy, *DeviceTy, *DevDataTy, *EventTy;
+
+  void InitializeGPUDataTypes();
+  IntegerType *getInt64Type();            // i64
+  PointerType *getI8PtrType();            // char *
+  PointerType *getPtrI8PtrType();         // char **
+  PointerType *getFloatPtrType();         // float *
+  PointerType *getGPUContextPtrType();    // %struct.PollyGPUContextT *
+  PointerType *getGPUModulePtrType();     // %struct.PollyGPUModuleT *
+  PointerType *getGPUDevicePtrType();     // %struct.PollyGPUDeviceT *
+  PointerType *getPtrGPUDevicePtrType();  // %struct.PollyGPUDevicePtrT *
+  PointerType *getGPUFunctionPtrType();   // %struct.PollyGPUFunctionT *
+  PointerType *getGPUEventPtrType();      // %struct.PollyGPUEventT *
+
+  Module *getModule();
+
+  /// @brief Create the kernel string containing LLVM IR.
+  ///
+  /// @param SubFunction  A pointer to the device code function.
+  /// @return             A global string variable containing the LLVM IR codes
+  //                      of the SubFunction.
+  Value *createPTXKernelFunction(Function *SubFunction);
+
+  /// @brief Get the entry name of the device kernel function.
+  ///
+  /// @param SubFunction  A pointer to the device code function.
+  /// @return             A global string variable containing the entry name of
+  ///                     the SubFunction.
+  Value *getPTXKernelEntryName(Function *SubFunction);
+
+  void createCallInitDevice(Value *Context, Value *Device);
+  void createCallGetPTXModule(Value *Buffer, Value *Module);
+  void createCallGetPTXKernelEntry(Value *Entry, Value *Module,
+                                   Value *Kernel);
+  void createCallAllocateMemoryForHostAndDevice(Value *HostData,
+                                                Value *DeviceData,
+                                                Value *Size);
+  void createCallCopyFromHostToDevice(Value *DeviceData, Value *HostData,
+                                      Value *Size);
+  void createCallCopyFromDeviceToHost(Value *HostData, Value *DeviceData,
+                                      Value *Size);
+  void createCallSetKernelParameters(Value *Kernel, Value *BlockWidth,
+                                     Value *BlockHeight, Value *DeviceData);
+  void createCallLaunchKernel(Value *Kernel, Value *GridWidth,
+                              Value *GridHeight);
+  void createCallStartTimerByCudaEvent(Value *StartEvent,
+                                       Value *StopEvent);
+  void createCallStopTimerByCudaEvent(Value *StartEvent, Value *StopEvent,
+                                      Value *Timer);
+  void createCallCleanupGPGPUResources(Value *HostData, Value *DeviceData,
+                                       Value *Module, Value *Context,
+                                       Value *Kernel);
+
+  /// @brief Create the CUDA subfunction.
+  ///
+  /// @param UsedValues   A set of LLVM-IR Values that should be available to
+  ///                     the new loop body.
+  /// @param VMap         This map that is filled by createSubfunction(). It
+  ///                     maps the values in UsedValues to Values through which
+  ///                     their content is available within the loop body.
+  /// @param OriginalIVS  The new values of the original induction variables.
+  /// @param SubFunction  The newly created SubFunction is returned here.
+  void createSubfunction(SetVector<Value*> &UsedValues,
+                         SetVector<Value*> &OriginalIVS,
+                         ValueToValueMapTy &VMap,
+                         Function **SubFunction);
+
+  /// @brief Create the definition of the CUDA subfunction.
+  ///
+  /// @param NumArgs      The number of parameters of this subfunction. This is
+  ///                     usually set to the number of memory accesses which
+  ///                     will be copied from host to device.
+  Function *createSubfunctionDefinition(int NumArgs);
+
+  /// @brief Extract all the ptx related subfunctions into a new module.
+  ///
+  /// @param M            Current module.
+  /// @return             The generated module containing only gpu related
+  ///                     subfunctions.
+  Module *extractPTXFunctionsFromModule(const Module *M);
+
+  /// @brief Get the Value of CUDA block width.
+  Value *getCUDABlockWidth();
+
+  /// @brief Get the Value of CUDA block height.
+  Value *getCUDABlockHeight();
+
+  /// @brief Get the Value of CUDA grid width.
+  Value *getCUDAGridWidth();
+
+  /// @brief Get the Value of CUDA grid height.
+  Value *getCUDAGridHeight();
+
+  /// @brief Get the Value of the bytes of the output array.
+  Value *getOutputArraySizeInBytes();
+
+  /// @brief Erase the ptx-related subfunctions and declarations.
+  ///
+  /// @param SubFunction  A pointer to the device code function.
+  void eraseUnusedFunctions(Function *SubFunction);
+};
+} // end namespace polly
+#endif /* GPU_CODEGEN */
+#endif /* POLLY_CODEGEN_PTXGENERATOR_H */

Modified: polly/trunk/include/polly/Config/config.h.cmake
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/Config/config.h.cmake?rev=161239&r1=161238&r2=161239&view=diff
==============================================================================
--- polly/trunk/include/polly/Config/config.h.cmake (original)
+++ polly/trunk/include/polly/Config/config.h.cmake Fri Aug  3 07:50:07 2012
@@ -19,5 +19,6 @@
 #cmakedefine PLUTO_FOUND
 #cmakedefine SCOPLIB_FOUND
 #cmakedefine CUDALIB_FOUND
+#cmakedefine GPU_CODEGEN
 
 #endif

Modified: polly/trunk/include/polly/Config/config.h.in
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/Config/config.h.in?rev=161239&r1=161238&r2=161239&view=diff
==============================================================================
--- polly/trunk/include/polly/Config/config.h.in (original)
+++ polly/trunk/include/polly/Config/config.h.in Fri Aug  3 07:50:07 2012
@@ -9,6 +9,9 @@
 /* Define if cudalib found */
 #undef CUDALIB_FOUND
 
+/* Define if gpu codegen is enabled */
+#undef GPU_CODEGEN
+
 /* Define if ISL has a code generator */
 #undef ISL_CODEGEN_FOUND
 

Modified: polly/trunk/include/polly/ScopInfo.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/ScopInfo.h?rev=161239&r1=161238&r2=161239&view=diff
==============================================================================
--- polly/trunk/include/polly/ScopInfo.h (original)
+++ polly/trunk/include/polly/ScopInfo.h Fri Aug  3 07:50:07 2012
@@ -125,6 +125,9 @@
   /// @brief Is this a read memory access?
   bool isRead() const { return Type == MemoryAccess::Read; }
 
+  /// @brief Is this a write memory access?
+  bool isWrite() const { return Type == MemoryAccess::Write; }
+
   isl_map *getAccessRelation() const;
 
   /// @brief Get an isl string representing this access function.

Modified: polly/trunk/lib/CodeGen/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/CMakeLists.txt?rev=161239&r1=161238&r2=161239&view=diff
==============================================================================
--- polly/trunk/lib/CodeGen/CMakeLists.txt (original)
+++ polly/trunk/lib/CodeGen/CMakeLists.txt Fri Aug  3 07:50:07 2012
@@ -9,10 +9,16 @@
       IslCodeGeneration.cpp)
 endif (ISL_CODEGEN_FOUND)
 
+if (GPU_CODEGEN)
+  set (GPGPU_CODEGEN_FILES
+       PTXGenerator.cpp)
+endif (GPU_CODEGEN)
+
 add_polly_library(PollyCodeGen
   BlockGenerators.cpp
   ${CLOOG_FILES}
   ${ISL_CODEGEN_FILES}
   LoopGenerators.cpp
   Utils.cpp
+  ${GPGPU_CODEGEN_FILES}
 )

Modified: polly/trunk/lib/CodeGen/CodeGeneration.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/CodeGeneration.cpp?rev=161239&r1=161238&r2=161239&view=diff
==============================================================================
--- polly/trunk/lib/CodeGen/CodeGeneration.cpp (original)
+++ polly/trunk/lib/CodeGen/CodeGeneration.cpp Fri Aug  3 07:50:07 2012
@@ -31,6 +31,7 @@
 #include "polly/CodeGen/CodeGeneration.h"
 #include "polly/CodeGen/BlockGenerators.h"
 #include "polly/CodeGen/LoopGenerators.h"
+#include "polly/CodeGen/PTXGenerator.h"
 #include "polly/CodeGen/Utils.h"
 #include "polly/Support/GICHelper.h"
 
@@ -65,6 +66,19 @@
        cl::value_desc("OpenMP code generation enabled if true"),
        cl::init(false), cl::ZeroOrMore);
 
+#ifdef GPU_CODEGEN
+static cl::opt<bool>
+GPGPU("enable-polly-gpgpu",
+       cl::desc("Generate GPU parallel code"), cl::Hidden,
+       cl::value_desc("GPGPU code generation enabled if true"),
+       cl::init(false), cl::ZeroOrMore);
+
+static cl::opt<std::string>
+GPUTriple("polly-gpgpu-triple",
+       cl::desc("Target triple for GPU code generation"),
+       cl::Hidden, cl::init(""));
+#endif /* GPU_CODEGEN */
+
 static cl::opt<bool>
 AtLeastOnce("enable-polly-atLeastOnce",
        cl::desc("Give polly the hint, that every loop is executed at least"
@@ -284,6 +298,27 @@
   /// statement.
   void codegenForOpenMP(const clast_for *f);
 
+#ifdef GPU_CODEGEN
+  /// @brief Create GPGPU device memory access values.
+  ///
+  /// Create a list of values that will be set to be parameters of the GPGPU
+  /// subfunction. These parameters represent device memory base addresses
+  /// and the size in bytes.
+  SetVector<Value*> getGPUValues(unsigned &OutputBytes);
+
+  /// @brief Create a GPU parallel for loop.
+  ///
+  /// This loop reflects a loop as if it would have been created by a GPU
+  /// statement.
+  void codegenForGPGPU(const clast_for *F);
+
+  /// @brief Get innermost for loop.
+  const clast_stmt *getScheduleInfo(const clast_for *F,
+                                    std::vector<int> &NumIters,
+                                    unsigned &LoopDepth,
+                                    unsigned &NonPLoopDepth);
+#endif /* GPU_CODEGEN */
+
   /// @brief Check if a loop is parallel
   ///
   /// Detect if a clast_for loop can be executed in parallel.
@@ -530,6 +565,163 @@
   Builder.SetInsertPoint(AfterLoop);
 }
 
+#ifdef GPU_CODEGEN
+static unsigned getArraySizeInBytes(const ArrayType *AT) {
+  unsigned Bytes = AT->getNumElements();
+  if (const ArrayType *T = dyn_cast<ArrayType>(AT->getElementType()))
+    Bytes *= getArraySizeInBytes(T);
+  else
+    Bytes *= AT->getElementType()->getPrimitiveSizeInBits() / 8;
+
+  return Bytes;
+}
+
+SetVector<Value*> ClastStmtCodeGen::getGPUValues(unsigned &OutputBytes) {
+  SetVector<Value*> Values;
+  OutputBytes = 0;
+
+  // Record the memory reference base addresses.
+  for (Scop::iterator SI = S->begin(), SE = S->end(); SI != SE; ++SI) {
+    ScopStmt *Stmt = *SI;
+    for (SmallVector<MemoryAccess*, 8>::iterator I = Stmt->memacc_begin(),
+         E = Stmt->memacc_end(); I != E; ++I) {
+      Value *BaseAddr = const_cast<Value*>((*I)->getBaseAddr());
+      Values.insert((BaseAddr));
+
+      // FIXME: we assume that there is one and only one array to be written
+      // in a SCoP.
+      int NumWrites = 0;
+      if ((*I)->isWrite()) {
+        ++NumWrites;
+        assert(NumWrites <= 1 &&
+               "We support at most one array to be written in a SCoP.");
+        if (const PointerType * PT =
+            dyn_cast<PointerType>(BaseAddr->getType())) {
+          Type *T = PT->getArrayElementType();
+          const ArrayType *ATy = dyn_cast<ArrayType>(T);
+          OutputBytes = getArraySizeInBytes(ATy);
+        }
+      }
+    }
+  }
+
+  return Values;
+}
+
+const clast_stmt *ClastStmtCodeGen::getScheduleInfo(const clast_for *F,
+                                                    std::vector<int> &NumIters,
+                                                    unsigned &LoopDepth,
+                                                    unsigned &NonPLoopDepth) {
+  clast_stmt *Stmt = (clast_stmt *)F;
+  const clast_for *Result;
+  bool NonParaFlag = false;
+  LoopDepth = 0;
+  NonPLoopDepth = 0;
+
+  while (Stmt) {
+    if (CLAST_STMT_IS_A(Stmt, stmt_for)) {
+      const clast_for *T = (clast_for *) Stmt;
+      if (isParallelFor(T)) {
+        if (!NonParaFlag) {
+          NumIters.push_back(getNumberOfIterations(T));
+          Result = T;
+        }
+      } else
+        NonParaFlag = true;
+
+      Stmt = T->body;
+      LoopDepth++;
+      continue;
+    }
+    Stmt = Stmt->next;
+  }
+
+  assert(NumIters.size() == 4 &&
+         "The loops should be tiled into 4-depth parallel loops and an "
+         "innermost non-parallel one (if exist).");
+  NonPLoopDepth = LoopDepth - NumIters.size();
+  assert(NonPLoopDepth <= 1
+         && "We support only one innermost non-parallel loop currently.");
+  return (const clast_stmt *)Result->body;
+}
+
+void ClastStmtCodeGen::codegenForGPGPU(const clast_for *F) {
+  BasicBlock::iterator LoopBody;
+  SetVector<Value *> Values;
+  SetVector<Value *> IVS;
+  std::vector<int> NumIterations;
+  PTXGenerator::ValueToValueMapTy VMap;
+
+  assert(!GPUTriple.empty()
+         && "Target triple should be set properly for GPGPU code generation.");
+  PTXGenerator PTXGen(Builder, P, GPUTriple);
+
+  // Get original IVS and ScopStmt
+  unsigned TiledLoopDepth, NonPLoopDepth;
+  const clast_stmt *InnerStmt = getScheduleInfo(F, NumIterations,
+                                                TiledLoopDepth, NonPLoopDepth);
+  const clast_stmt *TmpStmt;
+  const clast_user_stmt *U;
+  const clast_for *InnerFor;
+  if (CLAST_STMT_IS_A(InnerStmt, stmt_for)) {
+    InnerFor = (const clast_for *)InnerStmt;
+    TmpStmt = InnerFor->body;
+  } else
+    TmpStmt = InnerStmt;
+  U = (const clast_user_stmt *) TmpStmt;
+  ScopStmt *Statement = (ScopStmt *) U->statement->usr;
+  for (unsigned i = 0; i < Statement->getNumIterators() - NonPLoopDepth; i++) {
+    const Value* IV = Statement->getInductionVariableForDimension(i);
+    IVS.insert(const_cast<Value *>(IV));
+  }
+
+  unsigned OutBytes;
+  Values = getGPUValues(OutBytes);
+  PTXGen.setOutputBytes(OutBytes);
+  PTXGen.startGeneration(Values, IVS, VMap, &LoopBody);
+
+  BasicBlock::iterator AfterLoop = Builder.GetInsertPoint();
+  Builder.SetInsertPoint(LoopBody);
+
+  BasicBlock *AfterBB = 0;
+  if (NonPLoopDepth) {
+    Value *LowerBound, *UpperBound, *IV, *Stride;
+    Type *IntPtrTy = getIntPtrTy();
+    LowerBound = ExpGen.codegen(InnerFor->LB, IntPtrTy);
+    UpperBound = ExpGen.codegen(InnerFor->UB, IntPtrTy);
+    Stride = Builder.getInt(APInt_from_MPZ(InnerFor->stride));
+    IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB);
+    const Value *OldIV_ = Statement->getInductionVariableForDimension(2);
+    Value *OldIV = const_cast<Value *>(OldIV_);
+    VMap.insert(std::make_pair<Value*, Value*>(OldIV, IV));
+  }
+
+  updateWithValueMap(VMap, /* reverse */ false);
+  BlockGenerator::generate(Builder, *Statement, ValueMap, P);
+  updateWithValueMap(VMap, /* reverse */ true);
+
+  if (AfterBB)
+    Builder.SetInsertPoint(AfterBB->begin());
+
+  // FIXME: The replacement of the host base address with the parameter of ptx
+  // subfunction should have been done by updateWithValueMap. We use the
+  // following codes to avoid affecting other parts of Polly. This should be
+  // fixed later.
+  Function *FN = Builder.GetInsertBlock()->getParent();
+  for (unsigned j = 0; j < Values.size(); j++) {
+    Value *baseAddr = Values[j];
+    for (Function::iterator B = FN->begin(); B != FN->end(); ++B) {
+      for (BasicBlock::iterator I = B->begin(); I != B->end(); ++I)
+        I->replaceUsesOfWith(baseAddr, ValueMap[baseAddr]);
+    }
+  }
+  Builder.SetInsertPoint(AfterLoop);
+  PTXGen.setLaunchingParameters(NumIterations[0], NumIterations[1],
+                                NumIterations[2], NumIterations[3]);
+  PTXGen.finishGeneration(FN);
+}
+#endif
+
 bool ClastStmtCodeGen::isInnermostLoop(const clast_for *f) {
   const clast_stmt *stmt = f->body;
 
@@ -647,6 +839,18 @@
     }
   }
 
+#ifdef GPU_CODEGEN
+  if (GPGPU && isParallelFor(f)) {
+    if (!parallelCodeGeneration) {
+      parallelCodeGeneration = true;
+      parallelLoops.push_back(f->iterator);
+      codegenForGPGPU(f);
+      parallelCodeGeneration = false;
+      return;
+    }
+  }
+#endif
+
   codegenForSequential(f);
 }
 

Modified: polly/trunk/lib/CodeGen/Makefile
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/Makefile?rev=161239&r1=161238&r2=161239&view=diff
==============================================================================
--- polly/trunk/lib/CodeGen/Makefile (original)
+++ polly/trunk/lib/CodeGen/Makefile Fri Aug  3 07:50:07 2012
@@ -10,6 +10,8 @@
 
 CPP.Flags += $(POLLY_INC)
 
+include $(LEVEL)/Makefile.config
+
 #
 # Include Makefile.common so we know what to do.
 #

Added: polly/trunk/lib/CodeGen/PTXGenerator.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/PTXGenerator.cpp?rev=161239&view=auto
==============================================================================
--- polly/trunk/lib/CodeGen/PTXGenerator.cpp (added)
+++ polly/trunk/lib/CodeGen/PTXGenerator.cpp Fri Aug  3 07:50:07 2012
@@ -0,0 +1,663 @@
+//===------ PTXGenerator.cpp - Create GPU parallel code as LLVM-IR --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains functions to create GPU parallel codes as LLVM-IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "polly/CodeGen/PTXGenerator.h"
+
+#ifdef GPU_CODEGEN
+#include "polly/ScopDetection.h"
+#include "polly/ScopInfo.h"
+
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+using namespace polly;
+
+// Construct a PTX generator that emits code through Builder on behalf of pass
+// P and targets the GPU triple Triple. Grid and block dimensions start at 1
+// and the output size at 0 bytes. InitializeGPUDataTypes() goes through
+// getModule(), so Builder must already have an insertion block.
+PTXGenerator::PTXGenerator(IRBuilder<> &Builder, Pass *P,
+                           const std::string &Triple):
+  Builder(Builder), P(P), GPUTriple(Triple), GridWidth(1), GridHeight(1),
+  BlockWidth(1), BlockHeight(1), OutputBytes(0) {
+
+  InitializeGPUDataTypes();
+}
+
+// Return the module that owns the builder's current insertion block.
+Module *PTXGenerator::getModule() {
+  // Walk up the hierarchy: basic block -> function -> module.
+  BasicBlock *InsertBB = Builder.GetInsertBlock();
+  Function *ParentFn = InsertBB->getParent();
+  return ParentFn->getParent();
+}
+
+// Create the (still empty) kernel function "<host function>_ptx_subfn" with
+// NumArgs i8* parameters, internal linkage and the PTX kernel calling
+// convention. The function is marked invalid for scop detection.
+Function *PTXGenerator::createSubfunctionDefinition(int NumArgs) {
+  assert(NumArgs == 1 && "we support only one array access now.");
+
+  // Every array is handed to the kernel as an untyped i8* argument.
+  std::vector<Type*> ParamTypes(NumArgs, Builder.getInt8PtrTy());
+  FunctionType *SubFnTy =
+    FunctionType::get(Builder.getVoidTy(), ParamTypes, false);
+
+  Function *HostFn = Builder.GetInsertBlock()->getParent();
+  Function *SubFn = Function::Create(SubFnTy, Function::InternalLinkage,
+                                     HostFn->getName() + "_ptx_subfn",
+                                     getModule());
+  SubFn->setCallingConv(CallingConv::PTX_Kernel);
+
+  // Do not run any optimization pass on the new function.
+  P->getAnalysis<polly::ScopDetection>().markFunctionAsInvalid(SubFn);
+
+  Function::arg_iterator Arg = SubFn->arg_begin(), ArgEnd = SubFn->arg_end();
+  while (Arg != ArgEnd) {
+    Arg->setName("ptx.Array");
+    ++Arg;
+  }
+
+  return SubFn;
+}
+
+// Build the skeleton of the PTX kernel function.
+//
+// UsedValues   Host base addresses of the arrays accessed in the loop nest;
+//              one kernel argument is created for each of them.
+// OriginalIVS  Induction variables of the parallel loops (1 to 4 entries).
+// VMap         Receives the mappings "host base address -> kernel argument"
+//              and "original induction variable -> GPU block/thread index".
+// SubFunction  Out-parameter set to the created kernel function.
+//
+// On return the builder is positioned in front of the terminator of the
+// "ptx.loop_body" block, so the caller can emit the loop body there. If the
+// number of induction variables is unsupported the function asserts; in
+// builds without assertions it returns early and leaves *SubFunction
+// untouched.
+void PTXGenerator::createSubfunction(SetVector<Value*> &UsedValues,
+                                     SetVector<Value*> &OriginalIVS,
+                                     PTXGenerator::ValueToValueMapTy &VMap,
+                                     Function **SubFunction) {
+  Function *FN = createSubfunctionDefinition(UsedValues.size());
+  Module *M = getModule();
+  LLVMContext &Context = FN->getContext();
+  IntegerType *Ty = Builder.getInt64Ty();
+
+  // Store the previous basic block.
+  BasicBlock *PrevBB = Builder.GetInsertBlock();
+
+  // Create basic blocks.
+  BasicBlock *HeaderBB = BasicBlock::Create(Context, "ptx.setup", FN);
+  BasicBlock *ExitBB = BasicBlock::Create(Context, "ptx.exit", FN);
+  BasicBlock *BodyBB = BasicBlock::Create(Context, "ptx.loop_body", FN);
+
+  // NOTE(review): the new blocks belong to FN, yet they are registered in the
+  // dominator tree obtained from the pass - confirm this is intended.
+  DominatorTree &DT = P->getAnalysis<DominatorTree>();
+  DT.addNewBlock(HeaderBB, PrevBB);
+  DT.addNewBlock(ExitBB, HeaderBB);
+  DT.addNewBlock(BodyBB, HeaderBB);
+
+  Builder.SetInsertPoint(HeaderBB);
+
+  // Insert VMap items with maps of array base address on the host to base
+  // address on the device.
+  Function::arg_iterator AI = FN->arg_begin();
+  for (unsigned j = 0; j < UsedValues.size(); j++) {
+    Value *BaseAddr = UsedValues[j];
+    Type *ArrayTy = BaseAddr->getType();
+    Value *Param = Builder.CreateBitCast(AI, ArrayTy);
+    // Let make_pair deduce its types: explicit template arguments turn the
+    // parameters into rvalue references under C++11 and reject lvalues.
+    VMap.insert(std::make_pair(BaseAddr, Param));
+    ++AI;
+  }
+
+  // FIXME: These intrinsics should be inserted on-demand. However, we insert
+  // them all currently for simplicity.
+  Function *GetNctaidX =
+    Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_x);
+  Function *GetNctaidY =
+    Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_y);
+  Function *GetCtaidX =
+    Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_x);
+  Function *GetCtaidY =
+    Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_y);
+  Function *GetNtidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_x);
+  Function *GetNtidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_y);
+  Function *GetTidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_x);
+  Function *GetTidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_y);
+
+  // Read the launch geometry and the block/thread ids; every value is
+  // widened to i64 as an unsigned quantity (isSigned == false).
+  Value *GridWidth = Builder.CreateCall(GetNctaidX);
+  GridWidth = Builder.CreateIntCast(GridWidth, Ty, false);
+  Value *GridHeight = Builder.CreateCall(GetNctaidY);
+  GridHeight = Builder.CreateIntCast(GridHeight, Ty, false);
+  Value *BlockWidth = Builder.CreateCall(GetNtidX);
+  BlockWidth = Builder.CreateIntCast(BlockWidth, Ty, false);
+  Value *BlockHeight = Builder.CreateCall(GetNtidY);
+  BlockHeight = Builder.CreateIntCast(BlockHeight, Ty, false);
+  Value *BIDx = Builder.CreateCall(GetCtaidX);
+  BIDx = Builder.CreateIntCast(BIDx, Ty, false);
+  Value *BIDy = Builder.CreateCall(GetCtaidY);
+  BIDy = Builder.CreateIntCast(BIDy, Ty, false);
+  Value *TIDx = Builder.CreateCall(GetTidX);
+  TIDx = Builder.CreateIntCast(TIDx, Ty, false);
+  Value *TIDy = Builder.CreateCall(GetTidY);
+  TIDy = Builder.CreateIntCast(TIDy, Ty, false);
+
+  Builder.CreateBr(BodyBB);
+  Builder.SetInsertPoint(BodyBB);
+
+  // Replace each original induction variable by an expression over the
+  // block and thread ids. The mapping depends on the loop depth.
+  unsigned NumDims = OriginalIVS.size();
+  std::vector<Value *> Substitutions;
+  Value *BlockID, *ThreadID;
+  switch (NumDims) {
+  case 1: {
+    // One loop: linearize (block, thread) into a single global index.
+    Value *BlockSize = Builder.CreateMul(BlockWidth, BlockHeight,
+                                         "p_gpu_blocksize");
+    BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
+    BlockID = Builder.CreateAdd(BlockID, BIDx);
+    BlockID = Builder.CreateMul(BlockID, BlockSize);
+    ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j");
+    ThreadID = Builder.CreateAdd(ThreadID, TIDx);
+    ThreadID = Builder.CreateAdd(ThreadID, BlockID);
+    Substitutions.push_back(ThreadID);
+    break;
+  }
+  case 2: {
+    // Two loops: linear block id for the outer, linear thread id for the
+    // inner loop.
+    BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
+    BlockID = Builder.CreateAdd(BlockID, BIDx);
+    Substitutions.push_back(BlockID);
+    ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j");
+    ThreadID = Builder.CreateAdd(ThreadID, TIDx);
+    Substitutions.push_back(ThreadID);
+    break;
+  }
+  case 3: {
+    // Three loops: linear block id, then thread y, then thread x.
+    BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
+    BlockID = Builder.CreateAdd(BlockID, BIDx);
+    Substitutions.push_back(BlockID);
+    Substitutions.push_back(TIDy);
+    Substitutions.push_back(TIDx);
+    break;
+  }
+  case 4: {
+    // Four loops: block y, block x, thread y, thread x.
+    Substitutions.push_back(BIDy);
+    Substitutions.push_back(BIDx);
+    Substitutions.push_back(TIDy);
+    Substitutions.push_back(TIDx);
+    break;
+  }
+  default:
+    // assert(false && ...) so that the check actually fires; the previous
+    // assert(true && ...) could never trigger.
+    assert(false &&
+           "We can only transform parallel loops with a depth of 1 to 4.");
+    return;
+  }
+
+  assert(OriginalIVS.size() == Substitutions.size()
+         && "The size of IVS should be equal to the size of substitutions.");
+  for (unsigned i = 0; i < OriginalIVS.size(); ++i)
+    VMap.insert(std::make_pair(OriginalIVS[i], Substitutions[i]));
+
+  // Terminate the body block, then step back onto the branch so that code
+  // emitted at LoopBody lands in front of the terminator.
+  Builder.CreateBr(ExitBB);
+  Builder.SetInsertPoint(--Builder.GetInsertPoint());
+  BasicBlock::iterator LoopBody = Builder.GetInsertPoint();
+
+  // Add the termination of the ptx-device subfunction.
+  Builder.SetInsertPoint(ExitBB);
+  Builder.CreateRetVoid();
+
+  Builder.SetInsertPoint(LoopBody);
+  *SubFunction = FN;
+}
+
+// Create the kernel skeleton and report, through LoopBody, the position at
+// which the kernel body has to be emitted. The builder's own insertion point
+// is restored to where it was on entry.
+void PTXGenerator::startGeneration(SetVector<Value*> &UsedValues,
+                                   SetVector<Value*> &OriginalIVS,
+                                   ValueToValueMapTy &VMap,
+                                   BasicBlock::iterator *LoopBody) {
+  BasicBlock::iterator SavedInsertPoint = Builder.GetInsertPoint();
+
+  // The created function itself is not needed here, only the insert point
+  // that createSubfunction leaves behind.
+  Function *Kernel;
+  createSubfunction(UsedValues, OriginalIVS, VMap, &Kernel);
+  *LoopBody = Builder.GetInsertPoint();
+
+  Builder.SetInsertPoint(SavedInsertPoint);
+}
+
+// The i64 type of the builder's context.
+IntegerType *PTXGenerator::getInt64Type() {
+  IntegerType *Int64Ty = Builder.getInt64Ty();
+  return Int64Ty;
+}
+
+// The i8* type (default address space).
+PointerType *PTXGenerator::getI8PtrType() {
+  return Builder.getInt8PtrTy();
+}
+
+// The i8** type.
+PointerType *PTXGenerator::getPtrI8PtrType() {
+  PointerType *I8PtrTy = getI8PtrType();
+  return PointerType::getUnqual(I8PtrTy);
+}
+
+// The float* type.
+PointerType *PTXGenerator::getFloatPtrType() {
+  LLVMContext &Ctx = getModule()->getContext();
+  return llvm::Type::getFloatPtrTy(Ctx);
+}
+
+// Pointer to the opaque runtime context struct (ContextTy).
+PointerType *PTXGenerator::getGPUContextPtrType() {
+  PointerType *ContextPtrTy = PointerType::getUnqual(ContextTy);
+  return ContextPtrTy;
+}
+
+// Pointer to the opaque runtime module struct (ModuleTy).
+PointerType *PTXGenerator::getGPUModulePtrType() {
+  PointerType *ModulePtrTy = PointerType::getUnqual(ModuleTy);
+  return ModulePtrTy;
+}
+
+// Pointer to the opaque runtime device struct (DeviceTy).
+PointerType *PTXGenerator::getGPUDevicePtrType() {
+  PointerType *DevicePtrTy = PointerType::getUnqual(DeviceTy);
+  return DevicePtrTy;
+}
+
+// Pointer to the opaque device-data struct (DevDataTy).
+PointerType *PTXGenerator::getPtrGPUDevicePtrType() {
+  PointerType *DevDataPtrTy = PointerType::getUnqual(DevDataTy);
+  return DevDataPtrTy;
+}
+
+// Pointer to the opaque runtime kernel struct (KernelTy).
+PointerType *PTXGenerator::getGPUFunctionPtrType() {
+  PointerType *KernelPtrTy = PointerType::getUnqual(KernelTy);
+  return KernelPtrTy;
+}
+
+// Pointer to the opaque runtime event struct (EventTy).
+PointerType *PTXGenerator::getGPUEventPtrType() {
+  PointerType *EventPtrTy = PointerType::getUnqual(EventTy);
+  return EventPtrTy;
+}
+
+// Create the named struct types that stand for the runtime library's handle
+// types. No body is given to any of them here.
+void PTXGenerator::InitializeGPUDataTypes() {
+  LLVMContext &Ctx = getModule()->getContext();
+
+  ContextTy = StructType::create(Ctx, "struct.PollyGPUContextT");
+  ModuleTy  = StructType::create(Ctx, "struct.PollyGPUModuleT");
+  KernelTy  = StructType::create(Ctx, "struct.PollyGPUFunctionT");
+  DeviceTy  = StructType::create(Ctx, "struct.PollyGPUDeviceT");
+  DevDataTy = StructType::create(Ctx, "struct.PollyGPUDevicePtrT");
+  EventTy   = StructType::create(Ctx, "struct.PollyGPUEventT");
+}
+
+// Emit a call to the runtime function polly_initDevice(Context, Device),
+// declaring it in the current module on first use.
+void PTXGenerator::createCallInitDevice(Value *Context, Value *Device) {
+  const char *Name = "polly_initDevice";
+  Module *M = getModule();
+  Function *Callee = M->getFunction(Name);
+
+  // Declare void polly_initDevice(context**, device**) if it is missing.
+  if (!Callee) {
+    std::vector<Type*> ParamTys;
+    ParamTys.push_back(PointerType::getUnqual(getGPUContextPtrType()));
+    ParamTys.push_back(PointerType::getUnqual(getGPUDevicePtrType()));
+    FunctionType *CalleeTy =
+      FunctionType::get(Builder.getVoidTy(), ParamTys, false);
+    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, Name, M);
+  }
+
+  Builder.CreateCall2(Callee, Context, Device);
+}
+
+// Emit a call to the runtime function polly_getPTXModule(Buffer, Module),
+// declaring it in the current module on first use.
+void PTXGenerator::createCallGetPTXModule(Value *Buffer, Value *Module) {
+  const char *Name = "polly_getPTXModule";
+  llvm::Module *M = getModule();
+  Function *Callee = M->getFunction(Name);
+
+  // Declare void polly_getPTXModule(i8*, module**) if it is missing.
+  if (!Callee) {
+    std::vector<Type*> ParamTys;
+    ParamTys.push_back(getI8PtrType());
+    ParamTys.push_back(PointerType::getUnqual(getGPUModulePtrType()));
+    FunctionType *CalleeTy =
+      FunctionType::get(Builder.getVoidTy(), ParamTys, false);
+    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, Name, M);
+  }
+
+  Builder.CreateCall2(Callee, Buffer, Module);
+}
+
+// Emit a call to the runtime function
+// polly_getPTXKernelEntry(Entry, Module, Kernel), declaring it in the current
+// module on first use.
+void PTXGenerator::createCallGetPTXKernelEntry(Value *Entry, Value *Module,
+                                               Value *Kernel) {
+  const char *Name = "polly_getPTXKernelEntry";
+  llvm::Module *M = getModule();
+  Function *Callee = M->getFunction(Name);
+
+  // Declare void polly_getPTXKernelEntry(i8*, module*, kernel**) if missing.
+  if (!Callee) {
+    std::vector<Type*> ParamTys;
+    ParamTys.push_back(getI8PtrType());
+    ParamTys.push_back(getGPUModulePtrType());
+    ParamTys.push_back(PointerType::getUnqual(getGPUFunctionPtrType()));
+    FunctionType *CalleeTy =
+      FunctionType::get(Builder.getVoidTy(), ParamTys, false);
+    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, Name, M);
+  }
+
+  Builder.CreateCall3(Callee, Entry, Module, Kernel);
+}
+
+// Emit a call to the runtime function
+// polly_allocateMemoryForHostAndDevice(HostData, DeviceData, Size), declaring
+// it in the current module on first use.
+void PTXGenerator::createCallAllocateMemoryForHostAndDevice(Value *HostData,
+                                                            Value *DeviceData,
+                                                            Value *Size) {
+  const char *Name = "polly_allocateMemoryForHostAndDevice";
+  Module *M = getModule();
+  Function *Callee = M->getFunction(Name);
+
+  // Declare void <Name>(i8**, devdata**, i64) if it is missing.
+  if (!Callee) {
+    std::vector<Type*> ParamTys;
+    ParamTys.push_back(getPtrI8PtrType());
+    ParamTys.push_back(PointerType::getUnqual(getPtrGPUDevicePtrType()));
+    ParamTys.push_back(getInt64Type());
+    FunctionType *CalleeTy =
+      FunctionType::get(Builder.getVoidTy(), ParamTys, false);
+    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, Name, M);
+  }
+
+  Builder.CreateCall3(Callee, HostData, DeviceData, Size);
+}
+
+// Emit a call to the runtime function
+// polly_copyFromHostToDevice(DeviceData, HostData, Size), declaring it in the
+// current module on first use.
+void PTXGenerator::createCallCopyFromHostToDevice(Value *DeviceData,
+                                                  Value *HostData,
+                                                  Value *Size) {
+  const char *Name = "polly_copyFromHostToDevice";
+  Module *M = getModule();
+  Function *Callee = M->getFunction(Name);
+
+  // Declare void polly_copyFromHostToDevice(devdata*, i8*, i64) if missing.
+  if (!Callee) {
+    std::vector<Type*> ParamTys;
+    ParamTys.push_back(getPtrGPUDevicePtrType());
+    ParamTys.push_back(getI8PtrType());
+    ParamTys.push_back(getInt64Type());
+    FunctionType *CalleeTy =
+      FunctionType::get(Builder.getVoidTy(), ParamTys, false);
+    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, Name, M);
+  }
+
+  Builder.CreateCall3(Callee, DeviceData, HostData, Size);
+}
+
+// Emit a call to the runtime function
+// polly_copyFromDeviceToHost(HostData, DeviceData, Size), declaring it in the
+// current module on first use.
+void PTXGenerator::createCallCopyFromDeviceToHost(Value *HostData,
+                                                  Value *DeviceData,
+                                                  Value *Size) {
+  const char *Name = "polly_copyFromDeviceToHost";
+  Module *M = getModule();
+  Function *Callee = M->getFunction(Name);
+
+  // Declare void polly_copyFromDeviceToHost(i8*, devdata*, i64) if missing.
+  if (!Callee) {
+    std::vector<Type*> ParamTys;
+    ParamTys.push_back(getI8PtrType());
+    ParamTys.push_back(getPtrGPUDevicePtrType());
+    ParamTys.push_back(getInt64Type());
+    FunctionType *CalleeTy =
+      FunctionType::get(Builder.getVoidTy(), ParamTys, false);
+    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, Name, M);
+  }
+
+  Builder.CreateCall3(Callee, HostData, DeviceData, Size);
+}
+
+// Emit a call to the runtime function
+// polly_setKernelParameters(Kernel, BlockWidth, BlockHeight, DeviceData),
+// declaring it in the current module on first use.
+void PTXGenerator::createCallSetKernelParameters(Value *Kernel,
+                                                 Value *BlockWidth,
+                                                 Value *BlockHeight,
+                                                 Value *DeviceData) {
+  const char *Name = "polly_setKernelParameters";
+  Module *M = getModule();
+  Function *Callee = M->getFunction(Name);
+
+  // Declare void polly_setKernelParameters(kernel*, i64, i64, devdata*) if
+  // it is missing.
+  if (!Callee) {
+    std::vector<Type*> ParamTys;
+    ParamTys.push_back(getGPUFunctionPtrType());
+    ParamTys.push_back(getInt64Type());
+    ParamTys.push_back(getInt64Type());
+    ParamTys.push_back(getPtrGPUDevicePtrType());
+    FunctionType *CalleeTy =
+      FunctionType::get(Builder.getVoidTy(), ParamTys, false);
+    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, Name, M);
+  }
+
+  Builder.CreateCall4(Callee, Kernel, BlockWidth, BlockHeight, DeviceData);
+}
+
+// Emit a call to the runtime function
+// polly_launchKernel(Kernel, GridWidth, GridHeight), declaring it in the
+// current module on first use.
+void PTXGenerator::createCallLaunchKernel(Value *Kernel, Value *GridWidth,
+                                          Value *GridHeight) {
+  const char *Name = "polly_launchKernel";
+  Module *M = getModule();
+  Function *Callee = M->getFunction(Name);
+
+  // Declare void polly_launchKernel(kernel*, i64, i64) if it is missing.
+  if (!Callee) {
+    std::vector<Type*> ParamTys;
+    ParamTys.push_back(getGPUFunctionPtrType());
+    ParamTys.push_back(getInt64Type());
+    ParamTys.push_back(getInt64Type());
+    FunctionType *CalleeTy =
+      FunctionType::get(Builder.getVoidTy(), ParamTys, false);
+    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, Name, M);
+  }
+
+  Builder.CreateCall3(Callee, Kernel, GridWidth, GridHeight);
+}
+
+// Emit a call to the runtime function
+// polly_startTimerByCudaEvent(StartEvent, StopEvent), declaring it in the
+// current module on first use.
+void PTXGenerator::createCallStartTimerByCudaEvent(Value *StartEvent,
+                                                   Value *StopEvent) {
+  const char *Name = "polly_startTimerByCudaEvent";
+  Module *M = getModule();
+  Function *Callee = M->getFunction(Name);
+
+  // Declare void polly_startTimerByCudaEvent(event**, event**) if missing.
+  if (!Callee) {
+    std::vector<Type*> ParamTys;
+    ParamTys.push_back(PointerType::getUnqual(getGPUEventPtrType()));
+    ParamTys.push_back(PointerType::getUnqual(getGPUEventPtrType()));
+    FunctionType *CalleeTy =
+      FunctionType::get(Builder.getVoidTy(), ParamTys, false);
+    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, Name, M);
+  }
+
+  Builder.CreateCall2(Callee, StartEvent, StopEvent);
+}
+
+// Emit a call to the runtime function
+// polly_stopTimerByCudaEvent(StartEvent, StopEvent, Timer), declaring it in
+// the current module on first use.
+void PTXGenerator::createCallStopTimerByCudaEvent(Value *StartEvent,
+                                                  Value *StopEvent,
+                                                  Value *Timer) {
+  const char *Name = "polly_stopTimerByCudaEvent";
+  Module *M = getModule();
+  Function *Callee = M->getFunction(Name);
+
+  // Declare void polly_stopTimerByCudaEvent(event*, event*, float*) if
+  // it is missing.
+  if (!Callee) {
+    std::vector<Type*> ParamTys;
+    ParamTys.push_back(getGPUEventPtrType());
+    ParamTys.push_back(getGPUEventPtrType());
+    ParamTys.push_back(getFloatPtrType());
+    FunctionType *CalleeTy =
+      FunctionType::get(Builder.getVoidTy(), ParamTys, false);
+    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, Name, M);
+  }
+
+  Builder.CreateCall3(Callee, StartEvent, StopEvent, Timer);
+}
+
+// Emit a call to the runtime function
+// polly_cleanupGPGPUResources(HostData, DeviceData, Module, Context, Kernel),
+// declaring it in the current module on first use.
+void PTXGenerator::createCallCleanupGPGPUResources(Value *HostData,
+                                                   Value *DeviceData,
+                                                   Value *Module,
+                                                   Value *Context,
+                                                   Value *Kernel) {
+  const char *Name = "polly_cleanupGPGPUResources";
+  llvm::Module *M = getModule();
+  Function *Callee = M->getFunction(Name);
+
+  // Declare void polly_cleanupGPGPUResources(i8*, devdata*, module*,
+  // context*, kernel*) if it is missing.
+  if (!Callee) {
+    std::vector<Type*> ParamTys;
+    ParamTys.push_back(getI8PtrType());
+    ParamTys.push_back(getPtrGPUDevicePtrType());
+    ParamTys.push_back(getGPUModulePtrType());
+    ParamTys.push_back(getGPUContextPtrType());
+    ParamTys.push_back(getGPUFunctionPtrType());
+    FunctionType *CalleeTy =
+      FunctionType::get(Builder.getVoidTy(), ParamTys, false);
+    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, Name, M);
+  }
+
+  Builder.CreateCall5(Callee, HostData, DeviceData, Module, Context, Kernel);
+}
+
+// The grid width as an i64 constant.
+Value *PTXGenerator::getCUDAGridWidth() {
+  IntegerType *Int64Ty = getInt64Type();
+  return ConstantInt::get(Int64Ty, GridWidth);
+}
+
+// The grid height as an i64 constant.
+Value *PTXGenerator::getCUDAGridHeight() {
+  IntegerType *Int64Ty = getInt64Type();
+  return ConstantInt::get(Int64Ty, GridHeight);
+}
+
+// The block width as an i64 constant.
+Value *PTXGenerator::getCUDABlockWidth() {
+  IntegerType *Int64Ty = getInt64Type();
+  return ConstantInt::get(Int64Ty, BlockWidth);
+}
+
+// The block height as an i64 constant.
+Value *PTXGenerator::getCUDABlockHeight() {
+  IntegerType *Int64Ty = getInt64Type();
+  return ConstantInt::get(Int64Ty, BlockHeight);
+}
+
+// The size of the output array in bytes as an i64 constant.
+Value *PTXGenerator::getOutputArraySizeInBytes() {
+  IntegerType *Int64Ty = getInt64Type();
+  return ConstantInt::get(Int64Ty, OutputBytes);
+}
+
+// Print the kernel SubFunction, together with the declarations of the PTX
+// intrinsics found in the module, as textual LLVM-IR for the GPU triple.
+// Embed that text as a global string and emit a call to the llvm.codegen
+// intrinsic on it (with "sm_10" as CPU and no features).
+//
+// Returns the value produced by the llvm.codegen call.
+Value *PTXGenerator::createPTXKernelFunction(Function *SubFunction) {
+  Module *M = getModule();
+  std::string LLVMKernelStr;
+  raw_string_ostream NameROS(LLVMKernelStr);
+  formatted_raw_ostream FOS(NameROS);
+  FOS << "target triple = \"" << GPUTriple <<"\"\n";
+  SubFunction->print(FOS);
+
+  // Insert ptx intrinsics into the kernel string.
+  for (Module::iterator I = M->begin(), E = M->end(); I != E; ) {
+    Function *F = I++;
+    // Only declarations of intrinsics are candidates.
+    if (F->isDeclaration() && F->isIntrinsic()) {
+      switch (F->getIntrinsicID()) {
+      case Intrinsic::ptx_read_nctaid_x:
+      case Intrinsic::ptx_read_nctaid_y:
+      case Intrinsic::ptx_read_ctaid_x:
+      case Intrinsic::ptx_read_ctaid_y:
+      case Intrinsic::ptx_read_ntid_x:
+      case Intrinsic::ptx_read_ntid_y:
+      case Intrinsic::ptx_read_tid_x:
+      case Intrinsic::ptx_read_tid_y:
+        F->print(FOS);
+        break;
+      default:
+        break;
+      }
+    }
+  }
+
+  // Both streams may still hold buffered output. Push everything into
+  // LLVMKernelStr before reading it, otherwise the embedded kernel text can
+  // be truncated.
+  FOS.flush();
+  Value *LLVMKernel = Builder.CreateGlobalStringPtr(NameROS.str(),
+                                                    "llvm_kernel");
+  Value *MCPU = Builder.CreateGlobalStringPtr("sm_10", "mcpu");
+  Value *Features = Builder.CreateGlobalStringPtr("", "cpu_features");
+
+  Function *GetDeviceKernel = Intrinsic::getDeclaration(M,
+                                                        Intrinsic::codegen);
+
+  return Builder.CreateCall3(GetDeviceKernel, LLVMKernel, MCPU, Features);
+}
+
+// Embed the name of SubFunction as the global string "ptx_entry" and return
+// a pointer to it.
+Value *PTXGenerator::getPTXKernelEntryName(Function *SubFunction) {
+  return Builder.CreateGlobalStringPtr(SubFunction->getName(), "ptx_entry");
+}
+
+// Remove the kernel SubFunction from the current module, followed by the
+// declarations of the PTX special-register intrinsics that were only needed
+// by the kernel.
+void PTXGenerator::eraseUnusedFunctions(Function *SubFunction) {
+  Module *M = getModule();
+  SubFunction->eraseFromParent();
+
+  static const char *const PTXIntrinsicNames[] = {
+    "llvm.ptx.read.nctaid.x", "llvm.ptx.read.nctaid.y",
+    "llvm.ptx.read.ctaid.x",  "llvm.ptx.read.ctaid.y",
+    "llvm.ptx.read.ntid.x",   "llvm.ptx.read.ntid.y",
+    "llvm.ptx.read.tid.x",    "llvm.ptx.read.tid.y"
+  };
+  const unsigned NumNames =
+    sizeof(PTXIntrinsicNames) / sizeof(PTXIntrinsicNames[0]);
+
+  for (unsigned i = 0; i != NumNames; ++i) {
+    Function *Decl = M->getFunction(PTXIntrinsicNames[i]);
+    // Only erase declarations that exist and have no remaining users;
+    // erasing a function that is still referenced would leave dangling uses.
+    if (Decl && Decl->use_empty())
+      Decl->eraseFromParent();
+  }
+}
+
+// Emit, at the builder's current insertion point, the host-side sequence that
+// runs the kernel F through the GPU runtime library: device initialization,
+// PTX module and kernel lookup, memory allocation, parameter setup, timed
+// kernel launch, copy-back of the result and resource cleanup. Finally F and
+// the PTX intrinsic declarations are erased from the host module.
+void PTXGenerator::finishGeneration(Function *F) {
+  // Define data used by the GPURuntime library.
+  // Each slot is a stack allocation; the array size argument 0 is a null
+  // pointer.
+  AllocaInst *PtrCUContext = Builder.CreateAlloca(getGPUContextPtrType(), 0,
+                                                  "phcontext");
+  AllocaInst *PtrCUDevice = Builder.CreateAlloca(getGPUDevicePtrType(), 0,
+                                                 "phdevice");
+  AllocaInst *PtrCUModule = Builder.CreateAlloca(getGPUModulePtrType(), 0,
+                                                 "phmodule");
+  AllocaInst *PtrCUKernel = Builder.CreateAlloca(getGPUFunctionPtrType(), 0,
+                                                 "phkernel");
+  AllocaInst *PtrCUStartEvent = Builder.CreateAlloca(getGPUEventPtrType(), 0,
+                                                     "pstart_timer");
+  AllocaInst *PtrCUStopEvent = Builder.CreateAlloca(getGPUEventPtrType(), 0,
+                                                    "pstop_timer");
+  AllocaInst *PtrDevData = Builder.CreateAlloca(getPtrGPUDevicePtrType(), 0,
+                                                "pdevice_data");
+  AllocaInst *PtrHostData = Builder.CreateAlloca(getI8PtrType(), 0,
+                                                 "phost_data");
+  Type *FloatTy = llvm::Type::getFloatTy(getModule()->getContext());
+  AllocaInst *PtrElapsedTimes = Builder.CreateAlloca(FloatTy, 0, "ptimer");
+
+  // Initialize the GPU device.
+  createCallInitDevice(PtrCUContext, PtrCUDevice);
+
+  // Create the GPU kernel module and entry function.
+  Value *PTXString = createPTXKernelFunction(F);
+  Value *PTXEntry = getPTXKernelEntryName(F);
+  createCallGetPTXModule(PTXString, PtrCUModule);
+  LoadInst *CUModule = Builder.CreateLoad(PtrCUModule, "cumodule");
+  createCallGetPTXKernelEntry(PTXEntry, CUModule, PtrCUKernel);
+
+  // Allocate device memory and its corresponding host memory.
+  createCallAllocateMemoryForHostAndDevice(PtrHostData, PtrDevData,
+                                           getOutputArraySizeInBytes());
+
+  // Get the pointer to the device memory and set the GPU execution parameters.
+  LoadInst *DData = Builder.CreateLoad(PtrDevData, "device_data");
+  LoadInst *CUKernel = Builder.CreateLoad(PtrCUKernel, "cukernel");
+  createCallSetKernelParameters(CUKernel, getCUDABlockWidth(),
+                                getCUDABlockHeight(), DData);
+
+  // Create the start and end timer and record the start time.
+  createCallStartTimerByCudaEvent(PtrCUStartEvent, PtrCUStopEvent);
+
+  // Launch the GPU kernel.
+  createCallLaunchKernel(CUKernel, getCUDAGridWidth(), getCUDAGridHeight());
+
+  // Copy the results back from the GPU to the host.
+  // NOTE(review): no host-to-device copy is emitted in this function, and
+  // the data is copied into the runtime-allocated host buffer (HData).
+  LoadInst *HData = Builder.CreateLoad(PtrHostData, "host_data");
+  createCallCopyFromDeviceToHost(HData, DData, getOutputArraySizeInBytes());
+
+  // Record the end time.
+  // The elapsed time is written to PtrElapsedTimes; it is not read back in
+  // this function.
+  LoadInst *CUStartEvent = Builder.CreateLoad(PtrCUStartEvent, "start_timer");
+  LoadInst *CUStopEvent = Builder.CreateLoad(PtrCUStopEvent, "stop_timer");
+  createCallStopTimerByCudaEvent(CUStartEvent, CUStopEvent,
+                                 PtrElapsedTimes);
+
+  // Cleanup all the resources used.
+  LoadInst *CUContext = Builder.CreateLoad(PtrCUContext, "cucontext");
+  createCallCleanupGPGPUResources(HData, DData, CUModule, CUContext,
+                                  CUKernel);
+
+  // Erase the ptx kernel and device subfunctions and ptx intrinsics from
+  // current module.
+  eraseUnusedFunctions(F);
+}
+#endif /* GPU_CODEGEN */

Added: polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.c?rev=161239&view=auto
==============================================================================
--- polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.c (added)
+++ polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.c Fri Aug  3 07:50:07 2012
@@ -0,0 +1,16 @@
+/* Output array; the only array written by the loop nest below. */
+int A[128][128];
+
+/* Store into every A[i][j] its own row-major index (i*128 + j). Each
+ * iteration writes a distinct element and reads only i and j, so both loops
+ * are free of loop-carried dependences. Always returns 0. */
+int gpu_pure() {
+  int i,j;
+
+  for(i = 0; i < 128; i++)
+    for(j = 0; j < 128; j++)
+      A[i][j] = i*128 + j;
+
+  return 0;
+}
+
+/* Run the kernel once; its result is ignored. */
+int main() {
+  int b = gpu_pure();
+  return 0;
+}

Added: polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.ll?rev=161239&view=auto
==============================================================================
--- polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.ll (added)
+++ polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.ll Fri Aug  3 07:50:07 2012
@@ -0,0 +1,65 @@
+; RUN: opt %loadPolly -basicaa -polly-import-jscop -polly-import-jscop-dir=%S -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen %s -S | FileCheck %s
+; ModuleID = '2d_innermost_parallel.s'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Global array stored to by @gpu_pure. The archive had mangled "@A" to " at A".
+@A = common global [128 x [128 x i32]] zeroinitializer, align 16
+
+; Two nested counted loops (0..127 each) that store trunc(iv2 * 128 + iv) to
+; A[iv2][iv]. The block names are significant: the RUN line imports jscop
+; files from the test directory.
+define i32 @gpu_pure() nounwind uwtable {
+entry:
+  br label %for.cond
+
+; Outer loop header; %indvars.iv2 is the outer induction variable.
+for.cond:                                         ; preds = %for.inc6, %entry
+  %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc6 ], [ 0, %entry ]
+  %lftr.wideiv5 = trunc i64 %indvars.iv2 to i32
+  %exitcond6 = icmp ne i32 %lftr.wideiv5, 128
+  br i1 %exitcond6, label %for.body, label %for.end8
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+; Inner loop header; %indvars.iv is the inner induction variable.
+for.cond1:                                        ; preds = %for.inc, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body ]
+  %lftr.wideiv = trunc i64 %indvars.iv to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond, label %for.body3, label %for.end
+
+; Loop body: (iv2 << 7) + iv is the row-major index of the element.
+for.body3:                                        ; preds = %for.cond1
+  %tmp = shl nsw i64 %indvars.iv2, 7
+  %tmp7 = add nsw i64 %tmp, %indvars.iv
+  %arrayidx5 = getelementptr inbounds [128 x [128 x i32]]* @A, i64 0, i64 %indvars.iv2, i64 %indvars.iv
+  %tmp8 = trunc i64 %tmp7 to i32
+  store i32 %tmp8, i32* %arrayidx5, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body3
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  br label %for.cond1
+
+for.end:                                          ; preds = %for.cond1
+  br label %for.inc6
+
+for.inc6:                                         ; preds = %for.end
+  %indvars.iv.next3 = add i64 %indvars.iv2, 1
+  br label %for.cond
+
+for.end8:                                         ; preds = %for.cond
+  ret i32 0
+}
+
+; Calls @gpu_pure once, ignores its result and returns 0.
+define i32 @main() nounwind uwtable {
+entry:
+  %call = call i32 @gpu_pure()
+  ret i32 0
+}
+
+; CHECK:  call void @polly_initDevice
+; CHECK:  call void @polly_getPTXModule
+; CHECK:  call void @polly_getPTXKernelEntry
+; CHECK:  call void @polly_allocateMemoryForHostAndDevice
+; CHECK:  call void @polly_setKernelParameters
+; CHECK:  call void @polly_startTimerByCudaEvent
+; CHECK:  call void @polly_launchKernel
+; CHECK:  call void @polly_copyFromDeviceToHost
+; CHECK:  call void @polly_stopTimerByCudaEvent
+; CHECK:  call void @polly_cleanupGPGPUResources

Added: polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.c?rev=161239&view=auto
==============================================================================
--- polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.c (added)
+++ polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.c Fri Aug  3 07:50:07 2012
@@ -0,0 +1,17 @@
+/* Output array; the only array written by the loop nest below. */
+int A[128][128];
+
+/* For every element A[i][j], accumulate 256 terms that depend on i, j and k.
+ * The k loop repeatedly updates the same element (+=), so it carries a
+ * dependence, whereas different (i, j) pairs touch different elements.
+ * Always returns 0. */
+int gpu_no_pure() {
+  int i,j,k;
+
+  for(i = 0; i < 128; i++)
+    for(j = 0; j < 128; j++)
+      for(k = 0; k < 256; k++)
+        A[i][j] += i*123/(k+1)+5-j*k-123;
+
+  return 0;
+}
+
+/* Run the kernel once; its result is ignored. */
+int main() {
+  int b = gpu_no_pure();
+  return 0;
+}

Added: polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll?rev=161239&view=auto
==============================================================================
--- polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll (added)
+++ polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll Fri Aug  3 07:50:07 2012
@@ -0,0 +1,88 @@
+; RUN: opt %loadPolly -basicaa -polly-import-jscop -polly-import-jscop-dir=%S -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen %s -S | FileCheck %s
+; ModuleID = '3d_innermost_non_parallel.s'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = common global [128 x [128 x i32]] zeroinitializer, align 16
+
+define i32 @gpu_no_pure() nounwind uwtable {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc16, %entry
+  %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc16 ], [ 0, %entry ]
+  %lftr.wideiv5 = trunc i64 %indvars.iv2 to i32
+  %exitcond6 = icmp ne i32 %lftr.wideiv5, 128
+  br i1 %exitcond6, label %for.body, label %for.end18
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc13, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc13 ], [ 0, %for.body ]
+  %lftr.wideiv = trunc i64 %indvars.iv to i32
+  %exitcond1 = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond1, label %for.body3, label %for.end15
+
+for.body3:                                        ; preds = %for.cond1
+  br label %for.cond4
+
+for.cond4:                                        ; preds = %for.inc, %for.body3
+  %k.0 = phi i32 [ 0, %for.body3 ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %k.0, 256
+  br i1 %exitcond, label %for.body6, label %for.end
+
+for.body6:                                        ; preds = %for.cond4
+  %tmp = mul nsw i64 %indvars.iv2, 123
+  %add = add nsw i32 %k.0, 1
+  %tmp7 = trunc i64 %tmp to i32
+  %div = sdiv i32 %tmp7, %add
+  %add7 = add nsw i32 %div, 5
+  %tmp8 = trunc i64 %indvars.iv to i32
+  %mul8 = mul nsw i32 %tmp8, %k.0
+  %sub = sub nsw i32 %add7, %mul8
+  %sub9 = add nsw i32 %sub, -123
+  %arrayidx11 = getelementptr inbounds [128 x [128 x i32]]* @A, i64 0, i64 %indvars.iv2, i64 %indvars.iv
+  %tmp9 = load i32* %arrayidx11, align 4
+  %add12 = add nsw i32 %tmp9, %sub9
+  store i32 %add12, i32* %arrayidx11, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body6
+  %inc = add nsw i32 %k.0, 1
+  br label %for.cond4
+
+for.end:                                          ; preds = %for.cond4
+  br label %for.inc13
+
+for.inc13:                                        ; preds = %for.end
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  br label %for.cond1
+
+for.end15:                                        ; preds = %for.cond1
+  br label %for.inc16
+
+for.inc16:                                        ; preds = %for.end15
+  %indvars.iv.next3 = add i64 %indvars.iv2, 1
+  br label %for.cond
+
+for.end18:                                        ; preds = %for.cond
+  ret i32 0
+}
+
+define i32 @main() nounwind uwtable {
+entry:
+  %call = call i32 @gpu_no_pure()
+  ret i32 0
+}
+
+; CHECK:  call void @polly_initDevice
+; CHECK:  call void @polly_getPTXModule
+; CHECK:  call void @polly_getPTXKernelEntry
+; CHECK:  call void @polly_allocateMemoryForHostAndDevice
+; CHECK:  call void @polly_setKernelParameters
+; CHECK:  call void @polly_startTimerByCudaEvent
+; CHECK:  call void @polly_launchKernel
+; CHECK:  call void @polly_copyFromDeviceToHost
+; CHECK:  call void @polly_stopTimerByCudaEvent
+; CHECK:  call void @polly_cleanupGPGPUResources

Added: polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%25for.cond---%25for.end18.jscop?rev=161239&view=auto
==============================================================================
--- polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop (added)
+++ polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop Fri Aug  3 07:50:07 2012
@@ -0,0 +1,21 @@
+{
+   "context" : "{  :  }",
+   "name" : "for.cond => for.end18",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 and i2 >= 0 and i2 <= 255 }",
+         "name" : "Stmt_for_body6",
+         "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, i0, 0, i1, 0, i2, 0] }"
+      }
+   ]
+}

Added: polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%25for.cond---%25for.end18.jscop.transformed%2Bgpu?rev=161239&view=auto
==============================================================================
--- polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu (added)
+++ polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu Fri Aug  3 07:50:07 2012
@@ -0,0 +1,21 @@
+{
+   "context" : "{  :  }",
+   "name" : "for.cond => for.end18",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 and i2 >= 0 and i2 <= 255 }",
+         "name" : "Stmt_for_body6",
+         "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, o0, o1, o2, o3, i2, 0] : o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and i0 = 16o0 + o1 and i1 = 16o2 + o3 }"
+      }
+   ]
+}

Added: polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/gpu_pure___%25for.cond---%25for.end8.jscop?rev=161239&view=auto
==============================================================================
--- polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop (added)
+++ polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop Fri Aug  3 07:50:07 2012
@@ -0,0 +1,17 @@
+{
+   "context" : "{  :  }",
+   "name" : "for.cond => for.end8",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[128i0 + i1] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 }",
+         "name" : "Stmt_for_body3",
+         "schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, i0, 0, i1, 0] }"
+      }
+   ]
+}

Added: polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/gpu_pure___%25for.cond---%25for.end8.jscop.transformed%2Bgpu?rev=161239&view=auto
==============================================================================
--- polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu (added)
+++ polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu Fri Aug  3 07:50:07 2012
@@ -0,0 +1,17 @@
+{
+   "context" : "{  :  }",
+   "name" : "for.cond => for.end8",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[128i0 + i1] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 }",
+         "name" : "Stmt_for_body3",
+         "schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, o0, o1, o2, o3]: o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and i0 = 16o0 + o1 and i1 = 16o2 + o3 }"
+      }
+   ]
+}

Added: polly/trunk/test/CodeGen/GPGPU/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/lit.local.cfg?rev=161239&view=auto
==============================================================================
--- polly/trunk/test/CodeGen/GPGPU/lit.local.cfg (added)
+++ polly/trunk/test/CodeGen/GPGPU/lit.local.cfg Fri Aug  3 07:50:07 2012
@@ -0,0 +1,5 @@
+config.suffixes = ['.ll']
+
+gpgpu = config.root.enable_gpgpu_codegen
+if gpgpu not in ['TRUE', 'true'] :
+    config.unsupported = True

Modified: polly/trunk/test/lit.site.cfg.in
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/lit.site.cfg.in?rev=161239&r1=161238&r2=161239&view=diff
==============================================================================
--- polly/trunk/test/lit.site.cfg.in (original)
+++ polly/trunk/test/lit.site.cfg.in Fri Aug  3 07:50:07 2012
@@ -7,6 +7,7 @@
 config.polly_obj_root = "@POLLY_BINARY_DIR@"
 config.polly_lib_dir = "@POLLY_LIB_DIR@"
 config.target_triple = "@TARGET_TRIPLE@"
+config.enable_gpgpu_codegen = "@CUDALIB_FOUND@"
 lit.params['build_config'] = "@POLLY_SOURCE_DIR@/test"
 
 ## Check the current platform with regex





More information about the llvm-commits mailing list