[llvm-commits] [polly] r161239 - in /polly/trunk: ./ autoconf/ include/polly/ include/polly/CodeGen/ include/polly/Config/ lib/CodeGen/ test/ test/CodeGen/GPGPU/

Mon Aug 20 22:24:41 PDT 2012

Hi,

On Fri, Aug 3, 2012 at 7:50 AM, Tobias Grosser
<grosser at fim.uni-passau.de> wrote:
> Author: grosser
> Date: Fri Aug  3 07:50:07 2012
> New Revision: 161239
>
> URL: http://llvm.org/viewvc/llvm-project?rev=161239&view=rev
> Log:
> Add preliminary implementation for GPGPU code generation.
>
> Translate the selected parallel loop body into a ptx string and run it with the
> cuda driver API. We limit this preliminary implementation to target the
> following special test cases:
>
>   - Support only 2-dimensional parallel loops with or without only one innermost
>     non-parallel loop.
>   - Support write memory access to only one array in a SCoP.
>
> The patch was committed with smaller changes to the build system:
>
> There is now a flag to enable gpu code generation explicitly. This was required
> as we need the llvm.codegen() patch applied on the llvm sources, to compile this
> feature correctly. Also, enabling gpu code generation does not require cuda.
> This requirement was removed to allow 'make polly-test' runs, even without an
> installed cuda runtime.
>
> Contributed by:  Yabin Hu  <yabin.hwu at gmail.com>
>
> Added:
>     polly/trunk/include/polly/CodeGen/PTXGenerator.h
>     polly/trunk/lib/CodeGen/PTXGenerator.cpp
>     polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.c
>     polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.ll
>     polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.c
>     polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll
>     polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop
>     polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu
>     polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop
>     polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu
>     polly/trunk/test/CodeGen/GPGPU/lit.local.cfg
> Modified:
>     polly/trunk/CMakeLists.txt
>     polly/trunk/autoconf/configure.ac
>     polly/trunk/configure
>     polly/trunk/include/polly/Config/config.h.cmake
>     polly/trunk/include/polly/Config/config.h.in
>     polly/trunk/include/polly/ScopInfo.h
>     polly/trunk/lib/CodeGen/CMakeLists.txt
>     polly/trunk/lib/CodeGen/CodeGeneration.cpp
>     polly/trunk/lib/CodeGen/Makefile
>     polly/trunk/test/lit.site.cfg.in
>
> Modified: polly/trunk/CMakeLists.txt
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/CMakeLists.txt?rev=161239&r1=161238&r2=161239&view=diff
> ==============================================================================
> --- polly/trunk/CMakeLists.txt (original)
> +++ polly/trunk/CMakeLists.txt Fri Aug  3 07:50:07 2012
> @@ -75,7 +75,14 @@
>  FIND_PACKAGE(Isl REQUIRED)
>  FIND_PACKAGE(Gmp REQUIRED)
>  FIND_PACKAGE(Pluto)
> -FIND_PACKAGE(CUDA)
> +
> +option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature" OFF)
> +if (POLLY_ENABLE_GPGPU_CODEGEN)
> +  # Do not require CUDA, as GPU code generation test cases can be run without
> +  # a cuda library.
> +  FIND_PACKAGE(CUDA)
> +  set(GPU_CODEGEN TRUE)
> +endif(POLLY_ENABLE_GPGPU_CODEGEN)
>
>  option(POLLY_ENABLE_OPENSCOP "Enable Openscop library for scop import/export" ON)
>  if (POLLY_ENABLE_OPENSCOP)
>
> Modified: polly/trunk/autoconf/configure.ac
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/autoconf/configure.ac?rev=161239&r1=161238&r2=161239&view=diff
> ==============================================================================
> --- polly/trunk/autoconf/configure.ac (original)
> +++ polly/trunk/autoconf/configure.ac Fri Aug  3 07:50:07 2012
> @@ -120,7 +120,20 @@
>  AC_SUBST(scoplib_rpath)
>
>  dnl Check if CUDA lib there
> +dnl Disable the build of polly, even if it is checked out into tools/polly.
> +AC_ARG_ENABLE(polly_gpu_codegen,
> +              AS_HELP_STRING([--enable-polly-gpu-codegen],
> +                             [Enable GPU code generation in Polly(default is NO)]),,
> +                             enableval=default)
> +case "$enableval" in
> +  yes) AC_DEFINE([GPU_CODEGEN],[1], [Define if gpu codegen is enabled]) ;;
> +  no)  AC_DEFINE([GPU_CODEGEN],[0], [Define if gpu codegen is enabled]) ;;
> +  default) AC_DEFINE([GPU_CODEGEN],[0],  [Define if gpu codegen is enabled]) ;;

In all cases, you define GPU_CODEGEN to either 0 or 1.
(By the way, the string description is accurate: it always enables GPU
codegen; see below.)

> +  *) AC_MSG_ERROR([Invalid setting for --enable-polly-gpu-codegen. Use "yes" or "no"]) ;;
> +esac
> +
>  find_lib_and_headers([cuda], [cuda.h], [cuda])
> +
>  AS_IF([test "x$cuda_found" = "xyes"],
>    [AC_DEFINE([CUDALIB_FOUND],[1],[Define if cudalib found])])
>
>
> Modified: polly/trunk/configure
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/configure?rev=161239&r1=161238&r2=161239&view=diff
> ==============================================================================
> --- polly/trunk/configure (original)
> +++ polly/trunk/configure Fri Aug  3 07:50:07 2012
> @@ -654,6 +654,7 @@
>  with_pluto
>  with_openscop
>  with_scoplib
> +enable_polly_gpu_codegen
>  with_cuda
>  '
>        ac_precious_vars='build_alias
> @@ -1272,6 +1273,13 @@
>     esac
>    cat <<\_ACEOF
>
> +Optional Features:
> +  --disable-option-checking  ignore unrecognized --enable/--with options
> +  --disable-FEATURE       do not include FEATURE (same as --enable-FEATURE=no)
> +  --enable-FEATURE[=ARG]  include FEATURE [ARG=yes]
> +  --enable-polly-gpu-codegen
> +                          Enable GPU code generation in Polly(default is NO)
> +
>  Optional Packages:
>    --with-PACKAGE[=ARG]    use PACKAGE [ARG=yes]
>    --without-PACKAGE       do not use PACKAGE (same as --with-PACKAGE=no)
> @@ -3002,6 +3010,26 @@
>
>
>
> +# Check whether --enable-polly_gpu_codegen was given.
> +if test "${enable_polly_gpu_codegen+set}" = set; then :
> +  enableval=$enable_polly_gpu_codegen;
> +else
> +  enableval=default
> +fi
> +
> +case "$enableval" in
> +  yes)
> +$as_echo "#define GPU_CODEGEN 1" >>confdefs.h
> + ;;
> +  no)
> +$as_echo "#define GPU_CODEGEN 0" >>confdefs.h
> + ;;
> +  default)
> +$as_echo "#define GPU_CODEGEN 0" >>confdefs.h
> + ;;
> +  *) as_fn_error $? "Invalid setting for --enable-polly-gpu-codegen. Use \"yes\" or \"no\"" "$LINENO" 5 ;;
> +esac
> +
>
>    ac_ext=cpp
>  ac_cpp='$CXXCPP $CPPFLAGS'
> @@ -3081,6 +3109,7 @@
>  ac_compiler_gnu=$ac_cv_c_compiler_gnu
>
>
> +
>  if test "x$cuda_found" = "xyes"; then :
>
>  $as_echo "#define CUDALIB_FOUND 1" >>confdefs.h
>
> Added: polly/trunk/include/polly/CodeGen/PTXGenerator.h
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/CodeGen/PTXGenerator.h?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/include/polly/CodeGen/PTXGenerator.h (added)
> +++ polly/trunk/include/polly/CodeGen/PTXGenerator.h Fri Aug  3 07:50:07 2012
> @@ -0,0 +1,197 @@
> +//===- PTXGenerator.h - IR helper to create GPGPU LLVM-IR -------*- C++ -*-===//
> +//
> +//                     The LLVM Compiler Infrastructure
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
> +//===----------------------------------------------------------------------===//
> +//
> +// This file contains functions to create GPGPU parallel loops as LLVM-IR.
> +//
> +//===----------------------------------------------------------------------===//
> +#ifndef POLLY_CODEGEN_PTXGENERATOR_H
> +#define POLLY_CODEGEN_PTXGENERATOR_H
> +
> +#include "polly/Config/config.h"
> +
> +#ifdef GPU_CODEGEN

And here you test whether GPU_CODEGEN is defined, which is always true:
it is defined to either 0 or 1.
However, this code never tests whether its value is 0 or 1.
This breaks configure-based builds of Polly when the rest of the
PTX toolchain is not available.

Sebastian

> +#include "llvm/IRBuilder.h"
> +#include "llvm/ADT/SetVector.h"
> +
> +#include <map>
> +
> +namespace llvm {
> +  class Value;
> +  class Pass;
> +  class BasicBlock;
> +}
> +
> +namespace polly {
> +using namespace llvm;
> +
> +class PTXGenerator {
> +public:
> +  typedef std::map<Value*, Value*> ValueToValueMapTy;
> +
> +  PTXGenerator(IRBuilder<> &Builder, Pass *P, const std::string &Triple);
> +
> +  /// @brief Create a GPGPU parallel loop.
> +  ///
> +  /// @param UsedValues   A set of LLVM-IR Values that should be available to
> +  ///                     the new loop body.
> +  /// @param OriginalIVS  The new values of the original induction variables.
> +  /// @param VMap         This map is filled by createParallelLoop(). It
> +  ///                     maps the values in UsedValues to Values through which
> +  ///                     their content is available within the loop body.
> +  /// @param LoopBody     A pointer to an iterator that is set to point to the
> +  ///                     body of the created loop. It should be used to insert
> +  ///                     instructions that form the actual loop body.
> +  void startGeneration(SetVector<Value*> &UsedValues,
> +                       SetVector<Value*> &OriginalIVS, ValueToValueMapTy &VMap,
> +                       BasicBlock::iterator *LoopBody);
> +
> +  /// @brief Execute the post-operations to build a GPGPU parallel loop.
> +  ///
> +  void finishGeneration(Function *SubFunction);
> +
> +  /// @brief Set the parameters for launching PTX kernel.
> +  ///
> +  /// @param GridW    A value of the width of a GPU grid.
> +  /// @param GridH    A value of the height of a GPU grid.
> +  /// @param BlockW   A value of the width of a GPU block.
> +  /// @param BlockH   A value of the height of a GPU block.
> +  void setLaunchingParameters(int GridW, int GridH, int BlockW, int BlockH) {
> +    GridWidth = GridW;
> +    GridHeight = GridH;
> +    BlockWidth = BlockW;
> +    BlockHeight = BlockH;
> +  }
> +
> +  /// @brief Set the size of the output array.
> +  ///
> +  /// This size is used to allocate memory on the device and the host.
> +  ///
> +  /// @param Bytes        Output array size in bytes.
> +  void setOutputBytes(unsigned Bytes) {
> +    OutputBytes = Bytes;
> +  }
> +
> +private:
> +  IRBuilder<> &Builder;
> +  Pass *P;
> +
> +  /// @brief The target triple of the device.
> +  const std::string &GPUTriple;
> +
> +  ///@brief Parameters used for launching PTX kernel.
> +  int GridWidth, GridHeight, BlockWidth, BlockHeight;
> +
> +  /// @brief Size of the output array in bytes.
> +  unsigned OutputBytes;
> +
> +  /// @brief Polly's GPU data types.
> +  StructType *ContextTy, *ModuleTy, *KernelTy, *DeviceTy, *DevDataTy, *EventTy;
> +
> +  void InitializeGPUDataTypes();
> +  IntegerType *getInt64Type();            // i64
> +  PointerType *getI8PtrType();            // char *
> +  PointerType *getPtrI8PtrType();         // char **
> +  PointerType *getFloatPtrType();         // float *
> +  PointerType *getGPUContextPtrType();    // %struct.PollyGPUContextT *
> +  PointerType *getGPUModulePtrType();     // %struct.PollyGPUModuleT *
> +  PointerType *getGPUDevicePtrType();     // %struct.PollyGPUDeviceT *
> +  PointerType *getPtrGPUDevicePtrType();  // %struct.PollyGPUDevicePtrT *
> +  PointerType *getGPUFunctionPtrType();   // %struct.PollyGPUFunctionT *
> +  PointerType *getGPUEventPtrType();      // %struct.PollyGPUEventT *
> +
> +  Module *getModule();
> +
> +  /// @brief Create the kernel string containing LLVM IR.
> +  ///
> +  /// @param SubFunction  A pointer to the device code function.
> +  /// @return             A global string variable containing the LLVM IR codes
> +  //                      of the SubFunction.
> +  Value *createPTXKernelFunction(Function *SubFunction);
> +
> +  /// @brief Get the entry name of the device kernel function.
> +  ///
> +  /// @param SubFunction  A pointer to the device code function.
> +  /// @return             A global string variable containing the entry name of
> +  ///                     the SubFunction.
> +  Value *getPTXKernelEntryName(Function *SubFunction);
> +
> +  void createCallInitDevice(Value *Context, Value *Device);
> +  void createCallGetPTXModule(Value *Buffer, Value *Module);
> +  void createCallGetPTXKernelEntry(Value *Entry, Value *Module,
> +                                   Value *Kernel);
> +  void createCallAllocateMemoryForHostAndDevice(Value *HostData,
> +                                                Value *DeviceData,
> +                                                Value *Size);
> +  void createCallCopyFromHostToDevice(Value *DeviceData, Value *HostData,
> +                                      Value *Size);
> +  void createCallCopyFromDeviceToHost(Value *HostData, Value *DeviceData,
> +                                      Value *Size);
> +  void createCallSetKernelParameters(Value *Kernel, Value *BlockWidth,
> +                                     Value *BlockHeight, Value *DeviceData);
> +  void createCallLaunchKernel(Value *Kernel, Value *GridWidth,
> +                              Value *GridHeight);
> +  void createCallStartTimerByCudaEvent(Value *StartEvent,
> +                                       Value *StopEvent);
> +  void createCallStopTimerByCudaEvent(Value *StartEvent, Value *StopEvent,
> +                                      Value *Timer);
> +  void createCallCleanupGPGPUResources(Value *HostData, Value *DeviceData,
> +                                       Value *Module, Value *Context,
> +                                       Value *Kernel);
> +
> +  /// @brief Create the CUDA subfunction.
> +  ///
> +  /// @param UsedValues   A set of LLVM-IR Values that should be available to
> +  ///                     the new loop body.
> +  /// @param VMap         This map that is filled by createSubfunction(). It
> +  ///                     maps the values in UsedValues to Values through which
> +  ///                     their content is available within the loop body.
> +  /// @param OriginalIVS  The new values of the original induction variables.
> +  /// @param SubFunction  The newly created SubFunction is returned here.
> +  void createSubfunction(SetVector<Value*> &UsedValues,
> +                         SetVector<Value*> &OriginalIVS,
> +                         ValueToValueMapTy &VMap,
> +                         Function **SubFunction);
> +
> +  /// @brief Create the definition of the CUDA subfunction.
> +  ///
> +  /// @param NumArgs      The number of parameters of this subfunction. This is
> +  ///                     usually set to the number of memory accesses which
> +  ///                     will be copied from host to device.
> +  Function *createSubfunctionDefinition(int NumArgs);
> +
> +  /// @brief Extract all the ptx related subfunctions into a new module.
> +  ///
> +  /// @param M            Current module.
> +  /// @return             The generated module containing only gpu related
> +  ///                     subfunctions.
> +  Module *extractPTXFunctionsFromModule(const Module *M);
> +
> +  /// @brief Get the Value of CUDA block width.
> +  Value *getCUDABlockWidth();
> +
> +  /// @brief Get the Value of CUDA block height.
> +  Value *getCUDABlockHeight();
> +
> +  /// @brief Get the Value of CUDA grid width.
> +  Value *getCUDAGridWidth();
> +
> +  /// @brief Get the Value of CUDA grid height.
> +  Value *getCUDAGridHeight();
> +
> +  /// @brief Get the Value of the bytes of the output array.
> +  Value *getOutputArraySizeInBytes();
> +
> +  /// @brief Erase the ptx-related subfunctions and declarations.
> +  ///
> +  /// @param SubFunction  A pointer to the device code function.
> +  void eraseUnusedFunctions(Function *SubFunction);
> +};
> +} // end namespace polly
> +#endif /* GPU_CODEGEN */
> +#endif /* POLLY_CODEGEN_PTXGENERATOR_H */
>
> Modified: polly/trunk/include/polly/Config/config.h.cmake
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/Config/config.h.cmake?rev=161239&r1=161238&r2=161239&view=diff
> ==============================================================================
> --- polly/trunk/include/polly/Config/config.h.cmake (original)
> +++ polly/trunk/include/polly/Config/config.h.cmake Fri Aug  3 07:50:07 2012
> @@ -19,5 +19,6 @@
>  #cmakedefine PLUTO_FOUND
>  #cmakedefine SCOPLIB_FOUND
>  #cmakedefine CUDALIB_FOUND
> +#cmakedefine GPU_CODEGEN
>
>  #endif
>
> Modified: polly/trunk/include/polly/Config/config.h.in
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/Config/config.h.in?rev=161239&r1=161238&r2=161239&view=diff
> ==============================================================================
> --- polly/trunk/include/polly/Config/config.h.in (original)
> +++ polly/trunk/include/polly/Config/config.h.in Fri Aug  3 07:50:07 2012
> @@ -9,6 +9,9 @@
>  /* Define if cudalib found */
>  #undef CUDALIB_FOUND
>
> +/* Define if gpu codegen is enabled */
> +#undef GPU_CODEGEN
> +
>  /* Define if ISL has a code generator */
>  #undef ISL_CODEGEN_FOUND
>
>
> Modified: polly/trunk/include/polly/ScopInfo.h
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/ScopInfo.h?rev=161239&r1=161238&r2=161239&view=diff
> ==============================================================================
> --- polly/trunk/include/polly/ScopInfo.h (original)
> +++ polly/trunk/include/polly/ScopInfo.h Fri Aug  3 07:50:07 2012
> @@ -125,6 +125,9 @@
>    /// @brief Is this a read memory access?
>    bool isRead() const { return Type == MemoryAccess::Read; }
>
> +  /// @brief Is this a write memory access?
> +  bool isWrite() const { return Type == MemoryAccess::Write; }
> +
>    isl_map *getAccessRelation() const;
>
>    /// @brief Get an isl string representing this access function.
>
> Modified: polly/trunk/lib/CodeGen/CMakeLists.txt
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/CMakeLists.txt?rev=161239&r1=161238&r2=161239&view=diff
> ==============================================================================
> --- polly/trunk/lib/CodeGen/CMakeLists.txt (original)
> +++ polly/trunk/lib/CodeGen/CMakeLists.txt Fri Aug  3 07:50:07 2012
> @@ -9,10 +9,16 @@
>        IslCodeGeneration.cpp)
>  endif (ISL_CODEGEN_FOUND)
>
> +if (GPU_CODEGEN)
> +  set (GPGPU_CODEGEN_FILES
> +       PTXGenerator.cpp)
> +endif (GPU_CODEGEN)
> +
>  add_polly_library(PollyCodeGen
>    BlockGenerators.cpp
>    ${CLOOG_FILES}
>    ${ISL_CODEGEN_FILES}
>    LoopGenerators.cpp
>    Utils.cpp
> +  ${GPGPU_CODEGEN_FILES}
>  )
>
> Modified: polly/trunk/lib/CodeGen/CodeGeneration.cpp
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/CodeGeneration.cpp?rev=161239&r1=161238&r2=161239&view=diff
> ==============================================================================
> --- polly/trunk/lib/CodeGen/CodeGeneration.cpp (original)
> +++ polly/trunk/lib/CodeGen/CodeGeneration.cpp Fri Aug  3 07:50:07 2012
> @@ -31,6 +31,7 @@
>  #include "polly/CodeGen/CodeGeneration.h"
>  #include "polly/CodeGen/BlockGenerators.h"
>  #include "polly/CodeGen/LoopGenerators.h"
> +#include "polly/CodeGen/PTXGenerator.h"
>  #include "polly/CodeGen/Utils.h"
>  #include "polly/Support/GICHelper.h"
>
> @@ -65,6 +66,19 @@
>         cl::value_desc("OpenMP code generation enabled if true"),
>         cl::init(false), cl::ZeroOrMore);
>
> +#ifdef GPU_CODEGEN
> +static cl::opt<bool>
> +GPGPU("enable-polly-gpgpu",
> +       cl::desc("Generate GPU parallel code"), cl::Hidden,
> +       cl::value_desc("GPGPU code generation enabled if true"),
> +       cl::init(false), cl::ZeroOrMore);
> +
> +static cl::opt<std::string>
> +GPUTriple("polly-gpgpu-triple",
> +       cl::desc("Target triple for GPU code generation"),
> +       cl::Hidden, cl::init(""));
> +#endif /* GPU_CODEGEN */
> +
>  static cl::opt<bool>
>  AtLeastOnce("enable-polly-atLeastOnce",
>         cl::desc("Give polly the hint, that every loop is executed at least"
> @@ -284,6 +298,27 @@
>    /// statement.
>    void codegenForOpenMP(const clast_for *f);
>
> +#ifdef GPU_CODEGEN
> +  /// @brief Create GPGPU device memory access values.
> +  ///
> +  /// Create a list of values that will be set to be parameters of the GPGPU
> +  /// subfunction. These parameters represent device memory base addresses
> +  /// and the size in bytes.
> +  SetVector<Value*> getGPUValues(unsigned &OutputBytes);
> +
> +  /// @brief Create a GPU parallel for loop.
> +  ///
> +  /// This loop reflects a loop as if it would have been created by a GPU
> +  /// statement.
> +  void codegenForGPGPU(const clast_for *F);
> +
> +  /// @brief Get innermost for loop.
> +  const clast_stmt *getScheduleInfo(const clast_for *F,
> +                                    std::vector<int> &NumIters,
> +                                    unsigned &LoopDepth,
> +                                    unsigned &NonPLoopDepth);
> +#endif /* GPU_CODEGEN */
> +
>    /// @brief Check if a loop is parallel
>    ///
>    /// Detect if a clast_for loop can be executed in parallel.
> @@ -530,6 +565,163 @@
>    Builder.SetInsertPoint(AfterLoop);
>  }
>
> +#ifdef GPU_CODEGEN
> +static unsigned getArraySizeInBytes(const ArrayType *AT) {
> +  unsigned Bytes = AT->getNumElements();
> +  if (const ArrayType *T = dyn_cast<ArrayType>(AT->getElementType()))
> +    Bytes *= getArraySizeInBytes(T);
> +  else
> +    Bytes *= AT->getElementType()->getPrimitiveSizeInBits() / 8;
> +
> +  return Bytes;
> +}
> +
> +SetVector<Value*> ClastStmtCodeGen::getGPUValues(unsigned &OutputBytes) {
> +  SetVector<Value*> Values;
> +  OutputBytes = 0;
> +
> +  // Record the memory reference base addresses.
> +  for (Scop::iterator SI = S->begin(), SE = S->end(); SI != SE; ++SI) {
> +    ScopStmt *Stmt = *SI;
> +    for (SmallVector<MemoryAccess*, 8>::iterator I = Stmt->memacc_begin(),
> +         E = Stmt->memacc_end(); I != E; ++I) {
> +      Value *BaseAddr = const_cast<Value*>((*I)->getBaseAddr());
> +      Values.insert((BaseAddr));
> +
> +      // FIXME: we assume that there is one and only one array to be written
> +      // in a SCoP.
> +      int NumWrites = 0;
> +      if ((*I)->isWrite()) {
> +        ++NumWrites;
> +        assert(NumWrites <= 1 &&
> +               "We support at most one array to be written in a SCoP.");
> +        if (const PointerType * PT =
> +            dyn_cast<PointerType>(BaseAddr->getType())) {
> +          Type *T = PT->getArrayElementType();
> +          const ArrayType *ATy = dyn_cast<ArrayType>(T);
> +          OutputBytes = getArraySizeInBytes(ATy);
> +        }
> +      }
> +    }
> +  }
> +
> +  return Values;
> +}
> +
> +const clast_stmt *ClastStmtCodeGen::getScheduleInfo(const clast_for *F,
> +                                                    std::vector<int> &NumIters,
> +                                                    unsigned &LoopDepth,
> +                                                    unsigned &NonPLoopDepth) {
> +  clast_stmt *Stmt = (clast_stmt *)F;
> +  const clast_for *Result;
> +  bool NonParaFlag = false;
> +  LoopDepth = 0;
> +  NonPLoopDepth = 0;
> +
> +  while (Stmt) {
> +    if (CLAST_STMT_IS_A(Stmt, stmt_for)) {
> +      const clast_for *T = (clast_for *) Stmt;
> +      if (isParallelFor(T)) {
> +        if (!NonParaFlag) {
> +          NumIters.push_back(getNumberOfIterations(T));
> +          Result = T;
> +        }
> +      } else
> +        NonParaFlag = true;
> +
> +      Stmt = T->body;
> +      LoopDepth++;
> +      continue;
> +    }
> +    Stmt = Stmt->next;
> +  }
> +
> +  assert(NumIters.size() == 4 &&
> +         "The loops should be tiled into 4-depth parallel loops and an "
> +         "innermost non-parallel one (if exist).");
> +  NonPLoopDepth = LoopDepth - NumIters.size();
> +  assert(NonPLoopDepth <= 1
> +         && "We support only one innermost non-parallel loop currently.");
> +  return (const clast_stmt *)Result->body;
> +}
> +
> +void ClastStmtCodeGen::codegenForGPGPU(const clast_for *F) {
> +  BasicBlock::iterator LoopBody;
> +  SetVector<Value *> Values;
> +  SetVector<Value *> IVS;
> +  std::vector<int> NumIterations;
> +  PTXGenerator::ValueToValueMapTy VMap;
> +
> +  assert(!GPUTriple.empty()
> +         && "Target triple should be set properly for GPGPU code generation.");
> +  PTXGenerator PTXGen(Builder, P, GPUTriple);
> +
> +  // Get original IVS and ScopStmt
> +  unsigned TiledLoopDepth, NonPLoopDepth;
> +  const clast_stmt *InnerStmt = getScheduleInfo(F, NumIterations,
> +                                                TiledLoopDepth, NonPLoopDepth);
> +  const clast_stmt *TmpStmt;
> +  const clast_user_stmt *U;
> +  const clast_for *InnerFor;
> +  if (CLAST_STMT_IS_A(InnerStmt, stmt_for)) {
> +    InnerFor = (const clast_for *)InnerStmt;
> +    TmpStmt = InnerFor->body;
> +  } else
> +    TmpStmt = InnerStmt;
> +  U = (const clast_user_stmt *) TmpStmt;
> +  ScopStmt *Statement = (ScopStmt *) U->statement->usr;
> +  for (unsigned i = 0; i < Statement->getNumIterators() - NonPLoopDepth; i++) {
> +    const Value* IV = Statement->getInductionVariableForDimension(i);
> +    IVS.insert(const_cast<Value *>(IV));
> +  }
> +
> +  unsigned OutBytes;
> +  Values = getGPUValues(OutBytes);
> +  PTXGen.setOutputBytes(OutBytes);
> +  PTXGen.startGeneration(Values, IVS, VMap, &LoopBody);
> +
> +  BasicBlock::iterator AfterLoop = Builder.GetInsertPoint();
> +  Builder.SetInsertPoint(LoopBody);
> +
> +  BasicBlock *AfterBB = 0;
> +  if (NonPLoopDepth) {
> +    Value *LowerBound, *UpperBound, *IV, *Stride;
> +    Type *IntPtrTy = getIntPtrTy();
> +    LowerBound = ExpGen.codegen(InnerFor->LB, IntPtrTy);
> +    UpperBound = ExpGen.codegen(InnerFor->UB, IntPtrTy);
> +    Stride = Builder.getInt(APInt_from_MPZ(InnerFor->stride));
> +    IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB);
> +    const Value *OldIV_ = Statement->getInductionVariableForDimension(2);
> +    Value *OldIV = const_cast<Value *>(OldIV_);
> +    VMap.insert(std::make_pair<Value*, Value*>(OldIV, IV));
> +  }
> +
> +  updateWithValueMap(VMap, /* reverse */ false);
> +  BlockGenerator::generate(Builder, *Statement, ValueMap, P);
> +  updateWithValueMap(VMap, /* reverse */ true);
> +
> +  if (AfterBB)
> +    Builder.SetInsertPoint(AfterBB->begin());
> +
> +  // FIXME: The replacement of the host base address with the parameter of ptx
> +  // subfunction should have been done by updateWithValueMap. We use the
> +  // following codes to avoid affecting other parts of Polly. This should be
> +  // fixed later.
> +  Function *FN = Builder.GetInsertBlock()->getParent();
> +  for (unsigned j = 0; j < Values.size(); j++) {
> +    Value *baseAddr = Values[j];
> +    for (Function::iterator B = FN->begin(); B != FN->end(); ++B) {
> +      for (BasicBlock::iterator I = B->begin(); I != B->end(); ++I)
> +        I->replaceUsesOfWith(baseAddr, ValueMap[baseAddr]);
> +    }
> +  }
> +  Builder.SetInsertPoint(AfterLoop);
> +  PTXGen.setLaunchingParameters(NumIterations[0], NumIterations[1],
> +                                NumIterations[2], NumIterations[3]);
> +  PTXGen.finishGeneration(FN);
> +}
> +#endif
> +
>  bool ClastStmtCodeGen::isInnermostLoop(const clast_for *f) {
>    const clast_stmt *stmt = f->body;
>
> @@ -647,6 +839,18 @@
>      }
>    }
>
> +#ifdef GPU_CODEGEN
> +  if (GPGPU && isParallelFor(f)) {
> +    if (!parallelCodeGeneration) {
> +      parallelCodeGeneration = true;
> +      parallelLoops.push_back(f->iterator);
> +      codegenForGPGPU(f);
> +      parallelCodeGeneration = false;
> +      return;
> +    }
> +  }
> +#endif
> +
>    codegenForSequential(f);
>  }
>
>
> Modified: polly/trunk/lib/CodeGen/Makefile
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/Makefile?rev=161239&r1=161238&r2=161239&view=diff
> ==============================================================================
> --- polly/trunk/lib/CodeGen/Makefile (original)
> +++ polly/trunk/lib/CodeGen/Makefile Fri Aug  3 07:50:07 2012
> @@ -10,6 +10,8 @@
>
>  CPP.Flags += $(POLLY_INC)
>
> +include $(LEVEL)/Makefile.config
> +
>  #
>  # Include Makefile.common so we know what to do.
>  #
>
> Added: polly/trunk/lib/CodeGen/PTXGenerator.cpp
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/PTXGenerator.cpp?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/lib/CodeGen/PTXGenerator.cpp (added)
> +++ polly/trunk/lib/CodeGen/PTXGenerator.cpp Fri Aug  3 07:50:07 2012
> @@ -0,0 +1,663 @@
> +//===------ PTXGenerator.cpp -  IR helper to create loops -----------------===//
> +//
> +//                     The LLVM Compiler Infrastructure
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
> +//===----------------------------------------------------------------------===//
> +//
> +// This file contains functions to create GPU parallel codes as LLVM-IR.
> +//
> +//===----------------------------------------------------------------------===//
> +
> +#include "polly/CodeGen/PTXGenerator.h"
> +
> +#ifdef GPU_CODEGEN
> +#include "polly/ScopDetection.h"
> +#include "polly/ScopInfo.h"
> +
> +#include "llvm/Intrinsics.h"
> +#include "llvm/Module.h"
> +#include "llvm/PassManager.h"
> +#include "llvm/ADT/SetVector.h"
> +#include "llvm/Analysis/Dominators.h"
> +#include "llvm/Support/Debug.h"
> +#include "llvm/Support/FormattedStream.h"
> +#include "llvm/Support/TargetRegistry.h"
> +#include "llvm/Target/TargetData.h"
> +#include "llvm/Target/TargetMachine.h"
> +#include "llvm/Transforms/Utils/BasicBlockUtils.h"
> +#include "llvm/Transforms/Utils/Cloning.h"
> +
> +using namespace llvm;
> +using namespace polly;
> +
> +PTXGenerator::PTXGenerator(IRBuilder<> &Builder, Pass *P,
> +                           const std::string &Triple):
> +  Builder(Builder), P(P), GPUTriple(Triple), GridWidth(1), GridHeight(1),
> +  BlockWidth(1), BlockHeight(1), OutputBytes(0) {
> +
> +  InitializeGPUDataTypes();
> +}
> +
> +Module *PTXGenerator::getModule() {
> +  return Builder.GetInsertBlock()->getParent()->getParent();
> +}
> +
> +Function *PTXGenerator::createSubfunctionDefinition(int NumArgs) {
> +  assert(NumArgs == 1 && "we support only one array access now.");
> +
> +  Module *M = getModule();
> +  Function *F = Builder.GetInsertBlock()->getParent();
> +  std::vector<Type*> Arguments;
> +  for (int i = 0; i < NumArgs; i++)
> +    Arguments.push_back(Builder.getInt8PtrTy());
> +  FunctionType *FT = FunctionType::get(Builder.getVoidTy(), Arguments, false);
> +  Function *FN = Function::Create(FT, Function::InternalLinkage,
> +      F->getName() + "_ptx_subfn", M);
> +  FN->setCallingConv(CallingConv::PTX_Kernel);
> +
> +  // Do not run any optimization pass on the new function.
> +  P->getAnalysis<polly::ScopDetection>().markFunctionAsInvalid(FN);
> +
> +  for (Function::arg_iterator AI = FN->arg_begin(); AI != FN->arg_end(); ++AI)
> +    AI->setName("ptx.Array");
> +
> +  return FN;
> +}
> +
> +/// Create the skeleton of the ptx-device subfunction and position the builder
> +/// where the loop body has to be generated.
> +///
> +/// The subfunction consists of three blocks: "ptx.setup" (reads the grid,
> +/// block and thread ids through the ptx intrinsics), "ptx.loop_body" (computes
> +/// the values that replace the original induction variables) and "ptx.exit"
> +/// (returns). On return the builder's insert point is the branch that
> +/// terminates "ptx.loop_body".
> +///
> +/// @param UsedValues  Host base addresses of the accessed arrays. Each one is
> +///                    mapped in VMap to the bitcast of the matching kernel
> +///                    argument.
> +/// @param OriginalIVS Induction variables of the parallel loops (1 to 4). Each
> +///                    one is mapped in VMap to an index derived from the
> +///                    block and thread ids.
> +/// @param VMap        Receives the host-value to device-value mappings.
> +/// @param SubFunction Receives the created function; set to null if the
> +///                    number of induction variables is not supported.
> +void PTXGenerator::createSubfunction(SetVector<Value*> &UsedValues,
> +                                     SetVector<Value*> &OriginalIVS,
> +                                     PTXGenerator::ValueToValueMapTy &VMap,
> +                                     Function **SubFunction) {
> +  Function *FN = createSubfunctionDefinition(UsedValues.size());
> +  Module *M = getModule();
> +  LLVMContext &Context = FN->getContext();
> +  IntegerType *Ty = Builder.getInt64Ty();
> +
> +  // Store the previous basic block.
> +  BasicBlock *PrevBB = Builder.GetInsertBlock();
> +
> +  // Create basic blocks.
> +  BasicBlock *HeaderBB = BasicBlock::Create(Context, "ptx.setup", FN);
> +  BasicBlock *ExitBB = BasicBlock::Create(Context, "ptx.exit", FN);
> +  BasicBlock *BodyBB = BasicBlock::Create(Context, "ptx.loop_body", FN);
> +
> +  DominatorTree &DT = P->getAnalysis<DominatorTree>();
> +  DT.addNewBlock(HeaderBB, PrevBB);
> +  DT.addNewBlock(ExitBB, HeaderBB);
> +  DT.addNewBlock(BodyBB, HeaderBB);
> +
> +  Builder.SetInsertPoint(HeaderBB);
> +
> +  // Insert VMap items with maps of array base address on the host to base
> +  // address on the device.
> +  Function::arg_iterator AI = FN->arg_begin();
> +  for (unsigned j = 0; j < UsedValues.size(); j++) {
> +    Value *BaseAddr = UsedValues[j];
> +    Type *ArrayTy = BaseAddr->getType();
> +    Value *Param = Builder.CreateBitCast(AI, ArrayTy);
> +    // Let std::make_pair deduce its types: spelling out the template
> +    // arguments does not compile for lvalue operands under C++11.
> +    VMap.insert(std::make_pair(BaseAddr, Param));
> +    ++AI;
> +  }
> +
> +  // FIXME: These intrinsics should be inserted on-demand. However, we insert
> +  // them all currently for simplicity.
> +  Function *GetNctaidX =
> +    Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_x);
> +  Function *GetNctaidY =
> +    Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_y);
> +  Function *GetCtaidX =
> +    Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_x);
> +  Function *GetCtaidY =
> +    Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_y);
> +  Function *GetNtidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_x);
> +  Function *GetNtidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_y);
> +  Function *GetTidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_x);
> +  Function *GetTidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_y);
> +
> +  // Read all ids/dimensions and zero-extend (or truncate) them to i64.
> +  Value *GridWidth = Builder.CreateCall(GetNctaidX);
> +  GridWidth = Builder.CreateIntCast(GridWidth, Ty, false);
> +  Value *GridHeight = Builder.CreateCall(GetNctaidY);
> +  GridHeight = Builder.CreateIntCast(GridHeight, Ty, false);
> +  Value *BlockWidth = Builder.CreateCall(GetNtidX);
> +  BlockWidth = Builder.CreateIntCast(BlockWidth, Ty, false);
> +  Value *BlockHeight = Builder.CreateCall(GetNtidY);
> +  BlockHeight = Builder.CreateIntCast(BlockHeight, Ty, false);
> +  Value *BIDx = Builder.CreateCall(GetCtaidX);
> +  BIDx = Builder.CreateIntCast(BIDx, Ty, false);
> +  Value *BIDy = Builder.CreateCall(GetCtaidY);
> +  BIDy = Builder.CreateIntCast(BIDy, Ty, false);
> +  Value *TIDx = Builder.CreateCall(GetTidX);
> +  TIDx = Builder.CreateIntCast(TIDx, Ty, false);
> +  Value *TIDy = Builder.CreateCall(GetTidY);
> +  TIDy = Builder.CreateIntCast(TIDy, Ty, false);
> +
> +  Builder.CreateBr(BodyBB);
> +  Builder.SetInsertPoint(BodyBB);
> +
> +  // Compute one substitution per original induction variable, outermost
> +  // first. How block and thread ids are combined depends on the loop depth.
> +  unsigned NumDims = OriginalIVS.size();
> +  std::vector<Value *> Substitutions;
> +  Value *BlockID, *ThreadID;
> +  switch (NumDims) {
> +  case 1: {
> +    // One dimension: linearize block and thread ids into a single index.
> +    Value *BlockSize = Builder.CreateMul(BlockWidth, BlockHeight,
> +                                         "p_gpu_blocksize");
> +    BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
> +    BlockID = Builder.CreateAdd(BlockID, BIDx);
> +    BlockID = Builder.CreateMul(BlockID, BlockSize);
> +    ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j");
> +    ThreadID = Builder.CreateAdd(ThreadID, TIDx);
> +    ThreadID = Builder.CreateAdd(ThreadID, BlockID);
> +    Substitutions.push_back(ThreadID);
> +    break;
> +  }
> +  case 2: {
> +    // Two dimensions: linearized block id, linearized thread id.
> +    BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
> +    BlockID = Builder.CreateAdd(BlockID, BIDx);
> +    Substitutions.push_back(BlockID);
> +    ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j");
> +    ThreadID = Builder.CreateAdd(ThreadID, TIDx);
> +    Substitutions.push_back(ThreadID);
> +    break;
> +  }
> +  case 3: {
> +    // Three dimensions: linearized block id, thread id y, thread id x.
> +    BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
> +    BlockID = Builder.CreateAdd(BlockID, BIDx);
> +    Substitutions.push_back(BlockID);
> +    Substitutions.push_back(TIDy);
> +    Substitutions.push_back(TIDx);
> +    break;
> +  }
> +  case 4: {
> +    // Four dimensions: block id y/x and thread id y/x are used directly.
> +    Substitutions.push_back(BIDy);
> +    Substitutions.push_back(BIDx);
> +    Substitutions.push_back(TIDy);
> +    Substitutions.push_back(TIDx);
> +    break;
> +  }
> +  default:
> +    // Unsupported depth. The condition must be false for the assertion to
> +    // fire at all.
> +    assert(false &&
> +           "We cannot transform parallel loops whose depth is larger than 4.");
> +    // Without assertions: do not leave the out-parameter uninitialized.
> +    *SubFunction = 0;
> +    return;
> +  }
> +
> +  assert(OriginalIVS.size() == Substitutions.size()
> +         && "The size of IVS should be equal to the size of substitutions.");
> +  for (unsigned i = 0; i < OriginalIVS.size(); ++i)
> +    VMap.insert(std::make_pair(OriginalIVS[i], Substitutions[i]));
> +
> +  // Terminate the body block, then step back onto that branch so that the
> +  // loop body is later inserted in front of it.
> +  Builder.CreateBr(ExitBB);
> +  Builder.SetInsertPoint(--Builder.GetInsertPoint());
> +  BasicBlock::iterator LoopBody = Builder.GetInsertPoint();
> +
> +  // Add the termination of the ptx-device subfunction.
> +  Builder.SetInsertPoint(ExitBB);
> +  Builder.CreateRetVoid();
> +
> +  Builder.SetInsertPoint(LoopBody);
> +  *SubFunction = FN;
> +}
> +
> +/// Create the ptx-device subfunction and report where its loop body has to be
> +/// emitted.
> +///
> +/// The builder's insert point is the same before and after this call; the
> +/// position inside the subfunction is handed back through LoopBody.
> +///
> +/// @param UsedValues  Forwarded to createSubfunction.
> +/// @param OriginalIVS Forwarded to createSubfunction.
> +/// @param VMap        Forwarded to createSubfunction.
> +/// @param LoopBody    Receives the insert point createSubfunction left the
> +///                    builder at.
> +void PTXGenerator::startGeneration(SetVector<Value*> &UsedValues,
> +                                   SetVector<Value*> &OriginalIVS,
> +                                   ValueToValueMapTy &VMap,
> +                                   BasicBlock::iterator *LoopBody) {
> +  // Remember where the host code is currently being generated.
> +  BasicBlock::iterator HostInsertPoint = Builder.GetInsertPoint();
> +
> +  Function *Kernel;
> +  createSubfunction(UsedValues, OriginalIVS, VMap, &Kernel);
> +
> +  // Hand out the position inside the kernel, then go back to the host code.
> +  *LoopBody = Builder.GetInsertPoint();
> +  Builder.SetInsertPoint(HostInsertPoint);
> +}
> +
> +/// Return the builder's 64-bit integer type.
> +IntegerType *PTXGenerator::getInt64Type() {
> +  IntegerType *Int64Ty = Builder.getInt64Ty();
> +  return Int64Ty;
> +}
> +
> +/// Return the type i8* (address space 0).
> +PointerType *PTXGenerator::getI8PtrType() {
> +  Type *ByteTy = Builder.getInt8Ty();
> +  return PointerType::get(ByteTy, 0);
> +}
> +
> +/// Return the type i8** (address space 0).
> +PointerType *PTXGenerator::getPtrI8PtrType() {
> +  PointerType *BytePtrTy = getI8PtrType();
> +  return PointerType::get(BytePtrTy, 0);
> +}
> +
> +/// Return the type float* in the context of the current module.
> +PointerType *PTXGenerator::getFloatPtrType() {
> +  LLVMContext &Ctx = getModule()->getContext();
> +  return llvm::Type::getFloatPtrTy(Ctx);
> +}
> +
> +/// Return a pointer type to ContextTy (address space 0).
> +PointerType *PTXGenerator::getGPUContextPtrType() {
> +  return PointerType::get(ContextTy, 0);
> +}
> +
> +/// Return a pointer type to ModuleTy (address space 0).
> +PointerType *PTXGenerator::getGPUModulePtrType() {
> +  return PointerType::get(ModuleTy, 0);
> +}
> +
> +/// Return a pointer type to DeviceTy (address space 0).
> +PointerType *PTXGenerator::getGPUDevicePtrType() {
> +  return PointerType::get(DeviceTy, 0);
> +}
> +
> +/// Return a pointer type to DevDataTy (address space 0).
> +PointerType *PTXGenerator::getPtrGPUDevicePtrType() {
> +  return PointerType::get(DevDataTy, 0);
> +}
> +
> +/// Return a pointer type to KernelTy (address space 0).
> +PointerType *PTXGenerator::getGPUFunctionPtrType() {
> +  return PointerType::get(KernelTy, 0);
> +}
> +
> +/// Return a pointer type to EventTy (address space 0).
> +PointerType *PTXGenerator::getGPUEventPtrType() {
> +  return PointerType::get(EventTy, 0);
> +}
> +
> +/// Create the named struct types that stand for the handle types of the GPU
> +/// runtime library. Only the names are given here; no struct body is set.
> +void PTXGenerator::InitializeGPUDataTypes() {
> +  LLVMContext &Ctx = getModule()->getContext();
> +
> +  // One named struct type per kind of runtime handle.
> +  ContextTy = StructType::create(Ctx, "struct.PollyGPUContextT");
> +  ModuleTy = StructType::create(Ctx, "struct.PollyGPUModuleT");
> +  KernelTy = StructType::create(Ctx, "struct.PollyGPUFunctionT");
> +  DeviceTy = StructType::create(Ctx, "struct.PollyGPUDeviceT");
> +  DevDataTy = StructType::create(Ctx, "struct.PollyGPUDevicePtrT");
> +  EventTy = StructType::create(Ctx, "struct.PollyGPUEventT");
> +}
> +
> +/// Emit a call to the runtime function "polly_initDevice", declaring it in the
> +/// current module first if it does not exist yet.
> +///
> +/// @param Context Passed as first argument (pointer to a context pointer).
> +/// @param Device  Passed as second argument (pointer to a device pointer).
> +void PTXGenerator::createCallInitDevice(Value *Context, Value *Device) {
> +  Module *M = getModule();
> +  const char *FnName = "polly_initDevice";
> +  Function *Callee = M->getFunction(FnName);
> +
> +  // Declare the runtime function on first use.
> +  if (Callee == 0) {
> +    Type *ParamTys[] = {
> +      PointerType::getUnqual(getGPUContextPtrType()),
> +      PointerType::getUnqual(getGPUDevicePtrType())
> +    };
> +    FunctionType *CalleeTy = FunctionType::get(Builder.getVoidTy(), ParamTys,
> +                                               false);
> +    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, FnName, M);
> +  }
> +
> +  Builder.CreateCall2(Callee, Context, Device);
> +}
> +
> +/// Emit a call to the runtime function "polly_getPTXModule", declaring it in
> +/// the current module first if it does not exist yet.
> +///
> +/// @param Buffer Passed as first argument (i8*).
> +/// @param Module Passed as second argument (pointer to a module pointer).
> +void PTXGenerator::createCallGetPTXModule(Value *Buffer, Value *Module) {
> +  // The parameter 'Module' hides llvm::Module, hence the qualified name.
> +  llvm::Module *M = getModule();
> +  const char *FnName = "polly_getPTXModule";
> +  Function *Callee = M->getFunction(FnName);
> +
> +  // Declare the runtime function on first use.
> +  if (Callee == 0) {
> +    Type *ParamTys[] = {
> +      getI8PtrType(),
> +      PointerType::getUnqual(getGPUModulePtrType())
> +    };
> +    FunctionType *CalleeTy = FunctionType::get(Builder.getVoidTy(), ParamTys,
> +                                               false);
> +    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, FnName, M);
> +  }
> +
> +  Builder.CreateCall2(Callee, Buffer, Module);
> +}
> +
> +/// Emit a call to the runtime function "polly_getPTXKernelEntry", declaring it
> +/// in the current module first if it does not exist yet.
> +///
> +/// @param Entry  Passed as first argument (i8*).
> +/// @param Module Passed as second argument (module pointer).
> +/// @param Kernel Passed as third argument (pointer to a function pointer).
> +void PTXGenerator::createCallGetPTXKernelEntry(Value *Entry, Value *Module,
> +                                               Value *Kernel) {
> +  // The parameter 'Module' hides llvm::Module, hence the qualified name.
> +  llvm::Module *M = getModule();
> +  const char *FnName = "polly_getPTXKernelEntry";
> +  Function *Callee = M->getFunction(FnName);
> +
> +  // Declare the runtime function on first use.
> +  if (Callee == 0) {
> +    Type *ParamTys[] = {
> +      getI8PtrType(),
> +      getGPUModulePtrType(),
> +      PointerType::getUnqual(getGPUFunctionPtrType())
> +    };
> +    FunctionType *CalleeTy = FunctionType::get(Builder.getVoidTy(), ParamTys,
> +                                               false);
> +    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, FnName, M);
> +  }
> +
> +  Builder.CreateCall3(Callee, Entry, Module, Kernel);
> +}
> +
> +/// Emit a call to the runtime function
> +/// "polly_allocateMemoryForHostAndDevice", declaring it in the current module
> +/// first if it does not exist yet.
> +///
> +/// @param HostData   Passed as first argument (i8**).
> +/// @param DeviceData Passed as second argument (pointer to a device-data
> +///                   pointer).
> +/// @param Size       Passed as third argument (i64).
> +void PTXGenerator::createCallAllocateMemoryForHostAndDevice(Value *HostData,
> +                                                            Value *DeviceData,
> +                                                            Value *Size) {
> +  Module *M = getModule();
> +  const char *FnName = "polly_allocateMemoryForHostAndDevice";
> +  Function *Callee = M->getFunction(FnName);
> +
> +  // Declare the runtime function on first use.
> +  if (Callee == 0) {
> +    Type *ParamTys[] = {
> +      getPtrI8PtrType(),
> +      PointerType::getUnqual(getPtrGPUDevicePtrType()),
> +      getInt64Type()
> +    };
> +    FunctionType *CalleeTy = FunctionType::get(Builder.getVoidTy(), ParamTys,
> +                                               false);
> +    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, FnName, M);
> +  }
> +
> +  Builder.CreateCall3(Callee, HostData, DeviceData, Size);
> +}
> +
> +/// Emit a call to the runtime function "polly_copyFromHostToDevice", declaring
> +/// it in the current module first if it does not exist yet.
> +///
> +/// @param DeviceData Passed as first argument (device-data pointer).
> +/// @param HostData   Passed as second argument (i8*).
> +/// @param Size       Passed as third argument (i64).
> +void PTXGenerator::createCallCopyFromHostToDevice(Value *DeviceData,
> +                                                  Value *HostData,
> +                                                  Value *Size) {
> +  Module *M = getModule();
> +  const char *FnName = "polly_copyFromHostToDevice";
> +  Function *Callee = M->getFunction(FnName);
> +
> +  // Declare the runtime function on first use.
> +  if (Callee == 0) {
> +    Type *ParamTys[] = {
> +      getPtrGPUDevicePtrType(),
> +      getI8PtrType(),
> +      getInt64Type()
> +    };
> +    FunctionType *CalleeTy = FunctionType::get(Builder.getVoidTy(), ParamTys,
> +                                               false);
> +    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, FnName, M);
> +  }
> +
> +  Builder.CreateCall3(Callee, DeviceData, HostData, Size);
> +}
> +
> +/// Emit a call to the runtime function "polly_copyFromDeviceToHost", declaring
> +/// it in the current module first if it does not exist yet.
> +///
> +/// @param HostData   Passed as first argument (i8*).
> +/// @param DeviceData Passed as second argument (device-data pointer).
> +/// @param Size       Passed as third argument (i64).
> +void PTXGenerator::createCallCopyFromDeviceToHost(Value *HostData,
> +                                                  Value *DeviceData,
> +                                                  Value *Size) {
> +  Module *M = getModule();
> +  const char *FnName = "polly_copyFromDeviceToHost";
> +  Function *Callee = M->getFunction(FnName);
> +
> +  // Declare the runtime function on first use.
> +  if (Callee == 0) {
> +    Type *ParamTys[] = {
> +      getI8PtrType(),
> +      getPtrGPUDevicePtrType(),
> +      getInt64Type()
> +    };
> +    FunctionType *CalleeTy = FunctionType::get(Builder.getVoidTy(), ParamTys,
> +                                               false);
> +    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, FnName, M);
> +  }
> +
> +  Builder.CreateCall3(Callee, HostData, DeviceData, Size);
> +}
> +
> +/// Emit a call to the runtime function "polly_setKernelParameters", declaring
> +/// it in the current module first if it does not exist yet.
> +///
> +/// @param Kernel      Passed as first argument (function pointer).
> +/// @param BlockWidth  Passed as second argument (i64).
> +/// @param BlockHeight Passed as third argument (i64).
> +/// @param DeviceData  Passed as fourth argument (device-data pointer).
> +void PTXGenerator::createCallSetKernelParameters(Value *Kernel,
> +                                                 Value *BlockWidth,
> +                                                 Value *BlockHeight,
> +                                                 Value *DeviceData) {
> +  Module *M = getModule();
> +  const char *FnName = "polly_setKernelParameters";
> +  Function *Callee = M->getFunction(FnName);
> +
> +  // Declare the runtime function on first use.
> +  if (Callee == 0) {
> +    Type *ParamTys[] = {
> +      getGPUFunctionPtrType(),
> +      getInt64Type(),
> +      getInt64Type(),
> +      getPtrGPUDevicePtrType()
> +    };
> +    FunctionType *CalleeTy = FunctionType::get(Builder.getVoidTy(), ParamTys,
> +                                               false);
> +    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, FnName, M);
> +  }
> +
> +  Builder.CreateCall4(Callee, Kernel, BlockWidth, BlockHeight, DeviceData);
> +}
> +
> +/// Emit a call to the runtime function "polly_launchKernel", declaring it in
> +/// the current module first if it does not exist yet.
> +///
> +/// @param Kernel     Passed as first argument (function pointer).
> +/// @param GridWidth  Passed as second argument (i64).
> +/// @param GridHeight Passed as third argument (i64).
> +void PTXGenerator::createCallLaunchKernel(Value *Kernel, Value *GridWidth,
> +                                          Value *GridHeight) {
> +  Module *M = getModule();
> +  const char *FnName = "polly_launchKernel";
> +  Function *Callee = M->getFunction(FnName);
> +
> +  // Declare the runtime function on first use.
> +  if (Callee == 0) {
> +    Type *ParamTys[] = {
> +      getGPUFunctionPtrType(),
> +      getInt64Type(),
> +      getInt64Type()
> +    };
> +    FunctionType *CalleeTy = FunctionType::get(Builder.getVoidTy(), ParamTys,
> +                                               false);
> +    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, FnName, M);
> +  }
> +
> +  Builder.CreateCall3(Callee, Kernel, GridWidth, GridHeight);
> +}
> +
> +/// Emit a call to the runtime function "polly_startTimerByCudaEvent",
> +/// declaring it in the current module first if it does not exist yet.
> +///
> +/// @param StartEvent Passed as first argument (pointer to an event pointer).
> +/// @param StopEvent  Passed as second argument (pointer to an event pointer).
> +void PTXGenerator::createCallStartTimerByCudaEvent(Value *StartEvent,
> +                                                   Value *StopEvent) {
> +  Module *M = getModule();
> +  const char *FnName = "polly_startTimerByCudaEvent";
> +  Function *Callee = M->getFunction(FnName);
> +
> +  // Declare the runtime function on first use.
> +  if (Callee == 0) {
> +    Type *ParamTys[] = {
> +      PointerType::getUnqual(getGPUEventPtrType()),
> +      PointerType::getUnqual(getGPUEventPtrType())
> +    };
> +    FunctionType *CalleeTy = FunctionType::get(Builder.getVoidTy(), ParamTys,
> +                                               false);
> +    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, FnName, M);
> +  }
> +
> +  Builder.CreateCall2(Callee, StartEvent, StopEvent);
> +}
> +
> +/// Emit a call to the runtime function "polly_stopTimerByCudaEvent", declaring
> +/// it in the current module first if it does not exist yet.
> +///
> +/// @param StartEvent Passed as first argument (event pointer).
> +/// @param StopEvent  Passed as second argument (event pointer).
> +/// @param Timer      Passed as third argument (float*).
> +void PTXGenerator::createCallStopTimerByCudaEvent(Value *StartEvent,
> +                                                  Value *StopEvent,
> +                                                  Value *Timer) {
> +  Module *M = getModule();
> +  const char *FnName = "polly_stopTimerByCudaEvent";
> +  Function *Callee = M->getFunction(FnName);
> +
> +  // Declare the runtime function on first use.
> +  if (Callee == 0) {
> +    Type *ParamTys[] = {
> +      getGPUEventPtrType(),
> +      getGPUEventPtrType(),
> +      getFloatPtrType()
> +    };
> +    FunctionType *CalleeTy = FunctionType::get(Builder.getVoidTy(), ParamTys,
> +                                               false);
> +    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, FnName, M);
> +  }
> +
> +  Builder.CreateCall3(Callee, StartEvent, StopEvent, Timer);
> +}
> +
> +/// Emit a call to the runtime function "polly_cleanupGPGPUResources",
> +/// declaring it in the current module first if it does not exist yet.
> +///
> +/// @param HostData   Passed as first argument (i8*).
> +/// @param DeviceData Passed as second argument (device-data pointer).
> +/// @param Module     Passed as third argument (module pointer).
> +/// @param Context    Passed as fourth argument (context pointer).
> +/// @param Kernel     Passed as fifth argument (function pointer).
> +void PTXGenerator::createCallCleanupGPGPUResources(Value *HostData,
> +                                                   Value *DeviceData,
> +                                                   Value *Module,
> +                                                   Value *Context,
> +                                                   Value *Kernel) {
> +  // The parameter 'Module' hides llvm::Module, hence the qualified name.
> +  llvm::Module *M = getModule();
> +  const char *FnName = "polly_cleanupGPGPUResources";
> +  Function *Callee = M->getFunction(FnName);
> +
> +  // Declare the runtime function on first use.
> +  if (Callee == 0) {
> +    Type *ParamTys[] = {
> +      getI8PtrType(),
> +      getPtrGPUDevicePtrType(),
> +      getGPUModulePtrType(),
> +      getGPUContextPtrType(),
> +      getGPUFunctionPtrType()
> +    };
> +    FunctionType *CalleeTy = FunctionType::get(Builder.getVoidTy(), ParamTys,
> +                                               false);
> +    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, FnName, M);
> +  }
> +
> +  Builder.CreateCall5(Callee, HostData, DeviceData, Module, Context, Kernel);
> +}
> +
> +/// Return the member GridWidth as an i64 constant.
> +Value *PTXGenerator::getCUDAGridWidth() {
> +  IntegerType *Int64Ty = getInt64Type();
> +  return ConstantInt::get(Int64Ty, GridWidth);
> +}
> +
> +/// Return the member GridHeight as an i64 constant.
> +Value *PTXGenerator::getCUDAGridHeight() {
> +  IntegerType *Int64Ty = getInt64Type();
> +  return ConstantInt::get(Int64Ty, GridHeight);
> +}
> +
> +/// Return the member BlockWidth as an i64 constant.
> +Value *PTXGenerator::getCUDABlockWidth() {
> +  IntegerType *Int64Ty = getInt64Type();
> +  return ConstantInt::get(Int64Ty, BlockWidth);
> +}
> +
> +/// Return the member BlockHeight as an i64 constant.
> +Value *PTXGenerator::getCUDABlockHeight() {
> +  IntegerType *Int64Ty = getInt64Type();
> +  return ConstantInt::get(Int64Ty, BlockHeight);
> +}
> +
> +/// Return the member OutputBytes as an i64 constant.
> +Value *PTXGenerator::getOutputArraySizeInBytes() {
> +  IntegerType *Int64Ty = getInt64Type();
> +  return ConstantInt::get(Int64Ty, OutputBytes);
> +}
> +
> +/// Print the device subfunction as textual LLVM-IR and emit a call to the
> +/// llvm.codegen intrinsic on that text.
> +///
> +/// The text consists of a "target triple" line built from GPUTriple, the
> +/// printed SubFunction and the declarations of the ptx read intrinsics found
> +/// in the current module. It is stored as the global string "llvm_kernel" and
> +/// passed to the intrinsic together with the strings "sm_10" (mcpu) and ""
> +/// (cpu features).
> +///
> +/// @param SubFunction The device subfunction to print.
> +/// @return The call to the llvm.codegen intrinsic.
> +Value *PTXGenerator::createPTXKernelFunction(Function *SubFunction) {
> +  Module *M = getModule();
> +  std::string LLVMKernelStr;
> +  raw_string_ostream NameROS(LLVMKernelStr);
> +  formatted_raw_ostream FOS(NameROS);
> +  FOS << "target triple = \"" << GPUTriple <<"\"\n";
> +  SubFunction->print(FOS);
> +
> +  // Insert ptx intrinsics into the kernel string.
> +  for (Module::iterator I = M->begin(), E = M->end(); I != E; ) {
> +    Function *F = I++;
> +    // Only declarations of the ptx read intrinsics are printed.
> +    if (F->isDeclaration() && F->isIntrinsic()) {
> +      switch (F->getIntrinsicID()) {
> +      case Intrinsic::ptx_read_nctaid_x:
> +      case Intrinsic::ptx_read_nctaid_y:
> +      case Intrinsic::ptx_read_ctaid_x:
> +      case Intrinsic::ptx_read_ctaid_y:
> +      case Intrinsic::ptx_read_ntid_x:
> +      case Intrinsic::ptx_read_ntid_y:
> +      case Intrinsic::ptx_read_tid_x:
> +      case Intrinsic::ptx_read_tid_y:
> +        F->print(FOS);
> +        break;
> +      default:
> +        break;
> +      }
> +    }
> +  }
> +
> +  // Both streams may still hold buffered text. Flush them (outer stream
> +  // first) before the backing string is read, otherwise the kernel text can
> +  // be incomplete.
> +  FOS.flush();
> +  NameROS.flush();
> +
> +  Value *LLVMKernel = Builder.CreateGlobalStringPtr(LLVMKernelStr,
> +                                                    "llvm_kernel");
> +  Value *MCPU = Builder.CreateGlobalStringPtr("sm_10", "mcpu");
> +  Value *Features = Builder.CreateGlobalStringPtr("", "cpu_features");
> +
> +  Function *GetDeviceKernel = Intrinsic::getDeclaration(M,
> +                                                        Intrinsic::codegen);
> +
> +  return Builder.CreateCall3(GetDeviceKernel, LLVMKernel, MCPU, Features);
> +}
> +
> +/// Return a global string (named "ptx_entry") that holds the name of
> +/// SubFunction.
> +Value *PTXGenerator::getPTXKernelEntryName(Function *SubFunction) {
> +  return Builder.CreateGlobalStringPtr(SubFunction->getName(), "ptx_entry");
> +}
> +
> +/// Remove the device subfunction and the declarations of the ptx read
> +/// intrinsics from the current module.
> +///
> +/// An intrinsic declaration is only removed if it exists and has no remaining
> +/// users.
> +///
> +/// @param SubFunction The device subfunction to erase.
> +void PTXGenerator::eraseUnusedFunctions(Function *SubFunction) {
> +  Module *M = getModule();
> +
> +  // The subfunction goes first so that its calls no longer count as uses of
> +  // the intrinsic declarations below.
> +  SubFunction->eraseFromParent();
> +
> +  static const char *const PTXIntrinsicNames[] = {
> +    "llvm.ptx.read.nctaid.x",
> +    "llvm.ptx.read.nctaid.y",
> +    "llvm.ptx.read.ctaid.x",
> +    "llvm.ptx.read.ctaid.y",
> +    "llvm.ptx.read.ntid.x",
> +    "llvm.ptx.read.ntid.y",
> +    "llvm.ptx.read.tid.x",
> +    "llvm.ptx.read.tid.y"
> +  };
> +  const unsigned NumNames =
> +    sizeof(PTXIntrinsicNames) / sizeof(PTXIntrinsicNames[0]);
> +
> +  for (unsigned i = 0; i != NumNames; ++i) {
> +    Function *Intr = M->getFunction(PTXIntrinsicNames[i]);
> +    // Erasing a function that still has users is invalid, so declarations
> +    // that are referenced from elsewhere in the module are kept.
> +    if (Intr && Intr->use_empty())
> +      Intr->eraseFromParent();
> +  }
> +}
> +
> +/// Emit the host code that drives the GPU runtime library for the kernel F
> +/// and afterwards erase F and the ptx intrinsics from the module.
> +///
> +/// At the builder's current position this emits, in order: stack slots for
> +/// the runtime handles, then calls to initialize the device, load the ptx
> +/// module and kernel entry, allocate host and device memory, set the kernel
> +/// parameters, start the timer, launch the kernel, copy the result from
> +/// device to host, stop the timer and clean up the resources.
> +///
> +/// @param F The device subfunction created by startGeneration().
> +void PTXGenerator::finishGeneration(Function *F) {
> +  // Stack slots for the handles used by the GPURuntime library.
> +  AllocaInst *ContextSlot = Builder.CreateAlloca(getGPUContextPtrType(), 0,
> +                                                 "phcontext");
> +  AllocaInst *DeviceSlot = Builder.CreateAlloca(getGPUDevicePtrType(), 0,
> +                                                "phdevice");
> +  AllocaInst *ModuleSlot = Builder.CreateAlloca(getGPUModulePtrType(), 0,
> +                                                "phmodule");
> +  AllocaInst *KernelSlot = Builder.CreateAlloca(getGPUFunctionPtrType(), 0,
> +                                                "phkernel");
> +  AllocaInst *StartEventSlot = Builder.CreateAlloca(getGPUEventPtrType(), 0,
> +                                                    "pstart_timer");
> +  AllocaInst *StopEventSlot = Builder.CreateAlloca(getGPUEventPtrType(), 0,
> +                                                   "pstop_timer");
> +  AllocaInst *DevDataSlot = Builder.CreateAlloca(getPtrGPUDevicePtrType(), 0,
> +                                                 "pdevice_data");
> +  AllocaInst *HostDataSlot = Builder.CreateAlloca(getI8PtrType(), 0,
> +                                                  "phost_data");
> +  Type *FloatTy = llvm::Type::getFloatTy(getModule()->getContext());
> +  AllocaInst *ElapsedTimeSlot = Builder.CreateAlloca(FloatTy, 0, "ptimer");
> +
> +  // Bring up the GPU device.
> +  createCallInitDevice(ContextSlot, DeviceSlot);
> +
> +  // Build the kernel string and its entry name, then load module and entry.
> +  Value *KernelString = createPTXKernelFunction(F);
> +  Value *EntryName = getPTXKernelEntryName(F);
> +  createCallGetPTXModule(KernelString, ModuleSlot);
> +  LoadInst *ModuleHandle = Builder.CreateLoad(ModuleSlot, "cumodule");
> +  createCallGetPTXKernelEntry(EntryName, ModuleHandle, KernelSlot);
> +
> +  // Allocate the device buffer together with its host counterpart.
> +  createCallAllocateMemoryForHostAndDevice(HostDataSlot, DevDataSlot,
> +                                           getOutputArraySizeInBytes());
> +
> +  // Load the device buffer and the kernel handle; set the execution
> +  // parameters of the kernel.
> +  LoadInst *DeviceBuffer = Builder.CreateLoad(DevDataSlot, "device_data");
> +  LoadInst *KernelHandle = Builder.CreateLoad(KernelSlot, "cukernel");
> +  createCallSetKernelParameters(KernelHandle, getCUDABlockWidth(),
> +                                getCUDABlockHeight(), DeviceBuffer);
> +
> +  // Set up both timer events and record the start time.
> +  createCallStartTimerByCudaEvent(StartEventSlot, StopEventSlot);
> +
> +  // Run the kernel.
> +  createCallLaunchKernel(KernelHandle, getCUDAGridWidth(),
> +                         getCUDAGridHeight());
> +
> +  // Transfer the result from the device back to the host.
> +  LoadInst *HostBuffer = Builder.CreateLoad(HostDataSlot, "host_data");
> +  createCallCopyFromDeviceToHost(HostBuffer, DeviceBuffer,
> +                                 getOutputArraySizeInBytes());
> +
> +  // Record the end time.
> +  LoadInst *StartEvent = Builder.CreateLoad(StartEventSlot, "start_timer");
> +  LoadInst *StopEvent = Builder.CreateLoad(StopEventSlot, "stop_timer");
> +  createCallStopTimerByCudaEvent(StartEvent, StopEvent, ElapsedTimeSlot);
> +
> +  // Release everything acquired above.
> +  LoadInst *ContextHandle = Builder.CreateLoad(ContextSlot, "cucontext");
> +  createCallCleanupGPGPUResources(HostBuffer, DeviceBuffer, ModuleHandle,
> +                                  ContextHandle, KernelHandle);
> +
> +  // The kernel now only lives in the kernel string: drop the device
> +  // subfunction and the ptx intrinsics from the current module.
> +  eraseUnusedFunctions(F);
> +}
> +#endif /* GPU_CODEGEN */
>
> Added: polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.c
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.c?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.c (added)
> +++ polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.c Fri Aug  3 07:50:07 2012
> @@ -0,0 +1,16 @@
> +int A[128][128];
> +
> +int gpu_pure() {  /* fills the 128x128 global A; always returns 0 */
> +  int i,j;
> +
> +  for(i = 0; i < 128; i++)
> +    for(j = 0; j < 128; j++)
> +      A[i][j] = i*128 + j;  /* the value depends only on i and j */
> +
> +  return 0;
> +}
> +
> +int main() {
> +  int b = gpu_pure();  /* b is never read; main returns 0 regardless */
> +  return 0;
> +}
>
> Added: polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.ll
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.ll?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.ll (added)
> +++ polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.ll Fri Aug  3 07:50:07 2012
> @@ -0,0 +1,65 @@
> +; RUN: opt %loadPolly -basicaa -polly-import-jscop -polly-import-jscop-dir=%S -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen %s -S | FileCheck %s
> +; ModuleID = '2d_innermost_parallel.s'
> +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
> +target triple = "x86_64-unknown-linux-gnu"
> +
> +@A = common global [128 x [128 x i32]] zeroinitializer, align 16
> +
> +define i32 @gpu_pure() nounwind uwtable {
> +entry:
> +  br label %for.cond
> +
> +for.cond:                                         ; preds = %for.inc6, %entry
> +  %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc6 ], [ 0, %entry ]
> +  %lftr.wideiv5 = trunc i64 %indvars.iv2 to i32
> +  %exitcond6 = icmp ne i32 %lftr.wideiv5, 128
> +  br i1 %exitcond6, label %for.body, label %for.end8
> +
> +for.body:                                         ; preds = %for.cond
> +  br label %for.cond1
> +
> +for.cond1:                                        ; preds = %for.inc, %for.body
> +  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body ]
> +  %lftr.wideiv = trunc i64 %indvars.iv to i32
> +  %exitcond = icmp ne i32 %lftr.wideiv, 128
> +  br i1 %exitcond, label %for.body3, label %for.end
> +
> +for.body3:                                        ; preds = %for.cond1
> +  %tmp = shl nsw i64 %indvars.iv2, 7
> +  %tmp7 = add nsw i64 %tmp, %indvars.iv
> +  %arrayidx5 = getelementptr inbounds [128 x [128 x i32]]* @A, i64 0, i64 %indvars.iv2, i64 %indvars.iv
> +  %tmp8 = trunc i64 %tmp7 to i32
> +  store i32 %tmp8, i32* %arrayidx5, align 4
> +  br label %for.inc
> +
> +for.inc:                                          ; preds = %for.body3
> +  %indvars.iv.next = add i64 %indvars.iv, 1
> +  br label %for.cond1
> +
> +for.end:                                          ; preds = %for.cond1
> +  br label %for.inc6
> +
> +for.inc6:                                         ; preds = %for.end
> +  %indvars.iv.next3 = add i64 %indvars.iv2, 1
> +  br label %for.cond
> +
> +for.end8:                                         ; preds = %for.cond
> +  ret i32 0
> +}
> +
> +define i32 @main() nounwind uwtable {
> +entry:
> +  %call = call i32 @gpu_pure()
> +  ret i32 0
> +}
> +
> +; CHECK:  call void @polly_initDevice
> +; CHECK:  call void @polly_getPTXModule
> +; CHECK:  call void @polly_getPTXKernelEntry
> +; CHECK:  call void @polly_allocateMemoryForHostAndDevice
> +; CHECK:  call void @polly_setKernelParameters
> +; CHECK:  call void @polly_startTimerByCudaEvent
> +; CHECK:  call void @polly_launchKernel
> +; CHECK:  call void @polly_copyFromDeviceToHost
> +; CHECK:  call void @polly_stopTimerByCudaEvent
> +; CHECK:  call void @polly_cleanupGPGPUResources
>
> Added: polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.c
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.c?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.c (added)
> +++ polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.c Fri Aug  3 07:50:07 2012
> @@ -0,0 +1,17 @@
> +int A[128][128];
> +
> +int gpu_no_pure() {  /* updates the 128x128 global A; always returns 0 */
> +  int i,j,k;
> +
> +  for(i = 0; i < 128; i++)
> +    for(j = 0; j < 128; j++)
> +      for(k = 0; k < 256; k++)  /* every k iteration updates the same A[i][j] */
> +        A[i][j] += i*123/(k+1)+5-j*k-123;
> +
> +  return 0;
> +}
> +
> +int main() {
> +  int b = gpu_no_pure();  /* b is never read; main returns 0 regardless */
> +  return 0;
> +}
>
> Added: polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll (added)
> +++ polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll Fri Aug  3 07:50:07 2012
> @@ -0,0 +1,88 @@
> +; RUN: opt %loadPolly -basicaa -polly-import-jscop -polly-import-jscop-dir=%S -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen %s -S | FileCheck %s
> +; ModuleID = '3d_innermost_non_parallel.s'
> +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
> +target triple = "x86_64-unknown-linux-gnu"
> +
> +@A = common global [128 x [128 x i32]] zeroinitializer, align 16
> +
> +define i32 @gpu_no_pure() nounwind uwtable {
> +entry:
> +  br label %for.cond
> +
> +for.cond:                                         ; preds = %for.inc16, %entry
> +  %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc16 ], [ 0, %entry ]
> +  %lftr.wideiv5 = trunc i64 %indvars.iv2 to i32
> +  %exitcond6 = icmp ne i32 %lftr.wideiv5, 128
> +  br i1 %exitcond6, label %for.body, label %for.end18
> +
> +for.body:                                         ; preds = %for.cond
> +  br label %for.cond1
> +
> +for.cond1:                                        ; preds = %for.inc13, %for.body
> +  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc13 ], [ 0, %for.body ]
> +  %lftr.wideiv = trunc i64 %indvars.iv to i32
> +  %exitcond1 = icmp ne i32 %lftr.wideiv, 128
> +  br i1 %exitcond1, label %for.body3, label %for.end15
> +
> +for.body3:                                        ; preds = %for.cond1
> +  br label %for.cond4
> +
> +for.cond4:                                        ; preds = %for.inc, %for.body3
> +  %k.0 = phi i32 [ 0, %for.body3 ], [ %inc, %for.inc ]
> +  %exitcond = icmp ne i32 %k.0, 256
> +  br i1 %exitcond, label %for.body6, label %for.end
> +
> +for.body6:                                        ; preds = %for.cond4
> +  %tmp = mul nsw i64 %indvars.iv2, 123
> +  %add = add nsw i32 %k.0, 1
> +  %tmp7 = trunc i64 %tmp to i32
> +  %div = sdiv i32 %tmp7, %add
> +  %add7 = add nsw i32 %div, 5
> +  %tmp8 = trunc i64 %indvars.iv to i32
> +  %mul8 = mul nsw i32 %tmp8, %k.0
> +  %sub = sub nsw i32 %add7, %mul8
> +  %sub9 = add nsw i32 %sub, -123
> +  %arrayidx11 = getelementptr inbounds [128 x [128 x i32]]* @A, i64 0, i64 %indvars.iv2, i64 %indvars.iv
> +  %tmp9 = load i32* %arrayidx11, align 4
> +  %add12 = add nsw i32 %tmp9, %sub9
> +  store i32 %add12, i32* %arrayidx11, align 4
> +  br label %for.inc
> +
> +for.inc:                                          ; preds = %for.body6
> +  %inc = add nsw i32 %k.0, 1
> +  br label %for.cond4
> +
> +for.end:                                          ; preds = %for.cond4
> +  br label %for.inc13
> +
> +for.inc13:                                        ; preds = %for.end
> +  %indvars.iv.next = add i64 %indvars.iv, 1
> +  br label %for.cond1
> +
> +for.end15:                                        ; preds = %for.cond1
> +  br label %for.inc16
> +
> +for.inc16:                                        ; preds = %for.end15
> +  %indvars.iv.next3 = add i64 %indvars.iv2, 1
> +  br label %for.cond
> +
> +for.end18:                                        ; preds = %for.cond
> +  ret i32 0
> +}
> +
> +define i32 @main() nounwind uwtable {
> +entry:
> +  %call = call i32 @gpu_no_pure()
> +  ret i32 0
> +}
> +
> +; CHECK:  call void @polly_initDevice
> +; CHECK:  call void @polly_getPTXModule
> +; CHECK:  call void @polly_getPTXKernelEntry
> +; CHECK:  call void @polly_allocateMemoryForHostAndDevice
> +; CHECK:  call void @polly_setKernelParameters
> +; CHECK:  call void @polly_startTimerByCudaEvent
> +; CHECK:  call void @polly_launchKernel
> +; CHECK:  call void @polly_copyFromDeviceToHost
> +; CHECK:  call void @polly_stopTimerByCudaEvent
> +; CHECK:  call void @polly_cleanupGPGPUResources
>
> Added: polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%25for.cond---%25for.end18.jscop?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop (added)
> +++ polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop Fri Aug  3 07:50:07 2012
> @@ -0,0 +1,21 @@
> +{
> +   "context" : "{  :  }",
> +   "name" : "for.cond => for.end18",
> +   "statements" : [
> +      {
> +         "accesses" : [
> +            {
> +               "kind" : "read",
> +               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
> +            },
> +            {
> +               "kind" : "write",
> +               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
> +            }
> +         ],
> +         "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 and i2 >= 0 and i2 <= 255 }",
> +         "name" : "Stmt_for_body6",
> +         "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, i0, 0, i1, 0, i2, 0] }"
> +      }
> +   ]
> +}
>
> Added: polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%25for.cond---%25for.end18.jscop.transformed%2Bgpu?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu (added)
> +++ polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu Fri Aug  3 07:50:07 2012
> @@ -0,0 +1,21 @@
> +{
> +   "context" : "{  :  }",
> +   "name" : "for.cond => for.end18",
> +   "statements" : [
> +      {
> +         "accesses" : [
> +            {
> +               "kind" : "read",
> +               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
> +            },
> +            {
> +               "kind" : "write",
> +               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
> +            }
> +         ],
> +         "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 and i2 >= 0 and i2 <= 255 }",
> +         "name" : "Stmt_for_body6",
> +         "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, o0, o1, o2, o3, i2, 0] : o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and i0 = 16o0 + o1 and i1 = 16o2 + o3 }"
> +      }
> +   ]
> +}
>
> Added: polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/gpu_pure___%25for.cond---%25for.end8.jscop?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop (added)
> +++ polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop Fri Aug  3 07:50:07 2012
> @@ -0,0 +1,17 @@
> +{
> +   "context" : "{  :  }",
> +   "name" : "for.cond => for.end8",
> +   "statements" : [
> +      {
> +         "accesses" : [
> +            {
> +               "kind" : "write",
> +               "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[128i0 + i1] }"
> +            }
> +         ],
> +         "domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 }",
> +         "name" : "Stmt_for_body3",
> +         "schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, i0, 0, i1, 0] }"
> +      }
> +   ]
> +}
>
> Added: polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/gpu_pure___%25for.cond---%25for.end8.jscop.transformed%2Bgpu?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu (added)
> +++ polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu Fri Aug  3 07:50:07 2012
> @@ -0,0 +1,17 @@
> +{
> +   "context" : "{  :  }",
> +   "name" : "for.cond => for.end8",
> +   "statements" : [
> +      {
> +         "accesses" : [
> +            {
> +               "kind" : "write",
> +               "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[128i0 + i1] }"
> +            }
> +         ],
> +         "domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 }",
> +         "name" : "Stmt_for_body3",
> +         "schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, o0, o1, o2, o3]: o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and i0 = 16o0 + o1 and i1 = 16o2 + o3 }"
> +      }
> +   ]
> +}
>
> Added: polly/trunk/test/CodeGen/GPGPU/lit.local.cfg
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/lit.local.cfg?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/test/CodeGen/GPGPU/lit.local.cfg (added)
> +++ polly/trunk/test/CodeGen/GPGPU/lit.local.cfg Fri Aug  3 07:50:07 2012
> @@ -0,0 +1,5 @@
> +config.suffixes = ['.ll']
> +
> +gpgpu = config.root.enable_gpgpu_codegen
> +if gpgpu not in ['TRUE', 'true'] :
> +    config.unsupported = True
>
> Modified: polly/trunk/test/lit.site.cfg.in
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/lit.site.cfg.in?rev=161239&r1=161238&r2=161239&view=diff
> ==============================================================================
> --- polly/trunk/test/lit.site.cfg.in (original)
> +++ polly/trunk/test/lit.site.cfg.in Fri Aug  3 07:50:07 2012
> @@ -7,6 +7,7 @@
>  config.polly_obj_root = "@POLLY_BINARY_DIR@"
>  config.polly_lib_dir = "@POLLY_LIB_DIR@"
>  config.target_triple = "@TARGET_TRIPLE@"
> +config.enable_gpgpu_codegen = "@CUDALIB_FOUND@"
>  lit.params['build_config'] = "@POLLY_SOURCE_DIR@/test"
>
>  ## Check the current platform with regex
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits

-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum