[llvm-commits] [polly] r161239 - in /polly/trunk: ./ autoconf/ include/polly/ include/polly/CodeGen/ include/polly/Config/ lib/CodeGen/ test/ test/CodeGen/GPGPU/
Sebastian Pop
spop at codeaurora.org
Mon Aug 20 22:24:41 PDT 2012
Hi,
On Fri, Aug 3, 2012 at 7:50 AM, Tobias Grosser
<grosser at fim.uni-passau.de> wrote:
> Author: grosser
> Date: Fri Aug 3 07:50:07 2012
> New Revision: 161239
>
> URL: http://llvm.org/viewvc/llvm-project?rev=161239&view=rev
> Log:
> Add preliminary implementation for GPGPU code generation.
>
> Translate the selected parallel loop body into a ptx string and run it with the
> cuda driver API. We limit this preliminary implementation to target the
> following special test cases:
>
> - Support only 2-dimensional parallel loops with or without only one innermost
> non-parallel loop.
> - Support write memory access to only one array in a SCoP.
>
> The patch was committed with smaller changes to the build system:
>
> There is now a flag to enable gpu code generation explicitly. This was required
> as we need the llvm.codegen() patch applied on the llvm sources, to compile this
> feature correctly. Also, enabling gpu code generation does not require cuda.
> This requirement was removed to allow 'make polly-test' runs, even without an
> installed cuda runtime.
>
> Contributed by: Yabin Hu <yabin.hwu at gmail.com>
>
> Added:
> polly/trunk/include/polly/CodeGen/PTXGenerator.h
> polly/trunk/lib/CodeGen/PTXGenerator.cpp
> polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.c
> polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.ll
> polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.c
> polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll
> polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop
> polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu
> polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop
> polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu
> polly/trunk/test/CodeGen/GPGPU/lit.local.cfg
> Modified:
> polly/trunk/CMakeLists.txt
> polly/trunk/autoconf/configure.ac
> polly/trunk/configure
> polly/trunk/include/polly/Config/config.h.cmake
> polly/trunk/include/polly/Config/config.h.in
> polly/trunk/include/polly/ScopInfo.h
> polly/trunk/lib/CodeGen/CMakeLists.txt
> polly/trunk/lib/CodeGen/CodeGeneration.cpp
> polly/trunk/lib/CodeGen/Makefile
> polly/trunk/test/lit.site.cfg.in
>
> Modified: polly/trunk/CMakeLists.txt
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/CMakeLists.txt?rev=161239&r1=161238&r2=161239&view=diff
> ==============================================================================
> --- polly/trunk/CMakeLists.txt (original)
> +++ polly/trunk/CMakeLists.txt Fri Aug 3 07:50:07 2012
> @@ -75,7 +75,14 @@
> FIND_PACKAGE(Isl REQUIRED)
> FIND_PACKAGE(Gmp REQUIRED)
> FIND_PACKAGE(Pluto)
> -FIND_PACKAGE(CUDA)
> +
> +option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature" OFF)
> +if (POLLY_ENABLE_GPGPU_CODEGEN)
> + # Do not require CUDA, as GPU code generation test cases can be run without
> + # a cuda library.
> + FIND_PACKAGE(CUDA)
> + set(GPU_CODEGEN TRUE)
> +endif(POLLY_ENABLE_GPGPU_CODEGEN)
>
> option(POLLY_ENABLE_OPENSCOP "Enable Openscop library for scop import/export" ON)
> if (POLLY_ENABLE_OPENSCOP)
>
> Modified: polly/trunk/autoconf/configure.ac
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/autoconf/configure.ac?rev=161239&r1=161238&r2=161239&view=diff
> ==============================================================================
> --- polly/trunk/autoconf/configure.ac (original)
> +++ polly/trunk/autoconf/configure.ac Fri Aug 3 07:50:07 2012
> @@ -120,7 +120,20 @@
> AC_SUBST(scoplib_rpath)
>
> dnl Check if CUDA lib there
> +dnl Disable the build of polly, even if it is checked out into tools/polly.
> +AC_ARG_ENABLE(polly_gpu_codegen,
> + AS_HELP_STRING([--enable-polly-gpu-codegen],
> + [Enable GPU code generation in Polly(default is NO)]),,
> + enableval=default)
> +case "$enableval" in
> + yes) AC_DEFINE([GPU_CODEGEN],[1], [Define if gpu codegen is enabled]) ;;
> + no) AC_DEFINE([GPU_CODEGEN],[0], [Define if gpu codegen is enabled]) ;;
> + default) AC_DEFINE([GPU_CODEGEN],[0], [Define if gpu codegen is enabled]) ;;
In all cases, GPU_CODEGEN ends up defined, as either 0 or 1.
(By the way, the description string is accurate only by accident: because
the macro is always defined, GPU codegen is always enabled...
see below...)
> + *) AC_MSG_ERROR([Invalid setting for --enable-polly-gpu-codegen. Use "yes" or "no"]) ;;
> +esac
> +
> find_lib_and_headers([cuda], [cuda.h], [cuda])
> +
> AS_IF([test "x$cuda_found" = "xyes"],
> [AC_DEFINE([CUDALIB_FOUND],[1],[Define if cudalib found])])
>
>
> Modified: polly/trunk/configure
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/configure?rev=161239&r1=161238&r2=161239&view=diff
> ==============================================================================
> --- polly/trunk/configure (original)
> +++ polly/trunk/configure Fri Aug 3 07:50:07 2012
> @@ -654,6 +654,7 @@
> with_pluto
> with_openscop
> with_scoplib
> +enable_polly_gpu_codegen
> with_cuda
> '
> ac_precious_vars='build_alias
> @@ -1272,6 +1273,13 @@
> esac
> cat <<\_ACEOF
>
> +Optional Features:
> + --disable-option-checking ignore unrecognized --enable/--with options
> + --disable-FEATURE do not include FEATURE (same as --enable-FEATURE=no)
> + --enable-FEATURE[=ARG] include FEATURE [ARG=yes]
> + --enable-polly-gpu-codegen
> + Enable GPU code generation in Polly(default is NO)
> +
> Optional Packages:
> --with-PACKAGE[=ARG] use PACKAGE [ARG=yes]
> --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no)
> @@ -3002,6 +3010,26 @@
>
>
>
> +# Check whether --enable-polly_gpu_codegen was given.
> +if test "${enable_polly_gpu_codegen+set}" = set; then :
> + enableval=$enable_polly_gpu_codegen;
> +else
> + enableval=default
> +fi
> +
> +case "$enableval" in
> + yes)
> +$as_echo "#define GPU_CODEGEN 1" >>confdefs.h
> + ;;
> + no)
> +$as_echo "#define GPU_CODEGEN 0" >>confdefs.h
> + ;;
> + default)
> +$as_echo "#define GPU_CODEGEN 0" >>confdefs.h
> + ;;
> + *) as_fn_error $? "Invalid setting for --enable-polly-gpu-codegen. Use \"yes\" or \"no\"" "$LINENO" 5 ;;
> +esac
> +
>
> ac_ext=cpp
> ac_cpp='$CXXCPP $CPPFLAGS'
> @@ -3081,6 +3109,7 @@
> ac_compiler_gnu=$ac_cv_c_compiler_gnu
>
>
> +
> if test "x$cuda_found" = "xyes"; then :
>
> $as_echo "#define CUDALIB_FOUND 1" >>confdefs.h
>
> Added: polly/trunk/include/polly/CodeGen/PTXGenerator.h
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/CodeGen/PTXGenerator.h?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/include/polly/CodeGen/PTXGenerator.h (added)
> +++ polly/trunk/include/polly/CodeGen/PTXGenerator.h Fri Aug 3 07:50:07 2012
> @@ -0,0 +1,197 @@
> +//===- PTXGenerator.h - IR helper to create GPGPU LLVM-IR -------*- C++ -*-===//
> +//
> +// The LLVM Compiler Infrastructure
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
> +//===----------------------------------------------------------------------===//
> +//
> +// This file contains functions to create GPGPU parallel loops as LLVM-IR.
> +//
> +//===----------------------------------------------------------------------===//
> +#ifndef POLLY_CODEGEN_PTXGENERATOR_H
> +#define POLLY_CODEGEN_PTXGENERATOR_H
> +
> +#include "polly/Config/config.h"
> +
> +#ifdef GPU_CODEGEN
And here you test whether GPU_CODEGEN is defined, which is always true:
it is defined as either 0 or 1.
However, this code never tests whether the value is 0 or 1.
This breaks builds of Polly that use configure when the rest of the
PTX toolchain is not available.
Sebastian
> +#include "llvm/IRBuilder.h"
> +#include "llvm/ADT/SetVector.h"
> +
> +#include <map>
> +
> +namespace llvm {
> + class Value;
> + class Pass;
> + class BasicBlock;
> +}
> +
> +namespace polly {
> +using namespace llvm;
> +
> +class PTXGenerator {
> +public:
> + typedef std::map<Value*, Value*> ValueToValueMapTy;
> +
> + PTXGenerator(IRBuilder<> &Builder, Pass *P, const std::string &Triple);
> +
> + /// @brief Create a GPGPU parallel loop.
> + ///
> + /// @param UsedValues A set of LLVM-IR Values that should be available to
> + /// the new loop body.
> + /// @param OriginalIVS The new values of the original induction variables.
> + /// @param VMap This map is filled by createParallelLoop(). It
> + /// maps the values in UsedValues to Values through which
> + /// their content is available within the loop body.
> + /// @param LoopBody A pointer to an iterator that is set to point to the
> + /// body of the created loop. It should be used to insert
> + /// instructions that form the actual loop body.
> + void startGeneration(SetVector<Value*> &UsedValues,
> + SetVector<Value*> &OriginalIVS, ValueToValueMapTy &VMap,
> + BasicBlock::iterator *LoopBody);
> +
> + /// @brief Execute the post-operations to build a GPGPU parallel loop.
> + ///
> + void finishGeneration(Function *SubFunction);
> +
> + /// @brief Set the parameters for launching PTX kernel.
> + ///
> + /// @param GridW A value of the width of a GPU grid.
> + /// @param GridH A value of the height of a GPU grid.
> + /// @param BlockW A value of the width of a GPU block.
> + /// @param BlockH A value of the height of a GPU block.
> + void setLaunchingParameters(int GridW, int GridH, int BlockW, int BlockH) {
> + GridWidth = GridW;
> + GridHeight = GridH;
> + BlockWidth = BlockW;
> + BlockHeight = BlockH;
> + }
> +
> + /// @brief Set the size of the output array.
> + ///
> + /// This size is used to allocate memory on the device and the host.
> + ///
> + /// @param Bytes Output array size in bytes.
> + void setOutputBytes(unsigned Bytes) {
> + OutputBytes = Bytes;
> + }
> +
> +private:
> + IRBuilder<> &Builder;
> + Pass *P;
> +
> + /// @brief The target triple of the device.
> + const std::string &GPUTriple;
> +
> + ///@brief Parameters used for launching PTX kernel.
> + int GridWidth, GridHeight, BlockWidth, BlockHeight;
> +
> + /// @brief Size of the output array in bytes.
> + unsigned OutputBytes;
> +
> + /// @brief Polly's GPU data types.
> + StructType *ContextTy, *ModuleTy, *KernelTy, *DeviceTy, *DevDataTy, *EventTy;
> +
> + void InitializeGPUDataTypes();
> + IntegerType *getInt64Type(); // i64
> + PointerType *getI8PtrType(); // char *
> + PointerType *getPtrI8PtrType(); // char **
> + PointerType *getFloatPtrType(); // float *
> + PointerType *getGPUContextPtrType(); // %struct.PollyGPUContextT *
> + PointerType *getGPUModulePtrType(); // %struct.PollyGPUModuleT *
> + PointerType *getGPUDevicePtrType(); // %struct.PollyGPUDeviceT *
> + PointerType *getPtrGPUDevicePtrType(); // %struct.PollyGPUDevicePtrT *
> + PointerType *getGPUFunctionPtrType(); // %struct.PollyGPUFunctionT *
> + PointerType *getGPUEventPtrType(); // %struct.PollyGPUEventT *
> +
> + Module *getModule();
> +
> + /// @brief Create the kernel string containing LLVM IR.
> + ///
> + /// @param SubFunction A pointer to the device code function.
> + /// @return A global string variable containing the LLVM IR codes
> + // of the SubFunction.
> + Value *createPTXKernelFunction(Function *SubFunction);
> +
> + /// @brief Get the entry name of the device kernel function.
> + ///
> + /// @param SubFunction A pointer to the device code function.
> + /// @return A global string variable containing the entry name of
> + /// the SubFunction.
> + Value *getPTXKernelEntryName(Function *SubFunction);
> +
> + void createCallInitDevice(Value *Context, Value *Device);
> + void createCallGetPTXModule(Value *Buffer, Value *Module);
> + void createCallGetPTXKernelEntry(Value *Entry, Value *Module,
> + Value *Kernel);
> + void createCallAllocateMemoryForHostAndDevice(Value *HostData,
> + Value *DeviceData,
> + Value *Size);
> + void createCallCopyFromHostToDevice(Value *DeviceData, Value *HostData,
> + Value *Size);
> + void createCallCopyFromDeviceToHost(Value *HostData, Value *DeviceData,
> + Value *Size);
> + void createCallSetKernelParameters(Value *Kernel, Value *BlockWidth,
> + Value *BlockHeight, Value *DeviceData);
> + void createCallLaunchKernel(Value *Kernel, Value *GridWidth,
> + Value *GridHeight);
> + void createCallStartTimerByCudaEvent(Value *StartEvent,
> + Value *StopEvent);
> + void createCallStopTimerByCudaEvent(Value *StartEvent, Value *StopEvent,
> + Value *Timer);
> + void createCallCleanupGPGPUResources(Value *HostData, Value *DeviceData,
> + Value *Module, Value *Context,
> + Value *Kernel);
> +
> + /// @brief Create the CUDA subfunction.
> + ///
> + /// @param UsedValues A set of LLVM-IR Values that should be available to
> + /// the new loop body.
> + /// @param VMap This map that is filled by createSubfunction(). It
> + /// maps the values in UsedValues to Values through which
> + /// their content is available within the loop body.
> + /// @param OriginalIVS The new values of the original induction variables.
> + /// @param SubFunction The newly created SubFunction is returned here.
> + void createSubfunction(SetVector<Value*> &UsedValues,
> + SetVector<Value*> &OriginalIVS,
> + ValueToValueMapTy &VMap,
> + Function **SubFunction);
> +
> + /// @brief Create the definition of the CUDA subfunction.
> + ///
> + /// @param NumArgs The number of parameters of this subfunction. This is
> + /// usually set to the number of memory accesses which
> + /// will be copied from host to device.
> + Function *createSubfunctionDefinition(int NumArgs);
> +
> + /// @brief Extract all the ptx related subfunctions into a new module.
> + ///
> + /// @param M Current module.
> + /// @return The generated module containing only gpu related
> + /// subfunctions.
> + Module *extractPTXFunctionsFromModule(const Module *M);
> +
> + /// @brief Get the Value of CUDA block width.
> + Value *getCUDABlockWidth();
> +
> + /// @brief Get the Value of CUDA block height.
> + Value *getCUDABlockHeight();
> +
> + /// @brief Get the Value of CUDA Gird width.
> + Value *getCUDAGridWidth();
> +
> + /// @brief Get the Value of CUDA grid height.
> + Value *getCUDAGridHeight();
> +
> + /// @brief Get the Value of the bytes of the output array.
> + Value *getOutputArraySizeInBytes();
> +
> + /// @brief Erase the ptx-related subfunctions and declarations.
> + ///
> + /// @param SubFunction A pointer to the device code function.
> + void eraseUnusedFunctions(Function *SubFunction);
> +};
> +} // end namespace polly
> +#endif /* GPU_CODEGEN */
> +#endif /* POLLY_CODEGEN_PTXGENERATOR_H */
>
> Modified: polly/trunk/include/polly/Config/config.h.cmake
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/Config/config.h.cmake?rev=161239&r1=161238&r2=161239&view=diff
> ==============================================================================
> --- polly/trunk/include/polly/Config/config.h.cmake (original)
> +++ polly/trunk/include/polly/Config/config.h.cmake Fri Aug 3 07:50:07 2012
> @@ -19,5 +19,6 @@
> #cmakedefine PLUTO_FOUND
> #cmakedefine SCOPLIB_FOUND
> #cmakedefine CUDALIB_FOUND
> +#cmakedefine GPU_CODEGEN
>
> #endif
>
> Modified: polly/trunk/include/polly/Config/config.h.in
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/Config/config.h.in?rev=161239&r1=161238&r2=161239&view=diff
> ==============================================================================
> --- polly/trunk/include/polly/Config/config.h.in (original)
> +++ polly/trunk/include/polly/Config/config.h.in Fri Aug 3 07:50:07 2012
> @@ -9,6 +9,9 @@
> /* Define if cudalib found */
> #undef CUDALIB_FOUND
>
> +/* Define if gpu codegen is enabled */
> +#undef GPU_CODEGEN
> +
> /* Define if ISL has a code generator */
> #undef ISL_CODEGEN_FOUND
>
>
> Modified: polly/trunk/include/polly/ScopInfo.h
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/ScopInfo.h?rev=161239&r1=161238&r2=161239&view=diff
> ==============================================================================
> --- polly/trunk/include/polly/ScopInfo.h (original)
> +++ polly/trunk/include/polly/ScopInfo.h Fri Aug 3 07:50:07 2012
> @@ -125,6 +125,9 @@
> /// @brief Is this a read memory access?
> bool isRead() const { return Type == MemoryAccess::Read; }
>
> + /// @brief Is this a write memory access?
> + bool isWrite() const { return Type == MemoryAccess::Write; }
> +
> isl_map *getAccessRelation() const;
>
> /// @brief Get an isl string representing this access function.
>
> Modified: polly/trunk/lib/CodeGen/CMakeLists.txt
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/CMakeLists.txt?rev=161239&r1=161238&r2=161239&view=diff
> ==============================================================================
> --- polly/trunk/lib/CodeGen/CMakeLists.txt (original)
> +++ polly/trunk/lib/CodeGen/CMakeLists.txt Fri Aug 3 07:50:07 2012
> @@ -9,10 +9,16 @@
> IslCodeGeneration.cpp)
> endif (ISL_CODEGEN_FOUND)
>
> +if (GPU_CODEGEN)
> + set (GPGPU_CODEGEN_FILES
> + PTXGenerator.cpp)
> +endif (GPU_CODEGEN)
> +
> add_polly_library(PollyCodeGen
> BlockGenerators.cpp
> ${CLOOG_FILES}
> ${ISL_CODEGEN_FILES}
> LoopGenerators.cpp
> Utils.cpp
> + ${GPGPU_CODEGEN_FILES}
> )
>
> Modified: polly/trunk/lib/CodeGen/CodeGeneration.cpp
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/CodeGeneration.cpp?rev=161239&r1=161238&r2=161239&view=diff
> ==============================================================================
> --- polly/trunk/lib/CodeGen/CodeGeneration.cpp (original)
> +++ polly/trunk/lib/CodeGen/CodeGeneration.cpp Fri Aug 3 07:50:07 2012
> @@ -31,6 +31,7 @@
> #include "polly/CodeGen/CodeGeneration.h"
> #include "polly/CodeGen/BlockGenerators.h"
> #include "polly/CodeGen/LoopGenerators.h"
> +#include "polly/CodeGen/PTXGenerator.h"
> #include "polly/CodeGen/Utils.h"
> #include "polly/Support/GICHelper.h"
>
> @@ -65,6 +66,19 @@
> cl::value_desc("OpenMP code generation enabled if true"),
> cl::init(false), cl::ZeroOrMore);
>
> +#ifdef GPU_CODEGEN
> +static cl::opt<bool>
> +GPGPU("enable-polly-gpgpu",
> + cl::desc("Generate GPU parallel code"), cl::Hidden,
> + cl::value_desc("GPGPU code generation enabled if true"),
> + cl::init(false), cl::ZeroOrMore);
> +
> +static cl::opt<std::string>
> +GPUTriple("polly-gpgpu-triple",
> + cl::desc("Target triple for GPU code generation"),
> + cl::Hidden, cl::init(""));
> +#endif /* GPU_CODEGEN */
> +
> static cl::opt<bool>
> AtLeastOnce("enable-polly-atLeastOnce",
> cl::desc("Give polly the hint, that every loop is executed at least"
> @@ -284,6 +298,27 @@
> /// statement.
> void codegenForOpenMP(const clast_for *f);
>
> +#ifdef GPU_CODEGEN
> + /// @brief Create GPGPU device memory access values.
> + ///
> + /// Create a list of values that will be set to be parameters of the GPGPU
> + /// subfunction. These parameters represent device memory base addresses
> + /// and the size in bytes.
> + SetVector<Value*> getGPUValues(unsigned &OutputBytes);
> +
> + /// @brief Create a GPU parallel for loop.
> + ///
> + /// This loop reflects a loop as if it would have been created by a GPU
> + /// statement.
> + void codegenForGPGPU(const clast_for *F);
> +
> + /// @brief Get innermost for loop.
> + const clast_stmt *getScheduleInfo(const clast_for *F,
> + std::vector<int> &NumIters,
> + unsigned &LoopDepth,
> + unsigned &NonPLoopDepth);
> +#endif /* GPU_CODEGEN */
> +
> /// @brief Check if a loop is parallel
> ///
> /// Detect if a clast_for loop can be executed in parallel.
> @@ -530,6 +565,163 @@
> Builder.SetInsertPoint(AfterLoop);
> }
>
> +#ifdef GPU_CODEGEN
> +static unsigned getArraySizeInBytes(const ArrayType *AT) {
> + unsigned Bytes = AT->getNumElements();
> + if (const ArrayType *T = dyn_cast<ArrayType>(AT->getElementType()))
> + Bytes *= getArraySizeInBytes(T);
> + else
> + Bytes *= AT->getElementType()->getPrimitiveSizeInBits() / 8;
> +
> + return Bytes;
> +}
> +
> +SetVector<Value*> ClastStmtCodeGen::getGPUValues(unsigned &OutputBytes) {
> + SetVector<Value*> Values;
> + OutputBytes = 0;
> +
> + // Record the memory reference base addresses.
> + for (Scop::iterator SI = S->begin(), SE = S->end(); SI != SE; ++SI) {
> + ScopStmt *Stmt = *SI;
> + for (SmallVector<MemoryAccess*, 8>::iterator I = Stmt->memacc_begin(),
> + E = Stmt->memacc_end(); I != E; ++I) {
> + Value *BaseAddr = const_cast<Value*>((*I)->getBaseAddr());
> + Values.insert((BaseAddr));
> +
> + // FIXME: we assume that there is one and only one array to be written
> + // in a SCoP.
> + int NumWrites = 0;
> + if ((*I)->isWrite()) {
> + ++NumWrites;
> + assert(NumWrites <= 1 &&
> + "We support at most one array to be written in a SCoP.");
> + if (const PointerType * PT =
> + dyn_cast<PointerType>(BaseAddr->getType())) {
> + Type *T = PT->getArrayElementType();
> + const ArrayType *ATy = dyn_cast<ArrayType>(T);
> + OutputBytes = getArraySizeInBytes(ATy);
> + }
> + }
> + }
> + }
> +
> + return Values;
> +}
> +
> +const clast_stmt *ClastStmtCodeGen::getScheduleInfo(const clast_for *F,
> + std::vector<int> &NumIters,
> + unsigned &LoopDepth,
> + unsigned &NonPLoopDepth) {
> + clast_stmt *Stmt = (clast_stmt *)F;
> + const clast_for *Result;
> + bool NonParaFlag = false;
> + LoopDepth = 0;
> + NonPLoopDepth = 0;
> +
> + while (Stmt) {
> + if (CLAST_STMT_IS_A(Stmt, stmt_for)) {
> + const clast_for *T = (clast_for *) Stmt;
> + if (isParallelFor(T)) {
> + if (!NonParaFlag) {
> + NumIters.push_back(getNumberOfIterations(T));
> + Result = T;
> + }
> + } else
> + NonParaFlag = true;
> +
> + Stmt = T->body;
> + LoopDepth++;
> + continue;
> + }
> + Stmt = Stmt->next;
> + }
> +
> + assert(NumIters.size() == 4 &&
> + "The loops should be tiled into 4-depth parallel loops and an "
> + "innermost non-parallel one (if exist).");
> + NonPLoopDepth = LoopDepth - NumIters.size();
> + assert(NonPLoopDepth <= 1
> + && "We support only one innermost non-parallel loop currently.");
> + return (const clast_stmt *)Result->body;
> +}
> +
> +void ClastStmtCodeGen::codegenForGPGPU(const clast_for *F) {
> + BasicBlock::iterator LoopBody;
> + SetVector<Value *> Values;
> + SetVector<Value *> IVS;
> + std::vector<int> NumIterations;
> + PTXGenerator::ValueToValueMapTy VMap;
> +
> + assert(!GPUTriple.empty()
> + && "Target triple should be set properly for GPGPU code generation.");
> + PTXGenerator PTXGen(Builder, P, GPUTriple);
> +
> + // Get original IVS and ScopStmt
> + unsigned TiledLoopDepth, NonPLoopDepth;
> + const clast_stmt *InnerStmt = getScheduleInfo(F, NumIterations,
> + TiledLoopDepth, NonPLoopDepth);
> + const clast_stmt *TmpStmt;
> + const clast_user_stmt *U;
> + const clast_for *InnerFor;
> + if (CLAST_STMT_IS_A(InnerStmt, stmt_for)) {
> + InnerFor = (const clast_for *)InnerStmt;
> + TmpStmt = InnerFor->body;
> + } else
> + TmpStmt = InnerStmt;
> + U = (const clast_user_stmt *) TmpStmt;
> + ScopStmt *Statement = (ScopStmt *) U->statement->usr;
> + for (unsigned i = 0; i < Statement->getNumIterators() - NonPLoopDepth; i++) {
> + const Value* IV = Statement->getInductionVariableForDimension(i);
> + IVS.insert(const_cast<Value *>(IV));
> + }
> +
> + unsigned OutBytes;
> + Values = getGPUValues(OutBytes);
> + PTXGen.setOutputBytes(OutBytes);
> + PTXGen.startGeneration(Values, IVS, VMap, &LoopBody);
> +
> + BasicBlock::iterator AfterLoop = Builder.GetInsertPoint();
> + Builder.SetInsertPoint(LoopBody);
> +
> + BasicBlock *AfterBB = 0;
> + if (NonPLoopDepth) {
> + Value *LowerBound, *UpperBound, *IV, *Stride;
> + Type *IntPtrTy = getIntPtrTy();
> + LowerBound = ExpGen.codegen(InnerFor->LB, IntPtrTy);
> + UpperBound = ExpGen.codegen(InnerFor->UB, IntPtrTy);
> + Stride = Builder.getInt(APInt_from_MPZ(InnerFor->stride));
> + IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB);
> + const Value *OldIV_ = Statement->getInductionVariableForDimension(2);
> + Value *OldIV = const_cast<Value *>(OldIV_);
> + VMap.insert(std::make_pair<Value*, Value*>(OldIV, IV));
> + }
> +
> + updateWithValueMap(VMap, /* reverse */ false);
> + BlockGenerator::generate(Builder, *Statement, ValueMap, P);
> + updateWithValueMap(VMap, /* reverse */ true);
> +
> + if (AfterBB)
> + Builder.SetInsertPoint(AfterBB->begin());
> +
> + // FIXME: The replacement of the host base address with the parameter of ptx
> + // subfunction should have been done by updateWithValueMap. We use the
> + // following codes to avoid affecting other parts of Polly. This should be
> + // fixed later.
> + Function *FN = Builder.GetInsertBlock()->getParent();
> + for (unsigned j = 0; j < Values.size(); j++) {
> + Value *baseAddr = Values[j];
> + for (Function::iterator B = FN->begin(); B != FN->end(); ++B) {
> + for (BasicBlock::iterator I = B->begin(); I != B->end(); ++I)
> + I->replaceUsesOfWith(baseAddr, ValueMap[baseAddr]);
> + }
> + }
> + Builder.SetInsertPoint(AfterLoop);
> + PTXGen.setLaunchingParameters(NumIterations[0], NumIterations[1],
> + NumIterations[2], NumIterations[3]);
> + PTXGen.finishGeneration(FN);
> +}
> +#endif
> +
> bool ClastStmtCodeGen::isInnermostLoop(const clast_for *f) {
> const clast_stmt *stmt = f->body;
>
> @@ -647,6 +839,18 @@
> }
> }
>
> +#ifdef GPU_CODEGEN
> + if (GPGPU && isParallelFor(f)) {
> + if (!parallelCodeGeneration) {
> + parallelCodeGeneration = true;
> + parallelLoops.push_back(f->iterator);
> + codegenForGPGPU(f);
> + parallelCodeGeneration = false;
> + return;
> + }
> + }
> +#endif
> +
> codegenForSequential(f);
> }
>
>
> Modified: polly/trunk/lib/CodeGen/Makefile
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/Makefile?rev=161239&r1=161238&r2=161239&view=diff
> ==============================================================================
> --- polly/trunk/lib/CodeGen/Makefile (original)
> +++ polly/trunk/lib/CodeGen/Makefile Fri Aug 3 07:50:07 2012
> @@ -10,6 +10,8 @@
>
> CPP.Flags += $(POLLY_INC)
>
> +include $(LEVEL)/Makefile.config
> +
> #
> # Include Makefile.common so we know what to do.
> #
>
> Added: polly/trunk/lib/CodeGen/PTXGenerator.cpp
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/PTXGenerator.cpp?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/lib/CodeGen/PTXGenerator.cpp (added)
> +++ polly/trunk/lib/CodeGen/PTXGenerator.cpp Fri Aug 3 07:50:07 2012
> @@ -0,0 +1,663 @@
> +//===------ PTXGenerator.cpp - IR helper to create loops -----------------===//
> +//
> +// The LLVM Compiler Infrastructure
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
> +//===----------------------------------------------------------------------===//
> +//
> +// This file contains functions to create GPU parallel codes as LLVM-IR.
> +//
> +//===----------------------------------------------------------------------===//
> +
> +#include "polly/CodeGen/PTXGenerator.h"
> +
> +#ifdef GPU_CODEGEN
> +#include "polly/ScopDetection.h"
> +#include "polly/ScopInfo.h"
> +
> +#include "llvm/Intrinsics.h"
> +#include "llvm/Module.h"
> +#include "llvm/PassManager.h"
> +#include "llvm/ADT/SetVector.h"
> +#include "llvm/Analysis/Dominators.h"
> +#include "llvm/Support/Debug.h"
> +#include "llvm/Support/FormattedStream.h"
> +#include "llvm/Support/TargetRegistry.h"
> +#include "llvm/Target/TargetData.h"
> +#include "llvm/Target/TargetMachine.h"
> +#include "llvm/Transforms/Utils/BasicBlockUtils.h"
> +#include "llvm/Transforms/Utils/Cloning.h"
> +
> +using namespace llvm;
> +using namespace polly;
> +
> +PTXGenerator::PTXGenerator(IRBuilder<> &Builder, Pass *P,
> + const std::string &Triple):
> + Builder(Builder), P(P), GPUTriple(Triple), GridWidth(1), GridHeight(1),
> + BlockWidth(1), BlockHeight(1), OutputBytes(0) {
> +
> + InitializeGPUDataTypes();
> +}
> +
> +Module *PTXGenerator::getModule() {
> + return Builder.GetInsertBlock()->getParent()->getParent();
> +}
> +
> +Function *PTXGenerator::createSubfunctionDefinition(int NumArgs) {
> + assert(NumArgs == 1 && "we support only one array access now.");
> +
> + Module *M = getModule();
> + Function *F = Builder.GetInsertBlock()->getParent();
> + std::vector<Type*> Arguments;
> + for (int i = 0; i < NumArgs; i++)
> + Arguments.push_back(Builder.getInt8PtrTy());
> + FunctionType *FT = FunctionType::get(Builder.getVoidTy(), Arguments, false);
> + Function *FN = Function::Create(FT, Function::InternalLinkage,
> + F->getName() + "_ptx_subfn", M);
> + FN->setCallingConv(CallingConv::PTX_Kernel);
> +
> + // Do not run any optimization pass on the new function.
> + P->getAnalysis<polly::ScopDetection>().markFunctionAsInvalid(FN);
> +
> + for (Function::arg_iterator AI = FN->arg_begin(); AI != FN->arg_end(); ++AI)
> + AI->setName("ptx.Array");
> +
> + return FN;
> +}
> +
> +void PTXGenerator::createSubfunction(SetVector<Value*> &UsedValues,
> + SetVector<Value*> &OriginalIVS,
> + PTXGenerator::ValueToValueMapTy &VMap,
> + Function **SubFunction) {
> + Function *FN = createSubfunctionDefinition(UsedValues.size());
> + Module *M = getModule();
> + LLVMContext &Context = FN->getContext();
> + IntegerType *Ty = Builder.getInt64Ty();
> +
> + // Store the previous basic block.
> + BasicBlock *PrevBB = Builder.GetInsertBlock();
> +
> + // Create basic blocks.
> + BasicBlock *HeaderBB = BasicBlock::Create(Context, "ptx.setup", FN);
> + BasicBlock *ExitBB = BasicBlock::Create(Context, "ptx.exit", FN);
> + BasicBlock *BodyBB = BasicBlock::Create(Context, "ptx.loop_body", FN);
> +
> + DominatorTree &DT = P->getAnalysis<DominatorTree>();
> + DT.addNewBlock(HeaderBB, PrevBB);
> + DT.addNewBlock(ExitBB, HeaderBB);
> + DT.addNewBlock(BodyBB, HeaderBB);
> +
> + Builder.SetInsertPoint(HeaderBB);
> +
> + // Insert VMap items with maps of array base address on the host to base
> + // address on the device.
> + Function::arg_iterator AI = FN->arg_begin();
> + for (unsigned j = 0; j < UsedValues.size(); j++) {
> + Value *BaseAddr = UsedValues[j];
> + Type *ArrayTy = BaseAddr->getType();
> + Value *Param = Builder.CreateBitCast(AI, ArrayTy);
> + VMap.insert(std::make_pair<Value*, Value*>(BaseAddr, Param));
> + AI++;
> + }
> +
> + // FIXME: These intrinsics should be inserted on-demand. However, we insert
> + // them all currently for simplicity.
> + Function *GetNctaidX =
> + Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_x);
> + Function *GetNctaidY =
> + Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_y);
> + Function *GetCtaidX =
> + Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_x);
> + Function *GetCtaidY =
> + Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_y);
> + Function *GetNtidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_x);
> + Function *GetNtidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_y);
> + Function *GetTidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_x);
> + Function *GetTidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_y);
> +
> + Value *GridWidth = Builder.CreateCall(GetNctaidX);
> + GridWidth = Builder.CreateIntCast(GridWidth, Ty, false);
> + Value *GridHeight = Builder.CreateCall(GetNctaidY);
> + GridHeight = Builder.CreateIntCast(GridHeight, Ty, false);
> + Value *BlockWidth = Builder.CreateCall(GetNtidX);
> + BlockWidth = Builder.CreateIntCast(BlockWidth, Ty, false);
> + Value *BlockHeight = Builder.CreateCall(GetNtidY);
> + BlockHeight = Builder.CreateIntCast(BlockHeight, Ty, false);
> + Value *BIDx = Builder.CreateCall(GetCtaidX);
> + BIDx = Builder.CreateIntCast(BIDx, Ty, false);
> + Value *BIDy = Builder.CreateCall(GetCtaidY);
> + BIDy = Builder.CreateIntCast(BIDy, Ty, false);
> + Value *TIDx = Builder.CreateCall(GetTidX);
> + TIDx = Builder.CreateIntCast(TIDx, Ty, false);
> + Value *TIDy = Builder.CreateCall(GetTidY);
> + TIDy = Builder.CreateIntCast(TIDy, Ty, false);
> +
> + Builder.CreateBr(BodyBB);
> + Builder.SetInsertPoint(BodyBB);
> +
> + unsigned NumDims = OriginalIVS.size();
> + std::vector<Value *> Substitutions;
> + Value *BlockID, *ThreadID;
> + switch (NumDims) {
> + case 1: {
> + Value *BlockSize = Builder.CreateMul(BlockWidth, BlockHeight,
> + "p_gpu_blocksize");
> + BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
> + BlockID = Builder.CreateAdd(BlockID, BIDx);
> + BlockID = Builder.CreateMul(BlockID, BlockSize);
> + ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j");
> + ThreadID = Builder.CreateAdd(ThreadID, TIDx);
> + ThreadID = Builder.CreateAdd(ThreadID, BlockID);
> + Substitutions.push_back(ThreadID);
> + break;
> + }
> + case 2: {
> + BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
> + BlockID = Builder.CreateAdd(BlockID, BIDx);
> + Substitutions.push_back(BlockID);
> + ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j");
> + ThreadID = Builder.CreateAdd(ThreadID, TIDx);
> + Substitutions.push_back(ThreadID);
> + break;
> + }
> + case 3: {
> + BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
> + BlockID = Builder.CreateAdd(BlockID, BIDx);
> + Substitutions.push_back(BlockID);
> + Substitutions.push_back(TIDy);
> + Substitutions.push_back(TIDx);
> + break;
> + }
> + case 4: {
> + Substitutions.push_back(BIDy);
> + Substitutions.push_back(BIDx);
> + Substitutions.push_back(TIDy);
> + Substitutions.push_back(TIDx);
> + break;
> + }
> + default:
> + assert(true &&
> + "We cannot transform parallel loops whose depth is larger than 4.");
> + return;
> + }
> +
> + assert(OriginalIVS.size() == Substitutions.size()
> + && "The size of IVS should be equal to the size of substitutions.");
> + for (unsigned i = 0; i < OriginalIVS.size(); ++i) {
> + VMap.insert(std::make_pair<Value*, Value*>(OriginalIVS[i],
> + Substitutions[i]));
> + }
> +
> + Builder.CreateBr(ExitBB);
> + Builder.SetInsertPoint(--Builder.GetInsertPoint());
> + BasicBlock::iterator LoopBody = Builder.GetInsertPoint();
> +
> + // Add the termination of the ptx-device subfunction.
> + Builder.SetInsertPoint(ExitBB);
> + Builder.CreateRetVoid();
> +
> + Builder.SetInsertPoint(LoopBody);
> + *SubFunction = FN;
> +}
> +
> +void PTXGenerator::startGeneration(SetVector<Value*> &UsedValues,
> + SetVector<Value*> &OriginalIVS,
> + ValueToValueMapTy &VMap,
> + BasicBlock::iterator *LoopBody) {
> + Function *SubFunction;
> + BasicBlock::iterator PrevInsertPoint = Builder.GetInsertPoint();
> + createSubfunction(UsedValues, OriginalIVS, VMap, &SubFunction);
> + *LoopBody = Builder.GetInsertPoint();
> + Builder.SetInsertPoint(PrevInsertPoint);
> +}
> +
> +IntegerType *PTXGenerator::getInt64Type() {
> + return Builder.getInt64Ty();
> +}
> +
> +PointerType *PTXGenerator::getI8PtrType() {
> + return PointerType::getUnqual(Builder.getInt8Ty());
> +}
> +
> +PointerType *PTXGenerator::getPtrI8PtrType() {
> + return PointerType::getUnqual(getI8PtrType());
> +}
> +
> +PointerType *PTXGenerator::getFloatPtrType() {
> + return llvm::Type::getFloatPtrTy(getModule()->getContext());
> +}
> +
> +PointerType *PTXGenerator::getGPUContextPtrType() {
> + return PointerType::getUnqual(ContextTy);
> +}
> +
> +PointerType *PTXGenerator::getGPUModulePtrType() {
> + return PointerType::getUnqual(ModuleTy);
> +}
> +
> +PointerType *PTXGenerator::getGPUDevicePtrType() {
> + return PointerType::getUnqual(DeviceTy);
> +}
> +
> +PointerType *PTXGenerator::getPtrGPUDevicePtrType() {
> + return PointerType::getUnqual(DevDataTy);
> +}
> +
> +PointerType *PTXGenerator::getGPUFunctionPtrType() {
> + return PointerType::getUnqual(KernelTy);
> +}
> +
> +PointerType *PTXGenerator::getGPUEventPtrType() {
> + return PointerType::getUnqual(EventTy);
> +}
> +
> +void PTXGenerator::InitializeGPUDataTypes() {
> + LLVMContext &Context = getModule()->getContext();
> +
> + ContextTy = StructType::create(Context, "struct.PollyGPUContextT");
> + ModuleTy = StructType::create(Context, "struct.PollyGPUModuleT");
> + KernelTy = StructType::create(Context, "struct.PollyGPUFunctionT");
> + DeviceTy = StructType::create(Context, "struct.PollyGPUDeviceT");
> + DevDataTy = StructType::create(Context,"struct.PollyGPUDevicePtrT");
> + EventTy = StructType::create(Context, "struct.PollyGPUEventT");
> +}
> +
> +void PTXGenerator::createCallInitDevice(Value *Context, Value *Device) {
> + const char *Name = "polly_initDevice";
> + Module *M = getModule();
> + Function *F = M->getFunction(Name);
> +
> + // If F is not available, declare it.
> + if (!F) {
> + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
> + std::vector<Type*> Args;
> + Args.push_back(PointerType::getUnqual(getGPUContextPtrType()));
> + Args.push_back(PointerType::getUnqual(getGPUDevicePtrType()));
> + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
> + F = Function::Create(Ty, Linkage, Name, M);
> + }
> +
> + Builder.CreateCall2(F, Context, Device);
> +}
> +
> +void PTXGenerator::createCallGetPTXModule(Value *Buffer, Value *Module) {
> + const char *Name = "polly_getPTXModule";
> + llvm::Module *M = getModule();
> + Function *F = M->getFunction(Name);
> +
> + // If F is not available, declare it.
> + if (!F) {
> + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
> + std::vector<Type*> Args;
> + Args.push_back(getI8PtrType());
> + Args.push_back(PointerType::getUnqual(getGPUModulePtrType()));
> + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
> + F = Function::Create(Ty, Linkage, Name, M);
> + }
> +
> + Builder.CreateCall2(F, Buffer, Module);
> +}
> +
> +void PTXGenerator::createCallGetPTXKernelEntry(Value *Entry, Value *Module,
> + Value *Kernel) {
> + const char *Name = "polly_getPTXKernelEntry";
> + llvm::Module *M = getModule();
> + Function *F = M->getFunction(Name);
> +
> + // If F is not available, declare it.
> + if (!F) {
> + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
> + std::vector<Type*> Args;
> + Args.push_back(getI8PtrType());
> + Args.push_back(getGPUModulePtrType());
> + Args.push_back(PointerType::getUnqual(getGPUFunctionPtrType()));
> + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
> + F = Function::Create(Ty, Linkage, Name, M);
> + }
> +
> + Builder.CreateCall3(F, Entry, Module, Kernel);
> +}
> +
> +void PTXGenerator::createCallAllocateMemoryForHostAndDevice(Value *HostData,
> + Value *DeviceData,
> + Value *Size) {
> + const char *Name = "polly_allocateMemoryForHostAndDevice";
> + Module *M = getModule();
> + Function *F = M->getFunction(Name);
> +
> + // If F is not available, declare it.
> + if (!F) {
> + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
> + std::vector<Type*> Args;
> + Args.push_back(getPtrI8PtrType());
> + Args.push_back(PointerType::getUnqual(getPtrGPUDevicePtrType()));
> + Args.push_back(getInt64Type());
> + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
> + F = Function::Create(Ty, Linkage, Name, M);
> + }
> +
> + Builder.CreateCall3(F, HostData, DeviceData, Size);
> +}
> +
> +void PTXGenerator::createCallCopyFromHostToDevice(Value *DeviceData,
> + Value *HostData,
> + Value *Size) {
> + const char *Name = "polly_copyFromHostToDevice";
> + Module *M = getModule();
> + Function *F = M->getFunction(Name);
> +
> + // If F is not available, declare it.
> + if (!F) {
> + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
> + std::vector<Type*> Args;
> + Args.push_back(getPtrGPUDevicePtrType());
> + Args.push_back(getI8PtrType());
> + Args.push_back(getInt64Type());
> + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
> + F = Function::Create(Ty, Linkage, Name, M);
> + }
> +
> + Builder.CreateCall3(F, DeviceData, HostData, Size);
> +}
> +
> +void PTXGenerator::createCallCopyFromDeviceToHost(Value *HostData,
> + Value *DeviceData,
> + Value *Size) {
> + const char *Name = "polly_copyFromDeviceToHost";
> + Module *M = getModule();
> + Function *F = M->getFunction(Name);
> +
> + // If F is not available, declare it.
> + if (!F) {
> + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
> + std::vector<Type*> Args;
> + Args.push_back(getI8PtrType());
> + Args.push_back(getPtrGPUDevicePtrType());
> + Args.push_back(getInt64Type());
> + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
> + F = Function::Create(Ty, Linkage, Name, M);
> + }
> +
> + Builder.CreateCall3(F, HostData, DeviceData, Size);
> +}
> +
> +void PTXGenerator::createCallSetKernelParameters(Value *Kernel,
> + Value *BlockWidth,
> + Value *BlockHeight,
> + Value *DeviceData) {
> + const char *Name = "polly_setKernelParameters";
> + Module *M = getModule();
> + Function *F = M->getFunction(Name);
> +
> + // If F is not available, declare it.
> + if (!F) {
> + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
> + std::vector<Type*> Args;
> + Args.push_back(getGPUFunctionPtrType());
> + Args.push_back(getInt64Type());
> + Args.push_back(getInt64Type());
> + Args.push_back(getPtrGPUDevicePtrType());
> + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
> + F = Function::Create(Ty, Linkage, Name, M);
> + }
> +
> + Builder.CreateCall4(F, Kernel, BlockWidth, BlockHeight, DeviceData);
> +}
> +
> +void PTXGenerator::createCallLaunchKernel(Value *Kernel, Value *GridWidth,
> + Value *GridHeight) {
> + const char *Name = "polly_launchKernel";
> + Module *M = getModule();
> + Function *F = M->getFunction(Name);
> +
> + // If F is not available, declare it.
> + if (!F) {
> + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
> + std::vector<Type*> Args;
> + Args.push_back(getGPUFunctionPtrType());
> + Args.push_back(getInt64Type());
> + Args.push_back(getInt64Type());
> + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
> + F = Function::Create(Ty, Linkage, Name, M);
> + }
> +
> + Builder.CreateCall3(F, Kernel, GridWidth, GridHeight);
> +}
> +
> +void PTXGenerator::createCallStartTimerByCudaEvent(Value *StartEvent,
> + Value *StopEvent) {
> + const char *Name = "polly_startTimerByCudaEvent";
> + Module *M = getModule();
> + Function *F = M->getFunction(Name);
> +
> + // If F is not available, declare it.
> + if (!F) {
> + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
> + std::vector<Type*> Args;
> + Args.push_back(PointerType::getUnqual(getGPUEventPtrType()));
> + Args.push_back(PointerType::getUnqual(getGPUEventPtrType()));
> + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
> + F = Function::Create(Ty, Linkage, Name, M);
> + }
> +
> + Builder.CreateCall2(F, StartEvent, StopEvent);
> +}
> +
> +void PTXGenerator::createCallStopTimerByCudaEvent(Value *StartEvent,
> + Value *StopEvent,
> + Value *Timer) {
> + const char *Name = "polly_stopTimerByCudaEvent";
> + Module *M = getModule();
> + Function *F = M->getFunction(Name);
> +
> + // If F is not available, declare it.
> + if (!F) {
> + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
> + std::vector<Type*> Args;
> + Args.push_back(getGPUEventPtrType());
> + Args.push_back(getGPUEventPtrType());
> + Args.push_back(getFloatPtrType());
> + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
> + F = Function::Create(Ty, Linkage, Name, M);
> + }
> +
> + Builder.CreateCall3(F, StartEvent, StopEvent, Timer);
> +}
> +
> +void PTXGenerator::createCallCleanupGPGPUResources(Value *HostData,
> + Value *DeviceData,
> + Value *Module,
> + Value *Context,
> + Value *Kernel) {
> + const char *Name = "polly_cleanupGPGPUResources";
> + llvm::Module *M = getModule();
> + Function *F = M->getFunction(Name);
> +
> + // If F is not available, declare it.
> + if (!F) {
> + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
> + std::vector<Type*> Args;
> + Args.push_back(getI8PtrType());
> + Args.push_back(getPtrGPUDevicePtrType());
> + Args.push_back(getGPUModulePtrType());
> + Args.push_back(getGPUContextPtrType());
> + Args.push_back(getGPUFunctionPtrType());
> + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
> + F = Function::Create(Ty, Linkage, Name, M);
> + }
> +
> + Builder.CreateCall5(F, HostData, DeviceData, Module, Context, Kernel);
> +}
> +
> +Value *PTXGenerator::getCUDAGridWidth() {
> + return ConstantInt::get(getInt64Type(), GridWidth);
> +}
> +
> +Value *PTXGenerator::getCUDAGridHeight() {
> + return ConstantInt::get(getInt64Type(), GridHeight);
> +}
> +
> +Value *PTXGenerator::getCUDABlockWidth() {
> + return ConstantInt::get(getInt64Type(), BlockWidth);
> +}
> +
> +Value *PTXGenerator::getCUDABlockHeight() {
> + return ConstantInt::get(getInt64Type(), BlockHeight);
> +}
> +
> +Value *PTXGenerator::getOutputArraySizeInBytes() {
> + return ConstantInt::get(getInt64Type(), OutputBytes);
> +}
> +
> +Value *PTXGenerator::createPTXKernelFunction(Function *SubFunction) {
> + Module *M = getModule();
> + std::string LLVMKernelStr;
> + raw_string_ostream NameROS(LLVMKernelStr);
> + formatted_raw_ostream FOS(NameROS);
> + FOS << "target triple = \"" << GPUTriple <<"\"\n";
> + SubFunction->print(FOS);
> +
> + // Insert ptx intrinsics into the kernel string.
> + for (Module::iterator I = M->begin(), E = M->end(); I != E; ) {
> + Function *F = I++;
> + // Function must be a prototype and unused.
> + if (F->isDeclaration() && F->isIntrinsic()) {
> + switch (F->getIntrinsicID()) {
> + case Intrinsic::ptx_read_nctaid_x:
> + case Intrinsic::ptx_read_nctaid_y:
> + case Intrinsic::ptx_read_ctaid_x:
> + case Intrinsic::ptx_read_ctaid_y:
> + case Intrinsic::ptx_read_ntid_x:
> + case Intrinsic::ptx_read_ntid_y:
> + case Intrinsic::ptx_read_tid_x:
> + case Intrinsic::ptx_read_tid_y:
> + F->print(FOS);
> + break;
> + default:
> + break;
> + }
> + }
> + }
> +
> + Value *LLVMKernel = Builder.CreateGlobalStringPtr(LLVMKernelStr,
> + "llvm_kernel");
> + Value *MCPU = Builder.CreateGlobalStringPtr("sm_10", "mcpu");
> + Value *Features = Builder.CreateGlobalStringPtr("", "cpu_features");
> +
> + Function *GetDeviceKernel = Intrinsic::getDeclaration(M,
> + Intrinsic::codegen);
> +
> + return Builder.CreateCall3(GetDeviceKernel, LLVMKernel, MCPU, Features);
> +}
> +
> +Value *PTXGenerator::getPTXKernelEntryName(Function *SubFunction) {
> + StringRef Entry = SubFunction->getName();
> + return Builder.CreateGlobalStringPtr(Entry, "ptx_entry");
> +}
> +
> +void PTXGenerator::eraseUnusedFunctions(Function *SubFunction) {
> + Module *M = getModule();
> + SubFunction->eraseFromParent();
> +
> + if (Function *FuncPTXReadNCtaidX = M->getFunction("llvm.ptx.read.nctaid.x")) {
> + FuncPTXReadNCtaidX->eraseFromParent();
> + }
> +
> + if (Function *FuncPTXReadNCtaidY = M->getFunction("llvm.ptx.read.nctaid.y")) {
> + FuncPTXReadNCtaidY->eraseFromParent();
> + }
> +
> + if (Function *FuncPTXReadCtaidX = M->getFunction("llvm.ptx.read.ctaid.x")) {
> + FuncPTXReadCtaidX->eraseFromParent();
> + }
> +
> + if (Function *FuncPTXReadCtaidY = M->getFunction("llvm.ptx.read.ctaid.y")) {
> + FuncPTXReadCtaidY->eraseFromParent();
> + }
> +
> + if (Function *FuncPTXReadNTidX = M->getFunction("llvm.ptx.read.ntid.x")) {
> + FuncPTXReadNTidX->eraseFromParent();
> + }
> +
> + if (Function *FuncPTXReadNTidY = M->getFunction("llvm.ptx.read.ntid.y")) {
> + FuncPTXReadNTidY->eraseFromParent();
> + }
> +
> + if (Function *FuncPTXReadTidX = M->getFunction("llvm.ptx.read.tid.x")) {
> + FuncPTXReadTidX->eraseFromParent();
> + }
> +
> + if (Function *FuncPTXReadTidY = M->getFunction("llvm.ptx.read.tid.y")) {
> + FuncPTXReadTidY->eraseFromParent();
> + }
> +}
> +
> +void PTXGenerator::finishGeneration(Function *F) {
> + // Define data used by the GPURuntime library.
> + AllocaInst *PtrCUContext = Builder.CreateAlloca(getGPUContextPtrType(), 0,
> + "phcontext");
> + AllocaInst *PtrCUDevice = Builder.CreateAlloca(getGPUDevicePtrType(), 0,
> + "phdevice");
> + AllocaInst *PtrCUModule = Builder.CreateAlloca(getGPUModulePtrType(), 0,
> + "phmodule");
> + AllocaInst *PtrCUKernel = Builder.CreateAlloca(getGPUFunctionPtrType(), 0,
> + "phkernel");
> + AllocaInst *PtrCUStartEvent = Builder.CreateAlloca(getGPUEventPtrType(), 0,
> + "pstart_timer");
> + AllocaInst *PtrCUStopEvent = Builder.CreateAlloca(getGPUEventPtrType(), 0,
> + "pstop_timer");
> + AllocaInst *PtrDevData = Builder.CreateAlloca(getPtrGPUDevicePtrType(), 0,
> + "pdevice_data");
> + AllocaInst *PtrHostData = Builder.CreateAlloca(getI8PtrType(), 0,
> + "phost_data");
> + Type *FloatTy = llvm::Type::getFloatTy(getModule()->getContext());
> + AllocaInst *PtrElapsedTimes = Builder.CreateAlloca(FloatTy, 0, "ptimer");
> +
> + // Initialize the GPU device.
> + createCallInitDevice(PtrCUContext, PtrCUDevice);
> +
> + // Create the GPU kernel module and entry function.
> + Value *PTXString = createPTXKernelFunction(F);
> + Value *PTXEntry = getPTXKernelEntryName(F);
> + createCallGetPTXModule(PTXString, PtrCUModule);
> + LoadInst *CUModule = Builder.CreateLoad(PtrCUModule, "cumodule");
> + createCallGetPTXKernelEntry(PTXEntry, CUModule, PtrCUKernel);
> +
> + // Allocate device memory and its corresponding host memory.
> + createCallAllocateMemoryForHostAndDevice(PtrHostData, PtrDevData,
> + getOutputArraySizeInBytes());
> +
> + // Get the pointer to the device memory and set the GPU execution parameters.
> + LoadInst *DData = Builder.CreateLoad(PtrDevData, "device_data");
> + LoadInst *CUKernel = Builder.CreateLoad(PtrCUKernel, "cukernel");
> + createCallSetKernelParameters(CUKernel, getCUDABlockWidth(),
> + getCUDABlockHeight(), DData);
> +
> + // Create the start and end timer and record the start time.
> + createCallStartTimerByCudaEvent(PtrCUStartEvent, PtrCUStopEvent);
> +
> + // Launch the GPU kernel.
> + createCallLaunchKernel(CUKernel, getCUDAGridWidth(), getCUDAGridHeight());
> +
> + // Copy the results back from the GPU to the host.
> + LoadInst *HData = Builder.CreateLoad(PtrHostData, "host_data");
> + createCallCopyFromDeviceToHost(HData, DData, getOutputArraySizeInBytes());
> +
> + // Record the end time.
> + LoadInst *CUStartEvent = Builder.CreateLoad(PtrCUStartEvent, "start_timer");
> + LoadInst *CUStopEvent = Builder.CreateLoad(PtrCUStopEvent, "stop_timer");
> + createCallStopTimerByCudaEvent(CUStartEvent, CUStopEvent,
> + PtrElapsedTimes);
> +
> + // Cleanup all the resources used.
> + LoadInst *CUContext = Builder.CreateLoad(PtrCUContext, "cucontext");
> + createCallCleanupGPGPUResources(HData, DData, CUModule, CUContext,
> + CUKernel);
> +
> + // Erase the ptx kernel and device subfunctions and ptx intrinsics from
> + // current module.
> + eraseUnusedFunctions(F);
> +}
> +#endif /* GPU_CODEGEN */
>
> Added: polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.c
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.c?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.c (added)
> +++ polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.c Fri Aug 3 07:50:07 2012
> @@ -0,0 +1,16 @@
> +int A[128][128];
> +
> +int gpu_pure() {
> + int i,j;
> +
> + for(i = 0; i < 128; i++)
> + for(j = 0; j < 128; j++)
> + A[i][j] = i*128 + j;
> +
> + return 0;
> +}
> +
> +int main() {
> + int b = gpu_pure();
> + return 0;
> +}
>
> Added: polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.ll
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.ll?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.ll (added)
> +++ polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.ll Fri Aug 3 07:50:07 2012
> @@ -0,0 +1,65 @@
> +; RUN: opt %loadPolly -basicaa -polly-import-jscop -polly-import-jscop-dir=%S -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen %s -S | FileCheck %s
> +; ModuleID = '2d_innermost_parallel.s'
> +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
> +target triple = "x86_64-unknown-linux-gnu"
> +
> +@A = common global [128 x [128 x i32]] zeroinitializer, align 16
> +
> +define i32 @gpu_pure() nounwind uwtable {
> +entry:
> + br label %for.cond
> +
> +for.cond: ; preds = %for.inc6, %entry
> + %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc6 ], [ 0, %entry ]
> + %lftr.wideiv5 = trunc i64 %indvars.iv2 to i32
> + %exitcond6 = icmp ne i32 %lftr.wideiv5, 128
> + br i1 %exitcond6, label %for.body, label %for.end8
> +
> +for.body: ; preds = %for.cond
> + br label %for.cond1
> +
> +for.cond1: ; preds = %for.inc, %for.body
> + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body ]
> + %lftr.wideiv = trunc i64 %indvars.iv to i32
> + %exitcond = icmp ne i32 %lftr.wideiv, 128
> + br i1 %exitcond, label %for.body3, label %for.end
> +
> +for.body3: ; preds = %for.cond1
> + %tmp = shl nsw i64 %indvars.iv2, 7
> + %tmp7 = add nsw i64 %tmp, %indvars.iv
> + %arrayidx5 = getelementptr inbounds [128 x [128 x i32]]* @A, i64 0, i64 %indvars.iv2, i64 %indvars.iv
> + %tmp8 = trunc i64 %tmp7 to i32
> + store i32 %tmp8, i32* %arrayidx5, align 4
> + br label %for.inc
> +
> +for.inc: ; preds = %for.body3
> + %indvars.iv.next = add i64 %indvars.iv, 1
> + br label %for.cond1
> +
> +for.end: ; preds = %for.cond1
> + br label %for.inc6
> +
> +for.inc6: ; preds = %for.end
> + %indvars.iv.next3 = add i64 %indvars.iv2, 1
> + br label %for.cond
> +
> +for.end8: ; preds = %for.cond
> + ret i32 0
> +}
> +
> +define i32 @main() nounwind uwtable {
> +entry:
> + %call = call i32 @gpu_pure()
> + ret i32 0
> +}
> +
> +; CHECK: call void @polly_initDevice
> +; CHECK: call void @polly_getPTXModule
> +; CHECK: call void @polly_getPTXKernelEntry
> +; CHECK: call void @polly_allocateMemoryForHostAndDevice
> +; CHECK: call void @polly_setKernelParameters
> +; CHECK: call void @polly_startTimerByCudaEvent
> +; CHECK: call void @polly_launchKernel
> +; CHECK: call void @polly_copyFromDeviceToHost
> +; CHECK: call void @polly_stopTimerByCudaEvent
> +; CHECK: call void @polly_cleanupGPGPUResources
>
> Added: polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.c
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.c?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.c (added)
> +++ polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.c Fri Aug 3 07:50:07 2012
> @@ -0,0 +1,17 @@
> +int A[128][128];
> +
> +int gpu_no_pure() {
> + int i,j,k;
> +
> + for(i = 0; i < 128; i++)
> + for(j = 0; j < 128; j++)
> + for(k = 0; k < 256; k++)
> + A[i][j] += i*123/(k+1)+5-j*k-123;
> +
> + return 0;
> +}
> +
> +int main() {
> + int b = gpu_no_pure();
> + return 0;
> +}
>
> Added: polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll (added)
> +++ polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll Fri Aug 3 07:50:07 2012
> @@ -0,0 +1,88 @@
> +; RUN: opt %loadPolly -basicaa -polly-import-jscop -polly-import-jscop-dir=%S -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen %s -S | FileCheck %s
> +; ModuleID = '3d_innermost_non_parallel.s'
> +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
> +target triple = "x86_64-unknown-linux-gnu"
> +
> +@A = common global [128 x [128 x i32]] zeroinitializer, align 16
> +
> +define i32 @gpu_no_pure() nounwind uwtable {
> +entry:
> + br label %for.cond
> +
> +for.cond: ; preds = %for.inc16, %entry
> + %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc16 ], [ 0, %entry ]
> + %lftr.wideiv5 = trunc i64 %indvars.iv2 to i32
> + %exitcond6 = icmp ne i32 %lftr.wideiv5, 128
> + br i1 %exitcond6, label %for.body, label %for.end18
> +
> +for.body: ; preds = %for.cond
> + br label %for.cond1
> +
> +for.cond1: ; preds = %for.inc13, %for.body
> + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc13 ], [ 0, %for.body ]
> + %lftr.wideiv = trunc i64 %indvars.iv to i32
> + %exitcond1 = icmp ne i32 %lftr.wideiv, 128
> + br i1 %exitcond1, label %for.body3, label %for.end15
> +
> +for.body3: ; preds = %for.cond1
> + br label %for.cond4
> +
> +for.cond4: ; preds = %for.inc, %for.body3
> + %k.0 = phi i32 [ 0, %for.body3 ], [ %inc, %for.inc ]
> + %exitcond = icmp ne i32 %k.0, 256
> + br i1 %exitcond, label %for.body6, label %for.end
> +
> +for.body6: ; preds = %for.cond4
> + %tmp = mul nsw i64 %indvars.iv2, 123
> + %add = add nsw i32 %k.0, 1
> + %tmp7 = trunc i64 %tmp to i32
> + %div = sdiv i32 %tmp7, %add
> + %add7 = add nsw i32 %div, 5
> + %tmp8 = trunc i64 %indvars.iv to i32
> + %mul8 = mul nsw i32 %tmp8, %k.0
> + %sub = sub nsw i32 %add7, %mul8
> + %sub9 = add nsw i32 %sub, -123
> + %arrayidx11 = getelementptr inbounds [128 x [128 x i32]]* @A, i64 0, i64 %indvars.iv2, i64 %indvars.iv
> + %tmp9 = load i32* %arrayidx11, align 4
> + %add12 = add nsw i32 %tmp9, %sub9
> + store i32 %add12, i32* %arrayidx11, align 4
> + br label %for.inc
> +
> +for.inc: ; preds = %for.body6
> + %inc = add nsw i32 %k.0, 1
> + br label %for.cond4
> +
> +for.end: ; preds = %for.cond4
> + br label %for.inc13
> +
> +for.inc13: ; preds = %for.end
> + %indvars.iv.next = add i64 %indvars.iv, 1
> + br label %for.cond1
> +
> +for.end15: ; preds = %for.cond1
> + br label %for.inc16
> +
> +for.inc16: ; preds = %for.end15
> + %indvars.iv.next3 = add i64 %indvars.iv2, 1
> + br label %for.cond
> +
> +for.end18: ; preds = %for.cond
> + ret i32 0
> +}
> +
> +define i32 @main() nounwind uwtable {
> +entry:
> + %call = call i32 @gpu_no_pure()
> + ret i32 0
> +}
> +
> +; CHECK: call void @polly_initDevice
> +; CHECK: call void @polly_getPTXModule
> +; CHECK: call void @polly_getPTXKernelEntry
> +; CHECK: call void @polly_allocateMemoryForHostAndDevice
> +; CHECK: call void @polly_setKernelParameters
> +; CHECK: call void @polly_startTimerByCudaEvent
> +; CHECK: call void @polly_launchKernel
> +; CHECK: call void @polly_copyFromDeviceToHost
> +; CHECK: call void @polly_stopTimerByCudaEvent
> +; CHECK: call void @polly_cleanupGPGPUResources
>
> Added: polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%25for.cond---%25for.end18.jscop?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop (added)
> +++ polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop Fri Aug 3 07:50:07 2012
> @@ -0,0 +1,21 @@
> +{
> + "context" : "{ : }",
> + "name" : "for.cond => for.end18",
> + "statements" : [
> + {
> + "accesses" : [
> + {
> + "kind" : "read",
> + "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
> + },
> + {
> + "kind" : "write",
> + "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
> + }
> + ],
> + "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 and i2 >= 0 and i2 <= 255 }",
> + "name" : "Stmt_for_body6",
> + "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, i0, 0, i1, 0, i2, 0] }"
> + }
> + ]
> +}
>
> Added: polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%25for.cond---%25for.end18.jscop.transformed%2Bgpu?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu (added)
> +++ polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu Fri Aug 3 07:50:07 2012
> @@ -0,0 +1,21 @@
> +{
> + "context" : "{ : }",
> + "name" : "for.cond => for.end18",
> + "statements" : [
> + {
> + "accesses" : [
> + {
> + "kind" : "read",
> + "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
> + },
> + {
> + "kind" : "write",
> + "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
> + }
> + ],
> + "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 and i2 >= 0 and i2 <= 255 }",
> + "name" : "Stmt_for_body6",
> + "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, o0, o1, o2, o3, i2, 0] : o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and i0 = 16o0 + o1 and i1 = 16o2 + o3 }"
> + }
> + ]
> +}
>
> Added: polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/gpu_pure___%25for.cond---%25for.end8.jscop?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop (added)
> +++ polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop Fri Aug 3 07:50:07 2012
> @@ -0,0 +1,17 @@
> +{
> + "context" : "{ : }",
> + "name" : "for.cond => for.end8",
> + "statements" : [
> + {
> + "accesses" : [
> + {
> + "kind" : "write",
> + "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[128i0 + i1] }"
> + }
> + ],
> + "domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 }",
> + "name" : "Stmt_for_body3",
> + "schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, i0, 0, i1, 0] }"
> + }
> + ]
> +}
>
> Added: polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/gpu_pure___%25for.cond---%25for.end8.jscop.transformed%2Bgpu?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu (added)
> +++ polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu Fri Aug 3 07:50:07 2012
> @@ -0,0 +1,17 @@
> +{
> + "context" : "{ : }",
> + "name" : "for.cond => for.end8",
> + "statements" : [
> + {
> + "accesses" : [
> + {
> + "kind" : "write",
> + "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[128i0 + i1] }"
> + }
> + ],
> + "domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 }",
> + "name" : "Stmt_for_body3",
> + "schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, o0, o1, o2, o3]: o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and i0 = 16o0 + o1 and i1 = 16o2 + o3 }"
> + }
> + ]
> +}
>
> Added: polly/trunk/test/CodeGen/GPGPU/lit.local.cfg
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/lit.local.cfg?rev=161239&view=auto
> ==============================================================================
> --- polly/trunk/test/CodeGen/GPGPU/lit.local.cfg (added)
> +++ polly/trunk/test/CodeGen/GPGPU/lit.local.cfg Fri Aug 3 07:50:07 2012
> @@ -0,0 +1,5 @@
> +config.suffixes = ['.ll']
> +
> +gpgpu = config.root.enable_gpgpu_codegen
> +if gpgpu not in ['TRUE', 'true'] :
> + config.unsupported = True
>
> Modified: polly/trunk/test/lit.site.cfg.in
> URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/lit.site.cfg.in?rev=161239&r1=161238&r2=161239&view=diff
> ==============================================================================
> --- polly/trunk/test/lit.site.cfg.in (original)
> +++ polly/trunk/test/lit.site.cfg.in Fri Aug 3 07:50:07 2012
> @@ -7,6 +7,7 @@
> config.polly_obj_root = "@POLLY_BINARY_DIR@"
> config.polly_lib_dir = "@POLLY_LIB_DIR@"
> config.target_triple = "@TARGET_TRIPLE@"
> +config.enable_gpgpu_codegen = "@CUDALIB_FOUND@"
> lit.params['build_config'] = "@POLLY_SOURCE_DIR@/test"
>
> ## Check the current platform with regex
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
--
Qualcomm Innovation Center, Inc is a member of Code Aurora Forum
More information about the llvm-commits
mailing list