[llvm-commits] [polly] r160165 - in /polly/trunk: include/polly/ include/polly/CodeGen/ lib/CodeGen/ test/CodeGen/GPGPU/
Tobias Grosser
grosser at fim.uni-passau.de
Fri Jul 13 00:44:56 PDT 2012
Author: grosser
Date: Fri Jul 13 02:44:56 2012
New Revision: 160165
URL: http://llvm.org/viewvc/llvm-project?rev=160165&view=rev
Log:
Revert "Add preliminary implementation for GPGPU code generation."
I did not take into account that this patch fails to compile without the
llvm.codegen patch applied. This breaks the buildbots.
I am reverting this until we find a way to commit it without the buildbots
complaining.
This reverts commit cb43ab80e94434e780a66be3b9a6ad466822fe33.
Removed:
polly/trunk/include/polly/CodeGen/PTXGenerator.h
polly/trunk/lib/CodeGen/PTXGenerator.cpp
polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.c
polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.ll
polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.c
polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll
polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop
polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu
polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop
polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu
Modified:
polly/trunk/include/polly/ScopInfo.h
polly/trunk/lib/CodeGen/CMakeLists.txt
polly/trunk/lib/CodeGen/CodeGeneration.cpp
Removed: polly/trunk/include/polly/CodeGen/PTXGenerator.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/CodeGen/PTXGenerator.h?rev=160164&view=auto
==============================================================================
--- polly/trunk/include/polly/CodeGen/PTXGenerator.h (original)
+++ polly/trunk/include/polly/CodeGen/PTXGenerator.h (removed)
@@ -1,193 +0,0 @@
-//===- PTXGenerator.h - IR helper to create GPGPU LLVM-IR -------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains functions to create GPGPU parallel loops as LLVM-IR.
-//
-//===----------------------------------------------------------------------===//
-#ifndef POLLY_CODEGEN_PTXGENERATOR_H
-#define POLLY_CODEGEN_PTXGENERATOR_H
-
-#include "llvm/IRBuilder.h"
-#include "llvm/ADT/SetVector.h"
-
-#include <map>
-
-namespace llvm {
- class Value;
- class Pass;
- class BasicBlock;
-}
-
-namespace polly {
-using namespace llvm;
-
-class PTXGenerator {
-public:
- typedef std::map<Value*, Value*> ValueToValueMapTy;
-
- PTXGenerator(IRBuilder<> &Builder, Pass *P, const std::string &Triple);
-
- /// @brief Create a GPGPU parallel loop.
- ///
- /// @param UsedValues A set of LLVM-IR Values that should be available to
- /// the new loop body.
- /// @param OriginalIVS The new values of the original induction variables.
- /// @param VMap This map is filled by createParallelLoop(). It
- /// maps the values in UsedValues to Values through which
- /// their content is available within the loop body.
- /// @param LoopBody A pointer to an iterator that is set to point to the
- /// body of the created loop. It should be used to insert
- /// instructions that form the actual loop body.
- void startGeneration(SetVector<Value*> &UsedValues,
- SetVector<Value*> &OriginalIVS, ValueToValueMapTy &VMap,
- BasicBlock::iterator *LoopBody);
-
- /// @brief Execute the post-operations to build a GPGPU parallel loop.
- ///
- void finishGeneration(Function *SubFunction);
-
- /// @brief Set the parameters for launching PTX kernel.
- ///
- /// @param GridW A value of the width of a GPU grid.
- /// @param GridH A value of the height of a GPU grid.
- /// @param BlockW A value of the width of a GPU block.
- /// @param BlockH A value of the height of a GPU block.
- void setLaunchingParameters(int GridW, int GridH, int BlockW, int BlockH) {
- GridWidth = GridW;
- GridHeight = GridH;
- BlockWidth = BlockW;
- BlockHeight = BlockH;
- }
-
- /// @brief Set the size of the output array.
- ///
- /// This size is used to allocate memory on the device and the host.
- ///
- /// @param Bytes Output array size in bytes.
- void setOutputBytes(unsigned Bytes) {
- OutputBytes = Bytes;
- }
-
-private:
- IRBuilder<> &Builder;
- Pass *P;
-
- /// @brief The target triple of the device.
- const std::string &GPUTriple;
-
- /// @brief Parameters used for launching PTX kernel.
- int GridWidth, GridHeight, BlockWidth, BlockHeight;
-
- /// @brief Size of the output array in bytes.
- unsigned OutputBytes;
-
- /// @brief Polly's GPU data types.
- StructType *ContextTy, *ModuleTy, *KernelTy, *DeviceTy, *DevDataTy, *EventTy;
-
- void InitializeGPUDataTypes();
- IntegerType *getInt64Type(); // i64
- PointerType *getI8PtrType(); // char *
- PointerType *getPtrI8PtrType(); // char **
- PointerType *getFloatPtrType(); // float *
- PointerType *getGPUContextPtrType(); // %struct.PollyGPUContextT *
- PointerType *getGPUModulePtrType(); // %struct.PollyGPUModuleT *
- PointerType *getGPUDevicePtrType(); // %struct.PollyGPUDeviceT *
- PointerType *getPtrGPUDevicePtrType(); // %struct.PollyGPUDevicePtrT *
- PointerType *getGPUFunctionPtrType(); // %struct.PollyGPUFunctionT *
- PointerType *getGPUEventPtrType(); // %struct.PollyGPUEventT *
-
- Module *getModule();
-
- /// @brief Create the kernel string containing LLVM IR.
- ///
- /// @param SubFunction A pointer to the device code function.
- /// @return A global string variable containing the LLVM IR codes
- // of the SubFunction.
- Value *createPTXKernelFunction(Function *SubFunction);
-
- /// @brief Get the entry name of the device kernel function.
- ///
- /// @param SubFunction A pointer to the device code function.
- /// @return A global string variable containing the entry name of
- /// the SubFunction.
- Value *getPTXKernelEntryName(Function *SubFunction);
-
- void createCallInitDevice(Value *Context, Value *Device);
- void createCallGetPTXModule(Value *Buffer, Value *Module);
- void createCallGetPTXKernelEntry(Value *Entry, Value *Module,
- Value *Kernel);
- void createCallAllocateMemoryForHostAndDevice(Value *HostData,
- Value *DeviceData,
- Value *Size);
- void createCallCopyFromHostToDevice(Value *DeviceData, Value *HostData,
- Value *Size);
- void createCallCopyFromDeviceToHost(Value *HostData, Value *DeviceData,
- Value *Size);
- void createCallSetKernelParameters(Value *Kernel, Value *BlockWidth,
- Value *BlockHeight, Value *DeviceData);
- void createCallLaunchKernel(Value *Kernel, Value *GridWidth,
- Value *GridHeight);
- void createCallStartTimerByCudaEvent(Value *StartEvent,
- Value *StopEvent);
- void createCallStopTimerByCudaEvent(Value *StartEvent, Value *StopEvent,
- Value *Timer);
- void createCallCleanupGPGPUResources(Value *HostData, Value *DeviceData,
- Value *Module, Value *Context,
- Value *Kernel);
-
- /// @brief Create the CUDA subfunction.
- ///
- /// @param UsedValues A set of LLVM-IR Values that should be available to
- /// the new loop body.
- /// @param VMap This map that is filled by createSubfunction(). It
- /// maps the values in UsedValues to Values through which
- /// their content is available within the loop body.
- /// @param OriginalIVS The new values of the original induction variables.
- /// @param SubFunction The newly created SubFunction is returned here.
- void createSubfunction(SetVector<Value*> &UsedValues,
- SetVector<Value*> &OriginalIVS,
- ValueToValueMapTy &VMap,
- Function **SubFunction);
-
- /// @brief Create the definition of the CUDA subfunction.
- ///
- /// @param NumArgs The number of parameters of this subfunction. This is
- /// usually set to the number of memory accesses which
- /// will be copied from host to device.
- Function *createSubfunctionDefinition(int NumArgs);
-
- /// @brief Extract all the ptx related subfunctions into a new module.
- ///
- /// @param M Current module.
- /// @return The generated module containing only gpu related
- /// subfunctions.
- Module *extractPTXFunctionsFromModule(const Module *M);
-
- /// @brief Get the Value of CUDA block width.
- Value *getCUDABlockWidth();
-
- /// @brief Get the Value of CUDA block height.
- Value *getCUDABlockHeight();
-
- /// @brief Get the Value of CUDA Gird width.
- Value *getCUDAGridWidth();
-
- /// @brief Get the Value of CUDA grid height.
- Value *getCUDAGridHeight();
-
- /// @brief Get the Value of the bytes of the output array.
- Value *getOutputArraySizeInBytes();
-
- /// @brief Erase the ptx-related subfunctions and declarations.
- ///
- /// @param SubFunction A pointer to the device code function.
- void eraseUnusedFunctions(Function *SubFunction);
-};
-} // end namespace polly
-#endif /* POLLY_CODEGEN_PTXGENERATOR_H */
Modified: polly/trunk/include/polly/ScopInfo.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/include/polly/ScopInfo.h?rev=160165&r1=160164&r2=160165&view=diff
==============================================================================
--- polly/trunk/include/polly/ScopInfo.h (original)
+++ polly/trunk/include/polly/ScopInfo.h Fri Jul 13 02:44:56 2012
@@ -125,9 +125,6 @@
/// @brief Is this a read memory access?
bool isRead() const { return Type == MemoryAccess::Read; }
- /// @brief Is this a write memory access?
- bool isWrite() const { return Type == MemoryAccess::Write; }
-
isl_map *getAccessRelation() const;
/// @brief Get an isl string representing this access function.
Modified: polly/trunk/lib/CodeGen/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/CMakeLists.txt?rev=160165&r1=160164&r2=160165&view=diff
==============================================================================
--- polly/trunk/lib/CodeGen/CMakeLists.txt (original)
+++ polly/trunk/lib/CodeGen/CMakeLists.txt Fri Jul 13 02:44:56 2012
@@ -15,5 +15,4 @@
${ISL_CODEGEN_FILES}
LoopGenerators.cpp
Utils.cpp
- PTXGenerator.cpp
)
Modified: polly/trunk/lib/CodeGen/CodeGeneration.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/CodeGeneration.cpp?rev=160165&r1=160164&r2=160165&view=diff
==============================================================================
--- polly/trunk/lib/CodeGen/CodeGeneration.cpp (original)
+++ polly/trunk/lib/CodeGen/CodeGeneration.cpp Fri Jul 13 02:44:56 2012
@@ -31,7 +31,6 @@
#include "polly/CodeGen/CodeGeneration.h"
#include "polly/CodeGen/BlockGenerators.h"
#include "polly/CodeGen/LoopGenerators.h"
-#include "polly/CodeGen/PTXGenerator.h"
#include "polly/CodeGen/Utils.h"
#include "polly/Support/GICHelper.h"
@@ -67,17 +66,6 @@
cl::init(false), cl::ZeroOrMore);
static cl::opt<bool>
-GPGPU("enable-polly-gpgpu",
- cl::desc("Generate GPU parallel code"), cl::Hidden,
- cl::value_desc("GPGPU code generation enabled if true"),
- cl::init(false), cl::ZeroOrMore);
-
-static cl::opt<std::string>
-GPUTriple("polly-gpgpu-triple",
- cl::desc("Target triple for GPU code generation"),
- cl::Hidden, cl::init(""));
-
-static cl::opt<bool>
AtLeastOnce("enable-polly-atLeastOnce",
cl::desc("Give polly the hint, that every loop is executed at least"
"once"), cl::Hidden,
@@ -296,25 +284,6 @@
/// statement.
void codegenForOpenMP(const clast_for *f);
- /// @brief Create GPGPU device memory access values.
- ///
- /// Create a list of values that will be set to be parameters of the GPGPU
- /// subfunction. These parameters represent device memory base addresses
- /// and the size in bytes.
- SetVector<Value*> getGPUValues(unsigned &OutputBytes);
-
- /// @brief Create a GPU parallel for loop.
- ///
- /// This loop reflects a loop as if it would have been created by a GPU
- /// statement.
- void codegenForGPGPU(const clast_for *F);
-
- /// @brief Get innermost statement for the transformed loops.
- const clast_stmt *getScheduleInfo(const clast_for *F,
- std::vector<int> &NumIters,
- unsigned &LoopDepth,
- unsigned &NonPLoopDepth);
-
/// @brief Check if a loop is parallel
///
/// Detect if a clast_for loop can be executed in parallel.
@@ -561,161 +530,6 @@
Builder.SetInsertPoint(AfterLoop);
}
-static unsigned getArraySizeInBytes(const ArrayType *AT) {
- unsigned Bytes = AT->getNumElements();
- if (const ArrayType *T = dyn_cast<ArrayType>(AT->getElementType()))
- Bytes *= getArraySizeInBytes(T);
- else
- Bytes *= AT->getElementType()->getPrimitiveSizeInBits() / 8;
-
- return Bytes;
-}
-
-SetVector<Value*> ClastStmtCodeGen::getGPUValues(unsigned &OutputBytes) {
- SetVector<Value*> Values;
- OutputBytes = 0;
-
- // Record the memory reference base addresses.
- for (Scop::iterator SI = S->begin(), SE = S->end(); SI != SE; ++SI) {
- ScopStmt *Stmt = *SI;
- for (SmallVector<MemoryAccess*, 8>::iterator I = Stmt->memacc_begin(),
- E = Stmt->memacc_end(); I != E; ++I) {
- Value *BaseAddr = const_cast<Value*>((*I)->getBaseAddr());
- Values.insert((BaseAddr));
-
- // FIXME: we assume that there is one and only one array to be written
- // in a SCoP.
- int NumWrites = 0;
- if ((*I)->isWrite()) {
- ++NumWrites;
- assert(NumWrites <= 1 &&
- "We support at most one array to be written in a SCoP.");
- if (const PointerType * PT =
- dyn_cast<PointerType>(BaseAddr->getType())) {
- Type *T = PT->getArrayElementType();
- const ArrayType *ATy = dyn_cast<ArrayType>(T);
- OutputBytes = getArraySizeInBytes(ATy);
- }
- }
- }
- }
-
- return Values;
-}
-
-const clast_stmt *ClastStmtCodeGen::getScheduleInfo(const clast_for *F,
- std::vector<int> &NumIters,
- unsigned &LoopDepth,
- unsigned &NonPLoopDepth) {
- clast_stmt *Stmt = (clast_stmt *)F;
- const clast_for *Result;
- bool NonParaFlag = false;
- LoopDepth = 0;
- NonPLoopDepth = 0;
-
- while (Stmt) {
- if (CLAST_STMT_IS_A(Stmt, stmt_for)) {
- const clast_for *T = (clast_for *) Stmt;
- if (isParallelFor(T)) {
- if (!NonParaFlag) {
- NumIters.push_back(getNumberOfIterations(T));
- Result = T;
- }
- } else
- NonParaFlag = true;
-
- Stmt = T->body;
- LoopDepth++;
- continue;
- }
- Stmt = Stmt->next;
- }
-
- assert(NumIters.size() == 4 &&
- "The loops should be tiled into 4-depth parallel loops and an "
- "innermost non-parallel one (if exist).");
- NonPLoopDepth = LoopDepth - NumIters.size();
- assert(NonPLoopDepth <= 1
- && "We support only one innermost non-parallel loop currently.");
- return (const clast_stmt *)Result->body;
-}
-
-void ClastStmtCodeGen::codegenForGPGPU(const clast_for *F) {
- BasicBlock::iterator LoopBody;
- SetVector<Value *> Values;
- SetVector<Value *> IVS;
- std::vector<int> NumIterations;
- PTXGenerator::ValueToValueMapTy VMap;
-
- assert(!GPUTriple.empty()
- && "Target triple should be set properly for GPGPU code generation.");
- PTXGenerator PTXGen(Builder, P, GPUTriple);
-
- // Get original IVS and ScopStmt
- unsigned TiledLoopDepth, NonPLoopDepth;
- const clast_stmt *InnerStmt = getScheduleInfo(F, NumIterations,
- TiledLoopDepth, NonPLoopDepth);
- const clast_stmt *TmpStmt;
- const clast_user_stmt *U;
- const clast_for *InnerFor;
- if (CLAST_STMT_IS_A(InnerStmt, stmt_for)) {
- InnerFor = (const clast_for *)InnerStmt;
- TmpStmt = InnerFor->body;
- } else
- TmpStmt = InnerStmt;
- U = (const clast_user_stmt *) TmpStmt;
- ScopStmt *Statement = (ScopStmt *) U->statement->usr;
- for (unsigned i = 0; i < Statement->getNumIterators() - NonPLoopDepth; i++) {
- const Value* IV = Statement->getInductionVariableForDimension(i);
- IVS.insert(const_cast<Value *>(IV));
- }
-
- unsigned OutBytes;
- Values = getGPUValues(OutBytes);
- PTXGen.setOutputBytes(OutBytes);
- PTXGen.startGeneration(Values, IVS, VMap, &LoopBody);
-
- BasicBlock::iterator AfterLoop = Builder.GetInsertPoint();
- Builder.SetInsertPoint(LoopBody);
-
- BasicBlock *AfterBB = 0;
- if (NonPLoopDepth) {
- Value *LowerBound, *UpperBound, *IV, *Stride;
- Type *IntPtrTy = getIntPtrTy();
- LowerBound = ExpGen.codegen(InnerFor->LB, IntPtrTy);
- UpperBound = ExpGen.codegen(InnerFor->UB, IntPtrTy);
- Stride = Builder.getInt(APInt_from_MPZ(InnerFor->stride));
- IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB);
- const Value *OldIV_ = Statement->getInductionVariableForDimension(2);
- Value *OldIV = const_cast<Value *>(OldIV_);
- VMap.insert(std::make_pair<Value*, Value*>(OldIV, IV));
- }
-
- updateWithValueMap(VMap, /* reverse */ false);
- BlockGenerator::generate(Builder, *Statement, ValueMap, P);
- updateWithValueMap(VMap, /* reverse */ true);
-
- if (AfterBB)
- Builder.SetInsertPoint(AfterBB->begin());
-
- // FIXME: The replacement of the host base address with the parameter of ptx
- // subfunction should have been done by updateWithValueMap. We use the
- // following codes to avoid affecting other parts of Polly. This should be
- // fixed later.
- Function *FN = Builder.GetInsertBlock()->getParent();
- for (unsigned j = 0; j < Values.size(); j++) {
- Value *baseAddr = Values[j];
- for (Function::iterator B = FN->begin(); B != FN->end(); ++B) {
- for (BasicBlock::iterator I = B->begin(); I != B->end(); ++I)
- I->replaceUsesOfWith(baseAddr, ValueMap[baseAddr]);
- }
- }
- Builder.SetInsertPoint(AfterLoop);
- PTXGen.setLaunchingParameters(NumIterations[0], NumIterations[1],
- NumIterations[2], NumIterations[3]);
- PTXGen.finishGeneration(FN);
-}
-
bool ClastStmtCodeGen::isInnermostLoop(const clast_for *f) {
const clast_stmt *stmt = f->body;
@@ -833,16 +647,6 @@
}
}
- if (GPGPU && isParallelFor(f)) {
- if (!parallelCodeGeneration) {
- parallelCodeGeneration = true;
- parallelLoops.push_back(f->iterator);
- codegenForGPGPU(f);
- parallelCodeGeneration = false;
- return;
- }
- }
-
codegenForSequential(f);
}
Removed: polly/trunk/lib/CodeGen/PTXGenerator.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/PTXGenerator.cpp?rev=160164&view=auto
==============================================================================
--- polly/trunk/lib/CodeGen/PTXGenerator.cpp (original)
+++ polly/trunk/lib/CodeGen/PTXGenerator.cpp (removed)
@@ -1,652 +0,0 @@
-//===------ PTXGenerator.cpp - IR helper to create loops -----------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains functions to create GPU parallel codes as LLVM-IR.
-//
-//===----------------------------------------------------------------------===//
-
-#include "polly/CodeGen/PTXGenerator.h"
-#include "polly/ScopDetection.h"
-#include "polly/ScopInfo.h"
-
-#include "llvm/Intrinsics.h"
-#include "llvm/Module.h"
-#include "llvm/PassManager.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/Analysis/Dominators.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-
-using namespace llvm;
-using namespace polly;
-
-PTXGenerator::PTXGenerator(IRBuilder<> &Builder, Pass *P,
- const std::string &Triple):
- Builder(Builder), P(P), GPUTriple(Triple), GridWidth(1), GridHeight(1),
- BlockWidth(1), BlockHeight(1), OutputBytes(0) {
-
- InitializeGPUDataTypes();
-}
-
-Module *PTXGenerator::getModule() {
- return Builder.GetInsertBlock()->getParent()->getParent();
-}
-
-Function *PTXGenerator::createSubfunctionDefinition(int NumArgs) {
- assert(NumArgs == 1 && "we support only one array access now.");
-
- Module *M = getModule();
- Function *F = Builder.GetInsertBlock()->getParent();
- std::vector<Type*> Arguments;
- for (int i = 0; i < NumArgs; i++)
- Arguments.push_back(Builder.getInt8PtrTy());
- FunctionType *FT = FunctionType::get(Builder.getVoidTy(), Arguments, false);
- Function *FN = Function::Create(FT, Function::InternalLinkage,
- F->getName() + "_ptx_subfn", M);
- FN->setCallingConv(CallingConv::PTX_Kernel);
-
- // Do not run any optimization pass on the new function.
- P->getAnalysis<polly::ScopDetection>().markFunctionAsInvalid(FN);
-
- for (Function::arg_iterator AI = FN->arg_begin(); AI != FN->arg_end(); ++AI)
- AI->setName("ptx.Array");
-
- return FN;
-}
-
-void PTXGenerator::createSubfunction(SetVector<Value*> &UsedValues,
- SetVector<Value*> &OriginalIVS,
- PTXGenerator::ValueToValueMapTy &VMap,
- Function **SubFunction) {
- Function *FN = createSubfunctionDefinition(UsedValues.size());
- Module *M = getModule();
- LLVMContext &Context = FN->getContext();
- IntegerType *Ty = Builder.getInt64Ty();
-
- // Store the previous basic block.
- BasicBlock *PrevBB = Builder.GetInsertBlock();
-
- // Create basic blocks.
- BasicBlock *HeaderBB = BasicBlock::Create(Context, "ptx.setup", FN);
- BasicBlock *ExitBB = BasicBlock::Create(Context, "ptx.exit", FN);
- BasicBlock *BodyBB = BasicBlock::Create(Context, "ptx.loop_body", FN);
-
- DominatorTree &DT = P->getAnalysis<DominatorTree>();
- DT.addNewBlock(HeaderBB, PrevBB);
- DT.addNewBlock(ExitBB, HeaderBB);
- DT.addNewBlock(BodyBB, HeaderBB);
-
- Builder.SetInsertPoint(HeaderBB);
-
- // Insert VMap items with maps of array base address on the host to base
- // address on the device.
- Function::arg_iterator AI = FN->arg_begin();
- for (unsigned j = 0; j < UsedValues.size(); j++) {
- Value *BaseAddr = UsedValues[j];
- Type *ArrayTy = BaseAddr->getType();
- Value *Param = Builder.CreateBitCast(AI, ArrayTy);
- VMap.insert(std::make_pair<Value*, Value*>(BaseAddr, Param));
- AI++;
- }
-
- // FIXME: These intrinsics should be inserted on-demand. However, we insert
- // them all currently for simplicity.
- Function *GetNctaidX =
- Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_x);
- Function *GetNctaidY =
- Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_y);
- Function *GetCtaidX =
- Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_x);
- Function *GetCtaidY =
- Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_y);
- Function *GetNtidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_x);
- Function *GetNtidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_y);
- Function *GetTidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_x);
- Function *GetTidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_y);
-
- Value *GridWidth = Builder.CreateCall(GetNctaidX);
- GridWidth = Builder.CreateIntCast(GridWidth, Ty, false);
- Value *GridHeight = Builder.CreateCall(GetNctaidY);
- GridHeight = Builder.CreateIntCast(GridHeight, Ty, false);
- Value *BlockWidth = Builder.CreateCall(GetNtidX);
- BlockWidth = Builder.CreateIntCast(BlockWidth, Ty, false);
- Value *BlockHeight = Builder.CreateCall(GetNtidY);
- BlockHeight = Builder.CreateIntCast(BlockHeight, Ty, false);
- Value *BIDx = Builder.CreateCall(GetCtaidX);
- BIDx = Builder.CreateIntCast(BIDx, Ty, false);
- Value *BIDy = Builder.CreateCall(GetCtaidY);
- BIDy = Builder.CreateIntCast(BIDy, Ty, false);
- Value *TIDx = Builder.CreateCall(GetTidX);
- TIDx = Builder.CreateIntCast(TIDx, Ty, false);
- Value *TIDy = Builder.CreateCall(GetTidY);
- TIDy = Builder.CreateIntCast(TIDy, Ty, false);
-
- Builder.CreateBr(BodyBB);
- Builder.SetInsertPoint(BodyBB);
-
- unsigned NumDims = OriginalIVS.size();
- std::vector<Value *> Substitutions;
- Value *BlockID, *ThreadID;
- switch (NumDims) {
- case 1: {
- Value *BlockSize = Builder.CreateMul(BlockWidth, BlockHeight,
- "p_gpu_blocksize");
- BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
- BlockID = Builder.CreateAdd(BlockID, BIDx);
- BlockID = Builder.CreateMul(BlockID, BlockSize);
- ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j");
- ThreadID = Builder.CreateAdd(ThreadID, TIDx);
- ThreadID = Builder.CreateAdd(ThreadID, BlockID);
- Substitutions.push_back(ThreadID);
- break;
- }
- case 2: {
- BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
- BlockID = Builder.CreateAdd(BlockID, BIDx);
- Substitutions.push_back(BlockID);
- ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j");
- ThreadID = Builder.CreateAdd(ThreadID, TIDx);
- Substitutions.push_back(ThreadID);
- break;
- }
- case 3: {
- BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
- BlockID = Builder.CreateAdd(BlockID, BIDx);
- Substitutions.push_back(BlockID);
- Substitutions.push_back(TIDy);
- Substitutions.push_back(TIDx);
- break;
- }
- case 4: {
- Substitutions.push_back(BIDy);
- Substitutions.push_back(BIDx);
- Substitutions.push_back(TIDy);
- Substitutions.push_back(TIDx);
- break;
- }
- default:
- assert(true &&
- "We cannot transform parallel loops whose depth is larger than 4.");
- return;
- }
-
- assert(OriginalIVS.size() == Substitutions.size()
- && "The size of IVS should be equal to the size of substitutions.");
- for (unsigned i = 0; i < OriginalIVS.size(); ++i) {
- VMap.insert(std::make_pair<Value*, Value*>(OriginalIVS[i],
- Substitutions[i]));
- }
-
- Builder.CreateBr(ExitBB);
- Builder.SetInsertPoint(--Builder.GetInsertPoint());
- BasicBlock::iterator LoopBody = Builder.GetInsertPoint();
-
- // Add the termination of the ptx-device subfunction.
- Builder.SetInsertPoint(ExitBB);
- Builder.CreateRetVoid();
-
- Builder.SetInsertPoint(LoopBody);
- *SubFunction = FN;
-}
-
-void PTXGenerator::startGeneration(SetVector<Value*> &UsedValues,
- SetVector<Value*> &OriginalIVS,
- ValueToValueMapTy &VMap,
- BasicBlock::iterator *LoopBody) {
- Function *SubFunction;
- BasicBlock::iterator PrevInsertPoint = Builder.GetInsertPoint();
- createSubfunction(UsedValues, OriginalIVS, VMap, &SubFunction);
- *LoopBody = Builder.GetInsertPoint();
- Builder.SetInsertPoint(PrevInsertPoint);
-}
-
-IntegerType *PTXGenerator::getInt64Type() {
- return Builder.getInt64Ty();
-}
-
-PointerType *PTXGenerator::getI8PtrType() {
- return PointerType::getUnqual(Builder.getInt8Ty());
-}
-
-PointerType *PTXGenerator::getPtrI8PtrType() {
- return PointerType::getUnqual(getI8PtrType());
-}
-
-PointerType *PTXGenerator::getFloatPtrType() {
- return llvm::Type::getFloatPtrTy(getModule()->getContext());
-}
-
-PointerType *PTXGenerator::getGPUContextPtrType() {
- return PointerType::getUnqual(ContextTy);
-}
-
-PointerType *PTXGenerator::getGPUModulePtrType() {
- return PointerType::getUnqual(ModuleTy);
-}
-
-PointerType *PTXGenerator::getGPUDevicePtrType() {
- return PointerType::getUnqual(DeviceTy);
-}
-
-PointerType *PTXGenerator::getPtrGPUDevicePtrType() {
- return PointerType::getUnqual(DevDataTy);
-}
-
-PointerType *PTXGenerator::getGPUFunctionPtrType() {
- return PointerType::getUnqual(KernelTy);
-}
-
-PointerType *PTXGenerator::getGPUEventPtrType() {
- return PointerType::getUnqual(EventTy);
-}
-
-void PTXGenerator::InitializeGPUDataTypes() {
- LLVMContext &Context = getModule()->getContext();
-
- ContextTy = StructType::create(Context, "struct.PollyGPUContextT");
- ModuleTy = StructType::create(Context, "struct.PollyGPUModuleT");
- KernelTy = StructType::create(Context, "struct.PollyGPUFunctionT");
- DeviceTy = StructType::create(Context, "struct.PollyGPUDeviceT");
- DevDataTy = StructType::create(Context,"struct.PollyGPUDevicePtrT");
- EventTy = StructType::create(Context, "struct.PollyGPUEventT");
-}
-
-void PTXGenerator::createCallInitDevice(Value *Context, Value *Device) {
- const char *Name = "polly_initDevice";
- Module *M = getModule();
- Function *F = M->getFunction(Name);
-
- // If F is not available, declare it.
- if (!F) {
- GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
- std::vector<Type*> Args;
- Args.push_back(PointerType::getUnqual(getGPUContextPtrType()));
- Args.push_back(PointerType::getUnqual(getGPUDevicePtrType()));
- FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
- F = Function::Create(Ty, Linkage, Name, M);
- }
-
- Builder.CreateCall2(F, Context, Device);
-}
-
-void PTXGenerator::createCallGetPTXModule(Value *Buffer, Value *Module) {
- const char *Name = "polly_getPTXModule";
- llvm::Module *M = getModule();
- Function *F = M->getFunction(Name);
-
- // If F is not available, declare it.
- if (!F) {
- GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
- std::vector<Type*> Args;
- Args.push_back(getI8PtrType());
- Args.push_back(PointerType::getUnqual(getGPUModulePtrType()));
- FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
- F = Function::Create(Ty, Linkage, Name, M);
- }
-
- Builder.CreateCall2(F, Buffer, Module);
-}
-
-void PTXGenerator::createCallGetPTXKernelEntry(Value *Entry, Value *Module,
- Value *Kernel) {
- const char *Name = "polly_getPTXKernelEntry";
- llvm::Module *M = getModule();
- Function *F = M->getFunction(Name);
-
- // If F is not available, declare it.
- if (!F) {
- GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
- std::vector<Type*> Args;
- Args.push_back(getI8PtrType());
- Args.push_back(getGPUModulePtrType());
- Args.push_back(PointerType::getUnqual(getGPUFunctionPtrType()));
- FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
- F = Function::Create(Ty, Linkage, Name, M);
- }
-
- Builder.CreateCall3(F, Entry, Module, Kernel);
-}
-
-void PTXGenerator::createCallAllocateMemoryForHostAndDevice(Value *HostData,
- Value *DeviceData,
- Value *Size) {
- const char *Name = "polly_allocateMemoryForHostAndDevice";
- Module *M = getModule();
- Function *F = M->getFunction(Name);
-
- // If F is not available, declare it.
- if (!F) {
- GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
- std::vector<Type*> Args;
- Args.push_back(getPtrI8PtrType());
- Args.push_back(PointerType::getUnqual(getPtrGPUDevicePtrType()));
- Args.push_back(getInt64Type());
- FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
- F = Function::Create(Ty, Linkage, Name, M);
- }
-
- Builder.CreateCall3(F, HostData, DeviceData, Size);
-}
-
-void PTXGenerator::createCallCopyFromHostToDevice(Value *DeviceData,
- Value *HostData,
- Value *Size) {
- const char *Name = "polly_copyFromHostToDevice";
- Module *M = getModule();
- Function *F = M->getFunction(Name);
-
- // If F is not available, declare it.
- if (!F) {
- GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
- std::vector<Type*> Args;
- Args.push_back(getPtrGPUDevicePtrType());
- Args.push_back(getI8PtrType());
- Args.push_back(getInt64Type());
- FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
- F = Function::Create(Ty, Linkage, Name, M);
- }
-
- Builder.CreateCall3(F, DeviceData, HostData, Size);
-}
-
-void PTXGenerator::createCallCopyFromDeviceToHost(Value *HostData,
- Value *DeviceData,
- Value *Size) {
- const char *Name = "polly_copyFromDeviceToHost";
- Module *M = getModule();
- Function *F = M->getFunction(Name);
-
- // If F is not available, declare it.
- if (!F) {
- GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
- std::vector<Type*> Args;
- Args.push_back(getI8PtrType());
- Args.push_back(getPtrGPUDevicePtrType());
- Args.push_back(getInt64Type());
- FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
- F = Function::Create(Ty, Linkage, Name, M);
- }
-
- Builder.CreateCall3(F, HostData, DeviceData, Size);
-}
-
-void PTXGenerator::createCallSetKernelParameters(Value *Kernel,
- Value *BlockWidth,
- Value *BlockHeight,
- Value *DeviceData) {
- const char *Name = "polly_setKernelParameters";
- Module *M = getModule();
- Function *F = M->getFunction(Name);
-
- // If F is not available, declare it.
- if (!F) {
- GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
- std::vector<Type*> Args;
- Args.push_back(getGPUFunctionPtrType());
- Args.push_back(getInt64Type());
- Args.push_back(getInt64Type());
- Args.push_back(getPtrGPUDevicePtrType());
- FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
- F = Function::Create(Ty, Linkage, Name, M);
- }
-
- Builder.CreateCall4(F, Kernel, BlockWidth, BlockHeight, DeviceData);
-}
-
-void PTXGenerator::createCallLaunchKernel(Value *Kernel, Value *GridWidth,
- Value *GridHeight) {
- const char *Name = "polly_launchKernel";
- Module *M = getModule();
- Function *F = M->getFunction(Name);
-
- // If F is not available, declare it.
- if (!F) {
- GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
- std::vector<Type*> Args;
- Args.push_back(getGPUFunctionPtrType());
- Args.push_back(getInt64Type());
- Args.push_back(getInt64Type());
- FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
- F = Function::Create(Ty, Linkage, Name, M);
- }
-
- Builder.CreateCall3(F, Kernel, GridWidth, GridHeight);
-}
-
-void PTXGenerator::createCallStartTimerByCudaEvent(Value *StartEvent,
- Value *StopEvent) {
- const char *Name = "polly_startTimerByCudaEvent";
- Module *M = getModule();
- Function *F = M->getFunction(Name);
-
- // If F is not available, declare it.
- if (!F) {
- GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
- std::vector<Type*> Args;
- Args.push_back(PointerType::getUnqual(getGPUEventPtrType()));
- Args.push_back(PointerType::getUnqual(getGPUEventPtrType()));
- FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
- F = Function::Create(Ty, Linkage, Name, M);
- }
-
- Builder.CreateCall2(F, StartEvent, StopEvent);
-}
-
-void PTXGenerator::createCallStopTimerByCudaEvent(Value *StartEvent,
- Value *StopEvent,
- Value *Timer) {
- const char *Name = "polly_stopTimerByCudaEvent";
- Module *M = getModule();
- Function *F = M->getFunction(Name);
-
- // If F is not available, declare it.
- if (!F) {
- GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
- std::vector<Type*> Args;
- Args.push_back(getGPUEventPtrType());
- Args.push_back(getGPUEventPtrType());
- Args.push_back(getFloatPtrType());
- FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
- F = Function::Create(Ty, Linkage, Name, M);
- }
-
- Builder.CreateCall3(F, StartEvent, StopEvent, Timer);
-}
-
-void PTXGenerator::createCallCleanupGPGPUResources(Value *HostData,
- Value *DeviceData,
- Value *Module,
- Value *Context,
- Value *Kernel) {
- const char *Name = "polly_cleanupGPGPUResources";
- llvm::Module *M = getModule();
- Function *F = M->getFunction(Name);
-
- // If F is not available, declare it.
- if (!F) {
- GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
- std::vector<Type*> Args;
- Args.push_back(getI8PtrType());
- Args.push_back(getPtrGPUDevicePtrType());
- Args.push_back(getGPUModulePtrType());
- Args.push_back(getGPUContextPtrType());
- Args.push_back(getGPUFunctionPtrType());
- FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
- F = Function::Create(Ty, Linkage, Name, M);
- }
-
- Builder.CreateCall5(F, HostData, DeviceData, Module, Context, Kernel);
-}
-
-Value *PTXGenerator::getCUDAGridWidth() {
- return ConstantInt::get(getInt64Type(), GridWidth);
-}
-
-Value *PTXGenerator::getCUDAGridHeight() {
- return ConstantInt::get(getInt64Type(), GridHeight);
-}
-
-Value *PTXGenerator::getCUDABlockWidth() {
- return ConstantInt::get(getInt64Type(), BlockWidth);
-}
-
-Value *PTXGenerator::getCUDABlockHeight() {
- return ConstantInt::get(getInt64Type(), BlockHeight);
-}
-
-Value *PTXGenerator::getOutputArraySizeInBytes() {
- return ConstantInt::get(getInt64Type(), OutputBytes);
-}
-
-Value *PTXGenerator::createPTXKernelFunction(Function *SubFunction) {
- Module *M = getModule();
- std::string LLVMKernelStr;
- raw_string_ostream NameROS(LLVMKernelStr);
- formatted_raw_ostream FOS(NameROS);
- FOS << "target triple = \"" << GPUTriple <<"\"\n";
- SubFunction->print(FOS);
-
- // Insert ptx intrinsics into the kernel string.
- for (Module::iterator I = M->begin(), E = M->end(); I != E; ) {
- Function *F = I++;
- // Function must be a prototype and unused.
- if (F->isDeclaration() && F->isIntrinsic()) {
- switch (F->getIntrinsicID()) {
- case Intrinsic::ptx_read_nctaid_x:
- case Intrinsic::ptx_read_nctaid_y:
- case Intrinsic::ptx_read_ctaid_x:
- case Intrinsic::ptx_read_ctaid_y:
- case Intrinsic::ptx_read_ntid_x:
- case Intrinsic::ptx_read_ntid_y:
- case Intrinsic::ptx_read_tid_x:
- case Intrinsic::ptx_read_tid_y:
- F->print(FOS);
- break;
- default:
- break;
- }
- }
- }
-
- Value *LLVMKernel = Builder.CreateGlobalStringPtr(LLVMKernelStr,
- "llvm_kernel");
- Value *MCPU = Builder.CreateGlobalStringPtr("sm_10", "mcpu");
- Value *Features = Builder.CreateGlobalStringPtr("", "cpu_features");
-
- Function *GetDeviceKernel = Intrinsic::getDeclaration(M,
- Intrinsic::codegen);
-
- return Builder.CreateCall3(GetDeviceKernel, LLVMKernel, MCPU, Features);
-}
-
-Value *PTXGenerator::getPTXKernelEntryName(Function *SubFunction) {
- StringRef Entry = SubFunction->getName();
- return Builder.CreateGlobalStringPtr(Entry, "ptx_entry");
-}
-
-void PTXGenerator::eraseUnusedFunctions(Function *SubFunction) {
- Module *M = getModule();
- SubFunction->eraseFromParent();
-
- if (Function *FuncPTXReadNCtaidX = M->getFunction("llvm.ptx.read.nctaid.x"))
- FuncPTXReadNCtaidX->eraseFromParent();
-
- if (Function *FuncPTXReadNCtaidY = M->getFunction("llvm.ptx.read.nctaid.y"))
- FuncPTXReadNCtaidY->eraseFromParent();
-
- if (Function *FuncPTXReadCtaidX = M->getFunction("llvm.ptx.read.ctaid.x"))
- FuncPTXReadCtaidX->eraseFromParent();
-
- if (Function *FuncPTXReadCtaidY = M->getFunction("llvm.ptx.read.ctaid.y"))
- FuncPTXReadCtaidY->eraseFromParent();
-
- if (Function *FuncPTXReadNTidX = M->getFunction("llvm.ptx.read.ntid.x"))
- FuncPTXReadNTidX->eraseFromParent();
-
- if (Function *FuncPTXReadNTidY = M->getFunction("llvm.ptx.read.ntid.y"))
- FuncPTXReadNTidY->eraseFromParent();
-
- if (Function *FuncPTXReadTidX = M->getFunction("llvm.ptx.read.tid.x"))
- FuncPTXReadTidX->eraseFromParent();
-
- if (Function *FuncPTXReadTidY = M->getFunction("llvm.ptx.read.tid.y"))
- FuncPTXReadTidY->eraseFromParent();
-}
-
-void PTXGenerator::finishGeneration(Function *F) {
- // Define data used by the GPURuntime library.
- AllocaInst *PtrCUContext = Builder.CreateAlloca(getGPUContextPtrType(), 0,
- "phcontext");
- AllocaInst *PtrCUDevice = Builder.CreateAlloca(getGPUDevicePtrType(), 0,
- "phdevice");
- AllocaInst *PtrCUModule = Builder.CreateAlloca(getGPUModulePtrType(), 0,
- "phmodule");
- AllocaInst *PtrCUKernel = Builder.CreateAlloca(getGPUFunctionPtrType(), 0,
- "phkernel");
- AllocaInst *PtrCUStartEvent = Builder.CreateAlloca(getGPUEventPtrType(), 0,
- "pstart_timer");
- AllocaInst *PtrCUStopEvent = Builder.CreateAlloca(getGPUEventPtrType(), 0,
- "pstop_timer");
- AllocaInst *PtrDevData = Builder.CreateAlloca(getPtrGPUDevicePtrType(), 0,
- "pdevice_data");
- AllocaInst *PtrHostData = Builder.CreateAlloca(getI8PtrType(), 0,
- "phost_data");
- Type *FloatTy = llvm::Type::getFloatTy(getModule()->getContext());
- AllocaInst *PtrElapsedTimes = Builder.CreateAlloca(FloatTy, 0, "ptimer");
-
- // Initialize the GPU device.
- createCallInitDevice(PtrCUContext, PtrCUDevice);
-
- // Create the GPU kernel module and entry function.
- Value *PTXString = createPTXKernelFunction(F);
- Value *PTXEntry = getPTXKernelEntryName(F);
- createCallGetPTXModule(PTXString, PtrCUModule);
- LoadInst *CUModule = Builder.CreateLoad(PtrCUModule, "cumodule");
- createCallGetPTXKernelEntry(PTXEntry, CUModule, PtrCUKernel);
-
- // Allocate device memory and its corresponding host memory.
- createCallAllocateMemoryForHostAndDevice(PtrHostData, PtrDevData,
- getOutputArraySizeInBytes());
-
- // Get the pointer to the device memory and set the GPU execution parameters.
- LoadInst *DData = Builder.CreateLoad(PtrDevData, "device_data");
- LoadInst *CUKernel = Builder.CreateLoad(PtrCUKernel, "cukernel");
- createCallSetKernelParameters(CUKernel, getCUDABlockWidth(),
- getCUDABlockHeight(), DData);
-
- // Create the start and end timer and record the start time.
- createCallStartTimerByCudaEvent(PtrCUStartEvent, PtrCUStopEvent);
-
- // Launch the GPU kernel.
- createCallLaunchKernel(CUKernel, getCUDAGridWidth(), getCUDAGridHeight());
-
- // Copy the results back from the GPU to the host.
- LoadInst *HData = Builder.CreateLoad(PtrHostData, "host_data");
- createCallCopyFromDeviceToHost(HData, DData, getOutputArraySizeInBytes());
-
- // Record the end time.
- LoadInst *CUStartEvent = Builder.CreateLoad(PtrCUStartEvent, "start_timer");
- LoadInst *CUStopEvent = Builder.CreateLoad(PtrCUStopEvent, "stop_timer");
- createCallStopTimerByCudaEvent(CUStartEvent, CUStopEvent,
- PtrElapsedTimes);
-
- // Cleanup all the resources used.
- LoadInst *CUContext = Builder.CreateLoad(PtrCUContext, "cucontext");
- createCallCleanupGPGPUResources(HData, DData, CUModule, CUContext,
- CUKernel);
-
- // Erase the ptx kernel and device subfunctions and ptx intrinsics from
- // current module.
- eraseUnusedFunctions(F);
-}
Removed: polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.c?rev=160164&view=auto
==============================================================================
--- polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.c (original)
+++ polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.c (removed)
@@ -1,16 +0,0 @@
-int A[128][128];
-
-int gpu_pure() {
- int i,j;
-
- for(i = 0; i < 128; i++)
- for(j = 0; j < 128; j++)
- A[i][j] = i*128 + j;
-
- return 0;
-}
-
-int main() {
- int b = gpu_pure();
- return 0;
-}
Removed: polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.ll?rev=160164&view=auto
==============================================================================
--- polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.ll (original)
+++ polly/trunk/test/CodeGen/GPGPU/2d_innermost_parallel.ll (removed)
@@ -1,65 +0,0 @@
-; RUN: opt %loadPolly -basicaa -polly-import-jscop -polly-import-jscop-dir=%S -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen %s -S | FileCheck %s
-; ModuleID = '2d_innermost_parallel.s'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@A = common global [128 x [128 x i32]] zeroinitializer, align 16
-
-define i32 @gpu_pure() nounwind uwtable {
-entry:
- br label %for.cond
-
-for.cond: ; preds = %for.inc6, %entry
- %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc6 ], [ 0, %entry ]
- %lftr.wideiv5 = trunc i64 %indvars.iv2 to i32
- %exitcond6 = icmp ne i32 %lftr.wideiv5, 128
- br i1 %exitcond6, label %for.body, label %for.end8
-
-for.body: ; preds = %for.cond
- br label %for.cond1
-
-for.cond1: ; preds = %for.inc, %for.body
- %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body ]
- %lftr.wideiv = trunc i64 %indvars.iv to i32
- %exitcond = icmp ne i32 %lftr.wideiv, 128
- br i1 %exitcond, label %for.body3, label %for.end
-
-for.body3: ; preds = %for.cond1
- %tmp = shl nsw i64 %indvars.iv2, 7
- %tmp7 = add nsw i64 %tmp, %indvars.iv
- %arrayidx5 = getelementptr inbounds [128 x [128 x i32]]* @A, i64 0, i64 %indvars.iv2, i64 %indvars.iv
- %tmp8 = trunc i64 %tmp7 to i32
- store i32 %tmp8, i32* %arrayidx5, align 4
- br label %for.inc
-
-for.inc: ; preds = %for.body3
- %indvars.iv.next = add i64 %indvars.iv, 1
- br label %for.cond1
-
-for.end: ; preds = %for.cond1
- br label %for.inc6
-
-for.inc6: ; preds = %for.end
- %indvars.iv.next3 = add i64 %indvars.iv2, 1
- br label %for.cond
-
-for.end8: ; preds = %for.cond
- ret i32 0
-}
-
-define i32 @main() nounwind uwtable {
-entry:
- %call = call i32 @gpu_pure()
- ret i32 0
-}
-
-; CHECK: call void @polly_initDevice
-; CHECK: call void @polly_getPTXModule
-; CHECK: call void @polly_getPTXKernelEntry
-; CHECK: call void @polly_allocateMemoryForHostAndDevice
-; CHECK: call void @polly_setKernelParameters
-; CHECK: call void @polly_startTimerByCudaEvent
-; CHECK: call void @polly_launchKernel
-; CHECK: call void @polly_copyFromDeviceToHost
-; CHECK: call void @polly_stopTimerByCudaEvent
-; CHECK: call void @polly_cleanupGPGPUResources
Removed: polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.c?rev=160164&view=auto
==============================================================================
--- polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.c (original)
+++ polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.c (removed)
@@ -1,17 +0,0 @@
-int A[128][128];
-
-int gpu_no_pure() {
- int i,j,k;
-
- for(i = 0; i < 128; i++)
- for(j = 0; j < 128; j++)
- for(k = 0; k < 256; k++)
- A[i][j] += i*123/(k+1)+5-j*k-123;
-
- return 0;
-}
-
-int main() {
- int b = gpu_no_pure();
- return 0;
-}
Removed: polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll?rev=160164&view=auto
==============================================================================
--- polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll (original)
+++ polly/trunk/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll (removed)
@@ -1,88 +0,0 @@
-; RUN: opt %loadPolly -basicaa -polly-import-jscop -polly-import-jscop-dir=%S -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen %s -S | FileCheck %s
-; ModuleID = '3d_innermost_non_parallel.s'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@A = common global [128 x [128 x i32]] zeroinitializer, align 16
-
-define i32 @gpu_no_pure() nounwind uwtable {
-entry:
- br label %for.cond
-
-for.cond: ; preds = %for.inc16, %entry
- %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc16 ], [ 0, %entry ]
- %lftr.wideiv5 = trunc i64 %indvars.iv2 to i32
- %exitcond6 = icmp ne i32 %lftr.wideiv5, 128
- br i1 %exitcond6, label %for.body, label %for.end18
-
-for.body: ; preds = %for.cond
- br label %for.cond1
-
-for.cond1: ; preds = %for.inc13, %for.body
- %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc13 ], [ 0, %for.body ]
- %lftr.wideiv = trunc i64 %indvars.iv to i32
- %exitcond1 = icmp ne i32 %lftr.wideiv, 128
- br i1 %exitcond1, label %for.body3, label %for.end15
-
-for.body3: ; preds = %for.cond1
- br label %for.cond4
-
-for.cond4: ; preds = %for.inc, %for.body3
- %k.0 = phi i32 [ 0, %for.body3 ], [ %inc, %for.inc ]
- %exitcond = icmp ne i32 %k.0, 256
- br i1 %exitcond, label %for.body6, label %for.end
-
-for.body6: ; preds = %for.cond4
- %tmp = mul nsw i64 %indvars.iv2, 123
- %add = add nsw i32 %k.0, 1
- %tmp7 = trunc i64 %tmp to i32
- %div = sdiv i32 %tmp7, %add
- %add7 = add nsw i32 %div, 5
- %tmp8 = trunc i64 %indvars.iv to i32
- %mul8 = mul nsw i32 %tmp8, %k.0
- %sub = sub nsw i32 %add7, %mul8
- %sub9 = add nsw i32 %sub, -123
- %arrayidx11 = getelementptr inbounds [128 x [128 x i32]]* @A, i64 0, i64 %indvars.iv2, i64 %indvars.iv
- %tmp9 = load i32* %arrayidx11, align 4
- %add12 = add nsw i32 %tmp9, %sub9
- store i32 %add12, i32* %arrayidx11, align 4
- br label %for.inc
-
-for.inc: ; preds = %for.body6
- %inc = add nsw i32 %k.0, 1
- br label %for.cond4
-
-for.end: ; preds = %for.cond4
- br label %for.inc13
-
-for.inc13: ; preds = %for.end
- %indvars.iv.next = add i64 %indvars.iv, 1
- br label %for.cond1
-
-for.end15: ; preds = %for.cond1
- br label %for.inc16
-
-for.inc16: ; preds = %for.end15
- %indvars.iv.next3 = add i64 %indvars.iv2, 1
- br label %for.cond
-
-for.end18: ; preds = %for.cond
- ret i32 0
-}
-
-define i32 @main() nounwind uwtable {
-entry:
- %call = call i32 @gpu_no_pure()
- ret i32 0
-}
-
-; CHECK: call void @polly_initDevice
-; CHECK: call void @polly_getPTXModule
-; CHECK: call void @polly_getPTXKernelEntry
-; CHECK: call void @polly_allocateMemoryForHostAndDevice
-; CHECK: call void @polly_setKernelParameters
-; CHECK: call void @polly_startTimerByCudaEvent
-; CHECK: call void @polly_launchKernel
-; CHECK: call void @polly_copyFromDeviceToHost
-; CHECK: call void @polly_stopTimerByCudaEvent
-; CHECK: call void @polly_cleanupGPGPUResources
Removed: polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%25for.cond---%25for.end18.jscop?rev=160164&view=auto
==============================================================================
--- polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop (original)
+++ polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop (removed)
@@ -1,21 +0,0 @@
-{
- "context" : "{ : }",
- "name" : "for.cond => for.end18",
- "statements" : [
- {
- "accesses" : [
- {
- "kind" : "read",
- "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
- },
- {
- "kind" : "write",
- "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
- }
- ],
- "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 and i2 >= 0 and i2 <= 255 }",
- "name" : "Stmt_for_body6",
- "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, i0, 0, i1, 0, i2, 0] }"
- }
- ]
-}
Removed: polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%25for.cond---%25for.end18.jscop.transformed%2Bgpu?rev=160164&view=auto
==============================================================================
--- polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu (original)
+++ polly/trunk/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu (removed)
@@ -1,21 +0,0 @@
-{
- "context" : "{ : }",
- "name" : "for.cond => for.end18",
- "statements" : [
- {
- "accesses" : [
- {
- "kind" : "read",
- "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
- },
- {
- "kind" : "write",
- "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
- }
- ],
- "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 and i2 >= 0 and i2 <= 255 }",
- "name" : "Stmt_for_body6",
- "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, o0, o1, o2, o3, i2, 0] : o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and i0 = 16o0 + o1 and i1 = 16o2 + o3 }"
- }
- ]
-}
Removed: polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/gpu_pure___%25for.cond---%25for.end8.jscop?rev=160164&view=auto
==============================================================================
--- polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop (original)
+++ polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop (removed)
@@ -1,17 +0,0 @@
-{
- "context" : "{ : }",
- "name" : "for.cond => for.end8",
- "statements" : [
- {
- "accesses" : [
- {
- "kind" : "write",
- "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[128i0 + i1] }"
- }
- ],
- "domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 }",
- "name" : "Stmt_for_body3",
- "schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, i0, 0, i1, 0] }"
- }
- ]
-}
Removed: polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/CodeGen/GPGPU/gpu_pure___%25for.cond---%25for.end8.jscop.transformed%2Bgpu?rev=160164&view=auto
==============================================================================
--- polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu (original)
+++ polly/trunk/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu (removed)
@@ -1,17 +0,0 @@
-{
- "context" : "{ : }",
- "name" : "for.cond => for.end8",
- "statements" : [
- {
- "accesses" : [
- {
- "kind" : "write",
- "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[128i0 + i1] }"
- }
- ],
- "domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 }",
- "name" : "Stmt_for_body3",
- "schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, o0, o1, o2, o3]: o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and i0 = 16o0 + o1 and i1 = 16o2 + o3 }"
- }
- ]
-}
More information about the llvm-commits
mailing list