[polly] r311080 - [ManagedMemoryRewrite] Learn how to rewrite global arrays, allocas.
Siddharth Bhat via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 17 04:22:52 PDT 2017
Author: bollu
Date: Thu Aug 17 04:22:52 2017
New Revision: 311080
URL: http://llvm.org/viewvc/llvm-project?rev=311080&view=rev
Log:
[ManagedMemoryRewrite] Learn how to rewrite global arrays, allocas.
- If we have global arrays, we would like to rewrite them to global
pointers which are allocated using `cudaMallocManaged`.
- If we have allocas in a function, we would like to rewrite them to
heap-allocations with `cudaMallocManaged` and `cudaFree`.
- With these rewrite mechanisms, we can offload _any_ function to the
GPU with no code rewrite whatsover.
Differential Revision: https://reviews.llvm.org/D36516
Added:
polly/trunk/test/GPGPU/managed-memory-rewrite-alloca.ll
polly/trunk/test/GPGPU/simple-managed-memory-rewrite.ll
Modified:
polly/trunk/lib/CodeGen/ManagedMemoryRewrite.cpp
Modified: polly/trunk/lib/CodeGen/ManagedMemoryRewrite.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/ManagedMemoryRewrite.cpp?rev=311080&r1=311079&r2=311080&view=diff
==============================================================================
--- polly/trunk/lib/CodeGen/ManagedMemoryRewrite.cpp (original)
+++ polly/trunk/lib/CodeGen/ManagedMemoryRewrite.cpp Thu Aug 17 04:22:52 2017
@@ -1,5 +1,4 @@
-//===------ ManagedMemoryRewrite.cpp - Rewrite global & malloc'd memory.
-//---===//
+//===---- ManagedMemoryRewrite.cpp - Rewrite global & malloc'd memory -----===//
//
// The LLVM Compiler Infrastructure
//
@@ -30,6 +29,7 @@
#include "polly/Support/SCEVValidator.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -43,14 +43,25 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+static cl::opt<bool> RewriteAllocas(
+ "polly-acc-rewrite-allocas",
+ cl::desc(
+ "Ask the managed memory rewriter to also rewrite alloca instructions"),
+ cl::Hidden, cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<bool> IgnoreLinkageForGlobals(
+ "polly-acc-rewrite-ignore-linkage-for-globals",
+ cl::desc(
+ "By default, we only rewrite globals with internal linkage. This flag "
+ "enables rewriting of globals regardless of linkage"),
+ cl::Hidden, cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+#define DEBUG_TYPE "polly-acc-rewrite-managed-memory"
namespace {
-static llvm::Function *GetOrCreatePollyMallocManaged(Module &M) {
- // TODO: should I allow this pass to be a standalone pass that
- // doesn't care if PollyManagedMemory is enabled or not?
- assert(PollyManagedMemory &&
- "One should only rewrite malloc & free to"
- "polly_{malloc,free}Managed with managed memory enabled.");
+static llvm::Function *getOrCreatePollyMallocManaged(Module &M) {
const char *Name = "polly_mallocManaged";
Function *F = M.getFunction(Name);
@@ -67,12 +78,7 @@ static llvm::Function *GetOrCreatePollyM
return F;
}
-static llvm::Function *GetOrCreatePollyFreeManaged(Module &M) {
- // TODO: should I allow this pass to be a standalone pass that
- // doesn't care if PollyManagedMemory is enabled or not?
- assert(PollyManagedMemory &&
- "One should only rewrite malloc & free to"
- "polly_{malloc,free}Managed with managed memory enabled.");
+static llvm::Function *getOrCreatePollyFreeManaged(Module &M) {
const char *Name = "polly_freeManaged";
Function *F = M.getFunction(Name);
@@ -89,17 +95,242 @@ static llvm::Function *GetOrCreatePollyF
return F;
}
+// Expand a constant expression `Cur`, which is used at instruction `Parent`
+// at index `index`.
+// Since a constant expression can expand to multiple instructions, store all
+// the expands into a set called `Expands`.
+// Note that this goes inorder on the constant expression tree.
+// A * ((B * D) + C)
+// will be processed with first A, then B * D, then B, then D, and then C.
+// Though ConstantExprs are not treated as "trees" but as DAGs, since you can
+// have something like this:
+// *
+// / \
+// \ /
+// (D)
+//
+// For the purposes of this expansion, we expand the two occurences of D
+// separately. Therefore, we expand the DAG into the tree:
+// *
+// / \
+// D D
+// TODO: We don't _have_to do this, but this is the simplest solution.
+// We can write a solution that keeps track of which constants have been
+// already expanded.
+static void expandConstantExpr(ConstantExpr *Cur, PollyIRBuilder &Builder,
+ Instruction *Parent, int index,
+ SmallPtrSet<Instruction *, 4> &Expands) {
+ assert(Cur && "invalid constant expression passed");
+
+ Instruction *I = Cur->getAsInstruction();
+ Expands.insert(I);
+ Parent->setOperand(index, I);
+
+ assert(I && "unable to convert ConstantExpr to Instruction");
+ // The things that `Parent` uses (its operands) should be created
+ // before `Parent`.
+ Builder.SetInsertPoint(Parent);
+ Builder.Insert(I);
+
+ DEBUG(dbgs() << "Expanding ConstantExpression: " << *Cur
+ << " | in Instruction: " << *I << "\n";);
+ for (unsigned i = 0; i < Cur->getNumOperands(); i++) {
+ Value *Op = Cur->getOperand(i);
+ assert(isa<Constant>(Op) && "constant must have a constant operand");
+
+ if (ConstantExpr *CExprOp = dyn_cast<ConstantExpr>(Op))
+ expandConstantExpr(CExprOp, Builder, I, i, Expands);
+ }
+}
+
+// Edit all uses of `OldVal` to NewVal` in `Inst`. This will rewrite
+// `ConstantExpr`s that are used in the `Inst`.
+// Note that `replaceAllUsesWith` is insufficient for this purpose because it
+// does not rewrite values in `ConstantExpr`s.
+static void rewriteOldValToNew(Instruction *Inst, Value *OldVal, Value *NewVal,
+ PollyIRBuilder &Builder) {
+
+ // This contains a set of instructions in which OldVal must be replaced.
+ // We start with `Inst`, and we fill it up with the expanded `ConstantExpr`s
+ // from `Inst`s arguments.
+ // We need to go through this process because `replaceAllUsesWith` does not
+ // actually edit `ConstantExpr`s.
+ SmallPtrSet<Instruction *, 4> InstsToVisit = {Inst};
+
+ // Expand all `ConstantExpr`s and place it in `InstsToVisit`.
+ for (unsigned i = 0; i < Inst->getNumOperands(); i++) {
+ Value *Operand = Inst->getOperand(i);
+ if (ConstantExpr *ValueConstExpr = dyn_cast<ConstantExpr>(Operand))
+ expandConstantExpr(ValueConstExpr, Builder, Inst, i, InstsToVisit);
+ }
+
+ // Now visit each instruction and use `replaceUsesOfWith`. We know that
+ // will work because `I` cannot have any `ConstantExpr` within it.
+ for (Instruction *I : InstsToVisit)
+ I->replaceUsesOfWith(OldVal, NewVal);
+}
+
+// Given a value `Current`, return all Instructions that may contain `Current`
+// in an expression.
+// We need this auxiliary function, because if we have a
+// `Constant` that is a user of `V`, we need to recurse into the
+// `Constant`s uses to gather the root instruciton.
+static void getInstructionUsersOfValue(Value *V,
+ SmallVector<Instruction *, 4> &Owners) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ Owners.push_back(I);
+ } else {
+ // Anything that is a `User` must be a constant or an instruction.
+ auto *C = cast<Constant>(V);
+ for (Use &CUse : C->uses())
+ getInstructionUsersOfValue(CUse.getUser(), Owners);
+ }
+}
+
+static void
+replaceGlobalArray(Module &M, const DataLayout &DL, GlobalVariable &Array,
+ SmallPtrSet<GlobalVariable *, 4> &ReplacedGlobals) {
+ // We only want arrays.
+ ArrayType *ArrayTy = dyn_cast<ArrayType>(Array.getType()->getElementType());
+ if (!ArrayTy)
+ return;
+ Type *ElemTy = ArrayTy->getElementType();
+ PointerType *ElemPtrTy = ElemTy->getPointerTo();
+
+ // We only wish to replace arrays that are visible in the module they
+ // inhabit. Otherwise, our type edit from [T] to T* would be illegal across
+ // modules.
+ const bool OnlyVisibleInsideModule = Array.hasPrivateLinkage() ||
+ Array.hasInternalLinkage() ||
+ IgnoreLinkageForGlobals;
+ if (!OnlyVisibleInsideModule)
+ return;
+
+ if (!Array.hasInitializer() ||
+ !isa<ConstantAggregateZero>(Array.getInitializer()))
+ return;
+
+ // At this point, we have committed to replacing this array.
+ ReplacedGlobals.insert(&Array);
+
+ std::string NewName = (Array.getName() + Twine(".toptr")).str();
+ GlobalVariable *ReplacementToArr =
+ cast<GlobalVariable>(M.getOrInsertGlobal(NewName, ElemPtrTy));
+ ReplacementToArr->setInitializer(ConstantPointerNull::get(ElemPtrTy));
+
+ Function *PollyMallocManaged = getOrCreatePollyMallocManaged(M);
+ Twine FnName = Array.getName() + ".constructor";
+ PollyIRBuilder Builder(M.getContext());
+ FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
+ const GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
+ Function *F = Function::Create(Ty, Linkage, FnName, &M);
+ BasicBlock *Start = BasicBlock::Create(M.getContext(), "entry", F);
+ Builder.SetInsertPoint(Start);
+
+ int ArraySizeInt = DL.getTypeAllocSizeInBits(ArrayTy) / 8;
+ Value *ArraySize = Builder.getInt64(ArraySizeInt);
+ ArraySize->setName("array.size");
+
+ Value *AllocatedMemRaw =
+ Builder.CreateCall(PollyMallocManaged, {ArraySize}, "mem.raw");
+ Value *AllocatedMemTyped =
+ Builder.CreatePointerCast(AllocatedMemRaw, ElemPtrTy, "mem.typed");
+ Builder.CreateStore(AllocatedMemTyped, ReplacementToArr);
+ Builder.CreateRetVoid();
+
+ const int Priority = 0;
+ appendToGlobalCtors(M, F, Priority, ReplacementToArr);
+
+ SmallVector<Instruction *, 4> ArrayUserInstructions;
+ // Get all instructions that use array. We need to do this weird thing
+ // because `Constant`s that contain this array neeed to be expanded into
+ // instructions so that we can replace their parameters. `Constant`s cannot
+ // be edited easily, so we choose to convert all `Constant`s to
+ // `Instruction`s and handle all of the uses of `Array` uniformly.
+ for (Use &ArrayUse : Array.uses())
+ getInstructionUsersOfValue(ArrayUse.getUser(), ArrayUserInstructions);
+
+ for (Instruction *UserOfArrayInst : ArrayUserInstructions) {
+
+ Builder.SetInsertPoint(UserOfArrayInst);
+ // <ty>** -> <ty>*
+ Value *ArrPtrLoaded = Builder.CreateLoad(ReplacementToArr, "arrptr.load");
+ // <ty>* -> [ty]*
+ Value *ArrPtrLoadedBitcasted = Builder.CreateBitCast(
+ ArrPtrLoaded, ArrayTy->getPointerTo(), "arrptr.bitcast");
+ rewriteOldValToNew(UserOfArrayInst, &Array, ArrPtrLoadedBitcasted, Builder);
+ }
+}
+
+// We return all `allocas` that may need to be converted to a call to
+// cudaMallocManaged.
+static void getAllocasToBeManaged(Function &F,
+ SmallSet<AllocaInst *, 4> &Allocas) {
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ auto *Alloca = dyn_cast<AllocaInst>(&I);
+ if (!Alloca)
+ continue;
+ dbgs() << "Checking if " << *Alloca << "may be captured: ";
+
+ if (PointerMayBeCaptured(Alloca, /* ReturnCaptures */ false,
+ /* StoreCaptures */ true)) {
+ Allocas.insert(Alloca);
+ DEBUG(dbgs() << "YES (captured)\n");
+ } else {
+ DEBUG(dbgs() << "NO (not captured)\n");
+ }
+ }
+ }
+}
+
+static void rewriteAllocaAsManagedMemory(AllocaInst *Alloca,
+ const DataLayout &DL) {
+ DEBUG(dbgs() << "rewriting: " << *Alloca << " to managed mem.\n");
+ Module *M = Alloca->getModule();
+ assert(M && "Alloca does not have a module");
+
+ PollyIRBuilder Builder(M->getContext());
+ Builder.SetInsertPoint(Alloca);
+
+ Value *MallocManagedFn = getOrCreatePollyMallocManaged(*Alloca->getModule());
+ const int Size = DL.getTypeAllocSize(Alloca->getType()->getElementType());
+ Value *SizeVal = Builder.getInt64(Size);
+ Value *RawManagedMem = Builder.CreateCall(MallocManagedFn, {SizeVal});
+ Value *Bitcasted = Builder.CreateBitCast(RawManagedMem, Alloca->getType());
+
+ Function *F = Alloca->getFunction();
+ assert(F && "Alloca has invalid function");
+
+ Bitcasted->takeName(Alloca);
+ Alloca->replaceAllUsesWith(Bitcasted);
+ Alloca->eraseFromParent();
+
+ for (BasicBlock &BB : *F) {
+ ReturnInst *Return = dyn_cast<ReturnInst>(BB.getTerminator());
+ if (!Return)
+ continue;
+ Builder.SetInsertPoint(Return);
+
+ Value *FreeManagedFn = getOrCreatePollyFreeManaged(*M);
+ Builder.CreateCall(FreeManagedFn, {RawManagedMem});
+ }
+}
+
class ManagedMemoryRewritePass : public ModulePass {
public:
static char ID;
GPUArch Architecture;
GPURuntime Runtime;
+
ManagedMemoryRewritePass() : ModulePass(ID) {}
virtual bool runOnModule(Module &M) {
+ const DataLayout &DL = M.getDataLayout();
+
Function *Malloc = M.getFunction("malloc");
if (Malloc) {
- Function *PollyMallocManaged = GetOrCreatePollyMallocManaged(M);
+ Function *PollyMallocManaged = getOrCreatePollyMallocManaged(M);
assert(PollyMallocManaged && "unable to create polly_mallocManaged");
Malloc->replaceAllUsesWith(PollyMallocManaged);
Malloc->eraseFromParent();
@@ -108,12 +339,28 @@ public:
Function *Free = M.getFunction("free");
if (Free) {
- Function *PollyFreeManaged = GetOrCreatePollyFreeManaged(M);
+ Function *PollyFreeManaged = getOrCreatePollyFreeManaged(M);
assert(PollyFreeManaged && "unable to create polly_freeManaged");
Free->replaceAllUsesWith(PollyFreeManaged);
Free->eraseFromParent();
}
+ SmallPtrSet<GlobalVariable *, 4> GlobalsToErase;
+ for (GlobalVariable &Global : M.globals())
+ replaceGlobalArray(M, DL, Global, GlobalsToErase);
+ for (GlobalVariable *G : GlobalsToErase)
+ G->eraseFromParent();
+
+ // Rewrite allocas to cudaMallocs if we are asked to do so.
+ if (RewriteAllocas) {
+ SmallSet<AllocaInst *, 4> AllocasToBeManaged;
+ for (Function &F : M.functions())
+ getAllocasToBeManaged(F, AllocasToBeManaged);
+
+ for (AllocaInst *Alloca : AllocasToBeManaged)
+ rewriteAllocaAsManagedMemory(Alloca, DL);
+ }
+
return true;
}
};
Added: polly/trunk/test/GPGPU/managed-memory-rewrite-alloca.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/managed-memory-rewrite-alloca.ll?rev=311080&view=auto
==============================================================================
--- polly/trunk/test/GPGPU/managed-memory-rewrite-alloca.ll (added)
+++ polly/trunk/test/GPGPU/managed-memory-rewrite-alloca.ll Thu Aug 17 04:22:52 2017
@@ -0,0 +1,61 @@
+; RUN: opt %loadPolly -analyze -polly-process-unprofitable \
+; RUN: -polly-scops -polly-use-llvm-names < %s | FileCheck %s --check-prefix=SCOP
+
+; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 \
+; RUN: -polly-target=gpu -polly-codegen-ppcg -polly-acc-codegen-managed-memory \
+; RUN: -polly-acc-rewrite-managed-memory -polly-acc-rewrite-allocas < %s | FileCheck %s --check-prefix=HOST-IR
+
+; REQUIRES: pollyacc
+
+; SCOP: Function: f
+; SCOP-NEXT: Region: %for.body---%for.end
+; SCOP-NEXT: Max Loop Depth: 1
+; SCOP: i32 MemRef_arr[*];
+
+; Check that we generate a constructor call for @A.toptr
+; HOST-IR-NOT: %arr = alloca [100 x i32]
+
+source_filename = "test.c"
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.12.0"
+
+
+define void @f() {
+entry:
+ %arr = alloca [100 x i32]
+ br label %entry.split
+
+entry.split: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %entry.split, %for.body
+ %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* %arr, i64 0, i64 %indvars.iv1
+ store i32 42, i32* %arrayidx, align 4, !tbaa !3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 100
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0
+
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
+
+attributes #0 = { argmemonly nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"PIC Level", i32 2}
+!2 = !{!"clang version 6.0.0 (http://llvm.org/git/clang.git 6660f0d30ef23b3142a6b08f9f41aad3d47c084f) (http://llvm.org/git/llvm.git 052dd78cb30f77a05dc8bb06b851402c4b6c6587)"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"int", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
Added: polly/trunk/test/GPGPU/simple-managed-memory-rewrite.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/simple-managed-memory-rewrite.ll?rev=311080&view=auto
==============================================================================
--- polly/trunk/test/GPGPU/simple-managed-memory-rewrite.ll (added)
+++ polly/trunk/test/GPGPU/simple-managed-memory-rewrite.ll Thu Aug 17 04:22:52 2017
@@ -0,0 +1,73 @@
+; RUN: opt %loadPolly -analyze -polly-process-unprofitable \
+; RUN: -polly-scops -polly-use-llvm-names < %s | FileCheck %s --check-prefix=SCOP
+
+; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 \
+; RUN: -polly-target=gpu -polly-codegen-ppcg -polly-acc-codegen-managed-memory \
+; RUN: -polly-acc-rewrite-managed-memory < %s | FileCheck %s --check-prefix=HOST-IR
+
+; REQUIRES: pollyacc
+
+; SCOP: Function: f
+; SCOP-NEXT: Region: %for.body---%for.end
+; SCOP-NEXT: Max Loop Depth: 1
+; SCOP: i32 MemRef_A[*];
+
+; Check that we generate a constructor call for @A.toptr
+; HOST-IR: @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @.constructor, i8* bitcast (i32** @A.toptr to i8*) }]
+
+; Check that we generate a constructor
+; 4 bytes * 100 = 400
+; HOST-IR: define void @.constructor() {
+; HOST-IR-NEXT: entry:
+; HOST-IR-NEXT: %mem.raw = call i8* @polly_mallocManaged(i64 400)
+; HOST-IR-NEXT: %mem.typed = bitcast i8* %mem.raw to i32*
+; HOST-IR-NEXT: store i32* %mem.typed, i32** @A.toptr
+; HOST-IR-NEXT: ret void
+; HOST-IR-NEXT: }
+
+; HOST-IR-NOT: @A
+
+source_filename = "test.c"
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.12.0"
+
+ at A = internal global [100 x i32] zeroinitializer, align 16
+
+define void @f() {
+entry:
+ br label %entry.split
+
+entry.split: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %entry.split, %for.body
+ %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* @A, i64 0, i64 %indvars.iv1
+ store i32 42, i32* %arrayidx, align 4, !tbaa !3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 100
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0
+
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
+
+attributes #0 = { argmemonly nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"PIC Level", i32 2}
+!2 = !{!"clang version 6.0.0 (http://llvm.org/git/clang.git 6660f0d30ef23b3142a6b08f9f41aad3d47c084f) (http://llvm.org/git/llvm.git 052dd78cb30f77a05dc8bb06b851402c4b6c6587)"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"int", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
More information about the llvm-commits
mailing list