[llvm] 91ef930 - [GlobalOpt] Remove preallocated calls when possible

Arthur Eubanks via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 18 10:01:36 PDT 2020


Author: Arthur Eubanks
Date: 2020-06-18T09:56:13-07:00
New Revision: 91ef9305268760727e8cc90e2542a803621b2336

URL: https://github.com/llvm/llvm-project/commit/91ef9305268760727e8cc90e2542a803621b2336
DIFF: https://github.com/llvm/llvm-project/commit/91ef9305268760727e8cc90e2542a803621b2336.diff

LOG: [GlobalOpt] Remove preallocated calls when possible

When possible (e.g. internal linkage), strip the preallocated attribute
off parameters/arguments.
This requires removing the "preallocated" operand bundle from the call
site, replacing @llvm.call.preallocated.arg() with an alloca and a
bitcast to i8*, and removing the @llvm.call.preallocated.setup(). Since
@llvm.call.preallocated.arg() can be called multiple times with the same
arg index, we create an alloca per arg index.
We add a @llvm.stacksave() where the @llvm.call.preallocated.setup() was
and a @llvm.stackrestore() after the preallocated call to prevent the
stack from blowing up. This is valid because, before the transformation,
the argument would not exist on the stack after the call anyway.

This does not currently handle all possible preallocated calls. We will
need to figure out where to put @llvm.stackrestore() in the cases where
there is no obvious place to put it, for example conditional
preallocated calls and invokes.

This sort of transformation may need to be moved to somewhere more
accessible to accommodate similar transformations (like inlining) in the
future.

Reviewers: efriedma, hans

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80951

Added: 
    llvm/test/Transforms/GlobalOpt/preallocated.ll

Modified: 
    llvm/lib/Transforms/IPO/GlobalOpt.cpp
    llvm/test/Transforms/GlobalOpt/fastcc.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 0257be156f30..853aad5e207e 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -40,6 +40,7 @@
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -2270,6 +2271,118 @@ hasOnlyColdCalls(Function &F,
   return true;
 }
 
+static bool hasMustTailCallers(Function *F) {
+  for (User *U : F->users()) {
+    CallBase *CB = dyn_cast<CallBase>(U);
+    if (!CB) {
+      assert(isa<BlockAddress>(U) &&
+             "Expected either CallBase or BlockAddress");
+      continue;
+    }
+    if (CB->isMustTailCall())
+      return true;
+  }
+  return false;
+}
+
+static bool hasInvokeCallers(Function *F) {
+  for (User *U : F->users())
+    if (isa<InvokeInst>(U))
+      return true;
+  return false;
+}
+
+static void RemovePreallocated(Function *F) {
+  RemoveAttribute(F, Attribute::Preallocated);
+
+  auto *M = F->getParent();
+
+  IRBuilder<> Builder(M->getContext());
+
+  // Cannot modify users() while iterating over it, so make a copy.
+  SmallVector<User *, 4> PreallocatedCalls(F->users());
+  for (User *U : PreallocatedCalls) {
+    CallBase *CB = dyn_cast<CallBase>(U);
+    if (!CB)
+      continue;
+
+    assert(
+        !CB->isMustTailCall() &&
+        "Shouldn't call RemotePreallocated() on a musttail preallocated call");
+    // Create copy of call without "preallocated" operand bundle.
+    SmallVector<OperandBundleDef, 1> OpBundles;
+    CB->getOperandBundlesAsDefs(OpBundles);
+    CallBase *PreallocatedSetup = nullptr;
+    for (auto *It = OpBundles.begin(); It != OpBundles.end(); ++It) {
+      if (It->getTag() == "preallocated") {
+        PreallocatedSetup = cast<CallBase>(*It->input_begin());
+        OpBundles.erase(It);
+        break;
+      }
+    }
+    assert(PreallocatedSetup && "Did not find preallocated bundle");
+    uint64_t ArgCount =
+        cast<ConstantInt>(PreallocatedSetup->getArgOperand(0))->getZExtValue();
+    CallBase *NewCB = nullptr;
+    if (InvokeInst *II = dyn_cast<InvokeInst>(CB)) {
+      NewCB = InvokeInst::Create(II, OpBundles, CB);
+    } else {
+      CallInst *CI = cast<CallInst>(CB);
+      NewCB = CallInst::Create(CI, OpBundles, CB);
+    }
+    CB->replaceAllUsesWith(NewCB);
+    NewCB->takeName(CB);
+    CB->eraseFromParent();
+
+    Builder.SetInsertPoint(PreallocatedSetup);
+    auto *StackSave =
+        Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stacksave));
+
+    Builder.SetInsertPoint(NewCB->getNextNonDebugInstruction());
+    Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackrestore),
+                       StackSave);
+
+    // Replace @llvm.call.preallocated.arg() with alloca.
+    // Cannot modify users() while iterating over it, so make a copy.
+    // @llvm.call.preallocated.arg() can be called with the same index multiple
+    // times. So for each @llvm.call.preallocated.arg(), we see if we have
+    // already created a Value* for the index, and if not, create an alloca and
+    // bitcast right after the @llvm.call.preallocated.setup() so that it
+    // dominates all uses.
+    SmallVector<Value *, 2> ArgAllocas(ArgCount);
+    SmallVector<User *, 2> PreallocatedArgs(PreallocatedSetup->users());
+    for (auto *User : PreallocatedArgs) {
+      auto *UseCall = cast<CallBase>(User);
+      assert(UseCall->getCalledFunction()->getIntrinsicID() ==
+                 Intrinsic::call_preallocated_arg &&
+             "preallocated token use was not a llvm.call.preallocated.arg");
+      uint64_t AllocArgIndex =
+          cast<ConstantInt>(UseCall->getArgOperand(1))->getZExtValue();
+      Value *AllocaReplacement = ArgAllocas[AllocArgIndex];
+      if (!AllocaReplacement) {
+        auto AddressSpace = UseCall->getType()->getPointerAddressSpace();
+        auto *ArgType = UseCall
+                            ->getAttribute(AttributeList::FunctionIndex,
+                                           Attribute::Preallocated)
+                            .getValueAsType();
+        auto *InsertBefore = PreallocatedSetup->getNextNonDebugInstruction();
+        Builder.SetInsertPoint(InsertBefore);
+        auto *Alloca =
+            Builder.CreateAlloca(ArgType, AddressSpace, nullptr, "paarg");
+        auto *BitCast = Builder.CreateBitCast(
+            Alloca, Type::getInt8PtrTy(M->getContext()), UseCall->getName());
+        ArgAllocas[AllocArgIndex] = BitCast;
+        AllocaReplacement = BitCast;
+      }
+
+      UseCall->replaceAllUsesWith(AllocaReplacement);
+      UseCall->eraseFromParent();
+    }
+    // Remove @llvm.call.preallocated.setup().
+    cast<Instruction>(PreallocatedSetup)->eraseFromParent();
+  }
+}
+
 static bool
 OptimizeFunctions(Module &M,
                   function_ref<TargetLibraryInfo &(Function &)> GetTLI,
@@ -2333,13 +2446,23 @@ OptimizeFunctions(Module &M,
     // wouldn't be safe in the presence of inalloca.
     // FIXME: We should also hoist alloca affected by this to the entry
     // block if possible.
-    // FIXME: handle preallocated
     if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca) &&
         !F->hasAddressTaken()) {
       RemoveAttribute(F, Attribute::InAlloca);
       Changed = true;
     }
 
+    // FIXME: handle invokes
+    // FIXME: handle musttail
+    if (F->getAttributes().hasAttrSomewhere(Attribute::Preallocated)) {
+      if (!F->hasAddressTaken() && !hasMustTailCallers(F) &&
+          !hasInvokeCallers(F)) {
+        RemovePreallocated(F);
+        Changed = true;
+      }
+      continue;
+    }
+
     if (hasChangeableCC(F) && !F->isVarArg() && !F->hasAddressTaken()) {
       NumInternalFunc++;
       TargetTransformInfo &TTI = GetTTI(*F);

diff  --git a/llvm/test/Transforms/GlobalOpt/fastcc.ll b/llvm/test/Transforms/GlobalOpt/fastcc.ll
index 7bf3e9700f1c..9c9076d0155b 100644
--- a/llvm/test/Transforms/GlobalOpt/fastcc.ll
+++ b/llvm/test/Transforms/GlobalOpt/fastcc.ll
@@ -36,8 +36,7 @@ define internal i32 @inalloca(i32* inalloca %p) {
 }
 
 define internal i32 @preallocated(i32* preallocated(i32) %p) {
-; TODO: handle preallocated:
-; CHECK-NOT-LABEL: define internal fastcc i32 @preallocated(i32* %p)
+; CHECK-LABEL: define internal fastcc i32 @preallocated(i32* %p)
   %rv = load i32, i32* %p
   ret i32 %rv
 }
@@ -50,21 +49,21 @@ define void @call_things() {
   call i32 @j(i32* %m)
   %args = alloca inalloca i32
   call i32 @inalloca(i32* inalloca %args)
-  ; TODO: handle preallocated
-  ;%c = call token @llvm.call.preallocated.setup(i32 1)
-  ;%N = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(i32)
-  ;%n = bitcast i8* %N to i32*
-   ;call i32 @preallocated(i32* preallocated(i32) %n) ["preallocated"(token %c)]
+  %c = call token @llvm.call.preallocated.setup(i32 1)
+  %N = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(i32)
+  %n = bitcast i8* %N to i32*
+  call i32 @preallocated(i32* preallocated(i32) %n) ["preallocated"(token %c)]
   ret void
 }
-
- at llvm.used = appending global [1 x i8*] [
-   i8* bitcast (i32(i32*)* @j to i8*)
-], section "llvm.metadata"
-
 ; CHECK-LABEL: define void @call_things()
 ; CHECK: call fastcc i32 @f
 ; CHECK: call fastcc i32 @g
 ; CHECK: call coldcc i32 @h
 ; CHECK: call i32 @j
 ; CHECK: call fastcc i32 @inalloca(i32* %args)
+; CHECK-NOT: llvm.call.preallocated
+; CHECK: call fastcc i32 @preallocated(i32* %n)
+
+ at llvm.used = appending global [1 x i8*] [
+   i8* bitcast (i32(i32*)* @j to i8*)
+], section "llvm.metadata"

diff  --git a/llvm/test/Transforms/GlobalOpt/preallocated.ll b/llvm/test/Transforms/GlobalOpt/preallocated.ll
new file mode 100644
index 000000000000..972ce8177928
--- /dev/null
+++ b/llvm/test/Transforms/GlobalOpt/preallocated.ll
@@ -0,0 +1,88 @@
+; RUN: opt < %s -globalopt -S | FileCheck %s
+
+declare token @llvm.call.preallocated.setup(i32)
+declare i8* @llvm.call.preallocated.arg(token, i32)
+declare i32 @__CxxFrameHandler3(...)
+
+; Don't touch functions with any musttail calls
+define internal i32 @preallocated_musttail(i32* preallocated(i32) %p) {
+; CHECK-LABEL: define internal i32 @preallocated_musttail(i32* preallocated(i32) %p)
+  %rv = load i32, i32* %p
+  ret i32 %rv
+}
+
+define i32 @call_preallocated_musttail(i32* preallocated(i32) %a) {
+  %r = musttail call i32 @preallocated_musttail(i32* preallocated(i32) %a)
+  ret i32 %r
+}
+; CHECK-LABEL: define i32 @call_preallocated_musttail(i32* preallocated(i32) %a)
+; CHECK: musttail call i32 @preallocated_musttail(i32* preallocated(i32) %a)
+
+define i32 @call_preallocated_musttail_without_musttail() {
+  %c = call token @llvm.call.preallocated.setup(i32 1)
+  %N = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(i32)
+  %n = bitcast i8* %N to i32*
+  %r = call i32 @preallocated_musttail(i32* preallocated(i32) %n) ["preallocated"(token %c)]
+  ret i32 %r
+}
+; CHECK-LABEL: define i32 @call_preallocated_musttail_without_musttail()
+; CHECK: call i32 @preallocated_musttail(i32* preallocated(i32) %n)
+
+; Check that only one alloca per preallocated arg
+define internal i32 @preallocated(i32* preallocated(i32) %a) {
+; CHECK-LABEL: define internal fastcc i32 @preallocated(i32* %a)
+  %rv = load i32, i32* %a
+  ret i32 %rv
+}
+
+declare void @foo(i8*)
+
+define i32 @call_preallocated_multiple_args() {
+; CHECK-LABEL: define i32 @call_preallocated_multiple_args()
+; CHECK-NEXT: [[SS:%[0-9a-zA-Z_]+]] = call i8* @llvm.stacksave()
+; CHECK-NEXT: [[ARG0:%[0-9a-zA-Z_]+]] = alloca i32
+; CHECK-NEXT: [[ARG1:%[0-9a-zA-Z_]+]] = bitcast i32* [[ARG0]] to i8*
+; CHECK-NEXT: call void @foo(i8* [[ARG1]])
+; CHECK-NEXT: call void @foo(i8* [[ARG1]])
+; CHECK-NEXT: call void @foo(i8* [[ARG1]])
+; CHECK-NEXT: [[ARG2:%[0-9a-zA-Z_]+]] = bitcast i8* [[ARG1]] to i32*
+; CHECK-NEXT: call fastcc i32 @preallocated(i32* [[ARG2]])
+; CHECK-NEXT: call void @llvm.stackrestore(i8* [[SS]])
+; CHECK-NEXT: ret
+  %c = call token @llvm.call.preallocated.setup(i32 1)
+  %a1 = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(i32)
+  call void @foo(i8* %a1)
+  %a2 = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(i32)
+  call void @foo(i8* %a2)
+  %a3 = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(i32)
+  call void @foo(i8* %a3)
+  %b = bitcast i8* %a3 to i32*
+  %r = call i32 @preallocated(i32* preallocated(i32) %b) ["preallocated"(token %c)]
+  ret i32 %r
+}
+
+; Don't touch functions with any invokes
+define internal i32 @preallocated_invoke(i32* preallocated(i32) %p) {
+; CHECK-LABEL: define internal i32 @preallocated_invoke(i32* preallocated(i32) %p)
+  %rv = load i32, i32* %p
+  ret i32 %rv
+}
+
+define i32 @call_preallocated_invoke() personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
+  %c = call token @llvm.call.preallocated.setup(i32 1)
+  %a = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(i32)
+  %b = bitcast i8* %a to i32*
+  %r = invoke i32 @preallocated_invoke(i32* preallocated(i32) %b) ["preallocated"(token %c)]
+       to label %conta unwind label %contb
+conta:
+  ret i32 %r
+contb:
+  %s = catchswitch within none [label %catch] unwind to caller
+catch:
+  %p = catchpad within %s []
+  catchret from %p to label %cont
+cont:
+  ret i32 42
+}
+; CHECK-LABEL: define i32 @call_preallocated_invoke()
+; CHECK: invoke i32 @preallocated_invoke(i32* preallocated(i32) %b)


        


More information about the llvm-commits mailing list