[llvm-commits] [llvm] r47265 - in /llvm/trunk: lib/Transforms/Scalar/GVN.cpp test/Transforms/GVN/sret.ll

Owen Anderson resistor at mac.com
Mon Feb 18 01:24:54 PST 2008


Author: resistor
Date: Mon Feb 18 03:24:53 2008
New Revision: 47265

URL: http://llvm.org/viewvc/llvm-project?rev=47265&view=rev
Log:
Add support to GVN for performing the sret return slot optimization.  When an sret function tail calls
another sret function, it can pass its own sret parameter to the tail callee, letting the callee fill in the
correct return value directly.  llvm-gcc does not emit this form by default.  Instead, it allocates space in the
caller for the sret of the tail call and then uses memcpy to copy the result into the caller's sret parameter.
This optimization detects that pattern and eliminates the intermediate copy.
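
In IR terms, the rewrite looks roughly like this (a hand-written sketch based on the ccosl test case
added below, not the pass's literal output; %agg.result.i8 and %memtmp.i8 stand in for the bitcasts
of those pointers to i8*):

    ; Before: the callee fills a temporary, which is then memcpy'd into the
    ; caller's sret return slot.
    call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret %memtmp,
                       { x86_fp80, x86_fp80 }* byval %iz ) nounwind
    call void @llvm.memcpy.i32( i8* %agg.result.i8, i8* %memtmp.i8, i32 32, i32 16 )

    ; After: the call's sret argument is repointed at the caller's own sret
    ; parameter and the memcpy is erased; the now-dead %memtmp alloca is left
    ; behind for later cleanup.
    call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret %agg.result,
                       { x86_fp80, x86_fp80 }* byval %iz ) nounwind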

Added:
    llvm/trunk/test/Transforms/GVN/sret.ll
Modified:
    llvm/trunk/lib/Transforms/Scalar/GVN.cpp

Modified: llvm/trunk/lib/Transforms/Scalar/GVN.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/GVN.cpp?rev=47265&r1=47264&r2=47265&view=diff

==============================================================================
--- llvm/trunk/lib/Transforms/Scalar/GVN.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/GVN.cpp Mon Feb 18 03:24:53 2008
@@ -21,6 +21,7 @@
 #include "llvm/Function.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Instructions.h"
+#include "llvm/ParameterAttributes.h"
 #include "llvm/Value.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
@@ -738,6 +739,8 @@
     bool processNonLocalLoad(LoadInst* L,
                              SmallVector<Instruction*, 4>& toErase);
     bool processMemCpy(MemCpyInst* M, SmallVector<Instruction*, 4>& toErase);
+    bool performReturnSlotOptzn(MemCpyInst* cpy, CallInst* C,
+                                SmallVector<Instruction*, 4>& toErase);
     Value *GetValueForBlock(BasicBlock *BB, LoadInst* orig,
                             DenseMap<BasicBlock*, Value*> &Phis,
                             bool top_level = false);
@@ -1048,6 +1051,62 @@
   return deletedLoad;
 }
 
+/// performReturnSlotOptzn - takes a memcpy and the call that it depends on,
+/// and checks for the possibility of a return slot optimization: having the
+/// call write its result directly into the memcpy's destination (the caller's
+/// sret return slot) rather than into a temporary that is then memcpy'd over.
+bool GVN::performReturnSlotOptzn(MemCpyInst* cpy, CallInst* C,
+                                 SmallVector<Instruction*, 4>& toErase) {
+  // Check that we're copying to an argument...
+  Value* cpyDest = cpy->getDest();
+  if (!isa<Argument>(cpyDest))
+    return false;
+  
+  // And that the argument is the return slot
+  Argument* sretArg = cast<Argument>(cpyDest);
+  if (!sretArg->hasStructRetAttr())
+    return false;
+  
+  // Make sure the return slot is otherwise dead
+  std::set<User*> useList(sretArg->use_begin(), sretArg->use_end());
+  while (!useList.empty()) {
+    User* UI = *useList.begin();
+    
+    if (isa<GetElementPtrInst>(UI) || isa<BitCastInst>(UI)) {
+      useList.insert(UI->use_begin(), UI->use_end());
+      useList.erase(UI);
+    } else if (UI == cpy)
+      useList.erase(UI);
+    else
+      return false;
+  }
+  
+  // Make sure the call cannot modify the return slot in some unexpected way
+  AliasAnalysis& AA = getAnalysis<AliasAnalysis>();
+  if (AA.getModRefInfo(C, cpy->getRawDest(), ~0UL) != AliasAnalysis::NoModRef)
+    return false;
+  
+  // If all checks passed, then we can perform the transformation
+  CallSite CS = CallSite::get(C);
+  for (unsigned i = 0; i < CS.arg_size(); ++i) {
+    if (CS.paramHasAttr(i+1, ParamAttr::StructRet)) {
+      if (CS.getArgument(i)->getType() != cpyDest->getType())
+        return false;
+      
+      CS.setArgument(i, cpyDest);
+      break;
+    }
+  }
+  
+  MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
+  MD.dropInstruction(C);
+  
+  // Remove the memcpy
+  toErase.push_back(cpy);
+  
+  return true;
+}
+
 /// processMemCpy - perform simplification of memcpys.  If we have memcpy A which
 /// copies X to Y, and memcpy B which copies Y to Z, then we can rewrite B to be
 /// a memcpy from X to Z (or potentially a memmove, depending on circumstances).
@@ -1059,9 +1118,14 @@
   // First, we have to check that the dependency is another memcpy
   Instruction* dep = MD.getDependency(M);
   if  (dep == MemoryDependenceAnalysis::None ||
-       dep == MemoryDependenceAnalysis::NonLocal ||
-       !isa<MemCpyInst>(dep))
+       dep == MemoryDependenceAnalysis::NonLocal)
     return false;
+  else if (!isa<MemCpyInst>(dep)) {
+    if (CallInst* C = dyn_cast<CallInst>(dep))
+      return performReturnSlotOptzn(M, C, toErase);
+    else
+      return false;
+  }
   
   // We can only transform memcpys where the dest of one is the source of the
   // other

Added: llvm/trunk/test/Transforms/GVN/sret.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/GVN/sret.ll?rev=47265&view=auto

==============================================================================
--- llvm/trunk/test/Transforms/GVN/sret.ll (added)
+++ llvm/trunk/test/Transforms/GVN/sret.ll Mon Feb 18 03:24:53 2008
@@ -0,0 +1,28 @@
+; RUN: llvm-as < %s | opt -gvn | llvm-dis | grep memcpy | count 1
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i686-apple-darwin9"
+
+define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret  %agg.result, { x86_fp80, x86_fp80 }* byval  %z) nounwind  {
+entry:
+	%iz = alloca { x86_fp80, x86_fp80 }		; <{ x86_fp80, x86_fp80 }*> [#uses=3]
+	%memtmp = alloca { x86_fp80, x86_fp80 }, align 16		; <{ x86_fp80, x86_fp80 }*> [#uses=2]
+	%tmp1 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 1		; <x86_fp80*> [#uses=1]
+	%tmp2 = load x86_fp80* %tmp1, align 16		; <x86_fp80> [#uses=1]
+	%tmp3 = sub x86_fp80 0xK80000000000000000000, %tmp2		; <x86_fp80> [#uses=1]
+	%tmp4 = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 1		; <x86_fp80*> [#uses=1]
+	%real = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 0		; <x86_fp80*> [#uses=1]
+	%tmp7 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 0		; <x86_fp80*> [#uses=1]
+	%tmp8 = load x86_fp80* %tmp7, align 16		; <x86_fp80> [#uses=1]
+	store x86_fp80 %tmp3, x86_fp80* %real, align 16
+	store x86_fp80 %tmp8, x86_fp80* %tmp4, align 16
+	call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret  %memtmp, { x86_fp80, x86_fp80 }* byval  %iz ) nounwind 
+	%memtmp14 = bitcast { x86_fp80, x86_fp80 }* %memtmp to i8*		; <i8*> [#uses=1]
+	%agg.result15 = bitcast { x86_fp80, x86_fp80 }* %agg.result to i8*		; <i8*> [#uses=1]
+	call void @llvm.memcpy.i32( i8* %agg.result15, i8* %memtmp14, i32 32, i32 16 )
+	ret void
+}
+
+declare void @ccoshl({ x86_fp80, x86_fp80 }* noalias sret , { x86_fp80, x86_fp80 }* byval ) nounwind 
+
+declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) nounwind 




