[PATCH] Optimize double storing by memset; memcpy (Take two)

Wed Mar 6 08:03:35 PST 2013

A common idiom in some code is to do the following:
  memset(dest, 0, dest_size);
  memcpy(dest, src, src_size);

This patch implements a rewrite to avoid storing to the same location twice. The above code is rewritten as:

 memcpy(dest, src, dest_size);
 memset((char *)dest + dest_size, setVal,
         src_size > dest_size ? src_size - dest_size : 0);

This is tricker than it seems---not the transformation, that is pretty straightforward. The issue is all of the transformations that happen from the innocent looking source above. On some systems, memset and memcpy are macros that get transformed into safety checking versions, which eventually result in llvm.memset and llvm.memcpy in the IR. This transformation occurs in SimplifyLibCalls, when directly invoked by instcombine. However, as originally written, MemCpyOptimizer is only run once and before the instcombine which exposes the intrinsics. I have added another run of MemCpyOptimizer that runs after SimplifyLibCalls. The numbers from running the nightly test on an unloaded machine indicate that there is no overall degradation in compile time. See the numbers below.

Before this change was put in:
CC_Time (geo mean): 0.548188 (ave) 255.628775
CC_Real_Time (geo mean): 0.630460 (ave) 264.708950
Exec_Time (geo mean): 0.027258 (ave) 59.208838
Exec_Real_time (geo mean): 0.062727 (ave) 60.841512

After the change was put in:
CC_Time (geo mean): 0.682091 (ave) 398.181300
CC_Real_Time (geo mean): 0.792249 (ave) 410.853200
Exec_Time (geo mean): 0.017159 (ave) 57.512063
Exec_Real_time (geo mean): 0.044320 (ave) 59.543125

Note: the geometric mean is so different from the arithmetic means, as there are lots of nightly tests that run for tiny fractions of a second.


http://llvm-reviews.chandlerc.com/D498

Files:
  test/Transforms/MemCpyOpt/memSetMemCpyArgRewrite.ll
  lib/Transforms/Scalar/MemCpyOptimizer.cpp
  lib/Transforms/IPO/PassManagerBuilder.cpp

Index: test/Transforms/MemCpyOpt/memSetMemCpyArgRewrite.ll
===================================================================

--- test/Transforms/MemCpyOpt/memSetMemCpyArgRewrite.ll
+++ test/Transforms/MemCpyOpt/memSetMemCpyArgRewrite.ll
@@ -0,0 +1,16 @@
+; RUN: opt -memcpyopt %s -S | FileCheck %s
+define void @foo(i8* %src, i32 %src_size, i8* %dest, i32 %dest_size, i8 %setVal) {
+; CHECK:  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %dest_size, i32 1, i1 true)
+; CHECK:  %NewDest = getelementptr i8* %dest, i32 %dest_size
+; CHECK:  %SrcSizeGT = icmp ugt i32 %dest_size, %dest_size
+; CHECK:  %LenDiff = sub i32 %dest_size, %dest_size
+; CHECK:  %NewLen = select i1 %SrcSizeGT, i32 %LenDiff, i32 0
+; CHECK:  call void @llvm.memset.p0i8.i32(i8* %NewDest, i8 %SetValTrunc, i32 %NewLen, i32 1, i1 true)
+
+  call void @llvm.memset.p0i8.i32(i8* %dest, i8 %setVal, i32 %dest_size, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %dest_size, i32 1, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i32(i8* , i8, i32, i32, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1)
Index: lib/Transforms/Scalar/MemCpyOptimizer.cpp
===================================================================
--- lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -335,6 +335,7 @@
                               uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
     bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
                                        uint64_t MSize);
+    bool processMemSetMemCpyDependence(MemCpyInst *M, MemSetInst *MDep);
     bool processByValArgument(CallSite CS, unsigned ArgNo);
     Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
                                       Value *ByteVal);
@@ -800,13 +801,89 @@
   return true;
 }
 
+/// \brief We've found that the (upward scanning) memory dependence of
+/// \p MemCpy is \p MemSet.  Try to simplify \p MemSet to only set \p MemCpy's
+/// out that aren't copied over by \p MemCpy.
+///
+/// In other words, transform:
+/// \code
+/// memset(dest, setVal, dest_size);
+/// memcpy(dest, src, src_size);
+/// \endcode
+///into:
+/// \code
+/// memcpy(dest, src, dest_size);
+/// memset((char *)dest + dest_size, setVal,
+///         src_size > dest_size ? src_size - dest_size : 0);
+/// \endcode
+bool MemCpyOpt::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
+                                              MemSetInst *MemSet) {
+  // We can only transforms memset/memcpy with the same destinations.
+  if (MemSet->getDest() != MemCpy->getDest() || MemSet->isVolatile())
+    return false;
+
+  // Just blat out:
+  // call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %dest_size,
+  //                                      i32 1, i1 false)
+  // %newDest = getelementptr i8* %dest, i32 %src_size
+  // %srcSizeGT = icmp ugt i32 %src_size, %dest_size
+  // %lenDiff = sub i32 %src_size, %dest_size
+  // %newLen = select i1 %srcSizeGT, i32 %lenDiff, i32 0
+  // %setValTrunc = trunc i32 %setVal to i8
+  // call void @llvm.memset.p0i8.i32(i8* %newDest, i8 %setValTrunc,
+  //                                 i32 %newLen, i33 1, i1 false)
+
+  // first, get all of the variables needed: dest, dest_size, src, src_size,
+  // setVal
+  Value *Dest = MemSet->getOperand(0);
+  Value *DestSize = MemSet->getOperand(2);
+  Value *Src = MemCpy->getOperand(1);
+  Value *SrcSize = MemCpy->getOperand(2);
+  Value *SetVal = MemSet->getOperand(1);
+  unsigned Align = MemCpy->getAlignment();
+  Value *IsVolatile = MemCpy->getOperand(4);
+  IRBuilder<> Builder(MemCpy);
+
+  Builder.CreateMemCpy(Dest, Src, DestSize, Align, IsVolatile);
+
+  Value *NewDest = Builder.CreateGEP(Dest, SrcSize);
+  NewDest->setName("NewDest");
+  Value *SrcSizeGT = Builder.CreateICmpUGT(SrcSize, DestSize);
+  SrcSizeGT->setName("SrcSizeGT");
+  Value* LenDiff = Builder.CreateSub(SrcSize, DestSize);
+  LenDiff->setName("LenDiff");
+  LLVMContext &Ctx = MemCpy->getContext();
+  Constant *Zero = ConstantInt::getNullValue(LenDiff->getType());
+  Value * NewLen = Builder.CreateSelect(SrcSizeGT, LenDiff, Zero);
+  NewLen->setName("NewLen");
+  Value *SetValTrunc = Builder.CreateTrunc(SetVal, Type::getInt8Ty(Ctx));
+  SetValTrunc->setName("SetValTrunc");
+  CallInst *NewMemSet =
+    Builder.CreateMemSet(NewDest, SetValTrunc, NewLen, Align, IsVolatile);
+
+  MD->removeInstruction(MemCpy);
+  MemCpy->eraseFromParent();
+  MD->removeInstruction(MemSet);
+  MemSet->eraseFromParent();
+  DEBUG(NewMemSet->getParent()->dump());
+  
+  return true;
+}
 
 /// processMemCpy - perform simplification of memcpy's.  If we have memcpy A
 /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
 /// B to be a memcpy from X to Z (or potentially a memmove, depending on
 /// circumstances). This allows later passes to remove the first memcpy
 /// altogether.
 bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
+  AliasAnalysis::Location SrcLoc = AliasAnalysis::getLocationForSource(M);
+  MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true,
+                                                         M, M->getParent());
+  if (SrcDepInfo.isClobber())
+    if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
+      if (processMemSetMemCpyDependence(M, MDep))
+        return true;
+
   // We can only optimize statically-sized memcpy's that are non-volatile.
   ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
   if (CopySize == 0 || M->isVolatile()) return false;
@@ -831,7 +908,7 @@
         return true;
       }
 
-  // The are two possible optimizations we can do for memcpy:
+  // There are two possible optimizations we can do for memcpy:
   //   a) memcpy-memcpy xform which exposes redundance for DSE.
   //   b) call-memcpy xform for return slot optimization.
   MemDepResult DepInfo = MD->getDependency(M);
@@ -847,9 +924,6 @@
     }
   }
 
-  AliasAnalysis::Location SrcLoc = AliasAnalysis::getLocationForSource(M);
-  MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true,
-                                                         M, M->getParent());
   if (SrcDepInfo.isClobber()) {
     if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
       return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue());
@@ -998,6 +1072,7 @@
 // function.
 //
 bool MemCpyOpt::runOnFunction(Function &F) {
+  DEBUG(dbgs() << "top of MemCpyOpt::runOnFunction\n");
   bool MadeChange = false;
   MD = &getAnalysis<MemoryDependenceAnalysis>();
   TD = getAnalysisIfAvailable<DataLayout>();
Index: lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- lib/Transforms/IPO/PassManagerBuilder.cpp
+++ lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -20,7 +20,9 @@
 #include "llvm/Analysis/Verifier.h"
 #include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Scalar.h"
@@ -114,6 +116,7 @@
 }
 
 void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
+  DEBUG(dbgs() << "top of PassManagerBuilder::populateModulePassManager\n");
   // If all optimizations are disabled, just run the always-inline pass.
   if (OptLevel == 0) {
     if (Inliner) {
@@ -201,6 +204,9 @@
   // Run instcombine after redundancy elimination to exploit opportunities
   // opened up by them.
   MPM.add(createInstructionCombiningPass());
+  // Run MemCpyOptimization after instcombine, as it calls SimplifyLIbCalls,
+  // which can create llvm.mem* intrinsics, which MemCpyOptimization matches on.
+  MPM.add(createMemCpyOptPass());
   MPM.add(createJumpThreadingPass());         // Thread jumps
   MPM.add(createCorrelatedValuePropagationPass());
   MPM.add(createDeadStoreEliminationPass());  // Delete dead stores
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D498.1.patch
Type: text/x-patch
Size: 8100 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20130306/e678de94/attachment.bin>