[llvm-commits] [llvm] r164119 - in /llvm/trunk: lib/Transforms/IPO/PassManagerBuilder.cpp lib/Transforms/Scalar/SROA.cpp test/Transforms/SROA/fca.ll

Chandler Carruth chandlerc at gmail.com
Tue Sep 18 05:57:43 PDT 2012


Author: chandlerc
Date: Tue Sep 18 07:57:43 2012
New Revision: 164119

URL: http://llvm.org/viewvc/llvm-project?rev=164119&view=rev
Log:
Add a major missing piece to the new SROA pass: aggressive splitting of
FCAs (first-class aggregates). This is essential to promote allocas that
are used in struct returns by frontends like Clang. The FCA load would
block the rest of the pass from firing, resulting in significant
regressions with the Bullet benchmark in the nightly test suite.

Thanks to Duncan for repeated discussions about how best to do this, and
to both him and Benjamin for review.

The missing splitting appears to have blocked many places where the pass
tries to fire, so I expect somewhat different results with this fix added.

As with the last big patch, I'm including a change to enable the new SROA
pass by default *temporarily*. Ben is going to remove this as soon as the
LNT bots pick up the patch; I'm just trying to get a round of LNT numbers
from the stable machines in the lab.

NOTE: Four clang tests are expected to fail in the brief window where
this is enabled. Sorry for the noise!
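
For context when reading the diff: the new AggLoadStoreRewriter visitor
pre-splits every simple aggregate load and store reaching the alloca into
per-field scalar operations before partitioning runs. As a rough sketch
(illustrative IR only; the value names follow the ".fca" suffix scheme the
rewriter uses), a struct load like the one in the new fca.ll test is
rewritten roughly as follows:

  ; Before: an FCA load that blocks the rest of SROA.
  %result = load { i32, i32 }* %a

  ; After pre-splitting (sketch): one scalar load per field, reassembled
  ; with insertvalue so the alloca's slots can be promoted as usual.
  %result.fca.0.gep = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 0
  %result.fca.0.load = load i32* %result.fca.0.gep
  %result.fca.0.insert = insertvalue { i32, i32 } undef, i32 %result.fca.0.load, 0
  %result.fca.1.gep = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 1
  %result.fca.1.load = load i32* %result.fca.1.gep
  %result.fca.1.insert = insertvalue { i32, i32 } %result.fca.0.insert, i32 %result.fca.1.load, 1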

Added:
    llvm/trunk/test/Transforms/SROA/fca.ll
Modified:
    llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp
    llvm/trunk/lib/Transforms/Scalar/SROA.cpp

Modified: llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp?rev=164119&r1=164118&r2=164119&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp (original)
+++ llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp Tue Sep 18 07:57:43 2012
@@ -41,7 +41,7 @@
   cl::desc("Run GVN instead of Early CSE after vectorization passes"));
 
 static cl::opt<bool> UseNewSROA("use-new-sroa",
-  cl::init(false), cl::Hidden,
+  cl::init(true), cl::Hidden,
   cl::desc("Enable the new, experimental SROA pass"));
 
 PassManagerBuilder::PassManagerBuilder() {

Modified: llvm/trunk/lib/Transforms/Scalar/SROA.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/SROA.cpp?rev=164119&r1=164118&r2=164119&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Scalar/SROA.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/SROA.cpp Tue Sep 18 07:57:43 2012
@@ -569,14 +569,19 @@
   }
 
   bool visitLoadInst(LoadInst &LI) {
+    assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
+           "All simple FCA loads should have been pre-split");
     return handleLoadOrStore(LI.getType(), LI, Offset);
   }
 
   bool visitStoreInst(StoreInst &SI) {
-    if (SI.getOperand(0) == *U)
+    Value *ValOp = SI.getValueOperand();
+    if (ValOp == *U)
       return markAsEscaping(SI);
 
-    return handleLoadOrStore(SI.getOperand(0)->getType(), SI, Offset);
+    assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
+           "All simple FCA stores should have been pre-split");
+    return handleLoadOrStore(ValOp->getType(), SI, Offset);
   }
 
 
@@ -1051,10 +1056,10 @@
 Type *AllocaPartitioning::getCommonType(iterator I) const {
   Type *Ty = 0;
   for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) {
-    if (isa<MemIntrinsic>(*UI->User))
+    if (isa<IntrinsicInst>(*UI->User))
       continue;
     if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset)
-      break;
+      return 0;
 
     Type *UserTy = 0;
     if (LoadInst *LI = dyn_cast<LoadInst>(&*UI->User)) {
@@ -2409,6 +2414,208 @@
 };
 }
 
+namespace {
+/// \brief Visitor to rewrite aggregate loads and stores as scalar.
+///
+/// This pass aggressively rewrites all aggregate loads and stores on
+/// a particular pointer (or any pointer derived from it which we can identify)
+/// with scalar loads and stores.
+class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
+  // Befriend the base class so it can delegate to private visit methods.
+  friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>;
+
+  const TargetData &TD;
+
+  /// Queue of pointer uses to analyze and potentially rewrite.
+  SmallVector<Use *, 8> Queue;
+
+  /// Set to prevent us from cycling with phi nodes and loops.
+  SmallPtrSet<User *, 8> Visited;
+
+  /// The current pointer use being rewritten. This is used to dig up the used
+  /// value (as opposed to the user).
+  Use *U;
+
+public:
+  AggLoadStoreRewriter(const TargetData &TD) : TD(TD) {}
+
+  /// Rewrite loads and stores through a pointer and all pointers derived from
+  /// it.
+  bool rewrite(Instruction &I) {
+    DEBUG(dbgs() << "  Rewriting FCA loads and stores...\n");
+    enqueueUsers(I);
+    bool Changed = false;
+    while (!Queue.empty()) {
+      U = Queue.pop_back_val();
+      Changed |= visit(cast<Instruction>(U->getUser()));
+    }
+    return Changed;
+  }
+
+private:
+  /// Enqueue all the users of the given instruction for further processing.
+  /// This uses a set to de-duplicate users.
+  void enqueueUsers(Instruction &I) {
+    for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;
+         ++UI)
+      if (Visited.insert(*UI))
+        Queue.push_back(&UI.getUse());
+  }
+
+  // Conservative default is to not rewrite anything.
+  bool visitInstruction(Instruction &I) { return false; }
+
+  /// \brief Generic recursive split emission routine.
+  ///
+  /// This method recursively splits an aggregate op (load or store) into
+  /// scalar or vector ops. It splits recursively until it hits a single value
+  /// and emits that single value operation via the template argument.
+  ///
+  /// The logic of this routine relies on GEPs and insertvalue and extractvalue
+  /// all operating with the same fundamental index list, merely formatted
+  /// differently (GEPs need actual values).
+  ///
+  /// \param IRB The builder used to form new instructions.
+  /// \param Ty  The type being split recursively into smaller ops.
+  /// \param Agg The aggregate value being built up or stored, depending on
+  /// whether this is splitting a load or a store respectively.
+  /// \param Ptr The base pointer of the original op, used as a base for GEPing
+  /// the split operations.
+  /// \param Indices The indices which to be used with insert- or extractvalue
+  /// to select the appropriate value within the aggregate for \p Ty.
+  /// \param GEPIndices The indices to a GEP instruction which will move \p Ptr
+  /// to the correct slot within the aggregate for \p Ty.
+  template <void (AggLoadStoreRewriter::*emitFunc)(
+                IRBuilder<> &IRB, Type *Ty, Value *&Agg, Value *Ptr,
+                ArrayRef<unsigned> Indices, ArrayRef<Value *> GEPIndices,
+                const Twine &Name)>
+  void emitSplitOps(IRBuilder<> &IRB, Type *Ty, Value *&Agg, Value *Ptr,
+                    SmallVectorImpl<unsigned> &Indices,
+                    SmallVectorImpl<Value *> &GEPIndices,
+                    const Twine &Name) {
+    if (Ty->isSingleValueType())
+      return (this->*emitFunc)(IRB, Ty, Agg, Ptr, Indices, GEPIndices, Name);
+
+    if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+      unsigned OldSize = Indices.size();
+      (void)OldSize;
+      for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size; ++Idx) {
+        assert(Indices.size() == OldSize && "Did not return to the old size");
+        Indices.push_back(Idx);
+        GEPIndices.push_back(IRB.getInt32(Idx));
+        emitSplitOps<emitFunc>(IRB, ATy->getElementType(), Agg, Ptr,
+                               Indices, GEPIndices, Name + "." + Twine(Idx));
+        GEPIndices.pop_back();
+        Indices.pop_back();
+      }
+      return;
+    }
+
+    if (StructType *STy = dyn_cast<StructType>(Ty)) {
+      unsigned OldSize = Indices.size();
+      (void)OldSize;
+      for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size; ++Idx) {
+        assert(Indices.size() == OldSize && "Did not return to the old size");
+        Indices.push_back(Idx);
+        GEPIndices.push_back(IRB.getInt32(Idx));
+        emitSplitOps<emitFunc>(IRB, STy->getElementType(Idx), Agg, Ptr,
+                               Indices, GEPIndices, Name + "." + Twine(Idx));
+        GEPIndices.pop_back();
+        Indices.pop_back();
+      }
+      return;
+    }
+
+    llvm_unreachable("Only arrays and structs are aggregate loadable types");
+  }
+
+  /// Emit a leaf load of a single value. This is called at the leaves of the
+  /// recursive emission to actually load values.
+  void emitLoad(IRBuilder<> &IRB, Type *Ty, Value *&Agg, Value *Ptr,
+                ArrayRef<unsigned> Indices, ArrayRef<Value *> GEPIndices,
+                const Twine &Name) {
+    assert(Ty->isSingleValueType());
+    // Load the single value and insert it using the indices.
+    Value *Load = IRB.CreateLoad(IRB.CreateInBoundsGEP(Ptr, GEPIndices,
+                                                       Name + ".gep"),
+                                 Name + ".load");
+    Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
+    DEBUG(dbgs() << "          to: " << *Load << "\n");
+  }
+
+  bool visitLoadInst(LoadInst &LI) {
+    assert(LI.getPointerOperand() == *U);
+    if (!LI.isSimple() || LI.getType()->isSingleValueType())
+      return false;
+
+    // We have an aggregate being loaded, split it apart.
+    DEBUG(dbgs() << "    original: " << LI << "\n");
+    IRBuilder<> IRB(&LI);
+    Value *V = UndefValue::get(LI.getType());
+    SmallVector<unsigned, 4> Indices;
+    SmallVector<Value *, 4> GEPIndices;
+    GEPIndices.push_back(IRB.getInt32(0));
+    emitSplitOps<&AggLoadStoreRewriter::emitLoad>(
+      IRB, LI.getType(), V, *U, Indices, GEPIndices, LI.getName() + ".fca");
+    LI.replaceAllUsesWith(V);
+    LI.eraseFromParent();
+    return true;
+  }
+
+  /// Emit a leaf store of a single value. This is called at the leaves of the
+  /// recursive emission to actually produce stores.
+  void emitStore(IRBuilder<> &IRB, Type *Ty, Value *&Agg, Value *Ptr,
+                 ArrayRef<unsigned> Indices, ArrayRef<Value *> GEPIndices,
+                 const Twine &Name) {
+    assert(Ty->isSingleValueType());
+    // Extract the single value and store it using the indices.
+    Value *Store = IRB.CreateStore(
+      IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
+      IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"));
+    DEBUG(dbgs() << "          to: " << *Store << "\n");
+  }
+
+  bool visitStoreInst(StoreInst &SI) {
+    if (!SI.isSimple() || SI.getPointerOperand() != *U)
+      return false;
+    Value *V = SI.getValueOperand();
+    if (V->getType()->isSingleValueType())
+      return false;
+
+    // We have an aggregate being stored, split it apart.
+    DEBUG(dbgs() << "    original: " << SI << "\n");
+    IRBuilder<> IRB(&SI);
+    SmallVector<unsigned, 4> Indices;
+    SmallVector<Value *, 4> GEPIndices;
+    GEPIndices.push_back(IRB.getInt32(0));
+    emitSplitOps<&AggLoadStoreRewriter::emitStore>(
+      IRB, V->getType(), V, *U, Indices, GEPIndices, V->getName() + ".fca");
+    SI.eraseFromParent();
+    return true;
+  }
+
+  bool visitBitCastInst(BitCastInst &BC) {
+    enqueueUsers(BC);
+    return false;
+  }
+
+  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+    enqueueUsers(GEPI);
+    return false;
+  }
+
+  bool visitPHINode(PHINode &PN) {
+    enqueueUsers(PN);
+    return false;
+  }
+
+  bool visitSelectInst(SelectInst &SI) {
+    enqueueUsers(SI);
+    return false;
+  }
+};
+}
+
 /// \brief Try to find a partition of the aggregate type passed in for a given
 /// offset and size.
 ///
@@ -2637,18 +2844,24 @@
     return false;
   }
 
+  bool Changed = false;
+
+  // First, split any FCA loads and stores touching this alloca to promote
+  // better splitting and promotion opportunities.
+  AggLoadStoreRewriter AggRewriter(*TD);
+  Changed |= AggRewriter.rewrite(AI);
+
   // Build the partition set using a recursive instruction-visiting builder.
   AllocaPartitioning P(*TD, AI);
   DEBUG(P.print(dbgs()));
   if (P.isEscaped())
-    return false;
+    return Changed;
 
   // No partitions to split. Leave the dead alloca for a later pass to clean up.
   if (P.begin() == P.end())
-    return false;
+    return Changed;
 
   // Delete all the dead users of this alloca before splitting and rewriting it.
-  bool Changed = false;
   for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(),
                                               DE = P.dead_user_end();
        DI != DE; ++DI) {

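The store side is symmetric: emitStore in the hunk above pulls each field
out of the stored aggregate with extractvalue and writes it through the
matching GEP. A hedged sketch of the expected rewrite (illustrative IR and
names only):

  ; Before: an FCA store of a value %v.
  store { i32, i32 } %v, { i32, i32 }* %b

  ; After pre-splitting (sketch): one extractvalue + scalar store per field.
  %v.fca.0.extract = extractvalue { i32, i32 } %v, 0
  %v.fca.0.gep = getelementptr inbounds { i32, i32 }* %b, i32 0, i32 0
  store i32 %v.fca.0.extract, i32* %v.fca.0.gep
  %v.fca.1.extract = extractvalue { i32, i32 } %v, 1
  %v.fca.1.gep = getelementptr inbounds { i32, i32 }* %b, i32 0, i32 1
  store i32 %v.fca.1.extract, i32* %v.fca.1.gep
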
Added: llvm/trunk/test/Transforms/SROA/fca.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SROA/fca.ll?rev=164119&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SROA/fca.ll (added)
+++ llvm/trunk/test/Transforms/SROA/fca.ll Tue Sep 18 07:57:43 2012
@@ -0,0 +1,49 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+; RUN: opt < %s -sroa -force-ssa-updater -S | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+define { i32, i32 } @test0(i32 %x, i32 %y) {
+; CHECK: @test0
+; CHECK-NOT: alloca
+; CHECK: insertvalue { i32, i32 }
+; CHECK: insertvalue { i32, i32 }
+; CHECK: ret { i32, i32 }
+
+entry:
+  %a = alloca { i32, i32 }
+
+  store { i32, i32 } undef, { i32, i32 }* %a
+
+  %gep1 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 0
+  store i32 %x, i32* %gep1
+  %gep2 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 1
+  store i32 %y, i32* %gep2
+
+  %result = load { i32, i32 }* %a
+  ret { i32, i32 } %result
+}
+
+define { i32, i32 } @test1(i32 %x, i32 %y) {
+; FIXME: This may be too conservative. Duncan argues that we are allowed to
+; split the volatile load and store here but must produce volatile scalar loads
+; and stores from them.
+; CHECK: @test1
+; CHECK: alloca
+; CHECK: alloca
+; CHECK: load volatile { i32, i32 }*
+; CHECK: store volatile { i32, i32 }
+; CHECK: ret { i32, i32 }
+
+entry:
+  %a = alloca { i32, i32 }
+  %b = alloca { i32, i32 }
+
+  %gep1 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 0
+  store i32 %x, i32* %gep1
+  %gep2 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 1
+  store i32 %y, i32* %gep2
+
+  %result = load volatile { i32, i32 }* %a
+  store volatile { i32, i32 } %result, { i32, i32 }* %b
+  ret { i32, i32 } %result
+}




