[llvm-commits] PATCH: A new SROA implementation

Duncan Sands baldrick at free.fr
Wed Aug 22 15:04:42 PDT 2012


Hi Chandler,

> Index: test/Transforms/SROA/basictest.ll
> ===================================================================
> --- test/Transforms/SROA/basictest.ll	(revision 0)
> +++ test/Transforms/SROA/basictest.ll	(working copy)

...

> +define %S2 @test9(%S2 %s2) {
> +entry:
> +  %new = alloca %S2
> +  %s2.next.ptr = extractvalue %S2 %s2, 1
> +  %s2.next = load %S2* %s2.next.ptr
> +  %s2.next.s1 = extractvalue %S2 %s2.next, 0
> +  %s2.next.next = extractvalue %S2 %s2.next, 1
> +
> +  %new.s1.ptr = getelementptr %S2* %new, i64 0, i32 0
> +  store %S1* %s2.next.s1, %S1** %new.s1.ptr
> +  %new.next.ptr = getelementptr %S2* %new, i64 0, i32 1
> +  store %S2* %s2.next.next, %S2** %new.next.ptr
> +
> +  %result = load %S2* %new
> +  ret %S2 %result
> +}

You forgot to actually check something in this test.

> --- lib/Transforms/Scalar/SROA.cpp	(revision 0)
> +++ lib/Transforms/Scalar/SROA.cpp	(working copy)
> @@ -0,0 +1,2245 @@
> +//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
> +//
> +//                     The LLVM Compiler Infrastructure
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
> +//===----------------------------------------------------------------------===//
> +/// \file

\file ?

> +/// This transformation implements the well known scalar replacement of
> +/// aggregates transformation. It tries to identify promotable elements of an
> +/// aggregate alloca, and promote them to registers. It will also try to
> +/// convert uses of an element (or set of elements) of an alloca into a vector
> +/// or bitfield-style integer scalar if appropriate.
> +///
> +/// It works to do this with minimal slicing of the alloca so that regions
> +/// which are merely transfered in and out of external memory remain unchanged
> +/// and are not decomposed to scalar code.
> +///
> +/// Because this also performs alloca promotion, it can be thought of as a also

as a also -> as also

> +/// serving the purpose of SSA formation. The algorithm iterates on the
> +/// function until all opportunities for promotion have been realized.
> +///
> +//===----------------------------------------------------------------------===//
> +
> +#define DEBUG_TYPE "sroa"
> +#include "llvm/Transforms/Scalar.h"
> +#include "llvm/Constants.h"
> +#include "llvm/DIBuilder.h"
> +#include "llvm/DebugInfo.h"
> +#include "llvm/DerivedTypes.h"
> +#include "llvm/Function.h"
> +#include "llvm/GlobalVariable.h"
> +#include "llvm/IRBuilder.h"
> +#include "llvm/Instructions.h"
> +#include "llvm/IntrinsicInst.h"
> +#include "llvm/LLVMContext.h"
> +#include "llvm/Module.h"
> +#include "llvm/Operator.h"
> +#include "llvm/Pass.h"
> +#include "llvm/ADT/SetVector.h"
> +#include "llvm/ADT/SmallVector.h"
> +#include "llvm/ADT/Statistic.h"
> +#include "llvm/ADT/STLExtras.h"
> +#include "llvm/ADT/TinyPtrVector.h"
> +#include "llvm/Analysis/Dominators.h"
> +#include "llvm/Analysis/Loads.h"
> +#include "llvm/Analysis/ValueTracking.h"
> +#include "llvm/Support/CallSite.h"
> +#include "llvm/Support/Debug.h"
> +#include "llvm/Support/ErrorHandling.h"
> +#include "llvm/Support/GetElementPtrTypeIterator.h"
> +#include "llvm/Support/InstVisitor.h"
> +#include "llvm/Support/MathExtras.h"
> +#include "llvm/Support/ValueHandle.h"
> +#include "llvm/Support/raw_ostream.h"
> +#include "llvm/Target/TargetData.h"
> +#include "llvm/Transforms/Utils/Local.h"
> +#include "llvm/Transforms/Utils/PromoteMemToReg.h"
> +#include "llvm/Transforms/Utils/SSAUpdater.h"
> +using namespace llvm;
> +
> +STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
> +STATISTIC(NumNewAllocas,      "Number of new, smaller allocas introduced");
> +STATISTIC(NumPromoted,        "Number of allocas promoted to SSA values");
> +STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
> +STATISTIC(NumDeleted,         "Number of instructions deleted");
> +STATISTIC(NumVectorized,      "Number of vectorized aggregates");
> +
> +namespace {
> +/// \brief Alloca partitioning representation.
> +///
> +/// This class represents a partitioning of an alloca into slices,

It looks like it isn't a partition since the partition elements can overlap.
Or am I just confused by comments like this:

   /// We store a vector of the partitions over the alloca here. This vector is
   /// sorted by increasing begin offset, and then by decreasing end offset. See
   /// the Partition inner class for more details.

which suggests that partition elements are not disjoint (unless this is to allow
for zero-length partition elements).  Perhaps it would be good to document at which
stages of the logic the partition elements can overlap, and from which point on
they have been made disjoint.

  and
> +/// information about the nature of uses of each slice of the alloca. The goal
> +/// is that this information is sufficient to decide if and how to split the
> +/// alloca apart and replace slices with scalars. It is also intended that this
> +/// structure can capture the relevant information needed both due decide about

both due -> both to

> +/// and to enact these transformations.
> +class AllocaPartitioning {
> +public:
> +  struct Partition {

Is this an element of a partition, or the partition itself?  Maybe it should be
called Slice or Segment or Interval rather than Partition.

> +    uint64_t BeginOffset, EndOffset;
> +
> +    bool IsSplittable;

It would be nice to document the fields and methods of struct Partition, in
particular this one.  Also, whether offsets are in bits or bytes etc.

> +
> +    bool operator<(const Partition &RHS) const {
> +      if (BeginOffset < RHS.BeginOffset) return true;
> +      if (BeginOffset > RHS.BeginOffset) return false;
> +      if (EndOffset > RHS.EndOffset) return true;
> +      return false;
> +    }

This tests whether this partition is a superset of RHS, so I would have expected ">"
rather than "<".
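To check I'm reading it right, with the operator as written:

    Partition Outer = { 0, 16, false };  // covers [0, 16)
    Partition Inner = { 0,  8, false };  // covers [0, 8)
    // Outer < Inner is true: when the begin offsets are equal, the larger
    // (containing) range sorts first.

i.e. a range compares "less than" the ranges it contains whenever they share
a begin offset.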

> +    bool operator<(uint64_t RHSOffset) const {
> +      return BeginOffset < RHSOffset;
> +    }
> +    bool operator==(const Partition &RHS) const {
> +      return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset;
> +    }

Equality ignores IsSplittable; is this a good idea?  Maybe it is a sign that
IsSplittable shouldn't live in this struct.

> +    bool operator!=(const Partition &RHS) const { return !operator==(RHS); }
> +  };
> +
> +  struct PartitionUse {
> +    uint64_t BeginOffset, EndOffset;
> +    AssertingVH<Instruction> User;
> +    AssertingVH<Instruction> Ptr;
> +
> +    bool operator<(const PartitionUse &RHS) const {
> +      if (BeginOffset < RHS.BeginOffset) return true;
> +      if (BeginOffset > RHS.BeginOffset) return false;
> +      if (EndOffset > RHS.EndOffset) return true;
> +      return false;
> +    }
> +    bool operator<(uint64_t RHSOffset) const {
> +      return BeginOffset < RHSOffset;
> +    }
> +    bool operator==(const PartitionUse &RHS) const {
> +      return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset;
> +    }
> +    bool operator!=(const PartitionUse &RHS) const { return !operator==(RHS); }
> +  };

This is exactly the same as struct Partition, only the extra data (User/Ptr) is
different.  So maybe they should both be represented as a common class plus some
extra data.  Some documentation of the data fields would be great.
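Something like this, say (just a sketch; note that the aggregate initializers
used elsewhere, e.g. "Partition New = { BeginOffset, EndOffset, IsSplittable };",
would need adjusting once a base class is involved):

    /// A byte range [BeginOffset, EndOffset) over the alloca.
    struct ByteRange {
      uint64_t BeginOffset, EndOffset;

      bool operator<(const ByteRange &RHS) const {
        if (BeginOffset < RHS.BeginOffset) return true;
        if (BeginOffset > RHS.BeginOffset) return false;
        if (EndOffset > RHS.EndOffset) return true;
        return false;
      }
      bool operator<(uint64_t RHSOffset) const {
        return BeginOffset < RHSOffset;
      }
      bool operator==(const ByteRange &RHS) const {
        return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset;
      }
      bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); }
    };

    /// A candidate partition of the alloca plus its splittability flag.
    struct Partition : public ByteRange {
      bool IsSplittable;
    };

    /// A use of a partition: the byte range it touches plus the instruction
    /// and pointer involved.
    struct PartitionUse : public ByteRange {
      AssertingVH<Instruction> User;
      AssertingVH<Instruction> Ptr;
    };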

> +
> +  AllocaPartitioning(const TargetData &TD, AllocaInst &AI);
> +
> +  bool isEscaped() const { return PointerEscapingInstr; }
> +
> +  typedef SmallVectorImpl<Partition>::iterator iterator;
> +  iterator begin() { return Partitions.begin(); }
> +  iterator end() { return Partitions.end(); }
> +
> +  typedef SmallVectorImpl<Partition>::const_iterator const_iterator;
> +  const_iterator begin() const { return Partitions.begin(); }
> +  const_iterator end() const { return Partitions.end(); }
> +
> +  typedef SmallVectorImpl<PartitionUse>::iterator use_iterator;
> +  use_iterator use_begin(unsigned Idx) { return Uses[Idx].begin(); }
> +  use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); }
> +  use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); }
> +  use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); }
> +  void use_insert(unsigned Idx, use_iterator UI, const PartitionUse &U) {
> +    Uses[Idx].insert(UI, U);
> +  }
> +  void use_insert(const_iterator I, use_iterator UI, const PartitionUse &U) {
> +    Uses[I - begin()].insert(UI, U);
> +  }
> +  void use_erase(unsigned Idx, use_iterator UI) { Uses[Idx].erase(UI); }
> +  void use_erase(const_iterator I, use_iterator UI) {
> +    Uses[I - begin()].erase(UI);
> +  }
> +
> +  typedef SmallVectorImpl<PartitionUse>::const_iterator const_use_iterator;
> +  const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); }
> +  const_use_iterator use_begin(const_iterator I) const {
> +    return Uses[I - begin()].begin();
> +  }
> +  const_use_iterator use_end(unsigned Idx) const { return Uses[Idx].end(); }
> +  const_use_iterator use_end(const_iterator I) const {
> +    return Uses[I - begin()].end();
> +  }
> +
> +  /// \brief MemTransferInst auxilliary data.
> +  /// This struct provides some auxilliary data about memory transfer
> +  /// intrinsics such as memcpy and memmove. These intrinsics can use two
> +  /// different ranges within the same alloca, and provide other challenges to
> +  /// correctly representing. We stash extra data to help us untangle this

correctly representing -> correctly represent.

> +  /// after the partitioning is complete.
> +  struct MemTransferOffsets {
> +    uint64_t DestBegin, DestEnd;
> +    uint64_t SourceBegin, SourceEnd;
> +    bool IsSplittable;
> +  };
> +  MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const {
> +    return MemTransferInstData.lookup(&II);
> +  }
> +
> +  bool hasMultiplePHIOrSelectUses(Instruction &I) const {
> +    assert(isa<PHINode>(I) || isa<SelectInst>(I));
> +    return PHIOrSelectSizes.lookup(&I).second;
> +  }
> +
> +  iterator findPartitionForPHIOrSelectOperand(Instruction &I, Value *Op) {
> +    SmallDenseMap<std::pair<Instruction *, Value *>,
> +                  std::pair<unsigned, unsigned> >::const_iterator MapIt
> +      = PHIOrSelectOpMap.find(std::make_pair(&I, Op));
> +    if (MapIt == PHIOrSelectOpMap.end())
> +      return end();
> +
> +    return begin() + MapIt->second.first;
> +  }
> +  use_iterator findPartitionUseForPHIOrSelectOperand(Instruction &I,
> +                                                     Value *Op) {
> +    SmallDenseMap<std::pair<Instruction *, Value *>,
> +                  std::pair<unsigned, unsigned> >::const_iterator MapIt
> +      = PHIOrSelectOpMap.find(std::make_pair(&I, Op));
> +    assert(MapIt != PHIOrSelectOpMap.end());
> +    return Uses[MapIt->second.first].begin() + MapIt->second.second;
> +  }
> +
> +  Type *getCommonType(iterator I) const;

It would be nice to have some documentation for this method (some of the others
above aren't too self-explanatory either...).

> +
> +  void print(raw_ostream &OS, const_iterator I, StringRef Indent = "  ") const;
> +  void printUsers(raw_ostream &OS, const_iterator I,
> +                  StringRef Indent = "  ") const;
> +  void print(raw_ostream &OS) const;
> +  void dump(const_iterator I) const LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED;
> +  void dump() const LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED;
> +
> +private:
> +  template <typename DerivedT, typename RetT = void> class BuilderBase;
> +  class PartitionBuilder;
> +  friend class AllocaPartitioning::PartitionBuilder;
> +  class UseBuilder;
> +  friend class AllocaPartitioning::UseBuilder;
> +
> +  /// \brief Handle to alloca instruction to simplify method interfaces.
> +  AllocaInst &AI;
> +
> +  /// \brief The instruction responsible for this alloca having no partitioning.
> +  /// When an instruction (potentially) escapes the pointer to the alloca, we
> +  /// store a pointer to that here and abort trying to partition the alloca.
> +  /// This will be null if the alloca is partitioned successfully.
> +  Instruction *PointerEscapingInstr;
> +
> +  /// \brief The partitions of the alloca.
> +  ///
> +  /// We store a vector of the partitions over the alloca here. This vector is
> +  /// sorted by increasing begin offset, and then by decreasing end offset. See
> +  /// the Partition inner class for more details.
> +  SmallVector<Partition, 8> Partitions;
> +
> +  /// \brief The uses of the partitions.
> +  SmallVector<SmallVector<PartitionUse, 2>, 8> Uses;
> +
> +  SmallDenseMap<MemTransferInst *, MemTransferOffsets, 4> MemTransferInstData;
> +
> +  SmallDenseMap<Instruction *, std::pair<uint64_t, bool> > PHIOrSelectSizes;
> +  SmallDenseMap<std::pair<Instruction *, Value *>,
> +                std::pair<unsigned, unsigned>, 4> PHIOrSelectOpMap;
> +
> +  void splitAndMergePartitions();
> +};
> +}
> +
> +template <typename DerivedT, typename RetT>
> +class AllocaPartitioning::BuilderBase
> +    : public InstVisitor<DerivedT, RetT> {
> +public:
> +  BuilderBase(const TargetData &TD, AllocaInst &AI, AllocaPartitioning &P)
> +      : TD(TD),
> +        AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())),

Presumably you only create one of these for allocas with a sized type?  (Are
allocas with an unsized type even valid?)  OK, I just saw below that you bail
out early for allocas with an unsized type.

> +        P(P) {
> +    enqueueUsers(AI, 0);
> +  }
> +
> +protected:
> +  const TargetData &TD;
> +  const uint64_t AllocSize;
> +  AllocaPartitioning &P;
> +
> +  struct OffsetUse {
> +    Instruction *Use, *Ptr;
> +    uint64_t Offset;
> +  };
> +  SmallVector<OffsetUse, 8> Queue;
> +
> +  // The active pointer value and offset while visiting.
> +  Instruction *Ptr;
> +  uint64_t Offset;
> +
> +  void enqueueUsers(Instruction &I, uint64_t UserOffset) {
> +    SmallPtrSet<User *, 8> UserSet;
> +    for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
> +         UI != UE; ++UI) {
> +      if (!UserSet.insert(*UI))
> +        continue;
> +
> +      OffsetUse OU = { cast<Instruction>(*UI), &I, UserOffset };
> +      Queue.push_back(OU);
> +    }
> +  }
> +
> +  bool computeConstantGEPOffset(GetElementPtrInst &GEPI, uint64_t &GEPOffset) {
> +    GEPOffset = Offset;
> +    for (gep_type_iterator GTI = gep_type_begin(GEPI), GTE = gep_type_end(GEPI);
> +         GTI != GTE; ++GTI) {
> +      ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
> +      if (!OpC)
> +        return false;
> +      if (OpC->isZero())
> +        continue;
> +
> +      // Handle a struct index, which adds its field offset to the pointer.
> +      if (StructType *STy = dyn_cast<StructType>(*GTI)) {
> +        unsigned ElementIdx = OpC->getZExtValue();
> +        const StructLayout *SL = TD.getStructLayout(STy);
> +        GEPOffset += SL->getElementOffset(ElementIdx);
> +        continue;
> +      }
> +
> +      GEPOffset
> +        += OpC->getZExtValue() * TD.getTypeAllocSize(GTI.getIndexedType());
> +    }
> +    return true;
> +  }
> +
> +  Value *foldSelectInst(SelectInst &SI) {
> +    // If the condition being selected on is a constant or the same value is
> +    // being selected between, fold the select. Yes this does (rarely) happen
> +    // early on.

Indeed!  Scheduling instsimplify before SROA would zap this kind of silliness.

> +    if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
> +      return SI.getOperand(1+CI->isZero());
> +    if (SI.getOperand(1) == SI.getOperand(2)) {
> +      assert(Ptr == SI.getOperand(1));
> +      return SI.getOperand(1);
> +    }
> +    return 0;
> +  }
> +};
> +
> +/// \brief Builder for the alloca partitioning.
> +///
> +/// This class builds an alloca partitioning by recursively visiting the uses
> +/// of an alloca and splitting the partitions for each load and store at each
> +/// offset.
> +class AllocaPartitioning::PartitionBuilder
> +    : public BuilderBase<PartitionBuilder, bool> {
> +  friend class InstVisitor<PartitionBuilder, bool>;
> +
> +  SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap;
> +
> +public:
> +  PartitionBuilder(const TargetData &TD, AllocaInst &AI, AllocaPartitioning &P)
> +      : BuilderBase(TD, AI, P) {}
> +
> +  /// \brief Run the builder over the allocation.
> +  bool operator()() {
> +    // Note that we have to re-evaluate size on each trip through the loop as
> +    // the queue grows at the tail.
> +    for (unsigned Idx = 0; Idx < Queue.size(); ++Idx) {
> +      Ptr = Queue[Idx].Ptr;
> +      Offset = Queue[Idx].Offset;
> +      if (!visit(Queue[Idx].Use))
> +        return false;
> +    }
> +    return true;
> +  }
> +
> +private:
> +  bool markAsEscaping(Instruction &I) {
> +    P.PointerEscapingInstr = &I;
> +    return false;
> +  }
> +
> +  void insertUse(uint64_t Size, bool IsSplittable = false) {
> +    uint64_t BeginOffset = Offset, EndOffset = Offset + Size;
> +    // See if we can just add a user onto the last slot currently occupied.
> +    if (!P.Partitions.empty() &&
> +        P.Partitions.back().BeginOffset == BeginOffset &&
> +        P.Partitions.back().EndOffset == EndOffset) {
> +      P.Partitions.back().IsSplittable &= IsSplittable;
> +      return;
> +    }
> +
> +    Partition New = { BeginOffset, EndOffset, IsSplittable };
> +    P.Partitions.push_back(New);
> +  }
> +
> +  bool handleLoadOrStore(Type *Ty, Instruction &I) {
> +    uint64_t Size = TD.getTypeAllocSize(Ty);
> +
> +    // If this memory access can be shown to *statically* extend outside the
> +    // bounds of of the allocation, it's behavior is undefined, so simply
> +    // ignore the load.

This is pretty harsh.  For example, if I have a 4-byte-aligned i8 alloca and I
load an i32 out of it, you are just going to discard the entire load?  I mean,
I can't really complain too much if you do since technically you are right,
but maybe it is better to be nice to users if this harshness doesn't gain you
anything much in practice.

  We even play some games to handle cases which might run
> +    // the risk of overflow.
> +    // FIXME: We should instead consider tha pointer to have escaped if this
> +    // function is being instrumented for addressing bugs or race conditions.
> +    if (Offset > AllocSize || Size > AllocSize || Offset + Size > AllocSize)

The last check covers the other two (unless you are worried that the addition
may overflow)?
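For the record, here is the wrap-around case the separate checks presumably
guard against, with made-up numbers (say AllocSize is 16):

    uint64_t Offset = UINT64_MAX - 7, Size = 16;
    // Offset + Size wraps around to 8, so "Offset + Size > AllocSize" is
    // false on its own; only the separate "Offset > AllocSize" test rejects
    // this access.

So they aren't entirely redundant, just mostly.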

> +      return true;
> +
> +    insertUse(Size);

(IsSplittable -> false)  I guess an FCA load/store is partly splittable, but is
not splittable in the sense you want...  You didn't document what splittable
means :)

> +    return true;
> +  }
> +
> +  bool visitBitCastInst(BitCastInst &BC) {
> +    enqueueUsers(BC, Offset);
> +    return true;
> +  }
> +
> +  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
> +    //unsigned IntPtrWidth = TD->getPointerSizeInBits();
> +    //assert(IntPtrWidth == Offset.getBitWidth());
> +    uint64_t GEPOffset;
> +    if (!computeConstantGEPOffset(GEPI, GEPOffset))
> +      return markAsEscaping(GEPI);
> +
> +    enqueueUsers(GEPI, GEPOffset);
> +    return true;
> +  }
> +
> +  bool visitLoadInst(LoadInst &LI) {
> +    return handleLoadOrStore(LI.getType(), LI);
> +  }
> +
> +  bool visitStoreInst(StoreInst &SI) {
> +    if (SI.getOperand(0) == Ptr)
> +      return markAsEscaping(SI);
> +
> +    return handleLoadOrStore(SI.getOperand(0)->getType(), SI);
> +  }
> +
> +
> +  bool visitMemSetInst(MemSetInst &II) {
> +    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
> +    insertUse(Length ? Length->getZExtValue() : AllocSize - Offset, true);
> +    return true;
> +  }
> +
> +  bool visitMemTransferInst(MemTransferInst &II) {
> +    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
> +    uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
> +    if (!Size)
> +      // Zero-length mem transfer intrinsics can be ignored entirely.
> +      return true;
> +
> +    insertUse(Size, true);
> +    unsigned NewIdx = P.Partitions.size() - 1;
> +
> +    MemTransferOffsets &Offsets = P.MemTransferInstData[&II];
> +    Offsets.IsSplittable = true;
> +    if (Ptr != II.getRawDest()) {
> +      assert(Ptr == II.getRawSource());
> +      Offsets.SourceBegin = Offset;
> +      Offsets.SourceEnd = Offset + Size;
> +    } else {
> +      Offsets.DestBegin = Offset;
> +      Offsets.DestEnd = Offset + Size;
> +    }
> +
> +    SmallDenseMap<Instruction *, unsigned>::const_iterator PMI;
> +    bool Inserted = false;
> +    llvm::tie(PMI, Inserted)
> +      = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx));
> +    if (!Inserted) {
> +      // We've found a memory transfer intrinsic which refers to the alloca as
> +      // both a source and dest. We refuse to split these to simplify splitting
> +      // logic. If possible, SROA will still split them into separate allocas
> +      // and then re-analyze.
> +      Offsets.IsSplittable = false;
> +      P.Partitions[PMI->second].IsSplittable = false;
> +      P.Partitions[NewIdx].IsSplittable = false;
> +    }
> +
> +    return true;
> +  }
> +
> +  // Disable SRoA for any intrinsics except for lifetime invariants.
> +  bool visitIntrinsicInst(IntrinsicInst &II) {
> +    if (II.getIntrinsicID() == Intrinsic::lifetime_start ||
> +        II.getIntrinsicID() == Intrinsic::lifetime_end) {
> +      ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
> +      uint64_t Size = std::min(AllocSize - Offset, Length->getLimitedValue());
> +      insertUse(Size, true);
> +      return true;
> +    }
> +
> +    return markAsEscaping(II);
> +  }
> +
> +  Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
> +    // We consider any PHI or select that results in a direct load or store of
> +    // the same offset to be a viable use for partitioning purposes. These uses
> +    // are considered unsplittable and the size is the maximum loaded or stored
> +    // size.

Are these restrictions because otherwise splitting is too hard?

> +    SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
> +    Uses.push_back(std::make_pair(Ptr, Root));
> +    do {
> +      Instruction *I, *UsedI;
> +      llvm::tie(UsedI, I) = Uses.pop_back_val();
> +
> +      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
> +        Size = std::max(Size, TD.getTypeAllocSize(LI->getType()));
> +        continue;
> +      }
> +      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
> +        Value *Op = SI->getOperand(0);
> +        if (Op == UsedI)
> +          return SI;
> +        Size = std::max(Size, TD.getTypeAllocSize(Op->getType()));
> +        continue;
> +      }
> +
> +      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
> +        if (!GEP->hasAllZeroIndices())
> +          return GEP;
> +      } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
> +                 !isa<SelectInst>(I)) {
> +        return I;
> +      }
> +
> +      for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE;
> +           ++UI)
> +        Uses.push_back(std::make_pair(I, cast<Instruction>(*UI)));
> +    } while (!Uses.empty());
> +
> +    return 0;
> +  }
> +
> +  bool visitPHINode(PHINode &PN) {
> +    // See if we already have computed info on this node.
> +    std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN];
> +    if (PHIInfo.first) {
> +      PHIInfo.second = true;
> +      insertUse(PHIInfo.first);
> +      return true;
> +    }
> +
> +    // Check for an unsafe use of the PHI node.
> +    if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first))
> +      return markAsEscaping(*EscapingI);
> +
> +    insertUse(PHIInfo.first);
> +    return true;
> +  }
> +
> +  bool visitSelectInst(SelectInst &SI) {
> +    if (Value *Result = foldSelectInst(SI)) {
> +      if (Result == Ptr)
> +        // If the result of the constant fold will be the pointer, recurse
> +        // through the select as if we had RAUW'ed it.
> +        enqueueUsers(SI, Offset);
> +
> +      return true;
> +    }
> +
> +    // See if we already have computed info on this node.
> +    std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI];
> +    if (SelectInfo.first) {
> +      SelectInfo.second = true;
> +      insertUse(SelectInfo.first);
> +      return true;
> +    }
> +
> +    // Check for an unsafe use of the PHI node.
> +    if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first))
> +      return markAsEscaping(*EscapingI);
> +
> +    insertUse(SelectInfo.first);
> +    return true;
> +  }
> +
> +  /// \brief Disable SROA entirely if there are unhandled users of the alloca.
> +  bool visitInstruction(Instruction &I) { return markAsEscaping(I); }
> +};
> +
> +
> +/// \brief Use adder for the alloca partitioning.
> +///
> +/// This class adds the uses of an alloca to all of the partitions which it
> +/// uses. For splittable partitions, this can end up doing essentially a linear
> +/// walk of the partitions, but the number of steps remains bounded by the
> +/// total result instruction size:
> +/// - The number of partitions is a result of the number unsplittable
> +///   instructions using the alloca.
> +/// - The number of users of each partition is at worst the total number of
> +///   splittable instructions using the alloca.

Isn't it the number of splittable instructions plus one (the unsplittable
instruction that caused the partition to exist in the first place)?

> +/// Thus we will produce N * M instructions in the end, where N are the number
> +/// of unsplittable uses and M are the number of splittable. This visitor does
> +/// the exact same number of updates to the partitioning.
> +///
> +/// In the more common case, this visitor will leverage the fact that the
> +/// partition space is pre-sorted, and do a logarithmic search for the
> +/// partition needed, making the total visit a classical ((N + M) * log(N))
> +/// complexity operation.


... chopped bits I didn't get round to commenting on yet ...


> +/// \brief Try to find a partition of the aggregate type pasesd in for a given

pasesd -> passed

> +/// offset and size.
> +///
> +/// This recurses through the aggregate type and tries to compute a subtype
> +/// based on the offset and size. When the offset and size span a sub-section
> +/// of an array, it will even compute a new array type for that sub-section.
> +static Type *getTypePartition(const TargetData &TD, Type *Ty,
> +                              uint64_t Offset, uint64_t Size) {
> +  if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size)
> +    return Ty;
> +
> +  if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) {
> +    Type *ElementTy = SeqTy->getElementType();
> +    uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
> +    uint64_t NumSkippedElements = Offset / ElementSize;
> +    if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy))
> +      if (NumSkippedElements > ArrTy->getNumElements())
> +        return 0;

Shouldn't this be NumSkippedElements >= ArrTy->getNumElements()?  With Offset
equal to the size of the array, NumSkippedElements equals getNumElements(), and
the current ">" test lets that through even though the offset is past the last
element.

> +    if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy))
> +      if (NumSkippedElements > VecTy->getNumElements())
> +        return 0;

Likewise.

If this sequential type is a pointer type, the following logic is going to come
to a strange conclusion.  For example, suppose you have [1024 x i8]* as the type.
That's only 32/64 bits wide no matter how big the element type is...  Yet here
you are subtracting off multiples of the element type size.  Pointer types should
be handled like integers and other scalar types as far as I can see.

> +    Offset -= NumSkippedElements * ElementSize;
> +
> +    // First check if we need to recurse.
> +    if (Offset > 0 || Size < ElementSize) {
> +      // Bail if the partition ends in a different array element.
> +      if ((Offset + Size) > ElementSize)
> +        return 0;
> +      // Bail if this is a poniter element, we can't recurse through them.
> +      if (ElementTy->isPointerTy())
> +        return 0;

This seems to be confusing SeqTy being a pointer type with ElementTy being a
pointer type.  I don't see the need for this pointer type check.

> +      // Recurse through the element type trying to peel off offset bytes.
> +      return getTypePartition(TD, ElementTy, Offset, Size);
> +    }
> +    assert(Offset == 0);

Local analysis shows that this assertion can't fail.  I guess it can be
considered documentation.

> +
> +    if (Size == ElementSize)
> +      return ElementTy;
> +    assert(Size > ElementSize);

Likewise for this assert.

> +    uint64_t NumElements = Size / ElementSize;
> +    if (NumElements * ElementSize != Size)
> +      return 0;
> +    return ArrayType::get(ElementTy, NumElements);
> +  }
> +
> +  StructType *STy = dyn_cast<StructType>(Ty);
> +  if (!STy)
> +    return 0;
> +
> +  for (StructType::element_iterator EI = STy->element_begin(),
> +                                    EE = STy->element_end();
> +       EI != EE; ++EI) {

There's no need to loop.  Using StructLayout you can leap straight to the right
field, both for the start and the end.

> +    Type *ElementTy = *EI;
> +    uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
> +    if (Offset >= ElementSize) {
> +      Offset -= ElementSize;
> +      continue;
> +    }

The above logic is forgetting about alignment padding shifting the start of the
next element.  Also, the struct type might be packed.  Just use StructLayout.
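Something along these lines, perhaps (a sketch only, not compiled):

    const StructLayout *SL = TD.getStructLayout(STy);
    if (Offset >= SL->getSizeInBytes())
      return 0;
    unsigned Index = SL->getElementContainingOffset(Offset);
    uint64_t ElementOffset = Offset - SL->getElementOffset(Index);
    Type *ElementTy = STy->getElementType(Index);

which handles padding and packed structs for free.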

> +    assert(Offset < ElementSize);
> +
> +    if (Offset > 0 || Size < ElementSize) {
> +      if ((Offset + Size) > ElementSize)
> +        return 0;
> +      // Bail if this is a poniter element, we can't recurse through them.
> +      if (ElementTy->isPointerTy())
> +        return 0;

This pointer type check also doesn't make any sense to me.

> +      return getTypePartition(TD, ElementTy, Offset, Size);
> +    }
> +    assert(Offset == 0);
> +
> +    if (Size == ElementSize)
> +      return ElementTy;
> +
> +    // Try to build up a sub-structure.

The following logic is only correct for packed structs.  I suggest instead you
use StructLayout to work out the fields you start and end in, check that you
don't start partway through the first field or end partway through the last one,
and then grab that range of fields to make the new struct, which should be packed
iff the original one was.  I suggest you also add an assertion that the size of
the new struct is OK, and some testcases for structs with alignment padding etc.
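Roughly (again just a sketch; the exact boundary checks need care):

    const StructLayout *SL = TD.getStructLayout(STy);
    unsigned BeginIndex = SL->getElementContainingOffset(Offset);
    if (Offset != SL->getElementOffset(BeginIndex))
      return 0; // starts partway through a field
    unsigned EndIndex = SL->getElementContainingOffset(Offset + Size - 1);
    // ... also bail out if Offset + Size doesn't line up with the end of
    // the EndIndex field ...
    StructType *SubTy = StructType::get(
        STy->getContext(),
        makeArrayRef(STy->element_begin() + BeginIndex,
                     STy->element_begin() + EndIndex + 1),
        STy->isPacked());
    if (TD.getTypeAllocSize(SubTy) != Size)
      return 0; // or the assertion mentioned above, once you trust the logic
    return SubTy;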

> +    SmallVector<Type *, 4> ElementTys;
> +    ElementTys.push_back(ElementTy);
> +    Size -= ElementSize;
> +    while (++EI != EE) {
> +      if (Size == 0)
> +        return StructType::get(ElementTy->getContext(), ElementTys);
> +      ElementSize = TD.getTypeAllocSize(*EI);
> +      if (Size < ElementSize)
> +        return 0;
> +      Size -= ElementSize;
> +    }
> +    return 0;
> +  }
> +  return 0;
> +}
> +
> +bool SROA::rewriteAllocaPartition(AllocaInst &AI,
> +                                  AllocaPartitioning &P,
> +                                  AllocaPartitioning::iterator PI) {
> +  uint64_t AllocaSize = PI->EndOffset - PI->BeginOffset;
> +  assert(P.use_begin(PI) != P.use_end(PI));
> +
> +  // Try to compute a friendly type for this partition of the alloca. This
> +  // won't always succeed, in which case we fall back to a legal integer type
> +  // or an i8 array of an appropriate size.
> +  Type *AllocaTy = 0;
> +  if (Type *PartitionTy = P.getCommonType(PI))
> +    if (TD->getTypeAllocSize(PartitionTy) == AllocaSize)
> +      AllocaTy = PartitionTy;
> +  if (!AllocaTy && AI.getAllocatedType()->isSized())

At this point you already know that the alloca type is sized, right?

> +    if (Type *PartitionTy = getTypePartition(*TD, AI.getAllocatedType(),
> +                                             PI->BeginOffset, AllocaSize))
> +      AllocaTy = PartitionTy;
> +  if ((!AllocaTy ||
> +       (AllocaTy->isArrayTy() &&
> +        AllocaTy->getArrayElementType()->isIntegerTy())) &&
> +      TD->isLegalInteger(AllocaSize * 8))
> +    AllocaTy = Type::getIntNTy(*C, AllocaSize * 8);
> +  if (!AllocaTy)
> +    AllocaTy = ArrayType::get(Type::getInt8Ty(*C), AllocaSize);
> +
> +  // Check for the case where we're going to rewrite to a new alloca of the
> +  // exact same type as the original, and with the same access offsets. In that
> +  // case, re-use the existing alloca, but still run through the rewriter to
> +  // performe phi and select speculation.
> +  AllocaInst *NewAI;
> +  if (AllocaTy == AI.getAllocatedType()) {

Is this case actually possible?  How can it occur?

> +    assert(PI->BeginOffset == 0 &&
> +           "Non-zero begin offset but same alloca type");
> +    assert(PI == P.begin() && "Begin offset is zero on later partition");
> +    NewAI = &AI;
> +  } else {
> +    // FIXME: The alignment here is almost certainly wrong, but it matches what
> +    // the old SROA pass used.

It isn't wrong as far as I can see.  The new memory may be more aligned than the
old memory slice inside the original alloca, but that's OK.  It would be wrong
if it was less aligned than the old.

> +    NewAI = new AllocaInst(AllocaTy, 0, AI.getAlignment(),
> +                           AI.getName() + ".sroa." + Twine(PI - P.begin()),
> +                           &AI);
> +    ++NumNewAllocas;
> +  }
> +
> +  DEBUG(dbgs() << "Rewriting alloca partition "
> +               << "[" << PI->BeginOffset << "," << PI->EndOffset << ") to: "
> +               << *NewAI << "\n");
> +
> +  AllocaPartitionRewriter Rewriter(*TD, P, PI, *this, AI, *NewAI,
> +                                   PI->BeginOffset, PI->EndOffset);
> +  DEBUG(dbgs() << "  rewriting ");
> +  DEBUG(P.print(dbgs(), PI, ""));
> +  if (Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI))) {
> +    DEBUG(dbgs() << "  and queuing for promotion\n");
> +    PromotableAllocas.push_back(NewAI);
> +  } else if (NewAI != &AI) {
> +    // If we can't promote the alloca, iterate on it to check for new
> +    // refinements exposed by splitting the current alloca. Don't iterate on an
> +    // alloca which didn't actually change and didn't get promoted.
> +    Worklist.insert(NewAI);
> +  }
> +  return true;
> +}
> +
> +bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) {
> +  bool Changed = false;
> +  for (AllocaPartitioning::iterator PI = P.begin(), PE = P.end(); PI != PE;
> +       ++PI)
> +    Changed |= rewriteAllocaPartition(AI, P, PI);
> +
> +  return Changed;
> +}
> +
> +bool SROA::runOnAlloca(AllocaInst &AI) {
> +  DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
> +  ++NumAllocasAnalyzed;
> +
> +  // Special case dead allocas, as they're trivial.
> +  if (AI.use_empty()) {
> +    AI.eraseFromParent();
> +    return true;
> +  }
> +
> +  // Skip alloca forms that this analysis can't handle.
> +  if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
> +      TD->getTypeAllocSize(AI.getAllocatedType()) == 0)
> +    return false;
> +
> +  // First check if this is a non-aggregate type that we should simply promote.
> +  if (!AI.getAllocatedType()->isAggregateType() && isAllocaPromotable(&AI)) {
> +    DEBUG(dbgs() << "  Trivially scalar type, queuing for promotion...\n");
> +    PromotableAllocas.push_back(&AI);
> +    return false;
> +  }
> +
> +  // Build the partition set using a recursive instruction-visiting builder.
> +  AllocaPartitioning P(*TD, AI);
> +  DEBUG(P.print(dbgs()));
> +  if (P.isEscaped())
> +    return false;
> +
> +  // No partitions to split. Leave the dead alloca for a later pass to clean up.

Couldn't it be added to DeadInsts?

> +  if (P.begin() == P.end())
> +    return false;
> +
> +  return splitAlloca(AI, P);
> +}
> +
> +void SROA::deleteDeadInstructions() {
> +  DeadSplitInsts.clear();

This seems like a strange place to clear DeadSplitInsts.

> +  while (!DeadInsts.empty()) {
> +    Instruction *I = DeadInsts.pop_back_val();
> +    DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
> +
> +    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
> +      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
> +        // Zero out the operand and see if it becomes trivially dead.
> +        *OI = 0;
> +        if (isInstructionTriviallyDead(U))
> +          DeadInsts.push_back(U);
> +      }
> +
> +    ++NumDeleted;
> +    I->eraseFromParent();
> +  }
> +}
> +

Ciao, Duncan.


