[llvm-commits] PATCH: A new SROA implementation

Duncan Sands baldrick at free.fr
Thu Aug 23 12:58:48 PDT 2012


Hi Chandler,

On 22/08/12 07:46, Chandler Carruth wrote:
> Ok, I think this patch is pretty close to being ready to go into the tree,
> modulo any detailed code review folks are willing to do. Just send me comments
> if you have them.

more comments, picking up where I left off last night.  There is still a chunk
I haven't gone through yet; maybe tomorrow.

> +namespace {
> +// InstVisitor to rewrite instructions using a particular region of an alloca
> +// to use a new alloca.
> +class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter,
> +                                                   bool> {
> +  // Befriend the base class so it can delegate to private visit methods.
> +  friend class llvm::InstVisitor<AllocaPartitionRewriter, bool>;
> +
> +  const TargetData &TD;
> +  AllocaPartitioning &P;
> +  SROA &Pass;
> +  AllocaInst &OldAI, &NewAI;
> +  const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
> +
> +  // If we are rewriting an alloca partition which can be written as pure
> +  // vector operations, we stash extra information here.

It would be nice to have a summary of what is known in that case.  For example,
the later code seems to assume that you never have to deal with only part of a
vector element.

> +  VectorType *VecTy;
> +  Type *ElementTy;
> +  uint64_t ElementSize;
> +
> +  // The offset of the partition user currently being rewritten.
> +  uint64_t BeginOffset, EndOffset;
> +  Instruction *OldPtr;
> +
> +  // The name prefix to use when rewriting instructions for this alloca.
> +  std::string NamePrefix;
> +
> +public:
> +  AllocaPartitionRewriter(const TargetData &TD, AllocaPartitioning &P,
> +                          AllocaPartitioning::iterator PI,
> +                          SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI,
> +                          uint64_t NewBeginOffset, uint64_t NewEndOffset)
> +    : TD(TD), P(P), Pass(Pass),
> +      OldAI(OldAI), NewAI(NewAI),
> +      NewAllocaBeginOffset(NewBeginOffset),
> +      NewAllocaEndOffset(NewEndOffset),
> +      VecTy(), ElementTy(), ElementSize(),
> +      BeginOffset(), EndOffset() {
> +  }
> +
> +  bool visitUsers(AllocaPartitioning::const_use_iterator I,
> +                  AllocaPartitioning::const_use_iterator E) {
> +    if (isVectorPromotionViable(TD, NewAI.getAllocatedType(), P,
> +                                NewAllocaBeginOffset, NewAllocaEndOffset,
> +                                I, E)) {
> +      ++NumVectorized;
> +      VecTy = cast<VectorType>(NewAI.getAllocatedType());
> +      ElementTy = VecTy->getElementType();
> +      ElementSize = TD.getTypeAllocSize(ElementTy);
> +    }
> +    bool CanSROA = true;
> +    for (; I != E; ++I) {
> +      BeginOffset = I->BeginOffset;
> +      EndOffset = I->EndOffset;
> +      OldPtr = I->Ptr;
> +      NamePrefix = (Twine(NewAI.getName()) + "." + Twine(BeginOffset)).str();
> +      CanSROA &= visit(I->User);
> +    }
> +    if (VecTy) {
> +      assert(CanSROA);
> +      VecTy = 0;
> +      ElementTy = 0;
> +      ElementSize = 0;
> +    }
> +    return CanSROA;
> +  }
> +
> +private:
> +  // Every instruction which can end up as a user must have a rewrite rule.
> +  bool visitInstruction(Instruction &I) {
> +    DEBUG(dbgs() << "    !!!! Cannot rewrite: " << I << "\n");
> +    llvm_unreachable("No rewrite rule for this instruction!");
> +  }
> +
> +  Twine getName(const Twine &Suffix) {
> +    return NamePrefix + Suffix;
> +  }
> +
> +  Value *getAdjustedAllocaPtr(IRBuilder<> IRB, Type *PointerTy) {
> +    assert(BeginOffset >= NewAllocaBeginOffset);
> +    APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset);
> +    return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy, getName(""));
> +  }
> +
> +  ConstantInt *getIndex(uint64_t Offset) {
> +    assert(VecTy && "Can only call getIndex when rewriting a vector");
> +    uint64_t RelOffset = Offset - NewAllocaBeginOffset;
> +    uint64_t Index = RelOffset / ElementSize;
> +    assert(Index * ElementSize == RelOffset);
> +    return ConstantInt::get(Type::getInt32Ty(VecTy->getContext()), Index);
> +  }
> +
> +  void deleteIfTriviallyDead(Value *V) {
> +    Instruction *I = cast<Instruction>(V);
> +    if (isInstructionTriviallyDead(I))
> +      Pass.DeadInsts.push_back(I);
> +  }
> +
> +  Value *getValueCast(IRBuilder<> IRB, Value *V, Type *Ty) {
> +    if (V->getType()->isIntegerTy() && Ty->isPointerTy())
> +      return IRB.CreateIntToPtr(V, Ty);
> +    if (V->getType()->isPointerTy() && Ty->isIntegerTy())
> +      return IRB.CreatePtrToInt(V, Ty);
> +
> +    return IRB.CreateBitCast(V, Ty);

Maybe you can use getCastOpcode here.
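Untested, but I imagine something along these lines would work, letting
CastInst::getCastOpcode pick IntToPtr/PtrToInt/BitCast and feeding the result
to IRBuilder::CreateCast (the signedness flags shouldn't matter for the casts
reachable from here):

  Value *getValueCast(IRBuilder<> IRB, Value *V, Type *Ty) {
    // Let getCastOpcode choose between IntToPtr, PtrToInt and BitCast.
    Instruction::CastOps Op
      = CastInst::getCastOpcode(V, /*SrcIsSigned=*/false, Ty,
                                /*DstIsSigned=*/false);
    return IRB.CreateCast(Op, V, Ty);
  }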

> +  }
> +
> +  bool rewriteVectorizedLoadInst(IRBuilder<> IRB, LoadInst &LI, Value *OldOp) {
> +    Value *Result;
> +    if (LI.getType() == VecTy->getElementType() ||
> +        BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) {
> +      Result
> +        = IRB.CreateExtractElement(IRB.CreateLoad(&NewAI, getName(".load")),
> +                                   getIndex(BeginOffset),
> +                                   getName(".extract"));
> +    } else {
> +      Result = IRB.CreateLoad(&NewAI, getName(".load"));
> +    }
> +    if (Result->getType() != LI.getType())
> +      Result = getValueCast(IRB, Result, LI.getType());
> +    LI.replaceAllUsesWith(Result);
> +    Pass.DeadInsts.push_back(&LI);
> +
> +    DEBUG(dbgs() << "          to: " << *Result << "\n");
> +    return true;
> +  }
> +
> +  bool visitLoadInst(LoadInst &LI) {
> +    DEBUG(dbgs() << "    original: " << LI << "\n");
> +    Value *OldOp = LI.getOperand(0);
> +    assert(OldOp == OldPtr);
> +    IRBuilder<> IRB(&LI);
> +
> +    if (VecTy)
> +      return rewriteVectorizedLoadInst(IRB, LI, OldOp);
> +
> +    Value *NewPtr = getAdjustedAllocaPtr(IRB,
> +                                         LI.getPointerOperand()->getType());
> +    LI.setOperand(0, NewPtr);
> +    DEBUG(dbgs() << "          to: " << LI << "\n");
> +
> +    deleteIfTriviallyDead(OldOp);
> +    return NewPtr == &NewAI && !LI.isVolatile();
> +  }
> +
> +  bool rewriteVectorizedStoreInst(IRBuilder<> IRB, StoreInst &SI,
> +                                  Value *OldOp) {
> +    Value *V = SI.getValueOperand();
> +    if (V->getType() == ElementTy ||
> +        BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) {
> +      if (V->getType() != ElementTy)
> +        V = getValueCast(IRB, V, ElementTy);
> +      V = IRB.CreateInsertElement(IRB.CreateLoad(&NewAI, getName(".load")), V,
> +                                  getIndex(BeginOffset),
> +                                  getName(".insert"));
> +    } else if (V->getType() != VecTy) {
> +      V = getValueCast(IRB, V, VecTy);
> +    }
> +    StoreInst *Store = IRB.CreateStore(V, &NewAI);
> +    Pass.DeadInsts.push_back(&SI);
> +
> +    (void)Store;
> +    DEBUG(dbgs() << "          to: " << *Store << "\n");
> +    return true;
> +  }
> +
> +  bool visitStoreInst(StoreInst &SI) {
> +    DEBUG(dbgs() << "    original: " << SI << "\n");
> +    Value *OldOp = SI.getOperand(1);
> +    assert(OldOp == OldPtr);
> +    IRBuilder<> IRB(&SI);
> +
> +    if (VecTy)
> +      return rewriteVectorizedStoreInst(IRB, SI, OldOp);
> +
> +    Value *NewPtr = getAdjustedAllocaPtr(IRB,
> +                                         SI.getPointerOperand()->getType());
> +    SI.setOperand(1, NewPtr);
> +    DEBUG(dbgs() << "          to: " << SI << "\n");
> +
> +    deleteIfTriviallyDead(OldOp);
> +    return NewPtr == &NewAI && !SI.isVolatile();
> +  }
> +
> +  bool visitMemSetInst(MemSetInst &II) {
> +    DEBUG(dbgs() << "    original: " << II << "\n");
> +    IRBuilder<> IRB(&II);
> +    assert(II.getRawDest() == OldPtr);
> +
> +    // Record this instruction for deletion.
> +    if (Pass.DeadSplitInsts.insert(&II))
> +      Pass.DeadInsts.push_back(&II);
> +
> +    Type *AllocaTy = NewAI.getAllocatedType();
> +    Type *ScalarTy = AllocaTy->getScalarType();
> +
> +    // If this doesn't map cleanly onto the alloca type, and that type isn't
> +    // a single value type, just emit a memcpy.

memcpy -> memset

> +    if (!VecTy && (BeginOffset != NewAllocaBeginOffset ||
> +                   EndOffset != NewAllocaEndOffset ||
> +                   !AllocaTy->isSingleValueType() ||
> +                   !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)))) {
> +      Type *SizeTy = II.getLength()->getType();
> +      Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset);
> +
> +      CallInst *New
> +        = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB,
> +                                                II.getRawDest()->getType()),
> +                           II.getValue(), Size, II.getAlignment(),
> +                           II.isVolatile());
> +      (void)New;
> +      DEBUG(dbgs() << "          to: " << *New << "\n");
> +      return false;
> +    }
> +
> +    // If we can represent this as a simple value, we have to build the actual
> +    // value to store, which requires expanding the byte present in memset to
> +    // a sensible representation for the alloca type. This is essentially
> +    // splatting the byte to a sufficiently wide integer, bitcasting to the
> +    // desired scalar type, and splatting it across any desired vector type.

How useful is it for you to do this kind of thing?  Won't it happen later in
instcombine anyway?

> +    Value *V = II.getValue();
> +    IntegerType *VTy = cast<IntegerType>(V->getType());
> +    Type *IntTy = Type::getIntNTy(VTy->getContext(),
> +                                  TD.getTypeSizeInBits(ScalarTy));
> +    if (TD.getTypeSizeInBits(ScalarTy) > VTy->getBitWidth())
> +      V = IRB.CreateMul(IRB.CreateZExt(V, IntTy, getName(".zext")),
> +                        ConstantExpr::getUDiv(
> +                          Constant::getAllOnesValue(IntTy),
> +                          ConstantExpr::getZExt(
> +                            Constant::getAllOnesValue(V->getType()),
> +                            IntTy)),
> +                        getName(".isplat"));
> +    if (V->getType() != ScalarTy) {
> +      if (ScalarTy->isPointerTy())
> +        V = IRB.CreateIntToPtr(V, ScalarTy);
> +      else if (ScalarTy->isPrimitiveType() || ScalarTy->isVectorTy())
> +        V = IRB.CreateBitCast(V, ScalarTy);
> +      else if (ScalarTy->isIntegerTy())
> +        llvm_unreachable("Computed different integer types with equal widths");
> +      else
> +        llvm_unreachable("Invalid scalar type");
> +    }
> +
> +    // If this is an element-wide memset of a vectorizable alloca, insert it.
> +    if (VecTy && (BeginOffset > NewAllocaBeginOffset ||
> +                  EndOffset < NewAllocaEndOffset)) {
> +      StoreInst *Store = IRB.CreateStore(
> +        IRB.CreateInsertElement(IRB.CreateLoad(&NewAI, getName(".load")), V,
> +                                getIndex(BeginOffset),
> +                                getName(".insert")),
> +        &NewAI);
> +      (void)Store;
> +      DEBUG(dbgs() << "          to: " << *Store << "\n");
> +      return true;
> +    }
> +
> +    // Splat to a vector if needed.
> +    if (VectorType *VecTy = dyn_cast<VectorType>(AllocaTy)) {
> +      VectorType *SplatSourceTy = VectorType::get(V->getType(), 1);
> +      V = IRB.CreateShuffleVector(
> +        IRB.CreateInsertElement(UndefValue::get(SplatSourceTy), V,
> +                                ConstantInt::get(IRB.getInt32Ty(), 0),
> +                                getName(".vsplat.insert")),
> +        UndefValue::get(SplatSourceTy),
> +        ConstantVector::getSplat(VecTy->getNumElements(),
> +                                 ConstantInt::get(IRB.getInt32Ty(), 0)),
> +        getName(".vsplat.shuffle"));
> +      assert(V->getType() == VecTy);
> +    }
> +
> +    Value *New = IRB.CreateStore(V, &NewAI, II.isVolatile());
> +    (void)New;
> +    DEBUG(dbgs() << "          to: " << *New << "\n");
> +    return !II.isVolatile();
> +  }
> +
> +  bool visitMemTransferInst(MemTransferInst &II) {
> +    // Rewriting of memory transfer instructions can be a bit tricky. We break
> +    // them into two categories: split intrinsics and unsplit intrinsics.
> +
> +    DEBUG(dbgs() << "    original: " << II << "\n");
> +    IRBuilder<> IRB(&II);
> +
> +    const AllocaPartitioning::MemTransferOffsets &MTO
> +      = P.getMemTransferOffsets(II);
> +
> +    // For unsplit intrinsics, we simply modify the source and destination
> +    // pointers in place. This isn't just an optimization, it is a matter of
> +    // correctness. With unsplit intrinsics we may be dealing with transfers
> +    // within a single alloca before SROA ran. We may also be dealing with
> +    // memmove instead of memcpy, and so simply updating the pointers is the
> +    // necessary for us to update both source and dest of a single call.
> +    if (!MTO.IsSplittable) {
> +      Value *OldOp = BeginOffset == MTO.DestBegin ? II.getRawDest()
> +                                                  : II.getRawSource();
> +      if (BeginOffset == MTO.DestBegin)
> +        II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType()));
> +      else
> +        II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType()));
> +
> +      DEBUG(dbgs() << "          to: " << II << "\n");
> +      deleteIfTriviallyDead(OldOp);
> +      return false;
> +    }
> +    // For split transfer intrinsics we have an incredibly useful assurance:
> +    // the source and destination do not reside within the same alloca, and at
> +    // least one of them does not escape. This means that we can replace
> +    // memmove with memcpy, and we don't need to worry about all manner of
> +    // downsides to splitting and transforming the operations.
> +
> +    assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr);
> +    bool IsDest = II.getRawDest() == OldPtr;
> +    // Compute the relative offset within the transfer.
> +    unsigned IntPtrWidth = TD.getPointerSizeInBits();
> +    APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin
> +                                                       : MTO.SourceBegin));
> +
> +    // If this doesn't map cleanly onto the alloca type, and that type isn't
> +    // a single value type, just emit a memcpy.

Do you care if it is not a *legal* single value type?
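I.e. should the EmitMemCpy condition below also mirror the memset path's
legal-integer check?  Just a sketch, untested:

    Type *ScalarTy = NewAI.getAllocatedType()->getScalarType();
    bool EmitMemCpy
      = !VecTy && (BeginOffset != NewAllocaBeginOffset ||
                   EndOffset != NewAllocaEndOffset ||
                   !NewAI.getAllocatedType()->isSingleValueType() ||
                   // Mirror the memset path: bail out if the scalar type
                   // isn't a legal integer width for the target.
                   !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)));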

> +    bool EmitMemCpy
> +      = !VecTy && (BeginOffset != NewAllocaBeginOffset ||
> +                   EndOffset != NewAllocaEndOffset ||
> +                   !NewAI.getAllocatedType()->isSingleValueType());
> +
> +    // If we're just going to emit a memcpy and the alloca hasn't changed, this
> +    // is a no-op.
> +    if (EmitMemCpy && &OldAI == &NewAI) {
> +      // Assert that we are in fact not changing the copy size.
> +      if (IsDest) {
> +        assert(BeginOffset == MTO.DestBegin);
> +        assert(EndOffset == MTO.DestEnd);
> +      } else {
> +        assert(BeginOffset == MTO.SourceBegin);
> +        assert(EndOffset == MTO.SourceEnd);
> +      }
> +      return false;
> +    }
> +    // Record this instruction for deletion.
> +    if (Pass.DeadSplitInsts.insert(&II))
> +      Pass.DeadInsts.push_back(&II);
> +
> +    bool IsVectorElement = VecTy && (BeginOffset > NewAllocaBeginOffset ||
> +                                     EndOffset < NewAllocaEndOffset);

In the VecTy case, what do you know here?  I'm kind of lost disentangling all
the different cases that need to be handled, and which bits of code deal with
what exactly.

> +
> +    Type *OtherPtrTy = IsDest ? II.getRawSource()->getType()
> +                              : II.getRawDest()->getType();
> +    if (!EmitMemCpy)
> +      OtherPtrTy = IsVectorElement ? VecTy->getElementType()->getPointerTo()
> +                                   : NewAI.getType();
> +
> +    // Compute the other pointer, folding as much as possible to produce
> +    // a single, simple GEP in most cases.
> +    Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
> +    OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy,
> +                              getName("." + OtherPtr->getName()));
> +
> +    // Strip all inbounds GEPs and pointer casts to try to dig out any root
> +    // alloca that should be re-examined after rewriting this instruction.
> +    if (AllocaInst *AI
> +          = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets()))
> +      Pass.Worklist.insert(AI);
> +
> +    if (EmitMemCpy) {
> +      Value *OurPtr
> +        = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType()
> +                                           : II.getRawSource()->getType());
> +      Type *SizeTy = II.getLength()->getType();
> +      Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset);
> +
> +      CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr,
> +                                       IsDest ? OtherPtr : OurPtr,
> +                                       Size, II.getAlignment(),
> +                                       II.isVolatile());
> +      (void)New;
> +      DEBUG(dbgs() << "          to: " << *New << "\n");
> +      return false;
> +    }
> +
> +    Value *SrcPtr = OtherPtr;
> +    Value *DstPtr = &NewAI;
> +    if (!IsDest)
> +      std::swap(SrcPtr, DstPtr);
> +
> +    Value *Src;
> +    if (IsVectorElement && !IsDest) {
> +      // We have to extract rather than load.
> +      Src = IRB.CreateExtractElement(IRB.CreateLoad(SrcPtr,
> +                                                    getName(".copyload")),
> +                                     getIndex(BeginOffset),
> +                                     getName(".copyextract"));
> +    } else {
> +      Src = IRB.CreateLoad(SrcPtr, II.isVolatile(), getName(".copyload"));
> +    }
> +
> +    if (IsVectorElement && IsDest) {
> +      // We have to insert into a loaded copy before storing.
> +      Src = IRB.CreateInsertElement(IRB.CreateLoad(&NewAI, getName(".load")),
> +                                    Src, getIndex(BeginOffset),
> +                                    getName(".insert"));
> +    }
> +
> +    Value *Store = IRB.CreateStore(Src, DstPtr, II.isVolatile());
> +    (void)Store;
> +    DEBUG(dbgs() << "          to: " << *Store << "\n");
> +    return !II.isVolatile();
> +  }
> +
> +  bool visitIntrinsicInst(IntrinsicInst &II) {
> +    assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
> +           II.getIntrinsicID() == Intrinsic::lifetime_end);
> +    DEBUG(dbgs() << "    original: " << II << "\n");
> +    IRBuilder<> IRB(&II);
> +    assert(II.getArgOperand(1) == OldPtr);
> +
> +    // Record this instruction for deletion.
> +    if (Pass.DeadSplitInsts.insert(&II))
> +      Pass.DeadInsts.push_back(&II);
> +
> +    ConstantInt *Size
> +      = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
> +                         EndOffset - BeginOffset);
> +    Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType());
> +    Value *New;
> +    if (II.getIntrinsicID() == Intrinsic::lifetime_start)
> +      New = IRB.CreateLifetimeStart(Ptr, Size);
> +    else
> +      New = IRB.CreateLifetimeEnd(Ptr, Size);
> +
> +    DEBUG(dbgs() << "          to: " << *New << "\n");
> +    return true;
> +  }
> +
> +  /// PHI instructions that use an alloca and are subsequently loaded can be
> +  /// rewritten to load both input pointers in the pred blocks and then PHI the
> +  /// results, allowing the load of the alloca to be promoted.
> +  /// From this:
> +  ///   %P2 = phi [i32* %Alloca, i32* %Other]
> +  ///   %V = load i32* %P2
> +  /// to:
> +  ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
> +  ///   ...
> +  ///   %V2 = load i32* %Other
> +  ///   ...
> +  ///   %V = phi [i32 %V1, i32 %V2]
> +  ///
> +  /// We can do this to a select if its only uses are loads and if the operand
> +  /// to the select can be loaded unconditionally.
> +  ///
> +  /// FIXME: This should be hoisted into a generic utility, likely in
> +  /// Transforms/Util/Local.h
> +  bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) {
> +    // For now, we can only do this promotion if the load is in the same block
> +    // as the PHI, and if there are no stores between the phi and load.
> +    // TODO: Allow recursive phi users.
> +    // TODO: Allow stores.
> +    BasicBlock *BB = PN.getParent();
> +    unsigned MaxAlign = 0;
> +    for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end();
> +         UI != UE; ++UI) {
> +      LoadInst *LI = dyn_cast<LoadInst>(*UI);
> +      if (LI == 0 || !LI->isSimple()) return false;
> +
> +      // For now we only allow loads in the same block as the PHI.  This is
> +      // a common case that happens when instcombine merges two loads through
> +      // a PHI.
> +      if (LI->getParent() != BB) return false;
> +
> +      // Ensure that there are no instructions between the PHI and the load that
> +      // could store.
> +      for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI)
> +        if (BBI->mayWriteToMemory())
> +          return false;
> +
> +      MaxAlign = std::max(MaxAlign, LI->getAlignment());
> +      Loads.push_back(LI);
> +    }
> +
> +    // We can only transform this if it is safe to push the loads into the
> +    // predecessor blocks. The only thing to watch out for is that we can't put
> +    // a possibly trapping load in the predecessor if it is a critical edge.
> +    for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num;
> +         ++Idx) {
> +      TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
> +      Value *InVal = PN.getIncomingValue(Idx);
> +
> +      // If the value is produced by the terminator of the predecessor (an
> +      // invoke) or it has siide-effects, there is no valid place to put a load

siide-effects -> side-effects

> +      // in the predecessor.
> +      if (TI == InVal || TI->mayHaveSideEffects())
> +        return false;
> +
> +      // If the predecessor has a single successor, then the edge isn't
> +      // critical.
> +      if (TI->getNumSuccessors() == 1)
> +        continue;
> +
> +      // If this pointer is always safe to load, or if we can prove that there
> +      // is already a load in the block, then we can move the load to the pred
> +      // block.
> +      if (InVal->isDereferenceablePointer() ||
> +          isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD))
> +        continue;
> +
> +      return false;
> +    }
> +
> +    return true;
> +  }
> +
> +  bool visitPHINode(PHINode &PN) {
> +    DEBUG(dbgs() << "    original: " << PN << "\n");
> +
> +    // Find the operand we need to rewrite here.
> +    User::op_iterator OI = PN.op_begin(), OE = PN.op_end();
> +    for (; OI != OE && *OI != OldPtr; ++OI)
> +      ;
> +    assert(OI != OE);
> +
> +    IRBuilder<> Builder(&PN);
> +
> +    SmallVector<LoadInst *, 4> Loads;
> +    if (!isSafePHIToSpeculate(PN, Loads)) {
> +      Value *NewPtr = getAdjustedAllocaPtr(Builder, (*OI)->getType());
> +      *OI = NewPtr;
> +      DEBUG(dbgs() << "          to: " << PN << "\n");
> +      deleteIfTriviallyDead(OldPtr);
> +      return false;
> +    }
> +    assert(!Loads.empty());
> +
> +    Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
> +    PHINode *NewPN = Builder.CreatePHI(LoadTy, PN.getNumIncomingValues());
> +    NewPN->takeName(&PN);
> +
> +    // Get the TBAA tag and alignment to use from one of the loads.  It doesn't
> +    // matter which one we get and if any differ, it doesn't matter.

Why doesn't it matter?  In essence you are saying that the tbaa tag is a
property of the value being loaded, not a property of the load.  But is that
warranted?  It may be true for how clang produces tbaa tags, but that's not
what matters...  And even then, is it safe?
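If you do want to keep the tags, one conservative option (just a sketch,
untested) would be to keep a tag only when all of the loads agree on it:

    MDNode *TBAATag = Loads[0]->getMetadata(LLVMContext::MD_tbaa);
    for (unsigned i = 1, e = Loads.size(); i != e; ++i)
      if (Loads[i]->getMetadata(LLVMContext::MD_tbaa) != TBAATag)
        TBAATag = 0; // The loads disagree; drop TBAA rather than guess.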

> +    LoadInst *SomeLoad = cast<LoadInst>(Loads.back());
> +    MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
> +    unsigned Align = SomeLoad->getAlignment();
> +    Value *NewPtr = getAdjustedAllocaPtr(Builder, OldPtr->getType());
> +
> +    // Rewrite all loads of the PN to use the new PHI.
> +    do {
> +      LoadInst *LI = Loads.pop_back_val();
> +      LI->replaceAllUsesWith(NewPN);
> +      Pass.DeadInsts.push_back(LI);
> +    } while (!Loads.empty());
> +
> +    // Inject loads into all of the pred blocks.
> +    for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
> +      BasicBlock *Pred = PN.getIncomingBlock(Idx);
> +      TerminatorInst *TI = Pred->getTerminator();
> +      Value *InVal = PN.getIncomingValue(Idx);
> +      IRBuilder<> PredBuilder(TI);
> +
> +      // Map the value to the new alloca pointer if this was the old alloca
> +      // pointer.
> +      bool ThisOperand = InVal == OldPtr;
> +      if (ThisOperand)
> +        InVal = NewPtr;
> +
> +      LoadInst *Load
> +        = PredBuilder.CreateLoad(InVal, getName(".sroa.speculate." +
> +                                                Pred->getName()));
> +      ++NumLoadsSpeculated;
> +      Load->setAlignment(Align);
> +      if (TBAATag)
> +        Load->setMetadata(LLVMContext::MD_tbaa, TBAATag);
> +      NewPN->addIncoming(Load, Pred);
> +
> +      if (ThisOperand)
> +        continue;
> +      Instruction *OtherPtr = dyn_cast<Instruction>(InVal);
> +      if (!OtherPtr)
> +        // No uses to rewrite.
> +        continue;
> +
> +      // Try to lookup and rewrite any partition uses corresponding to this phi
> +      // input.
> +      AllocaPartitioning::iterator PI
> +        = P.findPartitionForPHIOrSelectOperand(PN, OtherPtr);
> +      if (PI != P.end()) {
> +        // If the other pointer is within the partitioning, replace the PHI in
> +        // its uses with the load we just speculated, or add another load for
> +        // it to rewrite if we've already replaced the PHI.
> +        AllocaPartitioning::use_iterator UI
> +          = P.findPartitionUseForPHIOrSelectOperand(PN, OtherPtr);
> +        if (isa<PHINode>(*UI->User))
> +          UI->User = Load;
> +        else {
> +          AllocaPartitioning::PartitionUse OtherUse = *UI;
> +          OtherUse.User = Load;
> +          P.use_insert(PI, std::upper_bound(UI, P.use_end(PI), OtherUse),
> +                       OtherUse);
> +        }
> +      }
> +    }
> +    DEBUG(dbgs() << "          speculated to: " << *NewPN << "\n");
> +    return true;
> +  }
> +
> +  /// Select instructions that use an alloca and are subsequently loaded can be
> +  /// rewritten to load both input pointers and then select between the result,
> +  /// allowing the load of the alloca to be promoted.
> +  /// From this:
> +  ///   %P2 = select i1 %cond, i32* %Alloca, i32* %Other
> +  ///   %V = load i32* %P2
> +  /// to:
> +  ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
> +  ///   %V2 = load i32* %Other
> +  ///   %V = select i1 %cond, i32 %V1, i32 %V2
> +  ///
> +  /// We can do this to a select if its only uses are loads and if the operand
> +  /// to the select can be loaded unconditionally.
> +  bool isSafeSelectToSpeculate(SelectInst &SI,
> +                               SmallVectorImpl<LoadInst *> &Loads) {
> +    Value *TValue = SI.getTrueValue();
> +    Value *FValue = SI.getFalseValue();
> +    bool TDerefable = TValue->isDereferenceablePointer();
> +    bool FDerefable = FValue->isDereferenceablePointer();
> +
> +    for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end();
> +         UI != UE; ++UI) {
> +      LoadInst *LI = dyn_cast<LoadInst>(*UI);
> +      if (LI == 0 || !LI->isSimple()) return false;
> +
> +      // Both operands to the select need to be dereferencable, either
> +      // absolutely (e.g. allocas) or at this point because we can see other
> +      // accesses to it.
> +      if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI,
> +                                                      LI->getAlignment(), &TD))
> +        return false;
> +      if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI,
> +                                                      LI->getAlignment(), &TD))
> +        return false;
> +      Loads.push_back(LI);
> +    }
> +
> +    return true;
> +  }
> +
> +  bool visitSelectInst(SelectInst &SI) {
> +    DEBUG(dbgs() << "    original: " << SI << "\n");
> +
> +    // Find the operand we need to rewrite here.
> +    bool IsTrueVal = SI.getTrueValue() == OldPtr;

How about an assert that if it isn't the true value then it must be the false
value?  And how about an assert that it isn't both values?
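E.g. something like (untested):

    bool IsTrueVal = SI.getTrueValue() == OldPtr;
    assert((IsTrueVal || SI.getFalseValue() == OldPtr) &&
           "Pointer being rewritten isn't an operand of the select!");
    assert(!(IsTrueVal && SI.getFalseValue() == OldPtr) &&
           "Both select operands are the old alloca pointer!");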

> +    Value *NewPtr = getAdjustedAllocaPtr(IRBuilder<>(&SI), OldPtr->getType());
> +
> +    // If the select isn't safe to speculate, just use simple logic to emit it.
> +    SmallVector<LoadInst *, 4> Loads;
> +    if (!isSafeSelectToSpeculate(SI, Loads)) {
> +      SI.setOperand(IsTrueVal ? 1 : 2, NewPtr);
> +      DEBUG(dbgs() << "          to: " << SI << "\n");
> +      deleteIfTriviallyDead(OldPtr);
> +      return false;
> +    }
> +
> +    Value *OtherPtr = IsTrueVal ? SI.getFalseValue() : SI.getTrueValue();
> +    AllocaPartitioning::iterator PI
> +      = P.findPartitionForPHIOrSelectOperand(SI, OtherPtr);
> +    AllocaPartitioning::PartitionUse OtherUse;
> +    if (PI != P.end()) {
> +      // If the other pointer is within the partitioning, remove the select
> +      // from its uses. We'll add in the new loads below.
> +      AllocaPartitioning::use_iterator UI
> +        = P.findPartitionUseForPHIOrSelectOperand(SI, OtherPtr);
> +      OtherUse = *UI;
> +      P.use_erase(PI, UI);

Having to keep track of uses of other partitions like this looks like a pain
and seems fragile.  Do you really need to precompute all uses for all
partitions?  If you could visit one partition at a time, compute its uses and
rewrite them, then you wouldn't have to worry about the other partitions like
this.  Is something like that feasible?

> +    }
> +
> +    Value *TV = IsTrueVal ? NewPtr : SI.getTrueValue();
> +    Value *FV = IsTrueVal ? SI.getFalseValue() : NewPtr;
> +    // Replace the loads of the select with a select of two loads.
> +    while (!Loads.empty()) {
> +      LoadInst *LI = Loads.pop_back_val();
> +
> +      IRBuilder<> Builder(LI);
> +      LoadInst *TL =
> +        Builder.CreateLoad(TV, getName("." + LI->getName() + ".true"));
> +      LoadInst *FL =
> +        Builder.CreateLoad(FV, getName("." + LI->getName() + ".false"));
> +      NumLoadsSpeculated += 2;
> +      if (PI != P.end()) {
> +        LoadInst *OtherLoad = IsTrueVal ? FL : TL;
> +        assert(OtherUse.Ptr == OtherLoad->getOperand(0));
> +        OtherUse.User = OtherLoad;
> +        P.use_insert(PI, P.use_end(PI), OtherUse);
> +      }
> +
> +      // Transfer alignment and TBAA info if present.
> +      TL->setAlignment(LI->getAlignment());
> +      FL->setAlignment(LI->getAlignment());
> +      if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) {
> +        TL->setMetadata(LLVMContext::MD_tbaa, Tag);
> +        FL->setMetadata(LLVMContext::MD_tbaa, Tag);
> +      }
> +
> +      Value *V = Builder.CreateSelect(SI.getCondition(), TL, FL);
> +      V->takeName(LI);
> +      DEBUG(dbgs() << "          speculated to: " << *V << "\n");
> +      LI->replaceAllUsesWith(V);
> +      Pass.DeadInsts.push_back(LI);
> +    }
> +    if (PI != P.end())
> +      std::stable_sort(P.use_begin(PI), P.use_end(PI));
> +
> +    deleteIfTriviallyDead(OldPtr);
> +    return NewPtr == &NewAI;
> +  }
> +
> +};
> +}

Ciao, Duncan.


