[llvm-commits] [llvm] r165284 - /llvm/trunk/lib/Transforms/Scalar/SROA.cpp

Sean Silva silvas at purdue.edu
Thu Oct 4 18:52:01 PDT 2012


Protip: pass --patience or --histogram to git's diff-generating
commands (git diff, git log -p, etc.) to select an alternative diff
algorithm. I just tried it locally for this patch and the diff is
dramatically better: a single big block of '+' lines and a single big
block of '-' lines. FWIW, patience and histogram usually produce
essentially the same output (histogram is an extension of patience),
but either one is usually significantly better than the default
algorithm.

More info about patience diff, for the curious:
http://bramcohen.livejournal.com/73318.html
More info about histogram diff:
http://download.eclipse.org/jgit/docs/jgit-2.0.0.201206130900-r/apidocs/org/eclipse/jgit/diff/HistogramDiff.html

-- Sean Silva

On Thu, Oct 4, 2012 at 9:29 PM, Chandler Carruth <chandlerc at gmail.com> wrote:
> Author: chandlerc
> Date: Thu Oct  4 20:29:06 2012
> New Revision: 165284
>
> URL: http://llvm.org/viewvc/llvm-project?rev=165284&view=rev
> Log:
> Lift the speculation visitor above all the helpers that are targeted at
> the rewrite visitor, to make it a bit clearer that the speculation is
> completely independent.
>
> I promise that this is just a cut/paste of the one visitor, plus adding
> the anonymous namespace wrapping. The diff may look completely
> preposterous; it does in git, for some reason.
>
> Modified:
>     llvm/trunk/lib/Transforms/Scalar/SROA.cpp
>
> Modified: llvm/trunk/lib/Transforms/Scalar/SROA.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/SROA.cpp?rev=165284&r1=165283&r2=165284&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Scalar/SROA.cpp (original)
> +++ llvm/trunk/lib/Transforms/Scalar/SROA.cpp Thu Oct  4 20:29:06 2012
> @@ -1368,715 +1368,717 @@
>  INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates",
>                      false, false)
>
> -/// \brief Accumulate the constant offsets in a GEP into a single APInt offset.
> -///
> -/// If the provided GEP is all-constant, the total byte offset formed by the
> -/// GEP is computed and Offset is set to it. If the GEP has any non-constant
> -/// operands, the function returns false and the value of Offset is unmodified.
> -static bool accumulateGEPOffsets(const TargetData &TD, GEPOperator &GEP,
> -                                 APInt &Offset) {
> -  APInt GEPOffset(Offset.getBitWidth(), 0);
> -  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
> -       GTI != GTE; ++GTI) {
> -    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
> -    if (!OpC)
> -      return false;
> -    if (OpC->isZero()) continue;
> +namespace {
> +/// \brief Visitor to speculate PHIs and Selects where possible.
> +class PHIOrSelectSpeculator : public InstVisitor<PHIOrSelectSpeculator> {
> +  // Befriend the base class so it can delegate to private visit methods.
> +  friend class llvm::InstVisitor<PHIOrSelectSpeculator>;
>
> -    // Handle a struct index, which adds its field offset to the pointer.
> -    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
> -      unsigned ElementIdx = OpC->getZExtValue();
> -      const StructLayout *SL = TD.getStructLayout(STy);
> -      GEPOffset += APInt(Offset.getBitWidth(),
> -                         SL->getElementOffset(ElementIdx));
> -      continue;
> -    }
> +  const TargetData &TD;
> +  AllocaPartitioning &P;
> +  SROA &Pass;
>
> -    APInt TypeSize(Offset.getBitWidth(),
> -                   TD.getTypeAllocSize(GTI.getIndexedType()));
> -    if (VectorType *VTy = dyn_cast<VectorType>(*GTI)) {
> -      assert((VTy->getScalarSizeInBits() % 8) == 0 &&
> -             "vector element size is not a multiple of 8, cannot GEP over it");
> -      TypeSize = VTy->getScalarSizeInBits() / 8;
> -    }
> +public:
> +  PHIOrSelectSpeculator(const TargetData &TD, AllocaPartitioning &P, SROA &Pass)
> +    : TD(TD), P(P), Pass(Pass) {}
>
> -    GEPOffset += OpC->getValue().sextOrTrunc(Offset.getBitWidth()) * TypeSize;
> +  /// \brief Visit the users of an alloca partition and rewrite them.
> +  void visitUsers(AllocaPartitioning::const_iterator PI) {
> +    // Note that we need to use an index here as the underlying vector of uses
> +    // may be grown during speculation. However, we never need to re-visit the
> +    // new uses, and so we can use the initial size bound.
> +    for (unsigned Idx = 0, Size = P.use_size(PI); Idx != Size; ++Idx) {
> +      const AllocaPartitioning::PartitionUse &PU = P.getUse(PI, Idx);
> +      if (!PU.U)
> +        continue; // Skip dead use.
> +
> +      visit(cast<Instruction>(PU.U->getUser()));
> +    }
>    }
> -  Offset = GEPOffset;
> -  return true;
> -}
>
> -/// \brief Build a GEP out of a base pointer and indices.
> -///
> -/// This will return the BasePtr if that is valid, or build a new GEP
> -/// instruction using the IRBuilder if GEP-ing is needed.
> -static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr,
> -                       SmallVectorImpl<Value *> &Indices,
> -                       const Twine &Prefix) {
> -  if (Indices.empty())
> -    return BasePtr;
> +private:
> +  // By default, skip this instruction.
> +  void visitInstruction(Instruction &I) {}
>
> -  // A single zero index is a no-op, so check for this and avoid building a GEP
> -  // in that case.
> -  if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
> -    return BasePtr;
> +  /// PHI instructions that use an alloca and are subsequently loaded can be
> +  /// rewritten to load both input pointers in the pred blocks and then PHI the
> +  /// results, allowing the load of the alloca to be promoted.
> +  /// From this:
> +  ///   %P2 = phi [i32* %Alloca, i32* %Other]
> +  ///   %V = load i32* %P2
> +  /// to:
> +  ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
> +  ///   ...
> +  ///   %V2 = load i32* %Other
> +  ///   ...
> +  ///   %V = phi [i32 %V1, i32 %V2]
> +  ///
> +  /// We can do this to a select if its only uses are loads and if the operands
> +  /// to the select can be loaded unconditionally.
> +  ///
> +  /// FIXME: This should be hoisted into a generic utility, likely in
> +  /// Transforms/Util/Local.h
> +  bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) {
> +    // For now, we can only do this promotion if the load is in the same block
> +    // as the PHI, and if there are no stores between the phi and load.
> +    // TODO: Allow recursive phi users.
> +    // TODO: Allow stores.
> +    BasicBlock *BB = PN.getParent();
> +    unsigned MaxAlign = 0;
> +    for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end();
> +         UI != UE; ++UI) {
> +      LoadInst *LI = dyn_cast<LoadInst>(*UI);
> +      if (LI == 0 || !LI->isSimple()) return false;
>
> -  return IRB.CreateInBoundsGEP(BasePtr, Indices, Prefix + ".idx");
> -}
> +      // For now we only allow loads in the same block as the PHI.  This is
> +      // a common case that happens when instcombine merges two loads through
> +      // a PHI.
> +      if (LI->getParent() != BB) return false;
>
> -/// \brief Get a natural GEP off of the BasePtr walking through Ty toward
> -/// TargetTy without changing the offset of the pointer.
> -///
> -/// This routine assumes we've already established a properly offset GEP with
> -/// Indices, and arrived at the Ty type. The goal is to continue to GEP with
> -/// zero-indices down through type layers until we find one the same as
> -/// TargetTy. If we can't find one with the same type, we at least try to use
> -/// one with the same size. If none of that works, we just produce the GEP as
> -/// indicated by Indices to have the correct offset.
> -static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const TargetData &TD,
> -                                    Value *BasePtr, Type *Ty, Type *TargetTy,
> -                                    SmallVectorImpl<Value *> &Indices,
> -                                    const Twine &Prefix) {
> -  if (Ty == TargetTy)
> -    return buildGEP(IRB, BasePtr, Indices, Prefix);
> +      // Ensure that there are no instructions between the PHI and the load that
> +      // could store.
> +      for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI)
> +        if (BBI->mayWriteToMemory())
> +          return false;
>
> -  // See if we can descend into a struct and locate a field with the correct
> -  // type.
> -  unsigned NumLayers = 0;
> -  Type *ElementTy = Ty;
> -  do {
> -    if (ElementTy->isPointerTy())
> -      break;
> -    if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) {
> -      ElementTy = SeqTy->getElementType();
> -      Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(), 0)));
> -    } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
> -      ElementTy = *STy->element_begin();
> -      Indices.push_back(IRB.getInt32(0));
> -    } else {
> -      break;
> +      MaxAlign = std::max(MaxAlign, LI->getAlignment());
> +      Loads.push_back(LI);
>      }
> -    ++NumLayers;
> -  } while (ElementTy != TargetTy);
> -  if (ElementTy != TargetTy)
> -    Indices.erase(Indices.end() - NumLayers, Indices.end());
>
> -  return buildGEP(IRB, BasePtr, Indices, Prefix);
> -}
> +    // We can only transform this if it is safe to push the loads into the
> +    // predecessor blocks. The only thing to watch out for is that we can't put
> +    // a possibly trapping load in the predecessor if it is a critical edge.
> +    for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num;
> +         ++Idx) {
> +      TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
> +      Value *InVal = PN.getIncomingValue(Idx);
>
> -/// \brief Recursively compute indices for a natural GEP.
> -///
> -/// This is the recursive step for getNaturalGEPWithOffset that walks down the
> -/// element types adding appropriate indices for the GEP.
> -static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const TargetData &TD,
> -                                       Value *Ptr, Type *Ty, APInt &Offset,
> -                                       Type *TargetTy,
> -                                       SmallVectorImpl<Value *> &Indices,
> -                                       const Twine &Prefix) {
> -  if (Offset == 0)
> -    return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices, Prefix);
> +      // If the value is produced by the terminator of the predecessor (an
> +      // invoke) or it has side-effects, there is no valid place to put a load
> +      // in the predecessor.
> +      if (TI == InVal || TI->mayHaveSideEffects())
> +        return false;
>
> -  // We can't recurse through pointer types.
> -  if (Ty->isPointerTy())
> -    return 0;
> +      // If the predecessor has a single successor, then the edge isn't
> +      // critical.
> +      if (TI->getNumSuccessors() == 1)
> +        continue;
>
> -  // We try to analyze GEPs over vectors here, but note that these GEPs are
> -  // extremely poorly defined currently. The long-term goal is to remove GEPing
> -  // over a vector from the IR completely.
> -  if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
> -    unsigned ElementSizeInBits = VecTy->getScalarSizeInBits();
> -    if (ElementSizeInBits % 8)
> -      return 0; // GEPs over non-multiple of 8 size vector elements are invalid.
> -    APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
> -    APInt NumSkippedElements = Offset.udiv(ElementSize);
> -    if (NumSkippedElements.ugt(VecTy->getNumElements()))
> -      return 0;
> -    Offset -= NumSkippedElements * ElementSize;
> -    Indices.push_back(IRB.getInt(NumSkippedElements));
> -    return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(),
> -                                    Offset, TargetTy, Indices, Prefix);
> -  }
> +      // If this pointer is always safe to load, or if we can prove that there
> +      // is already a load in the block, then we can move the load to the pred
> +      // block.
> +      if (InVal->isDereferenceablePointer() ||
> +          isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD))
> +        continue;
>
> -  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
> -    Type *ElementTy = ArrTy->getElementType();
> -    APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy));
> -    APInt NumSkippedElements = Offset.udiv(ElementSize);
> -    if (NumSkippedElements.ugt(ArrTy->getNumElements()))
> -      return 0;
> +      return false;
> +    }
>
> -    Offset -= NumSkippedElements * ElementSize;
> -    Indices.push_back(IRB.getInt(NumSkippedElements));
> -    return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
> -                                    Indices, Prefix);
> +    return true;
>    }
>
> -  StructType *STy = dyn_cast<StructType>(Ty);
> -  if (!STy)
> -    return 0;
> +  void visitPHINode(PHINode &PN) {
> +    DEBUG(dbgs() << "    original: " << PN << "\n");
>
> -  const StructLayout *SL = TD.getStructLayout(STy);
> -  uint64_t StructOffset = Offset.getZExtValue();
> -  if (StructOffset >= SL->getSizeInBytes())
> -    return 0;
> -  unsigned Index = SL->getElementContainingOffset(StructOffset);
> -  Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
> -  Type *ElementTy = STy->getElementType(Index);
> -  if (Offset.uge(TD.getTypeAllocSize(ElementTy)))
> -    return 0; // The offset points into alignment padding.
> +    SmallVector<LoadInst *, 4> Loads;
> +    if (!isSafePHIToSpeculate(PN, Loads))
> +      return;
>
> -  Indices.push_back(IRB.getInt32(Index));
> -  return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
> -                                  Indices, Prefix);
> -}
> +    assert(!Loads.empty());
>
> -/// \brief Get a natural GEP from a base pointer to a particular offset and
> -/// resulting in a particular type.
> -///
> -/// The goal is to produce a "natural" looking GEP that works with the existing
> -/// composite types to arrive at the appropriate offset and element type for
> -/// a pointer. TargetTy is the element type the returned GEP should point-to if
> -/// possible. We recurse by decreasing Offset, adding the appropriate index to
> -/// Indices, and setting Ty to the result subtype.
> -///
> -/// If no natural GEP can be constructed, this function returns null.
> -static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const TargetData &TD,
> -                                      Value *Ptr, APInt Offset, Type *TargetTy,
> -                                      SmallVectorImpl<Value *> &Indices,
> -                                      const Twine &Prefix) {
> -  PointerType *Ty = cast<PointerType>(Ptr->getType());
> +    Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
> +    IRBuilder<> PHIBuilder(&PN);
> +    PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
> +                                          PN.getName() + ".sroa.speculated");
>
> -  // Don't consider any GEPs through an i8* as natural unless the TargetTy is
> -  // an i8.
> -  if (Ty == IRB.getInt8PtrTy() && TargetTy->isIntegerTy(8))
> -    return 0;
> -
> -  Type *ElementTy = Ty->getElementType();
> -  if (!ElementTy->isSized())
> -    return 0; // We can't GEP through an unsized element.
> -  APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy));
> -  if (ElementSize == 0)
> -    return 0; // Zero-length arrays can't help us build a natural GEP.
> -  APInt NumSkippedElements = Offset.udiv(ElementSize);
> -
> -  Offset -= NumSkippedElements * ElementSize;
> -  Indices.push_back(IRB.getInt(NumSkippedElements));
> -  return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
> -                                  Indices, Prefix);
> -}
> +    // Get the TBAA tag and alignment to use from one of the loads.  It doesn't
> +    // matter which one we get and if any differ, it doesn't matter.
> +    LoadInst *SomeLoad = cast<LoadInst>(Loads.back());
> +    MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
> +    unsigned Align = SomeLoad->getAlignment();
>
> -/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
> -/// resulting pointer has PointerTy.
> -///
> -/// This tries very hard to compute a "natural" GEP which arrives at the offset
> -/// and produces the pointer type desired. Where it cannot, it will try to use
> -/// the natural GEP to arrive at the offset and bitcast to the type. Where that
> -/// fails, it will try to use an existing i8* and GEP to the byte offset and
> -/// bitcast to the type.
> -///
> -/// The strategy for finding the more natural GEPs is to peel off layers of the
> -/// pointer, walking back through bit casts and GEPs, searching for a base
> -/// pointer from which we can compute a natural GEP with the desired
> -/// properities. The algorithm tries to fold as many constant indices into
> -/// a single GEP as possible, thus making each GEP more independent of the
> -/// surrounding code.
> -static Value *getAdjustedPtr(IRBuilder<> &IRB, const TargetData &TD,
> -                             Value *Ptr, APInt Offset, Type *PointerTy,
> -                             const Twine &Prefix) {
> -  // Even though we don't look through PHI nodes, we could be called on an
> -  // instruction in an unreachable block, which may be on a cycle.
> -  SmallPtrSet<Value *, 4> Visited;
> -  Visited.insert(Ptr);
> -  SmallVector<Value *, 4> Indices;
> +    // Rewrite all loads of the PN to use the new PHI.
> +    do {
> +      LoadInst *LI = Loads.pop_back_val();
> +      LI->replaceAllUsesWith(NewPN);
> +      Pass.DeadInsts.push_back(LI);
> +    } while (!Loads.empty());
>
> -  // We may end up computing an offset pointer that has the wrong type. If we
> -  // never are able to compute one directly that has the correct type, we'll
> -  // fall back to it, so keep it around here.
> -  Value *OffsetPtr = 0;
> +    // Inject loads into all of the pred blocks.
> +    for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
> +      BasicBlock *Pred = PN.getIncomingBlock(Idx);
> +      TerminatorInst *TI = Pred->getTerminator();
> +      Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx));
> +      Value *InVal = PN.getIncomingValue(Idx);
> +      IRBuilder<> PredBuilder(TI);
>
> -  // Remember any i8 pointer we come across to re-use if we need to do a raw
> -  // byte offset.
> -  Value *Int8Ptr = 0;
> -  APInt Int8PtrOffset(Offset.getBitWidth(), 0);
> +      LoadInst *Load
> +        = PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." +
> +                                         Pred->getName()));
> +      ++NumLoadsSpeculated;
> +      Load->setAlignment(Align);
> +      if (TBAATag)
> +        Load->setMetadata(LLVMContext::MD_tbaa, TBAATag);
> +      NewPN->addIncoming(Load, Pred);
>
> -  Type *TargetTy = PointerTy->getPointerElementType();
> +      Instruction *Ptr = dyn_cast<Instruction>(InVal);
> +      if (!Ptr)
> +        // No uses to rewrite.
> +        continue;
>
> -  do {
> -    // First fold any existing GEPs into the offset.
> -    while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
> -      APInt GEPOffset(Offset.getBitWidth(), 0);
> -      if (!accumulateGEPOffsets(TD, *GEP, GEPOffset))
> -        break;
> -      Offset += GEPOffset;
> -      Ptr = GEP->getPointerOperand();
> -      if (!Visited.insert(Ptr))
> -        break;
> -    }
> +      // Try to lookup and rewrite any partition uses corresponding to this phi
> +      // input.
> +      AllocaPartitioning::iterator PI
> +        = P.findPartitionForPHIOrSelectOperand(InUse);
> +      if (PI == P.end())
> +        continue;
>
> -    // See if we can perform a natural GEP here.
> -    Indices.clear();
> -    if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy,
> -                                           Indices, Prefix)) {
> -      if (P->getType() == PointerTy) {
> -        // Zap any offset pointer that we ended up computing in previous rounds.
> -        if (OffsetPtr && OffsetPtr->use_empty())
> -          if (Instruction *I = dyn_cast<Instruction>(OffsetPtr))
> -            I->eraseFromParent();
> -        return P;
> -      }
> -      if (!OffsetPtr) {
> -        OffsetPtr = P;
> -      }
> +      // Replace the Use in the PartitionUse for this operand with the Use
> +      // inside the load.
> +      AllocaPartitioning::use_iterator UI
> +        = P.findPartitionUseForPHIOrSelectOperand(InUse);
> +      assert(isa<PHINode>(*UI->U->getUser()));
> +      UI->U = &Load->getOperandUse(Load->getPointerOperandIndex());
>      }
> +    DEBUG(dbgs() << "          speculated to: " << *NewPN << "\n");
> +  }
>
> -    // Stash this pointer if we've found an i8*.
> -    if (Ptr->getType()->isIntegerTy(8)) {
> -      Int8Ptr = Ptr;
> -      Int8PtrOffset = Offset;
> -    }
> +  /// Select instructions that use an alloca and are subsequently loaded can be
> +  /// rewritten to load both input pointers and then select between the result,
> +  /// allowing the load of the alloca to be promoted.
> +  /// From this:
> +  ///   %P2 = select i1 %cond, i32* %Alloca, i32* %Other
> +  ///   %V = load i32* %P2
> +  /// to:
> +  ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
> +  ///   %V2 = load i32* %Other
> +  ///   %V = select i1 %cond, i32 %V1, i32 %V2
> +  ///
> +  /// We can do this to a select if its only uses are loads and if the operand
> +  /// to the select can be loaded unconditionally.
> +  bool isSafeSelectToSpeculate(SelectInst &SI,
> +                               SmallVectorImpl<LoadInst *> &Loads) {
> +    Value *TValue = SI.getTrueValue();
> +    Value *FValue = SI.getFalseValue();
> +    bool TDerefable = TValue->isDereferenceablePointer();
> +    bool FDerefable = FValue->isDereferenceablePointer();
>
> -    // Peel off a layer of the pointer and update the offset appropriately.
> -    if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
> -      Ptr = cast<Operator>(Ptr)->getOperand(0);
> -    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
> -      if (GA->mayBeOverridden())
> -        break;
> -      Ptr = GA->getAliasee();
> -    } else {
> -      break;
> -    }
> -    assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
> -  } while (Visited.insert(Ptr));
> +    for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end();
> +         UI != UE; ++UI) {
> +      LoadInst *LI = dyn_cast<LoadInst>(*UI);
> +      if (LI == 0 || !LI->isSimple()) return false;
>
> -  if (!OffsetPtr) {
> -    if (!Int8Ptr) {
> -      Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(),
> -                                  Prefix + ".raw_cast");
> -      Int8PtrOffset = Offset;
> +      // Both operands to the select need to be dereferencable, either
> +      // absolutely (e.g. allocas) or at this point because we can see other
> +      // accesses to it.
> +      if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI,
> +                                                      LI->getAlignment(), &TD))
> +        return false;
> +      if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI,
> +                                                      LI->getAlignment(), &TD))
> +        return false;
> +      Loads.push_back(LI);
>      }
>
> -    OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr :
> -      IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
> -                            Prefix + ".raw_idx");
> +    return true;
>    }
> -  Ptr = OffsetPtr;
>
> -  // On the off chance we were targeting i8*, guard the bitcast here.
> -  if (Ptr->getType() != PointerTy)
> -    Ptr = IRB.CreateBitCast(Ptr, PointerTy, Prefix + ".cast");
> +  void visitSelectInst(SelectInst &SI) {
> +    DEBUG(dbgs() << "    original: " << SI << "\n");
> +    IRBuilder<> IRB(&SI);
>
> -  return Ptr;
> -}
> +    // If the select isn't safe to speculate, just use simple logic to emit it.
> +    SmallVector<LoadInst *, 4> Loads;
> +    if (!isSafeSelectToSpeculate(SI, Loads))
> +      return;
>
> -/// \brief Test whether the given alloca partition can be promoted to a vector.
> -///
> -/// This is a quick test to check whether we can rewrite a particular alloca
> -/// partition (and its newly formed alloca) into a vector alloca with only
> -/// whole-vector loads and stores such that it could be promoted to a vector
> -/// SSA value. We only can ensure this for a limited set of operations, and we
> -/// don't want to do the rewrites unless we are confident that the result will
> -/// be promotable, so we have an early test here.
> -static bool isVectorPromotionViable(const TargetData &TD,
> -                                    Type *AllocaTy,
> -                                    AllocaPartitioning &P,
> -                                    uint64_t PartitionBeginOffset,
> -                                    uint64_t PartitionEndOffset,
> -                                    AllocaPartitioning::const_use_iterator I,
> -                                    AllocaPartitioning::const_use_iterator E) {
> -  VectorType *Ty = dyn_cast<VectorType>(AllocaTy);
> -  if (!Ty)
> -    return false;
> +    Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) };
> +    AllocaPartitioning::iterator PIs[2];
> +    AllocaPartitioning::PartitionUse PUs[2];
> +    for (unsigned i = 0, e = 2; i != e; ++i) {
> +      PIs[i] = P.findPartitionForPHIOrSelectOperand(Ops[i]);
> +      if (PIs[i] != P.end()) {
> +        // If the pointer is within the partitioning, remove the select from
> +        // its uses. We'll add in the new loads below.
> +        AllocaPartitioning::use_iterator UI
> +          = P.findPartitionUseForPHIOrSelectOperand(Ops[i]);
> +        PUs[i] = *UI;
> +        // Clear out the use here so that the offsets into the use list remain
> +        // stable but this use is ignored when rewriting.
> +        UI->U = 0;
> +      }
> +    }
>
> -  uint64_t VecSize = TD.getTypeSizeInBits(Ty);
> -  uint64_t ElementSize = Ty->getScalarSizeInBits();
> +    Value *TV = SI.getTrueValue();
> +    Value *FV = SI.getFalseValue();
> +    // Replace the loads of the select with a select of two loads.
> +    while (!Loads.empty()) {
> +      LoadInst *LI = Loads.pop_back_val();
>
> -  // While the definition of LLVM vectors is bitpacked, we don't support sizes
> -  // that aren't byte sized.
> -  if (ElementSize % 8)
> -    return false;
> -  assert((VecSize % 8) == 0 && "vector size not a multiple of element size?");
> -  VecSize /= 8;
> -  ElementSize /= 8;
> +      IRB.SetInsertPoint(LI);
> +      LoadInst *TL =
> +        IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true");
> +      LoadInst *FL =
> +        IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false");
> +      NumLoadsSpeculated += 2;
>
> -  for (; I != E; ++I) {
> -    if (!I->U)
> -      continue; // Skip dead use.
> +      // Transfer alignment and TBAA info if present.
> +      TL->setAlignment(LI->getAlignment());
> +      FL->setAlignment(LI->getAlignment());
> +      if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) {
> +        TL->setMetadata(LLVMContext::MD_tbaa, Tag);
> +        FL->setMetadata(LLVMContext::MD_tbaa, Tag);
> +      }
>
> -    uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset;
> -    uint64_t BeginIndex = BeginOffset / ElementSize;
> -    if (BeginIndex * ElementSize != BeginOffset ||
> -        BeginIndex >= Ty->getNumElements())
> -      return false;
> -    uint64_t EndOffset = I->EndOffset - PartitionBeginOffset;
> -    uint64_t EndIndex = EndOffset / ElementSize;
> -    if (EndIndex * ElementSize != EndOffset ||
> -        EndIndex > Ty->getNumElements())
> -      return false;
> -
> -    // FIXME: We should build shuffle vector instructions to handle
> -    // non-element-sized accesses.
> -    if ((EndOffset - BeginOffset) != ElementSize &&
> -        (EndOffset - BeginOffset) != VecSize)
> -      return false;
> +      Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
> +                                  LI->getName() + ".sroa.speculated");
>
> -    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) {
> -      if (MI->isVolatile())
> -        return false;
> -      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) {
> -        const AllocaPartitioning::MemTransferOffsets &MTO
> -          = P.getMemTransferOffsets(*MTI);
> -        if (!MTO.IsSplittable)
> -          return false;
> +      LoadInst *Loads[2] = { TL, FL };
> +      for (unsigned i = 0, e = 2; i != e; ++i) {
> +        if (PIs[i] != P.end()) {
> +          Use *LoadUse = &Loads[i]->getOperandUse(0);
> +          assert(PUs[i].U->get() == LoadUse->get());
> +          PUs[i].U = LoadUse;
> +          P.use_push_back(PIs[i], PUs[i]);
> +        }
>        }
> -    } else if (I->U->get()->getType()->getPointerElementType()->isStructTy()) {
> -      // Disable vector promotion when there are loads or stores of an FCA.
> -      return false;
> -    } else if (!isa<LoadInst>(I->U->getUser()) &&
> -               !isa<StoreInst>(I->U->getUser())) {
> -      return false;
> +
> +      DEBUG(dbgs() << "          speculated to: " << *V << "\n");
> +      LI->replaceAllUsesWith(V);
> +      Pass.DeadInsts.push_back(LI);
>      }
>    }
> -  return true;
> +};
>  }
>
> -/// \brief Test whether the given alloca partition can be promoted to an int.
> +/// \brief Accumulate the constant offsets in a GEP into a single APInt offset.
>  ///
> -/// This is a quick test to check whether we can rewrite a particular alloca
> -/// partition (and its newly formed alloca) into an integer alloca suitable for
> -/// promotion to an SSA value. We only can ensure this for a limited set of
> -/// operations, and we don't want to do the rewrites unless we are confident
> -/// that the result will be promotable, so we have an early test here.
> -static bool isIntegerPromotionViable(const TargetData &TD,
> -                                     Type *AllocaTy,
> -                                     uint64_t AllocBeginOffset,
> -                                     AllocaPartitioning &P,
> -                                     AllocaPartitioning::const_use_iterator I,
> -                                     AllocaPartitioning::const_use_iterator E) {
> -  IntegerType *Ty = dyn_cast<IntegerType>(AllocaTy);
> -  if (!Ty || 8*TD.getTypeStoreSize(Ty) != Ty->getBitWidth())
> -    return false;
> -
> -  // Check the uses to ensure the uses are (likely) promoteable integer uses.
> -  // Also ensure that the alloca has a covering load or store. We don't want
> -  // promote because of some other unsplittable entry (which we may make
> -  // splittable later) and lose the ability to promote each element access.
> -  bool WholeAllocaOp = false;
> -  for (; I != E; ++I) {
> -    if (!I->U)
> -      continue; // Skip dead use.
> -
> -    // We can't reasonably handle cases where the load or store extends past
> -    // the end of the aloca's type and into its padding.
> -    if ((I->EndOffset - AllocBeginOffset) > TD.getTypeStoreSize(Ty))
> +/// If the provided GEP is all-constant, the total byte offset formed by the
> +/// GEP is computed and Offset is set to it. If the GEP has any non-constant
> +/// operands, the function returns false and the value of Offset is unmodified.
> +static bool accumulateGEPOffsets(const TargetData &TD, GEPOperator &GEP,
> +                                 APInt &Offset) {
> +  APInt GEPOffset(Offset.getBitWidth(), 0);
> +  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
> +       GTI != GTE; ++GTI) {
> +    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
> +    if (!OpC)
>        return false;
> +    if (OpC->isZero()) continue;
>
> -    if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) {
> -      if (LI->isVolatile() || !LI->getType()->isIntegerTy())
> -        return false;
> -      if (LI->getType() == Ty)
> -        WholeAllocaOp = true;
> -    } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) {
> -      if (SI->isVolatile() || !SI->getValueOperand()->getType()->isIntegerTy())
> -        return false;
> -      if (SI->getValueOperand()->getType() == Ty)
> -        WholeAllocaOp = true;
> -    } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) {
> -      if (MI->isVolatile())
> -        return false;
> -      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) {
> -        const AllocaPartitioning::MemTransferOffsets &MTO
> -          = P.getMemTransferOffsets(*MTI);
> -        if (!MTO.IsSplittable)
> -          return false;
> -      }
> -    } else {
> -      return false;
> +    // Handle a struct index, which adds its field offset to the pointer.
> +    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
> +      unsigned ElementIdx = OpC->getZExtValue();
> +      const StructLayout *SL = TD.getStructLayout(STy);
> +      GEPOffset += APInt(Offset.getBitWidth(),
> +                         SL->getElementOffset(ElementIdx));
> +      continue;
>      }
> +
> +    APInt TypeSize(Offset.getBitWidth(),
> +                   TD.getTypeAllocSize(GTI.getIndexedType()));
> +    if (VectorType *VTy = dyn_cast<VectorType>(*GTI)) {
> +      assert((VTy->getScalarSizeInBits() % 8) == 0 &&
> +             "vector element size is not a multiple of 8, cannot GEP over it");
> +      TypeSize = VTy->getScalarSizeInBits() / 8;
> +    }
> +
> +    GEPOffset += OpC->getValue().sextOrTrunc(Offset.getBitWidth()) * TypeSize;
>    }
> -  return WholeAllocaOp;
> +  Offset = GEPOffset;
> +  return true;
>  }
>
> -namespace {
> -/// \brief Visitor to speculate PHIs and Selects where possible.
> -class PHIOrSelectSpeculator : public InstVisitor<PHIOrSelectSpeculator> {
> -  // Befriend the base class so it can delegate to private visit methods.
> -  friend class llvm::InstVisitor<PHIOrSelectSpeculator>;
> +/// \brief Build a GEP out of a base pointer and indices.
> +///
> +/// This will return the BasePtr if that is valid, or build a new GEP
> +/// instruction using the IRBuilder if GEP-ing is needed.
> +static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr,
> +                       SmallVectorImpl<Value *> &Indices,
> +                       const Twine &Prefix) {
> +  if (Indices.empty())
> +    return BasePtr;
>
> -  const TargetData &TD;
> -  AllocaPartitioning &P;
> -  SROA &Pass;
> +  // A single zero index is a no-op, so check for this and avoid building a GEP
> +  // in that case.
> +  if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
> +    return BasePtr;
>
> -public:
> -  PHIOrSelectSpeculator(const TargetData &TD, AllocaPartitioning &P, SROA &Pass)
> -    : TD(TD), P(P), Pass(Pass) {}
> +  return IRB.CreateInBoundsGEP(BasePtr, Indices, Prefix + ".idx");
> +}
>
> -  /// \brief Visit the users of an alloca partition and rewrite them.
> -  void visitUsers(AllocaPartitioning::const_iterator PI) {
> -    // Note that we need to use an index here as the underlying vector of uses
> -    // may be grown during speculation. However, we never need to re-visit the
> -    // new uses, and so we can use the initial size bound.
> -    for (unsigned Idx = 0, Size = P.use_size(PI); Idx != Size; ++Idx) {
> -      const AllocaPartitioning::PartitionUse &PU = P.getUse(PI, Idx);
> -      if (!PU.U)
> -        continue; // Skip dead use.
> +/// \brief Get a natural GEP off of the BasePtr walking through Ty toward
> +/// TargetTy without changing the offset of the pointer.
> +///
> +/// This routine assumes we've already established a properly offset GEP with
> +/// Indices, and arrived at the Ty type. The goal is to continue to GEP with
> +/// zero-indices down through type layers until we find one the same as
> +/// TargetTy. If we can't find one with the same type, we at least try to use
> +/// one with the same size. If none of that works, we just produce the GEP as
> +/// indicated by Indices to have the correct offset.
> +static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const TargetData &TD,
> +                                    Value *BasePtr, Type *Ty, Type *TargetTy,
> +                                    SmallVectorImpl<Value *> &Indices,
> +                                    const Twine &Prefix) {
> +  if (Ty == TargetTy)
> +    return buildGEP(IRB, BasePtr, Indices, Prefix);
>
> -      visit(cast<Instruction>(PU.U->getUser()));
> +  // See if we can descend into a struct and locate a field with the correct
> +  // type.
> +  unsigned NumLayers = 0;
> +  Type *ElementTy = Ty;
> +  do {
> +    if (ElementTy->isPointerTy())
> +      break;
> +    if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) {
> +      ElementTy = SeqTy->getElementType();
> +      Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(), 0)));
> +    } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
> +      ElementTy = *STy->element_begin();
> +      Indices.push_back(IRB.getInt32(0));
> +    } else {
> +      break;
>      }
> -  }
> +    ++NumLayers;
> +  } while (ElementTy != TargetTy);
> +  if (ElementTy != TargetTy)
> +    Indices.erase(Indices.end() - NumLayers, Indices.end());
>
> -private:
> -  // By default, skip this instruction.
> -  void visitInstruction(Instruction &I) {}
> +  return buildGEP(IRB, BasePtr, Indices, Prefix);
> +}
>
> -  /// PHI instructions that use an alloca and are subsequently loaded can be
> -  /// rewritten to load both input pointers in the pred blocks and then PHI the
> -  /// results, allowing the load of the alloca to be promoted.
> -  /// From this:
> -  ///   %P2 = phi [i32* %Alloca, i32* %Other]
> -  ///   %V = load i32* %P2
> -  /// to:
> -  ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
> -  ///   ...
> -  ///   %V2 = load i32* %Other
> -  ///   ...
> -  ///   %V = phi [i32 %V1, i32 %V2]
> -  ///
> -  /// We can do this to a select if its only uses are loads and if the operands
> -  /// to the select can be loaded unconditionally.
> -  ///
> -  /// FIXME: This should be hoisted into a generic utility, likely in
> -  /// Transforms/Util/Local.h
> -  bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) {
> -    // For now, we can only do this promotion if the load is in the same block
> -    // as the PHI, and if there are no stores between the phi and load.
> -    // TODO: Allow recursive phi users.
> -    // TODO: Allow stores.
> -    BasicBlock *BB = PN.getParent();
> -    unsigned MaxAlign = 0;
> -    for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end();
> -         UI != UE; ++UI) {
> -      LoadInst *LI = dyn_cast<LoadInst>(*UI);
> -      if (LI == 0 || !LI->isSimple()) return false;
> +/// \brief Recursively compute indices for a natural GEP.
> +///
> +/// This is the recursive step for getNaturalGEPWithOffset that walks down the
> +/// element types adding appropriate indices for the GEP.
> +static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const TargetData &TD,
> +                                       Value *Ptr, Type *Ty, APInt &Offset,
> +                                       Type *TargetTy,
> +                                       SmallVectorImpl<Value *> &Indices,
> +                                       const Twine &Prefix) {
> +  if (Offset == 0)
> +    return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices, Prefix);
>
> -      // For now we only allow loads in the same block as the PHI.  This is
> -      // a common case that happens when instcombine merges two loads through
> -      // a PHI.
> -      if (LI->getParent() != BB) return false;
> +  // We can't recurse through pointer types.
> +  if (Ty->isPointerTy())
> +    return 0;
>
> -      // Ensure that there are no instructions between the PHI and the load that
> -      // could store.
> -      for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI)
> -        if (BBI->mayWriteToMemory())
> -          return false;
> +  // We try to analyze GEPs over vectors here, but note that these GEPs are
> +  // extremely poorly defined currently. The long-term goal is to remove GEPing
> +  // over a vector from the IR completely.
> +  if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
> +    unsigned ElementSizeInBits = VecTy->getScalarSizeInBits();
> +    if (ElementSizeInBits % 8)
> +      return 0; // GEPs over non-multiple of 8 size vector elements are invalid.
> +    APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
> +    APInt NumSkippedElements = Offset.udiv(ElementSize);
> +    if (NumSkippedElements.ugt(VecTy->getNumElements()))
> +      return 0;
> +    Offset -= NumSkippedElements * ElementSize;
> +    Indices.push_back(IRB.getInt(NumSkippedElements));
> +    return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(),
> +                                    Offset, TargetTy, Indices, Prefix);
> +  }
>
> -      MaxAlign = std::max(MaxAlign, LI->getAlignment());
> -      Loads.push_back(LI);
> -    }
> +  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
> +    Type *ElementTy = ArrTy->getElementType();
> +    APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy));
> +    APInt NumSkippedElements = Offset.udiv(ElementSize);
> +    if (NumSkippedElements.ugt(ArrTy->getNumElements()))
> +      return 0;
>
> -    // We can only transform this if it is safe to push the loads into the
> -    // predecessor blocks. The only thing to watch out for is that we can't put
> -    // a possibly trapping load in the predecessor if it is a critical edge.
> -    for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num;
> -         ++Idx) {
> -      TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
> -      Value *InVal = PN.getIncomingValue(Idx);
> +    Offset -= NumSkippedElements * ElementSize;
> +    Indices.push_back(IRB.getInt(NumSkippedElements));
> +    return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
> +                                    Indices, Prefix);
> +  }
>
> -      // If the value is produced by the terminator of the predecessor (an
> -      // invoke) or it has side-effects, there is no valid place to put a load
> -      // in the predecessor.
> -      if (TI == InVal || TI->mayHaveSideEffects())
> -        return false;
> +  StructType *STy = dyn_cast<StructType>(Ty);
> +  if (!STy)
> +    return 0;
>
> -      // If the predecessor has a single successor, then the edge isn't
> -      // critical.
> -      if (TI->getNumSuccessors() == 1)
> -        continue;
> +  const StructLayout *SL = TD.getStructLayout(STy);
> +  uint64_t StructOffset = Offset.getZExtValue();
> +  if (StructOffset >= SL->getSizeInBytes())
> +    return 0;
> +  unsigned Index = SL->getElementContainingOffset(StructOffset);
> +  Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
> +  Type *ElementTy = STy->getElementType(Index);
> +  if (Offset.uge(TD.getTypeAllocSize(ElementTy)))
> +    return 0; // The offset points into alignment padding.
>
> -      // If this pointer is always safe to load, or if we can prove that there
> -      // is already a load in the block, then we can move the load to the pred
> -      // block.
> -      if (InVal->isDereferenceablePointer() ||
> -          isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD))
> -        continue;
> +  Indices.push_back(IRB.getInt32(Index));
> +  return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
> +                                  Indices, Prefix);
> +}
>
> -      return false;
> -    }
> +/// \brief Get a natural GEP from a base pointer to a particular offset and
> +/// resulting in a particular type.
> +///
> +/// The goal is to produce a "natural" looking GEP that works with the existing
> +/// composite types to arrive at the appropriate offset and element type for
> +/// a pointer. TargetTy is the element type the returned GEP should point-to if
> +/// possible. We recurse by decreasing Offset, adding the appropriate index to
> +/// Indices, and setting Ty to the result subtype.
> +///
> +/// If no natural GEP can be constructed, this function returns null.
> +static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const TargetData &TD,
> +                                      Value *Ptr, APInt Offset, Type *TargetTy,
> +                                      SmallVectorImpl<Value *> &Indices,
> +                                      const Twine &Prefix) {
> +  PointerType *Ty = cast<PointerType>(Ptr->getType());
>
> -    return true;
> -  }
> +  // Don't consider any GEPs through an i8* as natural unless the TargetTy is
> +  // an i8.
> +  if (Ty == IRB.getInt8PtrTy() && TargetTy->isIntegerTy(8))
> +    return 0;
>
> -  void visitPHINode(PHINode &PN) {
> -    DEBUG(dbgs() << "    original: " << PN << "\n");
> +  Type *ElementTy = Ty->getElementType();
> +  if (!ElementTy->isSized())
> +    return 0; // We can't GEP through an unsized element.
> +  APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy));
> +  if (ElementSize == 0)
> +    return 0; // Zero-length arrays can't help us build a natural GEP.
> +  APInt NumSkippedElements = Offset.udiv(ElementSize);
>
> -    SmallVector<LoadInst *, 4> Loads;
> -    if (!isSafePHIToSpeculate(PN, Loads))
> -      return;
> +  Offset -= NumSkippedElements * ElementSize;
> +  Indices.push_back(IRB.getInt(NumSkippedElements));
> +  return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
> +                                  Indices, Prefix);
> +}
>
> -    assert(!Loads.empty());
> +/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
> +/// resulting pointer has PointerTy.
> +///
> +/// This tries very hard to compute a "natural" GEP which arrives at the offset
> +/// and produces the pointer type desired. Where it cannot, it will try to use
> +/// the natural GEP to arrive at the offset and bitcast to the type. Where that
> +/// fails, it will try to use an existing i8* and GEP to the byte offset and
> +/// bitcast to the type.
> +///
> +/// The strategy for finding the more natural GEPs is to peel off layers of the
> +/// pointer, walking back through bit casts and GEPs, searching for a base
> +/// pointer from which we can compute a natural GEP with the desired
> +/// properities. The algorithm tries to fold as many constant indices into
> +/// a single GEP as possible, thus making each GEP more independent of the
> +/// surrounding code.
> +static Value *getAdjustedPtr(IRBuilder<> &IRB, const TargetData &TD,
> +                             Value *Ptr, APInt Offset, Type *PointerTy,
> +                             const Twine &Prefix) {
> +  // Even though we don't look through PHI nodes, we could be called on an
> +  // instruction in an unreachable block, which may be on a cycle.
> +  SmallPtrSet<Value *, 4> Visited;
> +  Visited.insert(Ptr);
> +  SmallVector<Value *, 4> Indices;
>
> -    Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
> -    IRBuilder<> PHIBuilder(&PN);
> -    PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
> -                                          PN.getName() + ".sroa.speculated");
> +  // We may end up computing an offset pointer that has the wrong type. If we
> +  // never are able to compute one directly that has the correct type, we'll
> +  // fall back to it, so keep it around here.
> +  Value *OffsetPtr = 0;
>
> -    // Get the TBAA tag and alignment to use from one of the loads.  It doesn't
> -    // matter which one we get and if any differ, it doesn't matter.
> -    LoadInst *SomeLoad = cast<LoadInst>(Loads.back());
> -    MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
> -    unsigned Align = SomeLoad->getAlignment();
> +  // Remember any i8 pointer we come across to re-use if we need to do a raw
> +  // byte offset.
> +  Value *Int8Ptr = 0;
> +  APInt Int8PtrOffset(Offset.getBitWidth(), 0);
>
> -    // Rewrite all loads of the PN to use the new PHI.
> -    do {
> -      LoadInst *LI = Loads.pop_back_val();
> -      LI->replaceAllUsesWith(NewPN);
> -      Pass.DeadInsts.push_back(LI);
> -    } while (!Loads.empty());
> +  Type *TargetTy = PointerTy->getPointerElementType();
>
> -    // Inject loads into all of the pred blocks.
> -    for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
> -      BasicBlock *Pred = PN.getIncomingBlock(Idx);
> -      TerminatorInst *TI = Pred->getTerminator();
> -      Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx));
> -      Value *InVal = PN.getIncomingValue(Idx);
> -      IRBuilder<> PredBuilder(TI);
> +  do {
> +    // First fold any existing GEPs into the offset.
> +    while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
> +      APInt GEPOffset(Offset.getBitWidth(), 0);
> +      if (!accumulateGEPOffsets(TD, *GEP, GEPOffset))
> +        break;
> +      Offset += GEPOffset;
> +      Ptr = GEP->getPointerOperand();
> +      if (!Visited.insert(Ptr))
> +        break;
> +    }
>
> -      LoadInst *Load
> -        = PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." +
> -                                         Pred->getName()));
> -      ++NumLoadsSpeculated;
> -      Load->setAlignment(Align);
> -      if (TBAATag)
> -        Load->setMetadata(LLVMContext::MD_tbaa, TBAATag);
> -      NewPN->addIncoming(Load, Pred);
> +    // See if we can perform a natural GEP here.
> +    Indices.clear();
> +    if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy,
> +                                           Indices, Prefix)) {
> +      if (P->getType() == PointerTy) {
> +        // Zap any offset pointer that we ended up computing in previous rounds.
> +        if (OffsetPtr && OffsetPtr->use_empty())
> +          if (Instruction *I = dyn_cast<Instruction>(OffsetPtr))
> +            I->eraseFromParent();
> +        return P;
> +      }
> +      if (!OffsetPtr) {
> +        OffsetPtr = P;
> +      }
> +    }
>
> -      Instruction *Ptr = dyn_cast<Instruction>(InVal);
> -      if (!Ptr)
> -        // No uses to rewrite.
> -        continue;
> +    // Stash this pointer if we've found an i8*.
> +    if (Ptr->getType()->isIntegerTy(8)) {
> +      Int8Ptr = Ptr;
> +      Int8PtrOffset = Offset;
> +    }
>
> -      // Try to lookup and rewrite any partition uses corresponding to this phi
> -      // input.
> -      AllocaPartitioning::iterator PI
> -        = P.findPartitionForPHIOrSelectOperand(InUse);
> -      if (PI == P.end())
> -        continue;
> +    // Peel off a layer of the pointer and update the offset appropriately.
> +    if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
> +      Ptr = cast<Operator>(Ptr)->getOperand(0);
> +    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
> +      if (GA->mayBeOverridden())
> +        break;
> +      Ptr = GA->getAliasee();
> +    } else {
> +      break;
> +    }
> +    assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
> +  } while (Visited.insert(Ptr));
>
> -      // Replace the Use in the PartitionUse for this operand with the Use
> -      // inside the load.
> -      AllocaPartitioning::use_iterator UI
> -        = P.findPartitionUseForPHIOrSelectOperand(InUse);
> -      assert(isa<PHINode>(*UI->U->getUser()));
> -      UI->U = &Load->getOperandUse(Load->getPointerOperandIndex());
> +  if (!OffsetPtr) {
> +    if (!Int8Ptr) {
> +      Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(),
> +                                  Prefix + ".raw_cast");
> +      Int8PtrOffset = Offset;
>      }
> -    DEBUG(dbgs() << "          speculated to: " << *NewPN << "\n");
> +
> +    OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr :
> +      IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
> +                            Prefix + ".raw_idx");
>    }
> +  Ptr = OffsetPtr;
>
> -  /// Select instructions that use an alloca and are subsequently loaded can be
> -  /// rewritten to load both input pointers and then select between the result,
> -  /// allowing the load of the alloca to be promoted.
> -  /// From this:
> -  ///   %P2 = select i1 %cond, i32* %Alloca, i32* %Other
> -  ///   %V = load i32* %P2
> -  /// to:
> -  ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
> -  ///   %V2 = load i32* %Other
> -  ///   %V = select i1 %cond, i32 %V1, i32 %V2
> -  ///
> -  /// We can do this to a select if its only uses are loads and if the operand
> -  /// to the select can be loaded unconditionally.
> -  bool isSafeSelectToSpeculate(SelectInst &SI,
> -                               SmallVectorImpl<LoadInst *> &Loads) {
> -    Value *TValue = SI.getTrueValue();
> -    Value *FValue = SI.getFalseValue();
> -    bool TDerefable = TValue->isDereferenceablePointer();
> -    bool FDerefable = FValue->isDereferenceablePointer();
> +  // On the off chance we were targeting i8*, guard the bitcast here.
> +  if (Ptr->getType() != PointerTy)
> +    Ptr = IRB.CreateBitCast(Ptr, PointerTy, Prefix + ".cast");
>
> -    for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end();
> -         UI != UE; ++UI) {
> -      LoadInst *LI = dyn_cast<LoadInst>(*UI);
> -      if (LI == 0 || !LI->isSimple()) return false;
> +  return Ptr;
> +}
>
> -      // Both operands to the select need to be dereferencable, either
> -      // absolutely (e.g. allocas) or at this point because we can see other
> -      // accesses to it.
> -      if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI,
> -                                                      LI->getAlignment(), &TD))
> -        return false;
> -      if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI,
> -                                                      LI->getAlignment(), &TD))
> -        return false;
> -      Loads.push_back(LI);
> -    }
> +/// \brief Test whether the given alloca partition can be promoted to a vector.
> +///
> +/// This is a quick test to check whether we can rewrite a particular alloca
> +/// partition (and its newly formed alloca) into a vector alloca with only
> +/// whole-vector loads and stores such that it could be promoted to a vector
> +/// SSA value. We only can ensure this for a limited set of operations, and we
> +/// don't want to do the rewrites unless we are confident that the result will
> +/// be promotable, so we have an early test here.
> +static bool isVectorPromotionViable(const TargetData &TD,
> +                                    Type *AllocaTy,
> +                                    AllocaPartitioning &P,
> +                                    uint64_t PartitionBeginOffset,
> +                                    uint64_t PartitionEndOffset,
> +                                    AllocaPartitioning::const_use_iterator I,
> +                                    AllocaPartitioning::const_use_iterator E) {
> +  VectorType *Ty = dyn_cast<VectorType>(AllocaTy);
> +  if (!Ty)
> +    return false;
>
> -    return true;
> -  }
> +  uint64_t VecSize = TD.getTypeSizeInBits(Ty);
> +  uint64_t ElementSize = Ty->getScalarSizeInBits();
>
> -  void visitSelectInst(SelectInst &SI) {
> -    DEBUG(dbgs() << "    original: " << SI << "\n");
> -    IRBuilder<> IRB(&SI);
> +  // While the definition of LLVM vectors is bitpacked, we don't support sizes
> +  // that aren't byte sized.
> +  if (ElementSize % 8)
> +    return false;
> +  assert((VecSize % 8) == 0 && "vector size not a multiple of element size?");
> +  VecSize /= 8;
> +  ElementSize /= 8;
>
> -    // If the select isn't safe to speculate, just use simple logic to emit it.
> -    SmallVector<LoadInst *, 4> Loads;
> -    if (!isSafeSelectToSpeculate(SI, Loads))
> -      return;
> +  for (; I != E; ++I) {
> +    if (!I->U)
> +      continue; // Skip dead use.
>
> -    Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) };
> -    AllocaPartitioning::iterator PIs[2];
> -    AllocaPartitioning::PartitionUse PUs[2];
> -    for (unsigned i = 0, e = 2; i != e; ++i) {
> -      PIs[i] = P.findPartitionForPHIOrSelectOperand(Ops[i]);
> -      if (PIs[i] != P.end()) {
> -        // If the pointer is within the partitioning, remove the select from
> -        // its uses. We'll add in the new loads below.
> -        AllocaPartitioning::use_iterator UI
> -          = P.findPartitionUseForPHIOrSelectOperand(Ops[i]);
> -        PUs[i] = *UI;
> -        // Clear out the use here so that the offsets into the use list remain
> -        // stable but this use is ignored when rewriting.
> -        UI->U = 0;
> +    uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset;
> +    uint64_t BeginIndex = BeginOffset / ElementSize;
> +    if (BeginIndex * ElementSize != BeginOffset ||
> +        BeginIndex >= Ty->getNumElements())
> +      return false;
> +    uint64_t EndOffset = I->EndOffset - PartitionBeginOffset;
> +    uint64_t EndIndex = EndOffset / ElementSize;
> +    if (EndIndex * ElementSize != EndOffset ||
> +        EndIndex > Ty->getNumElements())
> +      return false;
> +
> +    // FIXME: We should build shuffle vector instructions to handle
> +    // non-element-sized accesses.
> +    if ((EndOffset - BeginOffset) != ElementSize &&
> +        (EndOffset - BeginOffset) != VecSize)
> +      return false;
> +
> +    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) {
> +      if (MI->isVolatile())
> +        return false;
> +      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) {
> +        const AllocaPartitioning::MemTransferOffsets &MTO
> +          = P.getMemTransferOffsets(*MTI);
> +        if (!MTO.IsSplittable)
> +          return false;
>        }
> +    } else if (I->U->get()->getType()->getPointerElementType()->isStructTy()) {
> +      // Disable vector promotion when there are loads or stores of an FCA.
> +      return false;
> +    } else if (!isa<LoadInst>(I->U->getUser()) &&
> +               !isa<StoreInst>(I->U->getUser())) {
> +      return false;
>      }
> +  }
> +  return true;
> +}
>
> -    Value *TV = SI.getTrueValue();
> -    Value *FV = SI.getFalseValue();
> -    // Replace the loads of the select with a select of two loads.
> -    while (!Loads.empty()) {
> -      LoadInst *LI = Loads.pop_back_val();
> -
> -      IRB.SetInsertPoint(LI);
> -      LoadInst *TL =
> -        IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true");
> -      LoadInst *FL =
> -        IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false");
> -      NumLoadsSpeculated += 2;
> +/// \brief Test whether the given alloca partition can be promoted to an int.
> +///
> +/// This is a quick test to check whether we can rewrite a particular alloca
> +/// partition (and its newly formed alloca) into an integer alloca suitable for
> +/// promotion to an SSA value. We only can ensure this for a limited set of
> +/// operations, and we don't want to do the rewrites unless we are confident
> +/// that the result will be promotable, so we have an early test here.
> +static bool isIntegerPromotionViable(const TargetData &TD,
> +                                     Type *AllocaTy,
> +                                     uint64_t AllocBeginOffset,
> +                                     AllocaPartitioning &P,
> +                                     AllocaPartitioning::const_use_iterator I,
> +                                     AllocaPartitioning::const_use_iterator E) {
> +  IntegerType *Ty = dyn_cast<IntegerType>(AllocaTy);
> +  if (!Ty || 8*TD.getTypeStoreSize(Ty) != Ty->getBitWidth())
> +    return false;
>
> -      // Transfer alignment and TBAA info if present.
> -      TL->setAlignment(LI->getAlignment());
> -      FL->setAlignment(LI->getAlignment());
> -      if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) {
> -        TL->setMetadata(LLVMContext::MD_tbaa, Tag);
> -        FL->setMetadata(LLVMContext::MD_tbaa, Tag);
> -      }
> +  // Check the uses to ensure the uses are (likely) promoteable integer uses.
> +  // Also ensure that the alloca has a covering load or store. We don't want
> +  // promote because of some other unsplittable entry (which we may make
> +  // splittable later) and lose the ability to promote each element access.
> +  bool WholeAllocaOp = false;
> +  for (; I != E; ++I) {
> +    if (!I->U)
> +      continue; // Skip dead use.
>
> -      Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
> -                                  LI->getName() + ".sroa.speculated");
> +    // We can't reasonably handle cases where the load or store extends past
> +    // the end of the aloca's type and into its padding.
> +    if ((I->EndOffset - AllocBeginOffset) > TD.getTypeStoreSize(Ty))
> +      return false;
>
> -      LoadInst *Loads[2] = { TL, FL };
> -      for (unsigned i = 0, e = 2; i != e; ++i) {
> -        if (PIs[i] != P.end()) {
> -          Use *LoadUse = &Loads[i]->getOperandUse(0);
> -          assert(PUs[i].U->get() == LoadUse->get());
> -          PUs[i].U = LoadUse;
> -          P.use_push_back(PIs[i], PUs[i]);
> -        }
> +    if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) {
> +      if (LI->isVolatile() || !LI->getType()->isIntegerTy())
> +        return false;
> +      if (LI->getType() == Ty)
> +        WholeAllocaOp = true;
> +    } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) {
> +      if (SI->isVolatile() || !SI->getValueOperand()->getType()->isIntegerTy())
> +        return false;
> +      if (SI->getValueOperand()->getType() == Ty)
> +        WholeAllocaOp = true;
> +    } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) {
> +      if (MI->isVolatile())
> +        return false;
> +      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) {
> +        const AllocaPartitioning::MemTransferOffsets &MTO
> +          = P.getMemTransferOffsets(*MTI);
> +        if (!MTO.IsSplittable)
> +          return false;
>        }
> -
> -      DEBUG(dbgs() << "          speculated to: " << *V << "\n");
> -      LI->replaceAllUsesWith(V);
> -      Pass.DeadInsts.push_back(LI);
> +    } else {
> +      return false;
>      }
>    }
> -};
> +  return WholeAllocaOp;
> +}
>
> +namespace {
>  /// \brief Visitor to rewrite instructions using a partition of an alloca to
>  /// use a new alloca.
>  ///
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits


