[llvm-commits] PATCH: A new SROA implementation

Chandler Carruth chandlerc at gmail.com
Wed Aug 29 23:59:09 PDT 2012


More inline responses, updated patch still to come.

On Thu, Aug 23, 2012 at 12:58 PM, Duncan Sands <baldrick at free.fr> wrote:

> Hi Chandler,
>
>
> On 22/08/12 07:46, Chandler Carruth wrote:
>
>> Ok, I think this patch is pretty close to being ready to go into the tree,
>> modulo any detailed code review folks are willing to do. Just send me
>> comments
>> if you have them.
>>
>
> more comments, picking up where I left off last night.  Well, there is still
> a chunk I didn't go through yet; maybe tomorrow.
>
>  +namespace {
>> +// InstVisitor to rewrite instructions using a particular region of an alloca
>> +// to use a new alloca.
>> +class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter,
>> +                                                   bool> {
>> +  // Befriend the base class so it can delegate to private visit methods.
>> +  friend class llvm::InstVisitor<AllocaPartitionRewriter, bool>;
>> +
>> +  const TargetData &TD;
>> +  AllocaPartitioning &P;
>> +  SROA &Pass;
>> +  AllocaInst &OldAI, &NewAI;
>> +  const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
>> +
>> +  // If we are rewriting an alloca partition which can be written as pure
>> +  // vector operations, we stash extra information here.
>>
>
> It would be nice to have a summary of what is known in that case.  The later
> code seems to assume that you never have to deal with only part of a vector
> element for example.
>

Yea, this is confusing. I actually had a "cleaner" design before where we had
totally separate rewriting for the vector-promotable case, so it was obvious
what assumptions were being made. However, that required duplicating a
massive amount of code from the normal rewriter, so I ended up merging them
with these extra bits of state.

Let me know if the comments I'm adding aren't enough.
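
In the meantime, here is the shape of what I believe is guaranteed once VecTy
is non-null (a sketch of my reading of isVectorPromotionViable, with a
hypothetical helper name, not code from the patch):

  #include <cstdint>

  // Sketch only: every use of a vector-promotable partition is assumed to
  // either cover the whole new alloca or exactly one element starting on an
  // element boundary. getIndex() relies on this when it asserts that the
  // relative offset divides evenly by ElementSize.
  static bool useIsElementAligned(uint64_t BeginOffset, uint64_t EndOffset,
                                  uint64_t AllocaBegin, uint64_t AllocaEnd,
                                  uint64_t ElementSize) {
    if (BeginOffset == AllocaBegin && EndOffset == AllocaEnd)
      return true;                                   // Whole-vector access.
    return (BeginOffset - AllocaBegin) % ElementSize == 0 &&
           EndOffset - BeginOffset == ElementSize;   // Single-element access.
  }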


>
>  +  VectorType *VecTy;
>> +  Type *ElementTy;
>> +  uint64_t ElementSize;
>> +
>> +  // The offset of the partition user currently being rewritten.
>> +  uint64_t BeginOffset, EndOffset;
>> +  Instruction *OldPtr;
>> +
>> +  // The name prefix to use when rewriting instructions for this alloca.
>> +  std::string NamePrefix;
>> +
>> +public:
>> +  AllocaPartitionRewriter(const TargetData &TD, AllocaPartitioning &P,
>> +                          AllocaPartitioning::iterator PI,
>> +                          SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI,
>> +                          uint64_t NewBeginOffset, uint64_t NewEndOffset)
>> +    : TD(TD), P(P), Pass(Pass),
>> +      OldAI(OldAI), NewAI(NewAI),
>> +      NewAllocaBeginOffset(NewBeginOffset),
>> +      NewAllocaEndOffset(NewEndOffset),
>> +      VecTy(), ElementTy(), ElementSize(),
>> +      BeginOffset(), EndOffset() {
>> +  }
>> +
>> +  bool visitUsers(AllocaPartitioning::const_use_iterator I,
>> +                  AllocaPartitioning::const_use_iterator E) {
>> +    if (isVectorPromotionViable(TD, NewAI.getAllocatedType(), P,
>> +                                NewAllocaBeginOffset, NewAllocaEndOffset,
>> +                                I, E)) {
>> +      ++NumVectorized;
>> +      VecTy = cast<VectorType>(NewAI.getAllocatedType());
>> +      ElementTy = VecTy->getElementType();
>> +      ElementSize = TD.getTypeAllocSize(ElementTy);
>> +    }
>> +    bool CanSROA = true;
>> +    for (; I != E; ++I) {
>> +      BeginOffset = I->BeginOffset;
>> +      EndOffset = I->EndOffset;
>> +      OldPtr = I->Ptr;
>> +      NamePrefix = (Twine(NewAI.getName()) + "." + Twine(BeginOffset)).str();
>> +      CanSROA &= visit(I->User);
>> +    }
>> +    if (VecTy) {
>> +      assert(CanSROA);
>> +      VecTy = 0;
>> +      ElementTy = 0;
>> +      ElementSize = 0;
>> +    }
>> +    return CanSROA;
>> +  }
>> +
>> +private:
>> +  // Every instruction which can end up as a user must have a rewrite rule.
>> +  bool visitInstruction(Instruction &I) {
>> +    DEBUG(dbgs() << "    !!!! Cannot rewrite: " << I << "\n");
>> +    llvm_unreachable("No rewrite rule for this instruction!");
>> +  }
>> +
>> +  Twine getName(const Twine &Suffix) {
>> +    return NamePrefix + Suffix;
>> +  }
>> +
>> +  Value *getAdjustedAllocaPtr(IRBuilder<> IRB, Type *PointerTy) {
>> +    assert(BeginOffset >= NewAllocaBeginOffset);
>> +    APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset);
>> +    return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy, getName(""));
>> +  }
>> +
>> +  ConstantInt *getIndex(uint64_t Offset) {
>> +    assert(VecTy && "Can only call getIndex when rewriting a vector");
>> +    uint64_t RelOffset = Offset - NewAllocaBeginOffset;
>> +    uint64_t Index = RelOffset / ElementSize;
>> +    assert(Index * ElementSize == RelOffset);
>> +    return ConstantInt::get(Type::getInt32Ty(VecTy->getContext()), Index);
>> +  }
>> +
>> +  void deleteIfTriviallyDead(Value *V) {
>> +    Instruction *I = cast<Instruction>(V);
>> +    if (isInstructionTriviallyDead(I))
>> +      Pass.DeadInsts.push_back(I);
>> +  }
>> +
>> +  Value *getValueCast(IRBuilder<> IRB, Value *V, Type *Ty) {
>> +    if (V->getType()->isIntegerTy() && Ty->isPointerTy())
>> +      return IRB.CreateIntToPtr(V, Ty);
>> +    if (V->getType()->isPointerTy() && Ty->isIntegerTy())
>> +      return IRB.CreatePtrToInt(V, Ty);
>> +
>> +    return IRB.CreateBitCast(V, Ty);
>>
>
> Maybe you can use getCastOpcode here.
>

I'm a bit torn here, but ultimately I think I prefer the narrower set of
accepted casts. It keeps my code honest and shouldn't let questionable
patterns sneak through as easily.
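
For contrast, a minimal sketch of what the looser alternative would look like
(illustration only, not something proposed for the patch): CastInst::getCastOpcode
will pick width-changing casts such as trunc or zext when the types call for
them, while getValueCast above only ever emits inttoptr, ptrtoint, or bitcast.

  // Sketch: the "accept any castable pair of types" variant, for comparison.
  static Value *castLoosely(IRBuilder<> &IRB, Value *V, Type *Ty) {
    Instruction::CastOps Op =
        CastInst::getCastOpcode(V, /*SrcIsSigned=*/false, Ty,
                                /*DstIsSigned=*/false);
    return IRB.CreateCast(Op, V, Ty);
  }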


>
>  +  }
>> +
>> +  bool rewriteVectorizedLoadInst(IRBuilder<> IRB, LoadInst &LI, Value *OldOp) {
>> +    Value *Result;
>> +    if (LI.getType() == VecTy->getElementType() ||
>> +        BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) {
>> +      Result
>> +        = IRB.CreateExtractElement(IRB.CreateLoad(&NewAI, getName(".load")),
>> +                                   getIndex(BeginOffset),
>> +                                   getName(".extract"));
>> +    } else {
>> +      Result = IRB.CreateLoad(&NewAI, getName(".load"));
>> +    }
>> +    if (Result->getType() != LI.getType())
>> +      Result = getValueCast(IRB, Result, LI.getType());
>> +    LI.replaceAllUsesWith(Result);
>> +    Pass.DeadInsts.push_back(&LI);
>> +
>> +    DEBUG(dbgs() << "          to: " << *Result << "\n");
>> +    return true;
>> +  }
>> +
>> +  bool visitLoadInst(LoadInst &LI) {
>> +    DEBUG(dbgs() << "    original: " << LI << "\n");
>> +    Value *OldOp = LI.getOperand(0);
>> +    assert(OldOp == OldPtr);
>> +    IRBuilder<> IRB(&LI);
>> +
>> +    if (VecTy)
>> +      return rewriteVectorizedLoadInst(IRB, LI, OldOp);
>> +
>> +    Value *NewPtr = getAdjustedAllocaPtr(IRB,
>> +                                         LI.getPointerOperand()->getType());
>> +    LI.setOperand(0, NewPtr);
>> +    DEBUG(dbgs() << "          to: " << LI << "\n");
>> +
>> +    deleteIfTriviallyDead(OldOp);
>> +    return NewPtr == &NewAI && !LI.isVolatile();
>> +  }
>> +
>> +  bool rewriteVectorizedStoreInst(IRBuilder<> IRB, StoreInst &SI,
>> +                                  Value *OldOp) {
>> +    Value *V = SI.getValueOperand();
>> +    if (V->getType() == ElementTy ||
>> +        BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) {
>> +      if (V->getType() != ElementTy)
>> +        V = getValueCast(IRB, V, ElementTy);
>> +      V = IRB.CreateInsertElement(IRB.CreateLoad(&NewAI, getName(".load")), V,
>> +                                  getIndex(BeginOffset),
>> +                                  getName(".insert"));
>> +    } else if (V->getType() != VecTy) {
>> +      V = getValueCast(IRB, V, VecTy);
>> +    }
>> +    StoreInst *Store = IRB.CreateStore(V, &NewAI);
>> +    Pass.DeadInsts.push_back(&SI);
>> +
>> +    (void)Store;
>> +    DEBUG(dbgs() << "          to: " << *Store << "\n");
>> +    return true;
>> +  }
>> +
>> +  bool visitStoreInst(StoreInst &SI) {
>> +    DEBUG(dbgs() << "    original: " << SI << "\n");
>> +    Value *OldOp = SI.getOperand(1);
>> +    assert(OldOp == OldPtr);
>> +    IRBuilder<> IRB(&SI);
>> +
>> +    if (VecTy)
>> +      return rewriteVectorizedStoreInst(IRB, SI, OldOp);
>> +
>> +    Value *NewPtr = getAdjustedAllocaPtr(IRB,
>> +                                         SI.getPointerOperand()->getType());
>> +    SI.setOperand(1, NewPtr);
>> +    DEBUG(dbgs() << "          to: " << SI << "\n");
>> +
>> +    deleteIfTriviallyDead(OldOp);
>> +    return NewPtr == &NewAI && !SI.isVolatile();
>> +  }
>> +
>> +  bool visitMemSetInst(MemSetInst &II) {
>> +    DEBUG(dbgs() << "    original: " << II << "\n");
>> +    IRBuilder<> IRB(&II);
>> +    assert(II.getRawDest() == OldPtr);
>> +
>> +    // Record this instruction for deletion.
>> +    if (Pass.DeadSplitInsts.insert(&II))
>> +      Pass.DeadInsts.push_back(&II);
>> +
>> +    Type *AllocaTy = NewAI.getAllocatedType();
>> +    Type *ScalarTy = AllocaTy->getScalarType();
>> +
>> +    // If this doesn't map cleanly onto the alloca type, and that type isn't
>> +    // a single value type, just emit a memcpy.
>>
>
> memcpy -> memset
>
>  +    if (!VecTy && (BeginOffset != NewAllocaBeginOffset ||
>> +                   EndOffset != NewAllocaEndOffset ||
>> +                   !AllocaTy->isSingleValueType() ||
>> +                   !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)))) {
>> +      Type *SizeTy = II.getLength()->getType();
>> +      Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset);
>> +
>> +      CallInst *New
>> +        = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB,
>> +                                                II.getRawDest()->getType()),
>> +                           II.getValue(), Size, II.getAlignment(),
>> +                           II.isVolatile());
>> +      (void)New;
>> +      DEBUG(dbgs() << "          to: " << *New << "\n");
>> +      return false;
>> +    }
>> +
>> +    // If we can represent this as a simple value, we have to build the actual
>> +    // value to store, which requires expanding the byte present in memset to
>> +    // a sensible representation for the alloca type. This is essentially
>> +    // splatting the byte to a sufficiently wide integer, bitcasting to the
>> +    // desired scalar type, and splatting it across any desired vector type.
>>
>
> How useful is it for you to do this kind of thing?  Won't it happen later in
> instcombine anyway?
>

I don't think so, and we really want to do this here because it exposes an
alloca promotion opportunity. The whole point of the rewriter is to make
things suitable for promotion.
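
As a concrete illustration of why (worked example, not part of the patch): a
memset of an i32-typed alloca with the byte 0xAB has to become a store of the
splatted value 0xABABABAB before mem2reg can turn the alloca into an SSA
value. The multiply-by-0x01010101 trick mirrors the arithmetic the quoted
code below builds with CreateMul and ConstantExpr::getUDiv:

  #include <cassert>
  #include <cstdint>

  int main() {
    uint8_t Byte = 0xAB;
    // allOnes(i32) / zext(allOnes(i8)) == 0xFFFFFFFF / 0xFF == 0x01010101.
    uint32_t Splat = uint32_t(Byte) * (0xFFFFFFFFu / 0xFFu);
    assert(Splat == 0xABABABABu);
    // Storing this single i32, instead of keeping the memset call, is what
    // leaves the alloca with only simple loads and stores to promote.
    return 0;
  }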


>  +    Value *V = II.getValue();
>> +    IntegerType *VTy = cast<IntegerType>(V->getType());
>> +    Type *IntTy = Type::getIntNTy(VTy->getContext(),
>> +                                  TD.getTypeSizeInBits(ScalarTy));
>> +    if (TD.getTypeSizeInBits(ScalarTy) > VTy->getBitWidth())
>> +      V = IRB.CreateMul(IRB.CreateZExt(V, IntTy, getName(".zext")),
>> +                        ConstantExpr::getUDiv(
>> +                          Constant::getAllOnesValue(IntTy),
>> +                          ConstantExpr::getZExt(
>> +                            Constant::getAllOnesValue(V->getType()),
>> +                            IntTy)),
>> +                        getName(".isplat"));
>> +    if (V->getType() != ScalarTy) {
>> +      if (ScalarTy->isPointerTy())
>> +        V = IRB.CreateIntToPtr(V, ScalarTy);
>> +      else if (ScalarTy->isPrimitiveType() || ScalarTy->isVectorTy())
>> +        V = IRB.CreateBitCast(V, ScalarTy);
>> +      else if (ScalarTy->isIntegerTy())
>> +        llvm_unreachable("Computed different integer types with equal
>> widths");
>> +      else
>> +        llvm_unreachable("Invalid scalar type");
>> +    }
>> +
>> +    // If this is an element-wide memset of a vectorizable alloca, insert it.
>> +    if (VecTy && (BeginOffset > NewAllocaBeginOffset ||
>> +                  EndOffset < NewAllocaEndOffset)) {
>> +      StoreInst *Store = IRB.CreateStore(
>> +        IRB.CreateInsertElement(IRB.CreateLoad(&NewAI, getName(".load")), V,
>> +                                getIndex(BeginOffset),
>> +                                getName(".insert")),
>> +        &NewAI);
>> +      (void)Store;
>> +      DEBUG(dbgs() << "          to: " << *Store << "\n");
>> +      return true;
>> +    }
>> +
>> +    // Splat to a vector if needed.
>> +    if (VectorType *VecTy = dyn_cast<VectorType>(AllocaTy)) {
>> +      VectorType *SplatSourceTy = VectorType::get(V->getType(), 1);
>> +      V = IRB.CreateShuffleVector(
>> +        IRB.CreateInsertElement(UndefValue::get(SplatSourceTy), V,
>> +                                ConstantInt::get(IRB.getInt32Ty(), 0),
>> +                                getName(".vsplat.insert")),
>> +        UndefValue::get(SplatSourceTy),
>> +        ConstantVector::getSplat(VecTy->getNumElements(),
>> +                                 ConstantInt::get(IRB.getInt32Ty(), 0)),
>> +        getName(".vsplat.shuffle"));
>> +      assert(V->getType() == VecTy);
>> +    }
>> +
>> +    Value *New = IRB.CreateStore(V, &NewAI, II.isVolatile());
>> +    (void)New;
>> +    DEBUG(dbgs() << "          to: " << *New << "\n");
>> +    return !II.isVolatile();
>> +  }
>> +
>> +  bool visitMemTransferInst(MemTransferInst &II) {
>> +    // Rewriting of memory transfer instructions can be a bit tricky. We break
>> +    // them into two categories: split intrinsics and unsplit intrinsics.
>> +
>> +    DEBUG(dbgs() << "    original: " << II << "\n");
>> +    IRBuilder<> IRB(&II);
>> +
>> +    const AllocaPartitioning::MemTransferOffsets &MTO
>> +      = P.getMemTransferOffsets(II);
>> +
>> +    // For unsplit intrinsics, we simply modify the source and destination
>> +    // pointers in place. This isn't just an optimization, it is a matter of
>> +    // correctness. With unsplit intrinsics we may be dealing with transfers
>> +    // within a single alloca before SROA ran. We may also be dealing with
>> +    // memmove instead of memcpy, and so simply updating the pointers is
>> +    // necessary for us to update both source and dest of a single call.
>> +    if (!MTO.IsSplittable) {
>> +      Value *OldOp = BeginOffset == MTO.DestBegin ? II.getRawDest()
>> +                                                  : II.getRawSource();
>> +      if (BeginOffset == MTO.DestBegin)
>> +        II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType()));
>> +      else
>> +        II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType()));
>> +
>> +      DEBUG(dbgs() << "          to: " << II << "\n");
>> +      deleteIfTriviallyDead(OldOp);
>> +      return false;
>> +    }
>> +    // For split transfer intrinsics we have an incredibly useful assurance:
>> +    // the source and destination do not reside within the same alloca, and
>> +    // at least one of them does not escape. This means that we can replace
>> +    // memmove with memcpy, and we don't need to worry about all manner of
>> +    // downsides to splitting and transforming the operations.
>> +
>> +    assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr);
>> +    bool IsDest = II.getRawDest() == OldPtr;
>> +    // Compute the relative offset within the transfer.
>> +    unsigned IntPtrWidth = TD.getPointerSizeInBits();
>> +    APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin
>> +                                                       : MTO.SourceBegin));
>> +
>> +    // If this doesn't map cleanly onto the alloca type, and that type isn't
>> +    // a single value type, just emit a memcpy.
>>
>
> Do you care if it is not a *legal* single value type?
>

Not really. The only single-value types we end up with as the alloca type are
types that were already in use in the source program. Because we only ever
re-use types, we should only form things that the source code and/or frontend
already considered plausible.


>
>  +    bool EmitMemCpy
>> +      = !VecTy && (BeginOffset != NewAllocaBeginOffset ||
>> +                   EndOffset != NewAllocaEndOffset ||
>> +                   !NewAI.getAllocatedType()->isSingleValueType());
>> +
>> +    // If we're just going to emit a memcpy and the alloca hasn't changed,
>> +    // this is a no-op.
>> +    if (EmitMemCpy && &OldAI == &NewAI) {
>> +      // Assert that we are in fact not changing the copy size.
>> +      if (IsDest) {
>> +        assert(BeginOffset == MTO.DestBegin);
>> +        assert(EndOffset == MTO.DestEnd);
>> +      } else {
>> +        assert(BeginOffset == MTO.SourceBegin);
>> +        assert(EndOffset == MTO.SourceEnd);
>> +      }
>> +      return false;
>> +    }
>> +    // Record this instruction for deletion.
>> +    if (Pass.DeadSplitInsts.insert(&II))
>> +      Pass.DeadInsts.push_back(&II);
>> +
>> +    bool IsVectorElement = VecTy && (BeginOffset > NewAllocaBeginOffset ||
>> +                                     EndOffset < NewAllocaEndOffset);
>>
>
> In the case of VecTy what do you know here?  I'm kind of lost disentangling
> all the different cases that need to be handled, and which bits of code are
> dealing with what exactly.
>

Yea, see my comment above. Do my new comments in the code help? If not, any
concrete ideas about how to improve this? I've tried a few things, and worry
that I'm stuck in a personal local optimum, missing some simplification I've
not yet thought of...


>  +
>> +    Type *OtherPtrTy = IsDest ? II.getRawSource()->getType()
>> +                              : II.getRawDest()->getType();
>> +    if (!EmitMemCpy)
>> +      OtherPtrTy = IsVectorElement ? VecTy->getElementType()->getPointerTo()
>> +                                   : NewAI.getType();
>> +
>> +    // Compute the other pointer, folding as much as possible to produce
>> +    // a single, simple GEP in most cases.
>> +    Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
>> +    OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy,
>> +                              getName("." + OtherPtr->getName()));
>> +
>> +    // Strip all inbounds GEPs and pointer casts to try to dig out any root
>> +    // alloca that should be re-examined after rewriting this instruction.
>> +    if (AllocaInst *AI
>> +          = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets()))
>> +      Pass.Worklist.insert(AI);
>> +
>> +    if (EmitMemCpy) {
>> +      Value *OurPtr
>> +        = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType()
>> +                                           : II.getRawSource()->getType());
>> +      Type *SizeTy = II.getLength()->getType();
>> +      Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset);
>> +
>> +      CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr,
>> +                                       IsDest ? OtherPtr : OurPtr,
>> +                                       Size, II.getAlignment(),
>> +                                       II.isVolatile());
>> +      (void)New;
>> +      DEBUG(dbgs() << "          to: " << *New << "\n");
>> +      return false;
>> +    }
>> +
>> +    Value *SrcPtr = OtherPtr;
>> +    Value *DstPtr = &NewAI;
>> +    if (!IsDest)
>> +      std::swap(SrcPtr, DstPtr);
>> +
>> +    Value *Src;
>> +    if (IsVectorElement && !IsDest) {
>> +      // We have to extract rather than load.
>> +      Src = IRB.CreateExtractElement(IRB.CreateLoad(SrcPtr,
>> +                                                    getName(".copyload")),
>> +                                     getIndex(BeginOffset),
>> +                                     getName(".copyextract"));
>> +    } else {
>> +      Src = IRB.CreateLoad(SrcPtr, II.isVolatile(), getName(".copyload"));
>> +    }
>> +
>> +    if (IsVectorElement && IsDest) {
>> +      // We have to insert into a loaded copy before storing.
>> +      Src = IRB.CreateInsertElement(IRB.CreateLoad(&NewAI, getName(".load")),
>> +                                    Src, getIndex(BeginOffset),
>> +                                    getName(".insert"));
>> +    }
>> +
>> +    Value *Store = IRB.CreateStore(Src, DstPtr, II.isVolatile());
>> +    (void)Store;
>> +    DEBUG(dbgs() << "          to: " << *Store << "\n");
>> +    return !II.isVolatile();
>> +  }
>> +
>> +  bool visitIntrinsicInst(IntrinsicInst &II) {
>> +    assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
>> +           II.getIntrinsicID() == Intrinsic::lifetime_end);
>> +    DEBUG(dbgs() << "    original: " << II << "\n");
>> +    IRBuilder<> IRB(&II);
>> +    assert(II.getArgOperand(1) == OldPtr);
>> +
>> +    // Record this instruction for deletion.
>> +    if (Pass.DeadSplitInsts.insert(&II))
>> +      Pass.DeadInsts.push_back(&II);
>> +
>> +    ConstantInt *Size
>> +      = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
>> +                         EndOffset - BeginOffset);
>> +    Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType());
>> +    Value *New;
>> +    if (II.getIntrinsicID() == Intrinsic::lifetime_start)
>> +      New = IRB.CreateLifetimeStart(Ptr, Size);
>> +    else
>> +      New = IRB.CreateLifetimeEnd(Ptr, Size);
>> +
>> +    DEBUG(dbgs() << "          to: " << *New << "\n");
>> +    return true;
>> +  }
>> +
>> +  /// PHI instructions that use an alloca and are subsequently loaded can be
>> +  /// rewritten to load both input pointers in the pred blocks and then PHI
>> +  /// the results, allowing the load of the alloca to be promoted.
>> +  /// From this:
>> +  ///   %P2 = phi [i32* %Alloca, i32* %Other]
>> +  ///   %V = load i32* %P2
>> +  /// to:
>> +  ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
>> +  ///   ...
>> +  ///   %V2 = load i32* %Other
>> +  ///   ...
>> +  ///   %V = phi [i32 %V1, i32 %V2]
>> +  ///
>> +  /// We can do this to a select if its only uses are loads and if the operand
>> +  /// to the select can be loaded unconditionally.
>> +  ///
>> +  /// FIXME: This should be hoisted into a generic utility, likely in
>> +  /// Transforms/Util/Local.h
>> +  bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) {
>> +    // For now, we can only do this promotion if the load is in the same block
>> +    // as the PHI, and if there are no stores between the phi and load.
>> +    // TODO: Allow recursive phi users.
>> +    // TODO: Allow stores.
>> +    BasicBlock *BB = PN.getParent();
>> +    unsigned MaxAlign = 0;
>> +    for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end();
>> +         UI != UE; ++UI) {
>> +      LoadInst *LI = dyn_cast<LoadInst>(*UI);
>> +      if (LI == 0 || !LI->isSimple()) return false;
>> +
>> +      // For now we only allow loads in the same block as the PHI.  This is
>> +      // a common case that happens when instcombine merges two loads through
>> +      // a PHI.
>> +      if (LI->getParent() != BB) return false;
>> +
>> +      // Ensure that there are no instructions between the PHI and the load
>> +      // that could store.
>> +      for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI)
>> +        if (BBI->mayWriteToMemory())
>> +          return false;
>> +
>> +      MaxAlign = std::max(MaxAlign, LI->getAlignment());
>> +      Loads.push_back(LI);
>> +    }
>> +
>> +    // We can only transform this if it is safe to push the loads into the
>> +    // predecessor blocks. The only thing to watch out for is that we can't
>> +    // put a possibly trapping load in the predecessor if it is a critical
>> +    // edge.
>> +    for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num;
>> +         ++Idx) {
>> +      TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
>> +      Value *InVal = PN.getIncomingValue(Idx);
>> +
>> +      // If the value is produced by the terminator of the predecessor (an
>> +      // invoke) or it has siide-effects, there is no valid place to put a load
>>
>
> siide-effects -> side-effects
>
>  +      // in the predecessor.
>> +      if (TI == InVal || TI->mayHaveSideEffects())
>> +        return false;
>> +
>> +      // If the predecessor has a single successor, then the edge isn't
>> +      // critical.
>> +      if (TI->getNumSuccessors() == 1)
>> +        continue;
>> +
>> +      // If this pointer is always safe to load, or if we can prove that
>> +      // there is already a load in the block, then we can move the load to
>> +      // the pred block.
>> +      if (InVal->isDereferenceablePointer() ||
>> +          isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD))
>> +        continue;
>> +
>> +      return false;
>> +    }
>> +
>> +    return true;
>> +  }
>> +
>> +  bool visitPHINode(PHINode &PN) {
>> +    DEBUG(dbgs() << "    original: " << PN << "\n");
>>
>> +
>> +    // Find the operand we need to rewrite here.
>> +    User::op_iterator OI = PN.op_begin(), OE = PN.op_end();
>> +    for (; OI != OE && *OI != OldPtr; ++OI)
>> +      ;
>> +    assert(OI != OE);
>> +
>> +    IRBuilder<> Builder(&PN);
>> +
>> +    SmallVector<LoadInst *, 4> Loads;
>> +    if (!isSafePHIToSpeculate(PN, Loads)) {
>> +      Value *NewPtr = getAdjustedAllocaPtr(Builder, (*OI)->getType());
>> +      *OI = NewPtr;
>> +      DEBUG(dbgs() << "          to: " << PN << "\n");
>> +      deleteIfTriviallyDead(OldPtr);
>> +      return false;
>> +    }
>> +    assert(!Loads.empty());
>> +
>> +    Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
>> +    PHINode *NewPN = Builder.CreatePHI(LoadTy, PN.getNumIncomingValues());
>> +    NewPN->takeName(&PN);
>> +
>> +    // Get the TBAA tag and alignment to use from one of the loads.  It
>> +    // doesn't matter which one we get and if any differ, it doesn't matter.
>>
>
> Why doesn't it matter?  In essence you are saying that the tbaa tag is a
> property of the value being loaded, not a property of the load.  But is this
> warranted?  It may be true for how clang produces tbaa tags, but that's not
> what matters...  In fact, even then, is it safe?
>

I have no idea. This is not new code; it was lifted directly from the
existing SROA. I think Dan is looking at changing all of this too.


>
>  +    LoadInst *SomeLoad = cast<LoadInst>(Loads.back());
>> +    MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
>> +    unsigned Align = SomeLoad->getAlignment();
>> +    Value *NewPtr = getAdjustedAllocaPtr(Builder, OldPtr->getType());
>> +
>> +    // Rewrite all loads of the PN to use the new PHI.
>> +    do {
>> +      LoadInst *LI = Loads.pop_back_val();
>> +      LI->replaceAllUsesWith(NewPN);
>> +      Pass.DeadInsts.push_back(LI);
>> +    } while (!Loads.empty());
>> +
>> +    // Inject loads into all of the pred blocks.
>> +    for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num;
>> +         ++Idx) {
>> +      BasicBlock *Pred = PN.getIncomingBlock(Idx);
>> +      TerminatorInst *TI = Pred->getTerminator();
>> +      Value *InVal = PN.getIncomingValue(Idx);
>> +      IRBuilder<> PredBuilder(TI);
>> +
>> +      // Map the value to the new alloca pointer if this was the old alloca
>> +      // pointer.
>> +      bool ThisOperand = InVal == OldPtr;
>> +      if (ThisOperand)
>> +        InVal = NewPtr;
>> +
>> +      LoadInst *Load
>> +        = PredBuilder.CreateLoad(InVal, getName(".sroa.speculate." +
>> +                                                Pred->getName()));
>> +      ++NumLoadsSpeculated;
>> +      Load->setAlignment(Align);
>> +      if (TBAATag)
>> +        Load->setMetadata(LLVMContext::MD_tbaa, TBAATag);
>> +      NewPN->addIncoming(Load, Pred);
>> +
>> +      if (ThisOperand)
>> +        continue;
>> +      Instruction *OtherPtr = dyn_cast<Instruction>(InVal);
>> +      if (!OtherPtr)
>> +        // No uses to rewrite.
>> +        continue;
>> +
>> +      // Try to lookup and rewrite any partition uses corresponding to this
>> +      // phi input.
>> +      AllocaPartitioning::iterator PI
>> +        = P.findPartitionForPHIOrSelectOperand(PN, OtherPtr);
>> +      if (PI != P.end()) {
>> +        // If the other pointer is within the partitioning, replace the PHI
>> +        // in its uses with the load we just speculated, or add another load
>> +        // for it to rewrite if we've already replaced the PHI.
>> +        AllocaPartitioning::use_iterator UI
>> +          = P.findPartitionUseForPHIOrSelectOperand(PN, OtherPtr);
>> +        if (isa<PHINode>(*UI->User))
>> +          UI->User = Load;
>> +        else {
>> +          AllocaPartitioning::PartitionUse OtherUse = *UI;
>> +          OtherUse.User = Load;
>> +          P.use_insert(PI, std::upper_bound(UI, P.use_end(PI), OtherUse),
>> +                       OtherUse);
>> +        }
>> +      }
>> +    }
>> +    DEBUG(dbgs() << "          speculated to: " << *NewPN << "\n");
>> +    return true;
>> +  }
>> +
>> +  /// Select instructions that use an alloca and are subsequently loaded can
>> +  /// be rewritten to load both input pointers and then select between the
>> +  /// result, allowing the load of the alloca to be promoted.
>> +  /// From this:
>> +  ///   %P2 = select i1 %cond, i32* %Alloca, i32* %Other
>> +  ///   %V = load i32* %P2
>> +  /// to:
>> +  ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
>> +  ///   %V2 = load i32* %Other
>> +  ///   %V = select i1 %cond, i32 %V1, i32 %V2
>> +  ///
>> +  /// We can do this to a select if its only uses are loads and if the operand
>> +  /// to the select can be loaded unconditionally.
>> +  bool isSafeSelectToSpeculate(SelectInst &SI,
>> +                               SmallVectorImpl<LoadInst *> &Loads) {
>> +    Value *TValue = SI.getTrueValue();
>> +    Value *FValue = SI.getFalseValue();
>> +    bool TDerefable = TValue->isDereferenceablePointer();
>> +    bool FDerefable = FValue->isDereferenceablePointer();
>> +
>> +    for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end();
>> +         UI != UE; ++UI) {
>> +      LoadInst *LI = dyn_cast<LoadInst>(*UI);
>> +      if (LI == 0 || !LI->isSimple()) return false;
>> +
>> +      // Both operands to the select need to be dereferenceable, either
>> +      // absolutely (e.g. allocas) or at this point because we can see other
>> +      // accesses to it.
>> +      if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI,
>> +                                                      LI->getAlignment(), &TD))
>> +        return false;
>> +      if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI,
>> +                                                      LI->getAlignment(), &TD))
>> +        return false;
>> +      Loads.push_back(LI);
>> +    }
>> +
>> +    return true;
>> +  }
>> +
>> +  bool visitSelectInst(SelectInst &SI) {
>> +    DEBUG(dbgs() << "    original: " << SI << "\n");
>>
>> +
>> +    // Find the operand we need to rewrite here.
>> +    bool IsTrueVal = SI.getTrueValue() == OldPtr;
>>
>
> How about an assert that if it isn't the true value then it must be the false
> value?  And how about an assert that it isn't both values?
>

Yea, this is particularly nice, because if we just miss an optimization
several steps earlier, the assumption stops holding.
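
For the archives, one way those checks might be spelled (a sketch of Duncan's
suggestion, not necessarily the exact form that will land in the updated
patch):

  Value *TrueVal = SI.getTrueValue(), *FalseVal = SI.getFalseValue();
  assert((TrueVal == OldPtr || FalseVal == OldPtr) &&
         "Pointer being rewritten is not an operand of this select");
  assert(!(TrueVal == OldPtr && FalseVal == OldPtr) &&
         "Select over a single pointer should have been folded earlier");
  bool IsTrueVal = TrueVal == OldPtr;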


>
>  +    Value *NewPtr = getAdjustedAllocaPtr(IRBuilder<>(&SI), OldPtr->getType());
>> +
>> +    // If the select isn't safe to speculate, just use simple logic to emit it.
>> +    SmallVector<LoadInst *, 4> Loads;
>> +    if (!isSafeSelectToSpeculate(SI, Loads)) {
>> +      SI.setOperand(IsTrueVal ? 1 : 2, NewPtr);
>> +      DEBUG(dbgs() << "          to: " << SI << "\n");
>> +      deleteIfTriviallyDead(OldPtr);
>> +      return false;
>> +    }
>> +
>> +    Value *OtherPtr = IsTrueVal ? SI.getFalseValue() : SI.getTrueValue();
>> +    AllocaPartitioning::iterator PI
>> +      = P.findPartitionForPHIOrSelectOperand(SI, OtherPtr);
>> +    AllocaPartitioning::PartitionUse OtherUse;
>> +    if (PI != P.end()) {
>> +      // If the other pointer is within the partitioning, remove the select
>> +      // from its uses. We'll add in the new loads below.
>> +      AllocaPartitioning::use_iterator UI
>> +        = P.findPartitionUseForPHIOrSelectOperand(SI, OtherPtr);
>> +      OtherUse = *UI;
>> +      P.use_erase(PI, UI);
>>
>
> Having to keep track of uses of other partitions like this looks like a pain
> and seems fragile.  Do you really need to precompute all uses for all
> partitions?  If you could just visit one partition at a time, compute its
> uses and rewrite them, then you wouldn't have to worry about the other
> partitions like this.  Is something like that feasible?
>

We actually talked about this some in another context, but to recap: not
really.

The problem is that the only way to compute the uses of a partition is to
walk the entire recursive user set for the original alloca. Because this is
a recursive user chain walk (mmm linked lists), it is *really* slow. Doing it
once per partition would make the increased complexity of this algorithm
much more problematic. One of the things I've worked very hard on is to
minimize the impact of the aggressive splitting by ensuring that everything
in the complexity domain of the number of splits has quite high locality and
low constant factors. Rotating the use mapping to be per-partition in a
dense data structure is a big part of this.
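
To sketch the data-structure side of that trade-off (my simplification, with
hypothetical names rather than the patch's actual interfaces): the partitions
are kept sorted by offset, so once the single use-walk has computed a use's
offset range, bucketing it is a cheap binary search into a dense array instead
of another pass over the pointer-chasing use graph.

  #include <algorithm>
  #include <cassert>
  #include <cstdint>
  #include <vector>

  struct Partition { uint64_t BeginOffset, EndOffset; };

  // Map an offset to the (sorted, non-overlapping) partition containing it.
  static size_t findPartitionIdx(const std::vector<Partition> &Partitions,
                                 uint64_t Offset) {
    auto It = std::upper_bound(
        Partitions.begin(), Partitions.end(), Offset,
        [](uint64_t O, const Partition &P) { return O < P.BeginOffset; });
    assert(It != Partitions.begin() && "Offset precedes the first partition");
    return static_cast<size_t>(It - Partitions.begin()) - 1;
  }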


>  +    }
>> +
>> +    Value *TV = IsTrueVal ? NewPtr : SI.getTrueValue();
>> +    Value *FV = IsTrueVal ? SI.getFalseValue() : NewPtr;
>> +    // Replace the loads of the select with a select of two loads.
>> +    while (!Loads.empty()) {
>> +      LoadInst *LI = Loads.pop_back_val();
>> +
>> +      IRBuilder<> Builder(LI);
>> +      LoadInst *TL =
>> +        Builder.CreateLoad(TV, getName("." + LI->getName() + ".true"));
>> +      LoadInst *FL =
>> +        Builder.CreateLoad(FV, getName("." + LI->getName() + ".false"));
>> +      NumLoadsSpeculated += 2;
>> +      if (PI != P.end()) {
>> +        LoadInst *OtherLoad = IsTrueVal ? FL : TL;
>> +        assert(OtherUse.Ptr == OtherLoad->getOperand(0));
>> +        OtherUse.User = OtherLoad;
>> +        P.use_insert(PI, P.use_end(PI), OtherUse);
>> +      }
>> +
>> +      // Transfer alignment and TBAA info if present.
>> +      TL->setAlignment(LI->getAlignment());
>> +      FL->setAlignment(LI->getAlignment());
>> +      if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) {
>> +        TL->setMetadata(LLVMContext::MD_tbaa, Tag);
>> +        FL->setMetadata(LLVMContext::MD_tbaa, Tag);
>> +      }
>> +
>> +      Value *V = Builder.CreateSelect(SI.getCondition(), TL, FL);
>> +      V->takeName(LI);
>> +      DEBUG(dbgs() << "          speculated to: " << *V << "\n");
>> +      LI->replaceAllUsesWith(V);
>> +      Pass.DeadInsts.push_back(LI);
>> +    }
>> +    if (PI != P.end())
>> +      std::stable_sort(P.use_begin(PI), P.use_end(PI));
>> +
>> +    deleteIfTriviallyDead(OldPtr);
>> +    return NewPtr == &NewAI;
>> +  }
>> +
>> +};
>> +}
>>
>
> Ciao, Duncan.
>