[llvm] [SROA] Use tree-structure merge to remove alloca (PR #152793)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 27 13:27:49 PDT 2025
================
@@ -2811,6 +2902,220 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
return CanSROA;
}
+ /// Attempts to rewrite a partition using tree-structured merge optimization.
+ ///
+ /// This function analyzes a partition to determine if it can be optimized
+ /// using a tree-structured merge pattern, where multiple non-overlapping
+ /// stores completely fill an alloca. And there is no load from the alloca in
+ /// the middle of the stores. Such patterns can be optimized by eliminating
+ /// the intermediate stores and directly constructing the final vector by
+ /// using shufflevectors.
+ ///
+ /// Example transformation:
+ /// Before: (stores do not have to be in order)
+ /// %alloca = alloca <8 x float>
+ /// store <2 x float> %val0, ptr %alloca ; offset 0-1
+ /// store <2 x float> %val2, ptr %alloca+16 ; offset 4-5
+ /// store <2 x float> %val1, ptr %alloca+8 ; offset 2-3
+ /// store <2 x float> %val3, ptr %alloca+24 ; offset 6-7
+ ///
+ /// After:
+ /// %alloca = alloca <8 x float>
+ /// %shuffle0 = shufflevector %val0, %val1, <4 x i32> <i32 0, i32 1, i32 2,
+ /// i32 3>
+ /// %shuffle1 = shufflevector %val2, %val3, <4 x i32> <i32 0, i32 1, i32 2,
+ /// i32 3>
+ /// %shuffle2 = shufflevector %shuffle0, %shuffle1, <8 x i32> <i32 0, i32 1,
+ /// i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ /// store %shuffle2, ptr %alloca
+ ///
+ /// The optimization looks for partitions that:
+ /// 1. Have no overlapping split slice tails
+ /// 2. Contain non-overlapping stores that cover the entire alloca
+ /// 3. Have exactly one load that reads the complete alloca structure and not
+ /// in the middle of the stores (TODO: maybe we can relax the constraint
+ /// about reading the entire alloca structure)
+ ///
+ /// \param P The partition to analyze and potentially rewrite
+ /// \return An optional vector of values that were deleted during the rewrite
+ /// process, or std::nullopt if the partition cannot be optimized
+ /// using tree-structured merge
+ std::optional<SmallVector<Value *, 4>>
+ rewriteTreeStructuredMerge(Partition &P) {
+ // No tail slices that overlap with the partition
+ if (P.splitSliceTails().size() > 0)
+ return std::nullopt;
+
+ SmallVector<Value *, 4> DeletedValues;
+ LoadInst *TheLoad = nullptr;
+
+ // Structure to hold store information
+ struct StoreInfo {
+ StoreInst *Store;
+ uint64_t BeginOffset;
+ uint64_t EndOffset;
+ Value *StoredValue;
+ StoreInfo(StoreInst *SI, uint64_t Begin, uint64_t End, Value *Val)
+ : Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val) {}
+ };
+
+ SmallVector<StoreInfo, 4> StoreInfos;
+
+ // The alloca must be a fixed vector type
+ Type *AllocatedEltTy = nullptr;
+ if (auto *FixedVecTy = dyn_cast<FixedVectorType>(NewAI.getAllocatedType()))
+ AllocatedEltTy = FixedVecTy->getElementType();
+ else
+ return std::nullopt;
+ // If the allocated element type is a pointer, we do not handle it
+ // TODO: handle this case by using inttoptr/ptrtoint
+ if (AllocatedEltTy->isPtrOrPtrVectorTy())
+ return std::nullopt;
+
+ for (Slice &S : P) {
+ auto *User = cast<Instruction>(S.getUse()->getUser());
+ if (auto *LI = dyn_cast<LoadInst>(User)) {
+ // Do not handle the case where there is more than one load
+ // TODO: maybe we can handle this case
+ if (TheLoad)
+ return std::nullopt;
+ // If load is not a fixed vector type, we do not handle it
+ // If the number of loaded bits is not the same as the new alloca type
+ // size, we do not handle it
+ auto *FixedVecTy = dyn_cast<FixedVectorType>(LI->getType());
+ if (!FixedVecTy)
+ return std::nullopt;
+ if (DL.getTypeSizeInBits(FixedVecTy) !=
+ DL.getTypeSizeInBits(NewAI.getAllocatedType()))
+ return std::nullopt;
+ // If the loaded value is a pointer, we do not handle it
+ // TODO: handle this case by using inttoptr/ptrtoint
+ if (FixedVecTy->getElementType()->isPtrOrPtrVectorTy())
+ return std::nullopt;
+ TheLoad = LI;
+ } else if (auto *SI = dyn_cast<StoreInst>(User)) {
+ // The stored value should be a fixed vector type
+ Type *StoredValueType = SI->getValueOperand()->getType();
+ if (!isa<FixedVectorType>(StoredValueType))
+ return std::nullopt;
+
+ // The total number of stored bits should be the multiple of the new
+ // alloca element type size
+ if (DL.getTypeSizeInBits(StoredValueType) %
+ DL.getTypeSizeInBits(AllocatedEltTy) !=
+ 0)
+ return std::nullopt;
+ // If the stored value is a pointer, we do not handle it
+ // TODO: handle this case by using inttoptr/ptrtoint
+ if (StoredValueType->isPtrOrPtrVectorTy())
+ return std::nullopt;
+ StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(),
+ SI->getValueOperand());
+ } else {
+ // If we have instructions other than load and store, we cannot do the
+ // tree structured merge
+ return std::nullopt;
+ }
+ }
+ // If we do not have any load, we cannot do the tree structured merge
+ if (!TheLoad)
+ return std::nullopt;
+
+ // If we do not have multiple stores, we cannot do the tree structured merge
+ if (StoreInfos.size() < 2)
+ return std::nullopt;
+
+ // Stores should not overlap and should cover the whole alloca
+ // Sort by begin offset
+ llvm::sort(StoreInfos, [](const StoreInfo &A, const StoreInfo &B) {
+ return A.BeginOffset < B.BeginOffset;
+ });
+
+ // Check for overlaps and coverage
+ uint64_t ExpectedStart = NewAllocaBeginOffset;
+ TypeSize TotalStoreBits = TypeSize::getZero();
+ for (auto &StoreInfo : StoreInfos) {
+ uint64_t BeginOff = StoreInfo.BeginOffset;
+ uint64_t EndOff = StoreInfo.EndOffset;
+
+ // Check for gap or overlap
+ if (BeginOff != ExpectedStart)
+ return std::nullopt;
+
+ ExpectedStart = EndOff;
+ TotalStoreBits +=
+ DL.getTypeSizeInBits(StoreInfo.Store->getValueOperand()->getType());
+ }
+ // Check that stores cover the entire alloca
+ // We need check both the end offset and the total store bits
+ if (ExpectedStart != NewAllocaEndOffset ||
+ TotalStoreBits != DL.getTypeSizeInBits(NewAI.getAllocatedType()))
+ return std::nullopt;
+
+ // Stores should be in the same basic block
+ // The load should not be in the middle of the stores
+ BasicBlock *LoadBB = TheLoad->getParent();
+ BasicBlock *StoreBB = StoreInfos[0].Store->getParent();
+
+ for (auto &StoreInfo : StoreInfos) {
+ if (StoreInfo.Store->getParent() != StoreBB)
+ return std::nullopt;
+ if (LoadBB == StoreBB && !StoreInfo.Store->comesBefore(TheLoad))
+ return std::nullopt;
+ }
+
+ // If we reach here, the partition can be merged with a tree structured
+ // merge
+ LLVM_DEBUG({
+ dbgs() << "Tree structured merge rewrite:\n Load: " << *TheLoad
+ << "\n Ordered stores:\n";
+ for (auto [i, Info] : enumerate(StoreInfos))
+ dbgs() << " [" << i << "] Range[" << Info.BeginOffset << ", "
+ << Info.EndOffset << ") \tStore: " << *Info.Store
+ << "\tValue: " << *Info.StoredValue << "\n";
+ });
+
+ // Instead of having these stores, we merge all the stored values into a
+ // vector and store the merged value into the alloca
+ std::queue<Value *> VecElements;
+ IRBuilder<> Builder(StoreInfos.back().Store);
+ for (const auto &Info : StoreInfos) {
+ DeletedValues.push_back(Info.Store);
+ VecElements.push(Info.StoredValue);
+ }
+
+ LLVM_DEBUG(dbgs() << " Rewrite stores into shufflevectors:\n");
+ while (VecElements.size() > 1) {
+ uint64_t NumElts = VecElements.size();
+ for (uint64_t i = 0; i < NumElts / 2; i++) {
+ Value *V0 = VecElements.front();
+ VecElements.pop();
+ Value *V1 = VecElements.front();
+ VecElements.pop();
+ Value *Merged = mergeTwoVectors(V0, V1, DL, AllocatedEltTy, Builder);
+ LLVM_DEBUG(dbgs() << " shufflevector: " << *Merged << "\n");
+ VecElements.push(Merged);
+ }
+ if (NumElts % 2 == 1) {
+ Value *V = VecElements.front();
+ VecElements.pop();
+ VecElements.push(V);
+ }
+ }
+
+ // Store the merged value into the alloca
+ Value *MergedValue = VecElements.front();
+ Builder.CreateAlignedStore(MergedValue, &NewAI, getSliceAlign());
+
+ IRBuilder<> LoadBuilder(TheLoad);
+ TheLoad->replaceAllUsesWith(LoadBuilder.CreateAlignedLoad(
+ TheLoad->getType(), &NewAI, getSliceAlign(), TheLoad->isVolatile(),
+ TheLoad->getName() + ".sroa.new.load"));
----------------
Chengjunp wrote:
This is because `TheLoad` is a load from the old alloca rather than the new one; `NewAI` may be only a partition of the original alloca.
For example, consider the following case:
```llvm
entry:
%alloca = alloca <8 x float>
%ptr0 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 0
store <2 x float> %a, ptr %ptr0
%ptr1 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 2
store <2 x float> %b, ptr %ptr1
%ptr2 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 4
store <2 x float> %c, ptr %ptr2
%ptr3 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 6
store <2 x float> %d, ptr %ptr3
%result1 = load <4 x float>, ptr %alloca
%ptr_offset4 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 4
%result2 = load <4 x float>, ptr %ptr_offset4
store <4 x float> %result1, ptr %e
store <4 x float> %result2, ptr %f
ret void
```
In SROA, `%alloca` will be divided into two partitions covering byte offsets [0,16) and [16,32). Here `%result1` covers only the first partition, which becomes the new alloca, so we need to create a new load from this new alloca and use it to replace the original one. The new load will eventually be removed when SROA promotes the alloca later.
https://github.com/llvm/llvm-project/pull/152793
More information about the llvm-commits
mailing list