[llvm] [SROA] Use tree-structure merge to remove alloca (PR #152793)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 8 13:52:19 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Chengjun (Chengjunp)
<details>
<summary>Changes</summary>
This patch introduces a new optimization in SROA that handles the pattern where multiple non-overlapping vector `store`s completely fill an `alloca`.
The current approach to handle this pattern introduces many `.vecexpand` and `.vecblend` instructions, which can dramatically slow down compilation when dealing with large `alloca`s built from many small vector `store`s. For example, consider an `alloca` of type `<128 x float>` filled by 64 `store`s of `<2 x float>` each. The current implementation requires:
- 64 `shufflevector`s( `.vecexpand`)
- 64 `select`s ( `.vecblend` )
- All operations use masks of size 128
- These operations form a long dependency chain
This kind of IR is both difficult to optimize and slow to compile, particularly impacting the `InstCombine` pass.
This patch introduces a tree-structured merge approach that significantly reduces the number of operations and improves compilation performance.
Key features:
- Detects when vector `store`s completely fill an `alloca` without gaps
- Ensures no loads occur in the middle of the store sequence
- Uses a tree-based approach with `shufflevector`s to merge stored values
- Reduces the number of intermediate operations compared to linear merging
- Eliminates the long dependency chains that hurt optimization
Example transformation:
```
// Before: (stores do not have to be in order)
%alloca = alloca <8 x float>
store <2 x float> %val0, ptr %alloca ; offset 0-1
store <2 x float> %val2, ptr %alloca+16 ; offset 4-5
store <2 x float> %val1, ptr %alloca+8 ; offset 2-3
store <2 x float> %val3, ptr %alloca+24 ; offset 6-7
%result = load <8 x float>, ptr %alloca
// After (tree-structured merge):
%shuffle0 = shufflevector %val0, %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1 = shufflevector %val2, %val3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%result = shufflevector %shuffle0, %shuffle1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
```
Benefits:
- Logarithmic depth (O(log n)) instead of linear dependency chains
- Fewer total operations for large vectors
- Better optimization opportunities for subsequent passes
- Significant compilation time improvements for large vector patterns
For some large cases, the compile time can be reduced from about 60s to less than 3s.
---
Patch is 50.01 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/152793.diff
3 Files Affected:
- (modified) llvm/lib/Transforms/Scalar/SROA.cpp (+288-7)
- (added) llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll (+214)
- (added) llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll (+408)
``````````diff
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index d6e27aa20730b..2bbaf7813c3c0 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -91,6 +91,7 @@
#include <cstdint>
#include <cstring>
#include <iterator>
+#include <queue>
#include <string>
#include <tuple>
#include <utility>
@@ -2678,6 +2679,53 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
return V;
}
+static Value *mergeTwoVectors(Value *V0, Value *V1, IRBuilder<> &Builder) {
+ assert(V0->getType()->isVectorTy() && V1->getType()->isVectorTy() &&
+ "Can not merge two non-vector values");
+
+ // V0 and V1 are vectors
+ // Create a new vector type with combined elements
+ // Use ShuffleVector to concatenate the vectors
+ auto *VecType0 = cast<FixedVectorType>(V0->getType());
+ auto *VecType1 = cast<FixedVectorType>(V1->getType());
+
+ assert(VecType0->getElementType() == VecType1->getElementType() &&
+ "Can not merge two vectors with different element types");
+ unsigned NumElts0 = VecType0->getNumElements();
+ unsigned NumElts1 = VecType1->getNumElements();
+
+ SmallVector<int, 16> ShuffleMask;
+
+ if (NumElts0 == NumElts1) {
+ for (unsigned i = 0; i < NumElts0 + NumElts1; ++i)
+ ShuffleMask.push_back(i);
+ } else {
+ // If two vectors have different sizes, we need to extend
+ // the smaller vector to the size of the larger vector.
+ unsigned SmallSize = std::min(NumElts0, NumElts1);
+ unsigned LargeSize = std::max(NumElts0, NumElts1);
+ bool IsV0Smaller = NumElts0 < NumElts1;
+ Value *SmallVec = IsV0Smaller ? V0 : V1;
+
+ SmallVector<int, 16> ExtendMask;
+ for (unsigned i = 0; i < SmallSize; ++i)
+ ExtendMask.push_back(i);
+ for (unsigned i = SmallSize; i < LargeSize; ++i)
+ ExtendMask.push_back(PoisonMaskElem);
+ Value *ExtendedVec = Builder.CreateShuffleVector(
+ SmallVec, PoisonValue::get(SmallVec->getType()), ExtendMask);
+ LLVM_DEBUG(dbgs() << " shufflevector: " << *ExtendedVec << "\n");
+ V0 = IsV0Smaller ? ExtendedVec : V0;
+ V1 = IsV0Smaller ? V1 : ExtendedVec;
+ for (unsigned i = 0; i < NumElts0; ++i)
+ ShuffleMask.push_back(i);
+ for (unsigned i = 0; i < NumElts1; ++i)
+ ShuffleMask.push_back(LargeSize + i);
+ }
+
+ return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+}
+
namespace {
/// Visitor to rewrite instructions using p particular slice of an alloca
@@ -2822,6 +2870,230 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
return CanSROA;
}
+ /// Attempts to rewrite a partition using tree-structured merge optimization.
+ ///
+ /// This function analyzes a partition to determine if it can be optimized
+ /// using a tree-structured merge pattern, where multiple non-overlapping
+ /// stores completely fill an alloca. And there is no load from the alloca in
+ /// the middle of the stores. Such patterns can be optimized by eliminating
+ /// the intermediate stores and directly constructing the final vector by
+ /// using shufflevectors.
+ ///
+ /// Example transformation:
+ /// Before: (stores do not have to be in order)
+ /// %alloca = alloca <8 x float>
+ /// store <2 x float> %val0, ptr %alloca ; offset 0-1
+ /// store <2 x float> %val2, ptr %alloca+16 ; offset 4-5
+ /// store <2 x float> %val1, ptr %alloca+8 ; offset 2-3
+ /// store <2 x float> %val3, ptr %alloca+24 ; offset 6-7
+ ///
+ /// After:
+ /// %alloca = alloca <8 x float>
+ /// %shuffle0 = shufflevector %val0, %val1, <4 x i32> <i32 0, i32 1, i32 2,
+ /// i32 3>
+ /// %shuffle1 = shufflevector %val2, %val3, <4 x i32> <i32 0, i32 1, i32 2,
+ /// i32 3>
+ /// %shuffle2 = shufflevector %shuffle0, %shuffle1, <8 x i32> <i32 0, i32 1,
+ /// i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ /// store %shuffle2, ptr %alloca
+ ///
+ /// The optimization looks for partitions that:
+ /// 1. Have no overlapping split slice tails
+ /// 2. Contain non-overlapping stores that cover the entire alloca
+ /// 3. Have exactly one load that reads the complete alloca structure and not
+ /// in the middle of the stores (TODO: maybe we can relax the constraint
+ /// about reading the entire alloca structure)
+ ///
+ /// \param P The partition to analyze and potentially rewrite
+ /// \return An optional vector of values that were deleted during the rewrite
+ /// process, or std::nullopt if the partition cannot be optimized
+ /// using tree-structured merge
+ std::optional<SmallVector<Value *, 4>>
+ rewriteTreeStructuredMerge(Partition &P) {
+ // No tail slices that overlap with the partition
+ if (P.splitSliceTails().size() > 0)
+ return std::nullopt;
+
+ SmallVector<Value *, 4> DeletedValues;
+ LoadInst *TheLoad = nullptr;
+
+ // Structure to hold store information
+ struct StoreInfo {
+ StoreInst *Store;
+ uint64_t BeginOffset;
+ uint64_t EndOffset;
+ Value *StoredValue;
+ TypeSize StoredTypeSize = TypeSize::getZero();
+
+ StoreInfo(StoreInst *SI, uint64_t Begin, uint64_t End, Value *Val,
+ TypeSize StoredTypeSize)
+ : Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val),
+ StoredTypeSize(StoredTypeSize) {}
+ };
+
+ SmallVector<StoreInfo, 4> StoreInfos;
+
+ // The alloca must be a fixed vector type
+ auto *AllocatedTy = NewAI.getAllocatedType();
+ if (!isa<FixedVectorType>(AllocatedTy))
+ return std::nullopt;
+
+ Slice *LoadSlice = nullptr;
+ Type *LoadElementType = nullptr;
+ Type *StoreElementType = nullptr;
+ for (Slice &S : P) {
+ auto *User = cast<Instruction>(S.getUse()->getUser());
+ if (auto *LI = dyn_cast<LoadInst>(User)) {
+ // Do not handle the case where there is more than one load
+ // TODO: maybe we can handle this case
+ if (TheLoad)
+ return std::nullopt;
+ // If load is not a fixed vector type, we do not handle it
+ // If the number of loaded bits is not the same as the new alloca type
+ // size, we do not handle it
+ auto *FixedVecTy = dyn_cast<FixedVectorType>(LI->getType());
+ if (!FixedVecTy)
+ return std::nullopt;
+ if (DL.getTypeSizeInBits(FixedVecTy) !=
+ DL.getTypeSizeInBits(NewAI.getAllocatedType()))
+ return std::nullopt;
+ LoadElementType = FixedVecTy->getElementType();
+ TheLoad = LI;
+ LoadSlice = &S;
+ } else if (auto *SI = dyn_cast<StoreInst>(User)) {
+ // The store needs to be a fixed vector type
+ // All the stores should have the same element type
+ Type *StoredValueType = SI->getValueOperand()->getType();
+ Type *CurrentElementType = nullptr;
+ TypeSize StoredTypeSize = TypeSize::getZero();
+ if (auto *FixedVecTy = dyn_cast<FixedVectorType>(StoredValueType)) {
+ // Fixed vector type - use its element type
+ CurrentElementType = FixedVecTy->getElementType();
+ StoredTypeSize = DL.getTypeSizeInBits(FixedVecTy);
+ } else
+ return std::nullopt;
+ // Check element type consistency across all stores
+ if (StoreElementType && StoreElementType != CurrentElementType)
+ return std::nullopt;
+ StoreElementType = CurrentElementType;
+ StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(),
+ SI->getValueOperand(), StoredTypeSize);
+ } else {
+ // If we have instructions other than load and store, we cannot do the
+ // tree structured merge
+ return std::nullopt;
+ }
+ }
+ // If we do not have any load, we cannot do the tree structured merge
+ if (!TheLoad)
+ return std::nullopt;
+
+ // If we do not have any stores, we cannot do the tree structured merge
+ if (StoreInfos.empty())
+ return std::nullopt;
+
+ // The load and store element types should be the same
+ if (LoadElementType != StoreElementType)
+ return std::nullopt;
+
+ // The load should cover the whole alloca
+ // TODO: maybe we can relax this constraint
+ if (!LoadSlice || LoadSlice->beginOffset() != NewAllocaBeginOffset ||
+ LoadSlice->endOffset() != NewAllocaEndOffset)
+ return std::nullopt;
+
+ // Stores should not overlap and should cover the whole alloca
+ // Sort by begin offset
+ llvm::sort(StoreInfos, [](const StoreInfo &A, const StoreInfo &B) {
+ return A.BeginOffset < B.BeginOffset;
+ });
+
+ // Check for overlaps and coverage
+ uint64_t ExpectedStart = NewAllocaBeginOffset;
+ TypeSize TotalStoreBits = TypeSize::getZero();
+ Instruction *PrevStore = nullptr;
+ for (auto &StoreInfo : StoreInfos) {
+ uint64_t BeginOff = StoreInfo.BeginOffset;
+ uint64_t EndOff = StoreInfo.EndOffset;
+
+ // Check for gap or overlap
+ if (BeginOff != ExpectedStart)
+ return std::nullopt;
+
+ ExpectedStart = EndOff;
+ TotalStoreBits += StoreInfo.StoredTypeSize;
+ PrevStore = StoreInfo.Store;
+ }
+ // Check that stores cover the entire alloca
+ // We need check both the end offset and the total store bits
+ if (ExpectedStart != NewAllocaEndOffset ||
+ TotalStoreBits != DL.getTypeSizeInBits(NewAI.getAllocatedType()))
+ return std::nullopt;
+
+ // Stores should be in the same basic block
+ // The load should not be in the middle of the stores
+ BasicBlock *LoadBB = TheLoad->getParent();
+ BasicBlock *StoreBB = StoreInfos[0].Store->getParent();
+
+ for (auto &StoreInfo : StoreInfos) {
+ if (StoreInfo.Store->getParent() != StoreBB)
+ return std::nullopt;
+ if (LoadBB == StoreBB && !StoreInfo.Store->comesBefore(TheLoad))
+ return std::nullopt;
+ }
+
+ // If we reach here, the partition can be merged with a tree structured
+ // merge
+ LLVM_DEBUG({
+ dbgs() << "Tree structured merge rewrite:\n Load: " << *TheLoad
+ << "\n Ordered stores:\n";
+ for (auto [i, Info] : enumerate(StoreInfos))
+ dbgs() << " [" << i << "] Range[" << Info.BeginOffset << ", "
+ << Info.EndOffset << ") \tStore: " << *Info.Store
+ << "\tValue: " << *Info.StoredValue << "\n";
+ });
+
+ // Instead of having these stores, we merge all the stored values into a
+ // vector and store the merged value into the alloca
+ std::queue<Value *> VecElements;
+ IRBuilder<> Builder(StoreInfos.back().Store);
+ for (const auto &Info : StoreInfos) {
+ DeletedValues.push_back(Info.Store);
+ VecElements.push(Info.StoredValue);
+ }
+
+ LLVM_DEBUG(dbgs() << " Rewrite stores into shufflevectors:\n");
+ while (VecElements.size() > 1) {
+ uint64_t NumElts = VecElements.size();
+ for (uint64_t i = 0; i < NumElts / 2; i++) {
+ Value *V0 = VecElements.front();
+ VecElements.pop();
+ Value *V1 = VecElements.front();
+ VecElements.pop();
+ Value *Merged = mergeTwoVectors(V0, V1, Builder);
+ LLVM_DEBUG(dbgs() << " shufflevector: " << *Merged << "\n");
+ VecElements.push(Merged);
+ }
+ if (NumElts % 2 == 1) {
+ Value *V = VecElements.front();
+ VecElements.pop();
+ VecElements.push(V);
+ }
+ }
+
+ // Store the merged value into the alloca
+ Value *MergedValue = VecElements.front();
+ Builder.CreateAlignedStore(MergedValue, &NewAI, getSliceAlign());
+
+ IRBuilder<> LoadBuilder(TheLoad);
+ TheLoad->replaceAllUsesWith(LoadBuilder.CreateAlignedLoad(
+ TheLoad->getType(), &NewAI, getSliceAlign(), TheLoad->isVolatile(),
+ TheLoad->getName() + ".sroa.new.load"));
+ DeletedValues.push_back(TheLoad);
+
+ return DeletedValues;
+ }
+
private:
// Make sure the other visit overloads are visible.
using Base::visit;
@@ -4996,13 +5268,22 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
P.endOffset(), IsIntegerPromotable, VecTy,
PHIUsers, SelectUsers);
bool Promotable = true;
- for (Slice *S : P.splitSliceTails()) {
- Promotable &= Rewriter.visit(S);
- ++NumUses;
- }
- for (Slice &S : P) {
- Promotable &= Rewriter.visit(&S);
- ++NumUses;
+ // Check whether we can have tree-structured merge.
+ std::optional<SmallVector<Value *, 4>> DeletedValues =
+ Rewriter.rewriteTreeStructuredMerge(P);
+ if (DeletedValues) {
+ NumUses += DeletedValues->size() + 1;
+ for (Value *V : *DeletedValues)
+ DeadInsts.push_back(V);
+ } else {
+ for (Slice *S : P.splitSliceTails()) {
+ Promotable &= Rewriter.visit(S);
+ ++NumUses;
+ }
+ for (Slice &S : P) {
+ Promotable &= Rewriter.visit(&S);
+ ++NumUses;
+ }
}
NumAllocaPartitionUses += NumUses;
diff --git a/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll b/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll
new file mode 100644
index 0000000000000..61d77478e0b59
--- /dev/null
+++ b/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll
@@ -0,0 +1,214 @@
+; REQUIRES: asserts
+; RUN: opt < %s -passes='sroa<preserve-cfg>' -disable-output -debug-only=sroa 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='sroa<modify-cfg>' -disable-output -debug-only=sroa 2>&1 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+; CHECK-NOT: Tree structured merge rewrite
+define i32 @test_alloca_not_fixed_vector() {
+entry:
+ %alloca = alloca [4 x float]
+
+ %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0
+ store float 1.0, ptr %ptr0
+
+ %ptr1 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 1
+ store float 2.0, ptr %ptr1
+
+ %result = load i32, ptr %alloca
+ ret i32 %result
+}
+
+define <4 x float> @test_more_than_one_load(<2 x float> %a, <2 x float> %b) {
+entry:
+ %alloca = alloca <4 x float>
+
+ %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+ store <2 x float> %a, ptr %ptr0
+
+ %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+ store <2 x float> %b, ptr %ptr1
+
+ %result1 = load <4 x float>, ptr %alloca
+ %result2 = load <4 x float>, ptr %alloca
+
+ %final = fadd <4 x float> %result1, %result2
+ ret <4 x float> %final
+}
+
+define void @test_no_load(<4 x float> %a) {
+entry:
+ %alloca = alloca <4 x float>
+ store <4 x float> %a, ptr %alloca
+ ret void
+}
+
+define i32 @test_load_not_fixed_vector(<2 x float> %a, <2 x float> %b) {
+entry:
+ %alloca = alloca <4 x float>
+
+ %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+ store <2 x float> %a, ptr %ptr0
+
+ %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+ store <2 x float> %b, ptr %ptr1
+
+ %result = load i32, ptr %alloca
+ ret i32 %result
+}
+
+define <3 x float> @test_load_not_covering_alloca(<2 x float> %a, <2 x float> %b) {
+entry:
+ %alloca = alloca <4 x float>
+
+ %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+ store <2 x float> %a, ptr %ptr0
+
+ %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+ store <2 x float> %b, ptr %ptr1
+
+ %result = load <3 x float>, ptr %ptr0
+ ret <3 x float> %result
+}
+
+define <4 x float> @test_store_not_fixed_vector(<vscale x 2 x float> %a) {
+entry:
+ %alloca = alloca <4 x float>
+
+ %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+ %fixed = extractelement <vscale x 2 x float> %a, i32 0
+ store float %fixed, ptr %ptr0
+
+ %result = load <4 x float>, ptr %alloca
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_store_not_same_element_type() {
+entry:
+ %alloca = alloca <4 x float>
+
+ %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+ %float_vec = insertelement <2 x float> undef, float 1.0, i32 0
+ %float_vec2 = insertelement <2 x float> %float_vec, float 2.0, i32 1
+ store <2 x float> %float_vec2, ptr %ptr0
+
+ %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+ %int_vec = insertelement <2 x i32> undef, i32 3, i32 0
+ %int_vec2 = insertelement <2 x i32> %int_vec, i32 4, i32 1
+ store <2 x i32> %int_vec2, ptr %ptr1
+
+ %result = load <4 x float>, ptr %alloca
+ ret <4 x float> %result
+}
+
+define <4 x i32> @test_load_store_different_element_type() {
+entry:
+ %alloca = alloca <4 x float>
+
+ %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+ %float_vec = insertelement <2 x float> undef, float 1.0, i32 0
+ %float_vec2 = insertelement <2 x float> %float_vec, float 2.0, i32 1
+ store <2 x float> %float_vec2, ptr %ptr0
+
+ %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+ %float_vec3 = insertelement <2 x float> undef, float 3.0, i32 0
+ %float_vec4 = insertelement <2 x float> %float_vec3, float 4.0, i32 1
+ store <2 x float> %float_vec4, ptr %ptr1
+
+ %result = load <4 x i32>, ptr %alloca
+ ret <4 x i32> %result
+}
+
+define <4 x float> @test_no_stores() {
+entry:
+ %alloca = alloca <4 x float>
+
+ %result = load <4 x float>, ptr %alloca
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_stores_overlapping(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+entry:
+ %alloca = alloca <4 x float>
+
+ %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+ store <2 x float> %a, ptr %ptr0
+
+ %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 1
+ store <2 x float> %b, ptr %ptr1
+
+ %ptr2 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+ store <2 x float> %c, ptr %ptr2
+
+ %result = load <4 x float>, ptr %alloca
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_stores_not_covering_alloca(<2 x float> %a) {
+entry:
+ %alloca = alloca <4 x float>
+
+ %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+ store <2 x float> %a, ptr %ptr0
+
+ %result = load <4 x float>, ptr %alloca
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_stores_not_same_basic_block(<2 x float> %a, <2 x float> %b, i1 %cond) {
+entry:
+ %alloca = alloca <4 x float>
+
+ %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+ store <2 x float> %a, ptr %ptr0
+
+ br i1 %cond, label %then, label %else
+
+then:
+ %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+ store <2 x float> %b, ptr %ptr1
+ br label %merge
+
+else:
+ br label %merge
+
+merge:
+ %result = load <4 x float>, ptr %alloca
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_load_before_stores(<2 x float> %a, <2 x float> %b) {
+entry:
+ %alloca = alloca <4 x float>
+
+ %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+ store <2 x float> %a, ptr %ptr0
+
+ %intermediate = load <4 x float>, ptr %alloca
+
+ %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+ store <2 x float> %b, ptr %ptr1
+
+ ret <4 x float> %intermediate
+}
+
+define <4 x float> @test_other_instructions(<2 x float> %a, <2 x float> %b) {
+entry:
+ %alloca = alloca <4 x float>
+
+ ; Store first vector
+ %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+ store <2 x float> %a, ptr %ptr0
+
+ ; Other instruction (memset) that's not a simple load/store
+ call void @llvm.memset.p0.i64(ptr %alloca, i8 0, i64 8, i1 false)
+
+ ; Store second vector
+ %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+ store <2 x float> %b, ptr %ptr1
+
+ %result = load <4 x float>, ptr %alloca
+ ret <4 x float> %result
+}
+
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
diff --git a/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll b/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll
new file mode 100644
index 0000000000000..c74b0b932ddef
--- /dev/null
+++ b/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll
@@ -0,0 +1,408 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes='sroa<preserve-cfg>' -S | FileCheck %s --check-prefixes=CHEC...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/152793
More information about the llvm-commits
mailing list