[llvm] [SROA] Use tree-structure merge to remove alloca (PR #152793)

via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 27 13:22:38 PDT 2025


https://github.com/Chengjunp updated https://github.com/llvm/llvm-project/pull/152793

>From a30ca096ddaae30663d4b10c344eff439cd44d57 Mon Sep 17 00:00:00 2001
From: chengjunp <chengjunp at nividia.com>
Date: Fri, 8 Aug 2025 20:45:07 +0000
Subject: [PATCH 1/7] Initial impl of tree structure merge in SROA

---
 llvm/lib/Transforms/Scalar/SROA.cpp           | 295 ++++++++++++-
 ...r-promotion-cannot-tree-structure-merge.ll | 214 +++++++++
 ...ctor-promotion-via-tree-structure-merge.ll | 408 ++++++++++++++++++
 3 files changed, 910 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll
 create mode 100644 llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index d6e27aa20730b..2bbaf7813c3c0 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -91,6 +91,7 @@
 #include <cstdint>
 #include <cstring>
 #include <iterator>
+#include <queue>
 #include <string>
 #include <tuple>
 #include <utility>
@@ -2678,6 +2679,53 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
   return V;
 }
 
+static Value *mergeTwoVectors(Value *V0, Value *V1, IRBuilder<> &Builder) {
+  assert(V0->getType()->isVectorTy() && V1->getType()->isVectorTy() &&
+         "Can not merge two non-vector values");
+
+  // V0 and V1 are vectors
+  // Create a new vector type with combined elements
+  // Use ShuffleVector to concatenate the vectors
+  auto *VecType0 = cast<FixedVectorType>(V0->getType());
+  auto *VecType1 = cast<FixedVectorType>(V1->getType());
+
+  assert(VecType0->getElementType() == VecType1->getElementType() &&
+         "Can not merge two vectors with different element types");
+  unsigned NumElts0 = VecType0->getNumElements();
+  unsigned NumElts1 = VecType1->getNumElements();
+
+  SmallVector<int, 16> ShuffleMask;
+
+  if (NumElts0 == NumElts1) {
+    for (unsigned i = 0; i < NumElts0 + NumElts1; ++i)
+      ShuffleMask.push_back(i);
+  } else {
+    // If two vectors have different sizes, we need to extend
+    // the smaller vector to the size of the larger vector.
+    unsigned SmallSize = std::min(NumElts0, NumElts1);
+    unsigned LargeSize = std::max(NumElts0, NumElts1);
+    bool IsV0Smaller = NumElts0 < NumElts1;
+    Value *SmallVec = IsV0Smaller ? V0 : V1;
+
+    SmallVector<int, 16> ExtendMask;
+    for (unsigned i = 0; i < SmallSize; ++i)
+      ExtendMask.push_back(i);
+    for (unsigned i = SmallSize; i < LargeSize; ++i)
+      ExtendMask.push_back(PoisonMaskElem);
+    Value *ExtendedVec = Builder.CreateShuffleVector(
+        SmallVec, PoisonValue::get(SmallVec->getType()), ExtendMask);
+    LLVM_DEBUG(dbgs() << "    shufflevector: " << *ExtendedVec << "\n");
+    V0 = IsV0Smaller ? ExtendedVec : V0;
+    V1 = IsV0Smaller ? V1 : ExtendedVec;
+    for (unsigned i = 0; i < NumElts0; ++i)
+      ShuffleMask.push_back(i);
+    for (unsigned i = 0; i < NumElts1; ++i)
+      ShuffleMask.push_back(LargeSize + i);
+  }
+
+  return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+}
+
 namespace {
 
 /// Visitor to rewrite instructions using p particular slice of an alloca
@@ -2822,6 +2870,230 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
     return CanSROA;
   }
 
+  /// Attempts to rewrite a partition using tree-structured merge optimization.
+  ///
+  /// This function analyzes a partition to determine if it can be optimized
+  /// using a tree-structured merge pattern, where multiple non-overlapping
+  /// stores completely fill an alloca and no load from the alloca occurs in
+  /// the middle of the stores. Such patterns can be optimized by eliminating
+  /// the intermediate stores and directly constructing the final vector by
+  /// using shufflevectors.
+  ///
+  /// Example transformation:
+  /// Before: (stores do not have to be in order)
+  ///   %alloca = alloca <8 x float>
+  ///   store <2 x float> %val0, ptr %alloca             ; offset 0-1
+  ///   store <2 x float> %val2, ptr %alloca+16          ; offset 4-5
+  ///   store <2 x float> %val1, ptr %alloca+8           ; offset 2-3
+  ///   store <2 x float> %val3, ptr %alloca+24          ; offset 6-7
+  ///
+  /// After:
+  ///   %alloca = alloca <8 x float>
+  ///   %shuffle0 = shufflevector %val0, %val1, <4 x i32> <i32 0, i32 1, i32 2,
+  ///                 i32 3>
+  ///   %shuffle1 = shufflevector %val2, %val3, <4 x i32> <i32 0, i32 1, i32 2,
+  ///                 i32 3>
+  ///   %shuffle2 = shufflevector %shuffle0, %shuffle1, <8 x i32> <i32 0, i32 1,
+  ///                 i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ///   store %shuffle2, ptr %alloca
+  ///
+  /// The optimization looks for partitions that:
+  /// 1. Have no overlapping split slice tails
+  /// 2. Contain non-overlapping stores that cover the entire alloca
+  /// 3. Have exactly one load that reads the complete alloca structure and is
+  ///    not in the middle of the stores (TODO: maybe we can relax the
+  ///    constraint about reading the entire alloca structure)
+  ///
+  /// \param P The partition to analyze and potentially rewrite
+  /// \return An optional vector of values that were deleted during the rewrite
+  ///         process, or std::nullopt if the partition cannot be optimized
+  ///         using tree-structured merge
+  std::optional<SmallVector<Value *, 4>>
+  rewriteTreeStructuredMerge(Partition &P) {
+    // No tail slices that overlap with the partition
+    if (P.splitSliceTails().size() > 0)
+      return std::nullopt;
+
+    SmallVector<Value *, 4> DeletedValues;
+    LoadInst *TheLoad = nullptr;
+
+    // Structure to hold store information
+    struct StoreInfo {
+      StoreInst *Store;
+      uint64_t BeginOffset;
+      uint64_t EndOffset;
+      Value *StoredValue;
+      TypeSize StoredTypeSize = TypeSize::getZero();
+
+      StoreInfo(StoreInst *SI, uint64_t Begin, uint64_t End, Value *Val,
+                TypeSize StoredTypeSize)
+          : Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val),
+            StoredTypeSize(StoredTypeSize) {}
+    };
+
+    SmallVector<StoreInfo, 4> StoreInfos;
+
+    // The alloca must be a fixed vector type
+    auto *AllocatedTy = NewAI.getAllocatedType();
+    if (!isa<FixedVectorType>(AllocatedTy))
+      return std::nullopt;
+
+    Slice *LoadSlice = nullptr;
+    Type *LoadElementType = nullptr;
+    Type *StoreElementType = nullptr;
+    for (Slice &S : P) {
+      auto *User = cast<Instruction>(S.getUse()->getUser());
+      if (auto *LI = dyn_cast<LoadInst>(User)) {
+        // Do not handle the case where there is more than one load
+        // TODO: maybe we can handle this case
+        if (TheLoad)
+          return std::nullopt;
+        // If load is not a fixed vector type, we do not handle it
+        // If the number of loaded bits is not the same as the new alloca type
+        // size, we do not handle it
+        auto *FixedVecTy = dyn_cast<FixedVectorType>(LI->getType());
+        if (!FixedVecTy)
+          return std::nullopt;
+        if (DL.getTypeSizeInBits(FixedVecTy) !=
+            DL.getTypeSizeInBits(NewAI.getAllocatedType()))
+          return std::nullopt;
+        LoadElementType = FixedVecTy->getElementType();
+        TheLoad = LI;
+        LoadSlice = &S;
+      } else if (auto *SI = dyn_cast<StoreInst>(User)) {
+        // The store needs to be a fixed vector type
+        // All the stores should have the same element type
+        Type *StoredValueType = SI->getValueOperand()->getType();
+        Type *CurrentElementType = nullptr;
+        TypeSize StoredTypeSize = TypeSize::getZero();
+        if (auto *FixedVecTy = dyn_cast<FixedVectorType>(StoredValueType)) {
+          // Fixed vector type - use its element type
+          CurrentElementType = FixedVecTy->getElementType();
+          StoredTypeSize = DL.getTypeSizeInBits(FixedVecTy);
+        } else
+          return std::nullopt;
+        // Check element type consistency across all stores
+        if (StoreElementType && StoreElementType != CurrentElementType)
+          return std::nullopt;
+        StoreElementType = CurrentElementType;
+        StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(),
+                                SI->getValueOperand(), StoredTypeSize);
+      } else {
+        // If we have instructions other than load and store, we cannot do the
+        // tree structured merge
+        return std::nullopt;
+      }
+    }
+    // If we do not have any load, we cannot do the tree structured merge
+    if (!TheLoad)
+      return std::nullopt;
+
+    // If we do not have any stores, we cannot do the tree structured merge
+    if (StoreInfos.empty())
+      return std::nullopt;
+
+    // The load and store element types should be the same
+    if (LoadElementType != StoreElementType)
+      return std::nullopt;
+
+    // The load should cover the whole alloca
+    // TODO: maybe we can relax this constraint
+    if (!LoadSlice || LoadSlice->beginOffset() != NewAllocaBeginOffset ||
+        LoadSlice->endOffset() != NewAllocaEndOffset)
+      return std::nullopt;
+
+    // Stores should not overlap and should cover the whole alloca
+    // Sort by begin offset
+    llvm::sort(StoreInfos, [](const StoreInfo &A, const StoreInfo &B) {
+      return A.BeginOffset < B.BeginOffset;
+    });
+
+    // Check for overlaps and coverage
+    uint64_t ExpectedStart = NewAllocaBeginOffset;
+    TypeSize TotalStoreBits = TypeSize::getZero();
+    Instruction *PrevStore = nullptr;
+    for (auto &StoreInfo : StoreInfos) {
+      uint64_t BeginOff = StoreInfo.BeginOffset;
+      uint64_t EndOff = StoreInfo.EndOffset;
+
+      // Check for gap or overlap
+      if (BeginOff != ExpectedStart)
+        return std::nullopt;
+
+      ExpectedStart = EndOff;
+      TotalStoreBits += StoreInfo.StoredTypeSize;
+      PrevStore = StoreInfo.Store;
+    }
+    // Check that stores cover the entire alloca
+    // We need to check both the end offset and the total store bits
+    if (ExpectedStart != NewAllocaEndOffset ||
+        TotalStoreBits != DL.getTypeSizeInBits(NewAI.getAllocatedType()))
+      return std::nullopt;
+
+    // Stores should be in the same basic block
+    // The load should not be in the middle of the stores
+    BasicBlock *LoadBB = TheLoad->getParent();
+    BasicBlock *StoreBB = StoreInfos[0].Store->getParent();
+
+    for (auto &StoreInfo : StoreInfos) {
+      if (StoreInfo.Store->getParent() != StoreBB)
+        return std::nullopt;
+      if (LoadBB == StoreBB && !StoreInfo.Store->comesBefore(TheLoad))
+        return std::nullopt;
+    }
+
+    // If we reach here, the partition can be merged with a tree structured
+    // merge
+    LLVM_DEBUG({
+      dbgs() << "Tree structured merge rewrite:\n  Load: " << *TheLoad
+             << "\n Ordered stores:\n";
+      for (auto [i, Info] : enumerate(StoreInfos))
+        dbgs() << "    [" << i << "] Range[" << Info.BeginOffset << ", "
+               << Info.EndOffset << ") \tStore: " << *Info.Store
+               << "\tValue: " << *Info.StoredValue << "\n";
+    });
+
+    // Instead of having these stores, we merge all the stored values into a
+    // vector and store the merged value into the alloca
+    std::queue<Value *> VecElements;
+    IRBuilder<> Builder(StoreInfos.back().Store);
+    for (const auto &Info : StoreInfos) {
+      DeletedValues.push_back(Info.Store);
+      VecElements.push(Info.StoredValue);
+    }
+
+    LLVM_DEBUG(dbgs() << "  Rewrite stores into shufflevectors:\n");
+    while (VecElements.size() > 1) {
+      uint64_t NumElts = VecElements.size();
+      for (uint64_t i = 0; i < NumElts / 2; i++) {
+        Value *V0 = VecElements.front();
+        VecElements.pop();
+        Value *V1 = VecElements.front();
+        VecElements.pop();
+        Value *Merged = mergeTwoVectors(V0, V1, Builder);
+        LLVM_DEBUG(dbgs() << "    shufflevector: " << *Merged << "\n");
+        VecElements.push(Merged);
+      }
+      if (NumElts % 2 == 1) {
+        Value *V = VecElements.front();
+        VecElements.pop();
+        VecElements.push(V);
+      }
+    }
+
+    // Store the merged value into the alloca
+    Value *MergedValue = VecElements.front();
+    Builder.CreateAlignedStore(MergedValue, &NewAI, getSliceAlign());
+
+    IRBuilder<> LoadBuilder(TheLoad);
+    TheLoad->replaceAllUsesWith(LoadBuilder.CreateAlignedLoad(
+        TheLoad->getType(), &NewAI, getSliceAlign(), TheLoad->isVolatile(),
+        TheLoad->getName() + ".sroa.new.load"));
+    DeletedValues.push_back(TheLoad);
+
+    return DeletedValues;
+  }
+
 private:
   // Make sure the other visit overloads are visible.
   using Base::visit;
@@ -4996,13 +5268,22 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
                                P.endOffset(), IsIntegerPromotable, VecTy,
                                PHIUsers, SelectUsers);
   bool Promotable = true;
-  for (Slice *S : P.splitSliceTails()) {
-    Promotable &= Rewriter.visit(S);
-    ++NumUses;
-  }
-  for (Slice &S : P) {
-    Promotable &= Rewriter.visit(&S);
-    ++NumUses;
+  // Check whether we can have tree-structured merge.
+  std::optional<SmallVector<Value *, 4>> DeletedValues =
+      Rewriter.rewriteTreeStructuredMerge(P);
+  if (DeletedValues) {
+    NumUses += DeletedValues->size() + 1;
+    for (Value *V : *DeletedValues)
+      DeadInsts.push_back(V);
+  } else {
+    for (Slice *S : P.splitSliceTails()) {
+      Promotable &= Rewriter.visit(S);
+      ++NumUses;
+    }
+    for (Slice &S : P) {
+      Promotable &= Rewriter.visit(&S);
+      ++NumUses;
+    }
   }
 
   NumAllocaPartitionUses += NumUses;
diff --git a/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll b/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll
new file mode 100644
index 0000000000000..61d77478e0b59
--- /dev/null
+++ b/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll
@@ -0,0 +1,214 @@
+; REQUIRES: asserts
+; RUN: opt < %s -passes='sroa<preserve-cfg>' -disable-output -debug-only=sroa 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='sroa<modify-cfg>' -disable-output -debug-only=sroa 2>&1 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+; CHECK-NOT: Tree structured merge rewrite
+define i32 @test_alloca_not_fixed_vector() {
+entry:
+  %alloca = alloca [4 x float]
+
+  %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0
+  store float 1.0, ptr %ptr0
+
+  %ptr1 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 1
+  store float 2.0, ptr %ptr1
+
+  %result = load i32, ptr %alloca
+  ret i32 %result
+}
+
+define <4 x float> @test_more_than_one_load(<2 x float> %a, <2 x float> %b) {
+entry:
+  %alloca = alloca <4 x float>
+
+  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+  store <2 x float> %a, ptr %ptr0
+
+  %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+  store <2 x float> %b, ptr %ptr1
+
+  %result1 = load <4 x float>, ptr %alloca
+  %result2 = load <4 x float>, ptr %alloca
+
+  %final = fadd <4 x float> %result1, %result2
+  ret <4 x float> %final
+}
+
+define void @test_no_load(<4 x float> %a) {
+entry:
+  %alloca = alloca <4 x float>
+  store <4 x float> %a, ptr %alloca
+  ret void
+}
+
+define i32 @test_load_not_fixed_vector(<2 x float> %a, <2 x float> %b) {
+entry:
+  %alloca = alloca <4 x float>
+
+  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+  store <2 x float> %a, ptr %ptr0
+
+  %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+  store <2 x float> %b, ptr %ptr1
+
+  %result = load i32, ptr %alloca
+  ret i32 %result
+}
+
+define <3 x float> @test_load_not_covering_alloca(<2 x float> %a, <2 x float> %b) {
+entry:
+  %alloca = alloca <4 x float>
+
+  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+  store <2 x float> %a, ptr %ptr0
+
+  %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+  store <2 x float> %b, ptr %ptr1
+
+  %result = load <3 x float>, ptr %ptr0
+  ret <3 x float> %result
+}
+
+define <4 x float> @test_store_not_fixed_vector(<vscale x 2 x float> %a) {
+entry:
+  %alloca = alloca <4 x float>
+
+  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+  %fixed = extractelement <vscale x 2 x float> %a, i32 0
+  store float %fixed, ptr %ptr0
+
+  %result = load <4 x float>, ptr %alloca
+  ret <4 x float> %result
+}
+
+define <4 x float> @test_store_not_same_element_type() {
+entry:
+  %alloca = alloca <4 x float>
+
+  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+  %float_vec = insertelement <2 x float> undef, float 1.0, i32 0
+  %float_vec2 = insertelement <2 x float> %float_vec, float 2.0, i32 1
+  store <2 x float> %float_vec2, ptr %ptr0
+
+  %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+  %int_vec = insertelement <2 x i32> undef, i32 3, i32 0
+  %int_vec2 = insertelement <2 x i32> %int_vec, i32 4, i32 1
+  store <2 x i32> %int_vec2, ptr %ptr1
+
+  %result = load <4 x float>, ptr %alloca
+  ret <4 x float> %result
+}
+
+define <4 x i32> @test_load_store_different_element_type() {
+entry:
+  %alloca = alloca <4 x float>
+
+  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+  %float_vec = insertelement <2 x float> undef, float 1.0, i32 0
+  %float_vec2 = insertelement <2 x float> %float_vec, float 2.0, i32 1
+  store <2 x float> %float_vec2, ptr %ptr0
+
+  %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+  %float_vec3 = insertelement <2 x float> undef, float 3.0, i32 0
+  %float_vec4 = insertelement <2 x float> %float_vec3, float 4.0, i32 1
+  store <2 x float> %float_vec4, ptr %ptr1
+
+  %result = load <4 x i32>, ptr %alloca
+  ret <4 x i32> %result
+}
+
+define <4 x float> @test_no_stores() {
+entry:
+  %alloca = alloca <4 x float>
+
+  %result = load <4 x float>, ptr %alloca
+  ret <4 x float> %result
+}
+
+define <4 x float> @test_stores_overlapping(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+entry:
+  %alloca = alloca <4 x float>
+
+  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+  store <2 x float> %a, ptr %ptr0
+
+  %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 1
+  store <2 x float> %b, ptr %ptr1
+
+  %ptr2 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+  store <2 x float> %c, ptr %ptr2
+
+  %result = load <4 x float>, ptr %alloca
+  ret <4 x float> %result
+}
+
+define <4 x float> @test_stores_not_covering_alloca(<2 x float> %a) {
+entry:
+  %alloca = alloca <4 x float>
+
+  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+  store <2 x float> %a, ptr %ptr0
+
+  %result = load <4 x float>, ptr %alloca
+  ret <4 x float> %result
+}
+
+define <4 x float> @test_stores_not_same_basic_block(<2 x float> %a, <2 x float> %b, i1 %cond) {
+entry:
+  %alloca = alloca <4 x float>
+
+  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+  store <2 x float> %a, ptr %ptr0
+
+  br i1 %cond, label %then, label %else
+
+then:
+  %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+  store <2 x float> %b, ptr %ptr1
+  br label %merge
+
+else:
+  br label %merge
+
+merge:
+  %result = load <4 x float>, ptr %alloca
+  ret <4 x float> %result
+}
+
+define <4 x float> @test_load_before_stores(<2 x float> %a, <2 x float> %b) {
+entry:
+  %alloca = alloca <4 x float>
+
+  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+  store <2 x float> %a, ptr %ptr0
+
+  %intermediate = load <4 x float>, ptr %alloca
+
+  %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+  store <2 x float> %b, ptr %ptr1
+
+  ret <4 x float> %intermediate
+}
+
+define <4 x float> @test_other_instructions(<2 x float> %a, <2 x float> %b) {
+entry:
+  %alloca = alloca <4 x float>
+  
+  ; Store first vector
+  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+  store <2 x float> %a, ptr %ptr0
+  
+  ; Other instruction (memset) that's not a simple load/store
+  call void @llvm.memset.p0.i64(ptr %alloca, i8 0, i64 8, i1 false)
+  
+  ; Store second vector
+  %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+  store <2 x float> %b, ptr %ptr1
+  
+  %result = load <4 x float>, ptr %alloca
+  ret <4 x float> %result
+}
+
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
diff --git a/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll b/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll
new file mode 100644
index 0000000000000..c74b0b932ddef
--- /dev/null
+++ b/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll
@@ -0,0 +1,408 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes='sroa<preserve-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG
+; RUN: opt < %s -passes='sroa<modify-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG
+; RUN: opt < %s -passes=debugify,sroa -S | FileCheck %s --check-prefix=DEBUG
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+; Basic tree-structured merge: 4 stores of <2 x float> into <8 x float>
+define <8 x float> @basic_tree_merge(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) {
+; CHECK-LABEL: define <8 x float> @basic_tree_merge(
+; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x float> [[D:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> [[D]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[TMP2]]
+;
+; DEBUG-LABEL: define <8 x float> @basic_tree_merge(
+; DEBUG-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x float> [[D:%.*]]) !dbg [[DBG5:![0-9]+]] {
+; DEBUG-NEXT:  [[ENTRY:.*:]]
+; DEBUG-NEXT:      #dbg_value(ptr poison, [[META9:![0-9]+]], !DIExpression(), [[META17:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META11:![0-9]+]], !DIExpression(), [[META18:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META12:![0-9]+]], !DIExpression(), [[META19:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META13:![0-9]+]], !DIExpression(), [[META20:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META14:![0-9]+]], !DIExpression(), [[META21:![0-9]+]])
+; DEBUG-NEXT:    [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG22:![0-9]+]]
+; DEBUG-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> [[D]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG22]]
+; DEBUG-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, !dbg [[DBG22]]
+; DEBUG-NEXT:      #dbg_value(<8 x float> [[TMP2]], [[META15:![0-9]+]], !DIExpression(), [[META23:![0-9]+]])
+; DEBUG-NEXT:    ret <8 x float> [[TMP2]], !dbg [[DBG24:![0-9]+]]
+;
+entry:
+  %alloca = alloca <8 x float>
+
+  ; Store the vectors at different offsets
+  %ptr0 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 0
+  store <2 x float> %a, ptr %ptr0
+
+  %ptr1 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 2
+  store <2 x float> %b, ptr %ptr1
+
+  %ptr2 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 4
+  store <2 x float> %c, ptr %ptr2
+
+  %ptr3 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 6
+  store <2 x float> %d, ptr %ptr3
+
+  ; Load the complete vector
+  %result = load <8 x float>, ptr %alloca
+  ret <8 x float> %result
+}
+
+define void @multiple_partitions(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d, ptr %e, ptr %f) {
+; CHECK-LABEL: define void @multiple_partitions(
+; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x float> [[D:%.*]], ptr [[E:%.*]], ptr [[F:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> [[D]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    store <4 x float> [[TMP0]], ptr [[E]], align 16
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[F]], align 16
+; CHECK-NEXT:    ret void
+;
+; DEBUG-LABEL: define void @multiple_partitions(
+; DEBUG-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x float> [[D:%.*]], ptr [[E:%.*]], ptr [[F:%.*]]) !dbg [[DBG25:![0-9]+]] {
+; DEBUG-NEXT:  [[ENTRY:.*:]]
+; DEBUG-NEXT:      #dbg_value(ptr poison, [[META27:![0-9]+]], !DIExpression(), [[META36:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META27]], !DIExpression(), [[META36]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META28:![0-9]+]], !DIExpression(), [[META37:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META29:![0-9]+]], !DIExpression(), [[META38:![0-9]+]])
+; DEBUG-NEXT:    [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG39:![0-9]+]]
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META30:![0-9]+]], !DIExpression(), [[META40:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META31:![0-9]+]], !DIExpression(), [[META41:![0-9]+]])
+; DEBUG-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> [[D]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG42:![0-9]+]]
+; DEBUG-NEXT:      #dbg_value(<4 x float> [[TMP0]], [[META32:![0-9]+]], !DIExpression(), [[META43:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META34:![0-9]+]], !DIExpression(), [[META44:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(<4 x float> [[TMP1]], [[META35:![0-9]+]], !DIExpression(), [[META45:![0-9]+]])
+; DEBUG-NEXT:    store <4 x float> [[TMP0]], ptr [[E]], align 16, !dbg [[DBG46:![0-9]+]]
+; DEBUG-NEXT:    store <4 x float> [[TMP1]], ptr [[F]], align 16, !dbg [[DBG47:![0-9]+]]
+; DEBUG-NEXT:    ret void, !dbg [[DBG48:![0-9]+]]
+;
+entry:
+  %alloca = alloca <8 x float>
+
+  %ptr0 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 0
+  store <2 x float> %a, ptr %ptr0
+
+  %ptr1 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 2
+  store <2 x float> %b, ptr %ptr1
+
+  %ptr2 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 4
+  store <2 x float> %c, ptr %ptr2
+
+  %ptr3 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 6
+  store <2 x float> %d, ptr %ptr3
+
+  %result1 = load <4 x float>, ptr %alloca
+
+  %ptr_offset4 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 4
+  %result2 = load <4 x float>, ptr %ptr_offset4
+
+  store <4 x float> %result1, ptr %e
+  store <4 x float> %result2, ptr %f
+
+  ret void
+}
+
+; Out-of-order stores: stores happen in non-sequential order
+define <8 x i32> @out_of_order_stores(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
+; CHECK-LABEL: define <8 x i32> @out_of_order_stores(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], <2 x i32> [[D:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> [[D]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+; DEBUG-LABEL: define <8 x i32> @out_of_order_stores(
+; DEBUG-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], <2 x i32> [[D:%.*]]) !dbg [[DBG49:![0-9]+]] {
+; DEBUG-NEXT:  [[ENTRY:.*:]]
+; DEBUG-NEXT:      #dbg_value(ptr poison, [[META51:![0-9]+]], !DIExpression(), [[META57:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META52:![0-9]+]], !DIExpression(), [[META58:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META53:![0-9]+]], !DIExpression(), [[META59:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META54:![0-9]+]], !DIExpression(), [[META60:![0-9]+]])
+; DEBUG-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG61:![0-9]+]]
+; DEBUG-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> [[D]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG61]]
+; DEBUG-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, !dbg [[DBG61]]
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META55:![0-9]+]], !DIExpression(), [[META62:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(<8 x i32> [[TMP2]], [[META56:![0-9]+]], !DIExpression(), [[META63:![0-9]+]])
+; DEBUG-NEXT:    ret <8 x i32> [[TMP2]], !dbg [[DBG64:![0-9]+]]
+;
+entry:
+  %alloca = alloca <8 x i32>
+
+  ; Store out of order
+  %ptr2 = getelementptr inbounds <8 x i32>, ptr %alloca, i32 0, i32 4
+  store <2 x i32> %c, ptr %ptr2
+
+  %ptr0 = getelementptr inbounds <8 x i32>, ptr %alloca, i32 0, i32 0
+  store <2 x i32> %a, ptr %ptr0
+
+  %ptr3 = getelementptr inbounds <8 x i32>, ptr %alloca, i32 0, i32 6
+  store <2 x i32> %d, ptr %ptr3
+
+  %ptr1 = getelementptr inbounds <8 x i32>, ptr %alloca, i32 0, i32 2
+  store <2 x i32> %b, ptr %ptr1
+
+  %result = load <8 x i32>, ptr %alloca
+  ret <8 x i32> %result
+}
+
+; Single-element stores: eight <1 x i16> stores, one per lane, cover the whole <8 x i16> alloca and are merged pairwise into <2 x i16>, <4 x i16> and finally <8 x i16>.
+define <8 x i16> @single_element_stores(<1 x i16> %a, <1 x i16> %b, <1 x i16> %c, <1 x i16> %d, <1 x i16> %e, <1 x i16> %f, <1 x i16> %g, <1 x i16> %h) {
+; CHECK-LABEL: define <8 x i16> @single_element_stores(
+; CHECK-SAME: <1 x i16> [[A:%.*]], <1 x i16> [[B:%.*]], <1 x i16> [[C:%.*]], <1 x i16> [[D:%.*]], <1 x i16> [[E:%.*]], <1 x i16> [[F:%.*]], <1 x i16> [[G:%.*]], <1 x i16> [[H:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <1 x i16> [[A]], <1 x i16> [[B]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <1 x i16> [[C]], <1 x i16> [[D]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <1 x i16> [[E]], <1 x i16> [[F]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <1 x i16> [[G]], <1 x i16> [[H]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i16> [[TMP6]]
+;
+; DEBUG-LABEL: define <8 x i16> @single_element_stores(
+; DEBUG-SAME: <1 x i16> [[A:%.*]], <1 x i16> [[B:%.*]], <1 x i16> [[C:%.*]], <1 x i16> [[D:%.*]], <1 x i16> [[E:%.*]], <1 x i16> [[F:%.*]], <1 x i16> [[G:%.*]], <1 x i16> [[H:%.*]]) !dbg [[DBG65:![0-9]+]] {
+; DEBUG-NEXT:  [[ENTRY:.*:]]
+; DEBUG-NEXT:      #dbg_value(ptr poison, [[META67:![0-9]+]], !DIExpression(), [[META77:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META68:![0-9]+]], !DIExpression(), [[META78:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META69:![0-9]+]], !DIExpression(), [[META79:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META70:![0-9]+]], !DIExpression(), [[META80:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META71:![0-9]+]], !DIExpression(), [[META81:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META72:![0-9]+]], !DIExpression(), [[META82:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META73:![0-9]+]], !DIExpression(), [[META83:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META74:![0-9]+]], !DIExpression(), [[META84:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META75:![0-9]+]], !DIExpression(), [[META85:![0-9]+]])
+; DEBUG-NEXT:    [[TMP0:%.*]] = shufflevector <1 x i16> [[A]], <1 x i16> [[B]], <2 x i32> <i32 0, i32 1>, !dbg [[DBG86:![0-9]+]]
+; DEBUG-NEXT:    [[TMP1:%.*]] = shufflevector <1 x i16> [[C]], <1 x i16> [[D]], <2 x i32> <i32 0, i32 1>, !dbg [[DBG86]]
+; DEBUG-NEXT:    [[TMP2:%.*]] = shufflevector <1 x i16> [[E]], <1 x i16> [[F]], <2 x i32> <i32 0, i32 1>, !dbg [[DBG86]]
+; DEBUG-NEXT:    [[TMP3:%.*]] = shufflevector <1 x i16> [[G]], <1 x i16> [[H]], <2 x i32> <i32 0, i32 1>, !dbg [[DBG86]]
+; DEBUG-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG86]]
+; DEBUG-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG86]]
+; DEBUG-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, !dbg [[DBG86]]
+; DEBUG-NEXT:      #dbg_value(<8 x i16> [[TMP6]], [[META76:![0-9]+]], !DIExpression(), [[META87:![0-9]+]])
+; DEBUG-NEXT:    ret <8 x i16> [[TMP6]], !dbg [[DBG88:![0-9]+]]
+;
+entry:
+  %alloca = alloca <8 x i16>
+
+  %ptr0 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 0
+  store <1 x i16> %a, ptr %ptr0
+  %ptr1 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 1
+  store <1 x i16> %b, ptr %ptr1
+  %ptr2 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 2
+  store <1 x i16> %c, ptr %ptr2
+  %ptr3 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 3
+  store <1 x i16> %d, ptr %ptr3
+  %ptr4 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 4
+  store <1 x i16> %e, ptr %ptr4
+  %ptr5 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 5
+  store <1 x i16> %f, ptr %ptr5
+  %ptr6 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 6
+  store <1 x i16> %g, ptr %ptr6
+  %ptr7 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 7
+  store <1 x i16> %h, ptr %ptr7
+
+  %result = load <8 x i16>, ptr %alloca
+  ret <8 x i16> %result
+}
+
+; Non-power-of-2 store count: three <2 x float> stores cover a <6 x float> alloca; the unpaired third vector is widened with poison lanes before the final merge.
+define <6 x float> @non_power_of_2(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; CHECK-LABEL: define <6 x float> @non_power_of_2(
+; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    ret <6 x float> [[TMP2]]
+;
+; DEBUG-LABEL: define <6 x float> @non_power_of_2(
+; DEBUG-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]]) !dbg [[DBG89:![0-9]+]] {
+; DEBUG-NEXT:  [[ENTRY:.*:]]
+; DEBUG-NEXT:      #dbg_value(ptr poison, [[META91:![0-9]+]], !DIExpression(), [[META96:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META92:![0-9]+]], !DIExpression(), [[META97:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META93:![0-9]+]], !DIExpression(), [[META98:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META94:![0-9]+]], !DIExpression(), [[META99:![0-9]+]])
+; DEBUG-NEXT:    [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG100:![0-9]+]]
+; DEBUG-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>, !dbg [[DBG100]]
+; DEBUG-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>, !dbg [[DBG100]]
+; DEBUG-NEXT:      #dbg_value(<6 x float> [[TMP2]], [[META95:![0-9]+]], !DIExpression(), [[META101:![0-9]+]])
+; DEBUG-NEXT:    ret <6 x float> [[TMP2]], !dbg [[DBG102:![0-9]+]]
+;
+entry:
+  %alloca = alloca <6 x float>
+
+  %ptr0 = getelementptr inbounds <6 x float>, ptr %alloca, i32 0, i32 0
+  store <2 x float> %a, ptr %ptr0
+
+  %ptr1 = getelementptr inbounds <6 x float>, ptr %alloca, i32 0, i32 2
+  store <2 x float> %b, ptr %ptr1
+
+  %ptr2 = getelementptr inbounds <6 x float>, ptr %alloca, i32 0, i32 4
+  store <2 x float> %c, ptr %ptr2
+
+  %result = load <6 x float>, ptr %alloca
+  ret <6 x float> %result
+}
+
+; Stores of differently sized vectors: <1 x float>, <4 x float> and <2 x float> stores together cover a <7 x float> alloca; narrower operands are widened with poison lanes before each merge.
+define <7 x float> @store_with_different_size_of_vectors(<1 x float> %a, <4 x float> %b, <2 x float> %c) {
+; CHECK-LABEL: define <7 x float> @store_with_different_size_of_vectors(
+; CHECK-SAME: <1 x float> [[A:%.*]], <4 x float> [[B:%.*]], <2 x float> [[C:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <1 x float> [[A]], <1 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[B]], <5 x i32> <i32 0, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <5 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> [[TMP2]], <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    ret <7 x float> [[TMP3]]
+;
+; DEBUG-LABEL: define <7 x float> @store_with_different_size_of_vectors(
+; DEBUG-SAME: <1 x float> [[A:%.*]], <4 x float> [[B:%.*]], <2 x float> [[C:%.*]]) !dbg [[DBG103:![0-9]+]] {
+; DEBUG-NEXT:  [[ENTRY:.*:]]
+; DEBUG-NEXT:      #dbg_value(ptr poison, [[META105:![0-9]+]], !DIExpression(), [[META110:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META106:![0-9]+]], !DIExpression(), [[META111:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META107:![0-9]+]], !DIExpression(), [[META112:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META108:![0-9]+]], !DIExpression(), [[META113:![0-9]+]])
+; DEBUG-NEXT:    [[TMP0:%.*]] = shufflevector <1 x float> [[A]], <1 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, !dbg [[DBG114:![0-9]+]]
+; DEBUG-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[B]], <5 x i32> <i32 0, i32 4, i32 5, i32 6, i32 7>, !dbg [[DBG114]]
+; DEBUG-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <5 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison>, !dbg [[DBG114]]
+; DEBUG-NEXT:    [[TMP3:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> [[TMP2]], <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>, !dbg [[DBG114]]
+; DEBUG-NEXT:      #dbg_value(<7 x float> [[TMP3]], [[META109:![0-9]+]], !DIExpression(), [[META115:![0-9]+]])
+; DEBUG-NEXT:    ret <7 x float> [[TMP3]], !dbg [[DBG116:![0-9]+]]
+;
+entry:
+  %alloca = alloca <7 x float>
+
+  %ptr0 = getelementptr inbounds <7 x float>, ptr %alloca, i32 0, i32 0
+  store <1 x float> %a, ptr %ptr0
+
+  %ptr1 = getelementptr inbounds <7 x float>, ptr %alloca, i32 0, i32 1
+  store <4 x float> %b, ptr %ptr1
+
+  %ptr2 = getelementptr inbounds <7 x float>, ptr %alloca, i32 0, i32 5
+  store <2 x float> %c, ptr %ptr2
+
+  %result = load <7 x float>, ptr %alloca
+  ret <7 x float> %result
+}
+
+;.
+; DEBUG: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+; DEBUG: [[META1]] = !DIFile(filename: "{{.*}}<stdin>", directory: {{.*}})
+; DEBUG: [[DBG5]] = distinct !DISubprogram(name: "basic_tree_merge", linkageName: "basic_tree_merge", scope: null, file: [[META1]], line: 1, type: [[META6:![0-9]+]], scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META8:![0-9]+]])
+; DEBUG: [[META6]] = !DISubroutineType(types: [[META7:![0-9]+]])
+; DEBUG: [[META7]] = !{}
+; DEBUG: [[META8]] = !{[[META9]], [[META11]], [[META12]], [[META13]], [[META14]], [[META15]]}
+; DEBUG: [[META9]] = !DILocalVariable(name: "1", scope: [[DBG5]], file: [[META1]], line: 1, type: [[META10:![0-9]+]])
+; DEBUG: [[META10]] = !DIBasicType(name: "ty64", size: 64, encoding: DW_ATE_unsigned)
+; DEBUG: [[META11]] = !DILocalVariable(name: "2", scope: [[DBG5]], file: [[META1]], line: 2, type: [[META10]])
+; DEBUG: [[META12]] = !DILocalVariable(name: "3", scope: [[DBG5]], file: [[META1]], line: 4, type: [[META10]])
+; DEBUG: [[META13]] = !DILocalVariable(name: "4", scope: [[DBG5]], file: [[META1]], line: 6, type: [[META10]])
+; DEBUG: [[META14]] = !DILocalVariable(name: "5", scope: [[DBG5]], file: [[META1]], line: 8, type: [[META10]])
+; DEBUG: [[META15]] = !DILocalVariable(name: "6", scope: [[DBG5]], file: [[META1]], line: 10, type: [[META16:![0-9]+]])
+; DEBUG: [[META16]] = !DIBasicType(name: "ty256", size: 256, encoding: DW_ATE_unsigned)
+; DEBUG: [[META17]] = !DILocation(line: 1, column: 1, scope: [[DBG5]])
+; DEBUG: [[META18]] = !DILocation(line: 2, column: 1, scope: [[DBG5]])
+; DEBUG: [[META19]] = !DILocation(line: 4, column: 1, scope: [[DBG5]])
+; DEBUG: [[META20]] = !DILocation(line: 6, column: 1, scope: [[DBG5]])
+; DEBUG: [[META21]] = !DILocation(line: 8, column: 1, scope: [[DBG5]])
+; DEBUG: [[DBG22]] = !DILocation(line: 9, column: 1, scope: [[DBG5]])
+; DEBUG: [[META23]] = !DILocation(line: 10, column: 1, scope: [[DBG5]])
+; DEBUG: [[DBG24]] = !DILocation(line: 11, column: 1, scope: [[DBG5]])
+; DEBUG: [[DBG25]] = distinct !DISubprogram(name: "multiple_partitions", linkageName: "multiple_partitions", scope: null, file: [[META1]], line: 12, type: [[META6]], scopeLine: 12, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META26:![0-9]+]])
+; DEBUG: [[META26]] = !{[[META27]], [[META28]], [[META29]], [[META30]], [[META31]], [[META32]], [[META34]], [[META35]]}
+; DEBUG: [[META27]] = !DILocalVariable(name: "7", scope: [[DBG25]], file: [[META1]], line: 12, type: [[META10]])
+; DEBUG: [[META28]] = !DILocalVariable(name: "8", scope: [[DBG25]], file: [[META1]], line: 13, type: [[META10]])
+; DEBUG: [[META29]] = !DILocalVariable(name: "9", scope: [[DBG25]], file: [[META1]], line: 15, type: [[META10]])
+; DEBUG: [[META30]] = !DILocalVariable(name: "10", scope: [[DBG25]], file: [[META1]], line: 17, type: [[META10]])
+; DEBUG: [[META31]] = !DILocalVariable(name: "11", scope: [[DBG25]], file: [[META1]], line: 19, type: [[META10]])
+; DEBUG: [[META32]] = !DILocalVariable(name: "12", scope: [[DBG25]], file: [[META1]], line: 21, type: [[META33:![0-9]+]])
+; DEBUG: [[META33]] = !DIBasicType(name: "ty128", size: 128, encoding: DW_ATE_unsigned)
+; DEBUG: [[META34]] = !DILocalVariable(name: "13", scope: [[DBG25]], file: [[META1]], line: 22, type: [[META10]])
+; DEBUG: [[META35]] = !DILocalVariable(name: "14", scope: [[DBG25]], file: [[META1]], line: 23, type: [[META33]])
+; DEBUG: [[META36]] = !DILocation(line: 12, column: 1, scope: [[DBG25]])
+; DEBUG: [[META37]] = !DILocation(line: 13, column: 1, scope: [[DBG25]])
+; DEBUG: [[META38]] = !DILocation(line: 15, column: 1, scope: [[DBG25]])
+; DEBUG: [[DBG39]] = !DILocation(line: 16, column: 1, scope: [[DBG25]])
+; DEBUG: [[META40]] = !DILocation(line: 17, column: 1, scope: [[DBG25]])
+; DEBUG: [[META41]] = !DILocation(line: 19, column: 1, scope: [[DBG25]])
+; DEBUG: [[DBG42]] = !DILocation(line: 20, column: 1, scope: [[DBG25]])
+; DEBUG: [[META43]] = !DILocation(line: 21, column: 1, scope: [[DBG25]])
+; DEBUG: [[META44]] = !DILocation(line: 22, column: 1, scope: [[DBG25]])
+; DEBUG: [[META45]] = !DILocation(line: 23, column: 1, scope: [[DBG25]])
+; DEBUG: [[DBG46]] = !DILocation(line: 24, column: 1, scope: [[DBG25]])
+; DEBUG: [[DBG47]] = !DILocation(line: 25, column: 1, scope: [[DBG25]])
+; DEBUG: [[DBG48]] = !DILocation(line: 26, column: 1, scope: [[DBG25]])
+; DEBUG: [[DBG49]] = distinct !DISubprogram(name: "out_of_order_stores", linkageName: "out_of_order_stores", scope: null, file: [[META1]], line: 27, type: [[META6]], scopeLine: 27, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META50:![0-9]+]])
+; DEBUG: [[META50]] = !{[[META51]], [[META52]], [[META53]], [[META54]], [[META55]], [[META56]]}
+; DEBUG: [[META51]] = !DILocalVariable(name: "15", scope: [[DBG49]], file: [[META1]], line: 27, type: [[META10]])
+; DEBUG: [[META52]] = !DILocalVariable(name: "16", scope: [[DBG49]], file: [[META1]], line: 28, type: [[META10]])
+; DEBUG: [[META53]] = !DILocalVariable(name: "17", scope: [[DBG49]], file: [[META1]], line: 30, type: [[META10]])
+; DEBUG: [[META54]] = !DILocalVariable(name: "18", scope: [[DBG49]], file: [[META1]], line: 32, type: [[META10]])
+; DEBUG: [[META55]] = !DILocalVariable(name: "19", scope: [[DBG49]], file: [[META1]], line: 34, type: [[META10]])
+; DEBUG: [[META56]] = !DILocalVariable(name: "20", scope: [[DBG49]], file: [[META1]], line: 36, type: [[META16]])
+; DEBUG: [[META57]] = !DILocation(line: 27, column: 1, scope: [[DBG49]])
+; DEBUG: [[META58]] = !DILocation(line: 28, column: 1, scope: [[DBG49]])
+; DEBUG: [[META59]] = !DILocation(line: 30, column: 1, scope: [[DBG49]])
+; DEBUG: [[META60]] = !DILocation(line: 32, column: 1, scope: [[DBG49]])
+; DEBUG: [[DBG61]] = !DILocation(line: 33, column: 1, scope: [[DBG49]])
+; DEBUG: [[META62]] = !DILocation(line: 34, column: 1, scope: [[DBG49]])
+; DEBUG: [[META63]] = !DILocation(line: 36, column: 1, scope: [[DBG49]])
+; DEBUG: [[DBG64]] = !DILocation(line: 37, column: 1, scope: [[DBG49]])
+; DEBUG: [[DBG65]] = distinct !DISubprogram(name: "single_element_stores", linkageName: "single_element_stores", scope: null, file: [[META1]], line: 38, type: [[META6]], scopeLine: 38, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META66:![0-9]+]])
+; DEBUG: [[META66]] = !{[[META67]], [[META68]], [[META69]], [[META70]], [[META71]], [[META72]], [[META73]], [[META74]], [[META75]], [[META76]]}
+; DEBUG: [[META67]] = !DILocalVariable(name: "21", scope: [[DBG65]], file: [[META1]], line: 38, type: [[META10]])
+; DEBUG: [[META68]] = !DILocalVariable(name: "22", scope: [[DBG65]], file: [[META1]], line: 39, type: [[META10]])
+; DEBUG: [[META69]] = !DILocalVariable(name: "23", scope: [[DBG65]], file: [[META1]], line: 41, type: [[META10]])
+; DEBUG: [[META70]] = !DILocalVariable(name: "24", scope: [[DBG65]], file: [[META1]], line: 43, type: [[META10]])
+; DEBUG: [[META71]] = !DILocalVariable(name: "25", scope: [[DBG65]], file: [[META1]], line: 45, type: [[META10]])
+; DEBUG: [[META72]] = !DILocalVariable(name: "26", scope: [[DBG65]], file: [[META1]], line: 47, type: [[META10]])
+; DEBUG: [[META73]] = !DILocalVariable(name: "27", scope: [[DBG65]], file: [[META1]], line: 49, type: [[META10]])
+; DEBUG: [[META74]] = !DILocalVariable(name: "28", scope: [[DBG65]], file: [[META1]], line: 51, type: [[META10]])
+; DEBUG: [[META75]] = !DILocalVariable(name: "29", scope: [[DBG65]], file: [[META1]], line: 53, type: [[META10]])
+; DEBUG: [[META76]] = !DILocalVariable(name: "30", scope: [[DBG65]], file: [[META1]], line: 55, type: [[META33]])
+; DEBUG: [[META77]] = !DILocation(line: 38, column: 1, scope: [[DBG65]])
+; DEBUG: [[META78]] = !DILocation(line: 39, column: 1, scope: [[DBG65]])
+; DEBUG: [[META79]] = !DILocation(line: 41, column: 1, scope: [[DBG65]])
+; DEBUG: [[META80]] = !DILocation(line: 43, column: 1, scope: [[DBG65]])
+; DEBUG: [[META81]] = !DILocation(line: 45, column: 1, scope: [[DBG65]])
+; DEBUG: [[META82]] = !DILocation(line: 47, column: 1, scope: [[DBG65]])
+; DEBUG: [[META83]] = !DILocation(line: 49, column: 1, scope: [[DBG65]])
+; DEBUG: [[META84]] = !DILocation(line: 51, column: 1, scope: [[DBG65]])
+; DEBUG: [[META85]] = !DILocation(line: 53, column: 1, scope: [[DBG65]])
+; DEBUG: [[DBG86]] = !DILocation(line: 54, column: 1, scope: [[DBG65]])
+; DEBUG: [[META87]] = !DILocation(line: 55, column: 1, scope: [[DBG65]])
+; DEBUG: [[DBG88]] = !DILocation(line: 56, column: 1, scope: [[DBG65]])
+; DEBUG: [[DBG89]] = distinct !DISubprogram(name: "non_power_of_2", linkageName: "non_power_of_2", scope: null, file: [[META1]], line: 57, type: [[META6]], scopeLine: 57, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META90:![0-9]+]])
+; DEBUG: [[META90]] = !{[[META91]], [[META92]], [[META93]], [[META94]], [[META95]]}
+; DEBUG: [[META91]] = !DILocalVariable(name: "31", scope: [[DBG89]], file: [[META1]], line: 57, type: [[META10]])
+; DEBUG: [[META92]] = !DILocalVariable(name: "32", scope: [[DBG89]], file: [[META1]], line: 58, type: [[META10]])
+; DEBUG: [[META93]] = !DILocalVariable(name: "33", scope: [[DBG89]], file: [[META1]], line: 60, type: [[META10]])
+; DEBUG: [[META94]] = !DILocalVariable(name: "34", scope: [[DBG89]], file: [[META1]], line: 62, type: [[META10]])
+; DEBUG: [[META95]] = !DILocalVariable(name: "35", scope: [[DBG89]], file: [[META1]], line: 64, type: [[META16]])
+; DEBUG: [[META96]] = !DILocation(line: 57, column: 1, scope: [[DBG89]])
+; DEBUG: [[META97]] = !DILocation(line: 58, column: 1, scope: [[DBG89]])
+; DEBUG: [[META98]] = !DILocation(line: 60, column: 1, scope: [[DBG89]])
+; DEBUG: [[META99]] = !DILocation(line: 62, column: 1, scope: [[DBG89]])
+; DEBUG: [[DBG100]] = !DILocation(line: 63, column: 1, scope: [[DBG89]])
+; DEBUG: [[META101]] = !DILocation(line: 64, column: 1, scope: [[DBG89]])
+; DEBUG: [[DBG102]] = !DILocation(line: 65, column: 1, scope: [[DBG89]])
+; DEBUG: [[DBG103]] = distinct !DISubprogram(name: "store_with_different_size_of_vectors", linkageName: "store_with_different_size_of_vectors", scope: null, file: [[META1]], line: 66, type: [[META6]], scopeLine: 66, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META104:![0-9]+]])
+; DEBUG: [[META104]] = !{[[META105]], [[META106]], [[META107]], [[META108]], [[META109]]}
+; DEBUG: [[META105]] = !DILocalVariable(name: "36", scope: [[DBG103]], file: [[META1]], line: 66, type: [[META10]])
+; DEBUG: [[META106]] = !DILocalVariable(name: "37", scope: [[DBG103]], file: [[META1]], line: 67, type: [[META10]])
+; DEBUG: [[META107]] = !DILocalVariable(name: "38", scope: [[DBG103]], file: [[META1]], line: 69, type: [[META10]])
+; DEBUG: [[META108]] = !DILocalVariable(name: "39", scope: [[DBG103]], file: [[META1]], line: 71, type: [[META10]])
+; DEBUG: [[META109]] = !DILocalVariable(name: "40", scope: [[DBG103]], file: [[META1]], line: 73, type: [[META16]])
+; DEBUG: [[META110]] = !DILocation(line: 66, column: 1, scope: [[DBG103]])
+; DEBUG: [[META111]] = !DILocation(line: 67, column: 1, scope: [[DBG103]])
+; DEBUG: [[META112]] = !DILocation(line: 69, column: 1, scope: [[DBG103]])
+; DEBUG: [[META113]] = !DILocation(line: 71, column: 1, scope: [[DBG103]])
+; DEBUG: [[DBG114]] = !DILocation(line: 72, column: 1, scope: [[DBG103]])
+; DEBUG: [[META115]] = !DILocation(line: 73, column: 1, scope: [[DBG103]])
+; DEBUG: [[DBG116]] = !DILocation(line: 74, column: 1, scope: [[DBG103]])
+;.
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-MODIFY-CFG: {{.*}}
+; CHECK-PRESERVE-CFG: {{.*}}

>From 46039cebead52e23434f0ebba1e468d8a003068a Mon Sep 17 00:00:00 2001
From: chengjunp <chengjunp at nividia.com>
Date: Fri, 8 Aug 2025 22:03:01 +0000
Subject: [PATCH 2/7] Not do tree merge when only having one store

---
 llvm/lib/Transforms/Scalar/SROA.cpp                       | 4 ++--
 .../SROA/vector-promotion-cannot-tree-structure-merge.ll  | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 2bbaf7813c3c0..397f44687aa6d 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -2988,8 +2988,8 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
     if (!TheLoad)
       return std::nullopt;
 
-    // If we do not have any stores, we cannot do the tree structured merge
-    if (StoreInfos.empty())
+    // If we do not have multiple stores, we cannot do the tree structured merge
+    if (StoreInfos.size() < 2)
       return std::nullopt;
 
     // The load and store element types should be the same
diff --git a/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll b/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll
index 61d77478e0b59..ab11adaa8156e 100644
--- a/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll
+++ b/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll
@@ -88,12 +88,12 @@ entry:
   %alloca = alloca <4 x float>
 
   %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
-  %float_vec = insertelement <2 x float> undef, float 1.0, i32 0
+  %float_vec = insertelement <2 x float> poison, float 1.0, i32 0
   %float_vec2 = insertelement <2 x float> %float_vec, float 2.0, i32 1
   store <2 x float> %float_vec2, ptr %ptr0
 
   %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
-  %int_vec = insertelement <2 x i32> undef, i32 3, i32 0
+  %int_vec = insertelement <2 x i32> poison, i32 3, i32 0
   %int_vec2 = insertelement <2 x i32> %int_vec, i32 4, i32 1
   store <2 x i32> %int_vec2, ptr %ptr1
 
@@ -106,12 +106,12 @@ entry:
   %alloca = alloca <4 x float>
 
   %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
-  %float_vec = insertelement <2 x float> undef, float 1.0, i32 0
+  %float_vec = insertelement <2 x float> poison, float 1.0, i32 0
   %float_vec2 = insertelement <2 x float> %float_vec, float 2.0, i32 1
   store <2 x float> %float_vec2, ptr %ptr0
 
   %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
-  %float_vec3 = insertelement <2 x float> undef, float 3.0, i32 0
+  %float_vec3 = insertelement <2 x float> poison, float 3.0, i32 0
   %float_vec4 = insertelement <2 x float> %float_vec3, float 4.0, i32 1
   store <2 x float> %float_vec4, ptr %ptr1
 

>From 68eea1e5440139a3b23d0057987de18bdd9ffd16 Mon Sep 17 00:00:00 2001
From: chengjunp <chengjunp at nividia.com>
Date: Fri, 15 Aug 2025 19:27:59 +0000
Subject: [PATCH 3/7] Handle the cases where ld/st has different elt types

---
 llvm/lib/Transforms/Scalar/SROA.cpp           | 108 +++++++++++-------
 ...r-promotion-cannot-tree-structure-merge.ll |  35 ------
 ...ctor-promotion-via-tree-structure-merge.ll |  70 ++++++++++++
 3 files changed, 135 insertions(+), 78 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 397f44687aa6d..7a0ebf7ce6bc0 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -2679,7 +2679,32 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
   return V;
 }
 
-static Value *mergeTwoVectors(Value *V0, Value *V1, IRBuilder<> &Builder) {
+/// This function takes two vector values and combines them into a single vector
+/// by concatenating their elements. The function handles:
+///
+/// 1. Element type mismatch: If either vector's element type differs from
+///    NewAIEltTy, the function bitcasts the vector to use NewAIEltTy while
+///    preserving the total bit width (adjusting the number of elements
+///    accordingly).
+///
+/// 2. Size mismatch: After transforming the vectors to have the desired element
+///    type, if the two vectors have different numbers of elements, the smaller
+///    vector is extended with poison values to match the size of the larger
+///    vector before concatenation.
+///
+/// 3. Concatenation: The vectors are merged using a shuffle operation that
+///    places all elements of V0 first, followed by all elements of V1.
+///
+/// \param V0 The first vector to merge (must be a vector type)
+/// \param V1 The second vector to merge (must be a vector type)
+/// \param DL The data layout for size calculations
+/// \param NewAIEltTy The desired element type for the result vector
+/// \param Builder IRBuilder for creating new instructions
+/// \return A new vector containing all elements from V0 followed by all
+/// elements from V1
+static Value *mergeTwoVectors(Value *V0, Value *V1, const DataLayout &DL,
+                              Type *NewAIEltTy,
+                              IRBuilder<> &Builder) {
   assert(V0->getType()->isVectorTy() && V1->getType()->isVectorTy() &&
          "Can not merge two non-vector values");
 
@@ -2689,8 +2714,28 @@ static Value *mergeTwoVectors(Value *V0, Value *V1, IRBuilder<> &Builder) {
   auto *VecType0 = cast<FixedVectorType>(V0->getType());
   auto *VecType1 = cast<FixedVectorType>(V1->getType());
 
-  assert(VecType0->getElementType() == VecType1->getElementType() &&
-         "Can not merge two vectors with different element types");
+  // If V0/V1 element types are different from NewAIEltTy,
+  // we need to introduce bitcasts before merging them
+  auto BitcastIfNeeded = [&](Value *&V, FixedVectorType *&VecType,
+                             const char *DebugName) {
+    Type *EltType = VecType->getElementType();
+    if (EltType != NewAIEltTy) {
+      // Calculate new number of elements to maintain same bit width
+      unsigned TotalBits =
+          VecType->getNumElements() * DL.getTypeSizeInBits(EltType);
+      unsigned NewNumElts =
+          TotalBits / DL.getTypeSizeInBits(NewAIEltTy);
+
+      auto *NewVecType = FixedVectorType::get(NewAIEltTy, NewNumElts);
+      V = Builder.CreateBitCast(V, NewVecType);
+      VecType = NewVecType;
+      LLVM_DEBUG(dbgs() << "    bitcast " << DebugName << ": " << *V << "\n");
+    }
+  };
+
+  BitcastIfNeeded(V0, VecType0, "V0");
+  BitcastIfNeeded(V1, VecType1, "V1");
+
   unsigned NumElts0 = VecType0->getNumElements();
   unsigned NumElts1 = VecType1->getNumElements();
 
@@ -2923,24 +2968,19 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
       uint64_t BeginOffset;
       uint64_t EndOffset;
       Value *StoredValue;
-      TypeSize StoredTypeSize = TypeSize::getZero();
-
-      StoreInfo(StoreInst *SI, uint64_t Begin, uint64_t End, Value *Val,
-                TypeSize StoredTypeSize)
-          : Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val),
-            StoredTypeSize(StoredTypeSize) {}
+      StoreInfo(StoreInst *SI, uint64_t Begin, uint64_t End, Value *Val)
+          : Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val) {}
     };
 
     SmallVector<StoreInfo, 4> StoreInfos;
 
     // The alloca must be a fixed vector type
-    auto *AllocatedTy = NewAI.getAllocatedType();
-    if (!isa<FixedVectorType>(AllocatedTy))
+    Type *AllocatedEltTy = nullptr;
+    if (auto *FixedVecTy = dyn_cast<FixedVectorType>(NewAI.getAllocatedType()))
+      AllocatedEltTy = FixedVecTy->getElementType();
+    else
       return std::nullopt;
 
-    Slice *LoadSlice = nullptr;
-    Type *LoadElementType = nullptr;
-    Type *StoreElementType = nullptr;
     for (Slice &S : P) {
       auto *User = cast<Instruction>(S.getUse()->getUser());
       if (auto *LI = dyn_cast<LoadInst>(User)) {
@@ -2957,27 +2997,20 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
         if (DL.getTypeSizeInBits(FixedVecTy) !=
             DL.getTypeSizeInBits(NewAI.getAllocatedType()))
           return std::nullopt;
-        LoadElementType = FixedVecTy->getElementType();
         TheLoad = LI;
-        LoadSlice = &S;
       } else if (auto *SI = dyn_cast<StoreInst>(User)) {
-        // The store needs to be a fixed vector type
-        // All the stores should have the same element type
+        // The stored value should be a fixed vector type
         Type *StoredValueType = SI->getValueOperand()->getType();
-        Type *CurrentElementType = nullptr;
-        TypeSize StoredTypeSize = TypeSize::getZero();
-        if (auto *FixedVecTy = dyn_cast<FixedVectorType>(StoredValueType)) {
-          // Fixed vector type - use its element type
-          CurrentElementType = FixedVecTy->getElementType();
-          StoredTypeSize = DL.getTypeSizeInBits(FixedVecTy);
-        } else
+        if (!isa<FixedVectorType>(StoredValueType))
           return std::nullopt;
-        // Check element type consistency across all stores
-        if (StoreElementType && StoreElementType != CurrentElementType)
+        
+        // The total number of stored bits should be a multiple of the new
+        // alloca element type size
+        if (DL.getTypeSizeInBits(StoredValueType) %
+            DL.getTypeSizeInBits(AllocatedEltTy) != 0)
           return std::nullopt;
-        StoreElementType = CurrentElementType;
         StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(),
-                                SI->getValueOperand(), StoredTypeSize);
+                                SI->getValueOperand());
       } else {
         // If we have instructions other than load and store, we cannot do the
         // tree structured merge
@@ -2992,16 +3025,6 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
     if (StoreInfos.size() < 2)
       return std::nullopt;
 
-    // The load and store element types should be the same
-    if (LoadElementType != StoreElementType)
-      return std::nullopt;
-
-    // The load should cover the whole alloca
-    // TODO: maybe we can relax this constraint
-    if (!LoadSlice || LoadSlice->beginOffset() != NewAllocaBeginOffset ||
-        LoadSlice->endOffset() != NewAllocaEndOffset)
-      return std::nullopt;
-
     // Stores should not overlap and should cover the whole alloca
     // Sort by begin offset
     llvm::sort(StoreInfos, [](const StoreInfo &A, const StoreInfo &B) {
@@ -3011,7 +3034,6 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
     // Check for overlaps and coverage
     uint64_t ExpectedStart = NewAllocaBeginOffset;
     TypeSize TotalStoreBits = TypeSize::getZero();
-    Instruction *PrevStore = nullptr;
     for (auto &StoreInfo : StoreInfos) {
       uint64_t BeginOff = StoreInfo.BeginOffset;
       uint64_t EndOff = StoreInfo.EndOffset;
@@ -3021,8 +3043,8 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
         return std::nullopt;
 
       ExpectedStart = EndOff;
-      TotalStoreBits += StoreInfo.StoredTypeSize;
-      PrevStore = StoreInfo.Store;
+      TotalStoreBits +=
+          DL.getTypeSizeInBits(StoreInfo.Store->getValueOperand()->getType());
     }
     // Check that stores cover the entire alloca
     // We need check both the end offset and the total store bits
@@ -3070,7 +3092,7 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
         VecElements.pop();
         Value *V1 = VecElements.front();
         VecElements.pop();
-        Value *Merged = mergeTwoVectors(V0, V1, Builder);
+        Value *Merged = mergeTwoVectors(V0, V1, DL, AllocatedEltTy, Builder);
         LLVM_DEBUG(dbgs() << "    shufflevector: " << *Merged << "\n");
         VecElements.push(Merged);
       }
diff --git a/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll b/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll
index ab11adaa8156e..e4b106856de47 100644
--- a/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll
+++ b/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll
@@ -83,41 +83,6 @@ entry:
   ret <4 x float> %result
 }
 
-define <4 x float> @test_store_not_same_element_type() {
-entry:
-  %alloca = alloca <4 x float>
-
-  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
-  %float_vec = insertelement <2 x float> poison, float 1.0, i32 0
-  %float_vec2 = insertelement <2 x float> %float_vec, float 2.0, i32 1
-  store <2 x float> %float_vec2, ptr %ptr0
-
-  %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
-  %int_vec = insertelement <2 x i32> poison, i32 3, i32 0
-  %int_vec2 = insertelement <2 x i32> %int_vec, i32 4, i32 1
-  store <2 x i32> %int_vec2, ptr %ptr1
-
-  %result = load <4 x float>, ptr %alloca
-  ret <4 x float> %result
-}
-
-define <4 x i32> @test_load_store_different_element_type() {
-entry:
-  %alloca = alloca <4 x float>
-
-  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
-  %float_vec = insertelement <2 x float> poison, float 1.0, i32 0
-  %float_vec2 = insertelement <2 x float> %float_vec, float 2.0, i32 1
-  store <2 x float> %float_vec2, ptr %ptr0
-
-  %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
-  %float_vec3 = insertelement <2 x float> poison, float 3.0, i32 0
-  %float_vec4 = insertelement <2 x float> %float_vec3, float 4.0, i32 1
-  store <2 x float> %float_vec4, ptr %ptr1
-
-  %result = load <4 x i32>, ptr %alloca
-  ret <4 x i32> %result
-}
 
 define <4 x float> @test_no_stores() {
 entry:
diff --git a/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll b/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll
index c74b0b932ddef..83bc48b617f29 100644
--- a/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll
+++ b/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll
@@ -287,6 +287,60 @@ entry:
   ret <7 x float> %result
 }
 
+; Load and store with different element type
+define <4 x double> @load_store_different_element_type(<2 x i32> %a, <2 x float> %b, <2 x float> %c, <2 x i32> %d) {
+; CHECK-LABEL: define <4 x double> @load_store_different_element_type(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x i32> [[D:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x double>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <1 x double>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[C]] to <1 x double>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[D]] to <1 x double>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP4]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x double> [[TMP6]]
+;
+; DEBUG-LABEL: define <4 x double> @load_store_different_element_type(
+; DEBUG-SAME: <2 x i32> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x i32> [[D:%.*]]) !dbg [[DBG117:![0-9]+]] {
+; DEBUG-NEXT:  [[ENTRY:.*:]]
+; DEBUG-NEXT:      #dbg_value(ptr poison, [[META119:![0-9]+]], !DIExpression(), [[META125:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META119]], !DIExpression(), [[META125]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META120:![0-9]+]], !DIExpression(), [[META126:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META121:![0-9]+]], !DIExpression(), [[META127:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META122:![0-9]+]], !DIExpression(), [[META128:![0-9]+]])
+; DEBUG-NEXT:      #dbg_value(ptr undef, [[META123:![0-9]+]], !DIExpression(), [[META129:![0-9]+]])
+; DEBUG-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x double>, !dbg [[DBG130:![0-9]+]]
+; DEBUG-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <1 x double>, !dbg [[DBG130]]
+; DEBUG-NEXT:    [[TMP2:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>, !dbg [[DBG130]]
+; DEBUG-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[C]] to <1 x double>, !dbg [[DBG130]]
+; DEBUG-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[D]] to <1 x double>, !dbg [[DBG130]]
+; DEBUG-NEXT:    [[TMP5:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP4]], <2 x i32> <i32 0, i32 1>, !dbg [[DBG130]]
+; DEBUG-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG130]]
+; DEBUG-NEXT:      #dbg_value(<4 x double> [[TMP6]], [[META124:![0-9]+]], !DIExpression(), [[META131:![0-9]+]])
+; DEBUG-NEXT:    ret <4 x double> [[TMP6]], !dbg [[DBG132:![0-9]+]]
+;
+entry:
+  %alloca = alloca <8 x float>
+
+  ; Store the vectors at different offsets
+  %ptr0 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 0
+  store <2 x i32> %a, ptr %ptr0
+
+  %ptr1 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 2
+  store <2 x float> %b, ptr %ptr1
+
+  %ptr2 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 4
+  store <2 x float> %c, ptr %ptr2
+
+  %ptr3 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 6
+  store <2 x i32> %d, ptr %ptr3
+
+  ; Load the complete vector
+  %result = load <4 x double>, ptr %alloca
+  ret <4 x double> %result
+}
+
 ;.
 ; DEBUG: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
 ; DEBUG: [[META1]] = !DIFile(filename: "{{.*}}<stdin>", directory: {{.*}})
@@ -402,6 +456,22 @@ entry:
 ; DEBUG: [[DBG114]] = !DILocation(line: 72, column: 1, scope: [[DBG103]])
 ; DEBUG: [[META115]] = !DILocation(line: 73, column: 1, scope: [[DBG103]])
 ; DEBUG: [[DBG116]] = !DILocation(line: 74, column: 1, scope: [[DBG103]])
+; DEBUG: [[DBG117]] = distinct !DISubprogram(name: "load_store_different_element_type", linkageName: "load_store_different_element_type", scope: null, file: [[META1]], line: 75, type: [[META6]], scopeLine: 75, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META118:![0-9]+]])
+; DEBUG: [[META118]] = !{[[META119]], [[META120]], [[META121]], [[META122]], [[META123]], [[META124]]}
+; DEBUG: [[META119]] = !DILocalVariable(name: "41", scope: [[DBG117]], file: [[META1]], line: 75, type: [[META10]])
+; DEBUG: [[META120]] = !DILocalVariable(name: "42", scope: [[DBG117]], file: [[META1]], line: 76, type: [[META10]])
+; DEBUG: [[META121]] = !DILocalVariable(name: "43", scope: [[DBG117]], file: [[META1]], line: 78, type: [[META10]])
+; DEBUG: [[META122]] = !DILocalVariable(name: "44", scope: [[DBG117]], file: [[META1]], line: 80, type: [[META10]])
+; DEBUG: [[META123]] = !DILocalVariable(name: "45", scope: [[DBG117]], file: [[META1]], line: 82, type: [[META10]])
+; DEBUG: [[META124]] = !DILocalVariable(name: "46", scope: [[DBG117]], file: [[META1]], line: 84, type: [[META16]])
+; DEBUG: [[META125]] = !DILocation(line: 75, column: 1, scope: [[DBG117]])
+; DEBUG: [[META126]] = !DILocation(line: 76, column: 1, scope: [[DBG117]])
+; DEBUG: [[META127]] = !DILocation(line: 78, column: 1, scope: [[DBG117]])
+; DEBUG: [[META128]] = !DILocation(line: 80, column: 1, scope: [[DBG117]])
+; DEBUG: [[META129]] = !DILocation(line: 82, column: 1, scope: [[DBG117]])
+; DEBUG: [[DBG130]] = !DILocation(line: 83, column: 1, scope: [[DBG117]])
+; DEBUG: [[META131]] = !DILocation(line: 84, column: 1, scope: [[DBG117]])
+; DEBUG: [[DBG132]] = !DILocation(line: 85, column: 1, scope: [[DBG117]])
 ;.
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; CHECK-MODIFY-CFG: {{.*}}

>From 1d1e3d38590f3f47dd0fa667ed0e2314ea78342a Mon Sep 17 00:00:00 2001
From: chengjunp <chengjunp at nividia.com>
Date: Tue, 19 Aug 2025 04:20:04 +0000
Subject: [PATCH 4/7] Do not handle ptr cases

---
 llvm/lib/Transforms/Scalar/SROA.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 7a0ebf7ce6bc0..5a0aa3365444f 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -2980,6 +2980,10 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
       AllocatedEltTy = FixedVecTy->getElementType();
     else
       return std::nullopt;
+    // If the allocated element type is a pointer, we do not handle it
+    // TODO: handle this case by using inttoptr/ptrtoint
+    if (AllocatedEltTy->isPtrOrPtrVectorTy())
+      return std::nullopt;
 
     for (Slice &S : P) {
       auto *User = cast<Instruction>(S.getUse()->getUser());
@@ -2997,6 +3001,10 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
         if (DL.getTypeSizeInBits(FixedVecTy) !=
             DL.getTypeSizeInBits(NewAI.getAllocatedType()))
           return std::nullopt;
+        // If the loaded value is a pointer, we do not handle it
+        // TODO: handle this case by using inttoptr/ptrtoint
+        if (FixedVecTy->getElementType()->isPtrOrPtrVectorTy())
+          return std::nullopt;
         TheLoad = LI;
       } else if (auto *SI = dyn_cast<StoreInst>(User)) {
         // The stored value should be a fixed vector type
@@ -3009,6 +3017,10 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
         if (DL.getTypeSizeInBits(StoredValueType) %
             DL.getTypeSizeInBits(AllocatedEltTy) != 0)
           return std::nullopt;
+        // If the stored value is a pointer, we do not handle it
+        // TODO: handle this case by using inttoptr/ptrtoint
+        if (StoredValueType->isPtrOrPtrVectorTy())
+          return std::nullopt;
         StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(),
                                 SI->getValueOperand());
       } else {

>From a8849460978d33eec56560bd12683c7971d237a3 Mon Sep 17 00:00:00 2001
From: chengjunp <chengjunp at nividia.com>
Date: Tue, 19 Aug 2025 04:23:39 +0000
Subject: [PATCH 5/7] format

---
 llvm/lib/Transforms/Scalar/SROA.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 5a0aa3365444f..9c5c3b3d50555 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -2703,8 +2703,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
 /// \return A new vector containing all elements from V0 followed by all
 /// elements from V1
 static Value *mergeTwoVectors(Value *V0, Value *V1, const DataLayout &DL,
-                              Type *NewAIEltTy,
-                              IRBuilder<> &Builder) {
+                              Type *NewAIEltTy, IRBuilder<> &Builder) {
   assert(V0->getType()->isVectorTy() && V1->getType()->isVectorTy() &&
          "Can not merge two non-vector values");
 
@@ -2723,8 +2722,7 @@ static Value *mergeTwoVectors(Value *V0, Value *V1, const DataLayout &DL,
       // Calculate new number of elements to maintain same bit width
       unsigned TotalBits =
           VecType->getNumElements() * DL.getTypeSizeInBits(EltType);
-      unsigned NewNumElts =
-          TotalBits / DL.getTypeSizeInBits(NewAIEltTy);
+      unsigned NewNumElts = TotalBits / DL.getTypeSizeInBits(NewAIEltTy);
 
       auto *NewVecType = FixedVectorType::get(NewAIEltTy, NewNumElts);
       V = Builder.CreateBitCast(V, NewVecType);
@@ -3011,11 +3009,12 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
         Type *StoredValueType = SI->getValueOperand()->getType();
         if (!isa<FixedVectorType>(StoredValueType))
           return std::nullopt;
-        
+
         // The total number of stored bits should be the multiple of the new
         // alloca element type size
         if (DL.getTypeSizeInBits(StoredValueType) %
-            DL.getTypeSizeInBits(AllocatedEltTy) != 0)
+                DL.getTypeSizeInBits(AllocatedEltTy) !=
+            0)
           return std::nullopt;
         // If the stored value is a pointer, we do not handle it
         // TODO: handle this case by using inttoptr/ptrtoint

>From 3146a3b65467814fb5dce0a8043837f74e8ef34e Mon Sep 17 00:00:00 2001
From: chengjunp <chengjunp at nividia.com>
Date: Wed, 27 Aug 2025 20:07:37 +0000
Subject: [PATCH 6/7] Fix bugs and update tests

---
 llvm/lib/Transforms/Scalar/SROA.cpp           |  91 ++--
 ...r-promotion-cannot-tree-structure-merge.ll |  99 +++--
 ...ctor-promotion-via-tree-structure-merge.ll | 403 +++++-------------
 3 files changed, 220 insertions(+), 373 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index c24655e7492d9..aeea2d31c7a4e 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -2693,9 +2693,6 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
 /// elements from V1
 static Value *mergeTwoVectors(Value *V0, Value *V1, const DataLayout &DL,
                               Type *NewAIEltTy, IRBuilder<> &Builder) {
-  assert(V0->getType()->isVectorTy() && V1->getType()->isVectorTy() &&
-         "Can not merge two non-vector values");
-
   // V0 and V1 are vectors
   // Create a new vector type with combined elements
   // Use ShuffleVector to concatenate the vectors
@@ -2737,18 +2734,15 @@ static Value *mergeTwoVectors(Value *V0, Value *V1, const DataLayout &DL,
     unsigned SmallSize = std::min(NumElts0, NumElts1);
     unsigned LargeSize = std::max(NumElts0, NumElts1);
     bool IsV0Smaller = NumElts0 < NumElts1;
-    Value *SmallVec = IsV0Smaller ? V0 : V1;
-
+    Value *&ExtendedVec = IsV0Smaller ? V0 : V1;
     SmallVector<int, 16> ExtendMask;
     for (unsigned i = 0; i < SmallSize; ++i)
       ExtendMask.push_back(i);
     for (unsigned i = SmallSize; i < LargeSize; ++i)
       ExtendMask.push_back(PoisonMaskElem);
-    Value *ExtendedVec = Builder.CreateShuffleVector(
-        SmallVec, PoisonValue::get(SmallVec->getType()), ExtendMask);
+    ExtendedVec = Builder.CreateShuffleVector(
+        ExtendedVec, PoisonValue::get(ExtendedVec->getType()), ExtendMask);
     LLVM_DEBUG(dbgs() << "    shufflevector: " << *ExtendedVec << "\n");
-    V0 = IsV0Smaller ? ExtendedVec : V0;
-    V1 = IsV0Smaller ? V1 : ExtendedVec;
     for (unsigned i = 0; i < NumElts0; ++i)
       ShuffleMask.push_back(i);
     for (unsigned i = 0; i < NumElts1; ++i)
@@ -2961,53 +2955,45 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
 
     SmallVector<StoreInfo, 4> StoreInfos;
 
-    // The alloca must be a fixed vector type
-    Type *AllocatedEltTy = nullptr;
-    if (auto *FixedVecTy = dyn_cast<FixedVectorType>(NewAI.getAllocatedType()))
-      AllocatedEltTy = FixedVecTy->getElementType();
-    else
-      return std::nullopt;
-    // If the allocated element type is a pointer, we do not handle it
-    // TODO: handle this case by using inttoptr/ptrtoint
-    if (AllocatedEltTy->isPtrOrPtrVectorTy())
-      return std::nullopt;
+    // If the new alloca is a fixed vector type, use its element type as the
+    // allocated element type; otherwise, use i8 as the allocated element type.
+    Type *AllocatedEltTy =
+        isa<FixedVectorType>(NewAI.getAllocatedType())
+            ? cast<FixedVectorType>(NewAI.getAllocatedType())->getElementType()
+            : Type::getInt8Ty(NewAI.getContext());
+
+    // Helper to check that a type satisfies all of the following:
+    //  1. It is a fixed vector type
+    //  2. Its element type is not a pointer
+    //  3. Its element type size is a multiple of 8 bits
+    // We only handle loads and stores whose types meet these conditions
+    auto IsTypeValidForTreeStructuredMerge = [&](Type *Ty) -> bool {
+      auto *FixedVecTy = dyn_cast<FixedVectorType>(Ty);
+      return FixedVecTy &&
+             DL.getTypeSizeInBits(FixedVecTy->getElementType()) % 8 == 0 &&
+             !FixedVecTy->getElementType()->isPointerTy();
+    };
 
     for (Slice &S : P) {
       auto *User = cast<Instruction>(S.getUse()->getUser());
       if (auto *LI = dyn_cast<LoadInst>(User)) {
-        // Do not handle the case where there is more than one load
-        // TODO: maybe we can handle this case
-        if (TheLoad)
-          return std::nullopt;
-        // If load is not a fixed vector type, we do not handle it
-        // If the number of loaded bits is not the same as the new alloca type
-        // size, we do not handle it
-        auto *FixedVecTy = dyn_cast<FixedVectorType>(LI->getType());
-        if (!FixedVecTy)
-          return std::nullopt;
-        if (DL.getTypeSizeInBits(FixedVecTy) !=
-            DL.getTypeSizeInBits(NewAI.getAllocatedType()))
-          return std::nullopt;
-        // If the loaded value is a pointer, we do not handle it
-        // TODO: handle this case by using inttoptr/ptrtoint
-        if (FixedVecTy->getElementType()->isPtrOrPtrVectorTy())
+        // Do not handle the case if any of the following holds:
+        //   1. There is more than one load
+        //   2. The load is volatile
+        //   3. The load does not read the entire alloca structure
+        //   4. The load does not meet the conditions in the helper function
+        if (TheLoad || !IsTypeValidForTreeStructuredMerge(LI->getType()) ||
+            S.beginOffset() != NewAllocaBeginOffset ||
+            S.endOffset() != NewAllocaEndOffset ||
+            LI->isVolatile())
           return std::nullopt;
         TheLoad = LI;
       } else if (auto *SI = dyn_cast<StoreInst>(User)) {
-        // The stored value should be a fixed vector type
-        Type *StoredValueType = SI->getValueOperand()->getType();
-        if (!isa<FixedVectorType>(StoredValueType))
-          return std::nullopt;
-
-        // The total number of stored bits should be the multiple of the new
-        // alloca element type size
-        if (DL.getTypeSizeInBits(StoredValueType) %
-                DL.getTypeSizeInBits(AllocatedEltTy) !=
-            0)
-          return std::nullopt;
-        // If the stored value is a pointer, we do not handle it
-        // TODO: handle this case by using inttoptr/ptrtoint
-        if (StoredValueType->isPtrOrPtrVectorTy())
+        // Do not handle the case if any of the following holds:
+        //   1. The store does not meet the conditions in the helper function
+        //   2. The store is volatile
+        if (!IsTypeValidForTreeStructuredMerge(SI->getValueOperand()->getType()) ||
+            SI->isVolatile())
           return std::nullopt;
         StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(),
                                 SI->getValueOperand());
@@ -3033,7 +3019,6 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
 
     // Check for overlaps and coverage
     uint64_t ExpectedStart = NewAllocaBeginOffset;
-    TypeSize TotalStoreBits = TypeSize::getZero();
     for (auto &StoreInfo : StoreInfos) {
       uint64_t BeginOff = StoreInfo.BeginOffset;
       uint64_t EndOff = StoreInfo.EndOffset;
@@ -3043,13 +3028,9 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
         return std::nullopt;
 
       ExpectedStart = EndOff;
-      TotalStoreBits +=
-          DL.getTypeSizeInBits(StoreInfo.Store->getValueOperand()->getType());
     }
     // Check that stores cover the entire alloca
-    // We need check both the end offset and the total store bits
-    if (ExpectedStart != NewAllocaEndOffset ||
-        TotalStoreBits != DL.getTypeSizeInBits(NewAI.getAllocatedType()))
+    if (ExpectedStart != NewAllocaEndOffset)
       return std::nullopt;
 
     // Stores should be in the same basic block
diff --git a/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll b/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll
index e4b106856de47..c858d071451e8 100644
--- a/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll
+++ b/llvm/test/Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll
@@ -21,12 +21,12 @@ entry:
 
 define <4 x float> @test_more_than_one_load(<2 x float> %a, <2 x float> %b) {
 entry:
-  %alloca = alloca <4 x float>
+  %alloca = alloca [4 x float]
 
-  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+  %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0
   store <2 x float> %a, ptr %ptr0
 
-  %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+  %ptr1 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 2
   store <2 x float> %b, ptr %ptr1
 
   %result1 = load <4 x float>, ptr %alloca
@@ -38,19 +38,19 @@ entry:
 
 define void @test_no_load(<4 x float> %a) {
 entry:
-  %alloca = alloca <4 x float>
+  %alloca = alloca [4 x float]
   store <4 x float> %a, ptr %alloca
   ret void
 }
 
 define i32 @test_load_not_fixed_vector(<2 x float> %a, <2 x float> %b) {
 entry:
-  %alloca = alloca <4 x float>
+  %alloca = alloca [4 x float]
 
-  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+  %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0
   store <2 x float> %a, ptr %ptr0
 
-  %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+  %ptr1 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 2
   store <2 x float> %b, ptr %ptr1
 
   %result = load i32, ptr %alloca
@@ -59,12 +59,12 @@ entry:
 
 define <3 x float> @test_load_not_covering_alloca(<2 x float> %a, <2 x float> %b) {
 entry:
-  %alloca = alloca <4 x float>
+  %alloca = alloca [4 x float]
 
-  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+  %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0
   store <2 x float> %a, ptr %ptr0
 
-  %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+  %ptr1 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 2
   store <2 x float> %b, ptr %ptr1
 
   %result = load <3 x float>, ptr %ptr0
@@ -73,9 +73,9 @@ entry:
 
 define <4 x float> @test_store_not_fixed_vector(<vscale x 2 x float> %a) {
 entry:
-  %alloca = alloca <4 x float>
+  %alloca = alloca [4 x float]
 
-  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+  %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0
   %fixed = extractelement <vscale x 2 x float> %a, i32 0
   store float %fixed, ptr %ptr0
 
@@ -86,7 +86,7 @@ entry:
 
 define <4 x float> @test_no_stores() {
 entry:
-  %alloca = alloca <4 x float>
+  %alloca = alloca [4 x float]
 
   %result = load <4 x float>, ptr %alloca
   ret <4 x float> %result
@@ -94,15 +94,15 @@ entry:
 
 define <4 x float> @test_stores_overlapping(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
 entry:
-  %alloca = alloca <4 x float>
+  %alloca = alloca [4 x float]
 
-  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+  %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0
   store <2 x float> %a, ptr %ptr0
 
-  %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 1
+  %ptr1 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 1
   store <2 x float> %b, ptr %ptr1
 
-  %ptr2 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+  %ptr2 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 2
   store <2 x float> %c, ptr %ptr2
 
   %result = load <4 x float>, ptr %alloca
@@ -111,9 +111,9 @@ entry:
 
 define <4 x float> @test_stores_not_covering_alloca(<2 x float> %a) {
 entry:
-  %alloca = alloca <4 x float>
+  %alloca = alloca [4 x float]
 
-  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+  %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0
   store <2 x float> %a, ptr %ptr0
 
   %result = load <4 x float>, ptr %alloca
@@ -122,15 +122,15 @@ entry:
 
 define <4 x float> @test_stores_not_same_basic_block(<2 x float> %a, <2 x float> %b, i1 %cond) {
 entry:
-  %alloca = alloca <4 x float>
+  %alloca = alloca [4 x float]
 
-  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+  %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0
   store <2 x float> %a, ptr %ptr0
 
   br i1 %cond, label %then, label %else
 
 then:
-  %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+  %ptr1 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 2
   store <2 x float> %b, ptr %ptr1
   br label %merge
 
@@ -144,14 +144,14 @@ merge:
 
 define <4 x float> @test_load_before_stores(<2 x float> %a, <2 x float> %b) {
 entry:
-  %alloca = alloca <4 x float>
+  %alloca = alloca [4 x float]
 
-  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+  %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0
   store <2 x float> %a, ptr %ptr0
 
   %intermediate = load <4 x float>, ptr %alloca
 
-  %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+  %ptr1 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 2
   store <2 x float> %b, ptr %ptr1
 
   ret <4 x float> %intermediate
@@ -159,21 +159,64 @@ entry:
 
 define <4 x float> @test_other_instructions(<2 x float> %a, <2 x float> %b) {
 entry:
-  %alloca = alloca <4 x float>
+  %alloca = alloca [4 x float]
   
   ; Store first vector
-  %ptr0 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 0
+  %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0
   store <2 x float> %a, ptr %ptr0
   
   ; Other instruction (memset) that's not a simple load/store
   call void @llvm.memset.p0.i64(ptr %alloca, i8 0, i64 8, i1 false)
   
   ; Store second vector
-  %ptr1 = getelementptr inbounds <4 x float>, ptr %alloca, i32 0, i32 2
+  %ptr1 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 2
   store <2 x float> %b, ptr %ptr1
   
   %result = load <4 x float>, ptr %alloca
   ret <4 x float> %result
 }
 
+define <4 x float> @volatile_stores(<2 x i32> %a, <2 x i32> %b) {
+entry:
+  %alloca = alloca [4 x float]
+
+  %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0
+  store volatile <2 x i32> %a, ptr %ptr0
+
+  %ptr1 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 2
+  store volatile <2 x i32> %b, ptr %ptr1
+
+  %result = load <4 x float>, ptr %alloca
+  ret <4 x float> %result
+}
+
+define <4 x float> @volatile_loads(<2 x i32> %a, <2 x i32> %b) {
+entry:
+  %alloca = alloca [4 x float]
+
+  %ptr0 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 0
+  store <2 x i32> %a, ptr %ptr0
+
+  %ptr1 = getelementptr inbounds [4 x float], ptr %alloca, i32 0, i32 2
+  store <2 x i32> %b, ptr %ptr1
+
+  %result = load volatile <4 x float>, ptr %alloca
+  ret <4 x float> %result
+}
+
+define <4 x i15> @non_byte_aligned_alloca(<2 x i15> %a, <2 x i15> %b) {
+entry:
+  %alloca = alloca [4 x i15]
+
+  %ptr0 = getelementptr inbounds [4 x i15], ptr %alloca, i32 0, i32 0
+  store <2 x i15> %a, ptr %ptr0
+
+  %ptr1 = getelementptr inbounds [4 x i15], ptr %alloca, i32 0, i32 2
+  store <2 x i15> %b, ptr %ptr1
+
+  %result = load <4 x i15>, ptr %alloca
+  ret <4 x i15> %result
+
+}
+
 declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
diff --git a/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll b/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll
index 83bc48b617f29..8bfe0bb83051e 100644
--- a/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll
+++ b/llvm/test/Transforms/SROA/vector-promotion-via-tree-structure-merge.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -passes='sroa<preserve-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG
 ; RUN: opt < %s -passes='sroa<modify-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG
-; RUN: opt < %s -passes=debugify,sroa -S | FileCheck %s --check-prefix=DEBUG
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
 
 ; Basic tree-structured merge: 4 stores of <2 x float> into <8 x float>
@@ -14,37 +13,21 @@ define <8 x float> @basic_tree_merge(<2 x float> %a, <2 x float> %b, <2 x float>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x float> [[TMP2]]
 ;
-; DEBUG-LABEL: define <8 x float> @basic_tree_merge(
-; DEBUG-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x float> [[D:%.*]]) !dbg [[DBG5:![0-9]+]] {
-; DEBUG-NEXT:  [[ENTRY:.*:]]
-; DEBUG-NEXT:      #dbg_value(ptr poison, [[META9:![0-9]+]], !DIExpression(), [[META17:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META11:![0-9]+]], !DIExpression(), [[META18:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META12:![0-9]+]], !DIExpression(), [[META19:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META13:![0-9]+]], !DIExpression(), [[META20:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META14:![0-9]+]], !DIExpression(), [[META21:![0-9]+]])
-; DEBUG-NEXT:    [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG22:![0-9]+]]
-; DEBUG-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> [[D]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG22]]
-; DEBUG-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, !dbg [[DBG22]]
-; DEBUG-NEXT:      #dbg_value(<8 x float> [[TMP2]], [[META15:![0-9]+]], !DIExpression(), [[META23:![0-9]+]])
-; DEBUG-NEXT:    ret <8 x float> [[TMP2]], !dbg [[DBG24:![0-9]+]]
-;
 entry:
-  %alloca = alloca <8 x float>
+  %alloca = alloca [8 x float]
 
-  ; Store the vectors at different offsets
-  %ptr0 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 0
+  %ptr0 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 0
   store <2 x float> %a, ptr %ptr0
 
-  %ptr1 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 2
+  %ptr1 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 2
   store <2 x float> %b, ptr %ptr1
 
-  %ptr2 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 4
+  %ptr2 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 4
   store <2 x float> %c, ptr %ptr2
 
-  %ptr3 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 6
+  %ptr3 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 6
   store <2 x float> %d, ptr %ptr3
 
-  ; Load the complete vector
   %result = load <8 x float>, ptr %alloca
   ret <8 x float> %result
 }
@@ -59,42 +42,24 @@ define void @multiple_partitions(<2 x float> %a, <2 x float> %b, <2 x float> %c,
 ; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[F]], align 16
 ; CHECK-NEXT:    ret void
 ;
-; DEBUG-LABEL: define void @multiple_partitions(
-; DEBUG-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x float> [[D:%.*]], ptr [[E:%.*]], ptr [[F:%.*]]) !dbg [[DBG25:![0-9]+]] {
-; DEBUG-NEXT:  [[ENTRY:.*:]]
-; DEBUG-NEXT:      #dbg_value(ptr poison, [[META27:![0-9]+]], !DIExpression(), [[META36:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META27]], !DIExpression(), [[META36]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META28:![0-9]+]], !DIExpression(), [[META37:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META29:![0-9]+]], !DIExpression(), [[META38:![0-9]+]])
-; DEBUG-NEXT:    [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG39:![0-9]+]]
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META30:![0-9]+]], !DIExpression(), [[META40:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META31:![0-9]+]], !DIExpression(), [[META41:![0-9]+]])
-; DEBUG-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> [[D]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG42:![0-9]+]]
-; DEBUG-NEXT:      #dbg_value(<4 x float> [[TMP0]], [[META32:![0-9]+]], !DIExpression(), [[META43:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META34:![0-9]+]], !DIExpression(), [[META44:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(<4 x float> [[TMP1]], [[META35:![0-9]+]], !DIExpression(), [[META45:![0-9]+]])
-; DEBUG-NEXT:    store <4 x float> [[TMP0]], ptr [[E]], align 16, !dbg [[DBG46:![0-9]+]]
-; DEBUG-NEXT:    store <4 x float> [[TMP1]], ptr [[F]], align 16, !dbg [[DBG47:![0-9]+]]
-; DEBUG-NEXT:    ret void, !dbg [[DBG48:![0-9]+]]
-;
 entry:
-  %alloca = alloca <8 x float>
+  %alloca = alloca [8 x float]
 
-  %ptr0 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 0
+  %ptr0 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 0
   store <2 x float> %a, ptr %ptr0
 
-  %ptr1 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 2
+  %ptr1 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 2
   store <2 x float> %b, ptr %ptr1
 
-  %ptr2 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 4
+  %ptr2 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 4
   store <2 x float> %c, ptr %ptr2
 
-  %ptr3 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 6
+  %ptr3 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 6
   store <2 x float> %d, ptr %ptr3
 
   %result1 = load <4 x float>, ptr %alloca
 
-  %ptr_offset4 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 4
+  %ptr_offset4 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 4
   %result2 = load <4 x float>, ptr %ptr_offset4
 
   store <4 x float> %result1, ptr %e
@@ -113,34 +78,19 @@ define <8 x i32> @out_of_order_stores(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c,
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
 ;
-; DEBUG-LABEL: define <8 x i32> @out_of_order_stores(
-; DEBUG-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], <2 x i32> [[D:%.*]]) !dbg [[DBG49:![0-9]+]] {
-; DEBUG-NEXT:  [[ENTRY:.*:]]
-; DEBUG-NEXT:      #dbg_value(ptr poison, [[META51:![0-9]+]], !DIExpression(), [[META57:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META52:![0-9]+]], !DIExpression(), [[META58:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META53:![0-9]+]], !DIExpression(), [[META59:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META54:![0-9]+]], !DIExpression(), [[META60:![0-9]+]])
-; DEBUG-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG61:![0-9]+]]
-; DEBUG-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> [[D]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG61]]
-; DEBUG-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, !dbg [[DBG61]]
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META55:![0-9]+]], !DIExpression(), [[META62:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(<8 x i32> [[TMP2]], [[META56:![0-9]+]], !DIExpression(), [[META63:![0-9]+]])
-; DEBUG-NEXT:    ret <8 x i32> [[TMP2]], !dbg [[DBG64:![0-9]+]]
-;
 entry:
-  %alloca = alloca <8 x i32>
+  %alloca = alloca [8 x i32]
 
-  ; Store out of order
-  %ptr2 = getelementptr inbounds <8 x i32>, ptr %alloca, i32 0, i32 4
+  %ptr2 = getelementptr inbounds [8 x i32], ptr %alloca, i32 0, i32 4
   store <2 x i32> %c, ptr %ptr2
 
-  %ptr0 = getelementptr inbounds <8 x i32>, ptr %alloca, i32 0, i32 0
+  %ptr0 = getelementptr inbounds [8 x i32], ptr %alloca, i32 0, i32 0
   store <2 x i32> %a, ptr %ptr0
 
-  %ptr3 = getelementptr inbounds <8 x i32>, ptr %alloca, i32 0, i32 6
+  %ptr3 = getelementptr inbounds [8 x i32], ptr %alloca, i32 0, i32 6
   store <2 x i32> %d, ptr %ptr3
 
-  %ptr1 = getelementptr inbounds <8 x i32>, ptr %alloca, i32 0, i32 2
+  %ptr1 = getelementptr inbounds [8 x i32], ptr %alloca, i32 0, i32 2
   store <2 x i32> %b, ptr %ptr1
 
   %result = load <8 x i32>, ptr %alloca
@@ -161,46 +111,24 @@ define <8 x i16> @single_element_stores(<1 x i16> %a, <1 x i16> %b, <1 x i16> %c
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x i16> [[TMP6]]
 ;
-; DEBUG-LABEL: define <8 x i16> @single_element_stores(
-; DEBUG-SAME: <1 x i16> [[A:%.*]], <1 x i16> [[B:%.*]], <1 x i16> [[C:%.*]], <1 x i16> [[D:%.*]], <1 x i16> [[E:%.*]], <1 x i16> [[F:%.*]], <1 x i16> [[G:%.*]], <1 x i16> [[H:%.*]]) !dbg [[DBG65:![0-9]+]] {
-; DEBUG-NEXT:  [[ENTRY:.*:]]
-; DEBUG-NEXT:      #dbg_value(ptr poison, [[META67:![0-9]+]], !DIExpression(), [[META77:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META68:![0-9]+]], !DIExpression(), [[META78:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META69:![0-9]+]], !DIExpression(), [[META79:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META70:![0-9]+]], !DIExpression(), [[META80:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META71:![0-9]+]], !DIExpression(), [[META81:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META72:![0-9]+]], !DIExpression(), [[META82:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META73:![0-9]+]], !DIExpression(), [[META83:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META74:![0-9]+]], !DIExpression(), [[META84:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META75:![0-9]+]], !DIExpression(), [[META85:![0-9]+]])
-; DEBUG-NEXT:    [[TMP0:%.*]] = shufflevector <1 x i16> [[A]], <1 x i16> [[B]], <2 x i32> <i32 0, i32 1>, !dbg [[DBG86:![0-9]+]]
-; DEBUG-NEXT:    [[TMP1:%.*]] = shufflevector <1 x i16> [[C]], <1 x i16> [[D]], <2 x i32> <i32 0, i32 1>, !dbg [[DBG86]]
-; DEBUG-NEXT:    [[TMP2:%.*]] = shufflevector <1 x i16> [[E]], <1 x i16> [[F]], <2 x i32> <i32 0, i32 1>, !dbg [[DBG86]]
-; DEBUG-NEXT:    [[TMP3:%.*]] = shufflevector <1 x i16> [[G]], <1 x i16> [[H]], <2 x i32> <i32 0, i32 1>, !dbg [[DBG86]]
-; DEBUG-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG86]]
-; DEBUG-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG86]]
-; DEBUG-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, !dbg [[DBG86]]
-; DEBUG-NEXT:      #dbg_value(<8 x i16> [[TMP6]], [[META76:![0-9]+]], !DIExpression(), [[META87:![0-9]+]])
-; DEBUG-NEXT:    ret <8 x i16> [[TMP6]], !dbg [[DBG88:![0-9]+]]
-;
 entry:
-  %alloca = alloca <8 x i16>
+  %alloca = alloca [8 x i16]
 
-  %ptr0 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 0
+  %ptr0 = getelementptr inbounds [8 x i16], ptr %alloca, i32 0, i32 0
   store <1 x i16> %a, ptr %ptr0
-  %ptr1 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 1
+  %ptr1 = getelementptr inbounds [8 x i16], ptr %alloca, i32 0, i32 1
   store <1 x i16> %b, ptr %ptr1
-  %ptr2 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 2
+  %ptr2 = getelementptr inbounds [8 x i16], ptr %alloca, i32 0, i32 2
   store <1 x i16> %c, ptr %ptr2
-  %ptr3 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 3
+  %ptr3 = getelementptr inbounds [8 x i16], ptr %alloca, i32 0, i32 3
   store <1 x i16> %d, ptr %ptr3
-  %ptr4 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 4
+  %ptr4 = getelementptr inbounds [8 x i16], ptr %alloca, i32 0, i32 4
   store <1 x i16> %e, ptr %ptr4
-  %ptr5 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 5
+  %ptr5 = getelementptr inbounds [8 x i16], ptr %alloca, i32 0, i32 5
   store <1 x i16> %f, ptr %ptr5
-  %ptr6 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 6
+  %ptr6 = getelementptr inbounds [8 x i16], ptr %alloca, i32 0, i32 6
   store <1 x i16> %g, ptr %ptr6
-  %ptr7 = getelementptr inbounds <8 x i16>, ptr %alloca, i32 0, i32 7
+  %ptr7 = getelementptr inbounds [8 x i16], ptr %alloca, i32 0, i32 7
   store <1 x i16> %h, ptr %ptr7
 
   %result = load <8 x i16>, ptr %alloca
@@ -217,29 +145,16 @@ define <6 x float> @non_power_of_2(<2 x float> %a, <2 x float> %b, <2 x float> %
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
 ; CHECK-NEXT:    ret <6 x float> [[TMP2]]
 ;
-; DEBUG-LABEL: define <6 x float> @non_power_of_2(
-; DEBUG-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]]) !dbg [[DBG89:![0-9]+]] {
-; DEBUG-NEXT:  [[ENTRY:.*:]]
-; DEBUG-NEXT:      #dbg_value(ptr poison, [[META91:![0-9]+]], !DIExpression(), [[META96:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META92:![0-9]+]], !DIExpression(), [[META97:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META93:![0-9]+]], !DIExpression(), [[META98:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META94:![0-9]+]], !DIExpression(), [[META99:![0-9]+]])
-; DEBUG-NEXT:    [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG100:![0-9]+]]
-; DEBUG-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>, !dbg [[DBG100]]
-; DEBUG-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>, !dbg [[DBG100]]
-; DEBUG-NEXT:      #dbg_value(<6 x float> [[TMP2]], [[META95:![0-9]+]], !DIExpression(), [[META101:![0-9]+]])
-; DEBUG-NEXT:    ret <6 x float> [[TMP2]], !dbg [[DBG102:![0-9]+]]
-;
 entry:
-  %alloca = alloca <6 x float>
+  %alloca = alloca [6 x float]
 
-  %ptr0 = getelementptr inbounds <6 x float>, ptr %alloca, i32 0, i32 0
+  %ptr0 = getelementptr inbounds [6 x float], ptr %alloca, i32 0, i32 0
   store <2 x float> %a, ptr %ptr0
 
-  %ptr1 = getelementptr inbounds <6 x float>, ptr %alloca, i32 0, i32 2
+  %ptr1 = getelementptr inbounds [6 x float], ptr %alloca, i32 0, i32 2
   store <2 x float> %b, ptr %ptr1
 
-  %ptr2 = getelementptr inbounds <6 x float>, ptr %alloca, i32 0, i32 4
+  %ptr2 = getelementptr inbounds [6 x float], ptr %alloca, i32 0, i32 4
   store <2 x float> %c, ptr %ptr2
 
   %result = load <6 x float>, ptr %alloca
@@ -257,30 +172,16 @@ define <7 x float> @store_with_different_size_of_vectors(<1 x float> %a, <4 x fl
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> [[TMP2]], <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    ret <7 x float> [[TMP3]]
 ;
-; DEBUG-LABEL: define <7 x float> @store_with_different_size_of_vectors(
-; DEBUG-SAME: <1 x float> [[A:%.*]], <4 x float> [[B:%.*]], <2 x float> [[C:%.*]]) !dbg [[DBG103:![0-9]+]] {
-; DEBUG-NEXT:  [[ENTRY:.*:]]
-; DEBUG-NEXT:      #dbg_value(ptr poison, [[META105:![0-9]+]], !DIExpression(), [[META110:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META106:![0-9]+]], !DIExpression(), [[META111:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META107:![0-9]+]], !DIExpression(), [[META112:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META108:![0-9]+]], !DIExpression(), [[META113:![0-9]+]])
-; DEBUG-NEXT:    [[TMP0:%.*]] = shufflevector <1 x float> [[A]], <1 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, !dbg [[DBG114:![0-9]+]]
-; DEBUG-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[B]], <5 x i32> <i32 0, i32 4, i32 5, i32 6, i32 7>, !dbg [[DBG114]]
-; DEBUG-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <5 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison>, !dbg [[DBG114]]
-; DEBUG-NEXT:    [[TMP3:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> [[TMP2]], <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>, !dbg [[DBG114]]
-; DEBUG-NEXT:      #dbg_value(<7 x float> [[TMP3]], [[META109:![0-9]+]], !DIExpression(), [[META115:![0-9]+]])
-; DEBUG-NEXT:    ret <7 x float> [[TMP3]], !dbg [[DBG116:![0-9]+]]
-;
 entry:
-  %alloca = alloca <7 x float>
+  %alloca = alloca [7 x float]
 
-  %ptr0 = getelementptr inbounds <7 x float>, ptr %alloca, i32 0, i32 0
+  %ptr0 = getelementptr inbounds [7 x float], ptr %alloca, i32 0, i32 0
   store <1 x float> %a, ptr %ptr0
 
-  %ptr1 = getelementptr inbounds <7 x float>, ptr %alloca, i32 0, i32 1
+  %ptr1 = getelementptr inbounds [7 x float], ptr %alloca, i32 0, i32 1
   store <4 x float> %b, ptr %ptr1
 
-  %ptr2 = getelementptr inbounds <7 x float>, ptr %alloca, i32 0, i32 5
+  %ptr2 = getelementptr inbounds [7 x float], ptr %alloca, i32 0, i32 5
   store <2 x float> %c, ptr %ptr2
 
   %result = load <7 x float>, ptr %alloca
@@ -301,178 +202,100 @@ define <4 x double> @load_store_different_element_type(<2 x i32> %a, <2 x float>
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    ret <4 x double> [[TMP6]]
 ;
-; DEBUG-LABEL: define <4 x double> @load_store_different_element_type(
-; DEBUG-SAME: <2 x i32> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x i32> [[D:%.*]]) !dbg [[DBG117:![0-9]+]] {
-; DEBUG-NEXT:  [[ENTRY:.*:]]
-; DEBUG-NEXT:      #dbg_value(ptr poison, [[META119:![0-9]+]], !DIExpression(), [[META125:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META119]], !DIExpression(), [[META125]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META120:![0-9]+]], !DIExpression(), [[META126:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META121:![0-9]+]], !DIExpression(), [[META127:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META122:![0-9]+]], !DIExpression(), [[META128:![0-9]+]])
-; DEBUG-NEXT:      #dbg_value(ptr undef, [[META123:![0-9]+]], !DIExpression(), [[META129:![0-9]+]])
-; DEBUG-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x double>, !dbg [[DBG130:![0-9]+]]
-; DEBUG-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <1 x double>, !dbg [[DBG130]]
-; DEBUG-NEXT:    [[TMP2:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>, !dbg [[DBG130]]
-; DEBUG-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[C]] to <1 x double>, !dbg [[DBG130]]
-; DEBUG-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[D]] to <1 x double>, !dbg [[DBG130]]
-; DEBUG-NEXT:    [[TMP5:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP4]], <2 x i32> <i32 0, i32 1>, !dbg [[DBG130]]
-; DEBUG-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG130]]
-; DEBUG-NEXT:      #dbg_value(<4 x double> [[TMP6]], [[META124:![0-9]+]], !DIExpression(), [[META131:![0-9]+]])
-; DEBUG-NEXT:    ret <4 x double> [[TMP6]], !dbg [[DBG132:![0-9]+]]
-;
 entry:
-  %alloca = alloca <8 x float>
+  %alloca = alloca [8 x float]
 
-  ; Store the vectors at different offsets
-  %ptr0 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 0
+  %ptr0 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 0
   store <2 x i32> %a, ptr %ptr0
 
-  %ptr1 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 2
+  %ptr1 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 2
   store <2 x float> %b, ptr %ptr1
 
-  %ptr2 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 4
+  %ptr2 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 4
   store <2 x float> %c, ptr %ptr2
 
-  %ptr3 = getelementptr inbounds <8 x float>, ptr %alloca, i32 0, i32 6
+  %ptr3 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 6
   store <2 x i32> %d, ptr %ptr3
 
-  ; Load the complete vector
   %result = load <4 x double>, ptr %alloca
   ret <4 x double> %result
 }
 
-;.
-; DEBUG: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
-; DEBUG: [[META1]] = !DIFile(filename: "{{.*}}<stdin>", directory: {{.*}})
-; DEBUG: [[DBG5]] = distinct !DISubprogram(name: "basic_tree_merge", linkageName: "basic_tree_merge", scope: null, file: [[META1]], line: 1, type: [[META6:![0-9]+]], scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META8:![0-9]+]])
-; DEBUG: [[META6]] = !DISubroutineType(types: [[META7:![0-9]+]])
-; DEBUG: [[META7]] = !{}
-; DEBUG: [[META8]] = !{[[META9]], [[META11]], [[META12]], [[META13]], [[META14]], [[META15]]}
-; DEBUG: [[META9]] = !DILocalVariable(name: "1", scope: [[DBG5]], file: [[META1]], line: 1, type: [[META10:![0-9]+]])
-; DEBUG: [[META10]] = !DIBasicType(name: "ty64", size: 64, encoding: DW_ATE_unsigned)
-; DEBUG: [[META11]] = !DILocalVariable(name: "2", scope: [[DBG5]], file: [[META1]], line: 2, type: [[META10]])
-; DEBUG: [[META12]] = !DILocalVariable(name: "3", scope: [[DBG5]], file: [[META1]], line: 4, type: [[META10]])
-; DEBUG: [[META13]] = !DILocalVariable(name: "4", scope: [[DBG5]], file: [[META1]], line: 6, type: [[META10]])
-; DEBUG: [[META14]] = !DILocalVariable(name: "5", scope: [[DBG5]], file: [[META1]], line: 8, type: [[META10]])
-; DEBUG: [[META15]] = !DILocalVariable(name: "6", scope: [[DBG5]], file: [[META1]], line: 10, type: [[META16:![0-9]+]])
-; DEBUG: [[META16]] = !DIBasicType(name: "ty256", size: 256, encoding: DW_ATE_unsigned)
-; DEBUG: [[META17]] = !DILocation(line: 1, column: 1, scope: [[DBG5]])
-; DEBUG: [[META18]] = !DILocation(line: 2, column: 1, scope: [[DBG5]])
-; DEBUG: [[META19]] = !DILocation(line: 4, column: 1, scope: [[DBG5]])
-; DEBUG: [[META20]] = !DILocation(line: 6, column: 1, scope: [[DBG5]])
-; DEBUG: [[META21]] = !DILocation(line: 8, column: 1, scope: [[DBG5]])
-; DEBUG: [[DBG22]] = !DILocation(line: 9, column: 1, scope: [[DBG5]])
-; DEBUG: [[META23]] = !DILocation(line: 10, column: 1, scope: [[DBG5]])
-; DEBUG: [[DBG24]] = !DILocation(line: 11, column: 1, scope: [[DBG5]])
-; DEBUG: [[DBG25]] = distinct !DISubprogram(name: "multiple_partitions", linkageName: "multiple_partitions", scope: null, file: [[META1]], line: 12, type: [[META6]], scopeLine: 12, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META26:![0-9]+]])
-; DEBUG: [[META26]] = !{[[META27]], [[META28]], [[META29]], [[META30]], [[META31]], [[META32]], [[META34]], [[META35]]}
-; DEBUG: [[META27]] = !DILocalVariable(name: "7", scope: [[DBG25]], file: [[META1]], line: 12, type: [[META10]])
-; DEBUG: [[META28]] = !DILocalVariable(name: "8", scope: [[DBG25]], file: [[META1]], line: 13, type: [[META10]])
-; DEBUG: [[META29]] = !DILocalVariable(name: "9", scope: [[DBG25]], file: [[META1]], line: 15, type: [[META10]])
-; DEBUG: [[META30]] = !DILocalVariable(name: "10", scope: [[DBG25]], file: [[META1]], line: 17, type: [[META10]])
-; DEBUG: [[META31]] = !DILocalVariable(name: "11", scope: [[DBG25]], file: [[META1]], line: 19, type: [[META10]])
-; DEBUG: [[META32]] = !DILocalVariable(name: "12", scope: [[DBG25]], file: [[META1]], line: 21, type: [[META33:![0-9]+]])
-; DEBUG: [[META33]] = !DIBasicType(name: "ty128", size: 128, encoding: DW_ATE_unsigned)
-; DEBUG: [[META34]] = !DILocalVariable(name: "13", scope: [[DBG25]], file: [[META1]], line: 22, type: [[META10]])
-; DEBUG: [[META35]] = !DILocalVariable(name: "14", scope: [[DBG25]], file: [[META1]], line: 23, type: [[META33]])
-; DEBUG: [[META36]] = !DILocation(line: 12, column: 1, scope: [[DBG25]])
-; DEBUG: [[META37]] = !DILocation(line: 13, column: 1, scope: [[DBG25]])
-; DEBUG: [[META38]] = !DILocation(line: 15, column: 1, scope: [[DBG25]])
-; DEBUG: [[DBG39]] = !DILocation(line: 16, column: 1, scope: [[DBG25]])
-; DEBUG: [[META40]] = !DILocation(line: 17, column: 1, scope: [[DBG25]])
-; DEBUG: [[META41]] = !DILocation(line: 19, column: 1, scope: [[DBG25]])
-; DEBUG: [[DBG42]] = !DILocation(line: 20, column: 1, scope: [[DBG25]])
-; DEBUG: [[META43]] = !DILocation(line: 21, column: 1, scope: [[DBG25]])
-; DEBUG: [[META44]] = !DILocation(line: 22, column: 1, scope: [[DBG25]])
-; DEBUG: [[META45]] = !DILocation(line: 23, column: 1, scope: [[DBG25]])
-; DEBUG: [[DBG46]] = !DILocation(line: 24, column: 1, scope: [[DBG25]])
-; DEBUG: [[DBG47]] = !DILocation(line: 25, column: 1, scope: [[DBG25]])
-; DEBUG: [[DBG48]] = !DILocation(line: 26, column: 1, scope: [[DBG25]])
-; DEBUG: [[DBG49]] = distinct !DISubprogram(name: "out_of_order_stores", linkageName: "out_of_order_stores", scope: null, file: [[META1]], line: 27, type: [[META6]], scopeLine: 27, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META50:![0-9]+]])
-; DEBUG: [[META50]] = !{[[META51]], [[META52]], [[META53]], [[META54]], [[META55]], [[META56]]}
-; DEBUG: [[META51]] = !DILocalVariable(name: "15", scope: [[DBG49]], file: [[META1]], line: 27, type: [[META10]])
-; DEBUG: [[META52]] = !DILocalVariable(name: "16", scope: [[DBG49]], file: [[META1]], line: 28, type: [[META10]])
-; DEBUG: [[META53]] = !DILocalVariable(name: "17", scope: [[DBG49]], file: [[META1]], line: 30, type: [[META10]])
-; DEBUG: [[META54]] = !DILocalVariable(name: "18", scope: [[DBG49]], file: [[META1]], line: 32, type: [[META10]])
-; DEBUG: [[META55]] = !DILocalVariable(name: "19", scope: [[DBG49]], file: [[META1]], line: 34, type: [[META10]])
-; DEBUG: [[META56]] = !DILocalVariable(name: "20", scope: [[DBG49]], file: [[META1]], line: 36, type: [[META16]])
-; DEBUG: [[META57]] = !DILocation(line: 27, column: 1, scope: [[DBG49]])
-; DEBUG: [[META58]] = !DILocation(line: 28, column: 1, scope: [[DBG49]])
-; DEBUG: [[META59]] = !DILocation(line: 30, column: 1, scope: [[DBG49]])
-; DEBUG: [[META60]] = !DILocation(line: 32, column: 1, scope: [[DBG49]])
-; DEBUG: [[DBG61]] = !DILocation(line: 33, column: 1, scope: [[DBG49]])
-; DEBUG: [[META62]] = !DILocation(line: 34, column: 1, scope: [[DBG49]])
-; DEBUG: [[META63]] = !DILocation(line: 36, column: 1, scope: [[DBG49]])
-; DEBUG: [[DBG64]] = !DILocation(line: 37, column: 1, scope: [[DBG49]])
-; DEBUG: [[DBG65]] = distinct !DISubprogram(name: "single_element_stores", linkageName: "single_element_stores", scope: null, file: [[META1]], line: 38, type: [[META6]], scopeLine: 38, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META66:![0-9]+]])
-; DEBUG: [[META66]] = !{[[META67]], [[META68]], [[META69]], [[META70]], [[META71]], [[META72]], [[META73]], [[META74]], [[META75]], [[META76]]}
-; DEBUG: [[META67]] = !DILocalVariable(name: "21", scope: [[DBG65]], file: [[META1]], line: 38, type: [[META10]])
-; DEBUG: [[META68]] = !DILocalVariable(name: "22", scope: [[DBG65]], file: [[META1]], line: 39, type: [[META10]])
-; DEBUG: [[META69]] = !DILocalVariable(name: "23", scope: [[DBG65]], file: [[META1]], line: 41, type: [[META10]])
-; DEBUG: [[META70]] = !DILocalVariable(name: "24", scope: [[DBG65]], file: [[META1]], line: 43, type: [[META10]])
-; DEBUG: [[META71]] = !DILocalVariable(name: "25", scope: [[DBG65]], file: [[META1]], line: 45, type: [[META10]])
-; DEBUG: [[META72]] = !DILocalVariable(name: "26", scope: [[DBG65]], file: [[META1]], line: 47, type: [[META10]])
-; DEBUG: [[META73]] = !DILocalVariable(name: "27", scope: [[DBG65]], file: [[META1]], line: 49, type: [[META10]])
-; DEBUG: [[META74]] = !DILocalVariable(name: "28", scope: [[DBG65]], file: [[META1]], line: 51, type: [[META10]])
-; DEBUG: [[META75]] = !DILocalVariable(name: "29", scope: [[DBG65]], file: [[META1]], line: 53, type: [[META10]])
-; DEBUG: [[META76]] = !DILocalVariable(name: "30", scope: [[DBG65]], file: [[META1]], line: 55, type: [[META33]])
-; DEBUG: [[META77]] = !DILocation(line: 38, column: 1, scope: [[DBG65]])
-; DEBUG: [[META78]] = !DILocation(line: 39, column: 1, scope: [[DBG65]])
-; DEBUG: [[META79]] = !DILocation(line: 41, column: 1, scope: [[DBG65]])
-; DEBUG: [[META80]] = !DILocation(line: 43, column: 1, scope: [[DBG65]])
-; DEBUG: [[META81]] = !DILocation(line: 45, column: 1, scope: [[DBG65]])
-; DEBUG: [[META82]] = !DILocation(line: 47, column: 1, scope: [[DBG65]])
-; DEBUG: [[META83]] = !DILocation(line: 49, column: 1, scope: [[DBG65]])
-; DEBUG: [[META84]] = !DILocation(line: 51, column: 1, scope: [[DBG65]])
-; DEBUG: [[META85]] = !DILocation(line: 53, column: 1, scope: [[DBG65]])
-; DEBUG: [[DBG86]] = !DILocation(line: 54, column: 1, scope: [[DBG65]])
-; DEBUG: [[META87]] = !DILocation(line: 55, column: 1, scope: [[DBG65]])
-; DEBUG: [[DBG88]] = !DILocation(line: 56, column: 1, scope: [[DBG65]])
-; DEBUG: [[DBG89]] = distinct !DISubprogram(name: "non_power_of_2", linkageName: "non_power_of_2", scope: null, file: [[META1]], line: 57, type: [[META6]], scopeLine: 57, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META90:![0-9]+]])
-; DEBUG: [[META90]] = !{[[META91]], [[META92]], [[META93]], [[META94]], [[META95]]}
-; DEBUG: [[META91]] = !DILocalVariable(name: "31", scope: [[DBG89]], file: [[META1]], line: 57, type: [[META10]])
-; DEBUG: [[META92]] = !DILocalVariable(name: "32", scope: [[DBG89]], file: [[META1]], line: 58, type: [[META10]])
-; DEBUG: [[META93]] = !DILocalVariable(name: "33", scope: [[DBG89]], file: [[META1]], line: 60, type: [[META10]])
-; DEBUG: [[META94]] = !DILocalVariable(name: "34", scope: [[DBG89]], file: [[META1]], line: 62, type: [[META10]])
-; DEBUG: [[META95]] = !DILocalVariable(name: "35", scope: [[DBG89]], file: [[META1]], line: 64, type: [[META16]])
-; DEBUG: [[META96]] = !DILocation(line: 57, column: 1, scope: [[DBG89]])
-; DEBUG: [[META97]] = !DILocation(line: 58, column: 1, scope: [[DBG89]])
-; DEBUG: [[META98]] = !DILocation(line: 60, column: 1, scope: [[DBG89]])
-; DEBUG: [[META99]] = !DILocation(line: 62, column: 1, scope: [[DBG89]])
-; DEBUG: [[DBG100]] = !DILocation(line: 63, column: 1, scope: [[DBG89]])
-; DEBUG: [[META101]] = !DILocation(line: 64, column: 1, scope: [[DBG89]])
-; DEBUG: [[DBG102]] = !DILocation(line: 65, column: 1, scope: [[DBG89]])
-; DEBUG: [[DBG103]] = distinct !DISubprogram(name: "store_with_different_size_of_vectors", linkageName: "store_with_different_size_of_vectors", scope: null, file: [[META1]], line: 66, type: [[META6]], scopeLine: 66, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META104:![0-9]+]])
-; DEBUG: [[META104]] = !{[[META105]], [[META106]], [[META107]], [[META108]], [[META109]]}
-; DEBUG: [[META105]] = !DILocalVariable(name: "36", scope: [[DBG103]], file: [[META1]], line: 66, type: [[META10]])
-; DEBUG: [[META106]] = !DILocalVariable(name: "37", scope: [[DBG103]], file: [[META1]], line: 67, type: [[META10]])
-; DEBUG: [[META107]] = !DILocalVariable(name: "38", scope: [[DBG103]], file: [[META1]], line: 69, type: [[META10]])
-; DEBUG: [[META108]] = !DILocalVariable(name: "39", scope: [[DBG103]], file: [[META1]], line: 71, type: [[META10]])
-; DEBUG: [[META109]] = !DILocalVariable(name: "40", scope: [[DBG103]], file: [[META1]], line: 73, type: [[META16]])
-; DEBUG: [[META110]] = !DILocation(line: 66, column: 1, scope: [[DBG103]])
-; DEBUG: [[META111]] = !DILocation(line: 67, column: 1, scope: [[DBG103]])
-; DEBUG: [[META112]] = !DILocation(line: 69, column: 1, scope: [[DBG103]])
-; DEBUG: [[META113]] = !DILocation(line: 71, column: 1, scope: [[DBG103]])
-; DEBUG: [[DBG114]] = !DILocation(line: 72, column: 1, scope: [[DBG103]])
-; DEBUG: [[META115]] = !DILocation(line: 73, column: 1, scope: [[DBG103]])
-; DEBUG: [[DBG116]] = !DILocation(line: 74, column: 1, scope: [[DBG103]])
-; DEBUG: [[DBG117]] = distinct !DISubprogram(name: "load_store_different_element_type", linkageName: "load_store_different_element_type", scope: null, file: [[META1]], line: 75, type: [[META6]], scopeLine: 75, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META118:![0-9]+]])
-; DEBUG: [[META118]] = !{[[META119]], [[META120]], [[META121]], [[META122]], [[META123]], [[META124]]}
-; DEBUG: [[META119]] = !DILocalVariable(name: "41", scope: [[DBG117]], file: [[META1]], line: 75, type: [[META10]])
-; DEBUG: [[META120]] = !DILocalVariable(name: "42", scope: [[DBG117]], file: [[META1]], line: 76, type: [[META10]])
-; DEBUG: [[META121]] = !DILocalVariable(name: "43", scope: [[DBG117]], file: [[META1]], line: 78, type: [[META10]])
-; DEBUG: [[META122]] = !DILocalVariable(name: "44", scope: [[DBG117]], file: [[META1]], line: 80, type: [[META10]])
-; DEBUG: [[META123]] = !DILocalVariable(name: "45", scope: [[DBG117]], file: [[META1]], line: 82, type: [[META10]])
-; DEBUG: [[META124]] = !DILocalVariable(name: "46", scope: [[DBG117]], file: [[META1]], line: 84, type: [[META16]])
-; DEBUG: [[META125]] = !DILocation(line: 75, column: 1, scope: [[DBG117]])
-; DEBUG: [[META126]] = !DILocation(line: 76, column: 1, scope: [[DBG117]])
-; DEBUG: [[META127]] = !DILocation(line: 78, column: 1, scope: [[DBG117]])
-; DEBUG: [[META128]] = !DILocation(line: 80, column: 1, scope: [[DBG117]])
-; DEBUG: [[META129]] = !DILocation(line: 82, column: 1, scope: [[DBG117]])
-; DEBUG: [[DBG130]] = !DILocation(line: 83, column: 1, scope: [[DBG117]])
-; DEBUG: [[META131]] = !DILocation(line: 84, column: 1, scope: [[DBG117]])
-; DEBUG: [[DBG132]] = !DILocation(line: 85, column: 1, scope: [[DBG117]])
-;.
+define <8 x float> @bitcast_needed(<2 x i32> %a, <2 x i16> %b, <12 x i8> %c, <1 x i64> %d) {
+; CHECK-LABEL: define <8 x float> @bitcast_needed(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i16> [[B:%.*]], <12 x i8> [[C:%.*]], <1 x i64> [[D:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <2 x float>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[B]] to <1 x float>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <1 x float> [[TMP1]], <1 x float> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> [[TMP2]], <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <12 x i8> [[C]] to <3 x float>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[D]] to <2 x float>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <3 x float> [[TMP5]], <3 x float> [[TMP9]], <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <5 x float> [[TMP8]], <5 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 5, i32 6, i32 7, i32 8, i32 9>
+; CHECK-NEXT:    ret <8 x float> [[TMP6]]
+;
+entry:
+  %alloca = alloca [8 x float]
+
+  %ptr0 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 0
+  store <2 x i32> %a, ptr %ptr0
+
+  %ptr1 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 2
+  store <2 x i16> %b, ptr %ptr1
+
+  %ptr2 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 3
+  store <12 x i8> %c, ptr %ptr2
+
+  %ptr3 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 6
+  store <1 x i64> %d, ptr %ptr3
+
+  %result = load <8 x float>, ptr %alloca
+  ret <8 x float> %result
+}
+
+define <8 x float> @load_in_different_blocks(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d, i1 %cond) {
+; CHECK-LABEL: define <8 x float> @load_in_different_blocks(
+; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x float> [[D:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[C]], <2 x float> [[D]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    br i1 [[COND]], label %[[TRUEBRANCH:.*]], label %[[FALSEBRANCH:.*]]
+; CHECK:       [[TRUEBRANCH]]:
+; CHECK-NEXT:    br label %[[FALSEBRANCH]]
+; CHECK:       [[FALSEBRANCH]]:
+; CHECK-NEXT:    [[RESULT:%.*]] = phi <8 x float> [ poison, %[[ENTRY]] ], [ [[TMP2]], %[[TRUEBRANCH]] ]
+; CHECK-NEXT:    ret <8 x float> [[RESULT]]
+;
+entry:
+  %alloca = alloca [8 x float]
+
+  %ptr0 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 0
+  store <2 x float> %a, ptr %ptr0
+
+  %ptr1 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 2
+  store <2 x float> %b, ptr %ptr1
+
+  %ptr2 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 4
+  store <2 x float> %c, ptr %ptr2
+
+  %ptr3 = getelementptr inbounds [8 x float], ptr %alloca, i32 0, i32 6
+  store <2 x float> %d, ptr %ptr3
+
+  br i1 %cond, label %TrueBranch, label %FalseBranch
+
+TrueBranch:
+  %load1 = load <8 x float>, ptr %alloca
+  br label %FalseBranch
+
+FalseBranch:
+  %result = phi <8 x float> [ poison, %entry ], [ %load1, %TrueBranch ]
+  ret <8 x float> %result
+}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; CHECK-MODIFY-CFG: {{.*}}
 ; CHECK-PRESERVE-CFG: {{.*}}

>From a3c0c0628c721459582cf04906e8fb06969df585 Mon Sep 17 00:00:00 2001
From: chengjunp <chengjunp at nividia.com>
Date: Wed, 27 Aug 2025 20:20:18 +0000
Subject: [PATCH 7/7] Fix nits

---
 llvm/lib/Transforms/Scalar/SROA.cpp | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index aeea2d31c7a4e..c76510480a070 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -3035,6 +3035,12 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
 
     // Stores should be in the same basic block
     // The load should not be in the middle of the stores
+    // Note:
+    // If the load is in a different basic block from the stores, we can still
+    // do the tree-structured merge. This is because we do not perform
+    // store->load forwarding here. The merged vector will be stored back to
+    // NewAI and the new load will load from NewAI. The forwarding will be
+    // handled later when we try to promote NewAI.
     BasicBlock *LoadBB = TheLoad->getParent();
     BasicBlock *StoreBB = StoreInfos[0].Store->getParent();
 
@@ -3067,8 +3073,8 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
 
     LLVM_DEBUG(dbgs() << "  Rewrite stores into shufflevectors:\n");
     while (VecElements.size() > 1) {
-      uint64_t NumElts = VecElements.size();
-      for (uint64_t i = 0; i < NumElts / 2; i++) {
+      const auto NumElts = VecElements.size();
+      for ([[maybe_unused]] const auto _ : llvm::seq(NumElts / 2)) {
         Value *V0 = VecElements.front();
         VecElements.pop();
         Value *V1 = VecElements.front();
@@ -5268,9 +5274,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
                                PHIUsers, SelectUsers);
   bool Promotable = true;
   // Check whether we can have tree-structured merge.
-  std::optional<SmallVector<Value *, 4>> DeletedValues =
-      Rewriter.rewriteTreeStructuredMerge(P);
-  if (DeletedValues) {
+  if (auto DeletedValues = Rewriter.rewriteTreeStructuredMerge(P)) {
     NumUses += DeletedValues->size() + 1;
     for (Value *V : *DeletedValues)
       DeadInsts.push_back(V);



More information about the llvm-commits mailing list