[llvm] [LSV] Merge contiguous chains across scalar types (PR #154069)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 18 00:38:44 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: Anshil Gandhi (gandhi56)
<details>
<summary>Changes</summary>
This change enables the Load/Store Vectorizer to merge and vectorize contiguous chains even when their scalar element types differ, as long as the elements' total bit widths match. To do so, we rebase offsets between chains, normalize value types to a common integer type, and insert the necessary casts around loads and stores. This uncovers more vectorization opportunities and accounts for the expected codegen updates across the AMDGPU tests.
Key changes:
- Chain merging
  - Build contiguous subchains, then merge adjacent ones when:
    - they refer to the same underlying pointer object and address space;
    - they are either all loads or all stores;
    - a constant leader-to-leader delta exists;
    - rebasing one chain into the other's coordinate space does not create overlapping accesses; and
    - all elements have the same total bit width.
  - Rebase the second chain by the computed delta and append it to the first.
- Type normalization and casting (see the IR sketch after this list)
  - Normalize each merged chain to a common integer type sized to the total bits.
  - For loads: create a new load of the normalized type, copy metadata, and cast back to the original type for uses if needed.
  - For stores: bitcast the value to the normalized type and store that.
  - Insert zext/trunc for integer size changes; use bit-or-pointer casts when sizes match.
- Cleanups
  - Erase replaced instructions and DCE pointer operands when safe.
  - New helpers: computeLeaderDelta, chainsOverlapAfterRebase, rebaseChain, normalizeChainToType, and allElemsMatchTotalBits.
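
To make the merge and casting rules concrete, here is a hypothetical before/after IR sketch of the load path. It is illustrative only: the function name, pointer names, and alignments are made up and not taken from the patch or its tests, and the real pass may clean up the intermediate casts differently.

```llvm
; Before: adjacent 32-bit loads whose scalar element types differ
; (i32 vs. <2 x i16>), so they previously landed in separate chains.
define i32 @mixed_scalar_loads(ptr addrspace(1) %p) {
  %p1  = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4
  %a   = load i32, ptr addrspace(1) %p, align 8
  %b   = load <2 x i16>, ptr addrspace(1) %p1, align 4
  %b0  = extractelement <2 x i16> %b, i32 0
  %b32 = zext i16 %b0 to i32
  %r   = add i32 %a, %b32
  ret i32 %r
}

; After (roughly): both elements are normalized to i32, the chains are merged,
; and a single <2 x i32> load is emitted; the <2 x i16> value is rebuilt from
; its lane with a bitcast.
define i32 @mixed_scalar_loads(ptr addrspace(1) %p) {
  %v     = load <2 x i32>, ptr addrspace(1) %p, align 8
  %a     = extractelement <2 x i32> %v, i32 0
  %b.int = extractelement <2 x i32> %v, i32 1
  %b     = bitcast i32 %b.int to <2 x i16>
  %b0    = extractelement <2 x i16> %b, i32 0
  %b32   = zext i16 %b0 to i32
  %r     = add i32 %a, %b32
  ret i32 %r
}
```

The two accesses used to sit in different equivalence classes because the class key includes the scalar element bit width (32 for i32, 16 for <2 x i16>); merging the resulting chains is what the leader delta, overlap, and total-bit-width checks above make safe.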
Impact:
- Increases vectorization opportunities across mixed-type but size-compatible access chains.
- A large set of expected AMDGPU codegen diffs due to more (and different) vectorization.
---
Patch is 772.03 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154069.diff
65 Files Affected:
- (modified) llvm/include/llvm/Transforms/Utils/Local.h (+4)
- (modified) llvm/lib/Transforms/Utils/Local.cpp (+45)
- (modified) llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp (+221-62)
- (modified) llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll (+8-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll (+144-192)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll (+36-49)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll (+8-12)
- (modified) llvm/test/CodeGen/AMDGPU/add_i1.ll (+68-10)
- (modified) llvm/test/CodeGen/AMDGPU/add_i64.ll (+105-28)
- (modified) llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll (+121-121)
- (modified) llvm/test/CodeGen/AMDGPU/alloca.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll (+364-84)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll (+5)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll (+760-727)
- (modified) llvm/test/CodeGen/AMDGPU/and.ll (+22-35)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll (+175-68)
- (modified) llvm/test/CodeGen/AMDGPU/branch-relaxation.ll (+22-25)
- (modified) llvm/test/CodeGen/AMDGPU/build_vector.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll (+19-17)
- (modified) llvm/test/CodeGen/AMDGPU/fabs.bf16.ll (+27-76)
- (modified) llvm/test/CodeGen/AMDGPU/fabs.f16.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/fabs.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll (+53-51)
- (modified) llvm/test/CodeGen/AMDGPU/fdiv.ll (+171-162)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics.ll (+602-706)
- (modified) llvm/test/CodeGen/AMDGPU/fnearbyint.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll (+26-28)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll (+54-54)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll (+13-13)
- (modified) llvm/test/CodeGen/AMDGPU/fneg.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll (+168-192)
- (modified) llvm/test/CodeGen/AMDGPU/fshl.ll (+92-100)
- (modified) llvm/test/CodeGen/AMDGPU/fshr.ll (+89-106)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics.ll (+633-718)
- (modified) llvm/test/CodeGen/AMDGPU/half.ll (+65-109)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll (+27-27)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll (+10-12)
- (modified) llvm/test/CodeGen/AMDGPU/kernel-args.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll (+189-86)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll (+36-20)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp.ll (+43-43)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp10.ll (+43-43)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp2.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log.ll (+38-38)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log10.ll (+38-38)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log2.ll (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/mad_64_32.ll (+28-28)
- (modified) llvm/test/CodeGen/AMDGPU/min.ll (+96-96)
- (modified) llvm/test/CodeGen/AMDGPU/packed-fp32.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/rotl.ll (+35-33)
- (modified) llvm/test/CodeGen/AMDGPU/rotr.ll (+27-25)
- (modified) llvm/test/CodeGen/AMDGPU/s_addk_i32.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll (+23-21)
- (modified) llvm/test/CodeGen/AMDGPU/shl.ll (+264-236)
- (modified) llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll (+32-32)
- (modified) llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll (+130-129)
- (modified) llvm/test/CodeGen/AMDGPU/udivrem.ll (+65-61)
- (modified) llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll (+11-10)
- (modified) llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll (+24-15)
- (modified) llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll (+21-3)
``````````diff
diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
index df146458b4e6f..ad29d69900235 100644
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -448,6 +448,10 @@ LLVM_ABI void combineAAMetadata(Instruction *K, const Instruction *J);
/// replacement for the source instruction).
LLVM_ABI void copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source);
+/// Copy the metadata from the source instruction to the destination (the
+/// replacement for the source instruction).
+LLVM_ABI void copyMetadataForStore(StoreInst &Dest, const StoreInst &Source);
+
/// Patch the replacement so that it is not more restrictive than the value
/// being replaced. It assumes that the replacement does not get moved from
/// its original position.
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index f5208d50c6aae..ecdc920d442dc 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3496,6 +3496,51 @@ void llvm::copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source) {
}
}
+void llvm::copyMetadataForStore(StoreInst &Dest, const StoreInst &Source) {
+ SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
+ Source.getAllMetadata(MD);
+ MDBuilder MDB(Dest.getContext());
+ Type *NewType = Dest.getType();
+ for (const auto &MDPair : MD) {
+ unsigned ID = MDPair.first;
+ MDNode *N = MDPair.second;
+ switch (ID) {
+ case LLVMContext::MD_dbg:
+ case LLVMContext::MD_prof:
+ case LLVMContext::MD_tbaa_struct:
+ case LLVMContext::MD_alias_scope:
+ case LLVMContext::MD_noalias:
+ case LLVMContext::MD_nontemporal:
+ case LLVMContext::MD_access_group:
+ case LLVMContext::MD_noundef:
+ case LLVMContext::MD_noalias_addrspace:
+ case LLVMContext::MD_mem_parallel_loop_access:
+ Dest.setMetadata(ID, N);
+ break;
+
+ case LLVMContext::MD_tbaa: {
+ MDNode *NewTyNode =
+ MDB.createTBAAScalarTypeNode(NewType->getStructName(), N);
+ Dest.setMetadata(LLVMContext::MD_tbaa, NewTyNode);
+ break;
+ }
+ case LLVMContext::MD_nonnull:
+ break;
+
+ case LLVMContext::MD_align:
+ case LLVMContext::MD_dereferenceable:
+ case LLVMContext::MD_dereferenceable_or_null:
+ // These only directly apply if the new type is also a pointer.
+ if (NewType->isPointerTy())
+ Dest.setMetadata(ID, N);
+ break;
+
+ case LLVMContext::MD_range:
+ break;
+ }
+ }
+}
+
void llvm::patchReplacementInstruction(Instruction *I, Value *Repl) {
auto *ReplInst = dyn_cast<Instruction>(Repl);
if (!ReplInst)
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 89f63c3b66aad..12ac5c891aefa 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -112,6 +112,7 @@
#include <optional>
#include <tuple>
#include <type_traits>
+#include <unordered_map>
#include <utility>
#include <vector>
@@ -268,11 +269,6 @@ class Vectorizer {
/// isGuaranteedToTransferExecutionToSuccessor(I) == true.
bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
- /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
- /// in the same BB with the same value for getUnderlyingObject() etc.
- bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
- ArrayRef<Instruction *> EqClass);
-
/// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
/// where all instructions access a known, constant offset from the first
/// instruction.
@@ -337,12 +333,24 @@ class Vectorizer {
EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
BasicBlock::iterator End);
+ /// Inserts a cast instruction to convert Inst to DstTy.
+ Value *insertCast(Value *Val, Type *DstTy);
+
/// Partitions Instrs into "chains" where every instruction has a known
/// constant offset from the first instr in the chain.
///
/// Postcondition: For all i, ret[i][0].second == 0, because the first instr
/// in the chain is the leader, and an instr touches distance 0 from itself.
std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
+
+ // Helpers for chain merging.
+ std::optional<APInt> computeLeaderDelta(Instruction *I1, Instruction *I2);
+ bool chainsOverlapAfterRebase(const Chain &A, const Chain &B,
+ const APInt &Delta) const;
+ static bool allElemsMatchTotalBits(const Chain &C, unsigned TotalBits,
+ const DataLayout &DL);
+ static void rebaseChain(Chain &C, const APInt &Delta);
+ void normalizeChainToType(Chain &C, Type *CastTy);
};
class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -424,6 +432,20 @@ PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
return Changed ? PA : PreservedAnalyses::all();
}
+static const Value *getUnderlyingPtrObject(const Value *Ptr) {
+ const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
+ if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
+ // The select's themselves are distinct instructions even if they share
+ // the same condition and evaluate to consecutive pointers for true and
+ // false values of the condition. Therefore using the select's themselves
+ // for grouping instructions would put consecutive accesses into different
+ // lists and they won't be even checked for being consecutive, and won't
+ // be vectorized.
+ return Sel->getCondition();
+ }
+ return ObjPtr;
+}
+
bool Vectorizer::run() {
bool Changed = false;
// Break up the BB if there are any instrs which aren't guaranteed to transfer
@@ -467,6 +489,98 @@ bool Vectorizer::run() {
return Changed;
}
+Value *Vectorizer::insertCast(Value *Val, Type *DstTy) {
+ if (DL.getTypeSizeInBits(Val->getType()) == DL.getTypeSizeInBits(DstTy)) {
+ return Builder.CreateBitOrPointerCast(Val, DstTy, Val->getName() + ".bc");
+ }
+
+ // If the types are of different sizes and both are integers, we can use
+ // zext or sext to cast.
+ if (Val->getType()->isIntegerTy() && DstTy->isIntegerTy()) {
+ if (DL.getTypeSizeInBits(Val->getType()) < DL.getTypeSizeInBits(DstTy)) {
+ return Builder.CreateZExt(Val, DstTy, Val->getName() + ".bc");
+ }
+ return Builder.CreateTrunc(Val, DstTy, Val->getName() + ".bc");
+ }
+
+ return nullptr;
+}
+
+std::optional<APInt> Vectorizer::computeLeaderDelta(Instruction *I1,
+ Instruction *I2) {
+ assert((isa<LoadInst>(I1) || isa<StoreInst>(I1)) &&
+ (isa<LoadInst>(I2) || isa<StoreInst>(I2)) &&
+ "computeLeaderDelta must be called with load or store instructions");
+ Instruction *CtxInst = I1->comesBefore(I2) ? I2 : I1;
+ const Value *Ptr1 = getLoadStorePointerOperand(I1);
+ const Value *Ptr2 = getLoadStorePointerOperand(I2);
+ return getConstantOffset(const_cast<Value *>(Ptr1), const_cast<Value *>(Ptr2),
+ CtxInst);
+}
+
+bool Vectorizer::chainsOverlapAfterRebase(const Chain &A, const Chain &B,
+ const APInt &Delta) const {
+ for (const ChainElem &EB : B) {
+ APInt OffB = EB.OffsetFromLeader + Delta;
+ unsigned SizeB = DL.getTypeStoreSize(getLoadStoreType(EB.Inst));
+ APInt EndB = OffB + SizeB;
+ for (const ChainElem &EA : A) {
+ APInt OffA = EA.OffsetFromLeader;
+ unsigned SizeA = DL.getTypeStoreSize(getLoadStoreType(EA.Inst));
+ APInt EndA = OffA + SizeA;
+ if (OffB.slt(EndA) && OffA.slt(EndB))
+ return true;
+ }
+ }
+ return false;
+}
+
+bool Vectorizer::allElemsMatchTotalBits(const Chain &C, unsigned TotalBits,
+ const DataLayout &DL) {
+ return llvm::all_of(C, [&](const ChainElem &E) {
+ return DL.getTypeSizeInBits(getLoadStoreType(E.Inst)) == TotalBits;
+ });
+}
+
+void Vectorizer::rebaseChain(Chain &C, const APInt &Delta) {
+ for (ChainElem &E : C)
+ E.OffsetFromLeader += Delta;
+}
+
+void Vectorizer::normalizeChainToType(Chain &C, Type *CastTy) {
+ for (ChainElem &Elem : C) {
+ Instruction *Inst = Elem.Inst;
+ Type *OrigValTy = getLoadStoreType(Inst);
+ if (OrigValTy == CastTy)
+ continue;
+
+ if (auto *LI = dyn_cast<LoadInst>(Inst)) {
+ Builder.SetInsertPoint(LI);
+ LoadInst *NewLoad = Builder.CreateLoad(CastTy, LI->getPointerOperand(),
+ LI->getName() + ".mut");
+ copyMetadataForLoad(*NewLoad, *LI);
+ Value *CastBack = insertCast(NewLoad, OrigValTy);
+ if (!CastBack)
+ llvm_unreachable("Failed to insert cast");
+ LI->replaceAllUsesWith(CastBack);
+ ToErase.emplace_back(LI);
+ Elem.Inst = NewLoad;
+ } else if (auto *SI = dyn_cast<StoreInst>(Inst)) {
+ Builder.SetInsertPoint(SI);
+ Value *CastVal = insertCast(SI->getValueOperand(), CastTy);
+ if (!CastVal)
+ llvm_unreachable("Failed to insert cast");
+ StoreInst *NewStore =
+ Builder.CreateStore(CastVal, SI->getPointerOperand());
+ NewStore->setAlignment(SI->getAlign());
+ NewStore->setVolatile(SI->isVolatile());
+ copyMetadataForStore(*NewStore, *SI);
+ ToErase.emplace_back(SI);
+ Elem.Inst = NewStore;
+ }
+ }
+}
+
bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
BasicBlock::iterator End) {
LLVM_DEBUG({
@@ -479,49 +593,111 @@ bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
});
bool Changed = false;
+ SmallVector<Chain> ContiguousSubChains;
+
for (const auto &[EqClassKey, EqClass] :
- collectEquivalenceClasses(Begin, End))
- Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
+ collectEquivalenceClasses(Begin, End)) {
- return Changed;
-}
+ LLVM_DEBUG({
+ dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
+ << " keyed on " << EqClassKey << ":\n";
+ for (Instruction *I : EqClass)
+ dbgs() << " " << *I << "\n";
+ });
-bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
- ArrayRef<Instruction *> EqClass) {
- bool Changed = false;
+ for (Chain &C : gatherChains(EqClass)) {
- LLVM_DEBUG({
- dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
- << " keyed on " << EqClassKey << ":\n";
- for (Instruction *I : EqClass)
- dbgs() << " " << *I << "\n";
- });
+ // Split up the chain into increasingly smaller chains, until we can
+ // finally vectorize the chains.
+ //
+ // (Don't be scared by the depth of the loop nest here. These operations
+ // are all at worst O(n lg n) in the number of instructions, and splitting
+ // chains doesn't change the number of instrs. So the whole loop nest is
+ // O(n lg n).)
+ for (auto &C : splitChainByMayAliasInstrs(C)) {
+ for (auto &C : splitChainByContiguity(C)) {
+ ContiguousSubChains.emplace_back(C);
+ }
+ }
+ }
+ }
- std::vector<Chain> Chains = gatherChains(EqClass);
- LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
- << " nontrivial chains.\n";);
- for (Chain &C : Chains)
- Changed |= runOnChain(C);
- return Changed;
-}
+ // Merge chains in reverse order, so that the first chain is the largest.
+ for (int I = ContiguousSubChains.size() - 1; I > 0; I--) {
+ Chain &C1 = ContiguousSubChains[I - 1];
+ Chain &C2 = ContiguousSubChains[I];
-bool Vectorizer::runOnChain(Chain &C) {
- LLVM_DEBUG({
- dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
- dumpChain(C);
- });
+ // If the scalar types of the chains are the same, we can merge them
+ // without inserting any casts.
+ if (getLoadStoreType(C1[0].Inst)->getScalarType() ==
+ getLoadStoreType(C2[0].Inst)->getScalarType())
+ continue;
+
+ const Value *C1Ptr = getLoadStorePointerOperand(C1[0].Inst);
+ const Value *C2Ptr = getLoadStorePointerOperand(C2[0].Inst);
+ unsigned AS1 = C1Ptr->getType()->getPointerAddressSpace();
+ unsigned AS2 = C2Ptr->getType()->getPointerAddressSpace();
+ bool C1IsLoad = isa<LoadInst>(C1[0].Inst);
+ bool C2IsLoad = isa<LoadInst>(C2[0].Inst);
+
+ // If the chains are mapped to different types, have distinct underlying
+ // pointer objects, or include both loads and stores, skip.
+ if (getUnderlyingPtrObject(C1Ptr) != getUnderlyingPtrObject(C2Ptr) ||
+ C1IsLoad != C2IsLoad || AS1 != AS2)
+ continue;
+
+ // Compute constant offset between chain leaders; if unknown, skip.
+ std::optional<APInt> DeltaOpt = computeLeaderDelta(C1[0].Inst, C2[0].Inst);
+ if (!DeltaOpt)
+ continue;
+
+ // Check that rebasing C2 into C1's coordinate space will not overlap C1.
+ if (chainsOverlapAfterRebase(C1, C2, *DeltaOpt))
+ continue;
+
+ // Determine the common integer cast type for normalization and ensure total
+ // bitwidth matches across all elements of both chains.
+ Type *C1ElemTy = getLoadStoreType(C1[0].Inst);
+ unsigned TotalBits = DL.getTypeSizeInBits(C1ElemTy);
+ auto AllElemsMatchTotalBits = [&](const Chain &C) {
+ return llvm::all_of(C, [&](const ChainElem &E) {
+ return DL.getTypeSizeInBits(getLoadStoreType(E.Inst)) == TotalBits;
+ });
+ };
+ if (!AllElemsMatchTotalBits(C1) || !AllElemsMatchTotalBits(C2))
+ continue;
+
+ // Rebase C2's offsets into C1's coordinate space prior to merging.
+ rebaseChain(C2, *DeltaOpt);
+
+ // Merge C2 into C1 by appending all elements of C2 to C1, then erase C2
+ // from ContiguousSubChains.
+ C1.insert(C1.end(), C2.begin(), C2.end());
+ ContiguousSubChains.erase(ContiguousSubChains.begin() + I);
+
+ // Normalize the value operand/result type of each instruction in C1 to
+ // C1CastTy.
+ Type *C1CastTy =
+ Type::getIntNTy(C1ElemTy->getContext(), DL.getTypeSizeInBits(C1ElemTy));
+ normalizeChainToType(C1, C1CastTy);
+ }
+
+ for (auto &C : ContiguousSubChains) {
+ if (C.size() <= 1)
+ continue;
+ for (auto &AlignedSubChain : splitChainByAlignment(C))
+ Changed |= vectorizeChain(AlignedSubChain);
+ }
+
+ // Erase all instructions scheduled for deletion in this pseudo-BB.
+ for (Instruction *I : ToErase) {
+ auto *PtrOperand = getLoadStorePointerOperand(I);
+ if (I->use_empty())
+ I->eraseFromParent();
+ RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
+ }
+ ToErase.clear();
- // Split up the chain into increasingly smaller chains, until we can finally
- // vectorize the chains.
- //
- // (Don't be scared by the depth of the loop nest here. These operations are
- // all at worst O(n lg n) in the number of instructions, and splitting chains
- // doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
- bool Changed = false;
- for (auto &C : splitChainByMayAliasInstrs(C))
- for (auto &C : splitChainByContiguity(C))
- for (auto &C : splitChainByAlignment(C))
- Changed |= vectorizeChain(C);
return Changed;
}
@@ -578,7 +754,7 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
LLVM_DEBUG(
dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
<< *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
- if (NewChain.size() > 1) {
+ if (!NewChain.empty()) {
LLVM_DEBUG({
dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
dumpChain(NewChain);
@@ -590,7 +766,7 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
NewChain = SmallVector<ChainElem, 1>({*ChainIt});
}
}
- if (NewChain.size() > 1) {
+ if (!NewChain.empty()) {
LLVM_DEBUG({
dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
dumpChain(NewChain);
@@ -643,8 +819,6 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
Ret.push_back({*It});
}
- // Filter out length-1 chains, these are uninteresting.
- llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
return Ret;
}
@@ -1427,20 +1601,6 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
BasicBlock::iterator End) {
EquivalenceClassMap Ret;
- auto GetUnderlyingObject = [](const Value *Ptr) -> const Value * {
- const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
- if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
- // The select's themselves are distinct instructions even if they share
- // the same condition and evaluate to consecutive pointers for true and
- // false values of the condition. Therefore using the select's themselves
- // for grouping instructions would put consecutive accesses into different
- // lists and they won't be even checked for being consecutive, and won't
- // be vectorized.
- return Sel->getCondition();
- }
- return ObjPtr;
- };
-
for (Instruction &I : make_range(Begin, End)) {
auto *LI = dyn_cast<LoadInst>(&I);
auto *SI = dyn_cast<StoreInst>(&I);
@@ -1488,7 +1648,7 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
(VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
continue;
- Ret[{GetUnderlyingObject(Ptr), AS,
+ Ret[{getUnderlyingPtrObject(Ptr), AS,
DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
/*IsLoad=*/LI != nullptr}]
.emplace_back(&I);
@@ -1583,8 +1743,7 @@ std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
Ret.reserve(Chains.size());
// Iterate over MRU rather than Chains so the order is deterministic.
for (auto &E : MRU)
- if (E.second.size() > 1)
- Ret.emplace_back(std::move(E.second));
+ Ret.emplace_back(std::move(E.second));
return Ret;
}
diff --git a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
index 840165d5a7e7a..fc1b636f525fd 100644
--- a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
@@ -298,26 +298,24 @@ define amdgpu_kernel void @local_address_store(ptr addrspace(3) %out, i32 %val)
define amdgpu_kernel void @local_address_gep_store(ptr addrspace(3) %out, i32, i32 %val, i32 %offset) {
; GFX7-LABEL: local_address_gep_store:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
-; GFX7-NEXT: s_load_dword s2, s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_lshl_b32 s1, s1, 2
-; GFX7-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-NEXT: s_add_i32 s0, s2, s1
+; GFX7-NEXT: s_lshl_b32 s2, s2, 2
+; GFX7-NEXT: s_add_i32 s0, s0, s2
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: ds_write_b32 v1, v0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: local_address_gep_store:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
-; GFX8-NEXT: s_load_dword s2, s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshl_b32 s1, s1, 2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_add_i32 s0, s2, s1
+; GFX8-NEXT: s_lshl_b32 s2, s2, 2
+; GFX8-NEXT: s_add_i32 s0, s0, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: ds_write_b32 v1, v0
; GFX8-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
index 523d51ddcd2bc..a9271794bc03b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -m...
[truncated]
``````````
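
As a companion to the diff above, here is a hypothetical sketch of the store path that normalizeChainToType enables. It is illustrative only: the function name, argument names, and alignments are invented, and the final IR shape still depends on the usual chain splitting and alignment checks.

```llvm
; Before: adjacent 32-bit stores with different scalar element types.
define void @mixed_scalar_stores(ptr addrspace(1) %p, i32 %a, <2 x i16> %b) {
  %p1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4
  store i32 %a, ptr addrspace(1) %p, align 8
  store <2 x i16> %b, ptr addrspace(1) %p1, align 4
  ret void
}

; After (roughly): the <2 x i16> value is bitcast to the normalized i32 type,
; the two stores are merged, and a single <2 x i32> store is emitted.
define void @mixed_scalar_stores(ptr addrspace(1) %p, i32 %a, <2 x i16> %b) {
  %b.i = bitcast <2 x i16> %b to i32
  %v0  = insertelement <2 x i32> poison, i32 %a, i32 0
  %v1  = insertelement <2 x i32> %v0, i32 %b.i, i32 1
  store <2 x i32> %v1, ptr addrspace(1) %p, align 8
  ret void
}
```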
</details>
https://github.com/llvm/llvm-project/pull/154069