[llvm] [MemCpyOpt] Merge memset and skip unrelated clobber in one scan (PR #90350)

Mon Apr 29 05:07:45 PDT 2024

https://github.com/XChy updated https://github.com/llvm/llvm-project/pull/90350

>From fbcf682fbba3870b2750f1e1cf5185ff6de9969d Mon Sep 17 00:00:00 2001
From: XChy <xxs_chy at outlook.com>
Date: Mon, 22 Apr 2024 00:46:18 +0800
Subject: [PATCH 1/2] [MemCpyOpt] Precommit tests for merging into memset (NFC)

---
 .../Transforms/MemCpyOpt/merge-into-memset.ll | 174 +++++++++++++++++-
 1 file changed, 172 insertions(+), 2 deletions(-)

diff --git a/llvm/test/Transforms/MemCpyOpt/merge-into-memset.ll b/llvm/test/Transforms/MemCpyOpt/merge-into-memset.ll
index 78aa769982404a0..ca2ffd72818496f 100644
--- a/llvm/test/Transforms/MemCpyOpt/merge-into-memset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/merge-into-memset.ll
@@ -36,5 +36,175 @@ exit:
   ret void
 }
 
-declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
-declare void @llvm.memset.p0.i64(ptr, i8, i64, i1)
+define void @memset_clobber_no_alias(ptr %p) {
+; CHECK-LABEL: @memset_clobber_no_alias(
+; CHECK-NEXT:    [[STACK:%.*]] = alloca <256 x i8>, align 8
+; CHECK-NEXT:    [[STACK1:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 8
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK1]], i8 0, i64 136, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[P:%.*]], i8 0, i64 16, i1 false)
+; CHECK-NEXT:    [[STACK2:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 24
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK2]], i8 0, i64 24, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %stack = alloca <256 x i8>, align 8
+  %stack1 = getelementptr inbounds i8, ptr %stack, i64 8
+  call void @llvm.memset.p0.i64(ptr %stack1, i8 0, i64 136, i1 false)
+  call void @llvm.memset.p0.i64(ptr %p, i8 0, i64 16, i1 false)
+  %stack2 = getelementptr inbounds i8, ptr %stack, i64 24
+  call void @llvm.memset.p0.i64(ptr %stack2, i8 0, i64 24, i1 false)
+  ret void
+}
+
+define void @store_clobber_no_alias1(i64 %a, ptr %p) {
+; CHECK-LABEL: @store_clobber_no_alias1(
+; CHECK-NEXT:    [[STACK:%.*]] = alloca <256 x i8>, align 8
+; CHECK-NEXT:    [[STACK1:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 8
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK1]], i8 0, i64 136, i1 false)
+; CHECK-NEXT:    store i64 [[A:%.*]], ptr [[P:%.*]], align 8
+; CHECK-NEXT:    [[STACK2:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 24
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK2]], i8 0, i64 24, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %stack = alloca <256 x i8>, align 8
+  %stack1 = getelementptr inbounds i8, ptr %stack, i64 8
+  call void @llvm.memset.p0.i64(ptr %stack1, i8 0, i64 136, i1 false)
+  store i64 %a, ptr %p, align 8
+  %stack2 = getelementptr inbounds i8, ptr %stack, i64 24
+  call void @llvm.memset.p0.i64(ptr %stack2, i8 0, i64 24, i1 false)
+  ret void
+}
+
+define void @store_clobber_no_alias2(i64 %a, ptr %p) {
+; CHECK-LABEL: @store_clobber_no_alias2(
+; CHECK-NEXT:    [[STACK:%.*]] = alloca <256 x i8>, align 8
+; CHECK-NEXT:    [[STACK1:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 8
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK1]], i8 0, i64 136, i1 false)
+; CHECK-NEXT:    store i64 [[A:%.*]], ptr [[P:%.*]], align 8
+; CHECK-NEXT:    [[STACK2:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 24
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK2]], i8 0, i64 24, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %stack = alloca <256 x i8>, align 8
+  %stack1 = getelementptr inbounds i8, ptr %stack, i64 8
+  call void @llvm.memset.p0.i64(ptr %stack1, i8 0, i64 136, i1 false)
+  store i64 %a, ptr %p, align 8
+  %stack2 = getelementptr inbounds i8, ptr %stack, i64 24
+  call void @llvm.memset.p0.i64(ptr %stack2, i8 0, i64 24, i1 false)
+  ret void
+}
+
+define void @store_clobber_no_alias_precise_fail(i64 %a) {
+; CHECK-LABEL: @store_clobber_no_alias_precise_fail(
+; CHECK-NEXT:    [[STACK:%.*]] = alloca <256 x i8>, align 8
+; CHECK-NEXT:    [[STACK1:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 8
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK1]], i8 0, i64 136, i1 false)
+; CHECK-NEXT:    store i64 [[A:%.*]], ptr [[STACK]], align 8
+; CHECK-NEXT:    [[STACK2:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 24
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK2]], i8 0, i64 24, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %stack = alloca <256 x i8>, align 8
+  %stack1 = getelementptr inbounds i8, ptr %stack, i64 8
+  call void @llvm.memset.p0.i64(ptr %stack1, i8 0, i64 136, i1 false)
+  store i64 %a, ptr %stack, align 8
+  %stack2 = getelementptr inbounds i8, ptr %stack, i64 24
+  call void @llvm.memset.p0.i64(ptr %stack2, i8 0, i64 24, i1 false)
+  ret void
+}
+
+define void @store_clobber_may_alias_fail(ptr %p, ptr %p1) {
+; CHECK-LABEL: @store_clobber_may_alias_fail(
+; CHECK-NEXT:    [[STACK1:%.*]] = getelementptr inbounds i8, ptr [[STACK:%.*]], i64 8
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK1]], i8 0, i64 136, i1 false)
+; CHECK-NEXT:    store i64 0, ptr [[P1:%.*]], align 8
+; CHECK-NEXT:    [[STACK2:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 24
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK2]], i8 0, i64 24, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %stack1 = getelementptr inbounds i8, ptr %p, i64 8
+  call void @llvm.memset.p0.i64(ptr %stack1, i8 0, i64 136, i1 false)
+  store i64 0, ptr %p1, align 8
+  %stack2 = getelementptr inbounds i8, ptr %p, i64 24
+  call void @llvm.memset.p0.i64(ptr %stack2, i8 0, i64 24, i1 false)
+  ret void
+}
+
+define void @load_clobber_no_alias(ptr %p, ptr %p1) {
+; CHECK-LABEL: @load_clobber_no_alias(
+; CHECK-NEXT:    [[STACK:%.*]] = alloca <256 x i8>, align 8
+; CHECK-NEXT:    [[STACK1:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 8
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK1]], i8 0, i64 136, i1 false)
+; CHECK-NEXT:    [[A:%.*]] = load i64, ptr [[P:%.*]], align 8
+; CHECK-NEXT:    store i64 [[A]], ptr [[P1:%.*]], align 8
+; CHECK-NEXT:    [[STACK2:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 24
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK2]], i8 0, i64 24, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %stack = alloca <256 x i8>, align 8
+  %stack1 = getelementptr inbounds i8, ptr %stack, i64 8
+  call void @llvm.memset.p0.i64(ptr %stack1, i8 0, i64 136, i1 false)
+  %a = load i64, ptr %p, align 8
+  store i64 %a, ptr %p1, align 8
+  %stack2 = getelementptr inbounds i8, ptr %stack, i64 24
+  call void @llvm.memset.p0.i64(ptr %stack2, i8 0, i64 24, i1 false)
+  ret void
+}
+
+define void @load_clobber_alias_fail(ptr %p, ptr %p1) {
+; CHECK-LABEL: @load_clobber_alias_fail(
+; CHECK-NEXT:    [[STACK:%.*]] = alloca <256 x i8>, align 8
+; CHECK-NEXT:    [[STACK1:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 8
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK1]], i8 0, i64 136, i1 false)
+; CHECK-NEXT:    [[A:%.*]] = load i64, ptr [[STACK]], align 8
+; CHECK-NEXT:    store i64 [[A]], ptr [[P1:%.*]], align 8
+; CHECK-NEXT:    [[STACK2:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 24
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK2]], i8 0, i64 24, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %stack = alloca <256 x i8>, align 8
+  %stack1 = getelementptr inbounds i8, ptr %stack, i64 8
+  call void @llvm.memset.p0.i64(ptr %stack1, i8 0, i64 136, i1 false)
+  %a = load i64, ptr %stack, align 8
+  store i64 %a, ptr %p1, align 8
+  %stack2 = getelementptr inbounds i8, ptr %stack, i64 24
+  call void @llvm.memset.p0.i64(ptr %stack2, i8 0, i64 24, i1 false)
+  ret void
+}
+
+define void @memset_volatile_fail(ptr %p) {
+; CHECK-LABEL: @memset_volatile_fail(
+; CHECK-NEXT:    [[STACK:%.*]] = alloca <256 x i8>, align 8
+; CHECK-NEXT:    [[STACK1:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 8
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK1]], i8 0, i64 136, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[P:%.*]], i8 0, i64 16, i1 true)
+; CHECK-NEXT:    [[STACK2:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 24
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK2]], i8 0, i64 24, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %stack = alloca <256 x i8>, align 8
+  %stack1 = getelementptr inbounds i8, ptr %stack, i64 8
+  call void @llvm.memset.p0.i64(ptr %stack1, i8 0, i64 136, i1 false)
+  call void @llvm.memset.p0.i64(ptr %p, i8 0, i64 16, i1 true)
+  %stack2 = getelementptr inbounds i8, ptr %stack, i64 24
+  call void @llvm.memset.p0.i64(ptr %stack2, i8 0, i64 24, i1 false)
+  ret void
+}
+
+define void @store_volatile_fail(i64 %a, ptr %p) {
+; CHECK-LABEL: @store_volatile_fail(
+; CHECK-NEXT:    [[STACK:%.*]] = alloca <256 x i8>, align 8
+; CHECK-NEXT:    [[STACK1:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 8
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK1]], i8 0, i64 136, i1 false)
+; CHECK-NEXT:    store volatile i64 [[A:%.*]], ptr [[P:%.*]], align 8
+; CHECK-NEXT:    [[STACK2:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 24
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK2]], i8 0, i64 24, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %stack = alloca <256 x i8>, align 8
+  %stack1 = getelementptr inbounds i8, ptr %stack, i64 8
+  call void @llvm.memset.p0.i64(ptr %stack1, i8 0, i64 136, i1 false)
+  store volatile i64 %a, ptr %p
+  %stack2 = getelementptr inbounds i8, ptr %stack, i64 24
+  call void @llvm.memset.p0.i64(ptr %stack2, i8 0, i64 24, i1 false)
+  ret void
+}

>From cd5a161c5ecaa7868532cee60ae95fca645641ca Mon Sep 17 00:00:00 2001
From: XChy <xxs_chy at outlook.com>
Date: Mon, 22 Apr 2024 00:29:53 +0800
Subject: [PATCH 2/2] [MemCpyOpt] Continue merge memset with unrelated clobber

---
 .../llvm/Transforms/Scalar/MemCpyOptimizer.h  |   3 +-
 .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 343 +++++++++++-------
 llvm/test/Transforms/MemCpyOpt/form-memset.ll |   4 +-
 .../Transforms/MemCpyOpt/merge-into-memset.ll |  12 +-
 4 files changed, 214 insertions(+), 148 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
index 6c809bc881d050d..00ed731e0cda00b 100644
--- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -78,8 +78,7 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
                                   BatchAAResults &BAA);
   bool processByValArgument(CallBase &CB, unsigned ArgNo);
   bool processImmutArgument(CallBase &CB, unsigned ArgNo);
-  Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
-                                    Value *ByteVal);
+  bool tryMergingIntoMemset(BasicBlock *BB);
   bool moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI);
   bool performStackMoveOptzn(Instruction *Load, Instruction *Store,
                              AllocaInst *DestAlloca, AllocaInst *SrcAlloca,
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 7ef5dceffec0d2a..fa4be64ec9e8bad 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -158,11 +158,16 @@ class MemsetRanges {
 
   /// A sorted list of the memset ranges.
   SmallVector<MemsetRange, 8> Ranges;
-
-  const DataLayout &DL;
+  const DataLayout *DL;
 
 public:
-  MemsetRanges(const DataLayout &DL) : DL(DL) {}
+  MemsetRanges() {}
+  MemsetRanges(const DataLayout *DL) : DL(DL) {}
+  MemsetRanges(const DataLayout *DL, Instruction *I, Value *StartPtr,
+               Value *ByteVal)
+      : DL(DL), StartInst(I), StartPtr(StartPtr),
+        StartPtrLocation(MemoryLocation::getBeforeOrAfter(StartPtr)),
+        ByteVal(ByteVal) {}
 
   using const_iterator = SmallVectorImpl<MemsetRange>::const_iterator;
 
@@ -178,7 +183,7 @@ class MemsetRanges {
   }
 
   void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
-    TypeSize StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
+    TypeSize StoreSize = DL->getTypeStoreSize(SI->getOperand(0)->getType());
     assert(!StoreSize.isScalable() && "Can't track scalable-typed stores");
     addRange(OffsetFromFirst, StoreSize.getFixedValue(),
              SI->getPointerOperand(), SI->getAlign(), SI);
@@ -191,6 +196,18 @@ class MemsetRanges {
 
   void addRange(int64_t Start, int64_t Size, Value *Ptr, MaybeAlign Alignment,
                 Instruction *Inst);
+
+  bool flush(MemorySSAUpdater *MSSAU, Instruction *InsertPoint,
+             MemoryUseOrDef *&MemInsertPoint);
+
+  // The first store/memset instruction
+  Instruction *StartInst;
+  // The start pointer written by the first instruction
+  Value *StartPtr;
+  // The memory location of StartPtr
+  MemoryLocation StartPtrLocation;
+  // The byte value of memset
+  Value *ByteVal;
 };
 
 } // end anonymous namespace
@@ -255,6 +272,73 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
   }
 }
 
+bool MemsetRanges::flush(MemorySSAUpdater *MSSAU, Instruction *InsertPoint,
+                         MemoryUseOrDef *&MemInsertPoint) {
+
+  // If we have no ranges, then we just had a single store with nothing that
+  // could be merged in.  This is a very common case of course.
+  if (empty())
+    return false;
+
+  // If we had at least one store that could be merged in, add the starting
+  // store as well.  We try to avoid this unless there is at least something
+  // interesting as a small compile-time optimization.
+  addInst(0, StartInst);
+
+  // If we create any memsets, we put it right before the first instruction that
+  // isn't part of the memset block.  This ensure that the memset is dominated
+  // by any addressing instruction needed by the start of the block.
+  IRBuilder<> Builder(InsertPoint);
+
+  // Now that we have full information about ranges, loop over the ranges and
+  // emit memset's for anything big enough to be worthwhile.
+  Instruction *AMemSet = nullptr;
+
+  // Keeps track of the last memory use or def before the insertion point for
+  // the new memset. The new MemoryDef for the inserted memsets will be inserted
+  // after MemInsertPoint.
+  for (const MemsetRange &Range : *this) {
+    if (Range.TheStores.size() == 1)
+      continue;
+
+    // If it is profitable to lower this range to memset, do so now.
+    if (!Range.isProfitableToUseMemset(*DL))
+      continue;
+
+    // Otherwise, we do want to transform this!  Create a new memset.
+    // Get the starting pointer of the block.
+    Value *CurrentStart = Range.StartPtr;
+
+    AMemSet = Builder.CreateMemSet(CurrentStart, ByteVal,
+                                   Range.End - Range.Start, Range.Alignment);
+    AMemSet->mergeDIAssignID(Range.TheStores);
+
+    LLVM_DEBUG(dbgs() << "Replace stores:\n"; for (Instruction *SI
+                                                   : Range.TheStores) dbgs()
+                                              << *SI << '\n';
+               dbgs() << "With: " << *AMemSet << '\n');
+    if (!Range.TheStores.empty())
+      AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
+
+    auto *NewDef = cast<MemoryDef>(
+        MemInsertPoint->getMemoryInst() == InsertPoint
+            ? MSSAU->createMemoryAccessBefore(AMemSet, nullptr, MemInsertPoint)
+            : MSSAU->createMemoryAccessAfter(AMemSet, nullptr, MemInsertPoint));
+    MSSAU->insertDef(NewDef, /*RenameUses=*/true);
+    MemInsertPoint = NewDef;
+
+    // Zap all the stores.
+    for (Instruction *SI : Range.TheStores) {
+      MSSAU->removeMemoryAccess(SI);
+      SI->eraseFromParent();
+    }
+
+    ++NumMemSetInfer;
+  }
+
+  return AMemSet;
+}
+
 //===----------------------------------------------------------------------===//
 //                         MemCpyOptLegacyPass Pass
 //===----------------------------------------------------------------------===//
@@ -350,157 +434,156 @@ static void combineAAMetadata(Instruction *ReplInst, Instruction *I) {
   combineMetadata(ReplInst, I, KnownIDs, true);
 }
 
+static bool isCandidateToMergeIntoMemset(Instruction *I, const DataLayout &DL,
+                                         Value *&ByteVal) {
+
+  if (auto *SI = dyn_cast<StoreInst>(I)) {
+    Value *StoredVal = SI->getValueOperand();
+
+    // Avoid merging nontemporal stores since the resulting
+    // memcpy/memset would not be able to preserve the nontemporal hint.
+    if (SI->getMetadata(LLVMContext::MD_nontemporal))
+      return false;
+    // Don't convert stores of non-integral pointer types to memsets (which
+    // stores integers).
+    if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
+      return false;
+
+    // We can't track ranges involving scalable types.
+    if (DL.getTypeStoreSize(StoredVal->getType()).isScalable())
+      return false;
+
+    ByteVal = isBytewiseValue(StoredVal, DL);
+    if (!ByteVal)
+      return false;
+
+    return true;
+  }
+
+  if (auto *MSI = dyn_cast<MemSetInst>(I)) {
+    if (!isa<ConstantInt>(MSI->getLength()))
+      return false;
+
+    ByteVal = MSI->getValue();
+    return true;
+  }
+
+  return false;
+}
+
+static Value *getWrittenPtr(Instruction *I) {
+  if (auto *SI = dyn_cast<StoreInst>(I))
+    return SI->getPointerOperand();
+  if (auto *MSI = dyn_cast<MemSetInst>(I))
+    return MSI->getDest();
+  static_assert("Only support store and memset");
+  return nullptr;
+}
+
 /// When scanning forward over instructions, we look for some other patterns to
 /// fold away. In particular, this looks for stores to neighboring locations of
 /// memory. If it sees enough consecutive ones, it attempts to merge them
 /// together into a memcpy/memset.
-Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
-                                                 Value *StartPtr,
-                                                 Value *ByteVal) {
-  const DataLayout &DL = StartInst->getModule()->getDataLayout();
+bool MemCpyOptPass::tryMergingIntoMemset(BasicBlock *BB) {
+  MapVector<Value *, MemsetRanges> ObjToRanges;
+  const DataLayout &DL = BB->getModule()->getDataLayout();
+  MemoryUseOrDef *MemInsertPoint = nullptr;
+  bool MadeChanged = false;
 
-  // We can't track scalable types
-  if (auto *SI = dyn_cast<StoreInst>(StartInst))
-    if (DL.getTypeStoreSize(SI->getOperand(0)->getType()).isScalable())
-      return nullptr;
+  // The following code creates memset intrinsics out of thin air. Don't do
+  // this if the corresponding libfunc is not available.
+  if (!(TLI->has(LibFunc_memset) || EnableMemCpyOptWithoutLibcalls))
+    return false;
 
-  // Okay, so we now have a single store that can be splatable.  Scan to find
-  // all subsequent stores of the same value to offset from the same pointer.
-  // Join these together into ranges, so we can decide whether contiguous blocks
-  // are stored.
-  MemsetRanges Ranges(DL);
+  for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; BI++) {
+    BatchAAResults BAA(*AA);
+    Instruction *I = &*BI;
 
-  BasicBlock::iterator BI(StartInst);
+    if (!I->mayReadOrWriteMemory())
+      continue;
 
-  // Keeps track of the last memory use or def before the insertion point for
-  // the new memset. The new MemoryDef for the inserted memsets will be inserted
-  // after MemInsertPoint.
-  MemoryUseOrDef *MemInsertPoint = nullptr;
-  for (++BI; !BI->isTerminator(); ++BI) {
-    auto *CurrentAcc = cast_or_null<MemoryUseOrDef>(
-        MSSAU->getMemorySSA()->getMemoryAccess(&*BI));
+    auto *CurrentAcc =
+        cast_or_null<MemoryUseOrDef>(MSSAU->getMemorySSA()->getMemoryAccess(I));
     if (CurrentAcc)
       MemInsertPoint = CurrentAcc;
 
     // Calls that only access inaccessible memory do not block merging
     // accessible stores.
-    if (auto *CB = dyn_cast<CallBase>(BI)) {
+    if (auto *CB = dyn_cast<CallBase>(BI))
       if (CB->onlyAccessesInaccessibleMemory())
         continue;
-    }
 
-    if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
-      // If the instruction is readnone, ignore it, otherwise bail out.  We
-      // don't even allow readonly here because we don't want something like:
-      // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
-      if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
-        break;
+    if (I->isVolatile() || I->isAtomic()) {
+      // Flush all MemsetRanges if reaching a fence.
+      for (auto [Obj, Ranges] : ObjToRanges)
+        MadeChanged |= Ranges.flush(MSSAU, I, MemInsertPoint);
+      ObjToRanges.clear();
       continue;
     }
 
-    if (auto *NextStore = dyn_cast<StoreInst>(BI)) {
-      // If this is a store, see if we can merge it in.
-      if (!NextStore->isSimple())
-        break;
+    // Handle other clobbers
+    if (!isa<StoreInst>(I) && !isa<MemSetInst>(I)) {
+      // Flush all may-aliased MemsetRanges.
+      ObjToRanges.remove_if([&](std::pair<Value *, MemsetRanges> &Entry) {
+        auto &Ranges = Entry.second;
+        bool ShouldFlush =
+            isModOrRefSet(BAA.getModRefInfo(I, Ranges.StartPtrLocation));
+        if (ShouldFlush)
+          MadeChanged |= Ranges.flush(MSSAU, I, MemInsertPoint);
+        return ShouldFlush;
+      });
+      continue;
+    }
 
-      Value *StoredVal = NextStore->getValueOperand();
+    Value *WrittenPtr = getWrittenPtr(I);
+    Value *Obj = getUnderlyingObject(WrittenPtr);
+    Value *ByteVal = nullptr;
 
-      // Don't convert stores of non-integral pointer types to memsets (which
-      // stores integers).
-      if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
-        break;
+    // If this is a store, see if we can merge it in.
+    if (ObjToRanges.contains(Obj)) {
+      MemsetRanges &Ranges = ObjToRanges[Obj];
 
-      // We can't track ranges involving scalable types.
-      if (DL.getTypeStoreSize(StoredVal->getType()).isScalable())
-        break;
+      if (!isCandidateToMergeIntoMemset(I, DL, ByteVal)) {
+        MadeChanged |= Ranges.flush(MSSAU, I, MemInsertPoint);
+        ObjToRanges.erase(Obj);
+        continue;
+      }
 
-      // Check to see if this stored value is of the same byte-splattable value.
-      Value *StoredByte = isBytewiseValue(StoredVal, DL);
-      if (isa<UndefValue>(ByteVal) && StoredByte)
-        ByteVal = StoredByte;
-      if (ByteVal != StoredByte)
-        break;
+      if (isa<UndefValue>(Ranges.ByteVal))
+        Ranges.ByteVal = ByteVal;
 
-      // Check to see if this store is to a constant offset from the start ptr.
       std::optional<int64_t> Offset =
-          NextStore->getPointerOperand()->getPointerOffsetFrom(StartPtr, DL);
-      if (!Offset)
-        break;
-
-      Ranges.addStore(*Offset, NextStore);
-    } else {
-      auto *MSI = cast<MemSetInst>(BI);
+          WrittenPtr->getPointerOffsetFrom(Ranges.StartPtr, DL);
 
-      if (MSI->isVolatile() || ByteVal != MSI->getValue() ||
-          !isa<ConstantInt>(MSI->getLength()))
-        break;
+      if (!Offset || Ranges.ByteVal != ByteVal) {
+        MadeChanged |= Ranges.flush(MSSAU, I, MemInsertPoint);
+        ObjToRanges[Obj] = MemsetRanges(&DL, I, WrittenPtr, ByteVal);
+        continue;
+      }
 
-      // Check to see if this store is to a constant offset from the start ptr.
-      std::optional<int64_t> Offset =
-          MSI->getDest()->getPointerOffsetFrom(StartPtr, DL);
-      if (!Offset)
-        break;
+      Ranges.addInst(*Offset, I);
+    } else {
+      // Flush all may-aliased MemsetRanges.
+      ObjToRanges.remove_if([&](std::pair<Value *, MemsetRanges> &Entry) {
+        auto &Ranges = Entry.second;
+        bool ShouldFlush =
+            isModOrRefSet(BAA.getModRefInfo(I, Ranges.StartPtrLocation));
+        if (ShouldFlush)
+          MadeChanged |= Ranges.flush(MSSAU, I, MemInsertPoint);
+        return ShouldFlush;
+      });
 
-      Ranges.addMemSet(*Offset, MSI);
+      // Create a new MemsetRanges.
+      if (isCandidateToMergeIntoMemset(I, DL, ByteVal))
+        ObjToRanges.insert({Obj, MemsetRanges(&DL, I, WrittenPtr, ByteVal)});
     }
   }
 
-  // If we have no ranges, then we just had a single store with nothing that
-  // could be merged in.  This is a very common case of course.
-  if (Ranges.empty())
-    return nullptr;
-
-  // If we had at least one store that could be merged in, add the starting
-  // store as well.  We try to avoid this unless there is at least something
-  // interesting as a small compile-time optimization.
-  Ranges.addInst(0, StartInst);
-
-  // If we create any memsets, we put it right before the first instruction that
-  // isn't part of the memset block.  This ensure that the memset is dominated
-  // by any addressing instruction needed by the start of the block.
-  IRBuilder<> Builder(&*BI);
-
-  // Now that we have full information about ranges, loop over the ranges and
-  // emit memset's for anything big enough to be worthwhile.
-  Instruction *AMemSet = nullptr;
-  for (const MemsetRange &Range : Ranges) {
-    if (Range.TheStores.size() == 1)
-      continue;
-
-    // If it is profitable to lower this range to memset, do so now.
-    if (!Range.isProfitableToUseMemset(DL))
-      continue;
-
-    // Otherwise, we do want to transform this!  Create a new memset.
-    // Get the starting pointer of the block.
-    StartPtr = Range.StartPtr;
-
-    AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End - Range.Start,
-                                   Range.Alignment);
-    AMemSet->mergeDIAssignID(Range.TheStores);
-
-    LLVM_DEBUG(dbgs() << "Replace stores:\n"; for (Instruction *SI
-                                                   : Range.TheStores) dbgs()
-                                              << *SI << '\n';
-               dbgs() << "With: " << *AMemSet << '\n');
-    if (!Range.TheStores.empty())
-      AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
-
-    auto *NewDef = cast<MemoryDef>(
-        MemInsertPoint->getMemoryInst() == &*BI
-            ? MSSAU->createMemoryAccessBefore(AMemSet, nullptr, MemInsertPoint)
-            : MSSAU->createMemoryAccessAfter(AMemSet, nullptr, MemInsertPoint));
-    MSSAU->insertDef(NewDef, /*RenameUses=*/true);
-    MemInsertPoint = NewDef;
-
-    // Zap all the stores.
-    for (Instruction *SI : Range.TheStores)
-      eraseInstruction(SI);
-
-    ++NumMemSetInfer;
+  for (auto [Obj, Ranges] : ObjToRanges) {
+    MadeChanged |= Ranges.flush(MSSAU, &BB->back(), MemInsertPoint);
   }
 
-  return AMemSet;
+  return MadeChanged;
 }
 
 // This method try to lift a store instruction before position P.
@@ -797,12 +880,6 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
   // 0xA0A0A0A0 and 0.0.
   auto *V = SI->getOperand(0);
   if (Value *ByteVal = isBytewiseValue(V, DL)) {
-    if (Instruction *I =
-            tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) {
-      BBI = I->getIterator(); // Don't invalidate iterator.
-      return true;
-    }
-
     // If we have an aggregate, we try to promote it to memset regardless
     // of opportunity for merging as it can expose optimization opportunities
     // in subsequent passes.
@@ -835,14 +912,6 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
 }
 
 bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
-  // See if there is another memset or store neighboring this memset which
-  // allows us to widen out the memset to do a single larger store.
-  if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile())
-    if (Instruction *I =
-            tryMergingIntoMemset(MSI, MSI->getDest(), MSI->getValue())) {
-      BBI = I->getIterator(); // Don't invalidate iterator.
-      return true;
-    }
   return false;
 }
 
@@ -1991,6 +2060,8 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) {
     if (!DT->isReachableFromEntry(&BB))
       continue;
 
+    MadeChange |= tryMergingIntoMemset(&BB);
+
     for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
       // Avoid invalidating the iterator.
       Instruction *I = &*BI++;
diff --git a/llvm/test/Transforms/MemCpyOpt/form-memset.ll b/llvm/test/Transforms/MemCpyOpt/form-memset.ll
index 020a72183e9ea1c..edcf27e48f3bda6 100644
--- a/llvm/test/Transforms/MemCpyOpt/form-memset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/form-memset.ll
@@ -97,7 +97,6 @@ define void @test2() nounwind  {
 ; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr [8 x i8], ptr [[REF_IDX]], i32 0, i32 1
 ; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr [8 x i8], ptr [[REF_IDX]], i32 0, i32 0
 ; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr [8 x %struct.MV], ptr [[UP_MVD]], i32 0, i32 7, i32 0
-; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 1 [[TMP41]], i8 -1, i64 8, i1 false)
 ; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr [8 x %struct.MV], ptr [[UP_MVD]], i32 0, i32 7, i32 1
 ; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr [8 x %struct.MV], ptr [[UP_MVD]], i32 0, i32 6, i32 0
 ; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr [8 x %struct.MV], ptr [[UP_MVD]], i32 0, i32 6, i32 1
@@ -114,7 +113,6 @@ define void @test2() nounwind  {
 ; CHECK-NEXT:    [[TMP141:%.*]] = getelementptr [8 x %struct.MV], ptr [[UP_MVD]], i32 0, i32 0, i32 0
 ; CHECK-NEXT:    [[TMP144:%.*]] = getelementptr [8 x %struct.MV], ptr [[UP_MVD]], i32 0, i32 0, i32 1
 ; CHECK-NEXT:    [[TMP148:%.*]] = getelementptr [8 x %struct.MV], ptr [[LEFT_MVD]], i32 0, i32 7, i32 0
-; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP141]], i8 0, i64 32, i1 false)
 ; CHECK-NEXT:    [[TMP151:%.*]] = getelementptr [8 x %struct.MV], ptr [[LEFT_MVD]], i32 0, i32 7, i32 1
 ; CHECK-NEXT:    [[TMP162:%.*]] = getelementptr [8 x %struct.MV], ptr [[LEFT_MVD]], i32 0, i32 6, i32 0
 ; CHECK-NEXT:    [[TMP165:%.*]] = getelementptr [8 x %struct.MV], ptr [[LEFT_MVD]], i32 0, i32 6, i32 1
@@ -132,6 +130,8 @@ define void @test2() nounwind  {
 ; CHECK-NEXT:    [[TMP249:%.*]] = getelementptr [8 x %struct.MV], ptr [[LEFT_MVD]], i32 0, i32 0, i32 1
 ; CHECK-NEXT:    [[UP_MVD252:%.*]] = getelementptr [8 x %struct.MV], ptr [[UP_MVD]], i32 0, i32 0
 ; CHECK-NEXT:    [[LEFT_MVD253:%.*]] = getelementptr [8 x %struct.MV], ptr [[LEFT_MVD]], i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 1 [[TMP41]], i8 -1, i64 8, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP141]], i8 0, i64 32, i1 false)
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP246]], i8 0, i64 32, i1 false)
 ; CHECK-NEXT:    call void @foo(ptr [[UP_MVD252]], ptr [[LEFT_MVD253]], ptr [[TMP41]]) #[[ATTR0]]
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/MemCpyOpt/merge-into-memset.ll b/llvm/test/Transforms/MemCpyOpt/merge-into-memset.ll
index ca2ffd72818496f..7e2d617d1ed2751 100644
--- a/llvm/test/Transforms/MemCpyOpt/merge-into-memset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/merge-into-memset.ll
@@ -40,10 +40,9 @@ define void @memset_clobber_no_alias(ptr %p) {
 ; CHECK-LABEL: @memset_clobber_no_alias(
 ; CHECK-NEXT:    [[STACK:%.*]] = alloca <256 x i8>, align 8
 ; CHECK-NEXT:    [[STACK1:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 8
-; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK1]], i8 0, i64 136, i1 false)
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[P:%.*]], i8 0, i64 16, i1 false)
 ; CHECK-NEXT:    [[STACK2:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 24
-; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK2]], i8 0, i64 24, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK1]], i8 0, i64 136, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %stack = alloca <256 x i8>, align 8
@@ -59,10 +58,9 @@ define void @store_clobber_no_alias1(i64 %a, ptr %p) {
 ; CHECK-LABEL: @store_clobber_no_alias1(
 ; CHECK-NEXT:    [[STACK:%.*]] = alloca <256 x i8>, align 8
 ; CHECK-NEXT:    [[STACK1:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 8
-; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK1]], i8 0, i64 136, i1 false)
 ; CHECK-NEXT:    store i64 [[A:%.*]], ptr [[P:%.*]], align 8
 ; CHECK-NEXT:    [[STACK2:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 24
-; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK2]], i8 0, i64 24, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK1]], i8 0, i64 136, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %stack = alloca <256 x i8>, align 8
@@ -78,10 +76,9 @@ define void @store_clobber_no_alias2(i64 %a, ptr %p) {
 ; CHECK-LABEL: @store_clobber_no_alias2(
 ; CHECK-NEXT:    [[STACK:%.*]] = alloca <256 x i8>, align 8
 ; CHECK-NEXT:    [[STACK1:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 8
-; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK1]], i8 0, i64 136, i1 false)
 ; CHECK-NEXT:    store i64 [[A:%.*]], ptr [[P:%.*]], align 8
 ; CHECK-NEXT:    [[STACK2:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 24
-; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK2]], i8 0, i64 24, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK1]], i8 0, i64 136, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %stack = alloca <256 x i8>, align 8
@@ -133,11 +130,10 @@ define void @load_clobber_no_alias(ptr %p, ptr %p1) {
 ; CHECK-LABEL: @load_clobber_no_alias(
 ; CHECK-NEXT:    [[STACK:%.*]] = alloca <256 x i8>, align 8
 ; CHECK-NEXT:    [[STACK1:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 8
-; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK1]], i8 0, i64 136, i1 false)
 ; CHECK-NEXT:    [[A:%.*]] = load i64, ptr [[P:%.*]], align 8
 ; CHECK-NEXT:    store i64 [[A]], ptr [[P1:%.*]], align 8
 ; CHECK-NEXT:    [[STACK2:%.*]] = getelementptr inbounds i8, ptr [[STACK]], i64 24
-; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK2]], i8 0, i64 24, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[STACK1]], i8 0, i64 136, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %stack = alloca <256 x i8>, align 8