[llvm] [MemCpyOpt] Forward memcpy source to load instruction (PR #138490)

via llvm-commits llvm-commits at lists.llvm.org
Mon May 5 00:58:32 PDT 2025


https://github.com/dianqk created https://github.com/llvm/llvm-project/pull/138490

Fixes #137810.

Compile time is a headache. :\
This PR mainly exists to provide context for subsequent PRs.
Perhaps a cheaper check should be implemented in InstCombine.

https://llvm-compile-time-tracker.com/compare.php?from=cfb9991c0812cde4afc3dc1b84221814a0e8b0ff&to=0fd9da5b5a0c53a90181451f63e0f6fd90b218f8&stat=instructions:u

>From cfb9991c0812cde4afc3dc1b84221814a0e8b0ff Mon Sep 17 00:00:00 2001
From: dianqk <dianqk at dianqk.net>
Date: Sun, 4 May 2025 20:15:14 +0800
Subject: [PATCH 1/4] Pre-commit test cases

---
 llvm/test/Transforms/MemCpyOpt/memcpy-load.ll | 96 +++++++++++++++++++
 .../PhaseOrdering/pr137810-forward-load.ll    | 68 +++++++++++++
 2 files changed, 164 insertions(+)
 create mode 100644 llvm/test/Transforms/MemCpyOpt/memcpy-load.ll
 create mode 100644 llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll

diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-load.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-load.ll
new file mode 100644
index 0000000000000..79f62cdbfdab4
--- /dev/null
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy-load.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=memcpyopt -S -verify-memoryssa | FileCheck %s
+
+define i24 @forward_load(ptr %src) {
+; CHECK-LABEL: define i24 @forward_load(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 3, i1 false)
+; CHECK-NEXT:    [[VAL1:%.*]] = load i24, ptr [[DEST]], align 4
+; CHECK-NEXT:    ret i24 [[VAL1]]
+;
+  %dest = alloca [3 x i8]
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+  %val = load i24, ptr %dest
+  ret i24 %val
+}
+
+define i16 @forward_load_2(ptr %src) {
+; CHECK-LABEL: define i16 @forward_load_2(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 2, i1 false)
+; CHECK-NEXT:    [[VAL1:%.*]] = load i16, ptr [[DEST]], align 2
+; CHECK-NEXT:    ret i16 [[VAL1]]
+;
+  %dest = alloca [3 x i8]
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false)
+  %val = load i16, ptr %dest
+  ret i16 %val
+}
+
+define i32 @forward_load_padding(ptr %src) {
+; CHECK-LABEL: define i32 @forward_load_padding(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[DEST:%.*]] = alloca { i8, i32 }, align 8
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 8, i1 false)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[DEST]], i64 4
+; CHECK-NEXT:    [[VAL1:%.*]] = load i32, ptr [[GEP]], align 4
+; CHECK-NEXT:    ret i32 [[VAL1]]
+;
+  %dest = alloca { i8, i32 }
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 8, i1 false)
+  %gep = getelementptr inbounds i8, ptr %dest, i64 4
+  %val = load i32, ptr %gep
+  ret i32 %val
+}
+
+; Negative tests
+
+define i24 @failed_forward_load_write_src(ptr %src) {
+; CHECK-LABEL: define i24 @failed_forward_load_write_src(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 3, i1 false)
+; CHECK-NEXT:    store i1 true, ptr [[SRC]], align 1
+; CHECK-NEXT:    [[VAL:%.*]] = load i24, ptr [[DEST]], align 4
+; CHECK-NEXT:    ret i24 [[VAL]]
+;
+  %dest = alloca [3 x i8]
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+  store i1 true, ptr %src
+  %val = load i24, ptr %dest
+  ret i24 %val
+}
+
+define i16 @failed_forward_load_size(ptr %src) {
+; CHECK-LABEL: define i16 @failed_forward_load_size(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 1, i1 false)
+; CHECK-NEXT:    [[VAL:%.*]] = load i16, ptr [[DEST]], align 2
+; CHECK-NEXT:    ret i16 [[VAL]]
+;
+  %dest = alloca [3 x i8]
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 1, i1 false)
+  %val = load i16, ptr %dest
+  ret i16 %val
+}
+
+define i32 @failed_forward_load_padding(ptr %src) {
+; CHECK-LABEL: define i32 @failed_forward_load_padding(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[DEST:%.*]] = alloca { i8, i32 }, align 8
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 5, i1 false)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[DEST]], i64 4
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[GEP]], align 4
+; CHECK-NEXT:    ret i32 [[VAL]]
+;
+  %dest = alloca { i8, i32 }
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 5, i1 false)
+  %gep = getelementptr inbounds i8, ptr %dest, i64 4
+  %val = load i32, ptr %gep
+  ret i32 %val
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
diff --git a/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
new file mode 100644
index 0000000000000..224258530ecc0
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -O2 -S < %s | FileCheck %s
+
+; FIXME: It can return true.
+define i1 @main(ptr %i2) {
+; CHECK-LABEL: define noundef i1 @main(
+; CHECK-SAME: ptr captures(none) initializes((0, 3)) [[I2:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:    [[I1:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT:    store i8 0, ptr [[I2]], align 1
+; CHECK-NEXT:    [[I3:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 1
+; CHECK-NEXT:    store i8 1, ptr [[I3]], align 1
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 2
+; CHECK-NEXT:    store i8 2, ptr [[I4]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 3, ptr nonnull [[I1]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[I1]], ptr noundef nonnull align 1 dereferenceable(3) [[I2]], i64 3, i1 false)
+; CHECK-NEXT:    [[I51:%.*]] = load i8, ptr [[I1]], align 1
+; CHECK-NEXT:    [[I6:%.*]] = icmp eq i8 [[I51]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[I1]], i64 1
+; CHECK-NEXT:    [[I82:%.*]] = load i8, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[I9:%.*]] = icmp eq i8 [[I82]], 1
+; CHECK-NEXT:    [[I10:%.*]] = select i1 [[I6]], i1 [[I9]], i1 false
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[I1]], i64 2
+; CHECK-NEXT:    [[I123:%.*]] = load i8, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[I13:%.*]] = icmp eq i8 [[I123]], 2
+; CHECK-NEXT:    [[I14:%.*]] = select i1 [[I10]], i1 [[I13]], i1 false
+; CHECK-NEXT:    br i1 [[I14]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; CHECK:       [[COMMON_RET:.*]]:
+; CHECK-NEXT:    ret i1 [[I14]]
+; CHECK:       [[TRUE]]:
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 3, ptr nonnull [[I1]])
+; CHECK-NEXT:    br label %[[COMMON_RET]]
+; CHECK:       [[FALSE]]:
+; CHECK-NEXT:    call void @assert_failed(ptr nonnull [[I1]])
+; CHECK-NEXT:    br label %[[COMMON_RET]]
+;
+  %i1 = alloca [3 x i8], align 1
+  store i8 0, ptr %i2, align 1
+  %i3 = getelementptr inbounds nuw i8, ptr %i2, i64 1
+  store i8 1, ptr %i3, align 1
+  %i4 = getelementptr inbounds nuw i8, ptr %i2, i64 2
+  store i8 2, ptr %i4, align 1
+  call void @llvm.lifetime.start.p0(i64 3, ptr nonnull %i1)
+  call void @llvm.memcpy.p0.p0.i64(ptr %i1, ptr %i2, i64 3, i1 false)
+  %i5 = load i8, ptr %i1, align 1
+  %i6 = icmp eq i8 %i5, 0
+  %i7 = getelementptr inbounds nuw i8, ptr %i1, i64 1
+  %i8 = load i8, ptr %i7, align 1
+  %i9 = icmp eq i8 %i8, 1
+  %i10 = select i1 %i6, i1 %i9, i1 false
+  %i11 = getelementptr inbounds nuw i8, ptr %i1, i64 2
+  %i12 = load i8, ptr %i11, align 1
+  %i13 = icmp eq i8 %i12, 2
+  %i14 = select i1 %i10, i1 %i13, i1 false
+  br i1 %i14, label %true, label %false
+
+true:
+  call void @llvm.lifetime.end.p0(i64 3, ptr nonnull %i1)
+  ret i1 true
+
+false:
+  call void @assert_failed(ptr %i1)
+  ret i1 false
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
+declare void @llvm.lifetime.start.p0(i64, ptr)
+declare void @llvm.lifetime.end.p0(i64, ptr)
+declare void @assert_failed(ptr)

>From 0fd9da5b5a0c53a90181451f63e0f6fd90b218f8 Mon Sep 17 00:00:00 2001
From: dianqk <dianqk at dianqk.net>
Date: Sun, 4 May 2025 20:55:34 +0800
Subject: [PATCH 2/4] [MemCpyOpt] Forward memcpy source to load instruction

---
 .../llvm/Transforms/Scalar/MemCpyOptimizer.h  |  14 +-
 .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 275 +++++++++++-------
 llvm/test/Transforms/MemCpyOpt/memcpy-load.ll |   7 +-
 llvm/test/Transforms/MemCpyOpt/memcpy.ll      |   6 +-
 llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll |   2 +-
 .../PhaseOrdering/pr137810-forward-load.ll    |   6 +-
 6 files changed, 191 insertions(+), 119 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
index 496d2958fc2d0..d1369ae918959 100644
--- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_TRANSFORMS_SCALAR_MEMCPYOPTIMIZER_H
 #define LLVM_TRANSFORMS_SCALAR_MEMCPYOPTIMIZER_H
 
+#include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/PassManager.h"
 
@@ -64,21 +65,28 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
 private:
   // Helper functions
   bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
+  bool processLoad(LoadInst *LI, BasicBlock::iterator &BBI,
+                   SmallVectorImpl<Instruction *> &NewInsts);
   bool processStoreOfLoad(StoreInst *SI, LoadInst *LI, const DataLayout &DL,
                           BasicBlock::iterator &BBI);
   bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
-  bool processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI);
+  bool processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI,
+                     SmallVectorImpl<Instruction *> &NewInsts);
   bool processMemMove(MemMoveInst *M, BasicBlock::iterator &BBI);
   bool performCallSlotOptzn(Instruction *cpyLoad, Instruction *cpyStore,
                             Value *cpyDst, Value *cpySrc, TypeSize cpyLen,
                             Align cpyAlign, BatchAAResults &BAA,
                             std::function<CallInst *()> GetC);
   bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
-                                     BatchAAResults &BAA);
+                                     BatchAAResults &BAA,
+                                     SmallVectorImpl<Instruction *> &NewInsts);
   bool processMemSetMemCpyDependence(MemCpyInst *MemCpy, MemSetInst *MemSet,
                                      BatchAAResults &BAA);
   bool performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, MemSetInst *MemSet,
                                   BatchAAResults &BAA);
+  bool findNewSrc(MemCpyInst *MDep, Instruction *UseInstr, BatchAAResults &BAA,
+                  Value *&NewSrc, MaybeAlign &NewAlign,
+                  SmallVectorImpl<Instruction *> &NewInsts);
   bool processByValArgument(CallBase &CB, unsigned ArgNo);
   bool processImmutArgument(CallBase &CB, unsigned ArgNo);
   Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
@@ -90,7 +98,7 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
   bool isMemMoveMemSetDependency(MemMoveInst *M);
 
   void eraseInstruction(Instruction *I);
-  bool iterateOnFunction(Function &F);
+  bool iterateOnFunction(Function &F, SmallVectorImpl<Instruction *> &NewInsts);
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 6407f48dc2c05..1dfa6bc787278 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -74,6 +74,7 @@ STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
 STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
 STATISTIC(NumCallSlot, "Number of call slot optimizations performed");
 STATISTIC(NumStackMove, "Number of stack-move optimizations performed");
+STATISTIC(NumLoadInstr, "Number of load instruction optimizations performed");
 
 namespace {
 
@@ -739,6 +740,145 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI,
   return false;
 }
 
+bool MemCpyOptPass::findNewSrc(MemCpyInst *MDep, Instruction *UseInstr,
+                               BatchAAResults &BAA, Value *&NewSrc,
+                               MaybeAlign &NewAlign,
+                               SmallVectorImpl<Instruction *> &NewInsts) {
+  auto *MemCpy = dyn_cast<MemCpyInst>(UseInstr);
+  auto *LoadI = dyn_cast<LoadInst>(UseInstr);
+  MemoryLocation UseLoc;
+  Value *OldSrc;
+  if (MemCpy) {
+    UseLoc = MemoryLocation::getForSource(MemCpy);
+    OldSrc = MemCpy->getSource();
+  } else if (LoadI) {
+    UseLoc = MemoryLocation::get(LoadI);
+    OldSrc = LoadI->getPointerOperand();
+  } else
+    return false;
+  uint64_t UseLen = 0;
+  if (UseLoc.Size.hasValue())
+    UseLen = UseLoc.Size.getValue().getKnownMinValue();
+  // If dep instruction is reading from our current input, then it is a noop
+  // transfer and substituting the input won't change this instruction. Just
+  // ignore the input and let someone else zap MDep. This handles cases like:
+  //    memcpy(a <- a)
+  //    memcpy(b <- a)
+  if (OldSrc == MDep->getSource())
+    return false;
+
+  // We can only optimize non-volatile memcpy's.
+  if (MDep->isVolatile())
+    return false;
+
+  int64_t MForwardOffset = 0;
+  const DataLayout &DL = MDep->getDataLayout();
+  // We can only transform memcpy's where the dest of one is the source of the
+  // other, or they have an offset in a range.
+  if (OldSrc != MDep->getDest()) {
+    std::optional<int64_t> Offset =
+        OldSrc->getPointerOffsetFrom(MDep->getDest(), DL);
+    if (!Offset || *Offset < 0)
+      return false;
+    MForwardOffset = *Offset;
+  }
+
+  // The lengths of the memcpy's must be the same, or the preceding one
+  // must be larger than the following one.
+  if (MForwardOffset != 0 || LoadI ||
+      (MemCpy && MDep->getLength() != MemCpy->getLength())) {
+    auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
+    if (UseLen == 0 || !MDepLen ||
+        MDepLen->getZExtValue() < UseLen + MForwardOffset)
+      return false;
+  }
+  IRBuilder<> Builder(UseInstr);
+  NewSrc = MDep->getSource();
+  NewAlign = MDep->getSourceAlign();
+  // We just need to calculate the actual size of the copy.
+  auto MCopyLoc =
+      MemoryLocation::getForSource(MDep).getWithNewSize(UseLoc.Size);
+
+  // When the forwarding offset is greater than 0, we transform
+  //    memcpy(d1 <- s1)
+  //    memcpy(d2 <- d1+o)
+  // to
+  //    memcpy(d2 <- s1+o)
+  if (MForwardOffset > 0) {
+    // The copy destination of `M` may be able to serve as the copy source.
+    if (MemCpy && (MForwardOffset == MemCpy->getRawDest()->getPointerOffsetFrom(
+                                         MDep->getRawSource(), DL))) {
+      NewSrc = cast<MemCpyInst>(UseInstr)->getDest();
+    } else {
+      NewSrc = Builder.CreateInBoundsPtrAdd(NewSrc,
+                                            Builder.getInt64(MForwardOffset));
+      if (Instruction *NewI = dyn_cast<Instruction>(NewSrc))
+        NewInsts.push_back(NewI);
+    }
+    // We need to update `MCopyLoc` if an offset exists.
+    MCopyLoc = MCopyLoc.getWithNewPtr(NewSrc);
+    if (NewAlign)
+      NewAlign = commonAlignment(*NewAlign, MForwardOffset);
+  }
+
+  // Avoid infinite loops
+  if (BAA.isMustAlias(OldSrc, NewSrc))
+    return false;
+  // Verify that the copied-from memory doesn't change in between the two
+  // transfers.  For example, in:
+  //    memcpy(a <- b)
+  //    *b = 42;
+  //    memcpy(c <- a)
+  // It would be invalid to transform the second memcpy into memcpy(c <- b).
+  //
+  // TODO: If the code between M and MDep is transparent to the destination "c",
+  // then we could still perform the xform by moving M up to the first memcpy.
+  if (writtenBetween(MSSA, BAA, MCopyLoc, MSSA->getMemoryAccess(MDep),
+                     MSSA->getMemoryAccess(UseInstr)))
+    return false;
+  return true;
+}
+
+/// Perform simplification of loads. If we have memcpy A which copies X to Y,
+/// and a load instruction B which loads from Y, then we can rewrite B to load
+/// directly from X. This allows later passes to remove the memcpy A or to
+/// identify the underlying source of the load.
+bool MemCpyOptPass::processLoad(LoadInst *LI, BasicBlock::iterator &BBI,
+                                SmallVectorImpl<Instruction *> &NewInsts) {
+  if (!LI->isSimple())
+    return false;
+  MemoryUseOrDef *MA = MSSA->getMemoryAccess(LI);
+  if (!MA)
+    return false;
+  BatchAAResults BAA(*AA, EEA);
+
+  MemoryAccess *AnyClobber = MA->getDefiningAccess();
+  const MemoryAccess *DestClobber =
+      MSSA->getWalker()->getClobberingMemoryAccess(
+          AnyClobber, MemoryLocation::get(LI), BAA);
+  MemCpyInst *MDep = nullptr;
+  if (auto *MD = dyn_cast<MemoryDef>(DestClobber))
+    if (Instruction *MI = MD->getMemoryInst())
+      MDep = dyn_cast<MemCpyInst>(MI);
+
+  if (!MDep)
+    return false;
+
+  Value *NewSrc;
+  MaybeAlign NewAlign;
+  if (!findNewSrc(MDep, LI, BAA, NewSrc, NewAlign, NewInsts))
+    return false;
+  IRBuilder<> Builder(LI);
+  Instruction *NewLI =
+      Builder.CreateAlignedLoad(LI->getType(), NewSrc, NewAlign, LI->getName());
+  auto *NewAccess = MSSAU->createMemoryAccessAfter(NewLI, nullptr, MA);
+  MSSAU->insertUse(cast<MemoryUse>(NewAccess), /*RenameUses=*/true);
+  LI->replaceAllUsesWith(NewLI);
+  eraseInstruction(LI);
+  ++NumLoadInstr;
+  return true;
+}
+
 bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
   if (!SI->isSimple())
     return false;
@@ -1101,101 +1241,18 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
 
 /// We've found that the (upward scanning) memory dependence of memcpy 'M' is
 /// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can.
-bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
-                                                  MemCpyInst *MDep,
-                                                  BatchAAResults &BAA) {
-  // If dep instruction is reading from our current input, then it is a noop
-  // transfer and substituting the input won't change this instruction. Just
-  // ignore the input and let someone else zap MDep. This handles cases like:
-  //    memcpy(a <- a)
-  //    memcpy(b <- a)
-  if (M->getSource() == MDep->getSource())
-    return false;
-
-  // We can only optimize non-volatile memcpy's.
-  if (MDep->isVolatile())
+bool MemCpyOptPass::processMemCpyMemCpyDependence(
+    MemCpyInst *M, MemCpyInst *MDep, BatchAAResults &BAA,
+    SmallVectorImpl<Instruction *> &NewInsts) {
+  Value *NewSrc;
+  MaybeAlign NewAlign;
+  if (!findNewSrc(MDep, M, BAA, NewSrc, NewAlign, NewInsts))
     return false;
 
-  int64_t MForwardOffset = 0;
-  const DataLayout &DL = M->getModule()->getDataLayout();
-  // We can only transforms memcpy's where the dest of one is the source of the
-  // other, or they have an offset in a range.
-  if (M->getSource() != MDep->getDest()) {
-    std::optional<int64_t> Offset =
-        M->getSource()->getPointerOffsetFrom(MDep->getDest(), DL);
-    if (!Offset || *Offset < 0)
-      return false;
-    MForwardOffset = *Offset;
-  }
-
-  // The length of the memcpy's must be the same, or the preceding one
-  // must be larger than the following one.
-  if (MForwardOffset != 0 || MDep->getLength() != M->getLength()) {
-    auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
-    auto *MLen = dyn_cast<ConstantInt>(M->getLength());
-    if (!MDepLen || !MLen ||
-        MDepLen->getZExtValue() < MLen->getZExtValue() + MForwardOffset)
-      return false;
-  }
-
   IRBuilder<> Builder(M);
-  auto *CopySource = MDep->getSource();
-  Instruction *NewCopySource = nullptr;
-  auto CleanupOnRet = llvm::make_scope_exit([&] {
-    if (NewCopySource && NewCopySource->use_empty())
-      // Safety: It's safe here because we will only allocate more instructions
-      // after finishing all BatchAA queries, but we have to be careful if we
-      // want to do something like this in another place. Then we'd probably
-      // have to delay instruction removal until all transforms on an
-      // instruction finished.
-      eraseInstruction(NewCopySource);
-  });
-  MaybeAlign CopySourceAlign = MDep->getSourceAlign();
-  // We just need to calculate the actual size of the copy.
-  auto MCopyLoc = MemoryLocation::getForSource(MDep).getWithNewSize(
-      MemoryLocation::getForSource(M).Size);
-
-  // When the forwarding offset is greater than 0, we transform
-  //    memcpy(d1 <- s1)
-  //    memcpy(d2 <- d1+o)
-  // to
-  //    memcpy(d2 <- s1+o)
-  if (MForwardOffset > 0) {
-    // The copy destination of `M` maybe can serve as the source of copying.
-    std::optional<int64_t> MDestOffset =
-        M->getRawDest()->getPointerOffsetFrom(MDep->getRawSource(), DL);
-    if (MDestOffset == MForwardOffset)
-      CopySource = M->getDest();
-    else {
-      CopySource = Builder.CreateInBoundsPtrAdd(
-          CopySource, Builder.getInt64(MForwardOffset));
-      NewCopySource = dyn_cast<Instruction>(CopySource);
-    }
-    // We need to update `MCopyLoc` if an offset exists.
-    MCopyLoc = MCopyLoc.getWithNewPtr(CopySource);
-    if (CopySourceAlign)
-      CopySourceAlign = commonAlignment(*CopySourceAlign, MForwardOffset);
-  }
-
-  // Avoid infinite loops
-  if (BAA.isMustAlias(M->getSource(), CopySource))
-    return false;
-
-  // Verify that the copied-from memory doesn't change in between the two
-  // transfers.  For example, in:
-  //    memcpy(a <- b)
-  //    *b = 42;
-  //    memcpy(c <- a)
-  // It would be invalid to transform the second memcpy into memcpy(c <- b).
-  //
-  // TODO: If the code between M and MDep is transparent to the destination "c",
-  // then we could still perform the xform by moving M up to the first memcpy.
-  if (writtenBetween(MSSA, BAA, MCopyLoc, MSSA->getMemoryAccess(MDep),
-                     MSSA->getMemoryAccess(M)))
-    return false;
 
   // No need to create `memcpy(a <- a)`.
-  if (BAA.isMustAlias(M->getDest(), CopySource)) {
+  if (BAA.isMustAlias(M->getDest(), NewSrc)) {
     // Remove the instruction we're replacing.
     eraseInstruction(M);
     ++NumMemCpyInstr;
@@ -1226,20 +1283,18 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
   // example we could be moving from movaps -> movq on x86.
   Instruction *NewM;
   if (UseMemMove)
-    NewM =
-        Builder.CreateMemMove(M->getDest(), M->getDestAlign(), CopySource,
-                              CopySourceAlign, M->getLength(), M->isVolatile());
+    NewM = Builder.CreateMemMove(M->getDest(), M->getDestAlign(), NewSrc,
+                                 NewAlign, M->getLength(), M->isVolatile());
   else if (isa<MemCpyInlineInst>(M)) {
     // llvm.memcpy may be promoted to llvm.memcpy.inline, but the converse is
     // never allowed since that would allow the latter to be lowered as a call
     // to an external function.
-    NewM = Builder.CreateMemCpyInline(M->getDest(), M->getDestAlign(),
-                                      CopySource, CopySourceAlign,
-                                      M->getLength(), M->isVolatile());
-  } else
     NewM =
-        Builder.CreateMemCpy(M->getDest(), M->getDestAlign(), CopySource,
-                             CopySourceAlign, M->getLength(), M->isVolatile());
+        Builder.CreateMemCpyInline(M->getDest(), M->getDestAlign(), NewSrc,
+                                   NewAlign, M->getLength(), M->isVolatile());
+  } else
+    NewM = Builder.CreateMemCpy(M->getDest(), M->getDestAlign(), NewSrc,
+                                NewAlign, M->getLength(), M->isVolatile());
   NewM->copyMetadata(*M, LLVMContext::MD_DIAssignID);
 
   assert(isa<MemoryDef>(MSSA->getMemoryAccess(M)));
@@ -1703,7 +1758,8 @@ static bool isZeroSize(Value *Size) {
 /// B to be a memcpy from X to Z (or potentially a memmove, depending on
 /// circumstances). This allows later passes to remove the first memcpy
 /// altogether.
-bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
+bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI,
+                                  SmallVectorImpl<Instruction *> &NewInsts) {
   // We can only optimize non-volatile memcpy's.
   if (M->isVolatile())
     return false;
@@ -1791,7 +1847,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
         }
       }
       if (auto *MDep = dyn_cast<MemCpyInst>(MI))
-        if (processMemCpyMemCpyDependence(M, MDep, BAA))
+        if (processMemCpyMemCpyDependence(M, MDep, BAA, NewInsts))
           return true;
       if (auto *MDep = dyn_cast<MemSetInst>(MI)) {
         if (performMemCpyToMemSetOptzn(M, MDep, BAA)) {
@@ -2096,7 +2152,8 @@ bool MemCpyOptPass::processImmutArgument(CallBase &CB, unsigned ArgNo) {
 }
 
 /// Executes one iteration of MemCpyOptPass.
-bool MemCpyOptPass::iterateOnFunction(Function &F) {
+bool MemCpyOptPass::iterateOnFunction(
+    Function &F, SmallVectorImpl<Instruction *> &NewInsts) {
   bool MadeChange = false;
 
   // Walk all instruction in the function.
@@ -2114,12 +2171,14 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) {
 
       bool RepeatInstruction = false;
 
-      if (auto *SI = dyn_cast<StoreInst>(I))
+      if (auto *LI = dyn_cast<LoadInst>(I))
+        MadeChange |= processLoad(LI, BI, NewInsts);
+      else if (auto *SI = dyn_cast<StoreInst>(I))
         MadeChange |= processStore(SI, BI);
       else if (auto *M = dyn_cast<MemSetInst>(I))
         RepeatInstruction = processMemSet(M, BI);
       else if (auto *M = dyn_cast<MemCpyInst>(I))
-        RepeatInstruction = processMemCpy(M, BI);
+        RepeatInstruction = processMemCpy(M, BI, NewInsts);
       else if (auto *M = dyn_cast<MemMoveInst>(I))
         RepeatInstruction = processMemMove(M, BI);
       else if (auto *CB = dyn_cast<CallBase>(I)) {
@@ -2176,13 +2235,19 @@ bool MemCpyOptPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
   MSSAU = &MSSAU_;
   EarliestEscapeAnalysis EEA_(*DT);
   EEA = &EEA_;
+  SmallVector<Instruction *, 4> NewInsts;
 
   while (true) {
-    if (!iterateOnFunction(F))
+    if (!iterateOnFunction(F, NewInsts))
       break;
     MadeChange = true;
   }
 
+  for (auto *I : NewInsts) {
+    if (I->use_empty())
+      eraseInstruction(I);
+  }
+
   if (VerifyMemorySSA)
     MSSA_->verifyMemorySSA();
 
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-load.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-load.ll
index 79f62cdbfdab4..462e03f22c2f1 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy-load.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy-load.ll
@@ -6,7 +6,7 @@ define i24 @forward_load(ptr %src) {
 ; CHECK-SAME: ptr [[SRC:%.*]]) {
 ; CHECK-NEXT:    [[DEST:%.*]] = alloca [3 x i8], align 1
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 3, i1 false)
-; CHECK-NEXT:    [[VAL1:%.*]] = load i24, ptr [[DEST]], align 4
+; CHECK-NEXT:    [[VAL1:%.*]] = load i24, ptr [[SRC]], align 4
 ; CHECK-NEXT:    ret i24 [[VAL1]]
 ;
   %dest = alloca [3 x i8]
@@ -20,7 +20,7 @@ define i16 @forward_load_2(ptr %src) {
 ; CHECK-SAME: ptr [[SRC:%.*]]) {
 ; CHECK-NEXT:    [[DEST:%.*]] = alloca [3 x i8], align 1
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 2, i1 false)
-; CHECK-NEXT:    [[VAL1:%.*]] = load i16, ptr [[DEST]], align 2
+; CHECK-NEXT:    [[VAL1:%.*]] = load i16, ptr [[SRC]], align 2
 ; CHECK-NEXT:    ret i16 [[VAL1]]
 ;
   %dest = alloca [3 x i8]
@@ -35,7 +35,8 @@ define i32 @forward_load_padding(ptr %src) {
 ; CHECK-NEXT:    [[DEST:%.*]] = alloca { i8, i32 }, align 8
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 8, i1 false)
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[DEST]], i64 4
-; CHECK-NEXT:    [[VAL1:%.*]] = load i32, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
+; CHECK-NEXT:    [[VAL1:%.*]] = load i32, ptr [[TMP1]], align 4
 ; CHECK-NEXT:    ret i32 [[VAL1]]
 ;
   %dest = alloca { i8, i32 }
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy.ll b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
index 89d8eb1ee6711..066325086b7f0 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
@@ -229,10 +229,8 @@ define void @test4_write_between(ptr %P) {
 
 define i8 @test4_read_between(ptr %P) {
 ; CHECK-LABEL: @test4_read_between(
-; CHECK-NEXT:    [[A1:%.*]] = alloca [[TMP1:%.*]], align 8
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[A1]], ptr align 4 [[P:%.*]], i64 8, i1 false)
-; CHECK-NEXT:    [[X:%.*]] = load i8, ptr [[A1]], align 1
-; CHECK-NEXT:    call void @test4a(ptr byval(i8) align 1 [[P]])
+; CHECK-NEXT:    [[X:%.*]] = load i8, ptr [[A1:%.*]], align 4
+; CHECK-NEXT:    call void @test4a(ptr byval(i8) align 1 [[A1]])
 ; CHECK-NEXT:    ret i8 [[X]]
 ;
   %a1 = alloca %1
diff --git a/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll b/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll
index 5e13432746bf7..51689cc6fd452 100644
--- a/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll
+++ b/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll
@@ -20,7 +20,7 @@ define i32 @foo(i1 %z) {
 ; CHECK-NEXT:    br label [[FOR_INC7_1]]
 ; CHECK:       for.inc7.1:
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[A]], ptr align 4 [[SCEVGEP]], i64 4, i1 false)
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[SCEVGEP]], align 4
 ; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
 entry:
diff --git a/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
index 224258530ecc0..006f15a31c4e1 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
@@ -13,13 +13,13 @@ define i1 @main(ptr %i2) {
 ; CHECK-NEXT:    store i8 2, ptr [[I4]], align 1
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 3, ptr nonnull [[I1]])
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[I1]], ptr noundef nonnull align 1 dereferenceable(3) [[I2]], i64 3, i1 false)
-; CHECK-NEXT:    [[I51:%.*]] = load i8, ptr [[I1]], align 1
+; CHECK-NEXT:    [[I51:%.*]] = load i8, ptr [[I2]], align 1
 ; CHECK-NEXT:    [[I6:%.*]] = icmp eq i8 [[I51]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[I1]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 1
 ; CHECK-NEXT:    [[I82:%.*]] = load i8, ptr [[TMP1]], align 1
 ; CHECK-NEXT:    [[I9:%.*]] = icmp eq i8 [[I82]], 1
 ; CHECK-NEXT:    [[I10:%.*]] = select i1 [[I6]], i1 [[I9]], i1 false
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[I1]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 2
 ; CHECK-NEXT:    [[I123:%.*]] = load i8, ptr [[TMP2]], align 1
 ; CHECK-NEXT:    [[I13:%.*]] = icmp eq i8 [[I123]], 2
 ; CHECK-NEXT:    [[I14:%.*]] = select i1 [[I10]], i1 [[I13]], i1 false

>From 0fb1679edc6477fea32e52a33c2bf92c6c57c26c Mon Sep 17 00:00:00 2001
From: dianqk <dianqk at dianqk.net>
Date: Sun, 4 May 2025 20:55:44 +0800
Subject: [PATCH 3/4] [InstCombine] Accumulate the limit only on the
 instructions that require scanning

---
 llvm/lib/Analysis/Loads.cpp                   | 14 +++++-
 .../Coroutines/coro-retcon-resume-values.ll   |  9 ++--
 .../JumpThreading/unreachable-loops.ll        |  8 ++-
 .../LowerMatrixIntrinsics/multiply-fused.ll   | 24 +++------
 .../early-arg-attrs-inference.ll              |  2 +-
 .../PhaseOrdering/pr137810-forward-load.ll    | 27 ++--------
 .../SLPVectorizer/revec-shufflevector.ll      |  5 +-
 .../SampleProfile/pseudo-probe-instcombine.ll | 50 +++++++++++++++++++
 8 files changed, 85 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index b461c41d29e84..a8d45bbbe2974 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -539,6 +539,16 @@ static bool areNonOverlapSameBaseLoadAndStore(const Value *LoadPtr,
   return LoadRange.intersectWith(StoreRange).isEmptySet();
 }
 
+static bool maybeAvailableLoadStore(Instruction *Inst) {
+  switch (Inst->getOpcode()) {
+  case Instruction::Load:
+  case Instruction::Store:
+    return true;
+  default:
+    return isa<MemSetInst>(Inst);
+  }
+}
+
 static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr,
                                     Type *AccessTy, bool AtLeastAtomic,
                                     const DataLayout &DL, bool *IsLoadCSE) {
@@ -653,7 +663,7 @@ Value *llvm::findAvailablePtrLoadStore(
       ++(*NumScanedInst);
 
     // Don't scan huge blocks.
-    if (MaxInstsToScan-- == 0)
+    if (maybeAvailableLoadStore(Inst) && MaxInstsToScan-- == 0)
       return nullptr;
 
     --ScanFrom;
@@ -734,7 +744,7 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
     if (Inst.isDebugOrPseudoInst())
       continue;
 
-    if (MaxInstsToScan-- == 0)
+    if (maybeAvailableLoadStore(&Inst) && MaxInstsToScan-- == 0)
       return nullptr;
 
     Available = getAvailableLoadStore(&Inst, StrippedPtr, AccessTy,
diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll
index 907d7e588ffe0..bf78174533d5a 100644
--- a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll
+++ b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll
@@ -38,15 +38,18 @@ define i32 @main() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = tail call ptr @allocate(i32 12)
 ; CHECK-NEXT:    store i32 1, ptr [[TMP0]], align 4
+; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]])
 ; CHECK-NEXT:    [[N_VAL3_SPILL_ADDR_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 4
-; CHECK-NEXT:    store i32 1, ptr [[N_VAL3_SPILL_ADDR_I]], align 4, !noalias [[META0:![0-9]+]]
+; CHECK-NEXT:    store i32 1, ptr [[N_VAL3_SPILL_ADDR_I]], align 4, !noalias [[META0]]
 ; CHECK-NEXT:    [[INPUT_SPILL_ADDR_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8
 ; CHECK-NEXT:    store i32 2, ptr [[INPUT_SPILL_ADDR_I]], align 4, !noalias [[META0]]
+; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]])
 ; CHECK-NEXT:    [[INPUT_RELOAD_ADDR13_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8
 ; CHECK-NEXT:    [[N_VAL3_RELOAD_ADDR11_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 4
-; CHECK-NEXT:    store i32 3, ptr [[N_VAL3_RELOAD_ADDR11_I]], align 4, !noalias [[META3:![0-9]+]]
+; CHECK-NEXT:    store i32 3, ptr [[N_VAL3_RELOAD_ADDR11_I]], align 4, !noalias [[META3]]
 ; CHECK-NEXT:    store i32 4, ptr [[INPUT_RELOAD_ADDR13_I]], align 4, !noalias [[META3]]
-; CHECK-NEXT:    tail call void @print(i32 7), !noalias [[META6:![0-9]+]]
+; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]])
+; CHECK-NEXT:    tail call void @print(i32 7), !noalias [[META6]]
 ; CHECK-NEXT:    tail call void @deallocate(ptr nonnull [[TMP0]]), !noalias [[META6]]
 ; CHECK-NEXT:    ret i32 0
 ;
diff --git a/llvm/test/Transforms/JumpThreading/unreachable-loops.ll b/llvm/test/Transforms/JumpThreading/unreachable-loops.ll
index 79c5e9217312d..f4d5fc9a26728 100644
--- a/llvm/test/Transforms/JumpThreading/unreachable-loops.ll
+++ b/llvm/test/Transforms/JumpThreading/unreachable-loops.ll
@@ -191,11 +191,8 @@ define i32 @constant_phi_leads_to_self_reference(ptr %ptr) {
 ; CHECK-LABEL: @constant_phi_leads_to_self_reference(
 ; CHECK-NEXT:    [[A9:%.*]] = alloca i1, align 1
 ; CHECK-NEXT:    br label [[F6:%.*]]
-; CHECK:       T3:
+; CHECK:       BB5.thread:
 ; CHECK-NEXT:    br label [[BB5:%.*]]
-; CHECK:       BB5:
-; CHECK-NEXT:    [[L10:%.*]] = load i1, ptr [[A9]], align 1
-; CHECK-NEXT:    br i1 [[L10]], label [[BB6:%.*]], label [[F6]]
 ; CHECK:       BB6:
 ; CHECK-NEXT:    [[LGV3:%.*]] = load i1, ptr [[PTR:%.*]], align 1
 ; CHECK-NEXT:    [[C4:%.*]] = icmp sle i1 [[C4]], true
@@ -204,7 +201,8 @@ define i32 @constant_phi_leads_to_self_reference(ptr %ptr) {
 ; CHECK:       F6:
 ; CHECK-NEXT:    ret i32 0
 ; CHECK:       F7:
-; CHECK-NEXT:    br label [[BB5]]
+; CHECK-NEXT:    [[L10_PR:%.*]] = load i1, ptr [[A9]], align 1
+; CHECK-NEXT:    br i1 [[L10_PR]], label [[BB5]], label [[F6]]
 ;
   %A9 = alloca i1, align 1
   br i1 false, label %BB4, label %F6
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
index 155f7755c2095..6cbbb534b98b0 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
@@ -263,21 +263,17 @@ define void @multiply_reuse_load(ptr noalias %A, ptr noalias %B, ptr noalias %C)
 ; CHECK-NEXT:    store <2 x double> [[TMP7]], ptr [[C:%.*]], align 8
 ; CHECK-NEXT:    [[VEC_GEP34:%.*]] = getelementptr i8, ptr [[C]], i64 32
 ; CHECK-NEXT:    store <2 x double> [[TMP9]], ptr [[VEC_GEP34]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 16
-; CHECK-NEXT:    [[COL_LOAD35:%.*]] = load <2 x double>, ptr [[TMP10]], align 8
-; CHECK-NEXT:    [[VEC_GEP36:%.*]] = getelementptr i8, ptr [[A]], i64 48
-; CHECK-NEXT:    [[COL_LOAD37:%.*]] = load <2 x double>, ptr [[VEC_GEP36]], align 8
 ; CHECK-NEXT:    [[COL_LOAD38:%.*]] = load <2 x double>, ptr [[A]], align 8
 ; CHECK-NEXT:    [[VEC_GEP39:%.*]] = getelementptr i8, ptr [[A]], i64 32
 ; CHECK-NEXT:    [[COL_LOAD40:%.*]] = load <2 x double>, ptr [[VEC_GEP39]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLAT43:%.*]] = shufflevector <2 x double> [[COL_LOAD38]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul contract <2 x double> [[COL_LOAD35]], [[SPLAT_SPLAT43]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD17]], [[SPLAT_SPLAT43]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT46:%.*]] = shufflevector <2 x double> [[COL_LOAD38]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP12:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD37]], <2 x double> [[SPLAT_SPLAT46]], <2 x double> [[TMP11]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD19]], <2 x double> [[SPLAT_SPLAT46]], <2 x double> [[TMP10]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT49:%.*]] = shufflevector <2 x double> [[COL_LOAD40]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = fmul contract <2 x double> [[COL_LOAD35]], [[SPLAT_SPLAT49]]
+; CHECK-NEXT:    [[TMP13:%.*]] = fmul contract <2 x double> [[COL_LOAD17]], [[SPLAT_SPLAT49]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT52:%.*]] = shufflevector <2 x double> [[COL_LOAD40]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD37]], <2 x double> [[SPLAT_SPLAT52]], <2 x double> [[TMP13]])
+; CHECK-NEXT:    [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD19]], <2 x double> [[SPLAT_SPLAT52]], <2 x double> [[TMP13]])
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[A]], i64 80
 ; CHECK-NEXT:    [[COL_LOAD53:%.*]] = load <2 x double>, ptr [[TMP15]], align 8
 ; CHECK-NEXT:    [[VEC_GEP54:%.*]] = getelementptr i8, ptr [[A]], i64 112
@@ -313,22 +309,18 @@ define void @multiply_reuse_load(ptr noalias %A, ptr noalias %B, ptr noalias %C)
 ; CHECK-NEXT:    [[TMP25:%.*]] = fmul contract <2 x double> [[COL_LOAD74]], [[SPLAT_SPLAT88]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT91:%.*]] = shufflevector <2 x double> [[COL_LOAD79]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD76]], <2 x double> [[SPLAT_SPLAT91]], <2 x double> [[TMP25]])
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr [[A]], i64 64
-; CHECK-NEXT:    [[COL_LOAD92:%.*]] = load <2 x double>, ptr [[TMP27]], align 8
-; CHECK-NEXT:    [[VEC_GEP93:%.*]] = getelementptr i8, ptr [[A]], i64 96
-; CHECK-NEXT:    [[COL_LOAD94:%.*]] = load <2 x double>, ptr [[VEC_GEP93]], align 8
 ; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[A]], i64 80
 ; CHECK-NEXT:    [[COL_LOAD95:%.*]] = load <2 x double>, ptr [[TMP28]], align 8
 ; CHECK-NEXT:    [[VEC_GEP96:%.*]] = getelementptr i8, ptr [[A]], i64 112
 ; CHECK-NEXT:    [[COL_LOAD97:%.*]] = load <2 x double>, ptr [[VEC_GEP96]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLAT101:%.*]] = shufflevector <2 x double> [[COL_LOAD95]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP29:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD92]], <2 x double> [[SPLAT_SPLAT101]], <2 x double> [[TMP24]])
+; CHECK-NEXT:    [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD77]], <2 x double> [[SPLAT_SPLAT101]], <2 x double> [[TMP24]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT104:%.*]] = shufflevector <2 x double> [[COL_LOAD95]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP30:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD94]], <2 x double> [[SPLAT_SPLAT104]], <2 x double> [[TMP29]])
+; CHECK-NEXT:    [[TMP30:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD79]], <2 x double> [[SPLAT_SPLAT104]], <2 x double> [[TMP27]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD97]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP31:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD92]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP26]])
+; CHECK-NEXT:    [[TMP29:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD77]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP26]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD97]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD94]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP31]])
+; CHECK-NEXT:    [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD79]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP29]])
 ; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[C]], i64 64
 ; CHECK-NEXT:    store <2 x double> [[TMP30]], ptr [[TMP33]], align 8
 ; CHECK-NEXT:    [[VEC_GEP112:%.*]] = getelementptr i8, ptr [[C]], i64 96
diff --git a/llvm/test/Transforms/PhaseOrdering/early-arg-attrs-inference.ll b/llvm/test/Transforms/PhaseOrdering/early-arg-attrs-inference.ll
index 93a8c803aba37..b14d5e590ae7b 100644
--- a/llvm/test/Transforms/PhaseOrdering/early-arg-attrs-inference.ll
+++ b/llvm/test/Transforms/PhaseOrdering/early-arg-attrs-inference.ll
@@ -3,7 +3,7 @@
 
 define i32 @f(ptr noalias %p, i32 %c) {
 ; CHECK-LABEL: define noundef i32 @f
-; CHECK-SAME: (ptr noalias readonly captures(none) [[P:%.*]], i32 [[C:%.*]]) local_unnamed_addr {
+; CHECK-SAME: (ptr noalias readnone captures(none) [[P:%.*]], i32 [[C:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:    tail call void @g()
 ; CHECK-NEXT:    tail call void @g()
 ; CHECK-NEXT:    tail call void @g()
diff --git a/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
index 006f15a31c4e1..98eb90c184d74 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
@@ -1,37 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -O2 -S < %s | FileCheck %s
 
-; FIXME: It can return true.
 define i1 @main(ptr %i2) {
 ; CHECK-LABEL: define noundef i1 @main(
-; CHECK-SAME: ptr captures(none) initializes((0, 3)) [[I2:%.*]]) local_unnamed_addr {
-; CHECK-NEXT:    [[I1:%.*]] = alloca [3 x i8], align 1
+; CHECK-SAME: ptr writeonly captures(none) initializes((0, 3)) [[I2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[TRUE:.*:]]
 ; CHECK-NEXT:    store i8 0, ptr [[I2]], align 1
 ; CHECK-NEXT:    [[I3:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 1
 ; CHECK-NEXT:    store i8 1, ptr [[I3]], align 1
 ; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 2
 ; CHECK-NEXT:    store i8 2, ptr [[I4]], align 1
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 3, ptr nonnull [[I1]])
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[I1]], ptr noundef nonnull align 1 dereferenceable(3) [[I2]], i64 3, i1 false)
-; CHECK-NEXT:    [[I51:%.*]] = load i8, ptr [[I2]], align 1
-; CHECK-NEXT:    [[I6:%.*]] = icmp eq i8 [[I51]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 1
-; CHECK-NEXT:    [[I82:%.*]] = load i8, ptr [[TMP1]], align 1
-; CHECK-NEXT:    [[I9:%.*]] = icmp eq i8 [[I82]], 1
-; CHECK-NEXT:    [[I10:%.*]] = select i1 [[I6]], i1 [[I9]], i1 false
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 2
-; CHECK-NEXT:    [[I123:%.*]] = load i8, ptr [[TMP2]], align 1
-; CHECK-NEXT:    [[I13:%.*]] = icmp eq i8 [[I123]], 2
-; CHECK-NEXT:    [[I14:%.*]] = select i1 [[I10]], i1 [[I13]], i1 false
-; CHECK-NEXT:    br i1 [[I14]], label %[[TRUE:.*]], label %[[FALSE:.*]]
-; CHECK:       [[COMMON_RET:.*]]:
-; CHECK-NEXT:    ret i1 [[I14]]
-; CHECK:       [[TRUE]]:
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 3, ptr nonnull [[I1]])
-; CHECK-NEXT:    br label %[[COMMON_RET]]
-; CHECK:       [[FALSE]]:
-; CHECK-NEXT:    call void @assert_failed(ptr nonnull [[I1]])
-; CHECK-NEXT:    br label %[[COMMON_RET]]
+; CHECK-NEXT:    ret i1 true
 ;
   %i1 = alloca [3 x i8], align 1
   store i8 0, ptr %i2, align 1
diff --git a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
index b85c78ec8d2d0..d91dfc01649bc 100644
--- a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
@@ -231,14 +231,13 @@ define void @test6(ptr %in0, ptr %in1, ptr %in2) {
 ; COMBINE-NEXT:    [[TMP7:%.*]] = fmul <32 x float> [[TMP6]], [[TMP2]]
 ; COMBINE-NEXT:    [[GEP10:%.*]] = getelementptr inbounds nuw i8, ptr [[IN1]], i64 32
 ; COMBINE-NEXT:    [[GEP11:%.*]] = getelementptr inbounds nuw i8, ptr [[IN2:%.*]], i64 128
-; COMBINE-NEXT:    [[TMP8:%.*]] = load <8 x float>, ptr [[IN0]], align 16
 ; COMBINE-NEXT:    store <32 x float> [[TMP7]], ptr [[IN2]], align 16
 ; COMBINE-NEXT:    [[LOAD5:%.*]] = load <16 x i8>, ptr [[GEP10]], align 1
 ; COMBINE-NEXT:    [[TMP9:%.*]] = uitofp <16 x i8> [[LOAD5]] to <16 x float>
 ; COMBINE-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; COMBINE-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; COMBINE-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; COMBINE-NEXT:    [[TMP12:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; COMBINE-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; COMBINE-NEXT:    [[TMP14:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; COMBINE-NEXT:    [[TMP15:%.*]] = shufflevector <16 x float> [[TMP12]], <16 x float> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
 ; COMBINE-NEXT:    [[TMP16:%.*]] = shufflevector <16 x float> [[TMP15]], <16 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll
index ff1e165c8c54a..34839b5140b7f 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll
@@ -106,6 +106,56 @@ define i32 @load(ptr nocapture %a, ptr nocapture %b) {
   ret i32 %5
 }
 
+;; Check the load is deleted.
+define i32 @load_not_pseudo(ptr noalias %arg, ptr noalias %arg1) {
+; CHECK-LABEL: @load_not_pseudo(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    store i32 1, ptr [[ARG1:%.*]], align 4
+; CHECK-NEXT:    store i32 1, ptr [[ARG2:%.*]], align 4
+; CHECK-NEXT:    ret i32 1
+;
+bb:
+  store i32 1, ptr %arg, align 4
+  store i32 1, ptr %arg1, align 4
+  %i = load i32, ptr %arg, align 4
+  ret i32 %i
+}
+
+;; Check the load is deleted.
+define i32 @load_not_pseudo_2(ptr noalias %arg, ptr noalias %arg1) {
+; CHECK-LABEL: @load_not_pseudo_2(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    store i32 1, ptr [[ARG:%.*]], align 4
+; CHECK-NEXT:    [[ARG1_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG1:%.*]], i64 4
+; CHECK-NEXT:    store i32 1, ptr [[ARG1_1]], align 4
+; CHECK-NEXT:    ret i32 1
+;
+bb:
+  store i32 1, ptr %arg, align 4
+  %arg1_1 = getelementptr inbounds i32, ptr %arg1, i32 1
+  store i32 1, ptr %arg1_1, align 4
+  %i = load i32, ptr %arg, align 4
+  ret i32 %i
+}
+
+;; Check the load is not deleted.
+define i32 @load_not_pseudo_3(ptr noalias %arg, ptr noalias %arg1, ptr noalias %arg2) {
+; CHECK-LABEL: @load_not_pseudo_3(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    store i32 1, ptr [[ARG:%.*]], align 4
+; CHECK-NEXT:    store i32 1, ptr [[ARG1:%.*]], align 4
+; CHECK-NEXT:    store i32 1, ptr [[ARG2:%.*]], align 4
+; CHECK-NEXT:    [[I:%.*]] = load i32, ptr [[ARG]], align 4
+; CHECK-NEXT:    ret i32 [[I]]
+;
+bb:
+  store i32 1, ptr %arg, align 4
+  store i32 1, ptr %arg1, align 4
+  store i32 1, ptr %arg2, align 4
+  %i = load i32, ptr %arg, align 4
+  ret i32 %i
+}
+
 ;; Check the first store is deleted.
 define void @dse(ptr %p) {
 ; CHECK-LABEL: @dse(

>From 6efcb44027cf509d65b4c7a2b112604bd5e3d0e9 Mon Sep 17 00:00:00 2001
From: dianqk <dianqk at dianqk.net>
Date: Sun, 4 May 2025 22:00:07 +0800
Subject: [PATCH 4/4] Revert "[InstCombine] Accumulate the limit only on the
 instructions that require"

This reverts commit 0fb1679edc6477fea32e52a33c2bf92c6c57c26c.
---
 llvm/lib/Analysis/Loads.cpp                   | 14 +-----
 .../Coroutines/coro-retcon-resume-values.ll   |  9 ++--
 .../JumpThreading/unreachable-loops.ll        |  8 +--
 .../LowerMatrixIntrinsics/multiply-fused.ll   | 24 ++++++---
 .../early-arg-attrs-inference.ll              |  2 +-
 .../PhaseOrdering/pr137810-forward-load.ll    | 27 ++++++++--
 .../SLPVectorizer/revec-shufflevector.ll      |  5 +-
 .../SampleProfile/pseudo-probe-instcombine.ll | 50 -------------------
 8 files changed, 54 insertions(+), 85 deletions(-)

diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index a8d45bbbe2974..b461c41d29e84 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -539,16 +539,6 @@ static bool areNonOverlapSameBaseLoadAndStore(const Value *LoadPtr,
   return LoadRange.intersectWith(StoreRange).isEmptySet();
 }
 
-static bool maybeAvailableLoadStore(Instruction *Inst) {
-  switch (Inst->getOpcode()) {
-  case Instruction::Load:
-  case Instruction::Store:
-    return true;
-  default:
-    return isa<MemSetInst>(Inst);
-  }
-}
-
 static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr,
                                     Type *AccessTy, bool AtLeastAtomic,
                                     const DataLayout &DL, bool *IsLoadCSE) {
@@ -663,7 +653,7 @@ Value *llvm::findAvailablePtrLoadStore(
       ++(*NumScanedInst);
 
     // Don't scan huge blocks.
-    if (maybeAvailableLoadStore(Inst) && MaxInstsToScan-- == 0)
+    if (MaxInstsToScan-- == 0)
       return nullptr;
 
     --ScanFrom;
@@ -744,7 +734,7 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
     if (Inst.isDebugOrPseudoInst())
       continue;
 
-    if (maybeAvailableLoadStore(&Inst) && MaxInstsToScan-- == 0)
+    if (MaxInstsToScan-- == 0)
       return nullptr;
 
     Available = getAvailableLoadStore(&Inst, StrippedPtr, AccessTy,
diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll
index bf78174533d5a..907d7e588ffe0 100644
--- a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll
+++ b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll
@@ -38,18 +38,15 @@ define i32 @main() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = tail call ptr @allocate(i32 12)
 ; CHECK-NEXT:    store i32 1, ptr [[TMP0]], align 4
-; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]])
 ; CHECK-NEXT:    [[N_VAL3_SPILL_ADDR_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 4
-; CHECK-NEXT:    store i32 1, ptr [[N_VAL3_SPILL_ADDR_I]], align 4, !noalias [[META0]]
+; CHECK-NEXT:    store i32 1, ptr [[N_VAL3_SPILL_ADDR_I]], align 4, !noalias [[META0:![0-9]+]]
 ; CHECK-NEXT:    [[INPUT_SPILL_ADDR_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8
 ; CHECK-NEXT:    store i32 2, ptr [[INPUT_SPILL_ADDR_I]], align 4, !noalias [[META0]]
-; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]])
 ; CHECK-NEXT:    [[INPUT_RELOAD_ADDR13_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8
 ; CHECK-NEXT:    [[N_VAL3_RELOAD_ADDR11_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 4
-; CHECK-NEXT:    store i32 3, ptr [[N_VAL3_RELOAD_ADDR11_I]], align 4, !noalias [[META3]]
+; CHECK-NEXT:    store i32 3, ptr [[N_VAL3_RELOAD_ADDR11_I]], align 4, !noalias [[META3:![0-9]+]]
 ; CHECK-NEXT:    store i32 4, ptr [[INPUT_RELOAD_ADDR13_I]], align 4, !noalias [[META3]]
-; CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]])
-; CHECK-NEXT:    tail call void @print(i32 7), !noalias [[META6]]
+; CHECK-NEXT:    tail call void @print(i32 7), !noalias [[META6:![0-9]+]]
 ; CHECK-NEXT:    tail call void @deallocate(ptr nonnull [[TMP0]]), !noalias [[META6]]
 ; CHECK-NEXT:    ret i32 0
 ;
diff --git a/llvm/test/Transforms/JumpThreading/unreachable-loops.ll b/llvm/test/Transforms/JumpThreading/unreachable-loops.ll
index f4d5fc9a26728..79c5e9217312d 100644
--- a/llvm/test/Transforms/JumpThreading/unreachable-loops.ll
+++ b/llvm/test/Transforms/JumpThreading/unreachable-loops.ll
@@ -191,8 +191,11 @@ define i32 @constant_phi_leads_to_self_reference(ptr %ptr) {
 ; CHECK-LABEL: @constant_phi_leads_to_self_reference(
 ; CHECK-NEXT:    [[A9:%.*]] = alloca i1, align 1
 ; CHECK-NEXT:    br label [[F6:%.*]]
-; CHECK:       BB5.thread:
+; CHECK:       T3:
 ; CHECK-NEXT:    br label [[BB5:%.*]]
+; CHECK:       BB5:
+; CHECK-NEXT:    [[L10:%.*]] = load i1, ptr [[A9]], align 1
+; CHECK-NEXT:    br i1 [[L10]], label [[BB6:%.*]], label [[F6]]
 ; CHECK:       BB6:
 ; CHECK-NEXT:    [[LGV3:%.*]] = load i1, ptr [[PTR:%.*]], align 1
 ; CHECK-NEXT:    [[C4:%.*]] = icmp sle i1 [[C4]], true
@@ -201,8 +204,7 @@ define i32 @constant_phi_leads_to_self_reference(ptr %ptr) {
 ; CHECK:       F6:
 ; CHECK-NEXT:    ret i32 0
 ; CHECK:       F7:
-; CHECK-NEXT:    [[L10_PR:%.*]] = load i1, ptr [[A9]], align 1
-; CHECK-NEXT:    br i1 [[L10_PR]], label [[BB5]], label [[F6]]
+; CHECK-NEXT:    br label [[BB5]]
 ;
   %A9 = alloca i1, align 1
   br i1 false, label %BB4, label %F6
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
index 6cbbb534b98b0..155f7755c2095 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
@@ -263,17 +263,21 @@ define void @multiply_reuse_load(ptr noalias %A, ptr noalias %B, ptr noalias %C)
 ; CHECK-NEXT:    store <2 x double> [[TMP7]], ptr [[C:%.*]], align 8
 ; CHECK-NEXT:    [[VEC_GEP34:%.*]] = getelementptr i8, ptr [[C]], i64 32
 ; CHECK-NEXT:    store <2 x double> [[TMP9]], ptr [[VEC_GEP34]], align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 16
+; CHECK-NEXT:    [[COL_LOAD35:%.*]] = load <2 x double>, ptr [[TMP10]], align 8
+; CHECK-NEXT:    [[VEC_GEP36:%.*]] = getelementptr i8, ptr [[A]], i64 48
+; CHECK-NEXT:    [[COL_LOAD37:%.*]] = load <2 x double>, ptr [[VEC_GEP36]], align 8
 ; CHECK-NEXT:    [[COL_LOAD38:%.*]] = load <2 x double>, ptr [[A]], align 8
 ; CHECK-NEXT:    [[VEC_GEP39:%.*]] = getelementptr i8, ptr [[A]], i64 32
 ; CHECK-NEXT:    [[COL_LOAD40:%.*]] = load <2 x double>, ptr [[VEC_GEP39]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLAT43:%.*]] = shufflevector <2 x double> [[COL_LOAD38]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD17]], [[SPLAT_SPLAT43]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fmul contract <2 x double> [[COL_LOAD35]], [[SPLAT_SPLAT43]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT46:%.*]] = shufflevector <2 x double> [[COL_LOAD38]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP12:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD19]], <2 x double> [[SPLAT_SPLAT46]], <2 x double> [[TMP10]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD37]], <2 x double> [[SPLAT_SPLAT46]], <2 x double> [[TMP11]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT49:%.*]] = shufflevector <2 x double> [[COL_LOAD40]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = fmul contract <2 x double> [[COL_LOAD17]], [[SPLAT_SPLAT49]]
+; CHECK-NEXT:    [[TMP13:%.*]] = fmul contract <2 x double> [[COL_LOAD35]], [[SPLAT_SPLAT49]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT52:%.*]] = shufflevector <2 x double> [[COL_LOAD40]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD19]], <2 x double> [[SPLAT_SPLAT52]], <2 x double> [[TMP13]])
+; CHECK-NEXT:    [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD37]], <2 x double> [[SPLAT_SPLAT52]], <2 x double> [[TMP13]])
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[A]], i64 80
 ; CHECK-NEXT:    [[COL_LOAD53:%.*]] = load <2 x double>, ptr [[TMP15]], align 8
 ; CHECK-NEXT:    [[VEC_GEP54:%.*]] = getelementptr i8, ptr [[A]], i64 112
@@ -309,18 +313,22 @@ define void @multiply_reuse_load(ptr noalias %A, ptr noalias %B, ptr noalias %C)
 ; CHECK-NEXT:    [[TMP25:%.*]] = fmul contract <2 x double> [[COL_LOAD74]], [[SPLAT_SPLAT88]]
 ; CHECK-NEXT:    [[SPLAT_SPLAT91:%.*]] = shufflevector <2 x double> [[COL_LOAD79]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD76]], <2 x double> [[SPLAT_SPLAT91]], <2 x double> [[TMP25]])
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr [[A]], i64 64
+; CHECK-NEXT:    [[COL_LOAD92:%.*]] = load <2 x double>, ptr [[TMP27]], align 8
+; CHECK-NEXT:    [[VEC_GEP93:%.*]] = getelementptr i8, ptr [[A]], i64 96
+; CHECK-NEXT:    [[COL_LOAD94:%.*]] = load <2 x double>, ptr [[VEC_GEP93]], align 8
 ; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[A]], i64 80
 ; CHECK-NEXT:    [[COL_LOAD95:%.*]] = load <2 x double>, ptr [[TMP28]], align 8
 ; CHECK-NEXT:    [[VEC_GEP96:%.*]] = getelementptr i8, ptr [[A]], i64 112
 ; CHECK-NEXT:    [[COL_LOAD97:%.*]] = load <2 x double>, ptr [[VEC_GEP96]], align 8
 ; CHECK-NEXT:    [[SPLAT_SPLAT101:%.*]] = shufflevector <2 x double> [[COL_LOAD95]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD77]], <2 x double> [[SPLAT_SPLAT101]], <2 x double> [[TMP24]])
+; CHECK-NEXT:    [[TMP29:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD92]], <2 x double> [[SPLAT_SPLAT101]], <2 x double> [[TMP24]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT104:%.*]] = shufflevector <2 x double> [[COL_LOAD95]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP30:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD79]], <2 x double> [[SPLAT_SPLAT104]], <2 x double> [[TMP27]])
+; CHECK-NEXT:    [[TMP30:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD94]], <2 x double> [[SPLAT_SPLAT104]], <2 x double> [[TMP29]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD97]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP29:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD77]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP26]])
+; CHECK-NEXT:    [[TMP31:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD92]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP26]])
 ; CHECK-NEXT:    [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD97]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD79]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP29]])
+; CHECK-NEXT:    [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD94]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP31]])
 ; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[C]], i64 64
 ; CHECK-NEXT:    store <2 x double> [[TMP30]], ptr [[TMP33]], align 8
 ; CHECK-NEXT:    [[VEC_GEP112:%.*]] = getelementptr i8, ptr [[C]], i64 96
diff --git a/llvm/test/Transforms/PhaseOrdering/early-arg-attrs-inference.ll b/llvm/test/Transforms/PhaseOrdering/early-arg-attrs-inference.ll
index b14d5e590ae7b..93a8c803aba37 100644
--- a/llvm/test/Transforms/PhaseOrdering/early-arg-attrs-inference.ll
+++ b/llvm/test/Transforms/PhaseOrdering/early-arg-attrs-inference.ll
@@ -3,7 +3,7 @@
 
 define i32 @f(ptr noalias %p, i32 %c) {
 ; CHECK-LABEL: define noundef i32 @f
-; CHECK-SAME: (ptr noalias readnone captures(none) [[P:%.*]], i32 [[C:%.*]]) local_unnamed_addr {
+; CHECK-SAME: (ptr noalias readonly captures(none) [[P:%.*]], i32 [[C:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:    tail call void @g()
 ; CHECK-NEXT:    tail call void @g()
 ; CHECK-NEXT:    tail call void @g()
diff --git a/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
index 98eb90c184d74..006f15a31c4e1 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
@@ -1,16 +1,37 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -O2 -S < %s | FileCheck %s
 
+; FIXME: This can still be folded to return true.
 define i1 @main(ptr %i2) {
 ; CHECK-LABEL: define noundef i1 @main(
-; CHECK-SAME: ptr writeonly captures(none) initializes((0, 3)) [[I2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  [[TRUE:.*:]]
+; CHECK-SAME: ptr captures(none) initializes((0, 3)) [[I2:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:    [[I1:%.*]] = alloca [3 x i8], align 1
 ; CHECK-NEXT:    store i8 0, ptr [[I2]], align 1
 ; CHECK-NEXT:    [[I3:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 1
 ; CHECK-NEXT:    store i8 1, ptr [[I3]], align 1
 ; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 2
 ; CHECK-NEXT:    store i8 2, ptr [[I4]], align 1
-; CHECK-NEXT:    ret i1 true
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 3, ptr nonnull [[I1]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[I1]], ptr noundef nonnull align 1 dereferenceable(3) [[I2]], i64 3, i1 false)
+; CHECK-NEXT:    [[I51:%.*]] = load i8, ptr [[I2]], align 1
+; CHECK-NEXT:    [[I6:%.*]] = icmp eq i8 [[I51]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 1
+; CHECK-NEXT:    [[I82:%.*]] = load i8, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[I9:%.*]] = icmp eq i8 [[I82]], 1
+; CHECK-NEXT:    [[I10:%.*]] = select i1 [[I6]], i1 [[I9]], i1 false
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 2
+; CHECK-NEXT:    [[I123:%.*]] = load i8, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[I13:%.*]] = icmp eq i8 [[I123]], 2
+; CHECK-NEXT:    [[I14:%.*]] = select i1 [[I10]], i1 [[I13]], i1 false
+; CHECK-NEXT:    br i1 [[I14]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; CHECK:       [[COMMON_RET:.*]]:
+; CHECK-NEXT:    ret i1 [[I14]]
+; CHECK:       [[TRUE]]:
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 3, ptr nonnull [[I1]])
+; CHECK-NEXT:    br label %[[COMMON_RET]]
+; CHECK:       [[FALSE]]:
+; CHECK-NEXT:    call void @assert_failed(ptr nonnull [[I1]])
+; CHECK-NEXT:    br label %[[COMMON_RET]]
 ;
   %i1 = alloca [3 x i8], align 1
   store i8 0, ptr %i2, align 1
diff --git a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
index d91dfc01649bc..b85c78ec8d2d0 100644
--- a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
@@ -231,13 +231,14 @@ define void @test6(ptr %in0, ptr %in1, ptr %in2) {
 ; COMBINE-NEXT:    [[TMP7:%.*]] = fmul <32 x float> [[TMP6]], [[TMP2]]
 ; COMBINE-NEXT:    [[GEP10:%.*]] = getelementptr inbounds nuw i8, ptr [[IN1]], i64 32
 ; COMBINE-NEXT:    [[GEP11:%.*]] = getelementptr inbounds nuw i8, ptr [[IN2:%.*]], i64 128
+; COMBINE-NEXT:    [[TMP8:%.*]] = load <8 x float>, ptr [[IN0]], align 16
 ; COMBINE-NEXT:    store <32 x float> [[TMP7]], ptr [[IN2]], align 16
 ; COMBINE-NEXT:    [[LOAD5:%.*]] = load <16 x i8>, ptr [[GEP10]], align 1
 ; COMBINE-NEXT:    [[TMP9:%.*]] = uitofp <16 x i8> [[LOAD5]] to <16 x float>
 ; COMBINE-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; COMBINE-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; COMBINE-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; COMBINE-NEXT:    [[TMP12:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; COMBINE-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; COMBINE-NEXT:    [[TMP14:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; COMBINE-NEXT:    [[TMP15:%.*]] = shufflevector <16 x float> [[TMP12]], <16 x float> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
 ; COMBINE-NEXT:    [[TMP16:%.*]] = shufflevector <16 x float> [[TMP15]], <16 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll
index 34839b5140b7f..ff1e165c8c54a 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll
@@ -106,56 +106,6 @@ define i32 @load(ptr nocapture %a, ptr nocapture %b) {
   ret i32 %5
 }
 
-;; Check the load is deleted.
-define i32 @load_not_pseudo(ptr noalias %arg, ptr noalias %arg1) {
-; CHECK-LABEL: @load_not_pseudo(
-; CHECK-NEXT:  bb:
-; CHECK-NEXT:    store i32 1, ptr [[ARG1:%.*]], align 4
-; CHECK-NEXT:    store i32 1, ptr [[ARG2:%.*]], align 4
-; CHECK-NEXT:    ret i32 1
-;
-bb:
-  store i32 1, ptr %arg, align 4
-  store i32 1, ptr %arg1, align 4
-  %i = load i32, ptr %arg, align 4
-  ret i32 %i
-}
-
-;; Check the load is deleted.
-define i32 @load_not_pseudo_2(ptr noalias %arg, ptr noalias %arg1) {
-; CHECK-LABEL: @load_not_pseudo_2(
-; CHECK-NEXT:  bb:
-; CHECK-NEXT:    store i32 1, ptr [[ARG:%.*]], align 4
-; CHECK-NEXT:    [[ARG1_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG1:%.*]], i64 4
-; CHECK-NEXT:    store i32 1, ptr [[ARG1_1]], align 4
-; CHECK-NEXT:    ret i32 1
-;
-bb:
-  store i32 1, ptr %arg, align 4
-  %arg1_1 = getelementptr inbounds i32, ptr %arg1, i32 1
-  store i32 1, ptr %arg1_1, align 4
-  %i = load i32, ptr %arg, align 4
-  ret i32 %i
-}
-
-;; Check the load is not deleted.
-define i32 @load_not_pseudo_3(ptr noalias %arg, ptr noalias %arg1, ptr noalias %arg2) {
-; CHECK-LABEL: @load_not_pseudo_3(
-; CHECK-NEXT:  bb:
-; CHECK-NEXT:    store i32 1, ptr [[ARG:%.*]], align 4
-; CHECK-NEXT:    store i32 1, ptr [[ARG1:%.*]], align 4
-; CHECK-NEXT:    store i32 1, ptr [[ARG2:%.*]], align 4
-; CHECK-NEXT:    [[I:%.*]] = load i32, ptr [[ARG]], align 4
-; CHECK-NEXT:    ret i32 [[I]]
-;
-bb:
-  store i32 1, ptr %arg, align 4
-  store i32 1, ptr %arg1, align 4
-  store i32 1, ptr %arg2, align 4
-  %i = load i32, ptr %arg, align 4
-  ret i32 %i
-}
-
 ;; Check the first store is deleted.
 define void @dse(ptr %p) {
 ; CHECK-LABEL: @dse(
