[llvm] [MemCpyOpt] Forward `memcpy` based on the actual copy memory location. (PR #87190)

via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 11 16:49:45 PDT 2024


https://github.com/DianQK updated https://github.com/llvm/llvm-project/pull/87190

>From fc89492789acf5657e2cdd4663f946e78cf662f5 Mon Sep 17 00:00:00 2001
From: DianQK <dianqk at dianqk.net>
Date: Tue, 2 Apr 2024 08:46:38 +0800
Subject: [PATCH 01/12] Pre-commit test cases

---
 .../MemCpyOpt/memcpy-memcpy-offset.ll         | 195 ++++++++++++++++++
 .../Transforms/PhaseOrdering/memcpy-offset.ll |  42 ++++
 2 files changed, 237 insertions(+)
 create mode 100644 llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
 create mode 100644 llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll

diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
new file mode 100644
index 0000000000000..fe5056d85dcd3
--- /dev/null
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
@@ -0,0 +1,195 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=memcpyopt -S -verify-memoryssa | FileCheck %s
+
+%buf = type [9 x i8]
+
+; We can forward `memcpy` because the copy location are the same,
+define void @forward_offset(ptr %dep_src) {
+; CHECK-LABEL: define void @forward_offset(
+; CHECK-SAME: ptr [[DEP_SRC:%.*]]) {
+; CHECK-NEXT:    [[DEP_DEST:%.*]] = alloca [9 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false)
+; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
+; CHECK-NEXT:    [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[SRC]], i64 6, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %dep_dest = alloca %buf, align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false)
+  %src = getelementptr inbounds i8, ptr %dep_dest, i64 1
+  %dest = getelementptr inbounds i8, ptr %dep_src, i64 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false)
+  ret void
+}
+
+; We need to update the align value of the source of `memcpy` when forwarding.
+define void @forward_offset_align(ptr %dep_src) {
+; CHECK-LABEL: define void @forward_offset_align(
+; CHECK-SAME: ptr [[DEP_SRC:%.*]]) {
+; CHECK-NEXT:    [[DEP_DEST:%.*]] = alloca [9 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 4 [[DEP_SRC]], i64 9, i1 false)
+; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 3
+; CHECK-NEXT:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 3
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[SRC]], i64 5, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %dep_dest = alloca %buf, align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 4 %dep_src, i64 9, i1 false)
+  %src = getelementptr inbounds i8, ptr %dep_dest, i64 3
+  %dest = getelementptr inbounds i8, ptr %dep_src, i64 3
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 5, i1 false)
+  ret void
+}
+
+; We can change the align value to 2 when forwarding.
+define void @forward_offset_align_2(ptr %dep_src) {
+; CHECK-LABEL: define void @forward_offset_align_2(
+; CHECK-SAME: ptr [[DEP_SRC:%.*]]) {
+; CHECK-NEXT:    [[DEP_DEST:%.*]] = alloca [9 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 4 [[DEP_SRC]], i64 9, i1 false)
+; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 2
+; CHECK-NEXT:    [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[SRC]], i64 6, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %dep_dest = alloca %buf, align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 4 %dep_src, i64 9, i1 false)
+  %src = getelementptr inbounds i8, ptr %dep_dest, i64 2
+  %dest = getelementptr inbounds i8, ptr %dep_src, i64 2
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false)
+  ret void
+}
+
+; We need to create a GEP instruction when forwarding.
+define void @forward_offset_with_gep(ptr %dep_src) {
+; CHECK-LABEL: define void @forward_offset_with_gep(
+; CHECK-SAME: ptr [[DEP_SRC:%.*]]) {
+; CHECK-NEXT:    [[DEP_DEST:%.*]] = alloca [9 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false)
+; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
+; CHECK-NEXT:    [[DEP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP1]], ptr align 1 [[SRC]], i64 6, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %dep_dest = alloca %buf, align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false)
+  %src = getelementptr inbounds i8, ptr %dep_dest, i64 1
+  %dest = getelementptr inbounds i8, ptr %dep_src, i64 2
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false)
+  ret void
+}
+
+; Make sure we pass the right parameters when calling `memcpy`.
+define void @forward_offset_memcpy(ptr %dep_src) {
+; CHECK-LABEL: define void @forward_offset_memcpy(
+; CHECK-SAME: ptr [[DEP_SRC:%.*]]) {
+; CHECK-NEXT:    [[DEP_DEST:%.*]] = alloca [9 x i8], align 1
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [9 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false)
+; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[SRC]], i64 6, i1 false)
+; CHECK-NEXT:    call void @use(ptr [[DEST]])
+; CHECK-NEXT:    ret void
+;
+  %dep_dest = alloca %buf, align 1
+  %dest = alloca %buf, align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false)
+  %src = getelementptr inbounds i8, ptr %dep_dest, i64 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false)
+  call void @use(ptr %dest)
+  ret void
+}
+
+; Make sure we pass the right parameters when calling `memcpy.inline`.
+define void @forward_offset_memcpy_inline(ptr %dep_src) {
+; CHECK-LABEL: define void @forward_offset_memcpy_inline(
+; CHECK-SAME: ptr [[DEP_SRC:%.*]]) {
+; CHECK-NEXT:    [[DEP_DEST:%.*]] = alloca [9 x i8], align 1
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [9 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false)
+; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[SRC]], i64 6, i1 false)
+; CHECK-NEXT:    call void @use(ptr [[DEST]])
+; CHECK-NEXT:    ret void
+;
+  %dep_dest = alloca %buf, align 1
+  %dest = alloca %buf, align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false)
+  %src = getelementptr inbounds i8, ptr %dep_dest, i64 1
+  call void @llvm.memcpy.inline.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false)
+  call void @use(ptr %dest)
+  ret void
+}
+
+; We cannot forward `memcpy` because it exceeds the size of `memcpy` it depends on.
+define void @do_not_forward_oversize_offset(ptr %dep_src) {
+; CHECK-LABEL: define void @do_not_forward_oversize_offset(
+; CHECK-SAME: ptr [[DEP_SRC:%.*]]) {
+; CHECK-NEXT:    [[DEP_DEST:%.*]] = alloca [9 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 6, i1 false)
+; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
+; CHECK-NEXT:    [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[SRC]], i64 6, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %dep_dest = alloca %buf, align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 6, i1 false)
+  %src = getelementptr inbounds i8, ptr %dep_dest, i64 1
+  %dest = getelementptr inbounds i8, ptr %dep_src, i64 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false)
+  ret void
+}
+
+; We can forward `memcpy` because the write operation does not corrupt the location to be copied.
+define void @forward_offset_and_store(ptr %dep_src) {
+; CHECK-LABEL: define void @forward_offset_and_store(
+; CHECK-SAME: ptr [[DEP_SRC:%.*]]) {
+; CHECK-NEXT:    [[DEP_DEST:%.*]] = alloca [9 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false)
+; CHECK-NEXT:    store i8 1, ptr [[DEP_SRC]], align 1
+; CHECK-NEXT:    [[DEP_SRC_END:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 6
+; CHECK-NEXT:    store i8 1, ptr [[DEP_SRC_END]], align 1
+; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
+; CHECK-NEXT:    [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[SRC]], i64 5, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %dep_dest = alloca %buf, align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false)
+  store i8 1, ptr %dep_src, align 1
+  %dep_src_end = getelementptr inbounds i8, ptr %dep_src, i64 6
+  store i8 1, ptr %dep_src_end, align 1
+  %src = getelementptr inbounds i8, ptr %dep_dest, i64 1
+  %dest = getelementptr inbounds i8, ptr %dep_src, i64 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 5, i1 false)
+  ret void
+}
+
+; We cannot forward `memcpy` because the write operation alters the location to be copied.
+; Also, make sure we have removed the GEP instruction that was created temporarily.
+define void @do_not_forward_offset_and_store(ptr %dep_src) {
+; CHECK-LABEL: define void @do_not_forward_offset_and_store(
+; CHECK-SAME: ptr [[DEP_SRC:%.*]]) {
+; CHECK-NEXT:    [[DEP_DEST:%.*]] = alloca [9 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false)
+; CHECK-NEXT:    [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
+; CHECK-NEXT:    store i8 1, ptr [[DEP]], align 1
+; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
+; CHECK-NEXT:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[SRC]], i64 5, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %dep_dest = alloca %buf, align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false)
+  %dep_src_offset = getelementptr inbounds i8, ptr %dep_src, i64 1
+  store i8 1, ptr %dep_src_offset, align 1
+  %src = getelementptr inbounds i8, ptr %dep_dest, i64 1
+  %dest = getelementptr inbounds i8, ptr %dep_src, i64 2
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 5, i1 false)
+  ret void
+}
+
+declare void @use(ptr)
+
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1)
+declare void @llvm.memcpy.inline.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1)
diff --git a/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll b/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll
new file mode 100644
index 0000000000000..0d34932937eee
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=memcpyopt,dse,instcombine -S -verify-memoryssa | FileCheck --check-prefix=CUSTOM %s
+; RUN: opt < %s -O2 -S | FileCheck --check-prefix=O2 %s
+
+%buf = type [7 x i8]
+
+; Check that we eliminate all `memcpy` calls in this function.
+define void @forward_offset_and_store(ptr %dep_src) {
+; CUSTOM-LABEL: define void @forward_offset_and_store(
+; CUSTOM-SAME: ptr [[DEP_SRC:%.*]]) {
+; CUSTOM-NEXT:    [[DEP_DEST:%.*]] = alloca [7 x i8], align 1
+; CUSTOM-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(7) [[DEP_DEST]], ptr noundef nonnull align 1 dereferenceable(7) [[DEP_SRC]], i64 7, i1 false)
+; CUSTOM-NEXT:    store i8 1, ptr [[DEP_SRC]], align 1
+; CUSTOM-NEXT:    [[DEP_SRC_END:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 6
+; CUSTOM-NEXT:    store i8 1, ptr [[DEP_SRC_END]], align 1
+; CUSTOM-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
+; CUSTOM-NEXT:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
+; CUSTOM-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(5) [[DEST]], ptr noundef nonnull align 1 dereferenceable(5) [[SRC]], i64 5, i1 false)
+; CUSTOM-NEXT:    ret void
+;
+; O2-LABEL: define void @forward_offset_and_store(
+; O2-SAME: ptr nocapture [[DEP_SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; O2-NEXT:    [[DEP_DEST:%.*]] = alloca [7 x i8], align 1
+; O2-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(7) [[DEP_DEST]], ptr noundef nonnull align 1 dereferenceable(7) [[DEP_SRC]], i64 7, i1 false)
+; O2-NEXT:    store i8 1, ptr [[DEP_SRC]], align 1
+; O2-NEXT:    [[DEP_SRC_END:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 6
+; O2-NEXT:    store i8 1, ptr [[DEP_SRC_END]], align 1
+; O2-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
+; O2-NEXT:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
+; O2-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(5) [[DEST]], ptr noundef nonnull align 1 dereferenceable(5) [[SRC]], i64 5, i1 false)
+; O2-NEXT:    ret void
+;
+  %dep_dest = alloca %buf, align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false)
+  store i8 1, ptr %dep_src, align 1
+  %dep_src_end = getelementptr inbounds i8, ptr %dep_src, i64 6
+  store i8 1, ptr %dep_src_end, align 1
+  %src = getelementptr inbounds i8, ptr %dep_dest, i64 1
+  %dest = getelementptr inbounds i8, ptr %dep_src, i64 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 5, i1 false)
+  ret void
+}

>From d5a966c9ecb484ea1777b7417bc151046318fe79 Mon Sep 17 00:00:00 2001
From: DianQK <dianqk at dianqk.net>
Date: Tue, 2 Apr 2024 08:47:44 +0800
Subject: [PATCH 02/12] [MemCpyOpt] Calculate the offset value to forward
 `memcpy`

---
 .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 85 +++++++++++++------
 .../MemCpyOpt/memcpy-memcpy-offset.ll         | 17 ++--
 .../Transforms/PhaseOrdering/memcpy-offset.ll | 12 +--
 3 files changed, 72 insertions(+), 42 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 9bf87f2370531..e41c86069fb4e 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -14,6 +14,7 @@
 #include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/iterator_range.h"
@@ -1124,28 +1125,67 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
 bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
                                                   MemCpyInst *MDep,
                                                   BatchAAResults &BAA) {
-  // We can only transforms memcpy's where the dest of one is the source of the
-  // other.
-  if (M->getSource() != MDep->getDest() || MDep->isVolatile())
-    return false;
-
   // If dep instruction is reading from our current input, then it is a noop
-  // transfer and substituting the input won't change this instruction.  Just
-  // ignore the input and let someone else zap MDep.  This handles cases like:
+  // transfer and substituting the input won't change this instruction. Just
+  // ignore the input and let someone else zap MDep. This handles cases like:
   //    memcpy(a <- a)
   //    memcpy(b <- a)
   if (M->getSource() == MDep->getSource())
     return false;
 
-  // Second, the length of the memcpy's must be the same, or the preceding one
+  // We can only optimize non-volatile memcpy's.
+  if (MDep->isVolatile())
+    return false;
+
+  int64_t MForwardOffset = 0;
+  const DataLayout &DL = M->getModule()->getDataLayout();
+  // We can only transform memcpy's where the dest of one is the source of the
+  // other, or where that source lies at a non-negative offset into the dest.
+  if (M->getSource() != MDep->getDest()) {
+    std::optional<int64_t> Offset =
+        M->getSource()->getPointerOffsetFrom(MDep->getDest(), DL);
+    if (!Offset || *Offset < 0)
+      return false;
+    MForwardOffset = *Offset;
+  }
+
+  // The length of the memcpy's must be the same, or the preceding one
   // must be larger than the following one.
-  if (MDep->getLength() != M->getLength()) {
+  if (MForwardOffset != 0 || (MDep->getLength() != M->getLength())) {
     auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
     auto *MLen = dyn_cast<ConstantInt>(M->getLength());
-    if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
+    if (!MDepLen || !MLen ||
+        MDepLen->getZExtValue() < MLen->getZExtValue() + MForwardOffset)
       return false;
   }
 
+  IRBuilder<> Builder(M);
+  auto *CopySource = MDep->getRawSource();
+  auto CleanupOnFailure = llvm::make_scope_exit([&CopySource] {
+    if (CopySource->use_empty())
+      cast<Instruction>(CopySource)->eraseFromParent();
+  });
+  MaybeAlign CopySourceAlign = MDep->getSourceAlign();
+  // We just need to calculate the actual size of the copy.
+  auto MCopyLoc = MemoryLocation::getForSource(MDep).getWithNewSize(
+      MemoryLocation::getForSource(M).Size);
+
+  // We need to update `MCopyLoc` if an offset exists.
+  if (MForwardOffset > 0) {
+    // The copy destination of `M` maybe can serve as the source of copying.
+    std::optional<int64_t> MDestOffset =
+        M->getRawDest()->getPointerOffsetFrom(MDep->getRawSource(), DL);
+    if (MDestOffset && *MDestOffset == MForwardOffset)
+      CopySource = M->getRawDest();
+    else
+      CopySource = Builder.CreateInBoundsPtrAdd(
+          CopySource, ConstantInt::get(Type::getInt64Ty(Builder.getContext()),
+                                       MForwardOffset));
+    MCopyLoc = MCopyLoc.getWithNewPtr(CopySource);
+    if (CopySourceAlign)
+      CopySourceAlign = commonAlignment(*CopySourceAlign, MForwardOffset);
+  }
+
   // Verify that the copied-from memory doesn't change in between the two
   // transfers.  For example, in:
   //    memcpy(a <- b)
@@ -1155,10 +1195,8 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
   //
   // TODO: If the code between M and MDep is transparent to the destination "c",
   // then we could still perform the xform by moving M up to the first memcpy.
-  // TODO: It would be sufficient to check the MDep source up to the memcpy
-  // size of M, rather than MDep.
-  if (writtenBetween(MSSA, BAA, MemoryLocation::getForSource(MDep),
-                     MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(M)))
+  if (writtenBetween(MSSA, BAA, MCopyLoc, MSSA->getMemoryAccess(MDep),
+                     MSSA->getMemoryAccess(M)))
     return false;
 
   // No need to create `memcpy(a <- a)`.
@@ -1191,23 +1229,22 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
 
   // TODO: Is this worth it if we're creating a less aligned memcpy? For
   // example we could be moving from movaps -> movq on x86.
-  IRBuilder<> Builder(M);
   Instruction *NewM;
   if (UseMemMove)
-    NewM = Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(),
-                                 MDep->getRawSource(), MDep->getSourceAlign(),
-                                 M->getLength(), M->isVolatile());
+    NewM =
+        Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(), CopySource,
+                              CopySourceAlign, M->getLength(), M->isVolatile());
   else if (isa<MemCpyInlineInst>(M)) {
     // llvm.memcpy may be promoted to llvm.memcpy.inline, but the converse is
     // never allowed since that would allow the latter to be lowered as a call
     // to an external function.
-    NewM = Builder.CreateMemCpyInline(
-        M->getRawDest(), M->getDestAlign(), MDep->getRawSource(),
-        MDep->getSourceAlign(), M->getLength(), M->isVolatile());
+    NewM = Builder.CreateMemCpyInline(M->getRawDest(), M->getDestAlign(),
+                                      CopySource, CopySourceAlign,
+                                      M->getLength(), M->isVolatile());
   } else
-    NewM = Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(),
-                                MDep->getRawSource(), MDep->getSourceAlign(),
-                                M->getLength(), M->isVolatile());
+    NewM =
+        Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(), CopySource,
+                             CopySourceAlign, M->getLength(), M->isVolatile());
   NewM->copyMetadata(*M, LLVMContext::MD_DIAssignID);
 
   assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M)));
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
index fe5056d85dcd3..4d00ea70a564d 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
@@ -11,7 +11,7 @@ define void @forward_offset(ptr %dep_src) {
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false)
 ; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
 ; CHECK-NEXT:    [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[SRC]], i64 6, i1 false)
+; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[DEP]], i64 6, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %dep_dest = alloca %buf, align 1
@@ -30,7 +30,7 @@ define void @forward_offset_align(ptr %dep_src) {
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 4 [[DEP_SRC]], i64 9, i1 false)
 ; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 3
 ; CHECK-NEXT:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 3
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[SRC]], i64 5, i1 false)
+; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[DEST]], i64 5, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %dep_dest = alloca %buf, align 1
@@ -49,7 +49,7 @@ define void @forward_offset_align_2(ptr %dep_src) {
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 4 [[DEP_SRC]], i64 9, i1 false)
 ; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 2
 ; CHECK-NEXT:    [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 2
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[SRC]], i64 6, i1 false)
+; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEP]], ptr align 2 [[DEP]], i64 6, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %dep_dest = alloca %buf, align 1
@@ -68,7 +68,8 @@ define void @forward_offset_with_gep(ptr %dep_src) {
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false)
 ; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
 ; CHECK-NEXT:    [[DEP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 2
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP1]], ptr align 1 [[SRC]], i64 6, i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
+; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEP1]], ptr align 1 [[TMP1]], i64 6, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %dep_dest = alloca %buf, align 1
@@ -87,7 +88,8 @@ define void @forward_offset_memcpy(ptr %dep_src) {
 ; CHECK-NEXT:    [[DEST:%.*]] = alloca [9 x i8], align 1
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false)
 ; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[SRC]], i64 6, i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 6, i1 false)
 ; CHECK-NEXT:    call void @use(ptr [[DEST]])
 ; CHECK-NEXT:    ret void
 ;
@@ -108,7 +110,8 @@ define void @forward_offset_memcpy_inline(ptr %dep_src) {
 ; CHECK-NEXT:    [[DEST:%.*]] = alloca [9 x i8], align 1
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false)
 ; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
-; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[SRC]], i64 6, i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 6, i1 false)
 ; CHECK-NEXT:    call void @use(ptr [[DEST]])
 ; CHECK-NEXT:    ret void
 ;
@@ -151,7 +154,7 @@ define void @forward_offset_and_store(ptr %dep_src) {
 ; CHECK-NEXT:    store i8 1, ptr [[DEP_SRC_END]], align 1
 ; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
 ; CHECK-NEXT:    [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[SRC]], i64 5, i1 false)
+; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[DEP]], i64 5, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %dep_dest = alloca %buf, align 1
diff --git a/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll b/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll
index 0d34932937eee..c7c05901455bc 100644
--- a/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll
+++ b/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll
@@ -8,26 +8,16 @@
 define void @forward_offset_and_store(ptr %dep_src) {
 ; CUSTOM-LABEL: define void @forward_offset_and_store(
 ; CUSTOM-SAME: ptr [[DEP_SRC:%.*]]) {
-; CUSTOM-NEXT:    [[DEP_DEST:%.*]] = alloca [7 x i8], align 1
-; CUSTOM-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(7) [[DEP_DEST]], ptr noundef nonnull align 1 dereferenceable(7) [[DEP_SRC]], i64 7, i1 false)
 ; CUSTOM-NEXT:    store i8 1, ptr [[DEP_SRC]], align 1
 ; CUSTOM-NEXT:    [[DEP_SRC_END:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 6
 ; CUSTOM-NEXT:    store i8 1, ptr [[DEP_SRC_END]], align 1
-; CUSTOM-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
-; CUSTOM-NEXT:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
-; CUSTOM-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(5) [[DEST]], ptr noundef nonnull align 1 dereferenceable(5) [[SRC]], i64 5, i1 false)
 ; CUSTOM-NEXT:    ret void
 ;
 ; O2-LABEL: define void @forward_offset_and_store(
-; O2-SAME: ptr nocapture [[DEP_SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; O2-NEXT:    [[DEP_DEST:%.*]] = alloca [7 x i8], align 1
-; O2-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(7) [[DEP_DEST]], ptr noundef nonnull align 1 dereferenceable(7) [[DEP_SRC]], i64 7, i1 false)
+; O2-SAME: ptr nocapture writeonly [[DEP_SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; O2-NEXT:    store i8 1, ptr [[DEP_SRC]], align 1
 ; O2-NEXT:    [[DEP_SRC_END:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 6
 ; O2-NEXT:    store i8 1, ptr [[DEP_SRC_END]], align 1
-; O2-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
-; O2-NEXT:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
-; O2-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(5) [[DEST]], ptr noundef nonnull align 1 dereferenceable(5) [[SRC]], i64 5, i1 false)
 ; O2-NEXT:    ret void
 ;
   %dep_dest = alloca %buf, align 1

>From 289b44d1b13c84136ddd55065b8b8333880567ba Mon Sep 17 00:00:00 2001
From: DianQK <dianqk at dianqk.net>
Date: Tue, 9 Jul 2024 20:40:30 +0800
Subject: [PATCH 03/12] Fix nits

---
 llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index e41c86069fb4e..ae11ca057d47a 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1151,7 +1151,7 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
 
   // The length of the memcpy's must be the same, or the preceding one
   // must be larger than the following one.
-  if (MForwardOffset != 0 || (MDep->getLength() != M->getLength())) {
+  if (MForwardOffset != 0 || MDep->getLength() != M->getLength()) {
     auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
     auto *MLen = dyn_cast<ConstantInt>(M->getLength());
     if (!MDepLen || !MLen ||
@@ -1175,12 +1175,11 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
     // The copy destination of `M` maybe can serve as the source of copying.
     std::optional<int64_t> MDestOffset =
         M->getRawDest()->getPointerOffsetFrom(MDep->getRawSource(), DL);
-    if (MDestOffset && *MDestOffset == MForwardOffset)
+    if (MDestOffset == MForwardOffset)
       CopySource = M->getRawDest();
     else
       CopySource = Builder.CreateInBoundsPtrAdd(
-          CopySource, ConstantInt::get(Type::getInt64Ty(Builder.getContext()),
-                                       MForwardOffset));
+          CopySource, Builder.getInt64(MForwardOffset));
     MCopyLoc = MCopyLoc.getWithNewPtr(CopySource);
     if (CopySourceAlign)
       CopySourceAlign = commonAlignment(*CopySourceAlign, MForwardOffset);

>From d238bb8853f1d28c4a9617a249d7ee2652e15f2d Mon Sep 17 00:00:00 2001
From: DianQK <dianqk at dianqk.net>
Date: Tue, 9 Jul 2024 21:33:18 +0800
Subject: [PATCH 04/12] Add comments for the forward offset

---
 llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index ae11ca057d47a..3bbade11c2ec2 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1170,7 +1170,11 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
   auto MCopyLoc = MemoryLocation::getForSource(MDep).getWithNewSize(
       MemoryLocation::getForSource(M).Size);
 
-  // We need to update `MCopyLoc` if an offset exists.
+  // When the forwarding offset is greater than 0, we transform
+  //    memcpy(d1 <- s1)
+  //    memcpy(d2 <- d1+o)
+  // to
+  //    memcpy(d2 <- s1+o)
   if (MForwardOffset > 0) {
     // The copy destination of `M` maybe can serve as the source of copying.
     std::optional<int64_t> MDestOffset =
@@ -1180,6 +1184,7 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
     else
       CopySource = Builder.CreateInBoundsPtrAdd(
           CopySource, Builder.getInt64(MForwardOffset));
+    // We need to update `MCopyLoc` if an offset exists.
     MCopyLoc = MCopyLoc.getWithNewPtr(CopySource);
     if (CopySourceAlign)
       CopySourceAlign = commonAlignment(*CopySourceAlign, MForwardOffset);

>From a34be7372e6bbb542f608056a0c616f4982cdc41 Mon Sep 17 00:00:00 2001
From: DianQK <dianqk at dianqk.net>
Date: Tue, 9 Jul 2024 21:53:01 +0800
Subject: [PATCH 05/12] Update memcpy-memcpy-offset.ll

---
 .../MemCpyOpt/memcpy-memcpy-offset.ll         | 224 +++++++++---------
 1 file changed, 115 insertions(+), 109 deletions(-)

diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
index 4d00ea70a564d..447c55fd9b690 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
@@ -4,191 +4,197 @@
 %buf = type [9 x i8]
 
 ; We can forward `memcpy` because the copy locations are the same.
-define void @forward_offset(ptr %dep_src) {
+define void @forward_offset(ptr %src, ptr %dest) {
 ; CHECK-LABEL: define void @forward_offset(
-; CHECK-SAME: ptr [[DEP_SRC:%.*]]) {
+; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DEST:%.*]]) {
 ; CHECK-NEXT:    [[DEP_DEST:%.*]] = alloca [9 x i8], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false)
-; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
-; CHECK-NEXT:    [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
-; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[DEP]], i64 6, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[SRC]], i64 7, i1 false)
+; CHECK-NEXT:    [[SRC_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
+; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 6, i1 false)
 ; CHECK-NEXT:    ret void
 ;
-  %dep_dest = alloca %buf, align 1
-  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false)
-  %src = getelementptr inbounds i8, ptr %dep_dest, i64 1
-  %dest = getelementptr inbounds i8, ptr %dep_src, i64 1
-  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false)
+  %cpy_tmp = alloca %buf, align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %cpy_tmp, ptr align 1 %src, i64 7, i1 false)
+  %cpy_tmp_offset = getelementptr inbounds i8, ptr %cpy_tmp, i64 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %cpy_tmp_offset, i64 6, i1 false)
   ret void
 }
 
 ; We need to update the align value of the source of `memcpy` when forwarding.
-define void @forward_offset_align(ptr %dep_src) {
+define void @forward_offset_align(ptr %src, ptr %dest) {
 ; CHECK-LABEL: define void @forward_offset_align(
-; CHECK-SAME: ptr [[DEP_SRC:%.*]]) {
+; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DEST:%.*]]) {
 ; CHECK-NEXT:    [[DEP_DEST:%.*]] = alloca [9 x i8], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 4 [[DEP_SRC]], i64 9, i1 false)
-; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 3
-; CHECK-NEXT:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 3
-; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[DEST]], i64 5, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 4 [[SRC]], i64 9, i1 false)
+; CHECK-NEXT:    [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 3
+; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 5, i1 false)
 ; CHECK-NEXT:    ret void
 ;
-  %dep_dest = alloca %buf, align 1
-  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 4 %dep_src, i64 9, i1 false)
-  %src = getelementptr inbounds i8, ptr %dep_dest, i64 3
-  %dest = getelementptr inbounds i8, ptr %dep_src, i64 3
-  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 5, i1 false)
+  %cpy_tmp = alloca %buf, align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %cpy_tmp, ptr align 4 %src, i64 9, i1 false)
+  %cpy_tmp_offset = getelementptr inbounds i8, ptr %cpy_tmp, i64 3
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %cpy_tmp_offset, i64 5, i1 false)
   ret void
 }
 
 ; We can change the align value to 2 when forwarding.
-define void @forward_offset_align_2(ptr %dep_src) {
+define void @forward_offset_align_2(ptr %src, ptr %dest) {
 ; CHECK-LABEL: define void @forward_offset_align_2(
-; CHECK-SAME: ptr [[DEP_SRC:%.*]]) {
+; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DEST:%.*]]) {
 ; CHECK-NEXT:    [[DEP_DEST:%.*]] = alloca [9 x i8], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 4 [[DEP_SRC]], i64 9, i1 false)
-; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 2
-; CHECK-NEXT:    [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 2
-; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEP]], ptr align 2 [[DEP]], i64 6, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 4 [[SRC]], i64 9, i1 false)
+; CHECK-NEXT:    [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 2
+; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEST]], ptr align 2 [[TMP1]], i64 6, i1 false)
 ; CHECK-NEXT:    ret void
 ;
-  %dep_dest = alloca %buf, align 1
-  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 4 %dep_src, i64 9, i1 false)
-  %src = getelementptr inbounds i8, ptr %dep_dest, i64 2
-  %dest = getelementptr inbounds i8, ptr %dep_src, i64 2
-  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false)
+  %cpy_tmp = alloca %buf, align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %cpy_tmp, ptr align 4 %src, i64 9, i1 false)
+  %cpy_tmp_offset = getelementptr inbounds i8, ptr %cpy_tmp, i64 2
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %cpy_tmp_offset, i64 6, i1 false)
+  ret void
+}
+
+; If the copy destination can be used as the copy source, we don't need to create a GEP instruction.
+define void @forward_offset_without_gep(ptr %src) {
+; CHECK-LABEL: define void @forward_offset_without_gep(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[TMP:%.*]] = alloca [9 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP]], ptr align 1 [[SRC]], i64 7, i1 false)
+; CHECK-NEXT:    [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
+; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 1 [[TMP1]], ptr align 1 [[TMP1]], i64 6, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %cpy_tmp = alloca %buf, align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %cpy_tmp, ptr align 1 %src, i64 7, i1 false)
+  %cpy_tmp_offset = getelementptr inbounds i8, ptr %cpy_tmp, i64 1
+  %dest = getelementptr inbounds i8, ptr %src, i64 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %cpy_tmp_offset, i64 6, i1 false)
   ret void
 }
 
 ; We need to create a GEP instruction when forwarding.
-define void @forward_offset_with_gep(ptr %dep_src) {
+define void @forward_offset_with_gep(ptr %src) {
 ; CHECK-LABEL: define void @forward_offset_with_gep(
-; CHECK-SAME: ptr [[DEP_SRC:%.*]]) {
+; CHECK-SAME: ptr [[SRC:%.*]]) {
 ; CHECK-NEXT:    [[DEP_DEST:%.*]] = alloca [9 x i8], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false)
-; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
-; CHECK-NEXT:    [[DEP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
-; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEP1]], ptr align 1 [[TMP1]], i64 6, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[SRC]], i64 7, i1 false)
+; CHECK-NEXT:    [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
+; CHECK-NEXT:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
+; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 6, i1 false)
 ; CHECK-NEXT:    ret void
 ;
-  %dep_dest = alloca %buf, align 1
-  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false)
-  %src = getelementptr inbounds i8, ptr %dep_dest, i64 1
-  %dest = getelementptr inbounds i8, ptr %dep_src, i64 2
-  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false)
+  %cpy_tmp = alloca %buf, align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %cpy_tmp, ptr align 1 %src, i64 7, i1 false)
+  %cpy_tmp_offset = getelementptr inbounds i8, ptr %cpy_tmp, i64 1
+  %dest = getelementptr inbounds i8, ptr %src, i64 2
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %cpy_tmp_offset, i64 6, i1 false)
   ret void
 }
 
 ; Make sure we pass the right parameters when calling `memcpy`.
-define void @forward_offset_memcpy(ptr %dep_src) {
+define void @forward_offset_memcpy(ptr %src, ptr %dest) {
 ; CHECK-LABEL: define void @forward_offset_memcpy(
-; CHECK-SAME: ptr [[DEP_SRC:%.*]]) {
+; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DEST:%.*]]) {
 ; CHECK-NEXT:    [[DEP_DEST:%.*]] = alloca [9 x i8], align 1
-; CHECK-NEXT:    [[DEST:%.*]] = alloca [9 x i8], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false)
-; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 6, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[SRC]], i64 7, i1 false)
+; CHECK-NEXT:    [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
+; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 6, i1 false)
 ; CHECK-NEXT:    call void @use(ptr [[DEST]])
 ; CHECK-NEXT:    ret void
 ;
-  %dep_dest = alloca %buf, align 1
-  %dest = alloca %buf, align 1
-  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false)
-  %src = getelementptr inbounds i8, ptr %dep_dest, i64 1
-  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false)
+  %cpy_tmp = alloca %buf, align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %cpy_tmp, ptr align 1 %src, i64 7, i1 false)
+  %cpy_tmp_offset = getelementptr inbounds i8, ptr %cpy_tmp, i64 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %cpy_tmp_offset, i64 6, i1 false)
   call void @use(ptr %dest)
   ret void
 }
 
 ; Make sure we pass the right parameters when calling `memcpy.inline`.
-define void @forward_offset_memcpy_inline(ptr %dep_src) {
+define void @forward_offset_memcpy_inline(ptr %src, ptr %dest) {
 ; CHECK-LABEL: define void @forward_offset_memcpy_inline(
-; CHECK-SAME: ptr [[DEP_SRC:%.*]]) {
+; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DEST:%.*]]) {
 ; CHECK-NEXT:    [[DEP_DEST:%.*]] = alloca [9 x i8], align 1
-; CHECK-NEXT:    [[DEST:%.*]] = alloca [9 x i8], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false)
-; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[SRC]], i64 7, i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
 ; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 6, i1 false)
 ; CHECK-NEXT:    call void @use(ptr [[DEST]])
 ; CHECK-NEXT:    ret void
 ;
-  %dep_dest = alloca %buf, align 1
-  %dest = alloca %buf, align 1
-  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false)
-  %src = getelementptr inbounds i8, ptr %dep_dest, i64 1
-  call void @llvm.memcpy.inline.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false)
+  %cpy_tmp = alloca %buf, align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %cpy_tmp, ptr align 1 %src, i64 7, i1 false)
+  %cpy_tmp_offset = getelementptr inbounds i8, ptr %cpy_tmp, i64 1
+  call void @llvm.memcpy.inline.p0.p0.i64(ptr align 1 %dest, ptr align 1 %cpy_tmp_offset, i64 6, i1 false)
   call void @use(ptr %dest)
   ret void
 }
 
 ; We cannot forward `memcpy` because it exceeds the size of `memcpy` it depends on.
-define void @do_not_forward_oversize_offset(ptr %dep_src) {
+define void @do_not_forward_oversize_offset(ptr %src, ptr %dest) {
 ; CHECK-LABEL: define void @do_not_forward_oversize_offset(
-; CHECK-SAME: ptr [[DEP_SRC:%.*]]) {
+; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DEST:%.*]]) {
 ; CHECK-NEXT:    [[DEP_DEST:%.*]] = alloca [9 x i8], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 6, i1 false)
-; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
-; CHECK-NEXT:    [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[SRC]], i64 6, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[SRC]], i64 6, i1 false)
+; CHECK-NEXT:    [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP_OFFSET]], i64 6, i1 false)
 ; CHECK-NEXT:    ret void
 ;
-  %dep_dest = alloca %buf, align 1
-  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 6, i1 false)
-  %src = getelementptr inbounds i8, ptr %dep_dest, i64 1
-  %dest = getelementptr inbounds i8, ptr %dep_src, i64 1
-  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false)
+  %cpy_tmp = alloca %buf, align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %cpy_tmp, ptr align 1 %src, i64 6, i1 false)
+  %cpy_tmp_offset = getelementptr inbounds i8, ptr %cpy_tmp, i64 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %cpy_tmp_offset, i64 6, i1 false)
   ret void
 }
 
 ; We can forward `memcpy` because the write operation does not corrupt the location to be copied.
-define void @forward_offset_and_store(ptr %dep_src) {
+define void @forward_offset_and_store(ptr %src, ptr %dest) {
 ; CHECK-LABEL: define void @forward_offset_and_store(
-; CHECK-SAME: ptr [[DEP_SRC:%.*]]) {
+; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DEST:%.*]]) {
 ; CHECK-NEXT:    [[DEP_DEST:%.*]] = alloca [9 x i8], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false)
-; CHECK-NEXT:    store i8 1, ptr [[DEP_SRC]], align 1
-; CHECK-NEXT:    [[DEP_SRC_END:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 6
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[SRC]], i64 7, i1 false)
+; CHECK-NEXT:    store i8 1, ptr [[SRC]], align 1
+; CHECK-NEXT:    [[DEP_SRC_END:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 6
 ; CHECK-NEXT:    store i8 1, ptr [[DEP_SRC_END]], align 1
-; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
-; CHECK-NEXT:    [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
-; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[DEP]], i64 5, i1 false)
+; CHECK-NEXT:    [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
+; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 5, i1 false)
 ; CHECK-NEXT:    ret void
 ;
-  %dep_dest = alloca %buf, align 1
-  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false)
-  store i8 1, ptr %dep_src, align 1
-  %dep_src_end = getelementptr inbounds i8, ptr %dep_src, i64 6
-  store i8 1, ptr %dep_src_end, align 1
-  %src = getelementptr inbounds i8, ptr %dep_dest, i64 1
-  %dest = getelementptr inbounds i8, ptr %dep_src, i64 1
-  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 5, i1 false)
+  %cpy_tmp = alloca %buf, align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %cpy_tmp, ptr align 1 %src, i64 7, i1 false)
+  store i8 1, ptr %src, align 1
+  %src_end = getelementptr inbounds i8, ptr %src, i64 6
+  store i8 1, ptr %src_end, align 1
+  %cpy_tmp_offset  = getelementptr inbounds i8, ptr %cpy_tmp, i64 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %cpy_tmp_offset, i64 5, i1 false)
   ret void
 }
 
 ; We cannot forward `memcpy` because the write operation alters the location to be copied.
 ; Also, make sure we have removed the GEP instruction that was created temporarily.
-define void @do_not_forward_offset_and_store(ptr %dep_src) {
+define void @do_not_forward_offset_and_store(ptr %src, ptr %dest) {
 ; CHECK-LABEL: define void @do_not_forward_offset_and_store(
-; CHECK-SAME: ptr [[DEP_SRC:%.*]]) {
+; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DEST:%.*]]) {
 ; CHECK-NEXT:    [[DEP_DEST:%.*]] = alloca [9 x i8], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false)
-; CHECK-NEXT:    [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[SRC]], i64 7, i1 false)
+; CHECK-NEXT:    [[DEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
 ; CHECK-NEXT:    store i8 1, ptr [[DEP]], align 1
-; CHECK-NEXT:    [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
-; CHECK-NEXT:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 2
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[SRC]], i64 5, i1 false)
+; CHECK-NEXT:    [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP_OFFSET]], i64 5, i1 false)
 ; CHECK-NEXT:    ret void
 ;
-  %dep_dest = alloca %buf, align 1
-  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false)
-  %dep_src_offset = getelementptr inbounds i8, ptr %dep_src, i64 1
-  store i8 1, ptr %dep_src_offset, align 1
-  %src = getelementptr inbounds i8, ptr %dep_dest, i64 1
-  %dest = getelementptr inbounds i8, ptr %dep_src, i64 2
-  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 5, i1 false)
+  %cpy_tmp = alloca %buf, align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %cpy_tmp, ptr align 1 %src, i64 7, i1 false)
+  %src_offset = getelementptr inbounds i8, ptr %src, i64 1
+  store i8 1, ptr %src_offset, align 1
+  %cpy_tmp_offset = getelementptr inbounds i8, ptr %cpy_tmp, i64 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %cpy_tmp_offset, i64 5, i1 false)
   ret void
 }
 

>From 1da22cd2fb1f5f5b4917ff36b934ea4d6d01d342 Mon Sep 17 00:00:00 2001
From: DianQK <dianqk at dianqk.net>
Date: Tue, 9 Jul 2024 22:45:58 +0800
Subject: [PATCH 06/12] Leave some FIXME comments

---
 llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll | 1 +
 llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
index 447c55fd9b690..82086eed54332 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
@@ -58,6 +58,7 @@ define void @forward_offset_align_2(ptr %src, ptr %dest) {
 }
 
 ; If the copy destination can be used as the copy source, we don't need to create a GEP instruction.
+; FIXME: We can directly remove memcpy here.
 define void @forward_offset_without_gep(ptr %src) {
 ; CHECK-LABEL: define void @forward_offset_without_gep(
 ; CHECK-SAME: ptr [[SRC:%.*]]) {
diff --git a/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll b/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll
index c7c05901455bc..a81bb91f7ede0 100644
--- a/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll
+++ b/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll
@@ -4,6 +4,7 @@
 
 %buf = type [7 x i8]
 
+; FIXME: This can be done independently in memcpyopt.
 ; Check that we eliminate all `memcpy` calls in this function.
 define void @forward_offset_and_store(ptr %dep_src) {
 ; CUSTOM-LABEL: define void @forward_offset_and_store(

>From 143c92c732baf838a4bb2291cda6c6d65eec81ce Mon Sep 17 00:00:00 2001
From: DianQK <dianqk at dianqk.net>
Date: Thu, 11 Jul 2024 21:05:25 +0800
Subject: [PATCH 07/12] Update due to rebase

---
 llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp       | 12 ++++++------
 .../Transforms/MemCpyOpt/memcpy-memcpy-offset.ll     |  3 ---
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 3bbade11c2ec2..ab01ef50c5e91 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1160,7 +1160,7 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
   }
 
   IRBuilder<> Builder(M);
-  auto *CopySource = MDep->getRawSource();
+  auto *CopySource = MDep->getSource();
   auto CleanupOnFailure = llvm::make_scope_exit([&CopySource] {
     if (CopySource->use_empty())
       cast<Instruction>(CopySource)->eraseFromParent();
@@ -1180,7 +1180,7 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
     std::optional<int64_t> MDestOffset =
         M->getRawDest()->getPointerOffsetFrom(MDep->getRawSource(), DL);
     if (MDestOffset == MForwardOffset)
-      CopySource = M->getRawDest();
+      CopySource = M->getDest();
     else
       CopySource = Builder.CreateInBoundsPtrAdd(
           CopySource, Builder.getInt64(MForwardOffset));
@@ -1204,7 +1204,7 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
     return false;
 
   // No need to create `memcpy(a <- a)`.
-  if (BAA.isMustAlias(M->getDest(), MDep->getSource())) {
+  if (BAA.isMustAlias(M->getDest(), CopySource)) {
     // Remove the instruction we're replacing.
     eraseInstruction(M);
     ++NumMemCpyInstr;
@@ -1236,18 +1236,18 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
   Instruction *NewM;
   if (UseMemMove)
     NewM =
-        Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(), CopySource,
+        Builder.CreateMemMove(M->getDest(), M->getDestAlign(), CopySource,
                               CopySourceAlign, M->getLength(), M->isVolatile());
   else if (isa<MemCpyInlineInst>(M)) {
     // llvm.memcpy may be promoted to llvm.memcpy.inline, but the converse is
     // never allowed since that would allow the latter to be lowered as a call
     // to an external function.
-    NewM = Builder.CreateMemCpyInline(M->getRawDest(), M->getDestAlign(),
+    NewM = Builder.CreateMemCpyInline(M->getDest(), M->getDestAlign(),
                                       CopySource, CopySourceAlign,
                                       M->getLength(), M->isVolatile());
   } else
     NewM =
-        Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(), CopySource,
+        Builder.CreateMemCpy(M->getDest(), M->getDestAlign(), CopySource,
                              CopySourceAlign, M->getLength(), M->isVolatile());
   NewM->copyMetadata(*M, LLVMContext::MD_DIAssignID);
 
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
index 82086eed54332..07fc6880746ed 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
@@ -58,15 +58,12 @@ define void @forward_offset_align_2(ptr %src, ptr %dest) {
 }
 
 ; If the copy destination can be used as the copy source, we don't need to create a GEP instruction.
-; FIXME: We can directly remove memcpy here.
 define void @forward_offset_without_gep(ptr %src) {
 ; CHECK-LABEL: define void @forward_offset_without_gep(
 ; CHECK-SAME: ptr [[SRC:%.*]]) {
 ; CHECK-NEXT:    [[TMP:%.*]] = alloca [9 x i8], align 1
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP]], ptr align 1 [[SRC]], i64 7, i1 false)
 ; CHECK-NEXT:    [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
-; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 1 [[TMP1]], ptr align 1 [[TMP1]], i64 6, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %cpy_tmp = alloca %buf, align 1

>From 351eb0a035cfaa44f2f5b2dc955ae38e12bc3797 Mon Sep 17 00:00:00 2001
From: DianQK <dianqk at dianqk.net>
Date: Thu, 11 Jul 2024 21:51:47 +0800
Subject: [PATCH 08/12] Update memcpy-offset.ll & Disable 98321

---
 .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 12 ++---
 .../Transforms/PhaseOrdering/memcpy-offset.ll | 47 ++++++++++---------
 2 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index ab01ef50c5e91..cacc5f1b595e3 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1204,12 +1204,12 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
     return false;
 
   // No need to create `memcpy(a <- a)`.
-  if (BAA.isMustAlias(M->getDest(), CopySource)) {
-    // Remove the instruction we're replacing.
-    eraseInstruction(M);
-    ++NumMemCpyInstr;
-    return true;
-  }
+  // if (BAA.isMustAlias(M->getDest(), CopySource)) {
+  //   // Remove the instruction we're replacing.
+  //   eraseInstruction(M);
+  //   ++NumMemCpyInstr;
+  //   return true;
+  // }
 
   // If the dest of the second might alias the source of the first, then the
   // source and dest might overlap. In addition, if the source of the first
diff --git a/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll b/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll
index a81bb91f7ede0..39d4b389891fd 100644
--- a/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll
+++ b/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll
@@ -1,33 +1,34 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt < %s -passes=memcpyopt,dse,instcombine -S -verify-memoryssa | FileCheck --check-prefix=CUSTOM %s
+; RUN: opt < %s -passes=memcpyopt,instcombine -S -verify-memoryssa | FileCheck --check-prefix=CUSTOM %s
 ; RUN: opt < %s -O2 -S | FileCheck --check-prefix=O2 %s
 
-%buf = type [7 x i8]
-
-; FIXME: This can be done independently in memcpyopt.
 ; Check that we eliminate all `memcpy` calls in this function.
-define void @forward_offset_and_store(ptr %dep_src) {
-; CUSTOM-LABEL: define void @forward_offset_and_store(
-; CUSTOM-SAME: ptr [[DEP_SRC:%.*]]) {
-; CUSTOM-NEXT:    store i8 1, ptr [[DEP_SRC]], align 1
-; CUSTOM-NEXT:    [[DEP_SRC_END:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 6
-; CUSTOM-NEXT:    store i8 1, ptr [[DEP_SRC_END]], align 1
+define void @memcpy_forward_back_with_offset(ptr %arg) {
+; CUSTOM-LABEL: define void @memcpy_forward_back_with_offset(
+; CUSTOM-SAME: ptr [[ARG:%.*]]) {
+; CUSTOM-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 1
+; CUSTOM-NEXT:    store i8 1, ptr [[ARG]], align 1
+; CUSTOM-NEXT:    [[I3:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 1
+; CUSTOM-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(753) [[I3]], ptr noundef nonnull align 1 dereferenceable(753) [[TMP1]], i64 753, i1 false)
 ; CUSTOM-NEXT:    ret void
 ;
-; O2-LABEL: define void @forward_offset_and_store(
-; O2-SAME: ptr nocapture writeonly [[DEP_SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; O2-NEXT:    store i8 1, ptr [[DEP_SRC]], align 1
-; O2-NEXT:    [[DEP_SRC_END:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 6
-; O2-NEXT:    store i8 1, ptr [[DEP_SRC_END]], align 1
+; O2-LABEL: define void @memcpy_forward_back_with_offset(
+; O2-SAME: ptr nocapture [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; O2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 1
+; O2-NEXT:    store i8 1, ptr [[ARG]], align 1
+; O2-NEXT:    [[I3:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 1
+; O2-NEXT:    tail call void @llvm.memmove.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(753) [[I3]], ptr noundef nonnull align 1 dereferenceable(753) [[TMP1]], i64 753, i1 false)
 ; O2-NEXT:    ret void
 ;
-  %dep_dest = alloca %buf, align 1
-  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false)
-  store i8 1, ptr %dep_src, align 1
-  %dep_src_end = getelementptr inbounds i8, ptr %dep_src, i64 6
-  store i8 1, ptr %dep_src_end, align 1
-  %src = getelementptr inbounds i8, ptr %dep_dest, i64 1
-  %dest = getelementptr inbounds i8, ptr %dep_src, i64 1
-  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 5, i1 false)
+  %i = alloca [753 x i8], align 1
+  %i1 = alloca [754 x i8], align 1
+  call void @llvm.memcpy.p0.p0.i64(ptr %i1, ptr %arg, i64 754, i1 false)
+  %i2 = getelementptr inbounds i8, ptr %i1, i64 1
+  call void @llvm.memcpy.p0.p0.i64(ptr %i, ptr %i2, i64 753, i1 false)
+  store i8 1, ptr %arg, align 1
+  %i3 = getelementptr inbounds i8, ptr %arg, i64 1
+  call void @llvm.memcpy.p0.p0.i64(ptr %i3, ptr %i, i64 753, i1 false)
   ret void
 }
+
+declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)

>From c197dd108b7f2feb6ae5efef43b08659fd30993b Mon Sep 17 00:00:00 2001
From: DianQK <dianqk at dianqk.net>
Date: Thu, 11 Jul 2024 21:52:27 +0800
Subject: [PATCH 09/12] Re-enable 98321

---
 llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp      | 12 ++++++------
 llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll |  8 +-------
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index cacc5f1b595e3..ab01ef50c5e91 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1204,12 +1204,12 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
     return false;
 
   // No need to create `memcpy(a <- a)`.
-  // if (BAA.isMustAlias(M->getDest(), CopySource)) {
-  //   // Remove the instruction we're replacing.
-  //   eraseInstruction(M);
-  //   ++NumMemCpyInstr;
-  //   return true;
-  // }
+  if (BAA.isMustAlias(M->getDest(), CopySource)) {
+    // Remove the instruction we're replacing.
+    eraseInstruction(M);
+    ++NumMemCpyInstr;
+    return true;
+  }
 
   // If the dest of the second might alias the source of the first, then the
   // source and dest might overlap. In addition, if the source of the first
diff --git a/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll b/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll
index 39d4b389891fd..bd910b82496fd 100644
--- a/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll
+++ b/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll
@@ -6,18 +6,12 @@
 define void @memcpy_forward_back_with_offset(ptr %arg) {
 ; CUSTOM-LABEL: define void @memcpy_forward_back_with_offset(
 ; CUSTOM-SAME: ptr [[ARG:%.*]]) {
-; CUSTOM-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 1
 ; CUSTOM-NEXT:    store i8 1, ptr [[ARG]], align 1
-; CUSTOM-NEXT:    [[I3:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 1
-; CUSTOM-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(753) [[I3]], ptr noundef nonnull align 1 dereferenceable(753) [[TMP1]], i64 753, i1 false)
 ; CUSTOM-NEXT:    ret void
 ;
 ; O2-LABEL: define void @memcpy_forward_back_with_offset(
-; O2-SAME: ptr nocapture [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; O2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 1
+; O2-SAME: ptr nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; O2-NEXT:    store i8 1, ptr [[ARG]], align 1
-; O2-NEXT:    [[I3:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 1
-; O2-NEXT:    tail call void @llvm.memmove.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(753) [[I3]], ptr noundef nonnull align 1 dereferenceable(753) [[TMP1]], i64 753, i1 false)
 ; O2-NEXT:    ret void
 ;
   %i = alloca [753 x i8], align 1

>From a43d005560d346513af013b5f23c08358595968a Mon Sep 17 00:00:00 2001
From: DianQK <dianqk at dianqk.net>
Date: Thu, 11 Jul 2024 22:02:11 +0800
Subject: [PATCH 10/12] Re-generate lifetime.ll

---
 llvm/test/Transforms/MemCpyOpt/lifetime.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/Transforms/MemCpyOpt/lifetime.ll b/llvm/test/Transforms/MemCpyOpt/lifetime.ll
index 1a07e6ce7476c..615887474aaaa 100644
--- a/llvm/test/Transforms/MemCpyOpt/lifetime.ll
+++ b/llvm/test/Transforms/MemCpyOpt/lifetime.ll
@@ -124,7 +124,7 @@ define void @call_slot_lifetime_bitcast(ptr %ptr) {
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP2]], ptr align 4 [[PTR:%.*]], i64 4, i1 false)
 ; CHECK-NEXT:    [[TMP1_CAST:%.*]] = bitcast ptr [[TMP1]] to ptr
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[TMP1_CAST]])
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP1_CAST]], ptr align 4 [[PTR]], i64 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP1]], ptr align 4 [[PTR]], i64 4, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %tmp1 = alloca i32

>From c828a2fecf6432b647c515633dd6f589aa2a27c2 Mon Sep 17 00:00:00 2001
From: DianQK <dianqk at dianqk.net>
Date: Thu, 11 Jul 2024 22:08:02 +0800
Subject: [PATCH 11/12] Add nikic's comment

---
 llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index ab01ef50c5e91..e5983e6b58e1b 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1163,6 +1163,11 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
   auto *CopySource = MDep->getSource();
   auto CleanupOnFailure = llvm::make_scope_exit([&CopySource] {
     if (CopySource->use_empty())
+      // Safety: It's safe here because we will only allocate more instructions
+      // after finishing all BatchAA queries, but we have to be careful if we
+      // want to do something like this in another place. Then we'd probably
+      // have to delay instruction removal until all transforms on an
+      // instruction finished.
       cast<Instruction>(CopySource)->eraseFromParent();
   });
   MaybeAlign CopySourceAlign = MDep->getSourceAlign();

>From ba0188dfe81a4ff8defca6c0e182cc293a441781 Mon Sep 17 00:00:00 2001
From: DianQK <dianqk at dianqk.net>
Date: Fri, 12 Jul 2024 07:49:20 +0800
Subject: [PATCH 12/12] Only delete the instruction we created

---
 llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp    | 15 +++++++++------
 .../Transforms/MemCpyOpt/memcpy-memcpy-offset.ll  |  1 +
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index e5983e6b58e1b..1c65219585e5a 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1161,14 +1161,15 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
 
   IRBuilder<> Builder(M);
   auto *CopySource = MDep->getSource();
-  auto CleanupOnFailure = llvm::make_scope_exit([&CopySource] {
-    if (CopySource->use_empty())
+  Instruction *NewCopySource = nullptr;
+  auto CleanupOnRet = llvm::make_scope_exit([&NewCopySource] {
+    if (NewCopySource && NewCopySource->use_empty())
       // Safety: It's safe here because we will only allocate more instructions
       // after finishing all BatchAA queries, but we have to be careful if we
       // want to do something like this in another place. Then we'd probably
       // have to delay instruction removal until all transforms on an
       // instruction finished.
-      cast<Instruction>(CopySource)->eraseFromParent();
+      NewCopySource->eraseFromParent();
   });
   MaybeAlign CopySourceAlign = MDep->getSourceAlign();
   // We just need to calculate the actual size of the copy.
@@ -1186,9 +1187,11 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
         M->getRawDest()->getPointerOffsetFrom(MDep->getRawSource(), DL);
     if (MDestOffset == MForwardOffset)
       CopySource = M->getDest();
-    else
-      CopySource = Builder.CreateInBoundsPtrAdd(
-          CopySource, Builder.getInt64(MForwardOffset));
+    else {
+      NewCopySource = cast<Instruction>(Builder.CreateInBoundsPtrAdd(
+          CopySource, Builder.getInt64(MForwardOffset)));
+      CopySource = NewCopySource;
+    }
     // We need to update `MCopyLoc` if an offset exists.
     MCopyLoc = MCopyLoc.getWithNewPtr(CopySource);
     if (CopySourceAlign)
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
index 07fc6880746ed..6abb0da827990 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
@@ -64,6 +64,7 @@ define void @forward_offset_without_gep(ptr %src) {
 ; CHECK-NEXT:    [[TMP:%.*]] = alloca [9 x i8], align 1
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP]], ptr align 1 [[SRC]], i64 7, i1 false)
 ; CHECK-NEXT:    [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 1
+; CHECK-NEXT:    [[DEST:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
 ; CHECK-NEXT:    ret void
 ;
   %cpy_tmp = alloca %buf, align 1



More information about the llvm-commits mailing list