[llvm] Reapply "[MemCpyOpt] support offset slices for performStackMoveOptzn and processMemCpy", with bug fixed (PR #180002)

Jameson Nash via llvm-commits llvm-commits at lists.llvm.org
Sat Feb 7 12:31:40 PST 2026


https://github.com/vtjnash updated https://github.com/llvm/llvm-project/pull/180002

>From 7b8c595b3b5b4388549d70e9dd5a087274defb41 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash at gmail.com>
Date: Thu, 5 Feb 2026 17:55:05 +0000
Subject: [PATCH 1/4] Reapply "[MemCpyOpt] support offset slices for
 performStackMoveOptzn and processMemCpy (#176436)" (#177482)

This reverts commit f8c4974963302912cd6c5c11d5097de5530c4943.
---
 .../llvm/Transforms/Scalar/MemCpyOptimizer.h  |  12 +-
 .../lib/Transforms/Scalar/MemCpyOptimizer.cpp |  78 +++---
 .../Transforms/MemCpyOpt/stack-move-offset.ll | 238 ++++++++++++++++++
 3 files changed, 295 insertions(+), 33 deletions(-)
 create mode 100644 llvm/test/Transforms/MemCpyOpt/stack-move-offset.ll

diff --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
index 473372d0edfea..b87ea0b9d243f 100644
--- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -57,11 +57,13 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
 
   LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 
+  // Glue for the old PM.
+  LLVM_ABI bool runImpl(Function &F, TargetLibraryInfo *TLI, AAResults *AA,
+                        AssumptionCache *AC, DominatorTree *DT,
+                        PostDominatorTree *PDT, MemorySSA *MSSA);
+
 private:
   // Helper functions
-  bool runImpl(Function &F, TargetLibraryInfo *TLI, AAResults *AA,
-               AssumptionCache *AC, DominatorTree *DT, PostDominatorTree *PDT,
-               MemorySSA *MSSA);
   bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
   bool processStoreOfLoad(StoreInst *SI, LoadInst *LI, const DataLayout &DL,
                           BasicBlock::iterator &BBI);
@@ -84,8 +86,8 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
                                     Value *ByteVal);
   bool moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI);
   bool performStackMoveOptzn(Instruction *Load, Instruction *Store,
-                             AllocaInst *DestAlloca, AllocaInst *SrcAlloca,
-                             TypeSize Size, BatchAAResults &BAA);
+                             Value *DestPtr, Value *SrcPtr, TypeSize Size,
+                             BatchAAResults &BAA);
   bool isMemMoveMemSetDependency(MemMoveInst *M);
 
   void eraseInstruction(Instruction *I);
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 0f75e53cb9998..0cab16be20c03 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -726,18 +726,15 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI,
   // If this is a load-store pair from a stack slot to a stack slot, we
   // might be able to perform the stack-move optimization just as we do for
   // memcpys from an alloca to an alloca.
-  if (auto *DestAlloca = dyn_cast<AllocaInst>(SI->getPointerOperand())) {
-    if (auto *SrcAlloca = dyn_cast<AllocaInst>(LI->getPointerOperand())) {
-      if (performStackMoveOptzn(LI, SI, DestAlloca, SrcAlloca,
-                                DL.getTypeStoreSize(T), BAA)) {
-        // Avoid invalidating the iterator.
-        BBI = SI->getNextNode()->getIterator();
-        eraseInstruction(SI);
-        eraseInstruction(LI);
-        ++NumMemCpyInstr;
-        return true;
-      }
-    }
+  if (performStackMoveOptzn(LI, SI, SI->getPointerOperand(),
+                            LI->getPointerOperand(), DL.getTypeStoreSize(T),
+                            BAA)) {
+    // Avoid invalidating the iterator.
+    BBI = SI->getNextNode()->getIterator();
+    eraseInstruction(SI);
+    eraseInstruction(LI);
+    ++NumMemCpyInstr;
+    return true;
   }
 
   return false;
@@ -1515,20 +1512,47 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
 // transformation only because we restrict the scope of this optimization to
 // allocas that aren't captured.
 bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
-                                          AllocaInst *DestAlloca,
-                                          AllocaInst *SrcAlloca, TypeSize Size,
-                                          BatchAAResults &BAA) {
+                                          Value *DestPtr, Value *SrcPtr,
+                                          TypeSize Size, BatchAAResults &BAA) {
   LLVM_DEBUG(dbgs() << "Stack Move: Attempting to optimize:\n"
                     << *Store << "\n");
 
+  AllocaInst *DestAlloca = dyn_cast<AllocaInst>(getUnderlyingObject(DestPtr));
+  if (!DestAlloca)
+    return false;
+
+  AllocaInst *SrcAlloca = dyn_cast<AllocaInst>(getUnderlyingObject(SrcPtr));
+  if (!SrcAlloca)
+    return false;
+
+  // Explicitly don't handle the degenerate case of a partial copy within one
+  // alloca. It would always fail the dominator check later anyway, and
+  // possibly the modref checks as well.
+  if (SrcAlloca == DestAlloca)
+    return false;
+
   // Make sure the two allocas are in the same address space.
   if (SrcAlloca->getAddressSpace() != DestAlloca->getAddressSpace()) {
     LLVM_DEBUG(dbgs() << "Stack Move: Address space mismatch\n");
     return false;
   }
 
+  if (!SrcAlloca->isStaticAlloca() || !DestAlloca->isStaticAlloca())
+    return false;
+
   // Check that copy is full with static size.
   const DataLayout &DL = DestAlloca->getDataLayout();
+
+  auto DestOffset = DestPtr->getPointerOffsetFrom(DestAlloca, DL);
+  if (!DestOffset)
+    return false;
+
+  auto SrcOffset = SrcPtr->getPointerOffsetFrom(SrcAlloca, DL);
+  if (!SrcOffset || *SrcOffset < *DestOffset || *SrcOffset < 0)
+    return false;
+  // Offset difference must preserve dest alloca's alignment.
+  if ((*SrcOffset - *DestOffset) % DestAlloca->getAlign().value() != 0)
+    return false;
   std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSize(DL);
   std::optional<TypeSize> DestSize = DestAlloca->getAllocationSize(DL);
   if (!SrcSize || !DestSize)
@@ -1536,14 +1560,12 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
   if (*SrcSize != *DestSize)
     if (!SrcSize->isFixed() || !DestSize->isFixed())
       return false;
-  if (Size != *DestSize) {
+  // Check that copy covers entirety of dest alloca.
+  if (Size != *DestSize || *DestOffset != 0) {
     LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n");
     return false;
   }
 
-  if (!SrcAlloca->isStaticAlloca() || !DestAlloca->isStaticAlloca())
-    return false;
-
   // Check if it will be legal to combine allocas without breaking dominator.
   bool MoveSrc = !DT->dominates(SrcAlloca, DestAlloca);
   if (MoveSrc) {
@@ -1707,7 +1729,13 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
   }
 
   // Merge the two allocas.
-  DestAlloca->replaceAllUsesWith(SrcAlloca);
+  Value *NewDestPtr = SrcAlloca;
+  if (*SrcOffset != *DestOffset) {
+    IRBuilder<> Builder(DestAlloca);
+    NewDestPtr = Builder.CreateInBoundsPtrAdd(
+        SrcAlloca, Builder.getInt64(*SrcOffset - *DestOffset));
+  }
+  DestAlloca->replaceAllUsesWith(NewDestPtr);
   eraseInstruction(DestAlloca);
 
   // Drop metadata on the source alloca.
@@ -1778,7 +1806,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
     return false;
 
   // If copying from a constant, try to turn the memcpy into a memset.
-  if (auto *GV = dyn_cast<GlobalVariable>(M->getSource()))
+  if (auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(M->getSource())))
     if (GV->isConstant() && GV->hasDefinitiveInitializer())
       if (Value *ByteVal = isBytewiseValue(GV->getInitializer(),
                                            M->getDataLayout())) {
@@ -1864,16 +1892,10 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
   // If the transfer is from a stack slot to a stack slot, then we may be able
   // to perform the stack-move optimization. See the comments in
   // performStackMoveOptzn() for more details.
-  auto *DestAlloca = dyn_cast<AllocaInst>(M->getDest());
-  if (!DestAlloca)
-    return false;
-  auto *SrcAlloca = dyn_cast<AllocaInst>(M->getSource());
-  if (!SrcAlloca)
-    return false;
   ConstantInt *Len = dyn_cast<ConstantInt>(M->getLength());
   if (Len == nullptr)
     return false;
-  if (performStackMoveOptzn(M, M, DestAlloca, SrcAlloca,
+  if (performStackMoveOptzn(M, M, M->getDest(), M->getSource(),
                             TypeSize::getFixed(Len->getZExtValue()), BAA)) {
     // Avoid invalidating the iterator.
     BBI = M->getNextNode()->getIterator();
diff --git a/llvm/test/Transforms/MemCpyOpt/stack-move-offset.ll b/llvm/test/Transforms/MemCpyOpt/stack-move-offset.ll
new file mode 100644
index 0000000000000..fdf5e838b20e5
--- /dev/null
+++ b/llvm/test/Transforms/MemCpyOpt/stack-move-offset.ll
@@ -0,0 +1,238 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -passes=memcpyopt -verify-memoryssa -S | FileCheck %s
+
+; Test that stack-move optimization works when src is a GEP into an alloca.
+; For the optimization to trigger:
+;   - The copy must cover the entire dest alloca (Size == DestSize, DestOffset == 0)
+;   - SrcOffset must be a multiple of DestAlloca's alignment
+;   - SrcOffset must be non-negative
+
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
+declare void @llvm.lifetime.start.p0(ptr nocapture)
+declare void @llvm.lifetime.end.p0(ptr nocapture)
+
+declare void @use_nocapture(ptr nocapture)
+
+; Basic test: memcpy from GEP(src) to dest alloca
+; src = [16 x i8], dest = [8 x i8] align 8, copy 8 bytes
+; SrcOffset(8) is a multiple of DestAlign(8), so optimization applies.
+; After optimization: dest uses become src+8
+define void @memcpy_src_gep_to_dest_alloca() {
+; CHECK-LABEL: define void @memcpy_src_gep_to_dest_alloca() {
+; CHECK-NEXT:    [[SRC:%.*]] = alloca [16 x i8], align 8
+; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 8
+; CHECK-NEXT:    [[SRC_GEP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 8
+; CHECK-NEXT:    store i64 42, ptr [[SRC_GEP1]], align 4
+; CHECK-NEXT:    call void @use_nocapture(ptr captures(none) [[SRC_GEP1]])
+; CHECK-NEXT:    call void @use_nocapture(ptr captures(none) [[SRC_GEP]])
+; CHECK-NEXT:    ret void
+;
+  %src = alloca [16 x i8], align 4
+  %dest = alloca [8 x i8], align 8
+  call void @llvm.lifetime.start.p0(ptr %src)
+  call void @llvm.lifetime.start.p0(ptr %dest)
+  %src.gep = getelementptr inbounds i8, ptr %src, i64 8
+  store i64 42, ptr %src.gep
+  call void @use_nocapture(ptr nocapture %src.gep)
+
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src.gep, i64 8, i1 false)
+
+  call void @use_nocapture(ptr nocapture %dest)
+
+  call void @llvm.lifetime.end.p0(ptr %src)
+  call void @llvm.lifetime.end.p0(ptr %dest)
+  ret void
+}
+
+; Test: memcpy from GEP(src) to dest alloca with different offset
+; src = [12 x i8], dest = [8 x i8] align 4, copy 8 bytes from src+4
+; SrcOffset(4) is a multiple of DestAlign(4), so optimization applies.
+; After optimization: dest uses become src+4
+define void @memcpy_src_gep_offset4_to_dest_alloca() {
+; CHECK-LABEL: define void @memcpy_src_gep_offset4_to_dest_alloca() {
+; CHECK-NEXT:    [[SRC:%.*]] = alloca [12 x i8], align 4
+; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
+; CHECK-NEXT:    [[SRC_GEP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
+; CHECK-NEXT:    store i64 42, ptr [[SRC_GEP1]], align 4
+; CHECK-NEXT:    call void @use_nocapture(ptr captures(none) [[SRC_GEP1]])
+; CHECK-NEXT:    call void @use_nocapture(ptr captures(none) [[SRC_GEP]])
+; CHECK-NEXT:    ret void
+;
+  %src = alloca [12 x i8], align 4
+  %dest = alloca [8 x i8], align 4
+  call void @llvm.lifetime.start.p0(ptr %src)
+  call void @llvm.lifetime.start.p0(ptr %dest)
+  %src.gep = getelementptr inbounds i8, ptr %src, i64 4
+  store i64 42, ptr %src.gep
+  call void @use_nocapture(ptr nocapture %src.gep)
+
+  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src.gep, i64 8, i1 false)
+
+  call void @use_nocapture(ptr nocapture %dest)
+
+  call void @llvm.lifetime.end.p0(ptr %src)
+  call void @llvm.lifetime.end.p0(ptr %dest)
+  ret void
+}
+
+; Test: load/store from GEP(src) to dest alloca
+; src = [8 x i8], dest = [4 x i8] align 4, load/store 4 bytes from src+4
+; SrcOffset(4) is a multiple of DestAlign(4), so optimization applies.
+define void @load_store_src_gep_to_dest_alloca() {
+; CHECK-LABEL: define void @load_store_src_gep_to_dest_alloca() {
+; CHECK-NEXT:    [[SRC:%.*]] = alloca [8 x i8], align 4
+; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
+; CHECK-NEXT:    [[SRC_GEP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
+; CHECK-NEXT:    store i32 42, ptr [[SRC_GEP1]], align 4
+; CHECK-NEXT:    call void @use_nocapture(ptr captures(none) [[SRC_GEP1]])
+; CHECK-NEXT:    call void @use_nocapture(ptr captures(none) [[SRC_GEP]])
+; CHECK-NEXT:    ret void
+;
+  %src = alloca [8 x i8], align 4
+  %dest = alloca [4 x i8], align 4
+  call void @llvm.lifetime.start.p0(ptr %src)
+  call void @llvm.lifetime.start.p0(ptr %dest)
+  %src.gep = getelementptr inbounds i8, ptr %src, i64 4
+  store i32 42, ptr %src.gep
+  call void @use_nocapture(ptr nocapture %src.gep)
+
+  %val = load i32, ptr %src.gep
+  store i32 %val, ptr %dest
+
+  call void @use_nocapture(ptr nocapture %dest)
+
+  call void @llvm.lifetime.end.p0(ptr %src)
+  call void @llvm.lifetime.end.p0(ptr %dest)
+  ret void
+}
+
+; Test: both src and dest are direct allocas (no offset), same size
+; This is the basic stack-move case, included here for completeness.
+define void @memcpy_both_direct_allocas() {
+; CHECK-LABEL: define void @memcpy_both_direct_allocas() {
+; CHECK-NEXT:    [[SRC:%.*]] = alloca [8 x i8], align 8
+; CHECK-NEXT:    store i64 42, ptr [[SRC]], align 4
+; CHECK-NEXT:    call void @use_nocapture(ptr captures(none) [[SRC]])
+; CHECK-NEXT:    call void @use_nocapture(ptr captures(none) [[SRC]])
+; CHECK-NEXT:    ret void
+;
+  %src = alloca [8 x i8], align 4
+  %dest = alloca [8 x i8], align 8
+  call void @llvm.lifetime.start.p0(ptr %src)
+  call void @llvm.lifetime.start.p0(ptr %dest)
+  store i64 42, ptr %src
+  call void @use_nocapture(ptr nocapture %src)
+
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 4 %src, i64 8, i1 false)
+
+  call void @use_nocapture(ptr nocapture %dest)
+
+  call void @llvm.lifetime.end.p0(ptr %src)
+  call void @llvm.lifetime.end.p0(ptr %dest)
+  ret void
+}
+
+; Negative test: dest has offset (dest is GEP, not direct alloca)
+; The optimization requires DestOffset == 0.
+define void @no_optimize_dest_has_offset() {
+; CHECK-LABEL: define void @no_optimize_dest_has_offset() {
+; CHECK-NEXT:    [[SRC:%.*]] = alloca [16 x i8], align 4
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [16 x i8], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr [[SRC]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr [[DEST]])
+; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 8
+; CHECK-NEXT:    [[DEST_GEP:%.*]] = getelementptr inbounds i8, ptr [[DEST]], i64 8
+; CHECK-NEXT:    store i64 42, ptr [[SRC_GEP]], align 4
+; CHECK-NEXT:    call void @use_nocapture(ptr captures(none) [[SRC_GEP]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST_GEP]], ptr align 4 [[SRC_GEP]], i64 8, i1 false)
+; CHECK-NEXT:    call void @use_nocapture(ptr captures(none) [[DEST_GEP]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(ptr [[SRC]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(ptr [[DEST]])
+; CHECK-NEXT:    ret void
+;
+  %src = alloca [16 x i8], align 4
+  %dest = alloca [16 x i8], align 8
+  call void @llvm.lifetime.start.p0(ptr %src)
+  call void @llvm.lifetime.start.p0(ptr %dest)
+  %src.gep = getelementptr inbounds i8, ptr %src, i64 8
+  %dest.gep = getelementptr inbounds i8, ptr %dest, i64 8
+  store i64 42, ptr %src.gep
+  call void @use_nocapture(ptr nocapture %src.gep)
+
+  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest.gep, ptr align 4 %src.gep, i64 8, i1 false)
+
+  call void @use_nocapture(ptr nocapture %dest.gep)
+
+  call void @llvm.lifetime.end.p0(ptr %src)
+  call void @llvm.lifetime.end.p0(ptr %dest)
+  ret void
+}
+
+; Negative test: copy doesn't cover entire dest alloca (Size != DestSize)
+; src = [12 x i8], dest = [16 x i8], copy only 8 bytes
+define void @no_optimize_partial_dest_copy() {
+; CHECK-LABEL: define void @no_optimize_partial_dest_copy() {
+; CHECK-NEXT:    [[SRC:%.*]] = alloca [12 x i8], align 4
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [16 x i8], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr [[SRC]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr [[DEST]])
+; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
+; CHECK-NEXT:    store i64 42, ptr [[SRC_GEP]], align 4
+; CHECK-NEXT:    call void @use_nocapture(ptr captures(none) [[SRC_GEP]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC_GEP]], i64 8, i1 false)
+; CHECK-NEXT:    call void @use_nocapture(ptr captures(none) [[DEST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(ptr [[SRC]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(ptr [[DEST]])
+; CHECK-NEXT:    ret void
+;
+  %src = alloca [12 x i8], align 4
+  %dest = alloca [16 x i8], align 4
+  call void @llvm.lifetime.start.p0(ptr %src)
+  call void @llvm.lifetime.start.p0(ptr %dest)
+  %src.gep = getelementptr inbounds i8, ptr %src, i64 4
+  store i64 42, ptr %src.gep
+  call void @use_nocapture(ptr nocapture %src.gep)
+
+  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src.gep, i64 8, i1 false)
+
+  call void @use_nocapture(ptr nocapture %dest)
+
+  call void @llvm.lifetime.end.p0(ptr %src)
+  call void @llvm.lifetime.end.p0(ptr %dest)
+  ret void
+}
+
+; Negative test: SrcOffset not a multiple of DestAlign
+; src = [12 x i8] with offset 4, dest = [8 x i8] align 8
+; SrcOffset(4) % DestAlign(8) = 4 != 0 -> rejected
+define void @no_optimize_alignment_mismatch() {
+; CHECK-LABEL: define void @no_optimize_alignment_mismatch() {
+; CHECK-NEXT:    [[SRC:%.*]] = alloca [12 x i8], align 4
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [8 x i8], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr [[SRC]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr [[DEST]])
+; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
+; CHECK-NEXT:    store i64 42, ptr [[SRC_GEP]], align 4
+; CHECK-NEXT:    call void @use_nocapture(ptr captures(none) [[SRC_GEP]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DEST]], ptr align 4 [[SRC_GEP]], i64 8, i1 false)
+; CHECK-NEXT:    call void @use_nocapture(ptr captures(none) [[DEST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(ptr [[SRC]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(ptr [[DEST]])
+; CHECK-NEXT:    ret void
+;
+  %src = alloca [12 x i8], align 4
+  %dest = alloca [8 x i8], align 8
+  call void @llvm.lifetime.start.p0(ptr %src)
+  call void @llvm.lifetime.start.p0(ptr %dest)
+  %src.gep = getelementptr inbounds i8, ptr %src, i64 4
+  store i64 42, ptr %src.gep
+  call void @use_nocapture(ptr nocapture %src.gep)
+
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 4 %src.gep, i64 8, i1 false)
+
+  call void @use_nocapture(ptr nocapture %dest)
+
+  call void @llvm.lifetime.end.p0(ptr %src)
+  call void @llvm.lifetime.end.p0(ptr %dest)
+  ret void
+}

>From 23553e741295ed844e09343a8cbc3da6ce104f8e Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash at gmail.com>
Date: Thu, 5 Feb 2026 18:09:05 +0000
Subject: [PATCH 2/4] add broken test

---
 .../Transforms/MemCpyOpt/stack-move-offset.ll | 47 +++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/llvm/test/Transforms/MemCpyOpt/stack-move-offset.ll b/llvm/test/Transforms/MemCpyOpt/stack-move-offset.ll
index fdf5e838b20e5..c232c939c8d2d 100644
--- a/llvm/test/Transforms/MemCpyOpt/stack-move-offset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/stack-move-offset.ll
@@ -236,3 +236,50 @@ define void @no_optimize_alignment_mismatch() {
   call void @llvm.lifetime.end.p0(ptr %dest)
   ret void
 }
+
+; Negative test: Clobbering store to source offset between memcpy from source and memcpy to dest
+; This is a minimal reproducer for the bug in LLVM PR #176436.
+; The bug: MemCpyOpt incorrectly eliminated temp2 without accounting for the store
+; that clobbers byte 0 of local_buf before the final memcpy.
+; The fix ensures that when checking for clobbering of the source, we check the
+; actual range being copied (SrcPtr with Size) rather than the entire SrcAlloca.
+define void @no_optimize_clobbering_store_to_src_offset(ptr noalias %dst) {
+; CHECK-LABEL: define void @no_optimize_clobbering_store_to_src_offset
+; CHECK-SAME: (ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:    [[LOCAL:%.*]] = alloca { [48 x i8], { i64, i64 }, ptr }, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[LOCAL]], i64 48
+; CHECK-NEXT:    [[TEMP1:%.*]] = alloca { i64, i64 }, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr [[TEMP1]])
+; CHECK-NEXT:    [[DST_BUF:%.*]] = getelementptr i8, ptr [[DST]], i64 48
+; CHECK-NEXT:    [[LOCAL_BUF:%.*]] = getelementptr inbounds i8, ptr [[LOCAL]], i64 48
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[LOCAL_BUF]], ptr align 8 [[DST_BUF]], i64 16, i1 false)
+; CHECK-NEXT:    store i8 0, ptr [[DST_BUF]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(ptr [[TEMP1]])
+; CHECK-NEXT:    store i8 0, ptr [[LOCAL_BUF]], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST_BUF]], ptr align 8 [[TMP1]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %temp2 = alloca { i64, i64 }, align 8
+  %temp1 = alloca { i64, i64 }, align 8
+  %local = alloca { [48 x i8], { i64, i64 }, ptr }, align 8
+
+  ; First move: copy from dst+48 to local+48 via temp1
+  call void @llvm.lifetime.start.p0(ptr %temp1)
+  %dst_buf = getelementptr i8, ptr %dst, i64 48
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %temp1, ptr align 8 %dst_buf, i64 16, i1 false)
+  store i8 0, ptr %dst_buf, align 1
+  %local_buf = getelementptr inbounds i8, ptr %local, i64 48
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %local_buf, ptr align 8 %temp1, i64 16, i1 false)
+  call void @llvm.lifetime.end.p0(ptr %temp1)
+
+  ; Second move: copy from local+48 back to dst+48 via temp2
+  ; BUG: PR incorrectly eliminated temp2 but the store below clobbers byte 0 first!
+  ; The fix ensures we check SrcPtr (local_buf) not SrcAlloca (local) for clobbering.
+  call void @llvm.lifetime.start.p0(ptr %temp2)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %temp2, ptr align 8 %local_buf, i64 16, i1 false)
+  store i8 0, ptr %local_buf, align 1   ; <-- clobbers byte 0
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst_buf, ptr align 8 %temp2, i64 16, i1 false)
+  call void @llvm.lifetime.end.p0(ptr %temp2)
+
+  ret void
+}

>From b328618d5df52f82db466531155097a30fe57845 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash at gmail.com>
Date: Thu, 5 Feb 2026 18:13:43 +0000
Subject: [PATCH 3/4] [MemCpyOpt] Fix clobber checking for stack-move with
 source offsets

When checking for clobbering stores in performStackMoveOptzn, use SrcPtr
instead of SrcAlloca for the MemoryLocation. When SrcPtr is offset into
the alloca but DestPtr is not, we need to check the actual copied region
[SrcPtr, SrcPtr+Size) rather than [SrcAlloca, SrcAlloca+Size) for ModRef.
This check will become more complex once DestOffset can be non-zero or
the copy no longer has to cover the whole dest alloca, as noted in the
updated code comment.

Fixes a miscompilation where clobbering stores to the offset source
region were not detected, causing incorrect elimination of temporary
buffers.

Fixes #177185
Reverts #177482
Relands #176436
---
 llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp    | 15 ++++++++++-----
 .../Transforms/MemCpyOpt/stack-move-offset.ll     |  9 ++++++---
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 0cab16be20c03..f490367e36736 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1632,10 +1632,10 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
     return true;
   };
 
-  // Check that dest has no Mod/Ref, from the alloca to the Store. And collect
-  // modref inst for the reachability check.
+  // Check that dest alloca has no Mod/Ref, from the alloca to the Store. And
+  // collect modref inst for the reachability check.
   ModRefInfo DestModRef = ModRefInfo::NoModRef;
-  MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Size));
+  MemoryLocation DestLoc(DestAlloca, LocationSize::precise(*DestSize));
   SmallVector<BasicBlock *, 8> ReachabilityWorklist;
   auto DestModRefCallback = [&](Instruction *UI) -> bool {
     // We don't care about the store itself.
@@ -1684,8 +1684,13 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
 
   // Check that, from after the Load to the end of the BB,
   //   - if the dest has any Mod, src has no Ref, and
-  //   - if the dest has any Ref, src has no Mod except full-sized lifetimes.
-  MemoryLocation SrcLoc(SrcAlloca, LocationSize::precise(Size));
+  //   - if the dest has any Ref, src has no Mod except full-sized lifetimes
+  //   - from SrcPtr minus DestOffset to min(DestSize, SrcSize minus SrcOffset)
+  //   - where DestOffset and DestSize could be computed by DestModRefCallback
+  //     to be the bounds of the first and last mod region, which is at least
+  //     DestOffset to DestSize.
+  // Currently DestOffset==0 and DestSize==Size, so this math is simplified.
+  MemoryLocation SrcLoc(SrcPtr, LocationSize::precise(Size));
 
   auto SrcModRefCallback = [&](Instruction *UI) -> bool {
     // Any ModRef post-dominated by Load doesn't matter, also Load and Store
diff --git a/llvm/test/Transforms/MemCpyOpt/stack-move-offset.ll b/llvm/test/Transforms/MemCpyOpt/stack-move-offset.ll
index c232c939c8d2d..da96eca42fadb 100644
--- a/llvm/test/Transforms/MemCpyOpt/stack-move-offset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/stack-move-offset.ll
@@ -246,17 +246,20 @@ define void @no_optimize_alignment_mismatch() {
 define void @no_optimize_clobbering_store_to_src_offset(ptr noalias %dst) {
 ; CHECK-LABEL: define void @no_optimize_clobbering_store_to_src_offset
 ; CHECK-SAME: (ptr noalias [[DST:%.*]]) {
-; CHECK-NEXT:    [[LOCAL:%.*]] = alloca { [48 x i8], { i64, i64 }, ptr }, align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[LOCAL]], i64 48
+; CHECK-NEXT:    [[TEMP2:%.*]] = alloca { i64, i64 }, align 8
 ; CHECK-NEXT:    [[TEMP1:%.*]] = alloca { i64, i64 }, align 8
+; CHECK-NEXT:    [[LOCAL:%.*]] = alloca { [48 x i8], { i64, i64 }, ptr }, align 8
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr [[TEMP1]])
 ; CHECK-NEXT:    [[DST_BUF:%.*]] = getelementptr i8, ptr [[DST]], i64 48
 ; CHECK-NEXT:    [[LOCAL_BUF:%.*]] = getelementptr inbounds i8, ptr [[LOCAL]], i64 48
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[LOCAL_BUF]], ptr align 8 [[DST_BUF]], i64 16, i1 false)
 ; CHECK-NEXT:    store i8 0, ptr [[DST_BUF]], align 1
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(ptr [[TEMP1]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr [[TEMP2]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TEMP2]], ptr align 8 [[LOCAL_BUF]], i64 16, i1 false)
 ; CHECK-NEXT:    store i8 0, ptr [[LOCAL_BUF]], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST_BUF]], ptr align 8 [[TMP1]], i64 16, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST_BUF]], ptr align 8 [[TEMP2]], i64 16, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(ptr [[TEMP2]])
 ; CHECK-NEXT:    ret void
 ;
   %temp2 = alloca { i64, i64 }, align 8

>From d95e50a786a9b73e2bfb8ab8ee859bfd22f28284 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash at gmail.com>
Date: Sat, 7 Feb 2026 20:31:21 +0000
Subject: [PATCH 4/4] fixup! [MemCpyOpt] Fix clobber checking for stack-move
 with source offsets

---
 llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp      | 13 ++++++++-----
 llvm/test/Transforms/MemCpyOpt/stack-move-offset.ll |  8 ++------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index f490367e36736..f101b5355d3d8 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1685,11 +1685,14 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
   // Check that, from after the Load to the end of the BB,
   //   - if the dest has any Mod, src has no Ref, and
   //   - if the dest has any Ref, src has no Mod except full-sized lifetimes
-  //   - from SrcPtr minus DestOffset to min(DestSize, SrcSize minus SrcOffset)
-  //   - where DestOffset and DestSize could be computed by DestModRefCallback
-  //     to be the bounds of the first and last mod region, which is at least
-  //     DestOffset to DestSize.
-  // Currently DestOffset==0 and DestSize==Size, so this math is simplified.
+  // Where:
+  //   - src is defined as the memory from max(SrcAlloca, SrcPtr minus
+  //     dest_offset) to min(dest_size, SrcSize minus SrcOffset).
+  //   - dest_offset and dest_size could be computed by DestModRefCallback
+  //     as the bounds of the first and last mod region; that range is at
+  //     least as large as DestOffset to DestSize, and at most as large as
+  //     SrcAlloca to SrcSize.
+  //   - Currently DestOffset==0 and DestSize==Size, so this math is simplified.
   MemoryLocation SrcLoc(SrcPtr, LocationSize::precise(Size));
 
   auto SrcModRefCallback = [&](Instruction *UI) -> bool {
diff --git a/llvm/test/Transforms/MemCpyOpt/stack-move-offset.ll b/llvm/test/Transforms/MemCpyOpt/stack-move-offset.ll
index da96eca42fadb..9d877aacde90e 100644
--- a/llvm/test/Transforms/MemCpyOpt/stack-move-offset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/stack-move-offset.ll
@@ -239,10 +239,6 @@ define void @no_optimize_alignment_mismatch() {
 
 ; Negative test: Clobbering store to source offset between memcpy from source and memcpy to dest
 ; This is a minimal reproducer for the bug in LLVM PR #176436.
-; The bug: MemCpyOpt incorrectly eliminated temp2 without accounting for the store
-; that clobbers byte 0 of local_buf before the final memcpy.
-; The fix ensures that when checking for clobbering of the source, we check the
-; actual range being copied (SrcPtr with Size) rather than the entire SrcAlloca.
 define void @no_optimize_clobbering_store_to_src_offset(ptr noalias %dst) {
 ; CHECK-LABEL: define void @no_optimize_clobbering_store_to_src_offset
 ; CHECK-SAME: (ptr noalias [[DST:%.*]]) {
@@ -276,8 +272,8 @@ define void @no_optimize_clobbering_store_to_src_offset(ptr noalias %dst) {
   call void @llvm.lifetime.end.p0(ptr %temp1)
 
   ; Second move: copy from local+48 back to dst+48 via temp2
-  ; BUG: PR incorrectly eliminated temp2 but the store below clobbers byte 0 first!
-  ; The fix ensures we check SrcPtr (local_buf) not SrcAlloca (local) for clobbering.
+  ; BUG: PR #176436 incorrectly eliminated temp2, but the store below clobbers part of local_buf first!
+  ; The fix ensures we check the right portion of SrcAlloca for any clobbering.
   call void @llvm.lifetime.start.p0(ptr %temp2)
   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %temp2, ptr align 8 %local_buf, i64 16, i1 false)
   store i8 0, ptr %local_buf, align 1   ; <-- clobbers byte 0



More information about the llvm-commits mailing list