[llvm] [MemCpyOpt] allow more memcpy-to-memcpy optimization (PR #150792)

Jameson Nash via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 20 08:28:56 PDT 2025


https://github.com/vtjnash updated https://github.com/llvm/llvm-project/pull/150792

>From 11a1a3672c3ef02d3b82358e76ae8be5d2de6e99 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash at gmail.com>
Date: Wed, 11 Jun 2025 20:17:18 +0000
Subject: [PATCH 1/3] [MemCpyOpt] allow more memcpy-to-memcpy optimization

Allow the memcpy-to-memcpy optimization even when the sizes are not
identical. For example, the copy might have been generated as a small
slice of a larger struct (currently only at offset zero), or it might
store to only part of an oversized alloca.
---
 .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 118 ++++++++++++---
 .../Transforms/MemCpyOpt/preserve-memssa.ll   |   5 +-
 llvm/test/Transforms/MemCpyOpt/stack-move.ll  | 138 ++++++++++--------
 .../test/Transforms/MemCpyOpt/stackrestore.ll |   5 +-
 4 files changed, 178 insertions(+), 88 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index e043d072a7638..5442a12c8dcaf 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstSimplifyFolder.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/MemoryLocation.h"
@@ -1415,6 +1416,30 @@ static bool overreadUndefContents(MemorySSA *MSSA, MemCpyInst *MemCpy,
   return false;
 }
 
+// If only the MemSrc instruction is known, a similar but slightly weaker
+// analysis can apply
+static bool allOverreadUndefContents(MemorySSA *MSSA, Instruction *Store,
+                                     BatchAAResults &BAA) {
+  MemoryLocation Loc;
+  Value *Ptr;
+  if (auto SI = dyn_cast<StoreInst>(Store)) {
+    Loc = MemoryLocation::get(SI);
+    Ptr = SI->getPointerOperand();
+  } else if (auto MI = dyn_cast<MemCpyInst>(Store)) {
+    Loc = MemoryLocation::getForDest(MI);
+    Ptr = MI->getDest();
+  } else {
+    llvm_unreachable("performStackMoveOptzn must have a known store kind");
+  }
+  MemoryUseOrDef *MemAccess = MSSA->getMemoryAccess(Store);
+  MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
+      MemAccess->getDefiningAccess(), Loc, BAA);
+  if (auto *MD = dyn_cast<MemoryDef>(Clobber))
+    if (hasUndefContents(MSSA, BAA, Ptr, MD))
+      return true;
+  return false;
+}
+
 /// Transform memcpy to memset when its source was just memset.
 /// In other words, turn:
 /// \code
@@ -1508,21 +1533,43 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
     return false;
   }
 
-  // Check that copy is full with static size.
-  const DataLayout &DL = DestAlloca->getDataLayout();
-  std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSize(DL);
-  if (!SrcSize || Size != *SrcSize) {
-    LLVM_DEBUG(dbgs() << "Stack Move: Source alloca size mismatch\n");
-    return false;
-  }
-  std::optional<TypeSize> DestSize = DestAlloca->getAllocationSize(DL);
-  if (!DestSize || Size != *DestSize) {
-    LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n");
+  if (SrcAlloca->isUsedWithInAlloca() || DestAlloca->isUsedWithInAlloca())
     return false;
-  }
 
-  if (!SrcAlloca->isStaticAlloca() || !DestAlloca->isStaticAlloca())
-    return false;
+  Type *SrcType = SrcAlloca->getAllocatedType();
+  Type *DestType = DestAlloca->getAllocatedType();
+  // If they don't have common type, then they will need to be converted to a
+  // common size at runtime
+  const auto &DL = SrcAlloca->getDataLayout();
+  TypeSize SrcSize = DL.getTypeAllocSize(SrcType);
+  TypeSize DestSize = DL.getTypeAllocSize(DestType);
+  if (SrcType != DestType)
+    if (SrcSize != DestSize)
+      if (!SrcSize.isFixed() || !DestSize.isFixed())
+        return false;
+
+  // Check that copy is full with dest size, either because it wrote every byte,
+  // or it was fresh.
+  std::optional<TypeSize> FullSize = DestAlloca->getAllocationSize(DL);
+  if (!FullSize || Size != *FullSize)
+    if (!allOverreadUndefContents(MSSA, Store, BAA)) {
+      LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n");
+      return false;
+    }
+
+  // Check if it will be legal to combine allocas without breaking dominator.
+  // TODO: Try to hoist the arguments (recursively) instead of giving up
+  // immediately.
+  bool MoveSrc = !DT->dominates(SrcAlloca, DestAlloca);
+  if (MoveSrc) {
+    if (!DT->dominates(DestAlloca, SrcAlloca))
+      return false;
+    if (!DT->dominates(SrcAlloca->getArraySize(), DestAlloca))
+      return false;
+  } else {
+    if (!DT->dominates(DestAlloca->getArraySize(), SrcAlloca))
+      return false;
+  }
 
   // Check that src and dest are never captured, unescaped allocas. Also
   // find the nearest common dominator and postdominator for all users in
@@ -1531,7 +1578,6 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
 
   SmallVector<Instruction *, 4> LifetimeMarkers;
   SmallPtrSet<Instruction *, 4> AAMetadataInstrs;
-  bool SrcNotDom = false;
 
   auto CaptureTrackingWithModRef =
       [&](Instruction *AI,
@@ -1545,10 +1591,6 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
       Instruction *I = Worklist.pop_back_val();
       for (const Use &U : I->uses()) {
         auto *UI = cast<Instruction>(U.getUser());
-        // If any use that isn't dominated by SrcAlloca exists, we move src
-        // alloca to the entry before the transformation.
-        if (!DT->dominates(SrcAlloca, UI))
-          SrcNotDom = true;
 
         if (Visited.size() >= MaxUsesToExplore) {
           LLVM_DEBUG(
@@ -1656,15 +1698,43 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
   if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback))
     return false;
 
-  // We can do the transformation. First, move the SrcAlloca to the start of the
-  // BB.
-  if (SrcNotDom)
-    SrcAlloca->moveBefore(*SrcAlloca->getParent(),
-                          SrcAlloca->getParent()->getFirstInsertionPt());
+  // We can now do the transformation. First move the Src if it was after Dest.
+  if (MoveSrc)
+    SrcAlloca->moveBefore(DestAlloca->getIterator());
+
   // Align the allocas appropriately.
   SrcAlloca->setAlignment(
       std::max(SrcAlloca->getAlign(), DestAlloca->getAlign()));
 
+  // Size the allocas appropriately.
+  Value *SrcArraySize = SrcAlloca->getArraySize();
+  Value *DestArraySize = DestAlloca->getArraySize();
+  IRBuilder<InstSimplifyFolder> Builder(SrcAlloca->getContext(),
+                                        InstSimplifyFolder(DL));
+  Builder.SetInsertPoint(SrcAlloca);
+  Type *Int32Ty = Builder.getInt32Ty();
+  if (SrcType != DestType && SrcSize != DestSize) {
+    SrcAlloca->setAllocatedType(Type::getInt8Ty(Load->getContext()));
+    if (SrcArraySize->getType() != Int32Ty)
+      SrcArraySize = Builder.CreateZExtOrTrunc(SrcArraySize, Int32Ty);
+    if (DestArraySize->getType() != Int32Ty)
+      DestArraySize = Builder.CreateZExtOrTrunc(DestArraySize, Int32Ty);
+    SrcArraySize = Builder.CreateMul(
+        SrcArraySize, ConstantInt::get(Int32Ty, SrcSize.getFixedValue()), "",
+        true, true);
+    DestArraySize = Builder.CreateMul(
+        DestArraySize, ConstantInt::get(Int32Ty, DestSize.getFixedValue()), "",
+        true, true);
+  }
+  if (SrcArraySize != DestArraySize) {
+    if (SrcArraySize->getType() != DestArraySize->getType()) {
+      SrcArraySize = Builder.CreateZExtOrTrunc(SrcArraySize, Int32Ty);
+      DestArraySize = Builder.CreateZExtOrTrunc(DestArraySize, Int32Ty);
+    }
+    SrcAlloca->setOperand(0, Builder.CreateBinaryIntrinsic(
+                                 Intrinsic::umax, SrcArraySize, DestArraySize));
+  }
+
   // Merge the two allocas.
   DestAlloca->replaceAllUsesWith(SrcAlloca);
   eraseInstruction(DestAlloca);
@@ -1692,7 +1762,7 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
     I->setMetadata(LLVMContext::MD_tbaa_struct, nullptr);
   }
 
-  LLVM_DEBUG(dbgs() << "Stack Move: Performed staack-move optimization\n");
+  LLVM_DEBUG(dbgs() << "Stack Move: Performed stack-move optimization\n");
   NumStackMove++;
   return true;
 }
diff --git a/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll b/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll
index e1a6c3f00c05d..da2cca91d34e0 100644
--- a/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll
+++ b/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll
@@ -76,12 +76,9 @@ declare void @decompose(ptr nocapture)
 define void @test5(ptr %ptr) {
 ; CHECK-LABEL: @test5(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[EARLY_DATA:%.*]] = alloca [128 x i8], align 8
-; CHECK-NEXT:    [[TMP:%.*]] = alloca [[T:%.*]], align 8
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr [[EARLY_DATA]])
+; CHECK-NEXT:    [[TMP:%.*]] = alloca i8, i32 8224, align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[PTR:%.*]], align 8
 ; CHECK-NEXT:    call fastcc void @decompose(ptr [[TMP]])
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[EARLY_DATA]], ptr [[TMP]], i64 32, i1 false)
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/MemCpyOpt/stack-move.ll b/llvm/test/Transforms/MemCpyOpt/stack-move.ll
index 940e30ec46881..a1648e2b84002 100644
--- a/llvm/test/Transforms/MemCpyOpt/stack-move.ll
+++ b/llvm/test/Transforms/MemCpyOpt/stack-move.ll
@@ -1023,22 +1023,14 @@ bb2:
 }
 
 
-; Optimization failures follow:
-
 ; Tests that a memcpy that doesn't completely overwrite a stack value is a use
 ; for the purposes of liveness analysis, not a definition.
 define void @incomplete_memcpy() {
 ; CHECK-LABEL: define void @incomplete_memcpy() {
-; CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
-; CHECK-NEXT:    [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr captures(none) [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr captures(none) [[DEST]])
-; CHECK-NEXT:    store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr noundef captures(none) [[SRC]])
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 11, i1 false)
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
+; CHECK-NEXT:    store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[DEST]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr noundef captures(none) [[DEST]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr noundef captures(none) [[DEST]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(ptr captures(none) [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(ptr captures(none) [[DEST]])
 ; CHECK-NEXT:    ret void
 ;
   %src = alloca %struct.Foo, align 4
@@ -1058,17 +1050,10 @@ define void @incomplete_memcpy() {
 ; for the purposes of liveness analysis, not a definition.
 define void @incomplete_store() {
 ; CHECK-LABEL: define void @incomplete_store() {
-; CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
-; CHECK-NEXT:    [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr captures(none) [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr captures(none) [[DEST]])
-; CHECK-NEXT:    store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr noundef captures(none) [[SRC]])
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[SRC]], align 4
-; CHECK-NEXT:    store i32 [[TMP2]], ptr [[DEST]], align 4
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
+; CHECK-NEXT:    store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[DEST]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr noundef captures(none) [[DEST]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @use_nocapture(ptr noundef captures(none) [[DEST]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(ptr captures(none) [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(ptr captures(none) [[DEST]])
 ; CHECK-NEXT:    ret void
 ;
   %src = alloca %struct.Foo, align 4
@@ -1085,20 +1070,86 @@ define void @incomplete_store() {
   ret void
 }
 
+; Tests merging allocas with different sizes
+define void @mismatched_alloca_size() {
+; CHECK-LABEL: define void @mismatched_alloca_size() {
+; CHECK-NEXT:    [[SRC:%.*]] = alloca i8, i64 24, align 4
+; CHECK-NEXT:    store [[STRUCT_FOO:%.*]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
+; CHECK-NEXT:    ret void
+;
+  %src = alloca i8, i64 24, align 4
+  %dest = alloca i8, i64 12, align 4
+  call void @llvm.lifetime.start.p0(ptr nocapture %src)
+  call void @llvm.lifetime.start.p0(ptr nocapture %dest)
+  store %struct.Foo { i32 10, i32 20, i32 30 }, ptr %src
+  %1 = call i32 @use_nocapture(ptr nocapture %src)
+
+  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false)
+
+  %2 = call i32 @use_nocapture(ptr nocapture %dest)
+  call void @llvm.lifetime.end.p0(ptr nocapture %src)
+  call void @llvm.lifetime.end.p0(ptr nocapture %dest)
+  ret void
+}
+
+; Tests merging allocas with different types
+define void @mismatched_alloca_type() {
+; CHECK-LABEL: define void @mismatched_alloca_type() {
+; CHECK-NEXT:    [[SRC:%.*]] = alloca i8, i64 6, align 4
+; CHECK-NEXT:    store [[STRUCT_FOO:%.*]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
+; CHECK-NEXT:    ret void
+;
+  %src = alloca i16, i64 6, align 4
+  %dest = alloca i8, i64 12, align 4
+  call void @llvm.lifetime.start.p0(ptr nocapture %src)
+  call void @llvm.lifetime.start.p0(ptr nocapture %dest)
+  store %struct.Foo { i32 10, i32 20, i32 30 }, ptr %src
+  %1 = call i32 @use_nocapture(ptr nocapture %src)
+
+  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false)
+
+  %2 = call i32 @use_nocapture(ptr nocapture %dest)
+  call void @llvm.lifetime.end.p0(ptr nocapture %src)
+  call void @llvm.lifetime.end.p0(ptr nocapture %dest)
+  ret void
+}
+
+; Tests merging allocas with different types and sizes
+define void @mismatched_alloca_type_size() {
+; CHECK-LABEL: define void @mismatched_alloca_type_size() {
+; CHECK-NEXT:    [[SRC:%.*]] = alloca i8, i32 24, align 4
+; CHECK-NEXT:    store [[STRUCT_FOO:%.*]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
+; CHECK-NEXT:    ret void
+;
+  %src = alloca i16, i64 12, align 4
+  %dest = alloca i8, i64 12, align 4
+  call void @llvm.lifetime.start.p0(ptr nocapture %src)
+  call void @llvm.lifetime.start.p0(ptr nocapture %dest)
+  store %struct.Foo { i32 10, i32 20, i32 30 }, ptr %src
+  %1 = call i32 @use_nocapture(ptr nocapture %src)
+
+  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false)
+
+  %2 = call i32 @use_nocapture(ptr nocapture %dest)
+  call void @llvm.lifetime.end.p0(ptr nocapture %src)
+  call void @llvm.lifetime.end.p0(ptr nocapture %dest)
+  ret void
+}
+
 ; Tests that dynamically-sized allocas are never merged.
 define void @dynamically_sized_alloca(i64 %i) {
 ; CHECK-LABEL: define void @dynamically_sized_alloca
 ; CHECK-SAME: (i64 [[I:%.*]]) {
 ; CHECK-NEXT:    [[SRC:%.*]] = alloca i8, i64 [[I]], align 4
-; CHECK-NEXT:    [[DEST:%.*]] = alloca i8, i64 [[I]], align 4
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr captures(none) [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr captures(none) [[DEST]])
 ; CHECK-NEXT:    store [[STRUCT_FOO:%.*]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false)
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr captures(none) [[DEST]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(ptr captures(none) [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(ptr captures(none) [[DEST]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
 ; CHECK-NEXT:    ret void
 ;
   %src = alloca i8, i64 %i, align 4
@@ -1117,6 +1168,8 @@ define void @dynamically_sized_alloca(i64 %i) {
 }
 
 
+; Optimization failures follow:
+
 ; Tests that inalloca attributed allocas are never merged, to prevent stacksave/stackrestore handling.
 define void @inalloca() {
 ; CHECK-LABEL: define void @inalloca() {
@@ -1178,35 +1231,6 @@ define void @dynamically_sized_memcpy(i64 %size) {
   ret void
 }
 
-; Tests that allocas with different sizes aren't merged together.
-define void @mismatched_alloca_size() {
-; CHECK-LABEL: define void @mismatched_alloca_size() {
-; CHECK-NEXT:    [[SRC:%.*]] = alloca i8, i64 24, align 4
-; CHECK-NEXT:    [[DEST:%.*]] = alloca i8, i64 12, align 4
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr captures(none) [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr captures(none) [[DEST]])
-; CHECK-NEXT:    store [[STRUCT_FOO:%.*]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false)
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr captures(none) [[DEST]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(ptr captures(none) [[SRC]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(ptr captures(none) [[DEST]])
-; CHECK-NEXT:    ret void
-;
-  %src = alloca i8, i64 24, align 4
-  %dest = alloca i8, i64 12, align 4
-  call void @llvm.lifetime.start.p0(ptr nocapture %src)
-  call void @llvm.lifetime.start.p0(ptr nocapture %dest)
-  store %struct.Foo { i32 10, i32 20, i32 30 }, ptr %src
-  %1 = call i32 @use_nocapture(ptr nocapture %src)
-
-  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false)
-
-  %2 = call i32 @use_nocapture(ptr nocapture %dest)
-  call void @llvm.lifetime.end.p0(ptr nocapture %src)
-  call void @llvm.lifetime.end.p0(ptr nocapture %dest)
-  ret void
-}
 
 ; Tests that allocas with mismatched address spaces aren't combined.
 define void @mismatched_alloca_addrspace() {
diff --git a/llvm/test/Transforms/MemCpyOpt/stackrestore.ll b/llvm/test/Transforms/MemCpyOpt/stackrestore.ll
index 0fc37c44fa9e8..493ca3faabb61 100644
--- a/llvm/test/Transforms/MemCpyOpt/stackrestore.ll
+++ b/llvm/test/Transforms/MemCpyOpt/stackrestore.ll
@@ -16,12 +16,11 @@ target triple = "i686-unknown-windows-msvc19.14.26433"
 
 define i32 @test_norestore(i32 %n) {
 ; CHECK-LABEL: @test_norestore(
-; CHECK-NEXT:    [[TMPMEM:%.*]] = alloca [10 x i8], align 4
-; CHECK-NEXT:    [[P:%.*]] = alloca i8, i32 [[N:%.*]], align 4
+; CHECK-NEXT:    [[N:%.*]] = call i32 @llvm.umax.i32(i32 [[N1:%.*]], i32 10)
+; CHECK-NEXT:    [[P:%.*]] = alloca i8, i32 [[N]], align 4
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr [[P]], ptr align 1 @str, i32 9, i1 false)
 ; CHECK-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 9
 ; CHECK-NEXT:    store i8 0, ptr [[P10]], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr [[TMPMEM]], ptr [[P]], i32 10, i1 false)
 ; CHECK-NEXT:    call void @external()
 ; CHECK-NEXT:    [[HEAP:%.*]] = call ptr @malloc(i32 9)
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr [[HEAP]], ptr align 1 @str, i32 9, i1 false)

>From 7b074328e193d24619f563de709c7f5e8d59b1f1 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash at gmail.com>
Date: Wed, 20 Aug 2025 15:27:22 +0000
Subject: [PATCH 2/3] add missing SrcAlloca->setOperand call after changing
 size

---
 llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 1 +
 llvm/test/Transforms/MemCpyOpt/stack-move.ll   | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 5442a12c8dcaf..017d16ce808e5 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1725,6 +1725,7 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
     DestArraySize = Builder.CreateMul(
         DestArraySize, ConstantInt::get(Int32Ty, DestSize.getFixedValue()), "",
         true, true);
+    SrcAlloca->setOperand(0, SrcArraySize);
   }
   if (SrcArraySize != DestArraySize) {
     if (SrcArraySize->getType() != DestArraySize->getType()) {
diff --git a/llvm/test/Transforms/MemCpyOpt/stack-move.ll b/llvm/test/Transforms/MemCpyOpt/stack-move.ll
index a1648e2b84002..6c79bb6ec7997 100644
--- a/llvm/test/Transforms/MemCpyOpt/stack-move.ll
+++ b/llvm/test/Transforms/MemCpyOpt/stack-move.ll
@@ -1097,7 +1097,7 @@ define void @mismatched_alloca_size() {
 ; Tests merging allocas with different types
 define void @mismatched_alloca_type() {
 ; CHECK-LABEL: define void @mismatched_alloca_type() {
-; CHECK-NEXT:    [[SRC:%.*]] = alloca i8, i64 6, align 4
+; CHECK-NEXT:    [[SRC:%.*]] = alloca i8, i32 12, align 4
 ; CHECK-NEXT:    store [[STRUCT_FOO:%.*]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])

>From e6e6279abe918880babbe20804cf02dfd4c512ad Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash at gmail.com>
Date: Wed, 20 Aug 2025 15:28:27 +0000
Subject: [PATCH 3/3] remove support for dynamic arrays (except FCA)

---
 .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 61 +++++--------------
 .../Transforms/MemCpyOpt/preserve-memssa.ll   |  2 +-
 llvm/test/Transforms/MemCpyOpt/stack-move.ll  | 12 +++-
 .../test/Transforms/MemCpyOpt/stackrestore.ll |  5 +-
 4 files changed, 28 insertions(+), 52 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 017d16ce808e5..a2caaaf67923c 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -23,7 +23,6 @@
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InstSimplifyFolder.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/MemoryLocation.h"
@@ -1536,39 +1535,29 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
   if (SrcAlloca->isUsedWithInAlloca() || DestAlloca->isUsedWithInAlloca())
     return false;
 
-  Type *SrcType = SrcAlloca->getAllocatedType();
-  Type *DestType = DestAlloca->getAllocatedType();
-  // If they don't have common type, then they will need to be converted to a
-  // common size at runtime
   const auto &DL = SrcAlloca->getDataLayout();
-  TypeSize SrcSize = DL.getTypeAllocSize(SrcType);
-  TypeSize DestSize = DL.getTypeAllocSize(DestType);
-  if (SrcType != DestType)
-    if (SrcSize != DestSize)
-      if (!SrcSize.isFixed() || !DestSize.isFixed())
-        return false;
+  // Check if allocation sizes are compatible with compile-time math
+  std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSize(DL);
+  std::optional<TypeSize> DestSize = DestAlloca->getAllocationSize(DL);
+  if (!SrcSize || !DestSize)
+    return false;
+  if (*SrcSize != *DestSize)
+    if (!SrcSize->isFixed() || !DestSize->isFixed())
+      return false;
 
   // Check that copy is full with dest size, either because it wrote every byte,
   // or it was fresh.
-  std::optional<TypeSize> FullSize = DestAlloca->getAllocationSize(DL);
-  if (!FullSize || Size != *FullSize)
+  if (Size != *DestSize)
     if (!allOverreadUndefContents(MSSA, Store, BAA)) {
       LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n");
       return false;
     }
 
   // Check if it will be legal to combine allocas without breaking dominator.
-  // TODO: Try to hoist the arguments (recursively) instead of giving up
-  // immediately.
   bool MoveSrc = !DT->dominates(SrcAlloca, DestAlloca);
   if (MoveSrc) {
     if (!DT->dominates(DestAlloca, SrcAlloca))
       return false;
-    if (!DT->dominates(SrcAlloca->getArraySize(), DestAlloca))
-      return false;
-  } else {
-    if (!DT->dominates(DestAlloca->getArraySize(), SrcAlloca))
-      return false;
   }
 
   // Check that src and dest are never captured, unescaped allocas. Also
@@ -1707,33 +1696,13 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
       std::max(SrcAlloca->getAlign(), DestAlloca->getAlign()));
 
   // Size the allocas appropriately.
-  Value *SrcArraySize = SrcAlloca->getArraySize();
-  Value *DestArraySize = DestAlloca->getArraySize();
-  IRBuilder<InstSimplifyFolder> Builder(SrcAlloca->getContext(),
-                                        InstSimplifyFolder(DL));
-  Builder.SetInsertPoint(SrcAlloca);
-  Type *Int32Ty = Builder.getInt32Ty();
-  if (SrcType != DestType && SrcSize != DestSize) {
-    SrcAlloca->setAllocatedType(Type::getInt8Ty(Load->getContext()));
-    if (SrcArraySize->getType() != Int32Ty)
-      SrcArraySize = Builder.CreateZExtOrTrunc(SrcArraySize, Int32Ty);
-    if (DestArraySize->getType() != Int32Ty)
-      DestArraySize = Builder.CreateZExtOrTrunc(DestArraySize, Int32Ty);
-    SrcArraySize = Builder.CreateMul(
-        SrcArraySize, ConstantInt::get(Int32Ty, SrcSize.getFixedValue()), "",
-        true, true);
-    DestArraySize = Builder.CreateMul(
-        DestArraySize, ConstantInt::get(Int32Ty, DestSize.getFixedValue()), "",
-        true, true);
-    SrcAlloca->setOperand(0, SrcArraySize);
-  }
-  if (SrcArraySize != DestArraySize) {
-    if (SrcArraySize->getType() != DestArraySize->getType()) {
-      SrcArraySize = Builder.CreateZExtOrTrunc(SrcArraySize, Int32Ty);
-      DestArraySize = Builder.CreateZExtOrTrunc(DestArraySize, Int32Ty);
+  if (*SrcSize != *DestSize) {
+    // Only possible if both sizes are fixed (due to earlier check)
+    // Set Src to the type and array size of Dest if Dest was larger
+    if (DestSize->getFixedValue() > SrcSize->getFixedValue()) {
+      SrcAlloca->setAllocatedType(DestAlloca->getAllocatedType());
+      SrcAlloca->setOperand(0, DestAlloca->getArraySize());
     }
-    SrcAlloca->setOperand(0, Builder.CreateBinaryIntrinsic(
-                                 Intrinsic::umax, SrcArraySize, DestArraySize));
   }
 
   // Merge the two allocas.
diff --git a/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll b/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll
index da2cca91d34e0..1daf23b275476 100644
--- a/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll
+++ b/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll
@@ -76,7 +76,7 @@ declare void @decompose(ptr nocapture)
 define void @test5(ptr %ptr) {
 ; CHECK-LABEL: @test5(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP:%.*]] = alloca i8, i32 8224, align 8
+; CHECK-NEXT:    [[TMP:%.*]] = alloca [[T:%.*]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[PTR:%.*]], align 8
 ; CHECK-NEXT:    call fastcc void @decompose(ptr [[TMP]])
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/MemCpyOpt/stack-move.ll b/llvm/test/Transforms/MemCpyOpt/stack-move.ll
index 6c79bb6ec7997..1896292455611 100644
--- a/llvm/test/Transforms/MemCpyOpt/stack-move.ll
+++ b/llvm/test/Transforms/MemCpyOpt/stack-move.ll
@@ -1097,7 +1097,7 @@ define void @mismatched_alloca_size() {
 ; Tests merging allocas with different types
 define void @mismatched_alloca_type() {
 ; CHECK-LABEL: define void @mismatched_alloca_type() {
-; CHECK-NEXT:    [[SRC:%.*]] = alloca i8, i32 12, align 4
+; CHECK-NEXT:    [[SRC:%.*]] = alloca i16, i64 6, align 4
 ; CHECK-NEXT:    store [[STRUCT_FOO:%.*]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
@@ -1121,7 +1121,7 @@ define void @mismatched_alloca_type() {
 ; Tests merging allocas with different types and sizes
 define void @mismatched_alloca_type_size() {
 ; CHECK-LABEL: define void @mismatched_alloca_type_size() {
-; CHECK-NEXT:    [[SRC:%.*]] = alloca i8, i32 24, align 4
+; CHECK-NEXT:    [[SRC:%.*]] = alloca i16, i64 12, align 4
 ; CHECK-NEXT:    store [[STRUCT_FOO:%.*]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
@@ -1147,9 +1147,15 @@ define void @dynamically_sized_alloca(i64 %i) {
 ; CHECK-LABEL: define void @dynamically_sized_alloca
 ; CHECK-SAME: (i64 [[I:%.*]]) {
 ; CHECK-NEXT:    [[SRC:%.*]] = alloca i8, i64 [[I]], align 4
+; CHECK-NEXT:    [[DEST:%.*]] = alloca i8, i64 [[I]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr captures(none) [[SRC]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr captures(none) [[DEST]])
 ; CHECK-NEXT:    store [[STRUCT_FOO:%.*]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false)
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @use_nocapture(ptr captures(none) [[DEST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(ptr captures(none) [[SRC]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(ptr captures(none) [[DEST]])
 ; CHECK-NEXT:    ret void
 ;
   %src = alloca i8, i64 %i, align 4
diff --git a/llvm/test/Transforms/MemCpyOpt/stackrestore.ll b/llvm/test/Transforms/MemCpyOpt/stackrestore.ll
index 493ca3faabb61..0fc37c44fa9e8 100644
--- a/llvm/test/Transforms/MemCpyOpt/stackrestore.ll
+++ b/llvm/test/Transforms/MemCpyOpt/stackrestore.ll
@@ -16,11 +16,12 @@ target triple = "i686-unknown-windows-msvc19.14.26433"
 
 define i32 @test_norestore(i32 %n) {
 ; CHECK-LABEL: @test_norestore(
-; CHECK-NEXT:    [[N:%.*]] = call i32 @llvm.umax.i32(i32 [[N1:%.*]], i32 10)
-; CHECK-NEXT:    [[P:%.*]] = alloca i8, i32 [[N]], align 4
+; CHECK-NEXT:    [[TMPMEM:%.*]] = alloca [10 x i8], align 4
+; CHECK-NEXT:    [[P:%.*]] = alloca i8, i32 [[N:%.*]], align 4
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr [[P]], ptr align 1 @str, i32 9, i1 false)
 ; CHECK-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 9
 ; CHECK-NEXT:    store i8 0, ptr [[P10]], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr [[TMPMEM]], ptr [[P]], i32 10, i1 false)
 ; CHECK-NEXT:    call void @external()
 ; CHECK-NEXT:    [[HEAP:%.*]] = call ptr @malloc(i32 9)
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr [[HEAP]], ptr align 1 @str, i32 9, i1 false)



More information about the llvm-commits mailing list