[llvm] [memcpyopt] allow some undef contents overread in processMemCpyMemCpyDependence (PR #143745)
Jameson Nash via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 11 13:59:14 PDT 2025
https://github.com/vtjnash updated https://github.com/llvm/llvm-project/pull/143745
From a2f0414a052397ac6a85afc6be3c76ec9ba626f7 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash at gmail.com>
Date: Wed, 11 Jun 2025 20:20:55 +0000
Subject: [PATCH 1/2] [memcpyopt] fix incorrect handling of lifetime markers
Having lifetime markers should only increase the information available
to LLVM, but the pass was stopping the MemSSA walk at them, unable to
see that they were irrelevant to the query.
Secondly, it would ignore a lifetime marker that wasn't full size
(relying on the callback to forbid the optimization entirely), but
sub-optimal lifetime markers are not supposed to forbid optimizations
that would otherwise apply if the markers were absent or optimal. The
pass also wasn't tracking GEP offsets, so the handling wasn't quite
correct (although the earlier, overly conservative check that the size
equals the alloca size kept it safe in the past).
Lastly, the stack-move test appeared to be specifying the lifetime size
in bits instead of bytes; the optimization now applies regardless, so
fixing that was not strictly necessary.
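To illustrate (a minimal sketch based on the updated
test_lifetime_partial_alias_3 test in memcpy-undef.ll below): the copy
reads bytes that are undef at the point of the memcpy, so the memcpy
can now simply be deleted, even though the lifetime.start covers only
12 of the 16 bytes and the source pointer is offset by a GEP:

  define void @test_lifetime_partial_alias_3(ptr noalias %dst) {
    %a = alloca [16 x i8]
    ; partial-size marker: previously this stopped the MemSSA walk
    call void @llvm.lifetime.start.p0(i64 12, ptr %a)
    %gep = getelementptr i8, ptr %a, i64 8
    ; the source bytes are undef, so this memcpy is removed
    call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %gep, i64 4, i1 false)
    ret void
  }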
---
.../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 110 ++++++++++--------
.../ScopedNoAliasAA/alias-scope-merging.ll | 20 ++--
.../Transforms/MemCpyOpt/callslot_badaa.ll | 18 ++-
.../test/Transforms/MemCpyOpt/memcpy-undef.ll | 2 -
llvm/test/Transforms/MemCpyOpt/stack-move.ll | 15 +--
5 files changed, 90 insertions(+), 75 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 960001bf880c6..877600034f84e 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1366,56 +1366,68 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
/// Determine whether the pointer V had only undefined content (due to Def) up
/// to the given Size, either because it was freshly alloca'd or started its
-/// lifetime.
-static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V,
- MemoryDef *Def, Value *Size) {
- if (MSSA->isLiveOnEntryDef(Def))
- return isa<AllocaInst>(getUnderlyingObject(V));
-
- if (auto *II = dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) {
- if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
- auto *LTSize = cast<ConstantInt>(II->getArgOperand(0));
-
- if (auto *CSize = dyn_cast<ConstantInt>(Size)) {
- if (AA.isMustAlias(V, II->getArgOperand(1)) &&
- LTSize->getZExtValue() >= CSize->getZExtValue())
- return true;
- }
+/// lifetime by walking the MSSA graph.
+static bool hadUndefContentsBefore(MemorySSA *MSSA, BatchAAResults &BAA,
+ Value *V, MemoryAccess *Clobber,
+ MemoryLocation Loc, Value *Size) {
+ while (1) {
+ Clobber = MSSA->getWalker()->getClobberingMemoryAccess(Clobber, Loc, BAA);
+ MemoryDef *Def = dyn_cast<MemoryDef>(Clobber);
+ if (!Def)
+ return false;
+
+ if (MSSA->isLiveOnEntryDef(Def))
+ return isa<AllocaInst>(getUnderlyingObject(V));
+
+ if (auto *II = dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
+ auto *LTSize = cast<ConstantInt>(II->getArgOperand(0));
- // If the lifetime.start covers a whole alloca (as it almost always
- // does) and we're querying a pointer based on that alloca, then we know
- // the memory is definitely undef, regardless of how exactly we alias.
- // The size also doesn't matter, as an out-of-bounds access would be UB.
- if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V))) {
- if (getUnderlyingObject(II->getArgOperand(1)) == Alloca) {
- const DataLayout &DL = Alloca->getDataLayout();
- if (std::optional<TypeSize> AllocaSize =
- Alloca->getAllocationSize(DL))
- if (*AllocaSize == LTSize->getValue())
+ if (Size)
+ if (auto CSize = dyn_cast<ConstantInt>(Size))
+ if (BAA.isMustAlias(V, II->getArgOperand(1)) &&
+ LTSize->getZExtValue() >= CSize->getZExtValue())
return true;
+
+ // If the lifetime.start covers a whole alloca (as it almost always
+ // does) and we're querying a pointer based on that alloca, then we know
+ // the memory is definitely undef, regardless of how exactly we alias.
+ // The size also doesn't matter, as an out-of-bounds access would be UB.
+ if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V))) {
+ if (getUnderlyingObject(II->getArgOperand(1)) == Alloca) {
+ const DataLayout &DL = Alloca->getDataLayout();
+ if (std::optional<TypeSize> AllocaSize =
+ Alloca->getAllocationSize(DL))
+ if (*AllocaSize == LTSize->getValue())
+ return true;
+ }
}
+ Clobber = Def->getDefiningAccess();
+ continue;
+ } else if (II->getIntrinsicID() == Intrinsic::lifetime_end) {
+ Clobber = Def->getDefiningAccess();
+ continue;
}
}
- }
- return false;
+ return false;
+ }
}
// If the memcpy is larger than the previous, but the memory was undef prior to
// that, we can just ignore the tail. Technically we're only interested in the
// bytes from 0..MemSrcOffset and MemSrcLength+MemSrcOffset..CopySize here, but
-// as we can't easily represent this location (hasUndefContents uses mustAlias
-// which cannot deal with offsets), we use the full 0..CopySize range.
+// as we can't easily represent this location (hadUndefContentsBefore uses
+// mustAlias which cannot deal with offsets), we use the full 0..CopySize range.
static bool overreadUndefContents(MemorySSA *MSSA, MemCpyInst *MemCpy,
MemIntrinsic *MemSrc, BatchAAResults &BAA) {
Value *CopySize = MemCpy->getLength();
- MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
- MemoryUseOrDef *MemSrcAccess = MSSA->getMemoryAccess(MemSrc);
- MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
- MemSrcAccess->getDefiningAccess(), MemCpyLoc, BAA);
- if (auto *MD = dyn_cast<MemoryDef>(Clobber))
- if (hasUndefContents(MSSA, BAA, MemCpy->getSource(), MD, CopySize))
- return true;
+ MemoryLocation LoadLoc = MemoryLocation::getForSource(MemCpy);
+ MemoryAccess *MemSrcAccess =
+ MSSA->getMemoryAccess(MemSrc)->getDefiningAccess();
+ if (hadUndefContentsBefore(MSSA, BAA, MemCpy->getSource(), MemSrcAccess,
+ LoadLoc, CopySize))
+ return true;
return false;
}
@@ -1573,11 +1585,14 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
// since both llvm.lifetime.start and llvm.lifetime.end intrinsics
// practically fill all the bytes of the alloca with an undefined
// value, although conceptually marked as alive/dead.
- int64_t Size = cast<ConstantInt>(UI->getOperand(0))->getSExtValue();
- if (Size < 0 || Size == DestSize) {
- LifetimeMarkers.push_back(UI);
- continue;
- }
+ // We don't currently track GEP offsets and sizes, so we don't have
+ // a way to check whether this lifetime marker affects the relevant
+ // memory regions.
+ // While we only really need to delete lifetime.end from Src and
+      // lifetime.start from Dst, those are often implied by the memcpy
+ // anyways so hopefully not much is lost by removing all of them.
+ LifetimeMarkers.push_back(UI);
+ continue;
}
AAMetadataInstrs.insert(UI);
@@ -1594,9 +1609,8 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
return true;
};
- // Check that dest has no Mod/Ref, from the alloca to the Store, except full
- // size lifetime intrinsics. And collect modref inst for the reachability
- // check.
+ // Check that dest has no Mod/Ref, from the alloca to the Store. And collect
+ // modref inst for the reachability check.
ModRefInfo DestModRef = ModRefInfo::NoModRef;
MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Size));
SmallVector<BasicBlock *, 8> ReachabilityWorklist;
@@ -1779,8 +1793,9 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
if (processMemSetMemCpyDependence(M, MDep, BAA))
return true;
- MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess(
- AnyClobber, MemoryLocation::getForSource(M), BAA);
+ MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
+ MemoryAccess *SrcClobber =
+ MSSA->getWalker()->getClobberingMemoryAccess(AnyClobber, SrcLoc, BAA);
// There are five possible optimizations we can do for memcpy:
// a) memcpy-memcpy xform which exposes redundance for DSE.
@@ -1820,7 +1835,8 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
}
}
- if (hasUndefContents(MSSA, BAA, M->getSource(), MD, M->getLength())) {
+ if (hadUndefContentsBefore(MSSA, BAA, M->getSource(), AnyClobber, SrcLoc,
+ M->getLength())) {
LLVM_DEBUG(dbgs() << "Removed memcpy from undef\n");
eraseInstruction(M);
++NumMemCpyInstr;
diff --git a/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll b/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll
index 989049ab67a0b..3b08e4404dde2 100644
--- a/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll
+++ b/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll
@@ -1,25 +1,27 @@
-; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -passes=memcpyopt | FileCheck %s
; Alias scopes are merged by taking the intersection of domains, then the union of the scopes within those domains
define i8 @test(i8 %input) {
+; CHECK-LABEL: define i8 @test(
+; CHECK-SAME: i8 [[INPUT:%.*]]) {
+; CHECK-NEXT: [[SRC:%.*]] = alloca i8, align 1
+; CHECK-NEXT: store i8 [[INPUT]], ptr [[SRC]], align 1
+; CHECK-NEXT: [[RET_VALUE:%.*]] = load i8, ptr [[SRC]], align 1
+; CHECK-NEXT: ret i8 [[RET_VALUE]]
+;
%tmp = alloca i8
%dst = alloca i8
%src = alloca i8
-; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false), !alias.scope ![[SCOPE:[0-9]+]]
- call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %src), !noalias !4
+ call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %src), !noalias !4
store i8 %input, ptr %src
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, i1 false), !alias.scope !0
- call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %src), !noalias !4
+ call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !4
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp, i64 1, i1 false), !alias.scope !4
%ret_value = load i8, ptr %dst
ret i8 %ret_value
}
-; Merged scope contains "callee0: %a" and "callee0 : %b"
-; CHECK-DAG: ![[CALLEE0_A:[0-9]+]] = distinct !{!{{[0-9]+}}, !{{[0-9]+}}, !"callee0: %a"}
-; CHECK-DAG: ![[CALLEE0_B:[0-9]+]] = distinct !{!{{[0-9]+}}, !{{[0-9]+}}, !"callee0: %b"}
-; CHECK-DAG: ![[SCOPE]] = !{![[CALLEE0_A]], ![[CALLEE0_B]]}
-
declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
diff --git a/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll b/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll
index efdbdce401b76..fb5d675402eba 100644
--- a/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll
+++ b/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll
@@ -1,9 +1,10 @@
-; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -passes=memcpyopt | FileCheck %s
; Make sure callslot optimization merges alias.scope metadata correctly when it merges instructions.
; Merging here naively generates:
; call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false), !alias.scope !3
-; call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %src), !noalias !0
+; call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !0
; ...
; !0 = !{!1}
; !1 = distinct !{!1, !2, !"callee1: %a"}
@@ -13,15 +14,20 @@
; !5 = distinct !{!5, !"callee0"}
; Which is incorrect because the lifetime.end of %src will now "noalias" the above memcpy.
define i8 @test(i8 %input) {
+; CHECK-LABEL: define i8 @test(
+; CHECK-SAME: i8 [[INPUT:%.*]]) {
+; CHECK-NEXT: [[SRC:%.*]] = alloca i8, align 1
+; CHECK-NEXT: store i8 [[INPUT]], ptr [[SRC]], align 1
+; CHECK-NEXT: [[RET_VALUE:%.*]] = load i8, ptr [[SRC]], align 1
+; CHECK-NEXT: ret i8 [[RET_VALUE]]
+;
%tmp = alloca i8
%dst = alloca i8
%src = alloca i8
-; NOTE: we're matching the full line and looking for the lack of !alias.scope here
-; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false)
- call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %src), !noalias !3
+ call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %src), !noalias !3
store i8 %input, ptr %src
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, i1 false), !alias.scope !0
- call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %src), !noalias !3
+ call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !3
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp, i64 1, i1 false), !alias.scope !3
%ret_value = load i8, ptr %dst
ret i8 %ret_value
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll
index 2f1ce37ea2256..a463fed3ac58c 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll
@@ -96,7 +96,6 @@ define void @test_lifetime_partial_alias_3(ptr noalias %dst) {
; CHECK-NEXT: [[A:%.*]] = alloca [16 x i8], align 1
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[A]])
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 8
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST:%.*]], ptr [[GEP]], i64 4, i1 false)
; CHECK-NEXT: ret void
;
%a = alloca [16 x i8]
@@ -112,7 +111,6 @@ define void @test_lifetime_partial_alias_4(ptr noalias %dst) {
; CHECK-NEXT: [[A:%.*]] = alloca [16 x i8], align 1
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[A]])
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 8
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST:%.*]], ptr [[GEP]], i64 8, i1 false)
; CHECK-NEXT: ret void
;
%a = alloca [16 x i8]
diff --git a/llvm/test/Transforms/MemCpyOpt/stack-move.ll b/llvm/test/Transforms/MemCpyOpt/stack-move.ll
index 4a75c5eea2499..31e255b83eb9e 100644
--- a/llvm/test/Transforms/MemCpyOpt/stack-move.ll
+++ b/llvm/test/Transforms/MemCpyOpt/stack-move.ll
@@ -1649,20 +1649,13 @@ loop_exit:
ret void
}
-; Tests that failure because partial-sized lifetimes are counted as mod.
+; Tests that partial-sized lifetimes do not inhibit the optimizer
define void @partial_lifetime() {
; CHECK-LABEL: define void @partial_lifetime() {
-; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
-; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr captures(none) [[SRC]])
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 3, ptr captures(none) [[DEST]])
-; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false)
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 3, ptr captures(none) [[SRC]])
+; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
+; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[DEST]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr captures(none) [[DEST]])
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr captures(none) [[DEST]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr captures(none) [[SRC]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr captures(none) [[DEST]])
; CHECK-NEXT: ret void
;
%src = alloca %struct.Foo, align 4
From 645f2472085420b9e9a4e4c1b59c4c1768a31e28 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash at gmail.com>
Date: Wed, 11 Jun 2025 20:17:18 +0000
Subject: [PATCH 2/2] [memcpyopt] allow more memcpy-to-memcpy optimization
Allow the memcpy-to-memcpy optimization even when the sizes are not
identical. For example, the copy might have been generated as a small
slice of a larger struct (currently handled only at zero offset), or it
might store to only part of an oversized alloca.
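As a rough before/after sketch (mirroring the updated
mismatched_alloca_size test in stack-move.ll below), a full copy into a
smaller destination alloca can now merge the two allocas into the
larger one:

  %src = alloca i8, i64 24, align 4
  %dest = alloca i8, i64 12, align 4
  store %struct.Foo { i32 10, i32 20, i32 30 }, ptr %src, align 4
  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src, i64 12, i1 false)

becomes, with all uses of %dest rewritten to use %src:

  %src = alloca i8, i64 24, align 4
  store %struct.Foo { i32 10, i32 20, i32 30 }, ptr %src, align 4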
---
.../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 112 ++++++++++++++----
.../Transforms/MemCpyOpt/preserve-memssa.ll | 5 +-
llvm/test/Transforms/MemCpyOpt/stack-move.ll | 33 ++----
3 files changed, 97 insertions(+), 53 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 877600034f84e..cf296296e0996 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -23,6 +23,7 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryLocation.h"
@@ -1431,6 +1432,28 @@ static bool overreadUndefContents(MemorySSA *MSSA, MemCpyInst *MemCpy,
return false;
}
+// If only the MemSrc instruction is known, a similar but slightly weaker
+// analysis can apply
+static bool anyOverreadUndefContents(MemorySSA *MSSA, Instruction *Store,
+ BatchAAResults &BAA) {
+ MemoryLocation Loc;
+ Value *Ptr;
+ if (auto SI = dyn_cast<StoreInst>(Store)) {
+ Loc = MemoryLocation::get(SI);
+ Ptr = SI->getPointerOperand();
+ } else if (auto MI = dyn_cast<MemCpyInst>(Store)) {
+ Loc = MemoryLocation::getForDest(MI);
+ Ptr = MI->getDest();
+ } else {
+ llvm_unreachable("performStackMoveOptzn must have a known store kind");
+ }
+ MemoryAccess *MemAccess = MSSA->getMemoryAccess(Store)->getDefiningAccess();
+ if (hadUndefContentsBefore(MSSA, BAA, Ptr, MemAccess, Loc, nullptr))
+ return true;
+ return false;
+}
+
+
/// Transform memcpy to memset when its source was just memset.
/// In other words, turn:
/// \code
@@ -1524,22 +1547,42 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
return false;
}
- // Check that copy is full with static size.
- const DataLayout &DL = DestAlloca->getDataLayout();
- std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSize(DL);
- if (!SrcSize || Size != *SrcSize) {
- LLVM_DEBUG(dbgs() << "Stack Move: Source alloca size mismatch\n");
- return false;
- }
- std::optional<TypeSize> DestSize = DestAlloca->getAllocationSize(DL);
- if (!DestSize || Size != *DestSize) {
- LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n");
- return false;
- }
-
if (!SrcAlloca->isStaticAlloca() || !DestAlloca->isStaticAlloca())
return false;
+ Type *SrcType = SrcAlloca->getAllocatedType();
+ Type *DestType = DestAlloca->getAllocatedType();
+  // If they don't have a common type, then they will need to be converted
+  // to a common size at runtime
+ const auto &DL = SrcAlloca->getDataLayout();
+ TypeSize SrcSize = DL.getTypeAllocSize(SrcType);
+ TypeSize DestSize = DL.getTypeAllocSize(DestType);
+ if (SrcType != DestType)
+ if (SrcSize != DestSize)
+ if (!SrcSize.isFixed() || !DestSize.isFixed())
+ return false;
+
+ // Check that copy is full with dest size, either because it wrote every byte,
+ // or it was fresh.
+ std::optional<TypeSize> FullSize = DestAlloca->getAllocationSize(DL);
+ if (!FullSize || Size != *FullSize)
+ if (!anyOverreadUndefContents(MSSA, Store, BAA)) {
+ LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n");
+ return false;
+ }
+
+ // Check if it will be legal to combine allocas without breaking dominator.
+ // TODO: Try to hoist the arguments (recursively) instead of giving up
+ // immediately.
+ bool MoveSrc = !DT->dominates(SrcAlloca, DestAlloca);
+ if (MoveSrc) {
+ if (!DT->dominates(SrcAlloca->getArraySize(), DestAlloca))
+ return false;
+ } else {
+ if (!DT->dominates(DestAlloca->getArraySize(), SrcAlloca))
+ return false;
+ }
+
// Check that src and dest are never captured, unescaped allocas. Also
// find the nearest common dominator and postdominator for all users in
// order to shrink wrap the lifetimes, and instructions with noalias metadata
@@ -1547,7 +1590,6 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
SmallVector<Instruction *, 4> LifetimeMarkers;
SmallSet<Instruction *, 4> AAMetadataInstrs;
- bool SrcNotDom = false;
auto CaptureTrackingWithModRef =
[&](Instruction *AI,
@@ -1561,10 +1603,6 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
Instruction *I = Worklist.pop_back_val();
for (const Use &U : I->uses()) {
auto *UI = cast<Instruction>(U.getUser());
- // If any use that isn't dominated by SrcAlloca exists, we move src
- // alloca to the entry before the transformation.
- if (!DT->dominates(SrcAlloca, UI))
- SrcNotDom = true;
if (Visited.size() >= MaxUsesToExplore) {
LLVM_DEBUG(
@@ -1678,15 +1716,43 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback))
return false;
- // We can do the transformation. First, move the SrcAlloca to the start of the
- // BB.
- if (SrcNotDom)
- SrcAlloca->moveBefore(*SrcAlloca->getParent(),
- SrcAlloca->getParent()->getFirstInsertionPt());
+ // We can now do the transformation. First move the Src if it was after Dest.
+ if (MoveSrc)
+ SrcAlloca->moveBefore(DestAlloca->getIterator());
+
// Align the allocas appropriately.
SrcAlloca->setAlignment(
std::max(SrcAlloca->getAlign(), DestAlloca->getAlign()));
+ // Size the allocas appropriately.
+ Value *SrcArraySize = SrcAlloca->getArraySize();
+ Value *DestArraySize = DestAlloca->getArraySize();
+ IRBuilder<InstSimplifyFolder> Builder(SrcAlloca->getContext(),
+ InstSimplifyFolder(DL));
+ Builder.SetInsertPoint(SrcAlloca);
+ Type *Int32Ty = Builder.getInt32Ty();
+ if (SrcType != DestType && SrcSize != DestSize) {
+ SrcAlloca->setAllocatedType(Type::getInt8Ty(Load->getContext()));
+ if (SrcArraySize->getType() != Int32Ty)
+ SrcArraySize = Builder.CreateZExtOrTrunc(SrcArraySize, Int32Ty);
+ if (DestArraySize->getType() != Int32Ty)
+ DestArraySize = Builder.CreateZExtOrTrunc(DestArraySize, Int32Ty);
+ SrcArraySize = Builder.CreateMul(
+ SrcArraySize, ConstantInt::get(Int32Ty, SrcSize.getFixedValue()), "",
+ true, true);
+ DestArraySize = Builder.CreateMul(
+ DestArraySize, ConstantInt::get(Int32Ty, DestSize.getFixedValue()), "",
+ true, true);
+ }
+ if (SrcArraySize != DestArraySize) {
+ if (SrcArraySize->getType() != DestArraySize->getType()) {
+ SrcArraySize = Builder.CreateZExtOrTrunc(SrcArraySize, Int32Ty);
+ DestArraySize = Builder.CreateZExtOrTrunc(DestArraySize, Int32Ty);
+ }
+ SrcAlloca->setOperand(0, Builder.CreateBinaryIntrinsic(
+ Intrinsic::umax, SrcArraySize, DestArraySize));
+ }
+
// Merge the two allocas.
DestAlloca->replaceAllUsesWith(SrcAlloca);
eraseInstruction(DestAlloca);
diff --git a/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll b/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll
index b654319b9432e..d2de266baac23 100644
--- a/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll
+++ b/llvm/test/Transforms/MemCpyOpt/preserve-memssa.ll
@@ -76,12 +76,9 @@ declare void @decompose(ptr nocapture)
define void @test5(ptr %ptr) {
; CHECK-LABEL: @test5(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[EARLY_DATA:%.*]] = alloca [128 x i8], align 8
-; CHECK-NEXT: [[TMP:%.*]] = alloca [[T:%.*]], align 8
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr [[EARLY_DATA]])
+; CHECK-NEXT: [[TMP:%.*]] = alloca i8, i32 8224, align 8
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[PTR:%.*]], align 8
; CHECK-NEXT: call fastcc void @decompose(ptr [[TMP]])
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[EARLY_DATA]], ptr [[TMP]], i64 32, i1 false)
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/MemCpyOpt/stack-move.ll b/llvm/test/Transforms/MemCpyOpt/stack-move.ll
index 31e255b83eb9e..9c29c4f9f010d 100644
--- a/llvm/test/Transforms/MemCpyOpt/stack-move.ll
+++ b/llvm/test/Transforms/MemCpyOpt/stack-move.ll
@@ -1029,16 +1029,10 @@ bb2:
; for the purposes of liveness analysis, not a definition.
define void @incomplete_memcpy() {
; CHECK-LABEL: define void @incomplete_memcpy() {
-; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
-; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr captures(none) [[SRC]])
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr captures(none) [[DEST]])
-; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr noundef captures(none) [[SRC]])
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 11, i1 false)
+; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
+; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[DEST]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr noundef captures(none) [[DEST]])
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr noundef captures(none) [[DEST]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr captures(none) [[SRC]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr captures(none) [[DEST]])
; CHECK-NEXT: ret void
;
%src = alloca %struct.Foo, align 4
@@ -1058,17 +1052,10 @@ define void @incomplete_memcpy() {
; for the purposes of liveness analysis, not a definition.
define void @incomplete_store() {
; CHECK-LABEL: define void @incomplete_store() {
-; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
-; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr captures(none) [[SRC]])
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr captures(none) [[DEST]])
-; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr noundef captures(none) [[SRC]])
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC]], align 4
-; CHECK-NEXT: store i32 [[TMP2]], ptr [[DEST]], align 4
+; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
+; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[DEST]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr noundef captures(none) [[DEST]])
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @use_nocapture(ptr noundef captures(none) [[DEST]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr captures(none) [[SRC]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr captures(none) [[DEST]])
; CHECK-NEXT: ret void
;
%src = alloca %struct.Foo, align 4
@@ -1182,15 +1169,9 @@ define void @dynamically_sized_memcpy(i64 %size) {
define void @mismatched_alloca_size() {
; CHECK-LABEL: define void @mismatched_alloca_size() {
; CHECK-NEXT: [[SRC:%.*]] = alloca i8, i64 24, align 4
-; CHECK-NEXT: [[DEST:%.*]] = alloca i8, i64 12, align 4
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 24, ptr captures(none) [[SRC]])
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr captures(none) [[DEST]])
; CHECK-NEXT: store [[STRUCT_FOO:%.*]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false)
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr captures(none) [[DEST]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 24, ptr captures(none) [[SRC]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr captures(none) [[DEST]])
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
; CHECK-NEXT: ret void
;
%src = alloca i8, i64 24, align 4