[llvm] [InstCombine] Forward memcpy source to load instruction (PR #140249)
via llvm-commits
llvm-commits at lists.llvm.org
Fri May 16 05:54:24 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: dianqk (dianqk)
<details>
<summary>Changes</summary>
Fixes #<!-- -->137810.
I've already tried implementing this with MemCpyOpt in #<!-- -->138490, which resulted in an unacceptable compile-time regression. Therefore, I implemented this with InstCombine instead.
Compile-time impact: https://llvm-compile-time-tracker.com/compare.php?from=059b0c2efbf30d986d812c4d2cf6d6c7876569fe&to=bb5edb394bb8983c5d3eacbaa3c3491504dd549b&stat=instructions%3Au.
---
Full diff: https://github.com/llvm/llvm-project/pull/140249.diff
6 Files Affected:
- (modified) llvm/include/llvm/Analysis/Loads.h (+3-2)
- (modified) llvm/lib/Analysis/Loads.cpp (+33-1)
- (modified) llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp (+36-4)
- (added) llvm/test/Transforms/InstCombine/memcpy-forward-load.ll (+169)
- (modified) llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll (+2-4)
- (added) llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll (+47)
``````````diff
diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h
index 639070c07897b..94d761379a9c5 100644
--- a/llvm/include/llvm/Analysis/Loads.h
+++ b/llvm/include/llvm/Analysis/Loads.h
@@ -153,9 +153,10 @@ Value *FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB,
/// This overload provides a more efficient implementation of
/// FindAvailableLoadedValue() for the case where we are not interested in
/// finding the closest clobbering instruction if no available load is found.
-/// This overload cannot be used to scan across multiple blocks.
+/// This overload cannot be used to scan across multiple blocks. If a memcpy is
+/// returned, it indicates that we can load from its source.
Value *FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
- bool *IsLoadCSE,
+ bool *IsLoadCSE, int64_t &Offset,
unsigned MaxInstsToScan = DefMaxInstsToScan);
/// Scan backwards to see if we have the value of the given pointer available
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 425f3682122cd..f766331dab2f1 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -713,8 +713,31 @@ Value *llvm::findAvailablePtrLoadStore(
return nullptr;
}
+static Value *availableMemCpySrc(LoadInst *LI, MemCpyInst *MemCpy,
+ int64_t &Offset) {
+ if (!LI->isSimple() || MemCpy->isVolatile())
+ return nullptr;
+ const DataLayout &DL = LI->getDataLayout();
+ uint64_t Size = DL.getTypeStoreSize(LI->getType()).getKnownMinValue();
+ if (Size == 0)
+ return nullptr;
+ Value *OldSrc = LI->getPointerOperand();
+
+ if (OldSrc != MemCpy->getDest()) {
+ std::optional<int64_t> PointerOffset =
+ OldSrc->getPointerOffsetFrom(MemCpy->getDest(), DL);
+ if (!PointerOffset || *PointerOffset < 0)
+ return nullptr;
+ Offset = *PointerOffset;
+ }
+ auto *CopyLen = dyn_cast<ConstantInt>(MemCpy->getLength());
+ if (!CopyLen || CopyLen->getZExtValue() < Size + Offset)
+ return nullptr;
+ return MemCpy;
+}
+
Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
- bool *IsLoadCSE,
+ bool *IsLoadCSE, int64_t &Offset,
unsigned MaxInstsToScan) {
const DataLayout &DL = Load->getDataLayout();
Value *StrippedPtr = Load->getPointerOperand()->stripPointerCasts();
@@ -739,6 +762,9 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
Available = getAvailableLoadStore(&Inst, StrippedPtr, AccessTy,
AtLeastAtomic, DL, IsLoadCSE);
+ if (auto *MemCpy = dyn_cast<MemCpyInst>(&Inst))
+ Available = availableMemCpySrc(Load, MemCpy, Offset);
+
if (Available)
break;
@@ -753,6 +779,12 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
for (Instruction *Inst : MustNotAliasInsts)
if (isModSet(AA.getModRefInfo(Inst, Loc)))
return nullptr;
+ if (auto *MemCpy = dyn_cast<MemCpyInst>(Available)) {
+ MemoryLocation Loc = MemoryLocation::getForSource(MemCpy);
+ for (Instruction *Inst : MustNotAliasInsts)
+ if (isModSet(AA.getModRefInfo(Inst, Loc)))
+ return nullptr;
+ }
}
return Available;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index c29cba6f675c5..cf0ebc9fd043f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -1053,13 +1053,45 @@ Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) {
// separated by a few arithmetic operations.
bool IsLoadCSE = false;
BatchAAResults BatchAA(*AA);
- if (Value *AvailableVal = FindAvailableLoadedValue(&LI, BatchAA, &IsLoadCSE)) {
+ int64_t Offset = 0;
+ if (Value *AvailableVal =
+ FindAvailableLoadedValue(&LI, BatchAA, &IsLoadCSE, Offset)) {
if (IsLoadCSE)
combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI, false);
- return replaceInstUsesWith(
- LI, Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
- LI.getName() + ".cast"));
+ // Perform simplification of loads: if we have a memcpy A which copies X to
+ // Y, and a load instruction B which loads from Y, then we can rewrite B to
+ // be a load instruction that loads from X. This allows later passes to
+ // remove the memcpy A or identify the source of the load instruction.
+ if (auto *MemCpy = dyn_cast<MemCpyInst>(AvailableVal)) {
+ Value *NewSrc = MemCpy->getSource();
+ Value *OldSrc = LI.getPointerOperand();
+ MaybeAlign NewAlign = MemCpy->getSourceAlign();
+ if (Offset != 0) {
+ if (NewAlign.has_value())
+ NewAlign = commonAlignment(*NewAlign, Offset);
+ // Avoid increasing the instruction count: only materialize a new GEP
+ // when the old source pointer can be removed afterwards.
+ if (isa<Instruction>(OldSrc) && OldSrc->hasOneUse())
+ NewSrc =
+ Builder.CreateInBoundsPtrAdd(NewSrc, Builder.getInt64(Offset));
+ else
+ NewSrc = nullptr;
+ }
+ // Avoid infinite loops
+ if (NewSrc && !BatchAA.isMustAlias(OldSrc, NewSrc))
+ AvailableVal = Builder.CreateAlignedLoad(LI.getType(), NewSrc, NewAlign,
+ LI.getName());
+ else {
+ AvailableVal = nullptr;
+ if (NewSrc && NewSrc->use_empty())
+ cast<Instruction>(NewSrc)->eraseFromParent();
+ }
+ } else
+ AvailableVal = Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
+ LI.getName() + ".cast");
+
+ if (AvailableVal)
+ return replaceInstUsesWith(LI, AvailableVal);
}
// None of the following transforms are legal for volatile/ordered atomic
diff --git a/llvm/test/Transforms/InstCombine/memcpy-forward-load.ll b/llvm/test/Transforms/InstCombine/memcpy-forward-load.ll
new file mode 100644
index 0000000000000..7a56bb50b0903
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/memcpy-forward-load.ll
@@ -0,0 +1,169 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define i24 @forward_load(ptr align 4 %src) {
+; CHECK-LABEL: define i24 @forward_load(
+; CHECK-SAME: ptr align 4 [[SRC:%.*]]) {
+; CHECK-NEXT: [[VAL1:%.*]] = load i24, ptr [[SRC]], align 4
+; CHECK-NEXT: ret i24 [[VAL1]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+ %val = load i24, ptr %dest
+ ret i24 %val
+}
+
+define i8 @forward_load_gep(ptr %src) {
+; CHECK-LABEL: define i8 @forward_load_gep(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 2
+; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: ret i8 [[VAL]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+ %gep = getelementptr inbounds i8, ptr %dest, i64 2
+ %val = load i8, ptr %gep
+ ret i8 %val
+}
+
+define i17 @forward_load_padding(ptr %src) {
+; CHECK-LABEL: define i17 @forward_load_padding(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[VAL:%.*]] = load i17, ptr [[SRC]], align 1
+; CHECK-NEXT: ret i17 [[VAL]]
+;
+ %dest = alloca [5 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+ %val = load i17, ptr %dest
+ ret i17 %val
+}
+
+define <2 x i8> @forward_load_vector(ptr %src) {
+; CHECK-LABEL: define <2 x i8> @forward_load_vector(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i8>, ptr [[SRC]], align 1
+; CHECK-NEXT: ret <2 x i8> [[TMP1]]
+;
+ %dest = alloca <2 x i8>
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false)
+ %val = load <2 x i8>, ptr %dest
+ ret <2 x i8> %val
+}
+
+; Negative tests
+
+define i24 @forward_load_volatile(ptr %src) {
+; CHECK-LABEL: define i24 @forward_load_volatile(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC]], i64 3, i1 false)
+; CHECK-NEXT: [[VAL:%.*]] = load volatile i24, ptr [[DEST]], align 4
+; CHECK-NEXT: ret i24 [[VAL]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+ %val = load volatile i24, ptr %dest
+ ret i24 %val
+}
+
+define i24 @failed_forward_load_write_src(ptr %src) {
+; CHECK-LABEL: define i24 @failed_forward_load_write_src(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC]], i64 3, i1 false)
+; CHECK-NEXT: store i1 true, ptr [[SRC]], align 1
+; CHECK-NEXT: [[VAL:%.*]] = load i24, ptr [[DEST]], align 4
+; CHECK-NEXT: ret i24 [[VAL]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+ store i1 true, ptr %src
+ %val = load i24, ptr %dest
+ ret i24 %val
+}
+
+define i24 @failed_forward_load_write_dest(ptr %src) {
+; CHECK-LABEL: define i24 @failed_forward_load_write_dest(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC]], i64 3, i1 false)
+; CHECK-NEXT: store i1 true, ptr [[DEST]], align 1
+; CHECK-NEXT: [[VAL:%.*]] = load i24, ptr [[DEST]], align 4
+; CHECK-NEXT: ret i24 [[VAL]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+ store i1 true, ptr %dest
+ %val = load i24, ptr %dest
+ ret i24 %val
+}
+
+define i16 @failed_forward_load_size(ptr %src) {
+; CHECK-LABEL: define i16 @failed_forward_load_size(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[SRC]], align 1
+; CHECK-NEXT: store i8 [[TMP1]], ptr [[DEST]], align 1
+; CHECK-NEXT: [[VAL:%.*]] = load i16, ptr [[DEST]], align 2
+; CHECK-NEXT: ret i16 [[VAL]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 1, i1 false)
+ %val = load i16, ptr %dest
+ ret i16 %val
+}
+
+define i8 @failed_forward_load_gep(ptr %src) {
+; CHECK-LABEL: define i8 @failed_forward_load_gep(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[SRC]], align 1
+; CHECK-NEXT: store i16 [[TMP1]], ptr [[DEST]], align 1
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[DEST]], i64 2
+; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: ret i8 [[VAL]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false)
+ %gep = getelementptr inbounds i8, ptr %dest, i64 2
+ %val = load i8, ptr %gep
+ ret i8 %val
+}
+
+define i8 @failed_forward_load_gep_multi_use(ptr %src) {
+; CHECK-LABEL: define i8 @failed_forward_load_gep_multi_use(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC]], i64 3, i1 false)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[DEST]], i64 2
+; CHECK-NEXT: [[VAL1:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: call void @use_ptr(ptr nonnull [[GEP]])
+; CHECK-NEXT: ret i8 [[VAL1]]
+;
+ %dest = alloca [3 x i8]
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+ %gep = getelementptr inbounds i8, ptr %dest, i64 2
+ %val = load i8, ptr %gep
+ call void @use_ptr(ptr %gep)
+ ret i8 %val
+}
+
+define i24 @failed_forward_load_must_alias(ptr %src) {
+; CHECK-LABEL: define i24 @failed_forward_load_must_alias(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 2
+; CHECK-NEXT: [[DEST_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 2
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST_GEP]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC_GEP]], i64 3, i1 false)
+; CHECK-NEXT: [[VAL:%.*]] = load i24, ptr [[DEST_GEP]], align 4
+; CHECK-NEXT: ret i24 [[VAL]]
+;
+ %src_gep = getelementptr inbounds i8, ptr %src, i64 2
+ %dest_gep = getelementptr inbounds i8, ptr %src, i64 2
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest_gep, ptr %src_gep, i64 3, i1 false)
+ %val = load i24, ptr %dest_gep
+ ret i24 %val
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
+declare void @use_ptr(ptr)
diff --git a/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll b/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
index f084fe38bb226..431870155ae83 100644
--- a/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
+++ b/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
@@ -208,8 +208,7 @@ define i32 @test_memcpy_after_phi(i1 %cond, ptr %ptr) {
; CHECK: join:
; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ [[A]], [[IF]] ], [ [[PTR:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(32) [[PHI]], ptr noundef nonnull align 16 dereferenceable(32) @g1, i64 32, i1 false)
-; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[PHI]], align 4
-; CHECK-NEXT: ret i32 [[V]]
+; CHECK-NEXT: ret i32 0
;
entry:
%a = alloca [32 x i8]
@@ -384,8 +383,7 @@ define i8 @select_after_memcpy_keep_alloca(i1 %cond, ptr %p) {
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [32 x i8], align 1
; CHECK-NEXT: [[PTR:%.*]] = select i1 [[COND:%.*]], ptr [[ALLOCA]], ptr [[P:%.*]]
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(32) [[PTR]], ptr noundef nonnull align 16 dereferenceable(32) @g1, i64 32, i1 false)
-; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[PTR]], align 1
-; CHECK-NEXT: ret i8 [[LOAD]]
+; CHECK-NEXT: ret i8 0
;
entry:
%alloca = alloca [32 x i8]
diff --git a/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
new file mode 100644
index 0000000000000..d5dc213e6d6b6
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -O2 -S < %s | FileCheck %s
+
+define i1 @main(ptr %i2) {
+; CHECK-LABEL: define noundef i1 @main(
+; CHECK-SAME: ptr writeonly captures(none) initializes((0, 3)) [[I2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[COMMON_RET:.*:]]
+; CHECK-NEXT: store i8 0, ptr [[I2]], align 1
+; CHECK-NEXT: [[I3:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 1
+; CHECK-NEXT: store i8 1, ptr [[I3]], align 1
+; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 2
+; CHECK-NEXT: store i8 2, ptr [[I4]], align 1
+; CHECK-NEXT: ret i1 true
+;
+ %i1 = alloca [3 x i8], align 1
+ store i8 0, ptr %i2, align 1
+ %i3 = getelementptr inbounds nuw i8, ptr %i2, i64 1
+ store i8 1, ptr %i3, align 1
+ %i4 = getelementptr inbounds nuw i8, ptr %i2, i64 2
+ store i8 2, ptr %i4, align 1
+ call void @llvm.lifetime.start.p0(i64 3, ptr nonnull %i1)
+ call void @llvm.memcpy.p0.p0.i64(ptr %i1, ptr %i2, i64 3, i1 false)
+ %i5 = load i8, ptr %i1, align 1
+ %i6 = icmp eq i8 %i5, 0
+ %i7 = getelementptr inbounds nuw i8, ptr %i1, i64 1
+ %i8 = load i8, ptr %i7, align 1
+ %i9 = icmp eq i8 %i8, 1
+ %i10 = select i1 %i6, i1 %i9, i1 false
+ %i11 = getelementptr inbounds nuw i8, ptr %i1, i64 2
+ %i12 = load i8, ptr %i11, align 1
+ %i13 = icmp eq i8 %i12, 2
+ %i14 = select i1 %i10, i1 %i13, i1 false
+ br i1 %i14, label %true, label %false
+
+true:
+ call void @llvm.lifetime.end.p0(i64 3, ptr nonnull %i1)
+ ret i1 true
+
+false:
+ call void @assert_failed(ptr %i1)
+ ret i1 false
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
+declare void @llvm.lifetime.start.p0(i64, ptr)
+declare void @llvm.lifetime.end.p0(i64, ptr)
+declare void @assert_failed(ptr)
``````````
</details>
https://github.com/llvm/llvm-project/pull/140249
More information about the llvm-commits
mailing list