[llvm] da1cdff - [loop-idiom] Hoist loop memcpys to loop preheader
Han Zhu via llvm-commits
llvm-commits at lists.llvm.org
Tue May 4 17:07:03 PDT 2021
Author: Han Zhu
Date: 2021-05-04T17:05:04-07:00
New Revision: da1cdffbb1b729d221360a76e1c0793992b05dfc
URL: https://github.com/llvm/llvm-project/commit/da1cdffbb1b729d221360a76e1c0793992b05dfc
DIFF: https://github.com/llvm/llvm-project/commit/da1cdffbb1b729d221360a76e1c0793992b05dfc.diff
LOG: [loop-idiom] Hoist loop memcpys to loop preheader
For a simple loop like:
```
struct S {
int x;
int y;
char b;
};
unsigned foo(S* __restrict__ a, S* b, int n) {
for (int i = 0; i < n; i++)
a[i] = b[i];
return sizeof(a[0]);
}
```
We could eliminate the loop and convert it to a large memcpy of 12*n bytes. Currently this is not handled. Output of `opt -loop-idiom -S < memcpy_before.ll`
```
%struct.S = type { i32, i32, i8 }
define dso_local i32 @_Z3fooP1SS0_i(%struct.S* noalias nocapture %a, %struct.S* nocapture readonly %b, i32 %n) local_unnamed_addr {
entry:
%cmp7 = icmp sgt i32 %n, 0
br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
br label %for.body
for.cond.cleanup.loopexit: ; preds = %for.body
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
ret i32 12
for.body: ; preds = %for.body, %for.body.preheader
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
%idxprom = zext i32 %i.08 to i64
%arrayidx = getelementptr inbounds %struct.S, %struct.S* %b, i64 %idxprom
%arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %a, i64 %idxprom
%0 = bitcast %struct.S* %arrayidx2 to i8*
%1 = bitcast %struct.S* %arrayidx to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %0, i8* nonnull align 4 dereferenceable(12) %1, i64 12, i1 false)
%inc = add nuw nsw i32 %i.08, 1
%cmp = icmp slt i32 %inc, %n
br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
}
; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #0
attributes #0 = { argmemonly nofree nosync nounwind willreturn }
```
The loop idiom pass currently only handles load and store instructions. Since struct S is too big to fit in a register, the loop body contains a memcpy intrinsic.
With this change, re-run `opt -loop-idiom -S < memcpy_before.ll`. The loop memcpy is promoted to loop preheader. For this trivial case, the loop is dead and will be removed by another pass.
```
%struct.S = type { i32, i32, i8 }
define dso_local i32 @_Z3fooP1SS0_i(%struct.S* noalias nocapture %a, %struct.S* nocapture readonly %b, i32 %n) local_unnamed_addr {
entry:
%a1 = bitcast %struct.S* %a to i8*
%b2 = bitcast %struct.S* %b to i8*
%cmp7 = icmp sgt i32 %n, 0
br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%0 = zext i32 %n to i64
%1 = mul nuw nsw i64 %0, 12
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %a1, i8* align 4 %b2, i64 %1, i1 false)
br label %for.body
for.cond.cleanup.loopexit: ; preds = %for.body
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
ret i32 12
for.body: ; preds = %for.body, %for.body.preheader
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
%idxprom = zext i32 %i.08 to i64
%arrayidx = getelementptr inbounds %struct.S, %struct.S* %b, i64 %idxprom
%arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %a, i64 %idxprom
%2 = bitcast %struct.S* %arrayidx2 to i8*
%3 = bitcast %struct.S* %arrayidx to i8*
%inc = add nuw nsw i32 %i.08, 1
%cmp = icmp slt i32 %inc, %n
br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
}
; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #0
attributes #0 = { argmemonly nofree nosync nounwind willreturn }
```
Reviewed By: zino
Differential Revision: https://reviews.llvm.org/D97667
Added:
llvm/test/Transforms/LoopIdiom/memcpy-intrinsic-different-types.ll
llvm/test/Transforms/LoopIdiom/memcpy-intrinsic.ll
Modified:
llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll
llvm/test/Transforms/LoopIdiom/memset-debugify-remarks.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index c9bce17fcde68..34185e06719e5 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -205,6 +205,13 @@ class LoopIdiomRecognize {
enum class ForMemset { No, Yes };
bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
ForMemset For);
+
+ template <typename MemInst>
+ bool processLoopMemIntrinsic(
+ BasicBlock *BB,
+ bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
+ const SCEV *BECount);
+ bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
@@ -635,22 +642,10 @@ bool LoopIdiomRecognize::runOnLoopBlock(
for (auto &SI : StoreRefsForMemcpy)
MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
- Instruction *Inst = &*I++;
- // Look for memset instructions, which may be optimized to a larger memset.
- if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
- WeakTrackingVH InstPtr(&*I);
- if (!processLoopMemSet(MSI, BECount))
- continue;
- MadeChange = true;
-
- // If processing the memset invalidated our iterator, start over from the
- // top of the block.
- if (!InstPtr)
- I = BB->begin();
- continue;
- }
- }
+ MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
+ BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
+ MadeChange |= processLoopMemIntrinsic<MemSetInst>(
+ BB, &LoopIdiomRecognize::processLoopMemSet, BECount);
return MadeChange;
}
@@ -799,6 +794,100 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
return Changed;
}
+/// processLoopMemIntrinsic - Template function for calling different processor
+/// functions based on mem instrinsic type.
+template <typename MemInst>
+bool LoopIdiomRecognize::processLoopMemIntrinsic(
+ BasicBlock *BB,
+ bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
+ const SCEV *BECount) {
+ bool MadeChange = false;
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *Inst = &*I++;
+ // Look for memory instructions, which may be optimized to a larger one.
+ if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
+ WeakTrackingVH InstPtr(&*I);
+ if (!(this->*Processor)(MI, BECount))
+ continue;
+ MadeChange = true;
+
+ // If processing the instruction invalidated our iterator, start over from
+ // the top of the block.
+ if (!InstPtr)
+ I = BB->begin();
+ }
+ }
+ return MadeChange;
+}
+
+/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy
+bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
+ const SCEV *BECount) {
+ // We can only handle non-volatile memcpys with a constant size.
+ if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
+ return false;
+
+ // If we're not allowed to hack on memcpy, we fail.
+ if (!HasMemcpy || DisableLIRP::Memcpy)
+ return false;
+
+ Value *Dest = MCI->getDest();
+ Value *Source = MCI->getSource();
+ if (!Dest || !Source)
+ return false;
+
+ // See if the load and store pointer expressions are AddRec like {base,+,1} on
+ // the current loop, which indicates a strided load and store. If we have
+ // something else, it's a random load or store we can't handle.
+ const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
+ if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
+ return false;
+ const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
+ if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
+ return false;
+
+ // Reject memcpys that are so large that they overflow an unsigned.
+ uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
+ if ((SizeInBytes >> 32) != 0)
+ return false;
+
+ // Check if the stride matches the size of the memcpy. If so, then we know
+ // that every byte is touched in the loop.
+ const SCEVConstant *StoreStride =
+ dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
+ const SCEVConstant *LoadStride =
+ dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
+ if (!StoreStride || !LoadStride)
+ return false;
+
+ APInt StoreStrideValue = StoreStride->getAPInt();
+ APInt LoadStrideValue = LoadStride->getAPInt();
+ // Huge stride value - give up
+ if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64)
+ return false;
+
+ if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
+ << ore::NV("Inst", "memcpy") << " in "
+ << ore::NV("Function", MCI->getFunction())
+ << " function will not be hoised: "
+ << ore::NV("Reason", "memcpy size is not equal to stride");
+ });
+ return false;
+ }
+
+ int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
+ int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
+ // Check if the load stride matches the store stride.
+ if (StoreStrideInt != LoadStrideInt)
+ return false;
+
+ return processLoopStoreOfLoopLoad(Dest, Source, (unsigned)SizeInBytes,
+ MCI->getDestAlign(), MCI->getSourceAlign(),
+ MCI, MCI, StoreEv, LoadEv, BECount);
+}
+
/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
const SCEV *BECount) {
@@ -807,7 +896,7 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
return false;
// If we're not allowed to hack on memset, we fail.
- if (!HasMemset)
+ if (!HasMemset || DisableLIRP::Memset)
return false;
Value *Pointer = MSI->getDest();
@@ -1047,9 +1136,11 @@ bool LoopIdiomRecognize::processLoopStridedStore(
ORE.emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore",
NewCall->getDebugLoc(), Preheader)
- << "Transformed loop-strided store into a call to "
+ << "Transformed loop-strided store in "
+ << ore::NV("Function", TheStore->getFunction())
+ << " function into a call to "
<< ore::NV("NewFunction", NewCall->getCalledFunction())
- << "() function";
+ << "() intrinsic";
});
// Okay, the memset has been formed. Zap the original store and anything that
@@ -1137,9 +1228,22 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
SmallPtrSet<Instruction *, 1> Stores;
Stores.insert(TheStore);
+
+ bool IsMemCpy = isa<MemCpyInst>(TheStore);
+ const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";
+
if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
- StoreSize, *AA, Stores))
+ StoreSize, *AA, Stores)) {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
+ TheStore)
+ << ore::NV("Inst", InstRemark) << " in "
+ << ore::NV("Function", TheStore->getFunction())
+ << " function will not be hoisted: "
+ << ore::NV("Reason", "The loop may access store location");
+ });
return Changed;
+ }
const SCEV *LdStart = LoadEv->getStart();
unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
@@ -1153,9 +1257,21 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
Value *LoadBasePtr = Expander.expandCodeFor(
LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
+ // If the store is a memcpy instruction, we must check if it will write to
+ // the load memory locations. So remove it from the ignored stores.
+ if (IsMemCpy)
+ Stores.erase(TheStore);
if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
- StoreSize, *AA, Stores))
+ StoreSize, *AA, Stores)) {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
+ << ore::NV("Inst", InstRemark) << " in "
+ << ore::NV("Function", TheStore->getFunction())
+ << " function will not be hoisted: "
+ << ore::NV("Reason", "The loop may access load location");
+ });
return Changed;
+ }
if (avoidLIRForMultiBlockLoop())
return Changed;
@@ -1216,7 +1332,9 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
NewCall->getDebugLoc(), Preheader)
<< "Formed a call to "
<< ore::NV("NewFunction", NewCall->getCalledFunction())
- << "() function";
+ << "() intrinsic from " << ore::NV("Inst", InstRemark)
+ << " instruction in " << ore::NV("Function", TheStore->getFunction())
+ << " function";
});
// Okay, the memcpy has been formed. Zap the original store and anything that
diff --git a/llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll b/llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll
index 3578540cc4d2d..6f817f2b56d83 100644
--- a/llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll
+++ b/llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-unknown-linux-gnu"
; Check that everything still works when debuginfo is present, and that it is reasonably propagated.
-; CHECK: remark: <stdin>:6:1: Formed a call to llvm.memcpy.p0i8.p0i8.i64() function
+; CHECK: remark: <stdin>:6:1: Formed a call to llvm.memcpy.p0i8.p0i8.i64() intrinsic from load and store instruction in test6_dest_align function
define void @test6_dest_align(i32* noalias align 1 %Base, i32* noalias align 4 %Dest, i64 %Size) nounwind ssp {
; CHECK-LABEL: @test6_dest_align(
diff --git a/llvm/test/Transforms/LoopIdiom/memcpy-intrinsic-different-types.ll b/llvm/test/Transforms/LoopIdiom/memcpy-intrinsic-different-types.ll
new file mode 100644
index 0000000000000..20def758e6316
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/memcpy-intrinsic-different-types.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-idiom < %s -S | FileCheck %s
+
+; #include <vector>
+;
+; class SDValue {
+; int A;
+; int B;
+; unsigned C;
+; };
+;
+; class SDUse {
+; SDValue Val;
+; SDUse **Prev = nullptr;
+; SDUse *Next = nullptr;
+;
+; public:
+; operator const SDValue&() const { return Val; }
+; };
+;
+; void foo(SDUse *S, int N) {
+; // Should not hoist memcpy because source and destination are of different types
+; std::vector<SDValue> Ops(S, S + N);
+; }
+
+; ModuleID = 'different_types.cpp'
+source_filename = "different_types.cpp"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%class.SDUse = type { %class.SDValue, %class.SDUse**, %class.SDUse* }
+%class.SDValue = type { i32, i32, i32 }
+
+declare dso_local i32 @__gxx_personality_v0(...)
+
+; Function Attrs: uwtable mustprogress
+define linkonce_odr dso_local %class.SDValue* @_ZNSt20__uninitialized_copyILb0EE13__uninit_copyIP5SDUseP7SDValueEET0_T_S7_S6_(%class.SDUse* %__first, %class.SDUse* %__last, %class.SDValue* %__result) local_unnamed_addr #0 align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+; CHECK-LABEL: @_ZNSt20__uninitialized_copyILb0EE13__uninit_copyIP5SDUseP7SDValueEET0_T_S7_S6_(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP_NOT15:%.*]] = icmp eq %class.SDUse* [[__FIRST:%.*]], [[__LAST:%.*]]
+; CHECK-NEXT: br i1 [[CMP_NOT15]], label [[FOR_END:%.*]], label [[FOR_INC_PREHEADER:%.*]]
+; CHECK: for.inc.preheader:
+; CHECK-NEXT: br label [[FOR_INC:%.*]]
+; CHECK: for.inc:
+; CHECK-NEXT: [[__CUR_017:%.*]] = phi %class.SDValue* [ [[INCDEC_PTR1:%.*]], [[FOR_INC]] ], [ [[__RESULT:%.*]], [[FOR_INC_PREHEADER]] ]
+; CHECK-NEXT: [[__FIRST_ADDR_016:%.*]] = phi %class.SDUse* [ [[INCDEC_PTR:%.*]], [[FOR_INC]] ], [ [[__FIRST]], [[FOR_INC_PREHEADER]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast %class.SDValue* [[__CUR_017]] to i8*
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast %class.SDUse* [[__FIRST_ADDR_016]] to i8*
+; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 4 dereferenceable(12) [[TMP0]], i8* noundef nonnull align 8 dereferenceable(12) [[TMP1]], i64 12, i1 false)
+; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds [[CLASS_SDUSE:%.*]], %class.SDUse* [[__FIRST_ADDR_016]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds [[CLASS_SDVALUE:%.*]], %class.SDValue* [[__CUR_017]], i64 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq %class.SDUse* [[INCDEC_PTR]], [[__LAST]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_INC]]
+; CHECK: for.end.loopexit:
+; CHECK-NEXT: [[INCDEC_PTR1_LCSSA:%.*]] = phi %class.SDValue* [ [[INCDEC_PTR1]], [[FOR_INC]] ]
+; CHECK-NEXT: br label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: [[__CUR_0_LCSSA:%.*]] = phi %class.SDValue* [ [[__RESULT]], [[ENTRY:%.*]] ], [ [[INCDEC_PTR1_LCSSA]], [[FOR_END_LOOPEXIT]] ]
+; CHECK-NEXT: ret %class.SDValue* [[__CUR_0_LCSSA]]
+;
+entry:
+ %cmp.not15 = icmp eq %class.SDUse* %__first, %__last
+ br i1 %cmp.not15, label %for.end, label %for.inc.preheader
+
+for.inc.preheader: ; preds = %entry
+ br label %for.inc
+
+for.inc: ; preds = %for.inc.preheader, %for.inc
+ %__cur.017 = phi %class.SDValue* [ %incdec.ptr1, %for.inc ], [ %__result, %for.inc.preheader ]
+ %__first.addr.016 = phi %class.SDUse* [ %incdec.ptr, %for.inc ], [ %__first, %for.inc.preheader ]
+ %0 = bitcast %class.SDValue* %__cur.017 to i8*
+ %1 = bitcast %class.SDUse* %__first.addr.016 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 4 dereferenceable(12) %0, i8* noundef nonnull align 8 dereferenceable(12) %1, i64 12, i1 false)
+ %incdec.ptr = getelementptr inbounds %class.SDUse, %class.SDUse* %__first.addr.016, i64 1
+ %incdec.ptr1 = getelementptr inbounds %class.SDValue, %class.SDValue* %__cur.017, i64 1
+ %cmp.not = icmp eq %class.SDUse* %incdec.ptr, %__last
+ br i1 %cmp.not, label %for.end.loopexit, label %for.inc
+
+for.end.loopexit: ; preds = %for.inc
+ %incdec.ptr1.lcssa = phi %class.SDValue* [ %incdec.ptr1, %for.inc ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %__cur.0.lcssa = phi %class.SDValue* [ %__result, %entry ], [ %incdec.ptr1.lcssa, %for.end.loopexit ]
+ ret %class.SDValue* %__cur.0.lcssa
+}
+
+; Function Attrs: argmemonly nofree nosync nounwind willreturn
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1
diff --git a/llvm/test/Transforms/LoopIdiom/memcpy-intrinsic.ll b/llvm/test/Transforms/LoopIdiom/memcpy-intrinsic.ll
new file mode 100644
index 0000000000000..a8b59213b86c7
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/memcpy-intrinsic.ll
@@ -0,0 +1,434 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-idiom < %s -S | FileCheck %s
+
+%struct.S = type { i32, i32, i8 }
+
+; unsigned copy_noalias(S* __restrict a, S *b, int n) {
+; for (int i = 0; i < n; i++) {
+; a[i] = b[i];
+; }
+; return sizeof(a[0]);
+; }
+
+; Function Attrs: nofree nounwind uwtable mustprogress
+define dso_local i32 @copy_noalias(%struct.S* noalias nocapture %a, %struct.S* nocapture readonly %b, i32 %n) local_unnamed_addr #0 {
+; CHECK-LABEL: @copy_noalias(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A1:%.*]] = bitcast %struct.S* [[A:%.*]] to i8*
+; CHECK-NEXT: [[B2:%.*]] = bitcast %struct.S* [[B:%.*]] to i8*
+; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i64 [[TMP0]], 12
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[A1]], i8* align 4 [[B2]], i64 [[TMP1]], i1 false)
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret i32 12
+; CHECK: for.body:
+; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.S* [[ARRAYIDX2]] to i8*
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.S* [[ARRAYIDX]] to i8*
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+;
+entry:
+ %cmp7 = icmp sgt i32 %n, 0
+ br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret i32 12
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %idxprom = zext i32 %i.08 to i64
+ %arrayidx = getelementptr inbounds %struct.S, %struct.S* %b, i64 %idxprom
+ %arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %a, i64 %idxprom
+ %0 = bitcast %struct.S* %arrayidx2 to i8*
+ %1 = bitcast %struct.S* %arrayidx to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %0, i8* nonnull align 4 dereferenceable(12) %1, i64 12, i1 false)
+ %inc = add nuw nsw i32 %i.08, 1
+ %cmp = icmp slt i32 %inc, %n
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+; unsigned copy_may_alias(S *a, S *b, int n) {
+; for (int i = 0; i < n; i++) {
+; a[i] = b[i];
+; }
+; return sizeof(a[0]);
+; }
+
+; Function Attrs: nofree nounwind uwtable mustprogress
+define dso_local i32 @copy_may_alias(%struct.S* nocapture %a, %struct.S* nocapture readonly %b, i32 %n) local_unnamed_addr #0 {
+; CHECK-LABEL: @copy_may_alias(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret i32 12
+; CHECK: for.body:
+; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[B:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[A:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.S* [[ARRAYIDX2]] to i8*
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.S* [[ARRAYIDX]] to i8*
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) [[TMP0]], i8* nonnull align 4 dereferenceable(12) [[TMP1]], i64 12, i1 false)
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+;
+entry:
+ %cmp7 = icmp sgt i32 %n, 0
+ br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret i32 12
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %idxprom = zext i32 %i.08 to i64
+ %arrayidx = getelementptr inbounds %struct.S, %struct.S* %b, i64 %idxprom
+ %arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %a, i64 %idxprom
+ %0 = bitcast %struct.S* %arrayidx2 to i8*
+ %1 = bitcast %struct.S* %arrayidx to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %0, i8* nonnull align 4 dereferenceable(12) %1, i64 12, i1 false)
+ %inc = add nuw nsw i32 %i.08, 1
+ %cmp = icmp slt i32 %inc, %n
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+%struct.R = type <{ i8, i32, i8 }>
+
+; void copy_noalias_read(S* __restrict x, S* __restrict y, int n, int &s) {
+; for (int i = 0; i < n; i++) {
+; x[i] = y[i];
+; s += y[i].b;
+; }
+; }
+
+; Function Attrs: nofree nounwind uwtable mustprogress
+define dso_local void @copy_noalias_read(%struct.R* noalias nocapture %x, %struct.R* noalias nocapture readonly %y, i32 %n, i32* nocapture nonnull align 4 dereferenceable(4) %s) local_unnamed_addr #0 {
+; CHECK-LABEL: @copy_noalias_read(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[X1:%.*]] = bitcast %struct.R* [[X:%.*]] to i8*
+; CHECK-NEXT: [[Y2:%.*]] = bitcast %struct.R* [[Y:%.*]] to i8*
+; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: [[S_PROMOTED:%.*]] = load i32, i32* [[S:%.*]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i64 [[TMP0]], 6
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[X1]], i8* align 1 [[Y2]], i64 [[TMP1]], i1 false)
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.for.cond.cleanup_crit_edge:
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: store i32 [[ADD_LCSSA]], i32* [[S]], align 4
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+; CHECK: for.body:
+; CHECK-NEXT: [[ADD13:%.*]] = phi i32 [ [[S_PROMOTED]], [[FOR_BODY_LR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_012]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_R:%.*]], %struct.R* [[X]], i64 [[IDXPROM]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_R]], %struct.R* [[Y]], i64 [[IDXPROM]], i32 0
+; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_R]], %struct.R* [[Y]], i64 [[IDXPROM]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[B]], align 1
+; CHECK-NEXT: [[ADD]] = add nsw i32 [[ADD13]], [[TMP4]]
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_012]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]]
+;
+entry:
+ %cmp11 = icmp sgt i32 %n, 0
+ br i1 %cmp11, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %entry
+ %s.promoted = load i32, i32* %s, align 4
+ br label %for.body
+
+for.cond.for.cond.cleanup_crit_edge: ; preds = %for.body
+ %add.lcssa = phi i32 [ %add, %for.body ]
+ store i32 %add.lcssa, i32* %s, align 4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.for.cond.cleanup_crit_edge, %entry
+ ret void
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %add13 = phi i32 [ %s.promoted, %for.body.lr.ph ], [ %add, %for.body ]
+ %i.012 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %idxprom = zext i32 %i.012 to i64
+ %0 = getelementptr inbounds %struct.R, %struct.R* %x, i64 %idxprom, i32 0
+ %1 = getelementptr inbounds %struct.R, %struct.R* %y, i64 %idxprom, i32 0
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(6) %0, i8* nonnull align 1 dereferenceable(6) %1, i64 6, i1 false)
+ %b = getelementptr inbounds %struct.R, %struct.R* %y, i64 %idxprom, i32 1
+ %2 = load i32, i32* %b, align 1
+ %add = add nsw i32 %add13, %2
+ %inc = add nuw nsw i32 %i.012, 1
+ %cmp = icmp slt i32 %inc, %n
+ br i1 %cmp, label %for.body, label %for.cond.for.cond.cleanup_crit_edge
+}
+
+; unsigned copy_noalias_negative_stride(S* __restrict__ a, S* b, int n) {
+; for (int i = n; i >= 0; i--) {
+; a[i] = b[i];
+; }
+; return sizeof(a[0]);
+; }
+
+; Function Attrs: nofree nosync nounwind uwtable mustprogress
+define dso_local i32 @copy_noalias_negative_stride(%struct.S* noalias nocapture %0, %struct.S* nocapture readonly %1, i32 %2) local_unnamed_addr #0 {
+; CHECK-LABEL: @copy_noalias_negative_stride(
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast %struct.S* [[TMP0:%.*]] to i8*
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast %struct.S* [[TMP1:%.*]] to i8*
+; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[TMP2:%.*]], -1
+; CHECK-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP12:%.*]]
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP2]] to i64
+; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i64 [[TMP8]], 12
+; CHECK-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP9]], 12
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP4]], i8* align 4 [[TMP5]], i64 [[TMP10]], i1 false)
+; CHECK-NEXT: br label [[TMP13:%.*]]
+; CHECK: 11:
+; CHECK-NEXT: br label [[TMP12]]
+; CHECK: 12:
+; CHECK-NEXT: ret i32 12
+; CHECK: 13:
+; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[TMP20:%.*]], [[TMP13]] ], [ [[TMP2]], [[TMP7]] ]
+; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[TMP1]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP0]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast %struct.S* [[TMP17]] to i8*
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast %struct.S* [[TMP16]] to i8*
+; CHECK-NEXT: [[TMP20]] = add nsw i32 [[TMP14]], -1
+; CHECK-NEXT: [[TMP21:%.*]] = icmp sgt i32 [[TMP14]], 0
+; CHECK-NEXT: br i1 [[TMP21]], label [[TMP13]], label [[TMP11:%.*]]
+;
+ %4 = icmp sgt i32 %2, -1
+ br i1 %4, label %5, label %7
+
+5: ; preds = %3
+ br label %8
+
+6: ; preds = %8
+ br label %7
+
+7: ; preds = %6, %3
+ ret i32 12
+
+8: ; preds = %5, %8
+ %9 = phi i32 [ %15, %8 ], [ %2, %5 ]
+ %10 = zext i32 %9 to i64
+ %11 = getelementptr inbounds %struct.S, %struct.S* %1, i64 %10
+ %12 = getelementptr inbounds %struct.S, %struct.S* %0, i64 %10
+ %13 = bitcast %struct.S* %12 to i8*
+ %14 = bitcast %struct.S* %11 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 4 dereferenceable(12) %13, i8* noundef nonnull align 4 dereferenceable(12) %14, i64 12, i1 false)
+ %15 = add nsw i32 %9, -1
+ %16 = icmp sgt i32 %9, 0
+ br i1 %16, label %8, label %6
+}
+
+; Negative test: the destination index (i) counts up from 0 while the source
+; index (j) counts down from n. The expected output below keeps the 12-byte
+; memcpy inside the loop body and emits no memcpy in the preheader.
+;
+; unsigned copy_noalias_opposite_stride(S* __restrict__ a, S* b, int n) {
+; for (int i = 0, j = n; i < n && j >= 0; i++, j--) {
+; a[i] = b[j];
+; }
+; return sizeof(a[0]);
+; }
+
+; Function Attrs: nofree nosync nounwind uwtable mustprogress
+define dso_local i32 @copy_noalias_opposite_stride(%struct.S* noalias nocapture %0, %struct.S* nocapture readonly %1, i32 %2) local_unnamed_addr #0 {
+; CHECK-LABEL: @copy_noalias_opposite_stride(
+; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2:%.*]], 0
+; CHECK-NEXT: br i1 [[TMP4]], label [[TMP5:%.*]], label [[TMP7:%.*]]
+; CHECK: 5:
+; CHECK-NEXT: br label [[TMP8:%.*]]
+; CHECK: 6:
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 7:
+; CHECK-NEXT: ret i32 12
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = phi i32 [ [[TMP18:%.*]], [[TMP8]] ], [ [[TMP2]], [[TMP5]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP17:%.*]], [[TMP8]] ], [ 0, [[TMP5]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP9]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[TMP1:%.*]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP0:%.*]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast %struct.S* [[TMP14]] to i8*
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast %struct.S* [[TMP12]] to i8*
+; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 4 dereferenceable(12) [[TMP15]], i8* noundef nonnull align 4 dereferenceable(12) [[TMP16]], i64 12, i1 false)
+; CHECK-NEXT: [[TMP17]] = add nuw nsw i32 [[TMP10]], 1
+; CHECK-NEXT: [[TMP18]] = add nsw i32 [[TMP9]], -1
+; CHECK-NEXT: [[TMP19:%.*]] = icmp slt i32 [[TMP17]], [[TMP2]]
+; CHECK-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP9]], 0
+; CHECK-NEXT: [[TMP21:%.*]] = and i1 [[TMP19]], [[TMP20]]
+; CHECK-NEXT: br i1 [[TMP21]], label [[TMP8]], label [[TMP6:%.*]]
+;
+ %4 = icmp sgt i32 %2, 0
+ br i1 %4, label %5, label %7
+
+5: ; preds = %3
+ br label %8
+
+6: ; preds = %8
+ br label %7
+
+7: ; preds = %6, %3
+ ret i32 12
+
+8: ; preds = %5, %8
+; %9 is j: starts at n (%2) and is decremented each iteration; it indexes the source %1.
+; %10 is i: starts at 0 and is incremented each iteration; it indexes the destination %0.
+ %9 = phi i32 [ %18, %8 ], [ %2, %5 ]
+ %10 = phi i32 [ %17, %8 ], [ 0, %5 ]
+ %11 = zext i32 %9 to i64
+ %12 = getelementptr inbounds %struct.S, %struct.S* %1, i64 %11
+ %13 = zext i32 %10 to i64
+ %14 = getelementptr inbounds %struct.S, %struct.S* %0, i64 %13
+ %15 = bitcast %struct.S* %14 to i8*
+ %16 = bitcast %struct.S* %12 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 4 dereferenceable(12) %15, i8* noundef nonnull align 4 dereferenceable(12) %16, i64 12, i1 false)
+ %17 = add nuw nsw i32 %10, 1
+ %18 = add nsw i32 %9, -1
+; Continue only while both the incremented i is below n and the pre-decrement j is positive.
+ %19 = icmp slt i32 %17, %2
+ %20 = icmp sgt i32 %9, 0
+ %21 = and i1 %19, %20
+ br i1 %21, label %8, label %6
+}
+
+; Positive test with a packed struct: each iteration copies 9 bytes with align 1
+; from %b to the noalias %a at the same index. The expected output below drops
+; the per-iteration memcpy and emits a single memcpy of 9 * zext(n) bytes,
+; align 1, in for.body.preheader.
+%struct.SPacked = type <{ i32, i32, i8 }>
+
+; Function Attrs: nofree nounwind uwtable mustprogress
+define dso_local i32 @copy_noalias_packed(%struct.SPacked* noalias nocapture %a, %struct.SPacked* nocapture readonly %b, i32 %n) local_unnamed_addr #0 {
+; CHECK-LABEL: @copy_noalias_packed(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A1:%.*]] = bitcast %struct.SPacked* [[A:%.*]] to i8*
+; CHECK-NEXT: [[B2:%.*]] = bitcast %struct.SPacked* [[B:%.*]] to i8*
+; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i64 [[TMP0]], 9
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[A1]], i8* align 1 [[B2]], i64 [[TMP1]], i1 false)
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret i32 9
+; CHECK: for.body:
+; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_SPACKED:%.*]], %struct.SPacked* [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_SPACKED]], %struct.SPacked* [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.SPacked* [[ARRAYIDX2]] to i8*
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.SPacked* [[ARRAYIDX]] to i8*
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+;
+entry:
+ %cmp7 = icmp sgt i32 %n, 0
+ br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret i32 9
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %idxprom = zext i32 %i.08 to i64
+ %arrayidx = getelementptr inbounds %struct.SPacked, %struct.SPacked* %b, i64 %idxprom
+ %arrayidx2 = getelementptr inbounds %struct.SPacked, %struct.SPacked* %a, i64 %idxprom
+ %0 = bitcast %struct.SPacked* %arrayidx2 to i8*
+ %1 = bitcast %struct.SPacked* %arrayidx to i8*
+; Per-element copy a[i] = b[i]: 9 bytes, align 1.
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(9) %0, i8* nonnull align 1 dereferenceable(9) %1, i64 9, i1 false)
+ %inc = add nuw nsw i32 %i.08, 1
+ %cmp = icmp slt i32 %inc, %n
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+; Positive test with a 16-byte struct (explicit [7 x i8] tail member): each
+; iteration copies 16 bytes with align 16 from %b to the noalias %a at the same
+; index. The expected output below drops the per-iteration memcpy and emits a
+; single memcpy of zext(n) << 4 bytes, align 16, in for.body.preheader.
+%struct.SAligned = type { i32, i32, i8, [7 x i8] }
+
+define dso_local i32 @copy_noalias_aligned(%struct.SAligned* noalias nocapture %a, %struct.SAligned* nocapture readonly %b, i32 %n) local_unnamed_addr #0 {
+; CHECK-LABEL: @copy_noalias_aligned(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A1:%.*]] = bitcast %struct.SAligned* [[A:%.*]] to i8*
+; CHECK-NEXT: [[B2:%.*]] = bitcast %struct.SAligned* [[B:%.*]] to i8*
+; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A1]], i8* align 16 [[B2]], i64 [[TMP1]], i1 false)
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret i32 16
+; CHECK: for.body:
+; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_SALIGNED:%.*]], %struct.SAligned* [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_SALIGNED]], %struct.SAligned* [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.SAligned* [[ARRAYIDX2]] to i8*
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.SAligned* [[ARRAYIDX]] to i8*
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+;
+entry:
+ %cmp7 = icmp sgt i32 %n, 0
+ br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret i32 16
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %idxprom = zext i32 %i.08 to i64
+ %arrayidx = getelementptr inbounds %struct.SAligned, %struct.SAligned* %b, i64 %idxprom
+ %arrayidx2 = getelementptr inbounds %struct.SAligned, %struct.SAligned* %a, i64 %idxprom
+ %0 = bitcast %struct.SAligned* %arrayidx2 to i8*
+ %1 = bitcast %struct.SAligned* %arrayidx to i8*
+; Per-element copy a[i] = b[i]: 16 bytes, align 16.
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 16 dereferenceable(16) %0, i8* nonnull align 16 dereferenceable(16) %1, i64 16, i1 false)
+ %inc = add nuw nsw i32 %i.08, 1
+ %cmp = icmp slt i32 %inc, %n
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+; Declaration of the memcpy intrinsic called by the test functions in this file.
+; Function Attrs: argmemonly nofree nosync nounwind willreturn
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1
diff --git a/llvm/test/Transforms/LoopIdiom/memset-debugify-remarks.ll b/llvm/test/Transforms/LoopIdiom/memset-debugify-remarks.ll
index 06e17fecec6da..b7a866f446c74 100644
--- a/llvm/test/Transforms/LoopIdiom/memset-debugify-remarks.ll
+++ b/llvm/test/Transforms/LoopIdiom/memset-debugify-remarks.ll
@@ -11,7 +11,7 @@ target triple = "x86_64-unknown-linux-gnu"
; *begin = value;
; }
-; CHECK: remark: <stdin>:4:1: Transformed loop-strided store into a call to llvm.memset.p0i8.i64() function
+; CHECK: remark: <stdin>:4:1: Transformed loop-strided store in _Z15my_basic_memsetPcS_c function into a call to llvm.memset.p0i8.i64() intrinsic
define void @_Z15my_basic_memsetPcS_c(i8* %ptr, i8* %end, i8 %value) {
; CHECK-LABEL: @_Z15my_basic_memsetPcS_c(
More information about the llvm-commits
mailing list