[llvm] r269125 - MemCpyOpt: combine local load/store sequences into memcpy.
Tim Northover via llvm-commits
llvm-commits at lists.llvm.org
Tue May 10 14:48:12 PDT 2016
Author: tnorthover
Date: Tue May 10 16:48:11 2016
New Revision: 269125
URL: http://llvm.org/viewvc/llvm-project?rev=269125&view=rev
Log:
MemCpyOpt: combine local load/store sequences into memcpy.
Sort of the BB-local equivalent of the loop idiom recognizer: if a basic block
really implements a memcpy operation, the backend can benefit from seeing it
as one.
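As a condensed illustration (function and value names here are hypothetical;
the full version is the test_simple_memcpy test added below), four adjacent
i32 load/store pairs between noalias pointers now collapse into a single
16-byte memcpy:

  define void @copy4(i32* noalias %dst, i32* noalias %src) {
    %v0 = load i32, i32* %src          ; copy element 0
    store i32 %v0, i32* %dst
    %s1 = getelementptr i32, i32* %src, i32 1
    %d1 = getelementptr i32, i32* %dst, i32 1
    %v1 = load i32, i32* %s1           ; copy element 1
    store i32 %v1, i32* %d1
    %s2 = getelementptr i32, i32* %src, i32 2
    %d2 = getelementptr i32, i32* %dst, i32 2
    %v2 = load i32, i32* %s2           ; copy element 2
    store i32 %v2, i32* %d2
    %s3 = getelementptr i32, i32* %src, i32 3
    %d3 = getelementptr i32, i32* %dst, i32 3
    %v3 = load i32, i32* %s3           ; copy element 3
    store i32 %v3, i32* %d3
    ret void
  }

After "opt -memcpyopt" the body becomes, modulo bitcasts of %dst and %src to
i8*, a single

  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst.i8, i8* %src.i8, i64 16, i32 4, i1 false)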
Added:
llvm/trunk/test/Transforms/MemCpyOpt/form-memcpy.ll
Modified:
llvm/trunk/lib/Transforms/Scalar/MemCpyOptimizer.cpp
Modified: llvm/trunk/lib/Transforms/Scalar/MemCpyOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/MemCpyOptimizer.cpp?rev=269125&r1=269124&r2=269125&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Scalar/MemCpyOptimizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/MemCpyOptimizer.cpp Tue May 10 16:48:11 2016
@@ -38,6 +38,7 @@ using namespace llvm;
#define DEBUG_TYPE "memcpyopt"
STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
+STATISTIC(NumMemCpyInfer, "Number of memcpys inferred");
STATISTIC(NumMemSetInfer, "Number of memsets inferred");
STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
@@ -126,6 +127,18 @@ static bool IsPointerOffset(Value *Ptr1,
return true;
}
+static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
+ const LoadInst *LI) {
+ unsigned StoreAlign = SI->getAlignment();
+ if (!StoreAlign)
+ StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
+ unsigned LoadAlign = LI->getAlignment();
+ if (!LoadAlign)
+ LoadAlign = DL.getABITypeAlignment(LI->getType());
+
+ return std::min(StoreAlign, LoadAlign);
+}
+
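A note on the helper just added: it simply takes the minimum of the two
explicit alignments, falling back to the ABI type alignment for an
unannotated access. For example (mirroring test_infer_align in the new test
file):

  %val.0 = load i32, i32* %src, align 8
  store i32 %val.0, i32* %dst, align 16
  ; findCommonAlignment returns min(16, 8) = 8, so when this pair ends up at
  ; the front of a merged range, the memcpy is emitted with alignment 8.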
/// Represents a range of memset'd bytes with the ByteVal value.
/// This allows us to analyze stores like:
@@ -138,14 +151,16 @@ static bool IsPointerOffset(Value *Ptr1,
/// to [0, 2). The third makes a new range [2, 3). The fourth store joins the
/// two ranges into [0, 3) which is memset'able.
namespace {
-struct MemsetRange {
+struct MemIntrinsicRange {
// Start/End - A semi range that describes the span that this range covers.
// The range is closed at the start and open at the end: [Start, End).
int64_t Start, End;
/// StartPtr - The getelementptr instruction that points to the start of the
/// range.
- Value *StartPtr;
+ Value *DestStartPtr;
+
+ Value *SrcStartPtr;
/// Alignment - The known alignment of the first store.
unsigned Alignment;
@@ -153,21 +168,22 @@ struct MemsetRange {
/// TheStores - The actual stores that make up this range.
SmallVector<Instruction*, 16> TheStores;
- bool isProfitableToUseMemset(const DataLayout &DL) const;
+ bool isProfitableToUseMemIntrinsic(const DataLayout &DL) const;
};
} // end anon namespace
-bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
- // If we found more than 4 stores to merge or 16 bytes, use memset.
+bool MemIntrinsicRange::isProfitableToUseMemIntrinsic(
+ const DataLayout &DL) const {
+  // If we found 4 or more stores to merge, or 16 or more bytes, use a mem
+  // intrinsic.
if (TheStores.size() >= 4 || End-Start >= 16) return true;
// If there is nothing to merge, don't do anything.
if (TheStores.size() < 2) return false;
- // If any of the stores are a memset, then it is always good to extend the
- // memset.
+  // If any of the stores is already a mem intrinsic, it is always good to
+  // extend it.
for (Instruction *SI : TheStores)
- if (!isa<StoreInst>(SI))
+ if (isa<MemIntrinsic>(SI))
return true;
// Assume that the code generator is capable of merging pairs of stores
@@ -201,15 +217,15 @@ bool MemsetRange::isProfitableToUseMemse
namespace {
-class MemsetRanges {
+class MemIntrinsicRanges {
/// A sorted list of the memset ranges.
- SmallVector<MemsetRange, 8> Ranges;
- typedef SmallVectorImpl<MemsetRange>::iterator range_iterator;
+ SmallVector<MemIntrinsicRange, 8> Ranges;
+ typedef SmallVectorImpl<MemIntrinsicRange>::iterator range_iterator;
const DataLayout &DL;
public:
- MemsetRanges(const DataLayout &DL) : DL(DL) {}
+ MemIntrinsicRanges(const DataLayout &DL) : DL(DL) {}
- typedef SmallVectorImpl<MemsetRange>::const_iterator const_iterator;
+ typedef SmallVectorImpl<MemIntrinsicRange>::const_iterator const_iterator;
const_iterator begin() const { return Ranges.begin(); }
const_iterator end() const { return Ranges.end(); }
bool empty() const { return Ranges.empty(); }
@@ -223,17 +239,35 @@ public:
void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
+ unsigned Alignment =
+ SI->getAlignment()
+ ? SI->getAlignment()
+ : DL.getABITypeAlignment(SI->getValueOperand()->getType());
addRange(OffsetFromFirst, StoreSize,
- SI->getPointerOperand(), SI->getAlignment(), SI);
+ SI->getPointerOperand(), nullptr, Alignment, SI);
+ }
+
+ void addLoadStore(int64_t OffsetFromFirst, LoadInst *LI, StoreInst *SI) {
+ int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
+
+ addRange(OffsetFromFirst, StoreSize, SI->getPointerOperand(),
+ LI->getPointerOperand(), findCommonAlignment(DL, SI, LI), SI);
}
void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
- addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI);
+ addRange(OffsetFromFirst, Size, MSI->getDest(), nullptr,
+ MSI->getAlignment(), MSI);
}
- void addRange(int64_t Start, int64_t Size, Value *Ptr,
+ void addMemTransfer(int64_t OffsetFromFirst, MemTransferInst *MTI) {
+ int64_t Size = cast<ConstantInt>(MTI->getLength())->getZExtValue();
+ addRange(OffsetFromFirst, Size, MTI->getDest(), MTI->getSource(),
+ MTI->getAlignment(), MTI);
+ }
+
+ void addRange(int64_t Start, int64_t Size, Value *DestPtr, Value *SrcPtr,
unsigned Alignment, Instruction *Inst);
};
@@ -241,24 +275,26 @@ public:
} // end anon namespace
-/// Add a new store to the MemsetRanges data structure. This adds a
+/// Add a new store to the MemIntrinsicRanges data structure. This adds a
/// new range for the specified store at the specified offset, merging into
/// existing ranges as appropriate.
-void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
- unsigned Alignment, Instruction *Inst) {
+void MemIntrinsicRanges::addRange(int64_t Start, int64_t Size, Value *DestPtr,
+ Value *SrcPtr, unsigned Alignment,
+ Instruction *Inst) {
int64_t End = Start+Size;
range_iterator I = std::lower_bound(Ranges.begin(), Ranges.end(), Start,
- [](const MemsetRange &LHS, int64_t RHS) { return LHS.End < RHS; });
+ [](const MemIntrinsicRange &LHS, int64_t RHS) { return LHS.End < RHS; });
// We now know that I == E, in which case we didn't find anything to merge
// with, or that Start <= I->End. If End < I->Start or I == E, then we need
// to insert a new range. Handle this now.
if (I == Ranges.end() || End < I->Start) {
- MemsetRange &R = *Ranges.insert(I, MemsetRange());
+ MemIntrinsicRange &R = *Ranges.insert(I, MemIntrinsicRange());
R.Start = Start;
R.End = End;
- R.StartPtr = Ptr;
+ R.DestStartPtr = DestPtr;
+ R.SrcStartPtr = SrcPtr;
R.Alignment = Alignment;
R.TheStores.push_back(Inst);
return;
@@ -280,7 +316,8 @@ void MemsetRanges::addRange(int64_t Star
// stopped on *it*.
if (Start < I->Start) {
I->Start = Start;
- I->StartPtr = Ptr;
+ I->DestStartPtr = DestPtr;
+ I->SrcStartPtr = SrcPtr;
I->Alignment = Alignment;
}
@@ -335,7 +372,7 @@ namespace {
// Helper functions
bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
- bool processMemCpy(MemCpyInst *M);
+ bool processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI);
bool processMemMove(MemMoveInst *M);
bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
@@ -345,6 +382,9 @@ namespace {
bool processByValArgument(CallSite CS, unsigned ArgNo);
Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
Value *ByteVal);
+ Instruction *tryMergingIntoMemcpy(Instruction *StartInst,
+ Value *StartDstPtr,
+ Value *StartSrcPtr);
bool iterateOnFunction(Function &F);
};
@@ -378,7 +418,7 @@ Instruction *MemCpyOpt::tryMergingIntoMe
// all subsequent stores of the same value to offset from the same pointer.
// Join these together into ranges, so we can decide whether contiguous blocks
// are stored.
- MemsetRanges Ranges(DL);
+ MemIntrinsicRanges Ranges(DL);
BasicBlock::iterator BI(StartInst);
for (++BI; !isa<TerminatorInst>(BI); ++BI) {
@@ -440,28 +480,22 @@ Instruction *MemCpyOpt::tryMergingIntoMe
// Now that we have full information about ranges, loop over the ranges and
// emit memset's for anything big enough to be worthwhile.
Instruction *AMemSet = nullptr;
- for (const MemsetRange &Range : Ranges) {
+ for (const MemIntrinsicRange &Range : Ranges) {
if (Range.TheStores.size() == 1) continue;
// If it is profitable to lower this range to memset, do so now.
- if (!Range.isProfitableToUseMemset(DL))
+ if (!Range.isProfitableToUseMemIntrinsic(DL))
continue;
// Otherwise, we do want to transform this! Create a new memset.
// Get the starting pointer of the block.
- StartPtr = Range.StartPtr;
-
- // Determine alignment
+ StartPtr = Range.DestStartPtr;
unsigned Alignment = Range.Alignment;
- if (Alignment == 0) {
- Type *EltType =
- cast<PointerType>(StartPtr->getType())->getElementType();
- Alignment = DL.getABITypeAlignment(EltType);
- }
+ assert(!Range.SrcStartPtr && "memset containing transfer instruction?");
- AMemSet =
- Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment);
+ AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End - Range.Start,
+ Alignment);
DEBUG(dbgs() << "Replace stores:\n";
for (Instruction *SI : Range.TheStores)
@@ -482,16 +516,149 @@ Instruction *MemCpyOpt::tryMergingIntoMe
return AMemSet;
}
-static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
- const LoadInst *LI) {
- unsigned StoreAlign = SI->getAlignment();
- if (!StoreAlign)
- StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
- unsigned LoadAlign = LI->getAlignment();
- if (!LoadAlign)
- LoadAlign = DL.getABITypeAlignment(LI->getType());
+/// When scanning forward over instructions, we look for other patterns to
+/// fold away. In particular, this looks for load/store pairs that copy
+/// neighboring locations of memory. If it sees enough consecutive ones, it
+/// attempts to merge them together into a memcpy.
+Instruction *MemCpyOpt::tryMergingIntoMemcpy(Instruction *StartInst,
+ Value *StartDestPtr,
+ Value *StartSrcPtr) {
+ const DataLayout &DL = StartInst->getModule()->getDataLayout();
+ AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
- return std::min(StoreAlign, LoadAlign);
+  // Okay, so we now have a single load/store pair copying between two
+  // pointers. Scan forward for subsequent load/store pairs and memcpys at
+  // constant offsets from the same base pointers. Join these together into
+  // ranges, so we can decide whether contiguous blocks are copied.
+ MemIntrinsicRanges Ranges(DL);
+
+ BasicBlock::iterator BI(StartInst);
+ LoadInst *NextLoad = nullptr;
+  for (; !isa<TerminatorInst>(BI); ++BI) {
+ if (!isa<StoreInst>(BI) && !isa<LoadInst>(BI) &&
+ !isa<MemTransferInst>(BI)) {
+ // If the instruction is readnone, ignore it, otherwise bail out. We
+ // don't even allow readonly here because we don't want something like:
+ // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
+ if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
+ break;
+ continue;
+ }
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(BI)) {
+ if (NextLoad || !LI->isSimple() || !LI->hasOneUse())
+ break;
+ NextLoad = LI;
+ } else if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
+ // If this is a store, see if we can merge it in.
+ if (!NextLoad || NextLoad != NextStore->getValueOperand() ||
+ !NextStore->isSimple())
+ break;
+
+ // Check to see if this store is to a constant offset from the start ptr.
+ int64_t DestOffset;
+ if (!IsPointerOffset(StartDestPtr, NextStore->getPointerOperand(),
+ DestOffset, DL))
+ break;
+
+ int64_t SrcOffset;
+ if (!IsPointerOffset(StartSrcPtr, NextLoad->getPointerOperand(),
+ SrcOffset, DL))
+ break;
+
+ if (DestOffset != SrcOffset)
+ break;
+
+ Ranges.addLoadStore(DestOffset, NextLoad, NextStore);
+ NextLoad = nullptr;
+ } else {
+ MemTransferInst *MTI = cast<MemTransferInst>(BI);
+
+ if (NextLoad || MTI->isVolatile() || !isa<ConstantInt>(MTI->getLength()))
+ break;
+
+      // Check to see if this transfer is to a constant offset from the start ptr.
+ int64_t DestOffset;
+ if (!IsPointerOffset(StartDestPtr, MTI->getDest(), DestOffset, DL))
+ break;
+
+ int64_t SrcOffset;
+ if (!IsPointerOffset(StartSrcPtr, MTI->getSource(), SrcOffset, DL))
+ break;
+
+ if (SrcOffset != DestOffset)
+ break;
+
+ Ranges.addMemTransfer(SrcOffset, MTI);
+ }
+ }
+
+  // If we have no ranges, then we just had a single load/store pair with
+  // nothing that could be merged in. This is a very common case, of course.
+ if (Ranges.empty())
+ return nullptr;
+
+  // If we create any memcpys, we put them right before the first instruction
+  // that isn't part of the merged block. This ensures that the memcpy is
+  // dominated by any addressing instructions needed by the start of the block.
+ IRBuilder<> Builder(&*BI);
+
+  // Now that we have full information about the ranges, loop over them and
+  // emit memcpys for anything big enough to be worthwhile.
+ Instruction *AMemCpy = nullptr;
+ for (const MemIntrinsicRange &Range : Ranges) {
+
+ if (Range.TheStores.size() == 1) continue;
+
+    // If it is profitable to lower this range to a memcpy, do so now.
+ if (!Range.isProfitableToUseMemIntrinsic(DL))
+ continue;
+
+    // Otherwise, we do want to transform this! Create a new memcpy.
+    // Get the starting pointers of the block.
+ Value *DestStartPtr = Range.DestStartPtr;
+ Value *SrcStartPtr = Range.SrcStartPtr;
+ unsigned Alignment = Range.Alignment;
+
+ // We don't keep track of load/store pairs well enough to determine whether
+ // a memmove is permitted for possibly-aliasing addresses (both order and
+ // duplicates matter in that case, possibly in ways only determined
+ // dynamically).
+ uint64_t Size = Range.End - Range.Start;
+ if (!AA.isNoAlias(MemoryLocation(DestStartPtr, Size),
+ MemoryLocation(SrcStartPtr, Size)))
+ continue;
+
+ AMemCpy = Builder.CreateMemCpy(DestStartPtr, SrcStartPtr, Size, Alignment);
+
+ DEBUG(dbgs() << "Replace load/stores:\n";
+ for (Instruction *I : Range.TheStores) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ dbgs() << *SI->getValueOperand() << '\n';
+ dbgs() << *I << '\n';
+ }
+ dbgs() << "With: " << *AMemCpy << '\n');
+
+ if (!Range.TheStores.empty())
+ AMemCpy->setDebugLoc(Range.TheStores[0]->getDebugLoc());
+
+ // Zap all the excess operations.
+ for (Instruction *I : Range.TheStores) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ auto LI = cast<LoadInst>(SI->getValueOperand());
+ MD->removeInstruction(LI);
+ MD->removeInstruction(SI);
+ SI->eraseFromParent();
+ LI->eraseFromParent();
+ } else {
+ MD->removeInstruction(I);
+ I->eraseFromParent();
+ }
+ }
+ ++NumMemCpyInfer;
+ }
+
+ return AMemCpy;
}
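A note on the isNoAlias guard above: when the two regions may overlap, the
transformation bails out entirely rather than forming a memmove
(test_simple_memmove below checks this). A sketch of why, assuming a
hypothetical overlap where %dst is %src plus one i32:

  %v0 = load i32, i32* %src      ; reads src[0]
  store i32 %v0, i32* %dst       ; overwrites src[1]
  %v1 = load i32, i32* %src.1    ; reads src[1], which now holds src[0]
  store i32 %v1, i32* %dst.1     ; overwrites src[2]
  ; The pair-by-pair sequence smears src[0] forward, while
  ; memmove(%dst, %src, 16) would copy the original bytes; the two are not
  ; equivalent, so the range is skipped.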
// This method tries to lift a store instruction before position P.
@@ -662,6 +829,10 @@ bool MemCpyOpt::processStore(StoreInst *
BBI = M->getIterator();
return true;
}
+ } else if (Instruction *I = tryMergingIntoMemcpy(
+ LI, SI->getPointerOperand(), LI->getPointerOperand())) {
+ BBI = I->getIterator();
+ return true;
}
// Detect cases where we're performing call slot forwarding, but
@@ -1124,7 +1295,7 @@ bool MemCpyOpt::performMemCpyToMemSetOpt
/// B to be a memcpy from X to Z (or potentially a memmove, depending on
/// circumstances). This allows later passes to remove the first memcpy
/// altogether.
-bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
+bool MemCpyOpt::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
// We can only optimize non-volatile memcpy's.
if (M->isVolatile()) return false;
@@ -1217,6 +1388,9 @@ bool MemCpyOpt::processMemCpy(MemCpyInst
return true;
}
+ if (auto I = tryMergingIntoMemcpy(M, M->getDest(), M->getSource()))
+ BBI = I->getIterator();
+
return false;
}
@@ -1339,7 +1513,7 @@ bool MemCpyOpt::iterateOnFunction(Functi
else if (MemSetInst *M = dyn_cast<MemSetInst>(I))
RepeatInstruction = processMemSet(M, BI);
else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
- RepeatInstruction = processMemCpy(M);
+ RepeatInstruction = processMemCpy(M, BI);
else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I))
RepeatInstruction = processMemMove(M);
else if (auto CS = CallSite(I)) {
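For reference when reading the CHECK lines in the new test: the memcpy
intrinsic still carries its alignment as an explicit i32 argument here, so a
merged 16-byte copy with common alignment 4 comes out as:

  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 16, i32 4, i1 false)
  ; arguments: dest, source, byte count, alignment, isvolatile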
Added: llvm/trunk/test/Transforms/MemCpyOpt/form-memcpy.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/MemCpyOpt/form-memcpy.ll?rev=269125&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/MemCpyOpt/form-memcpy.ll (added)
+++ llvm/trunk/test/Transforms/MemCpyOpt/form-memcpy.ll Tue May 10 16:48:11 2016
@@ -0,0 +1,353 @@
+; RUN: opt < %s -memcpyopt -S | FileCheck %s
+
+define void @test_simple_memcpy(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_simple_memcpy
+; CHECK-DAG: [[DST:%.*]] = bitcast i32* %dst to i8*
+; CHECK-DAG: [[SRC:%.*]] = bitcast i32* %src to i8*
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST]], i8* [[SRC]], i64 16, i32 4, i1 false)
+
+ %val.0 = load i32, i32* %src
+ store i32 %val.0, i32* %dst
+
+ %src.2 = getelementptr i32, i32* %src, i32 2
+ %dst.2 = getelementptr i32, i32* %dst, i32 2
+ %val.2 = load i32, i32* %src.2
+ store i32 %val.2, i32* %dst.2
+
+ %src.1 = getelementptr i32, i32* %src, i32 1
+ %dst.1 = getelementptr i32, i32* %dst, i32 1
+ %val.1 = load i32, i32* %src.1
+ store i32 %val.1, i32* %dst.1
+
+ %src.3 = getelementptr i32, i32* %src, i32 3
+ %dst.3 = getelementptr i32, i32* %dst, i32 3
+ %val.3 = load i32, i32* %src.3
+ store i32 %val.3, i32* %dst.3
+
+ ret void
+}
+
+define void @test_simple_memmove(i32* %dst, i32* %src) {
+; CHECK-LABEL: @test_simple_memmove
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK-NOT: call void @llvm.memmove
+
+ %val.0 = load i32, i32* %src
+ store i32 %val.0, i32* %dst
+
+ %src.2 = getelementptr i32, i32* %src, i32 2
+ %dst.2 = getelementptr i32, i32* %dst, i32 2
+ %val.2 = load i32, i32* %src.2
+ store i32 %val.2, i32* %dst.2
+
+ %src.1 = getelementptr i32, i32* %src, i32 1
+ %dst.1 = getelementptr i32, i32* %dst, i32 1
+ %val.1 = load i32, i32* %src.1
+ store i32 %val.1, i32* %dst.1
+
+ %src.3 = getelementptr i32, i32* %src, i32 3
+ %dst.3 = getelementptr i32, i32* %dst, i32 3
+ %val.3 = load i32, i32* %src.3
+ store i32 %val.3, i32* %dst.3
+
+ ret void
+}
+
+; Make sure we can handle calculating bases & offsets from a real memcpy.
+define void @test_initial_memcpy(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_initial_memcpy
+; CHECK: {{%.*}} = bitcast i32* %dst to i8*
+; CHECK: {{%.*}} = bitcast i32* %src to i8*
+; CHECK: [[DST:%.*]] = bitcast i32* %dst to i8*
+; CHECK: [[SRC:%.*]] = bitcast i32* %src to i8*
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST]], i8* [[SRC]], i64 16, i32 4, i1 false)
+
+ %dst.0 = bitcast i32* %dst to i8*
+ %src.0 = bitcast i32* %src to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst.0, i8* %src.0, i64 4, i32 4, i1 false)
+
+ %src.2 = getelementptr i32, i32* %src, i32 2
+ %dst.2 = getelementptr i32, i32* %dst, i32 2
+ %val.2 = load i32, i32* %src.2
+ store i32 %val.2, i32* %dst.2
+
+ %src.1 = getelementptr i32, i32* %src, i32 1
+ %dst.1 = getelementptr i32, i32* %dst, i32 1
+ %val.1 = load i32, i32* %src.1
+ store i32 %val.1, i32* %dst.1
+
+ %src.3 = getelementptr i32, i32* %src, i32 3
+ %dst.3 = getelementptr i32, i32* %dst, i32 3
+ %val.3 = load i32, i32* %src.3
+ store i32 %val.3, i32* %dst.3
+
+ ret void
+}
+
+define void @test_volatile_skipped(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_volatile_skipped
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK-NOT: call void @llvm.memmove
+
+ %val.0 = load i32, i32* %src
+ store i32 %val.0, i32* %dst
+
+ %src.2 = getelementptr i32, i32* %src, i32 2
+ %dst.2 = getelementptr i32, i32* %dst, i32 2
+ %val.2 = load volatile i32, i32* %src.2
+ store i32 %val.2, i32* %dst.2
+
+ %src.1 = getelementptr i32, i32* %src, i32 1
+ %dst.1 = getelementptr i32, i32* %dst, i32 1
+ %val.1 = load i32, i32* %src.1
+ store i32 %val.1, i32* %dst.1
+
+ %src.3 = getelementptr i32, i32* %src, i32 3
+ %dst.3 = getelementptr i32, i32* %dst, i32 3
+ %val.3 = load i32, i32* %src.3
+ store i32 %val.3, i32* %dst.3
+
+ ret void
+}
+
+define void @test_atomic_skipped(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_atomic_skipped
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK-NOT: call void @llvm.memmove
+
+ %val.0 = load i32, i32* %src
+ store i32 %val.0, i32* %dst
+
+ %src.2 = getelementptr i32, i32* %src, i32 2
+ %dst.2 = getelementptr i32, i32* %dst, i32 2
+ %val.2 = load i32, i32* %src.2
+ store i32 %val.2, i32* %dst.2
+
+ %src.1 = getelementptr i32, i32* %src, i32 1
+ %dst.1 = getelementptr i32, i32* %dst, i32 1
+ %val.1 = load i32, i32* %src.1
+ store atomic i32 %val.1, i32* %dst.1 unordered, align 4
+
+ %src.3 = getelementptr i32, i32* %src, i32 3
+ %dst.3 = getelementptr i32, i32* %dst, i32 3
+ %val.3 = load i32, i32* %src.3
+ store i32 %val.3, i32* %dst.3
+
+ ret void
+}
+
+define i32 @test_multi_use_skipped(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_multi_use_skipped
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK-NOT: call void @llvm.memmove
+
+ %val.0 = load i32, i32* %src
+ store i32 %val.0, i32* %dst
+
+ %src.2 = getelementptr i32, i32* %src, i32 2
+ %dst.2 = getelementptr i32, i32* %dst, i32 2
+ %val.2 = load i32, i32* %src.2
+ store i32 %val.2, i32* %dst.2
+
+ %src.1 = getelementptr i32, i32* %src, i32 1
+ %dst.1 = getelementptr i32, i32* %dst, i32 1
+ %val.1 = load i32, i32* %src.1
+ store i32 %val.1, i32* %dst.1
+
+ %src.3 = getelementptr i32, i32* %src, i32 3
+ %dst.3 = getelementptr i32, i32* %dst, i32 3
+ %val.3 = load i32, i32* %src.3
+ store i32 %val.3, i32* %dst.3
+
+ ret i32 %val.1
+}
+
+define void @test_side_effect_skipped(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_side_effect_skipped
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK-NOT: call void @llvm.memmove
+
+ %val.0 = load i32, i32* %src
+ store i32 %val.0, i32* %dst
+
+ %src.2 = getelementptr i32, i32* %src, i32 2
+ %dst.2 = getelementptr i32, i32* %dst, i32 2
+ %val.2 = load i32, i32* %src.2
+ store i32 %val.2, i32* %dst.2
+
+ call void asm sideeffect "", ""()
+
+ %src.1 = getelementptr i32, i32* %src, i32 1
+ %dst.1 = getelementptr i32, i32* %dst, i32 1
+ %val.1 = load i32, i32* %src.1
+ store i32 %val.1, i32* %dst.1
+
+ %src.3 = getelementptr i32, i32* %src, i32 3
+ %dst.3 = getelementptr i32, i32* %dst, i32 3
+ %val.3 = load i32, i32* %src.3
+ store i32 %val.3, i32* %dst.3
+
+ ret void
+}
+
+define void @test_holes_handled(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_holes_handled
+; CHECK-DAG: [[DST:%.*]] = bitcast i32* %dst to i8*
+; CHECK-DAG: [[SRC:%.*]] = bitcast i32* %src to i8*
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST]], i8* [[SRC]], i64 16, i32 4, i1 false)
+; CHECK-DAG: [[DST:%.*]] = bitcast i32* %dst.7 to i8*
+; CHECK-DAG: [[SRC:%.*]] = bitcast i32* %src.7 to i8*
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST]], i8* [[SRC]], i64 16, i32 4, i1 false)
+
+ %val.0 = load i32, i32* %src
+ store i32 %val.0, i32* %dst
+
+ %src.2 = getelementptr i32, i32* %src, i32 2
+ %dst.2 = getelementptr i32, i32* %dst, i32 2
+ %val.2 = load i32, i32* %src.2
+ store i32 %val.2, i32* %dst.2
+
+ %src.1 = getelementptr i32, i32* %src, i32 1
+ %dst.1 = getelementptr i32, i32* %dst, i32 1
+ %val.1 = load i32, i32* %src.1
+ store i32 %val.1, i32* %dst.1
+
+ %src.3 = getelementptr i32, i32* %src, i32 3
+ %dst.3 = getelementptr i32, i32* %dst, i32 3
+ %val.3 = load i32, i32* %src.3
+ store i32 %val.3, i32* %dst.3
+
+
+ %src.7 = getelementptr i32, i32* %src, i32 7
+ %dst.7 = getelementptr i32, i32* %dst, i32 7
+ %val.7 = load i32, i32* %src.7
+ store i32 %val.7, i32* %dst.7
+
+ %src.9 = getelementptr i32, i32* %src, i32 9
+ %dst.9 = getelementptr i32, i32* %dst, i32 9
+ %val.9 = load i32, i32* %src.9
+ store i32 %val.9, i32* %dst.9
+
+ %src.10 = getelementptr i32, i32* %src, i32 10
+ %dst.10 = getelementptr i32, i32* %dst, i32 10
+ %val.10 = load i32, i32* %src.10
+ store i32 %val.10, i32* %dst.10
+
+ %src.8 = getelementptr i32, i32* %src, i32 8
+ %dst.8 = getelementptr i32, i32* %dst, i32 8
+ %val.8 = load i32, i32* %src.8
+ store i32 %val.8, i32* %dst.8
+
+ ret void
+}
+
+define void @test_offset_mismatch(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_offset_mismatch
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK-NOT: call void @llvm.memmove
+
+ %val.0 = load i32, i32* %src
+ store i32 %val.0, i32* %dst
+
+ %src.2 = getelementptr i32, i32* %src, i32 2
+ %dst.2 = getelementptr i32, i32* %dst, i32 1
+ %val.2 = load i32, i32* %src.2
+ store i32 %val.2, i32* %dst.2
+
+ %src.1 = getelementptr i32, i32* %src, i32 1
+ %dst.1 = getelementptr i32, i32* %dst, i32 2
+ %val.1 = load i32, i32* %src.1
+ store i32 %val.1, i32* %dst.1
+
+ %src.3 = getelementptr i32, i32* %src, i32 3
+ %dst.3 = getelementptr i32, i32* %dst, i32 3
+ %val.3 = load i32, i32* %src.3
+ store i32 %val.3, i32* %dst.3
+
+ ret void
+}
+
+define void @test_non_idempotent_ops(i8* %dst, i8* %src) {
+; CHECK-LABEL: @test_non_idempotent_ops
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK-NOT: call void @llvm.memmove
+
+ %val.0 = load i8, i8* %src
+ store i8 %val.0, i8* %dst
+
+ %src.2 = getelementptr i8, i8* %src, i8 2
+ %dst.2 = getelementptr i8, i8* %dst, i8 2
+ %val.2 = load i8, i8* %src.2
+ store i8 %val.2, i8* %dst.2
+
+ %val.0.dup = load i8, i8* %src
+ store i8 %val.0.dup, i8* %dst
+
+ %src.1 = getelementptr i8, i8* %src, i8 1
+ %dst.1 = getelementptr i8, i8* %dst, i8 1
+ %val.1 = load i8, i8* %src.1
+ store i8 %val.1, i8* %dst.1
+
+ %src.3 = getelementptr i8, i8* %src, i8 3
+ %dst.3 = getelementptr i8, i8* %dst, i8 3
+ %val.3 = load i8, i8* %src.3
+ store i8 %val.3, i8* %dst.3
+
+ ret void
+}
+
+define void @test_intervening_op(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_intervening_op
+; CHECK-NOT: call void @llvm.memcpy
+
+ %val.0 = load i32, i32* %src
+ store i32 %val.0, i32* %dst
+
+ %src.2 = getelementptr i32, i32* %src, i32 2
+ %src16.2 = bitcast i32* %src.2 to i16*
+ %dst.2 = getelementptr i32, i32* %dst, i32 2
+ %val16.2 = load i16, i16* %src16.2
+ %val.2 = sext i16 %val16.2 to i32
+ store i32 %val.2, i32* %dst.2
+
+ %src.1 = getelementptr i32, i32* %src, i32 1
+ %dst.1 = getelementptr i32, i32* %dst, i32 1
+ %val.1 = load i32, i32* %src.1
+ store i32 %val.1, i32* %dst.1
+
+ %src.3 = getelementptr i32, i32* %src, i32 3
+ %dst.3 = getelementptr i32, i32* %dst, i32 3
+ %val.3 = load i32, i32* %src.3
+ store i32 %val.3, i32* %dst.3
+
+ ret void
+}
+
+define void @test_infer_align(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_infer_align
+; CHECK-DAG: [[DST:%.*]] = bitcast i32* %dst to i8*
+; CHECK-DAG: [[SRC:%.*]] = bitcast i32* %src to i8*
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST]], i8* [[SRC]], i64 16, i32 8, i1 false)
+
+ %src.2 = getelementptr i32, i32* %src, i32 2
+ %dst.2 = getelementptr i32, i32* %dst, i32 2
+ %val.2 = load i32, i32* %src.2
+ store i32 %val.2, i32* %dst.2
+
+ %val.0 = load i32, i32* %src, align 8
+ store i32 %val.0, i32* %dst, align 16
+
+ %src.1 = getelementptr i32, i32* %src, i32 1
+ %dst.1 = getelementptr i32, i32* %dst, i32 1
+ %val.1 = load i32, i32* %src.1
+ store i32 %val.1, i32* %dst.1
+
+ %src.3 = getelementptr i32, i32* %src, i32 3
+ %dst.3 = getelementptr i32, i32* %dst, i32 3
+ %val.3 = load i32, i32* %src.3
+ store i32 %val.3, i32* %dst.3
+
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)