[llvm] r227149 - Commoning of target specific load/store intrinsics in Early CSE.
Chad Rosier
mcrosier at codeaurora.org
Mon Jan 26 14:51:15 PST 2015
Author: mcrosier
Date: Mon Jan 26 16:51:15 2015
New Revision: 227149
URL: http://llvm.org/viewvc/llvm-project?rev=227149&view=rev
Log:
Commoning of target specific load/store intrinsics in Early CSE.
Phabricator revision: http://reviews.llvm.org/D7121
Patch by Sanjin Sijaric <ssijaric at codeaurora.org>!
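With this change, EarlyCSE can forward values between matching target-defined
load/store intrinsics just as it does for plain loads and stores. As a minimal
sketch of the AArch64 case (value names hypothetical), a ld2 that re-reads what
a prior st2 wrote through the same pointer:

  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %v0, <4 x i32> %v1, i8* %p)
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %p)

can now be eliminated, with its result rebuilt from the stored operands via
getOrCreateResultFromMemIntrinsic:

  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %v0, <4 x i32> %v1, i8* %p)
  ; ld2 removed; its aggregate result is reassembled from the st2 operands
  %tmp = insertvalue { <4 x i32>, <4 x i32> } undef, <4 x i32> %v0, 0
  %vld2 = insertvalue { <4 x i32>, <4 x i32> } %tmp, <4 x i32> %v1, 1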
Added:
llvm/trunk/test/Transforms/EarlyCSE/AArch64/
llvm/trunk/test/Transforms/EarlyCSE/AArch64/intrinsics.ll
llvm/trunk/test/Transforms/EarlyCSE/AArch64/lit.local.cfg
Modified:
llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
llvm/trunk/lib/Transforms/Scalar/EarlyCSE.cpp
Modified: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h?rev=227149&r1=227148&r2=227149&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h (original)
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h Mon Jan 26 16:51:15 2015
@@ -23,6 +23,7 @@
#define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Pass.h"
#include "llvm/Support/DataTypes.h"
@@ -35,6 +36,20 @@ class Type;
class User;
class Value;
+/// \brief Information about a load/store intrinsic defined by the target.
+struct MemIntrinsicInfo {
+ MemIntrinsicInfo()
+ : ReadMem(false), WriteMem(false), Vol(false), MatchingId(0),
+ NumMemRefs(0), PtrVal(nullptr) {}
+ bool ReadMem;
+ bool WriteMem;
+ bool Vol;
+ // The target sets the same Id for corresponding load/store intrinsic pairs.
+ unsigned short MatchingId;
+ int NumMemRefs;
+ Value *PtrVal;
+};
+
/// TargetTransformInfo - This pass provides access to the codegen
/// interfaces that are needed for IR-level transformations.
class TargetTransformInfo {
@@ -443,6 +458,20 @@ public:
/// any callee-saved registers, so would require a spill and fill.
virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const;
+ /// \returns True if the intrinsic is a supported memory intrinsic. Info
+ /// will contain additional information: whether the intrinsic may read
+ /// from or write to memory, whether it is volatile, and the pointer
+ /// operand. Info is undefined if false is returned.
+ virtual bool getTgtMemIntrinsic(IntrinsicInst *Inst,
+ MemIntrinsicInfo &Info) const;
+
+ /// \returns A value that is the result of the given memory intrinsic. New
+ /// instructions may be created to extract the result from the given
+ /// intrinsic memory operation. Returns nullptr if the target cannot create
+ /// a result from the given intrinsic.
+ virtual Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+ Type *ExpectedType) const;
+
/// @}
/// Analysis group identification.
Modified: llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Analysis/TargetTransformInfo.cpp?rev=227149&r1=227148&r2=227149&view=diff
==============================================================================
--- llvm/trunk/lib/Analysis/TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Analysis/TargetTransformInfo.cpp Mon Jan 26 16:51:15 2015
@@ -254,6 +254,16 @@ unsigned TargetTransformInfo::getCostOfK
return PrevTTI->getCostOfKeepingLiveOverCall(Tys);
}
+Value *TargetTransformInfo::getOrCreateResultFromMemIntrinsic(
+ IntrinsicInst *Inst, Type *ExpectedType) const {
+ return PrevTTI->getOrCreateResultFromMemIntrinsic(Inst, ExpectedType);
+}
+
+bool TargetTransformInfo::getTgtMemIntrinsic(IntrinsicInst *Inst,
+ MemIntrinsicInfo &Info) const {
+ return PrevTTI->getTgtMemIntrinsic(Inst, Info);
+}
+
namespace {
struct NoTTI final : ImmutablePass, TargetTransformInfo {
@@ -656,6 +666,15 @@ struct NoTTI final : ImmutablePass, Targ
return 0;
}
+ bool getTgtMemIntrinsic(IntrinsicInst *Inst,
+ MemIntrinsicInfo &Info) const override {
+ return false;
+ }
+
+ Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+ Type *ExpectedType) const override {
+ return nullptr;
+ }
};
} // end anonymous namespace
Modified: llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp?rev=227149&r1=227148&r2=227149&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp Mon Jan 26 16:51:15 2015
@@ -44,6 +44,12 @@ class AArch64TTI final : public Immutabl
/// are set if the result needs to be inserted and/or extracted from vectors.
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+ enum MemIntrinsicType {
+ VECTOR_LDST_TWO_ELEMENTS,
+ VECTOR_LDST_THREE_ELEMENTS,
+ VECTOR_LDST_FOUR_ELEMENTS
+ };
+
public:
AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
llvm_unreachable("This pass cannot be directly constructed");
@@ -131,6 +137,11 @@ public:
void getUnrollingPreferences(const Function *F, Loop *L,
UnrollingPreferences &UP) const override;
+ Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+ Type *ExpectedType) const override;
+
+ bool getTgtMemIntrinsic(IntrinsicInst *Inst,
+ MemIntrinsicInfo &Info) const override;
/// @}
};
@@ -554,3 +565,83 @@ void AArch64TTI::getUnrollingPreferences
// Disable partial & runtime unrolling on -Os.
UP.PartialOptSizeThreshold = 0;
}
+
+Value *AArch64TTI::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+ Type *ExpectedType) const {
+ switch (Inst->getIntrinsicID()) {
+ default:
+ return nullptr;
+ case Intrinsic::aarch64_neon_st2:
+ case Intrinsic::aarch64_neon_st3:
+ case Intrinsic::aarch64_neon_st4: {
+ // The expected result must be a struct type whose element types match the
+ // values being stored.
+ StructType *ST = dyn_cast<StructType>(ExpectedType);
+ if (!ST)
+ return nullptr;
+ unsigned NumElts = Inst->getNumArgOperands() - 1;
+ if (ST->getNumElements() != NumElts)
+ return nullptr;
+ for (unsigned i = 0, e = NumElts; i != e; ++i) {
+ if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
+ return nullptr;
+ }
+ Value *Res = UndefValue::get(ExpectedType);
+ IRBuilder<> Builder(Inst);
+ for (unsigned i = 0, e = NumElts; i != e; ++i) {
+ Value *L = Inst->getArgOperand(i);
+ Res = Builder.CreateInsertValue(Res, L, i);
+ }
+ return Res;
+ }
+ case Intrinsic::aarch64_neon_ld2:
+ case Intrinsic::aarch64_neon_ld3:
+ case Intrinsic::aarch64_neon_ld4:
+ if (Inst->getType() == ExpectedType)
+ return Inst;
+ return nullptr;
+ }
+}
+
+bool AArch64TTI::getTgtMemIntrinsic(IntrinsicInst *Inst,
+ MemIntrinsicInfo &Info) const {
+ switch (Inst->getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::aarch64_neon_ld2:
+ case Intrinsic::aarch64_neon_ld3:
+ case Intrinsic::aarch64_neon_ld4:
+ Info.ReadMem = true;
+ Info.WriteMem = false;
+ Info.Vol = false;
+ Info.NumMemRefs = 1;
+ Info.PtrVal = Inst->getArgOperand(0);
+ break;
+ case Intrinsic::aarch64_neon_st2:
+ case Intrinsic::aarch64_neon_st3:
+ case Intrinsic::aarch64_neon_st4:
+ Info.ReadMem = false;
+ Info.WriteMem = true;
+ Info.Vol = false;
+ Info.NumMemRefs = 1;
+ Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
+ break;
+ }
+
+ switch (Inst->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::aarch64_neon_ld2:
+ case Intrinsic::aarch64_neon_st2:
+ Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
+ break;
+ case Intrinsic::aarch64_neon_ld3:
+ case Intrinsic::aarch64_neon_st3:
+ Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
+ break;
+ case Intrinsic::aarch64_neon_ld4:
+ case Intrinsic::aarch64_neon_st4:
+ Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
+ break;
+ }
+ return true;
+}
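The MatchingId is what keeps mismatched pairs apart: st2/ld2 map to
VECTOR_LDST_TWO_ELEMENTS, st3/ld3 to VECTOR_LDST_THREE_ELEMENTS, and st4/ld4
to VECTOR_LDST_FOUR_ELEMENTS. A minimal sketch of a pair EarlyCSE must leave
alone (value names hypothetical):

  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %v0, <4 x i32> %v1, i8* %p)
  ; different MatchingId (two vs. three elements), so the ld3 is not commoned
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %p)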
Modified: llvm/trunk/lib/Transforms/Scalar/EarlyCSE.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/EarlyCSE.cpp?rev=227149&r1=227148&r2=227149&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Scalar/EarlyCSE.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/EarlyCSE.cpp Mon Jan 26 16:51:15 2015
@@ -18,6 +18,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
@@ -273,6 +274,7 @@ class EarlyCSE : public FunctionPass {
public:
const DataLayout *DL;
const TargetLibraryInfo *TLI;
+ const TargetTransformInfo *TTI;
DominatorTree *DT;
AssumptionCache *AC;
typedef RecyclingAllocator<
@@ -383,14 +385,83 @@ private:
bool Processed;
};
+ /// \brief Wrapper class to handle memory instructions, including loads,
+ /// stores, and target-defined load/store intrinsics.
+ class ParseMemoryInst {
+ public:
+ ParseMemoryInst(Instruction *Inst, const TargetTransformInfo *TTI)
+ : Load(false), Store(false), Vol(false), MayReadFromMemory(false),
+ MayWriteToMemory(false), MatchingId(-1), Ptr(nullptr) {
+ MayReadFromMemory = Inst->mayReadFromMemory();
+ MayWriteToMemory = Inst->mayWriteToMemory();
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ MemIntrinsicInfo Info;
+ if (!TTI->getTgtMemIntrinsic(II, Info))
+ return;
+ if (Info.NumMemRefs == 1) {
+ Store = Info.WriteMem;
+ Load = Info.ReadMem;
+ MatchingId = Info.MatchingId;
+ MayReadFromMemory = Info.ReadMem;
+ MayWriteToMemory = Info.WriteMem;
+ Vol = Info.Vol;
+ Ptr = Info.PtrVal;
+ }
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ Load = true;
+ Vol = !LI->isSimple();
+ Ptr = LI->getPointerOperand();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ Store = true;
+ Vol = !SI->isSimple();
+ Ptr = SI->getPointerOperand();
+ }
+ }
+ bool isLoad() { return Load; }
+ bool isStore() { return Store; }
+ bool isVolatile() { return Vol; }
+ bool isMatchingMemLoc(const ParseMemoryInst &Inst) {
+ return Ptr == Inst.Ptr && MatchingId == Inst.MatchingId;
+ }
+ bool isValid() { return Ptr != nullptr; }
+ int getMatchingId() { return MatchingId; }
+ Value *getPtr() { return Ptr; }
+ bool mayReadFromMemory() { return MayReadFromMemory; }
+ bool mayWriteToMemory() { return MayWriteToMemory; }
+
+ private:
+ bool Load;
+ bool Store;
+ bool Vol;
+ bool MayReadFromMemory;
+ bool MayWriteToMemory;
+ // For regular (non-intrinsic) loads/stores, this is set to -1. For
+ // intrinsic loads/stores, the id is retrieved from the corresponding
+ // field in the MemIntrinsicInfo structure. That field contains
+ // non-negative values only.
+ int MatchingId;
+ Value *Ptr;
+ };
+
bool processNode(DomTreeNode *Node);
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfo>();
AU.setPreservesCFG();
}
+
+ Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+ return LI;
+ else if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ return SI->getValueOperand();
+ assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
+ return TTI->getOrCreateResultFromMemIntrinsic(cast<IntrinsicInst>(Inst),
+ ExpectedType);
+ }
};
}
@@ -420,7 +491,7 @@ bool EarlyCSE::processNode(DomTreeNode *
/// as long as there is no instruction that reads memory. If we see a store
/// to the same location, we delete the dead store. This zaps trivial dead
/// stores which can occur in bitfield code among other things.
- StoreInst *LastStore = nullptr;
+ Instruction *LastStore = nullptr;
bool Changed = false;
@@ -475,10 +546,11 @@ bool EarlyCSE::processNode(DomTreeNode *
continue;
}
+ ParseMemoryInst MemInst(Inst, TTI);
// If this is a non-volatile load, process it.
- if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ if (MemInst.isValid() && MemInst.isLoad()) {
// Ignore volatile loads.
- if (!LI->isSimple()) {
+ if (MemInst.isVolatile()) {
LastStore = nullptr;
continue;
}
@@ -486,27 +558,35 @@ bool EarlyCSE::processNode(DomTreeNode *
// If we have an available version of this load, and if it is the right
// generation, replace this instruction.
std::pair<Value *, unsigned> InVal =
- AvailableLoads->lookup(Inst->getOperand(0));
+ AvailableLoads->lookup(MemInst.getPtr());
if (InVal.first != nullptr && InVal.second == CurrentGeneration) {
- DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
- << " to: " << *InVal.first << '\n');
- if (!Inst->use_empty())
- Inst->replaceAllUsesWith(InVal.first);
- Inst->eraseFromParent();
- Changed = true;
- ++NumCSELoad;
- continue;
+ Value *Op = getOrCreateResult(InVal.first, Inst->getType());
+ if (Op != nullptr) {
+ DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
+ << " to: " << *InVal.first << '\n');
+ if (!Inst->use_empty())
+ Inst->replaceAllUsesWith(Op);
+ Inst->eraseFromParent();
+ Changed = true;
+ ++NumCSELoad;
+ continue;
+ }
}
// Otherwise, remember that we have this instruction.
- AvailableLoads->insert(Inst->getOperand(0), std::pair<Value *, unsigned>(
- Inst, CurrentGeneration));
+ AvailableLoads->insert(MemInst.getPtr(), std::pair<Value *, unsigned>(
+ Inst, CurrentGeneration));
LastStore = nullptr;
continue;
}
// If this instruction may read from memory, forget LastStore.
- if (Inst->mayReadFromMemory())
+ // Load/store intrinsics will indicate both a read and a write to
+ // memory. The target may override this (e.g. so that a store intrinsic
+ // does not read from memory, and thus will be treated the same as a
+ // regular store for commoning purposes).
+ if (Inst->mayReadFromMemory() &&
+ !(MemInst.isValid() && !MemInst.mayReadFromMemory()))
LastStore = nullptr;
// If this is a read-only call, process it.
@@ -537,17 +617,19 @@ bool EarlyCSE::processNode(DomTreeNode *
if (Inst->mayWriteToMemory()) {
++CurrentGeneration;
- if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ if (MemInst.isValid() && MemInst.isStore()) {
// We do a trivial form of DSE if there are two stores to the same
// location with no intervening loads. Delete the earlier store.
- if (LastStore &&
- LastStore->getPointerOperand() == SI->getPointerOperand()) {
- DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
- << " due to: " << *Inst << '\n');
- LastStore->eraseFromParent();
- Changed = true;
- ++NumDSE;
- LastStore = nullptr;
+ if (LastStore) {
+ ParseMemoryInst LastStoreMemInst(LastStore, TTI);
+ if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
+ DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
+ << " due to: " << *Inst << '\n');
+ LastStore->eraseFromParent();
+ Changed = true;
+ ++NumDSE;
+ LastStore = nullptr;
+ }
// fallthrough - we can exploit information about this store
}
@@ -556,13 +638,12 @@ bool EarlyCSE::processNode(DomTreeNode *
// version of the pointer. It is safe to forward from volatile stores
// to non-volatile loads, so we don't have to check for volatility of
// the store.
- AvailableLoads->insert(SI->getPointerOperand(),
- std::pair<Value *, unsigned>(
- SI->getValueOperand(), CurrentGeneration));
+ AvailableLoads->insert(MemInst.getPtr(), std::pair<Value *, unsigned>(
+ Inst, CurrentGeneration));
// Remember that this was the last store we saw for DSE.
- if (SI->isSimple())
- LastStore = SI;
+ if (!MemInst.isVolatile())
+ LastStore = Inst;
}
}
}
@@ -584,6 +665,7 @@ bool EarlyCSE::runOnFunction(Function &F
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
DL = DLP ? &DLP->getDataLayout() : nullptr;
TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ TTI = &getAnalysis<TargetTransformInfo>();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
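Since LastStore is now a plain Instruction and dead-store matching goes through
isMatchingMemLoc (same pointer and same MatchingId), EarlyCSE's trivial DSE
also fires on intrinsic stores. A minimal sketch (value names hypothetical):
given two st2 calls to the same pointer with no intervening read,

  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %a, <4 x i32> %b, i8* %p)
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %c, <4 x i32> %d, i8* %p)

the first call is deleted as a dead store, which is what the test_cse2 case in
the new test file checks.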
Added: llvm/trunk/test/Transforms/EarlyCSE/AArch64/intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/EarlyCSE/AArch64/intrinsics.ll?rev=227149&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/EarlyCSE/AArch64/intrinsics.ll (added)
+++ llvm/trunk/test/Transforms/EarlyCSE/AArch64/intrinsics.ll Mon Jan 26 16:51:15 2015
@@ -0,0 +1,231 @@
+; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -early-cse | FileCheck %s
+
+define <4 x i32> @test_cse(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
+entry:
+; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
+; CHECK-LABEL: @test_cse
+; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
+ %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+ %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+ %cmp = icmp slt i32 %i.0, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %0 = bitcast i32* %a to i8*
+ %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
+ %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
+ %3 = bitcast <16 x i8> %1 to <4 x i32>
+ %4 = bitcast <16 x i8> %2 to <4 x i32>
+ call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
+ %5 = bitcast i32* %a to i8*
+ %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
+ %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
+ %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
+ %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret <4 x i32> %res.0
+}
+
+define <4 x i32> @test_cse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
+entry:
+; Check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE.
+; CHECK-LABEL: @test_cse2
+; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
+; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
+ %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+ %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+ %cmp = icmp slt i32 %i.0, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %0 = bitcast i32* %a to i8*
+ %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
+ %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
+ %3 = bitcast <16 x i8> %1 to <4 x i32>
+ %4 = bitcast <16 x i8> %2 to <4 x i32>
+ call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
+ call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
+ %5 = bitcast i32* %a to i8*
+ %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
+ %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
+ %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
+ %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret <4 x i32> %res.0
+}
+
+define <4 x i32> @test_cse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 {
+entry:
+; Check that the second, redundant @llvm.aarch64.neon.ld2 is optimized away
+; by Early CSE.
+; CHECK-LABEL: @test_cse3
+; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
+; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
+ %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+ %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+ %cmp = icmp slt i32 %i.0, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %0 = bitcast i32* %a to i8*
+ %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %0)
+ %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
+ %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
+ %1 = bitcast i32* %a to i8*
+ %vld22 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %1)
+ %vld22.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 0
+ %vld22.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 1
+ %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld22.fca.0.extract)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret <4 x i32> %res.0
+}
+
+
+define <4 x i32> @test_nocse(i32* %a, i32* %b, [2 x <4 x i32>] %s.coerce, i32 %n) {
+entry:
+; Check that the store prevents @llvm.aarch64.neon.ld2 from being optimized
+; away by Early CSE.
+; CHECK-LABEL: @test_nocse
+; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
+ %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+ %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+ %cmp = icmp slt i32 %i.0, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %0 = bitcast i32* %a to i8*
+ %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
+ %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
+ %3 = bitcast <16 x i8> %1 to <4 x i32>
+ %4 = bitcast <16 x i8> %2 to <4 x i32>
+ call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
+ store i32 0, i32* %b, align 4
+ %5 = bitcast i32* %a to i8*
+ %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
+ %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
+ %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
+ %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret <4 x i32> %res.0
+}
+
+define <4 x i32> @test_nocse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
+entry:
+; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due
+; to mismatch between st2 and ld3.
+; CHECK-LABEL: @test_nocse2
+; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8
+ %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+ %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+ %cmp = icmp slt i32 %i.0, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %0 = bitcast i32* %a to i8*
+ %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
+ %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
+ %3 = bitcast <16 x i8> %1 to <4 x i32>
+ %4 = bitcast <16 x i8> %2 to <4 x i32>
+ call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
+ %5 = bitcast i32* %a to i8*
+ %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
+ %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
+ %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
+ %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.2.extract)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret <4 x i32> %res.0
+}
+
+define <4 x i32> @test_nocse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
+entry:
+; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to
+; mismatch between st2 and st3.
+; CHECK-LABEL: @test_nocse3
+; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8
+; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8
+ %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
+ %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
+ %cmp = icmp slt i32 %i.0, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %0 = bitcast i32* %a to i8*
+ %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
+ %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
+ %3 = bitcast <16 x i8> %1 to <4 x i32>
+ %4 = bitcast <16 x i8> %2 to <4 x i32>
+ call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> %4, <4 x i32> %3, <4 x i32> %3, i8* %0)
+ call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
+ %5 = bitcast i32* %a to i8*
+ %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
+ %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
+ %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
+ %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.0.extract)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret <4 x i32> %res.0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32>, <4 x i32>, i8* nocapture)
+
+; Function Attrs: nounwind
+declare void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32>, <4 x i32>, <4 x i32>, i8* nocapture)
+
+; Function Attrs: nounwind readonly
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8*)
+
+; Function Attrs: nounwind readonly
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8*)
+
+define internal fastcc <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) {
+entry:
+ %add = add <4 x i32> %__p0, %__p1
+ ret <4 x i32> %add
+}
Added: llvm/trunk/test/Transforms/EarlyCSE/AArch64/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/EarlyCSE/AArch64/lit.local.cfg?rev=227149&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/EarlyCSE/AArch64/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/EarlyCSE/AArch64/lit.local.cfg Mon Jan 26 16:51:15 2015
@@ -0,0 +1,5 @@
+config.suffixes = ['.ll']
+
+targets = set(config.root.targets_to_build.split())
+if not 'AArch64' in targets:
+ config.unsupported = True