[llvm] r363581 - [LV] Suppress vectorization in some nontemporal cases
Warren Ristow via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 17 10:20:08 PDT 2019
Author: wristow
Date: Mon Jun 17 10:20:08 2019
New Revision: 363581
URL: http://llvm.org/viewvc/llvm-project?rev=363581&view=rev
Log:
[LV] Suppress vectorization in some nontemporal cases
When considering a loop containing nontemporal stores or loads for
vectorization, suppress the vectorization if the corresponding
vectorized store or load with the alignment of the original scalar
memory op is not supported with the nontemporal hint on the target.
This adds two new functions:
bool isLegalNTStore(Type *DataType, unsigned Alignment) const;
bool isLegalNTLoad(Type *DataType, unsigned Alignment) const;
to TTI, leaving the target independent default implementation as
returning true, but with overriding implementations for X86 that
check the legality based on available Subtarget features.
This fixes https://llvm.org/PR40759
Differential Revision: https://reviews.llvm.org/D61764
Added:
llvm/trunk/test/Transforms/LoopVectorize/X86/nontemporal.ll
Modified:
llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h
llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/trunk/test/Transforms/LoopVectorize/nontemporal.ll
Modified: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h?rev=363581&r1=363580&r2=363581&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h (original)
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h Mon Jun 17 10:20:08 2019
@@ -531,6 +531,11 @@ public:
/// Return true if the target supports masked store.
bool isLegalMaskedLoad(Type *DataType) const;
+ /// Return true if the target supports nontemporal store.
+ bool isLegalNTStore(Type *DataType, unsigned Alignment) const;
+ /// Return true if the target supports nontemporal load.
+ bool isLegalNTLoad(Type *DataType, unsigned Alignment) const;
+
/// Return true if the target supports masked scatter.
bool isLegalMaskedScatter(Type *DataType) const;
/// Return true if the target supports masked gather.
@@ -1118,6 +1123,8 @@ public:
virtual bool shouldFavorBackedgeIndex(const Loop *L) const = 0;
virtual bool isLegalMaskedStore(Type *DataType) = 0;
virtual bool isLegalMaskedLoad(Type *DataType) = 0;
+ virtual bool isLegalNTStore(Type *DataType, unsigned Alignment) = 0;
+ virtual bool isLegalNTLoad(Type *DataType, unsigned Alignment) = 0;
virtual bool isLegalMaskedScatter(Type *DataType) = 0;
virtual bool isLegalMaskedGather(Type *DataType) = 0;
virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
@@ -1373,6 +1380,12 @@ public:
bool isLegalMaskedLoad(Type *DataType) override {
return Impl.isLegalMaskedLoad(DataType);
}
+ bool isLegalNTStore(Type *DataType, unsigned Alignment) override {
+ return Impl.isLegalNTStore(DataType, Alignment);
+ }
+ bool isLegalNTLoad(Type *DataType, unsigned Alignment) override {
+ return Impl.isLegalNTLoad(DataType, Alignment);
+ }
bool isLegalMaskedScatter(Type *DataType) override {
return Impl.isLegalMaskedScatter(DataType);
}
Modified: llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h?rev=363581&r1=363580&r2=363581&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h (original)
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h Mon Jun 17 10:20:08 2019
@@ -229,6 +229,20 @@ public:
bool isLegalMaskedLoad(Type *DataType) { return false; }
+ bool isLegalNTStore(Type *DataType, unsigned Alignment) {
+ // By default, assume nontemporal memory stores are available for stores
+ // that are aligned and have a size that is a power of 2.
+ unsigned DataSize = DL.getTypeStoreSize(DataType);
+ return Alignment >= DataSize && isPowerOf2_32(DataSize);
+ }
+
+ bool isLegalNTLoad(Type *DataType, unsigned Alignment) {
+ // By default, assume nontemporal memory loads are available for loads that
+ // are aligned and have a size that is a power of 2.
+ unsigned DataSize = DL.getTypeStoreSize(DataType);
+ return Alignment >= DataSize && isPowerOf2_32(DataSize);
+ }
+
bool isLegalMaskedScatter(Type *DataType) { return false; }
bool isLegalMaskedGather(Type *DataType) { return false; }
Modified: llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h?rev=363581&r1=363580&r2=363581&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h (original)
+++ llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h Mon Jun 17 10:20:08 2019
@@ -205,12 +205,13 @@ class LoopVectorizationLegality {
public:
LoopVectorizationLegality(
Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
- TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
- std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
- OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
- LoopVectorizeHints *H, DemandedBits *DB, AssumptionCache *AC)
- : TheLoop(L), LI(LI), PSE(PSE), TLI(TLI), DT(DT), GetLAA(GetLAA),
- ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {}
+ TargetTransformInfo *TTI, TargetLibraryInfo *TLI, AliasAnalysis *AA,
+ Function *F, std::function<const LoopAccessInfo &(Loop &)> *GetLAA,
+ LoopInfo *LI, OptimizationRemarkEmitter *ORE,
+ LoopVectorizationRequirements *R, LoopVectorizeHints *H, DemandedBits *DB,
+ AssumptionCache *AC)
+ : TheLoop(L), LI(LI), PSE(PSE), TTI(TTI), TLI(TLI), DT(DT),
+ GetLAA(GetLAA), ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {}
/// ReductionList contains the reduction descriptors for all
/// of the reductions that were found in the loop.
@@ -402,6 +403,9 @@ private:
/// unrolling.
PredicatedScalarEvolution &PSE;
+ /// Target Transform Info.
+ TargetTransformInfo *TTI;
+
/// Target Library Info.
TargetLibraryInfo *TLI;
Modified: llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Analysis/TargetTransformInfo.cpp?rev=363581&r1=363580&r2=363581&view=diff
==============================================================================
--- llvm/trunk/lib/Analysis/TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Analysis/TargetTransformInfo.cpp Mon Jun 17 10:20:08 2019
@@ -183,6 +183,16 @@ bool TargetTransformInfo::isLegalMaskedL
return TTIImpl->isLegalMaskedLoad(DataType);
}
+bool TargetTransformInfo::isLegalNTStore(Type *DataType,
+ unsigned Alignment) const {
+ return TTIImpl->isLegalNTStore(DataType, Alignment);
+}
+
+bool TargetTransformInfo::isLegalNTLoad(Type *DataType,
+ unsigned Alignment) const {
+ return TTIImpl->isLegalNTLoad(DataType, Alignment);
+}
+
bool TargetTransformInfo::isLegalMaskedGather(Type *DataType) const {
return TTIImpl->isLegalMaskedGather(DataType);
}
Modified: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp?rev=363581&r1=363580&r2=363581&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp Mon Jun 17 10:20:08 2019
@@ -3143,6 +3143,41 @@ bool X86TTIImpl::isLegalMaskedStore(Type
return isLegalMaskedLoad(DataType);
}
+bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) {
+ unsigned DataSize = DL.getTypeStoreSize(DataType);
+ // The only supported nontemporal loads are for aligned vectors of 16 or 32
+ // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
+ // (the equivalent stores only require AVX).
+ if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
+ return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
+
+ return false;
+}
+
+bool X86TTIImpl::isLegalNTStore(Type *DataType, unsigned Alignment) {
+ unsigned DataSize = DL.getTypeStoreSize(DataType);
+
+ // SSE4A supports nontemporal stores of float and double at arbitrary
+ // alignment.
+ if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
+ return true;
+
+ // Besides the SSE4A subtarget exception above, only aligned stores are
+ // available nontemporally on any other subtarget. And only stores with a size
+ // of 4..32 bytes (powers of 2, only) are permitted.
+ if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
+ !isPowerOf2_32(DataSize))
+ return false;
+
+ // 32-byte vector nontemporal stores are supported by AVX (the equivalent
+ // loads require AVX2).
+ if (DataSize == 32)
+ return ST->hasAVX();
+ else if (DataSize == 16)
+ return ST->hasSSE1();
+ return true;
+}
+
bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
if (!isa<VectorType>(DataTy))
return false;
Modified: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h?rev=363581&r1=363580&r2=363581&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h (original)
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h Mon Jun 17 10:20:08 2019
@@ -186,6 +186,8 @@ public:
bool canMacroFuseCmp();
bool isLegalMaskedLoad(Type *DataType);
bool isLegalMaskedStore(Type *DataType);
+ bool isLegalNTLoad(Type *DataType, unsigned Alignment);
+ bool isLegalNTStore(Type *DataType, unsigned Alignment);
bool isLegalMaskedGather(Type *DataType);
bool isLegalMaskedScatter(Type *DataType);
bool isLegalMaskedExpandLoad(Type *DataType);
Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp?rev=363581&r1=363580&r2=363581&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp Mon Jun 17 10:20:08 2019
@@ -767,6 +767,38 @@ bool LoopVectorizationLegality::canVecto
return false;
}
+ // For nontemporal stores, check that a nontemporal vector version is
+ // supported on the target.
+ if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
+ // Arbitrarily try a vector of 2 elements.
+ Type *VecTy = VectorType::get(T, /*NumElements=*/2);
+ assert(VecTy && "did not find vectorized version of stored type");
+ unsigned Alignment = getLoadStoreAlignment(ST);
+ if (!TTI->isLegalNTStore(VecTy, Alignment)) {
+ reportVectorizationFailure(
+ "nontemporal store instruction cannot be vectorized",
+ "nontemporal store instruction cannot be vectorized",
+ "CantVectorizeNontemporalStore", ST);
+ return false;
+ }
+ }
+
+ } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
+ if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
+ // For nontemporal loads, check that a nontemporal vector version is
+ // supported on the target (arbitrarily try a vector of 2 elements).
+ Type *VecTy = VectorType::get(I.getType(), /*NumElements=*/2);
+ assert(VecTy && "did not find vectorized version of load type");
+ unsigned Alignment = getLoadStoreAlignment(LD);
+ if (!TTI->isLegalNTLoad(VecTy, Alignment)) {
+ reportVectorizationFailure(
+ "nontemporal load instruction cannot be vectorized",
+ "nontemporal load instruction cannot be vectorized",
+ "CantVectorizeNontemporalLoad", LD);
+ return false;
+ }
+ }
+
// FP instructions can allow unsafe algebra, thus vectorizable by
// non-IEEE-754 compliant SIMD units.
// This applies to floating-point math operations and calls, not memory
Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=363581&r1=363580&r2=363581&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Mon Jun 17 10:20:08 2019
@@ -7275,7 +7275,7 @@ bool LoopVectorizePass::processLoop(Loop
// Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements(*ORE);
- LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, GetLAA, LI, ORE,
+ LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
&Requirements, &Hints, DB, AC);
if (!LVL.canVectorize(EnableVPlanNativePath)) {
LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
Added: llvm/trunk/test/Transforms/LoopVectorize/X86/nontemporal.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/nontemporal.ll?rev=363581&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/nontemporal.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/nontemporal.ll Mon Jun 17 10:20:08 2019
@@ -0,0 +1,112 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S | FileCheck %s
+
+; The three test-cases below are all based on modified versions of a simple copy-loop:
+;
+; void foo(unsigned *src, unsigned *dst, unsigned nElts) {
+; for (unsigned i = 0; i < nElts; ++i) {
+; unsigned tmp = src[i];
+; dst[i] = tmp;
+; }
+; }
+;
+; In the first version, there are no nontemporal stores or loads, and so vectorization
+; is safely done.
+;
+; In the second version, the store into dst[i] has the nontemporal hint. The alignment
+; on X86_64 for 'unsigned' is 4, so the vector store generally will not be aligned to the
+; vector size (of 16 here). Unaligned nontemporal vector stores are not supported on X86_64,
+; and so the vectorization is suppressed (because when vectorizing it, the nontemporal hint
+; would not be honored in the final code-gen).
+;
+; The third version is analogous to the second, except rather than the store, it is the
+; load from 'src[i]' that has the nontemporal hint. Vectorization is suppressed in this
+; case because (like stores) unaligned nontemporal vector loads are not supported on X86_64.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64"
+
+; CHECK-LABEL: @vectorTest(
+define void @vectorTest(i32* noalias readonly %src, i32* noalias %dst, i32 %nElts) {
+entry:
+ %cmp8 = icmp eq i32 %nElts, 0
+ br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %nElts to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+; Check that we vectorized the load, and that there is no nontemporal hint.
+; CHECK: %wide.load = load <4 x i32>, <4 x i32>* %{{[0-9]+}}, align 4{{$}}
+ %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+; Check that we vectorized the store, and that there is no nontemporal hint.
+; CHECK: store <4 x i32> %wide.load, <4 x i32>* %{{[0-9]+}}, align 4{{$}}
+ %arrayidx2 = getelementptr inbounds i32, i32* %dst, i64 %indvars.iv
+ store i32 %0, i32* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @vectorNTStoreTest(
+; Check that the vectorized type of the store does not appear.
+; CHECK-NOT: 4 x i32
+define void @vectorNTStoreTest(i32* noalias readonly %src, i32* noalias %dst, i32 %nElts) {
+entry:
+ %cmp8 = icmp eq i32 %nElts, 0
+ br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %nElts to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %dst, i64 %indvars.iv
+; Check that the store is not vectorized and that we don't lose the !nontemporal hint in it.
+; CHECK: store i32 %{{[0-9]+}}, i32* %arrayidx2, align 4, !nontemporal !4
+ store i32 %0, i32* %arrayidx2, align 4, !nontemporal !0
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @vectorNTLoadTest(
+; Check that the vectorized type of the load does not appear.
+; CHECK-NOT: 4 x i32
+define void @vectorNTLoadTest(i32* noalias readonly %src, i32* noalias %dst, i32 %nElts) {
+entry:
+ %cmp8 = icmp eq i32 %nElts, 0
+ br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %nElts to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+; Check that the load is not vectorized and that we don't lose the !nontemporal hint in it.
+; CHECK: load i32, i32* %arrayidx, align 4, !nontemporal !4
+ %0 = load i32, i32* %arrayidx, align 4, !nontemporal !0
+ %arrayidx2 = getelementptr inbounds i32, i32* %dst, i64 %indvars.iv
+ store i32 %0, i32* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+!0 = !{i32 1}
Modified: llvm/trunk/test/Transforms/LoopVectorize/nontemporal.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/nontemporal.ll?rev=363581&r1=363580&r2=363581&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/nontemporal.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/nontemporal.ll Mon Jun 17 10:20:08 2019
@@ -14,19 +14,19 @@ for.body.preheader:
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
-; Check that we don't lose !nontemporal hint when vectorizing loads.
-; CHECK: %wide.load{{[0-9]*}} = load <4 x float>, <4 x float>* %{{[0-9]+}}, align 4, !nontemporal !0
+; Check that we don't lose !nontemporal hint when attempting vectorizing of loads.
+; CHECK: load {{.*}} align 4, !nontemporal !0
%arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
%0 = load float, float* %arrayidx, align 4, !nontemporal !0
; Check that we don't introduce !nontemporal hint when the original scalar loads didn't have it.
-; CHECK: %wide.load{{[0-9]+}} = load <4 x float>, <4 x float>* %{{[0-9]+}}, align 4{{$}}
+; CHECK: load {{.*}} align 4{{$}}
%arrayidx2 = getelementptr inbounds float, float* %c, i64 %indvars.iv
%1 = load float, float* %arrayidx2, align 4
%add = fadd float %0, %1
-; Check that we don't lose !nontemporal hint when vectorizing stores.
-; CHECK: store <4 x float> %{{[0-9]+}}, <4 x float>* %{{[0-9]+}}, align 4, !nontemporal !0
+; Check that we don't lose !nontemporal hint when attempting vectorizing of stores.
+; CHECK: store {{.*}} align 4, !nontemporal !0
%arrayidx4 = getelementptr inbounds float, float* %a, i64 %indvars.iv
store float %add, float* %arrayidx4, align 4, !nontemporal !0
More information about the llvm-commits
mailing list