[llvm] 636efd2 - [SVE][LoopVectorize] Add option to disable tail-folding for reverse loops
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 27 07:12:32 PDT 2023
Author: David Sherwood
Date: 2023-03-27T14:10:15Z
New Revision: 636efd2e3508f4a39717cee75c1255a48acb8f4e
URL: https://github.com/llvm/llvm-project/commit/636efd2e3508f4a39717cee75c1255a48acb8f4e
DIFF: https://github.com/llvm/llvm-project/commit/636efd2e3508f4a39717cee75c1255a48acb8f4e.diff
LOG: [SVE][LoopVectorize] Add option to disable tail-folding for reverse loops
If we use tail-folding for reverse loops that contain loads
and stores, then we will need to reverse the loop predicate.
This patch adds a new 'reverse' sve-tail-folding option and
ensures such loops are no longer considered 'simple'.
I did this by adding a function called
containsDecreasingPointers to AArch64TargetTransformInfo.cpp
that searches all instructions in the loop for loads or
stores with negative strides.
Differential Revision: https://reviews.llvm.org/D146128
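As a rough illustration (not part of the patch), the kind of loop this
targets is one whose induction variable counts down, so every load and
store walks memory with a negative stride. A hypothetical source-level
sketch of the new @reverse IR test added below:

// Hypothetical C++ equivalent of the @reverse test below: the loop
// counts down from 1023 to 0, so both the load from 'src' and the
// store to 'dst' advance by -1 elements per iteration.
void reverse(double *dst, const double *src) {
  for (long i = 1023; i >= 0; --i)
    dst[i] = src[i] + 1.0;
}

With -sve-tail-folding=all such a loop is tail-folded and the active
lane mask must itself be reversed before the masked load/store can use
it (see the CHECK-TF lines), whereas all+noreverse falls back to an
unpredicated vector body, as the CHECK-TF-NOREV lines show.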
Added:
Modified:
llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 7f6430cda3750..6c423274986dd 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -388,6 +388,10 @@ class LoopVectorizationLegality {
return ConditionalAssumes;
}
+ PredicatedScalarEvolution *getPredicatedScalarEvolution() const {
+ return &PSE;
+ }
+
private:
/// Return true if the pre-header, exiting and latch blocks of \p Lp and all
/// its nested loops are considered legal for vectorization. These legal
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 3e0f419ca3f94..f145ae8b269a0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -49,8 +49,9 @@ class TailFoldingKind {
TFDisabled = 0x0,
TFReductions = 0x01,
TFRecurrences = 0x02,
+ TFReverse = 0x04,
TFSimple = 0x80,
- TFAll = TFReductions | TFRecurrences | TFSimple
+ TFAll = TFReductions | TFRecurrences | TFReverse | TFSimple
};
void operator=(const std::string &Val) {
@@ -71,10 +72,14 @@ class TailFoldingKind {
add(TFReductions);
else if (TailFoldType == "recurrences")
add(TFRecurrences);
+ else if (TailFoldType == "reverse")
+ add(TFReverse);
else if (TailFoldType == "noreductions")
remove(TFReductions);
else if (TailFoldType == "norecurrences")
remove(TFRecurrences);
+ else if (TailFoldType == "noreverse")
+ remove(TFReverse);
else {
errs()
<< "invalid argument " << TailFoldType.str()
@@ -106,7 +111,9 @@ cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding(
"recurrences)"
"\nreductions Use tail-folding for loops containing reductions"
"\nrecurrences Use tail-folding for loops containing fixed order "
- "recurrences"),
+ "recurrences"
+ "\nreverse Use tail-folding for loops requiring reversed "
+ "predicates"),
cl::location(TailFoldingKindLoc));
// Experimental option that will only be fully functional when the
@@ -3383,6 +3390,26 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}
+static bool containsDecreasingPointers(Loop *TheLoop,
+ PredicatedScalarEvolution *PSE) {
+ const ValueToValueMap &Strides = ValueToValueMap();
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // Scan the instructions in the block and look for addresses that are
+ // consecutive and decreasing.
+ for (Instruction &I : *BB) {
+ if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
+ Value *Ptr = getLoadStorePointerOperand(&I);
+ Type *AccessTy = getLoadStoreType(&I);
+ if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
+ /*ShouldCheckWrap=*/false)
+ .value_or(0) < 0)
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
bool AArch64TTIImpl::preferPredicateOverEpilogue(
Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
@@ -3401,6 +3428,12 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(
Required.add(TailFoldingKind::TFReductions);
if (LVL->getFixedOrderRecurrences().size())
Required.add(TailFoldingKind::TFRecurrences);
+
+ // We call this to discover whether any load/store pointers in the loop have
+ // negative strides. This will require extra work to reverse the loop
+ // predicate, which may be expensive.
+ if (containsDecreasingPointers(L, LVL->getPredicatedScalarEvolution()))
+ Required.add(TailFoldingKind::TFReverse);
if (!Required)
Required.add(TailFoldingKind::TFSimple);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
index c805e27ce632b..bcc53f05a47bb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
@@ -1,9 +1,10 @@
; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=disabled -S | FileCheck %s -check-prefix=CHECK-NOTF
; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=default -S | FileCheck %s -check-prefix=CHECK-NOTF
; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=all -S | FileCheck %s -check-prefix=CHECK-TF
-; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=disabled+simple+reductions+recurrences -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=disabled+simple+reductions+recurrences+reverse -S | FileCheck %s -check-prefix=CHECK-TF
; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=all+noreductions -S | FileCheck %s -check-prefix=CHECK-TF-NORED
; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=all+norecurrences -S | FileCheck %s -check-prefix=CHECK-TF-NOREC
+; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=all+noreverse -S | FileCheck %s -check-prefix=CHECK-TF-NOREV
; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=reductions -S | FileCheck %s -check-prefix=CHECK-TF-ONLYRED
target triple = "aarch64-unknown-linux-gnu"
@@ -33,6 +34,14 @@ define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
; CHECK-TF-NOREC: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
; CHECK-TF-NOREC: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]]
+; CHECK-TF-NOREV-LABEL: @simple_memset(
+; CHECK-TF-NOREV: vector.ph:
+; CHECK-TF-NOREV: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i64 0
+; CHECK-TF-NOREV: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-TF-NOREV: vector.body:
+; CHECK-TF-NOREV: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-TF-NOREV: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]]
+
; CHECK-TF-LABEL: @simple_memset(
; CHECK-TF: vector.ph:
; CHECK-TF: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i64 0
@@ -91,6 +100,16 @@ define float @fadd_red_fast(float* noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-TF-NOREC: middle.block:
; CHECK-TF-NOREC-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
+; CHECK-TF-NOREV-LABEL: @fadd_red_fast
+; CHECK-TF-NOREV: vector.body:
+; CHECK-TF-NOREV: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-TF-NOREV: %[[VEC_PHI:.*]] = phi <vscale x 4 x float>
+; CHECK-TF-NOREV: %[[LOAD:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>{{.*}} %[[ACTIVE_LANE_MASK]]
+; CHECK-TF-NOREV: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
+; CHECK-TF-NOREV: %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
+; CHECK-TF-NOREV: middle.block:
+; CHECK-TF-NOREV-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
+
; CHECK-TF-LABEL: @fadd_red_fast
; CHECK-TF: vector.body:
; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
@@ -167,6 +186,19 @@ define void @add_recur(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
; CHECK-TF-NOREC: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
; CHECK-TF-NOREC: store <vscale x 4 x i32> %[[ADD]]
+; CHECK-TF-NOREV-LABEL: @add_recur
+; CHECK-TF-NOREV: entry:
+; CHECK-TF-NOREV: %[[PRE:.*]] = load i32, i32* %src, align 4
+; CHECK-TF-NOREV: vector.ph:
+; CHECK-TF-NOREV: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
+; CHECK-TF-NOREV: vector.body:
+; CHECK-TF-NOREV: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-TF-NOREV: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
+; CHECK-TF-NOREV: %[[LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>{{.*}} %[[ACTIVE_LANE_MASK]]
+; CHECK-TF-NOREV: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF-NOREV: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
+; CHECK-TF-NOREV: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[ADD]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]])
+
; CHECK-TF-LABEL: @add_recur
; CHECK-TF: entry:
; CHECK-TF: %[[PRE:.*]] = load i32, i32* %src, align 4
@@ -238,6 +270,12 @@ define void @interleave(float* noalias %dst, float* noalias %src, i64 %n) #0 {
; CHECK-TF-NOREC: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-TF-NOREC: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-TF-NOREV-LABEL: @interleave(
+; CHECK-TF-NOREV: vector.body:
+; CHECK-TF-NOREV: %[[LOAD:.*]] = load <8 x float>, <8 x float>
+; CHECK-TF-NOREV: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-TF-NOREV: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+
entry:
br label %for.body
@@ -266,6 +304,55 @@ for.end: ; preds = %for.body, %entry
ret void
}
+define void @reverse(double* noalias %dst, double* noalias %src) #0 {
+; CHECK-NOTF-LABEL: @reverse(
+; CHECK-NOTF: vector.body:
+; CHECK-NOTF-NOT: %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-NOTF: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* %18, align 8
+; CHECK-NOTF: %{{.*}} = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
+
+; CHECK-TF-NOREV-LABEL: @reverse(
+; CHECK-TF-NOREV: vector.body:
+; CHECK-TF-NOREV-NOT: %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-TF-NOREV: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* %18, align 8
+; CHECK-TF-NOREV: %{{.*}} = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
+
+; CHECK-TF-LABEL: @reverse(
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 2 x i1>
+; CHECK-TF: %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
+; CHECK-TF: %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double>{{.*}} <vscale x 2 x i1> %reverse
+
+; CHECK-TF-NORED-LABEL: @reverse(
+; CHECK-TF-NORED: vector.body:
+; CHECK-TF-NORED: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 2 x i1>
+; CHECK-TF-NORED: %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
+; CHECK-TF-NORED: %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double>{{.*}} <vscale x 2 x i1> %reverse
+
+; CHECK-TF-NOREC-LABEL: @reverse(
+; CHECK-TF-NOREC: vector.body:
+; CHECK-TF-NOREC: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 2 x i1>
+; CHECK-TF-NOREC: %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
+; CHECK-TF-NOREC: %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double>{{.*}} <vscale x 2 x i1> %reverse
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds double, double* %src, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %add = fadd double %0, 1.000000e+00
+ %arrayidx2 = getelementptr inbounds double, double* %dst, i64 %indvars.iv
+ store double %add, double* %arrayidx2, align 8
+ %indvars.iv.next = add nsw i64 %indvars.iv, -1
+ %cmp.not = icmp eq i64 %indvars.iv, 0
+ br i1 %cmp.not, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
attributes #0 = { "target-features"="+sve" }
!0 = distinct !{!0, !1, !2, !3, !4}