[llvm] [LV] strip TailFoldingStyle::DataWithoutLaneMask (PR #93303)
via llvm-commits
llvm-commits at lists.llvm.org
Fri May 24 07:18:33 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-powerpc
Author: Ramkumar Ramachandra (artagnon)
Changes:
There is just one use of TailFoldingStyle::DataWithoutLaneMask in LoopVectorize, introduced by 413a66f ([LV][VP] VP intrinsics support for the Loop Vectorizer + adding new tail-folding mode using EVL). This use is unnecessary: @llvm.get.active.lane.mask is unrelated to EVL, and SelectionDAG automatically detects whether a target supports the @llvm.get.active.lane.mask intrinsic and, since 243a532 ([SelectionDAG] Lower @llvm.get.active.lane.mask to setcc), lowers it to equivalent instructions on targets where it is not preferred.
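
For reference, a minimal IR sketch (illustrative value names, simplified to a fixed VF of 4; distilled from the DATA_NO_LANEMASK check lines deleted below) contrasting the lane mask the two styles build:

```llvm
; Not part of the patch: how the two tail-folding styles build the lane
; mask for induction %index and trip count %N (with %btc = %N - 1).

; TailFoldingStyle::Data -- a single intrinsic call. Since 243a532,
; SelectionDAG lowers this to a stepvector-plus-compare sequence on
; targets that do not prefer the intrinsic.
%mask.data = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %index, i64 %N)

; TailFoldingStyle::DataWithoutLaneMask -- the same mask built by hand
; with stepvector and an unsigned compare, as in the deleted checks.
%idx.ins   = insertelement <4 x i64> poison, i64 %index, i64 0
%idx.splat = shufflevector <4 x i64> %idx.ins, <4 x i64> poison, <4 x i32> zeroinitializer
%step      = call <4 x i64> @llvm.experimental.stepvector.v4i64()
%vec.iv    = add <4 x i64> %idx.splat, %step
%btc.ins   = insertelement <4 x i64> poison, i64 %btc, i64 0
%btc.splat = shufflevector <4 x i64> %btc.ins, <4 x i64> poison, <4 x i32> zeroinitializer
%mask.nolm = icmp ule <4 x i64> %vec.iv, %btc.splat
```

In other words, with the SelectionDAG lowering in place, forcing DataWithoutLaneMask buys nothing over Data.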
---
Patch is 75.73 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93303.diff
7 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+3-8)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll (-56)
- (modified) llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll (+320-64)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll (+15-23)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll (+49-52)
- (modified) llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll (+16-22)
- (modified) llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll (+243-50)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 52cb8c9f88f94..44905bee96519 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -243,9 +243,6 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
clEnumValN(
TailFoldingStyle::Data, "data",
"Create lane mask for data only, using active.lane.mask intrinsic"),
- clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
- "data-without-lane-mask",
- "Create lane mask with compare/stepvector"),
clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
"Create lane mask using active.lane.mask intrinsic, and use "
"it for both data and control flow"),
@@ -1538,12 +1535,10 @@ class LoopVectorizationCostModel {
// FIXME: remove this once reductions are supported.
Legal->getReductionVars().empty();
if (!EVLIsLegal) {
- // If for some reason EVL mode is unsupported, fallback to
- // DataWithoutLaneMask to try to vectorize the loop with folded tail
- // in a generic way.
+ // If for some reason EVL mode is unsupported, fallback to Data to try to
+ // vectorize the loop with folded tail in a generic way.
ChosenTailFoldingStyle =
- std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
- TailFoldingStyle::DataWithoutLaneMask);
+ std::make_pair(TailFoldingStyle::Data, TailFoldingStyle::Data);
LLVM_DEBUG(
dbgs()
<< "LV: Preference for VP intrinsics indicated. Will "
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
index 13fc0eaafb808..666c03565dc60 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
@@ -1,7 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=none < %s | FileCheck %s --check-prefix=NONE
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data < %s | FileCheck %s --check-prefix=DATA
-; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data-without-lane-mask < %s | FileCheck %s --check-prefix=DATA_NO_LANEMASK
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data-and-control < %s | FileCheck %s --check-prefix=DATA_AND_CONTROL
; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data-and-control-without-rt-check < %s | FileCheck %s --check-prefix=DATA_AND_CONTROL_NO_RT_CHECK
@@ -99,61 +98,6 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
; DATA: while.end.loopexit:
; DATA-NEXT: ret void
;
-; DATA_NO_LANEMASK-LABEL: @simple_memset_tailfold(
-; DATA_NO_LANEMASK-NEXT: entry:
-; DATA_NO_LANEMASK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; DATA_NO_LANEMASK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[UMAX]]
-; DATA_NO_LANEMASK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_NO_LANEMASK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; DATA_NO_LANEMASK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; DATA_NO_LANEMASK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA_NO_LANEMASK: vector.ph:
-; DATA_NO_LANEMASK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_NO_LANEMASK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; DATA_NO_LANEMASK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_NO_LANEMASK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
-; DATA_NO_LANEMASK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
-; DATA_NO_LANEMASK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
-; DATA_NO_LANEMASK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
-; DATA_NO_LANEMASK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; DATA_NO_LANEMASK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX]], 1
-; DATA_NO_LANEMASK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_NO_LANEMASK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
-; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
-; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; DATA_NO_LANEMASK-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA_NO_LANEMASK: vector.body:
-; DATA_NO_LANEMASK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VECTOR_BODY]] ]
-; DATA_NO_LANEMASK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i64 0
-; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; DATA_NO_LANEMASK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; DATA_NO_LANEMASK-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
-; DATA_NO_LANEMASK-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT3]], [[TMP11]]
-; DATA_NO_LANEMASK-NEXT: [[TMP12:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; DATA_NO_LANEMASK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP9]]
-; DATA_NO_LANEMASK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
-; DATA_NO_LANEMASK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[TMP12]])
-; DATA_NO_LANEMASK-NEXT: [[INDEX_NEXT6]] = add i64 [[INDEX1]], [[TMP16]]
-; DATA_NO_LANEMASK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]]
-; DATA_NO_LANEMASK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; DATA_NO_LANEMASK: middle.block:
-; DATA_NO_LANEMASK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA_NO_LANEMASK: scalar.ph:
-; DATA_NO_LANEMASK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; DATA_NO_LANEMASK-NEXT: br label [[WHILE_BODY:%.*]]
-; DATA_NO_LANEMASK: while.body:
-; DATA_NO_LANEMASK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; DATA_NO_LANEMASK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]]
-; DATA_NO_LANEMASK-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4
-; DATA_NO_LANEMASK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
-; DATA_NO_LANEMASK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
-; DATA_NO_LANEMASK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
-; DATA_NO_LANEMASK: while.end.loopexit:
-; DATA_NO_LANEMASK-NEXT: ret void
-;
; DATA_AND_CONTROL-LABEL: @simple_memset_tailfold(
; DATA_AND_CONTROL-NEXT: entry:
; DATA_AND_CONTROL-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
index bd52c2a8f0645..2b50b44a3d289 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
; REQUIRES: asserts
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
@@ -7,50 +8,183 @@
; RUN: -mcpu=pwr10 -disable-output < %s 2>&1 | FileCheck %s
define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
-; CHECK-LABEL: VPlan 'Initial VPlan for VF={2,4},UF>=1' {
-; CHECK-NEXT: Live-in vp<%0> = VF * UF
-; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
-; CHECK-NEXT: Live-in vp<%2> = backedge-taken count
-; CHECK-NEXT: Live-in ir<%N> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8>
-; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next, ir<1>
-; CHECK-NEXT: EMIT vp<%4> = icmp ule ir<%iv>, vp<%2>
-; CHECK-NEXT: Successor(s): pred.store
-; CHECK-EMPTY:
-; CHECK-NEXT: <xVFxUF> pred.store: {
-; CHECK-NEXT: pred.store.entry:
-; CHECK-NEXT: BRANCH-ON-MASK vp<%4>
-; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.store.if:
-; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%3>, ir<1>
-; CHECK-NEXT: REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<%5>
-; CHECK-NEXT: REPLICATE ir<%0> = load ir<%arrayidx>
-; CHECK-NEXT: REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<%5>
-; CHECK-NEXT: REPLICATE ir<%1> = load ir<%arrayidx2>
-; CHECK-NEXT: REPLICATE ir<%arrayidx4> = getelementptr inbounds ir<%a>, vp<%5>
-; CHECK-NEXT: REPLICATE ir<%add> = add nsw ir<%1>, ir<%0>
-; CHECK-NEXT: REPLICATE store ir<%add>, ir<%arrayidx4>
-; CHECK-NEXT: Successor(s): pred.store.continue
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.store.continue:
-; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%6> = ir<%0>
-; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%7> = ir<%1>
+; CHECK-LABEL: 'foo'
+; CHECK-NEXT: LV: Loop hints: force=? width=0 interleave=0
+; CHECK-NEXT: LV: Found a loop: for.body
+; CHECK-NEXT: LV: Found an induction variable.
+; CHECK-NEXT: LV: We can vectorize this loop!
+; CHECK-NEXT: LV: Found trip count: 0
+; CHECK-NEXT: LV: vector predicate hint/switch found.
+; CHECK-NEXT: LV: Not allowing scalar epilogue, creating predicated vector loop.
+; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
+; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 0.
+; CHECK-NEXT: LV: The Smallest and Widest types: 32 / 32 bits.
+; CHECK-NEXT: LV: The Widest register safe to use is: 128 bits.
+; CHECK-NEXT: LV: The Widest register safe to use is: vscale x 0 bits.
+; CHECK-NEXT: LV: The target has no scalable vector registers.
+; CHECK-NEXT: LV: checking if tail can be folded by masking.
+; CHECK-NEXT: LV: can fold tail by masking.
+; CHECK-NEXT: LV: Preference for VP intrinsics indicated. Will not try to generate VP Intrinsics due to non-interleaving reasons.
+; CHECK-NEXT: LV: Invalidate all interleaved groups due to fold-tail by masking which requires masked-interleaved support.
+; CHECK-NEXT: LV: Found uniform instruction: %exitcond.not = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: LV: Found scalar instruction: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; CHECK-NEXT: LV: Found scalar instruction: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; CHECK-NEXT: LV: Found scalar instruction: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; CHECK-NEXT: LV: Found uniform instruction: %exitcond.not = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: LV: Found scalar instruction: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; CHECK-NEXT: LV: Found scalar instruction: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; CHECK-NEXT: LV: Found scalar instruction: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; CHECK-NEXT: LV: Scalarizing and predicating: %0 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Scalarizing: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; CHECK-NEXT: LV: Scalarizing and predicating: %1 = load i32, ptr %arrayidx2, align 4
+; CHECK-NEXT: LV: Scalarizing: %add = add nsw i32 %1, %0
+; CHECK-NEXT: LV: Scalarizing: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; CHECK-NEXT: LV: Scalarizing and predicating: store i32 %add, ptr %arrayidx4, align 4
+; CHECK-NEXT: LV: Scalarizing: %iv.next = add nuw nsw i64 %iv, 1
+; CHECK-NEXT: LV: Scalarizing: %exitcond.not = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; CHECK-NEXT: LV: Scalarizing and predicating: %0 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Scalarizing: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; CHECK-NEXT: LV: Scalarizing and predicating: %1 = load i32, ptr %arrayidx2, align 4
+; CHECK-NEXT: LV: Scalarizing: %add = add nsw i32 %1, %0
+; CHECK-NEXT: LV: Scalarizing: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; CHECK-NEXT: LV: Scalarizing and predicating: store i32 %add, ptr %arrayidx4, align 4
+; CHECK-NEXT: LV: Scalarizing: %exitcond.not = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: VPlan 'Initial VPlan for VF={1},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF * UF
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT: Live-in ir<%N> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8>
+; CHECK-NEXT: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK-NEXT: EMIT vp<%5> = active lane mask vp<%4>, ir<%N>
+; CHECK-NEXT: Successor(s): pred.store
+; CHECK-EMPTY:
+; CHECK-NEXT: <xVFxUF> pred.store: {
+; CHECK-NEXT: pred.store.entry:
+; CHECK-NEXT: BRANCH-ON-MASK vp<%5>
+; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.if:
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<%4>
+; CHECK-NEXT: CLONE ir<%0> = load ir<%arrayidx>
+; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<%4>
+; CHECK-NEXT: CLONE ir<%1> = load ir<%arrayidx2>
+; CHECK-NEXT: CLONE ir<%arrayidx4> = getelementptr inbounds ir<%a>, vp<%4>
+; CHECK-NEXT: CLONE ir<%add> = add nsw ir<%1>, ir<%0>
+; CHECK-NEXT: CLONE store ir<%add>, ir<%arrayidx4>
+; CHECK-NEXT: Successor(s): pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.continue:
+; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%6> = ir<%0>
+; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%7> = ir<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): for.body.2
+; CHECK-EMPTY:
+; CHECK-NEXT: for.body.2:
+; CHECK-NEXT: EMIT vp<%8> = add vp<%3>, vp<%0>
+; CHECK-NEXT: EMIT branch-on-count vp<%8>, vp<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: VPlan 'Initial VPlan for VF={2,4},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF * UF
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT: Live-in ir<%N> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8>
+; CHECK-NEXT: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK-NEXT: EMIT vp<%5> = active lane mask vp<%4>, ir<%N>
+; CHECK-NEXT: Successor(s): pred.store
+; CHECK-EMPTY:
+; CHECK-NEXT: <xVFxUF> pred.store: {
+; CHECK-NEXT: pred.store.entry:
+; CHECK-NEXT: BRANCH-ON-MASK vp<%5>
+; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.if:
+; CHECK-NEXT: REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<%4>
+; CHECK-NEXT: REPLICATE ir<%0> = load ir<%arrayidx>
+; CHECK-NEXT: REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<%4>
+; CHECK-NEXT: REPLICATE ir<%1> = load ir<%arrayidx2>
+; CHECK-NEXT: REPLICATE ir<%arrayidx4> = getelementptr inbounds ir<%a>, vp<%4>
+; CHECK-NEXT: REPLICATE ir<%add> = add nsw ir<%1>, ir<%0>
+; CHECK-NEXT: REPLICATE store ir<%add>, ir<%arrayidx4>
+; CHECK-NEXT: Successor(s): pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.continue:
+; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%6> = ir<%0>
+; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%7> = ir<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): for.body.2
+; CHECK-EMPTY:
+; CHECK-NEXT: for.body.2:
+; CHECK-NEXT: EMIT vp<%8> = add vp<%3>, vp<%0>
+; CHECK-NEXT: EMIT branch-on-count vp<%8>, vp<%1>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): for.body.2
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %1 = load i32, ptr %arrayidx2, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %add = add nsw i32 %1, %0
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add, ptr %arrayidx4, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add nuw nsw i64 %iv, 1
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %exitcond.not = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+; CHECK-NEXT: LV: Scalar loop costs: 6.
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %0 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %1 = load i32, ptr %arrayidx2, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 3 for VF 2 For instruction: %add = add nsw i32 %1, %0
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: store i32 %add, ptr %arrayidx4, align 4
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %iv.next = add nuw nsw i64 %iv, 1
+; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %exitcond.not = icmp eq i64 %iv.next, %N
+; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+; CHECK-NEXT: LV: Vector loop of width 2 costs: 3000003.
+; CHECK-NEXT: LV: Found an estimated cost of 0 for ...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/93303
More information about the llvm-commits mailing list