[llvm] WIP:[LV][EVL] Support fixed-order recurrence idiom with EVL tail folding. (PR #124093)
Mel Chen via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 23 01:41:21 PST 2025
https://github.com/Mel-Chen created https://github.com/llvm/llvm-project/pull/124093
TODO: Transforms ExtractFromEnd to EVL based extractelement.
Fixes https://github.com/llvm/llvm-project/issues/122461
>From e7c3be162db21b17e3d0fe6b058983e85db2a44e Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 17 Jan 2025 02:29:28 -0800
Subject: [PATCH 1/7] Revert "[LV][EVL] Address post-comments for disabling
fixed-order recurrence with EVL tail folding. (NFC)"
This reverts commit 44a005ff4aa5bcfd06749b5f9ec6bbd582a1ebb0.
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 +++---
.../vectorize-force-tail-with-evl-fixed-order-recurrence.ll | 4 ++--
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7167e2179af535..bf6e402579e5b0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1438,11 +1438,11 @@ class LoopVectorizationCostModel {
// Override forced styles if needed.
// FIXME: use actual opcode/data type for analysis here.
// FIXME: Investigate opportunity for fixed vector factor.
- // FIXME: support fixed-order recurrences by fixing splice of non VFxUF
- // penultimate EVL.
bool EVLIsLegal =
UserIC <= 1 && TTI.hasActiveVectorLength(0, nullptr, Align()) &&
- !EnableVPlanNativePath && Legal->getFixedOrderRecurrences().empty();
+ !EnableVPlanNativePath &&
+ // FIXME: remove this once fixed-ordered recurrence is supported.
+ Legal->getFixedOrderRecurrences().empty();
if (!EVLIsLegal) {
// If for some reason EVL mode is unsupported, fallback to
// DataWithoutLaneMask to try to vectorize the loop with folded tail
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
index 3e6588befaec59..809b69900731ac 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
@@ -12,8 +12,8 @@
; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=NO-VP
; FIXME: Fixed-order recurrence is not supported yet with EVL tail folding.
-; The llvm.splice may occur unexpected behavior if the evl of the second-to-last
-; iteration is not VF*UF.
+; The llvm.splice may occurs unexpected behavior if the evl of the
+; second-to-last iteration is not VF*UF.
define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-LABEL: define void @first_order_recurrence(
>From 97470fc4bf7caca4153ac395041763dd518f5db5 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Fri, 17 Jan 2025 02:29:48 -0800
Subject: [PATCH 2/7] Revert "[LV][EVL] Disable fixed-order recurrence idiom
with EVL tail folding. (#122458)"
This reverts commit 9720be95d63ce797437015d0f0edd10b02e80b7a.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 8 +-
...ce-tail-with-evl-fixed-order-recurrence.ll | 90 ++++++++-----------
2 files changed, 40 insertions(+), 58 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index bf6e402579e5b0..237dabc8f11ec5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1438,11 +1438,9 @@ class LoopVectorizationCostModel {
// Override forced styles if needed.
// FIXME: use actual opcode/data type for analysis here.
// FIXME: Investigate opportunity for fixed vector factor.
- bool EVLIsLegal =
- UserIC <= 1 && TTI.hasActiveVectorLength(0, nullptr, Align()) &&
- !EnableVPlanNativePath &&
- // FIXME: remove this once fixed-ordered recurrence is supported.
- Legal->getFixedOrderRecurrences().empty();
+ bool EVLIsLegal = UserIC <= 1 &&
+ TTI.hasActiveVectorLength(0, nullptr, Align()) &&
+ !EnableVPlanNativePath;
if (!EVLIsLegal) {
// If for some reason EVL mode is unsupported, fallback to
// DataWithoutLaneMask to try to vectorize the loop with folded tail
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
index 809b69900731ac..9f8cf169c05933 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
@@ -11,10 +11,6 @@
; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=NO-VP
-; FIXME: Fixed-order recurrence is not supported yet with EVL tail folding.
-; The llvm.splice may occurs unexpected behavior if the evl of the
-; second-to-last iteration is not VF*UF.
-
define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-LABEL: define void @first_order_recurrence(
; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -31,35 +27,31 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]]
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TC]], 1
; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4
; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1
; IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 33, i32 [[TMP11]]
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
; IF-EVL: [[VECTOR_BODY]]:
-; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP25:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; IF-EVL-NEXT: [[TMP26:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP25]]
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP26]]
-; IF-EVL-NEXT: [[TMP27:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP13]]
; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP15]], i32 4, <vscale x 4 x i1> [[TMP27]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[VP_OP_LOAD]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-NEXT: [[TMP16:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[VP_OP_LOAD]], i32 -1)
-; IF-EVL-NEXT: [[TMP19:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[VP_OP_LOAD]]
+; IF-EVL-NEXT: [[VP_OP:%.*]] = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> [[TMP16]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP13]]
; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0
-; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP19]], ptr [[TMP18]], i32 4, <vscale x 4 x i1> [[TMP27]])
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP]], ptr align 4 [[TMP18]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL: [[MIDDLE_BLOCK]]:
@@ -180,7 +172,6 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]]
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TC]], 1
; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
@@ -191,30 +182,27 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 4
; IF-EVL-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], 1
; IF-EVL-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 22, i32 [[TMP14]]
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
; IF-EVL: [[VECTOR_BODY]]:
-; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VECTOR_RECUR2:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP16:%.*]] = add i64 [[EVL_BASED_IV]], 0
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP32:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; IF-EVL-NEXT: [[TMP33:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP32]]
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP33]]
-; IF-EVL-NEXT: [[TMP34:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT4]]
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP16]]
; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 4, <vscale x 4 x i1> [[TMP34]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[VP_OP_LOAD]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP15]])
; IF-EVL-NEXT: [[TMP19]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[VP_OP_LOAD]], i32 -1)
; IF-EVL-NEXT: [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR2]], <vscale x 4 x i32> [[TMP19]], i32 -1)
-; IF-EVL-NEXT: [[TMP23:%.*]] = add nsw <vscale x 4 x i32> [[TMP19]], [[TMP20]]
+; IF-EVL-NEXT: [[VP_OP:%.*]] = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> [[TMP19]], <vscale x 4 x i32> [[TMP20]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP15]])
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP16]]
; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP21]], i32 0
-; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP23]], ptr [[TMP22]], i32 4, <vscale x 4 x i1> [[TMP34]])
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP]], ptr align 4 [[TMP22]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP15]])
+; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP15]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; IF-EVL-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; IF-EVL: [[MIDDLE_BLOCK]]:
@@ -230,12 +218,12 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL: [[SCALAR_PH]]:
; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
-; IF-EVL-NEXT: [[SCALAR_RECUR_INIT6:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; IF-EVL-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
; IF-EVL-NEXT: br label %[[FOR_BODY:.*]]
; IF-EVL: [[FOR_BODY]]:
; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ]
; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP31:%.*]], %[[FOR_BODY]] ]
-; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT6]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ]
+; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ]
; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]]
; IF-EVL-NEXT: [[TMP31]] = load i32, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[FOR2]]
@@ -354,7 +342,6 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]]
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TC]], 1
; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
@@ -369,33 +356,30 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 4
; IF-EVL-NEXT: [[TMP17:%.*]] = sub i32 [[TMP16]], 1
; IF-EVL-NEXT: [[VECTOR_RECUR_INIT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 11, i32 [[TMP17]]
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
; IF-EVL: [[VECTOR_BODY]]:
-; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VECTOR_RECUR2:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VECTOR_RECUR4:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT3]], %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP19:%.*]] = add i64 [[EVL_BASED_IV]], 0
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP39:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; IF-EVL-NEXT: [[TMP40:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP39]]
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP40]]
-; IF-EVL-NEXT: [[TMP41:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT6]]
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP19]]
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP21]], i32 4, <vscale x 4 x i1> [[TMP41]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[VP_OP_LOAD]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP18]])
; IF-EVL-NEXT: [[TMP22]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[VP_OP_LOAD]], i32 -1)
; IF-EVL-NEXT: [[TMP23]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR2]], <vscale x 4 x i32> [[TMP22]], i32 -1)
; IF-EVL-NEXT: [[TMP24:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR4]], <vscale x 4 x i32> [[TMP23]], i32 -1)
-; IF-EVL-NEXT: [[TMP27:%.*]] = add nsw <vscale x 4 x i32> [[TMP23]], [[TMP24]]
-; IF-EVL-NEXT: [[TMP42:%.*]] = add <vscale x 4 x i32> [[TMP27]], [[TMP22]]
+; IF-EVL-NEXT: [[VP_OP:%.*]] = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> [[TMP23]], <vscale x 4 x i32> [[TMP24]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP18]])
+; IF-EVL-NEXT: [[VP_OP5:%.*]] = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> [[VP_OP]], <vscale x 4 x i32> [[TMP22]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP18]])
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP19]]
; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP25]], i32 0
-; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP42]], ptr [[TMP26]], i32 4, <vscale x 4 x i1> [[TMP41]])
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP5]], ptr align 4 [[TMP26]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP18]])
+; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP18]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP27]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
; IF-EVL-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; IF-EVL-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; IF-EVL: [[MIDDLE_BLOCK]]:
@@ -415,14 +399,14 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL: [[SCALAR_PH]]:
; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
-; IF-EVL-NEXT: [[SCALAR_RECUR_INIT9:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
-; IF-EVL-NEXT: [[SCALAR_RECUR_INIT10:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT7]], %[[MIDDLE_BLOCK]] ], [ 11, %[[ENTRY]] ]
+; IF-EVL-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; IF-EVL-NEXT: [[SCALAR_RECUR_INIT9:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT7]], %[[MIDDLE_BLOCK]] ], [ 11, %[[ENTRY]] ]
; IF-EVL-NEXT: br label %[[FOR_BODY:.*]]
; IF-EVL: [[FOR_BODY]]:
; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ]
; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP38:%.*]], %[[FOR_BODY]] ]
-; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT9]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ]
-; IF-EVL-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT10]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ]
+; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ]
+; IF-EVL-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT9]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ]
; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]]
; IF-EVL-NEXT: [[TMP38]] = load i32, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR2]], [[FOR3]]
>From 9b43266e523d48fd27f1b344bcd4e540cc6742f1 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 20 Jan 2025 05:55:47 -0800
Subject: [PATCH 3/7] Step 1: Create scalar phi to save previous EVL
---
llvm/lib/Transforms/Vectorize/VPlan.h | 1 +
.../Transforms/Vectorize/VPlanTransforms.cpp | 23 +++++++++++++++++++
.../Transforms/Vectorize/VPlanVerifier.cpp | 3 ++-
...ce-tail-with-evl-fixed-order-recurrence.ll | 12 +++++++---
4 files changed, 35 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index db45ad8aadbbe3..900d51e2f25eab 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -885,6 +885,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenPointerInductionSC:
case VPRecipeBase::VPReductionPHISC:
case VPRecipeBase::VPScalarCastSC:
+ case VPRecipeBase::VPScalarPHISC:
case VPRecipeBase::VPPartialReductionSC:
return true;
case VPRecipeBase::VPBranchOnMaskSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9febd612c644e1..d9519229578bcb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1728,6 +1728,29 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
VPTypeAnalysis TypeInfo(CanonicalIVType);
LLVMContext &Ctx = CanonicalIVType->getContext();
VPValue *AllOneMask = Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx));
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
+
+ // Create a scalar phi to track the previous EVL if fixed-order recurrence is
+ // contained.
+ bool ContainsFORs =
+ any_of(Header->phis(), IsaPred<VPFirstOrderRecurrencePHIRecipe>);
+ if (ContainsFORs) {
+ // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
+ VPValue *MaxEVL = &Plan.getVF();
+ // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
+ if (unsigned VFSize =
+ TypeInfo.inferScalarType(MaxEVL)->getScalarSizeInBits();
+ VFSize != 32) {
+ MaxEVL = new VPScalarCastRecipe(
+ VFSize > 32 ? Instruction::Trunc : Instruction::ZExt, MaxEVL,
+ Type::getInt32Ty(Ctx), DebugLoc());
+ VPBasicBlock *Preheader = LoopRegion->getPreheaderVPBB();
+ Preheader->appendRecipe(cast<VPScalarCastRecipe>(MaxEVL));
+ }
+ auto *PrevEVL = new VPScalarPHIRecipe(MaxEVL, &EVL, DebugLoc(), "prev.evl");
+ PrevEVL->insertBefore(*Header, Header->getFirstNonPhi());
+ }
for (VPUser *U : to_vector(Plan.getVF().users())) {
if (auto *R = dyn_cast<VPReverseVectorPointerRecipe>(U))
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index a30bc82cbde856..51128c1c1b794c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -143,7 +143,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
})
.Case<VPWidenStoreEVLRecipe, VPReductionEVLRecipe>(
[&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); })
- .Case<VPWidenLoadEVLRecipe, VPReverseVectorPointerRecipe>(
+ .Case<VPWidenLoadEVLRecipe, VPReverseVectorPointerRecipe,
+ VPScalarPHIRecipe>(
[&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); })
.Case<VPWidenEVLRecipe>([&](const VPWidenEVLRecipe *W) {
return VerifyEVLUse(*W,
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
index 9f8cf169c05933..bae99d5f8c78a7 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
@@ -29,6 +29,7 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP8]] to i32
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4
; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1
@@ -38,8 +39,9 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[PREV_EVL:%.*]] = phi i32 [ [[TMP25]], %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP12]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP13]]
; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 0
@@ -174,6 +176,7 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP8]] to i32
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4
; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1
@@ -188,8 +191,9 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VECTOR_RECUR2:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[PREV_EVL:%.*]] = phi i32 [ [[TMP32]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP15]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP16:%.*]] = add i64 [[EVL_BASED_IV]], 0
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP16]]
; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0
@@ -344,6 +348,7 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT: [[TMP39:%.*]] = trunc i64 [[TMP8]] to i32
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4
; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1
@@ -363,8 +368,9 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VECTOR_RECUR2:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VECTOR_RECUR4:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT3]], %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[PREV_EVL:%.*]] = phi i32 [ [[TMP39]], %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP18]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP19:%.*]] = add i64 [[EVL_BASED_IV]], 0
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP19]]
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 0
>From 02db15bf71f7348fc1b121c73be60c077790cf72 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 23 Jan 2025 00:21:19 -0800
Subject: [PATCH 4/7] Step 2: Transform llvm.splice to vp.splice
This version depend on branch explicit-FOR-op.
---
llvm/lib/Analysis/VectorUtils.cpp | 2 ++
.../Transforms/Vectorize/VPlanTransforms.cpp | 20 +++++++++++++++----
2 files changed, 18 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index ad80e458ab57dd..24dc43886045d2 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -154,6 +154,8 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
case Intrinsic::umul_fix:
case Intrinsic::umul_fix_sat:
return (ScalarOpdIdx == 2);
+ case Intrinsic::experimental_vp_splice:
+ return ScalarOpdIdx == 2 || ScalarOpdIdx == 4 || ScalarOpdIdx == 5;
default:
return false;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d9519229578bcb..31a14b49ac2b3b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1643,11 +1643,13 @@ void VPlanTransforms::addActiveLaneMask(
/// \p TypeInfo VPlan-based type analysis.
/// \p AllOneMask The vector mask parameter of vector-predication intrinsics.
/// \p EVL The explicit vector length parameter of vector-predication
+/// \p PrevEVL The explicit vector length of the previous iteration.
/// intrinsics.
static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask,
VPRecipeBase &CurRecipe,
VPTypeAnalysis &TypeInfo,
- VPValue &AllOneMask, VPValue &EVL) {
+ VPValue &AllOneMask, VPValue &EVL,
+ VPValue *PrevEVL) {
using namespace llvm::VPlanPatternMatch;
auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
assert(OrigMask && "Unmasked recipe when folding tail");
@@ -1705,6 +1707,15 @@ static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask,
Sel->getDebugLoc());
})
.Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
+ if (VPI->getOpcode() == VPInstruction::FirstOrderRecurrenceSplice) {
+ assert(PrevEVL && "Fixed-order recurrences require previous EVL");
+ SmallVector<VPValue *> Ops(VPI->operands());
+ Ops.append({&AllOneMask, PrevEVL, &EVL});
+ return new VPWidenIntrinsicRecipe(Intrinsic::experimental_vp_splice,
+ Ops, TypeInfo.inferScalarType(VPI),
+ VPI->getDebugLoc());
+ }
+
VPValue *LHS, *RHS;
// Transform select with a header mask condition
// select(header_mask, LHS, RHS)
@@ -1733,6 +1744,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
// Create a scalar phi to track the previous EVL if fixed-order recurrence is
// contained.
+ VPScalarPHIRecipe *PrevEVL = nullptr;
bool ContainsFORs =
any_of(Header->phis(), IsaPred<VPFirstOrderRecurrencePHIRecipe>);
if (ContainsFORs) {
@@ -1748,7 +1760,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
VPBasicBlock *Preheader = LoopRegion->getPreheaderVPBB();
Preheader->appendRecipe(cast<VPScalarCastRecipe>(MaxEVL));
}
- auto *PrevEVL = new VPScalarPHIRecipe(MaxEVL, &EVL, DebugLoc(), "prev.evl");
+ PrevEVL = new VPScalarPHIRecipe(MaxEVL, &EVL, DebugLoc(), "prev.evl");
PrevEVL->insertBefore(*Header, Header->getFirstNonPhi());
}
@@ -1762,8 +1774,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
for (VPUser *U : collectUsersRecursively(HeaderMask)) {
auto *CurRecipe = cast<VPRecipeBase>(U);
- VPRecipeBase *EVLRecipe =
- createEVLRecipe(HeaderMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
+ VPRecipeBase *EVLRecipe = createEVLRecipe(
+ HeaderMask, *CurRecipe, TypeInfo, *AllOneMask, EVL, PrevEVL);
if (!EVLRecipe)
continue;
>From e7b6d6eb069d32127b9441b12374c157ba75ad83 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 23 Jan 2025 01:14:21 -0800
Subject: [PATCH 5/7] Step 2.1: Directly create -1 in createEVLRecipe.
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 31a14b49ac2b3b..ad962ff754438c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1709,8 +1709,11 @@ static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask,
.Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
if (VPI->getOpcode() == VPInstruction::FirstOrderRecurrenceSplice) {
assert(PrevEVL && "Fixed-order recurrences require previous EVL");
+ VPValue *MinusOneVPV = VPI->getParent()->getPlan()->getOrAddLiveIn(
+ ConstantInt::getSigned(Type::getInt32Ty(TypeInfo.getContext()),
+ -1));
SmallVector<VPValue *> Ops(VPI->operands());
- Ops.append({&AllOneMask, PrevEVL, &EVL});
+ Ops.append({MinusOneVPV, &AllOneMask, PrevEVL, &EVL});
return new VPWidenIntrinsicRecipe(Intrinsic::experimental_vp_splice,
Ops, TypeInfo.inferScalarType(VPI),
VPI->getDebugLoc());
>From 643e07f04c6278b2ad7ccca709020e7db4181c62 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 23 Jan 2025 01:15:36 -0800
Subject: [PATCH 6/7] Split PR: Make vp.splice not Invalid
---
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index add82dc80c4290..7f868ec08208c7 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1235,6 +1235,8 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
: RISCV::VMV_V_X,
LT.second, CostKind);
}
+ case Intrinsic::experimental_vp_splice:
+ return 1;
}
if (ST->hasVInstructions() && RetTy->isVectorTy()) {
>From cd72b8de08fdc32f4415ed4cba917e4118c23093 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 23 Jan 2025 01:17:50 -0800
Subject: [PATCH 7/7] test update
---
...ize-force-tail-with-evl-fixed-order-recurrence.ll | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
index bae99d5f8c78a7..ed9c579ce34550 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
@@ -46,7 +46,7 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP13]]
; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 0
; IF-EVL-NEXT: [[VP_OP_LOAD]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
-; IF-EVL-NEXT: [[TMP16:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[VP_OP_LOAD]], i32 -1)
+; IF-EVL-NEXT: [[TMP16:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[VP_OP_LOAD]], i32 -1, <vscale x 4 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP12]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> [[TMP16]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP13]]
; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0
@@ -198,8 +198,8 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP16]]
; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0
; IF-EVL-NEXT: [[VP_OP_LOAD]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP15]])
-; IF-EVL-NEXT: [[TMP19]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[VP_OP_LOAD]], i32 -1)
-; IF-EVL-NEXT: [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR2]], <vscale x 4 x i32> [[TMP19]], i32 -1)
+; IF-EVL-NEXT: [[TMP19]] = call <vscale x 4 x i32> @llvm.experimental.vp.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[VP_OP_LOAD]], i32 -1, <vscale x 4 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP15]])
+; IF-EVL-NEXT: [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR2]], <vscale x 4 x i32> [[TMP19]], i32 -1, <vscale x 4 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP15]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> [[TMP19]], <vscale x 4 x i32> [[TMP20]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP15]])
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP16]]
; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP21]], i32 0
@@ -375,9 +375,9 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP19]]
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 0
; IF-EVL-NEXT: [[VP_OP_LOAD]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP18]])
-; IF-EVL-NEXT: [[TMP22]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[VP_OP_LOAD]], i32 -1)
-; IF-EVL-NEXT: [[TMP23]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR2]], <vscale x 4 x i32> [[TMP22]], i32 -1)
-; IF-EVL-NEXT: [[TMP24:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR4]], <vscale x 4 x i32> [[TMP23]], i32 -1)
+; IF-EVL-NEXT: [[TMP22]] = call <vscale x 4 x i32> @llvm.experimental.vp.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[VP_OP_LOAD]], i32 -1, <vscale x 4 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP18]])
+; IF-EVL-NEXT: [[TMP23]] = call <vscale x 4 x i32> @llvm.experimental.vp.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR2]], <vscale x 4 x i32> [[TMP22]], i32 -1, <vscale x 4 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP18]])
+; IF-EVL-NEXT: [[TMP24:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR4]], <vscale x 4 x i32> [[TMP23]], i32 -1, <vscale x 4 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP18]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> [[TMP23]], <vscale x 4 x i32> [[TMP24]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP18]])
; IF-EVL-NEXT: [[VP_OP5:%.*]] = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> [[VP_OP]], <vscale x 4 x i32> [[TMP22]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP18]])
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP19]]
More information about the llvm-commits
mailing list