[llvm] [LV][EVL] Replace VPInstruction::Select with vp.merge for predicated div/rem (PR #154072)
Mel Chen via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 11 01:35:04 PDT 2025
https://github.com/Mel-Chen updated https://github.com/llvm/llvm-project/pull/154072
>From 06c6cff7ac211ebba17a899ef97afde8e96323a3 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Mon, 18 Aug 2025 00:16:45 -0700
Subject: [PATCH 1/2] EVL, transform VPInstruction::Select to vp.merge for div
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 18 +++++++++-------
.../Transforms/LoopVectorize/RISCV/divrem.ll | 21 +++----------------
2 files changed, 13 insertions(+), 26 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 2cac5557daeee..8134b95c0d534 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2493,18 +2493,20 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
return new VPReductionEVLRecipe(*Red, EVL, NewMask);
})
.Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
- VPValue *LHS, *RHS;
+ VPValue *Cond, *LHS, *RHS;
// Transform select with a header mask condition
- // select(header_mask, LHS, RHS)
+ // select(mask_w/_header_mask, LHS, RHS)
// into vector predication merge.
- // vp.merge(all-true, LHS, RHS, EVL)
- if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
- m_VPValue(RHS))))
+ // vp.merge(mask_w/o_header_mask, LHS, RHS, EVL)
+ if (!match(VPI,
+ m_Select(m_VPValue(Cond), m_VPValue(LHS), m_VPValue(RHS))))
return nullptr;
- // Use all true as the condition because this transformation is
- // limited to selects whose condition is a header mask.
+
+ VPValue *NewMask = GetNewMask(Cond);
+ if (!NewMask)
+ NewMask = &AllOneMask;
return new VPWidenIntrinsicRecipe(
- Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},
+ Intrinsic::vp_merge, {NewMask, LHS, RHS, &EVL},
TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
})
.Default([&](VPRecipeBase *R) { return nullptr; });
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
index c35358b3eed0f..4e8577564246e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -364,14 +364,9 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP12]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; CHECK-NEXT: [[TMP15:%.*]] = icmp ult <vscale x 2 x i32> [[TMP7]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]])
-; CHECK-NEXT: [[TMP16:%.*]] = select <vscale x 2 x i1> [[TMP15]], <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1), i32 [[TMP12]])
; CHECK-NEXT: [[TMP11:%.*]] = udiv <vscale x 2 x i64> [[WIDE_LOAD]], [[TMP10]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]])
@@ -479,14 +474,9 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP12]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; CHECK-NEXT: [[TMP15:%.*]] = icmp ult <vscale x 2 x i32> [[TMP7]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]])
-; CHECK-NEXT: [[TMP16:%.*]] = select <vscale x 2 x i1> [[TMP15]], <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1), i32 [[TMP12]])
; CHECK-NEXT: [[TMP11:%.*]] = sdiv <vscale x 2 x i64> [[WIDE_LOAD]], [[TMP10]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]])
@@ -799,15 +789,10 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP12]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
-; CHECK-NEXT: [[TMP15:%.*]] = icmp ult <vscale x 16 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP7]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP12]])
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD]], splat (i8 -128)
-; CHECK-NEXT: [[TMP16:%.*]] = select <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = select <vscale x 16 x i1> [[TMP16]], <vscale x 16 x i8> splat (i8 -1), <vscale x 16 x i8> splat (i8 1)
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 16 x i8> @llvm.vp.merge.nxv16i8(<vscale x 16 x i1> [[TMP9]], <vscale x 16 x i8> splat (i8 -1), <vscale x 16 x i8> splat (i8 1), i32 [[TMP12]])
; CHECK-NEXT: [[TMP11:%.*]] = sdiv <vscale x 16 x i8> [[WIDE_LOAD]], [[TMP10]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[WIDE_LOAD]]
; CHECK-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[PREDPHI]], ptr align 1 [[TMP7]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP12]])
>From 3060d05df4b0fffe74485a97f173bd195151e5c0 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 11 Sep 2025 01:30:18 -0700
Subject: [PATCH 2/2] Resolve select <all ones>, foo, bar -> vp.select <all
ones>, foo, bar, EVL
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 44 ++++++++++++++-----
1 file changed, 32 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8134b95c0d534..dbb39c8e14fbc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2443,15 +2443,23 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
VPRecipeBase &CurRecipe,
VPTypeAnalysis &TypeInfo,
VPValue &AllOneMask, VPValue &EVL) {
- // FIXME: Don't transform recipes to EVL recipes if they're not masked by the
- // header mask.
+ // Derive a new mask by removing the header mask. Return nullptr if a new mask
+ // cannot be derived because the original mask does not contain the header
+ // mask.
auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
assert(OrigMask && "Unmasked recipe when folding tail");
// HeaderMask will be handled using EVL.
+ if (HeaderMask == OrigMask)
+ return &AllOneMask;
VPValue *Mask;
if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask))))
return Mask;
- return HeaderMask == OrigMask ? nullptr : OrigMask;
+ return nullptr;
+ };
+
+ // TODO: Can be simplified by SimplifyRecipes.
+ auto OptimizeAllTrueMask = [](VPValue *Mask) -> VPValue * {
+ return match(Mask, m_True()) ? nullptr : Mask;
};
/// Adjust any end pointers so that they point to the end of EVL lanes not VF.
@@ -2474,23 +2482,35 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
};
return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe)
- .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
+ .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) -> VPRecipeBase * {
VPValue *NewMask = GetNewMask(L->getMask());
+ if (!NewMask)
+ return nullptr;
VPValue *NewAddr = GetNewAddr(L->getAddr());
- return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask);
+ return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL,
+ OptimizeAllTrueMask(NewMask));
})
- .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
+ .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) -> VPRecipeBase * {
VPValue *NewMask = GetNewMask(S->getMask());
+ if (!NewMask)
+ return nullptr;
VPValue *NewAddr = GetNewAddr(S->getAddr());
- return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);
+ return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL,
+ OptimizeAllTrueMask(NewMask));
})
- .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) {
+ .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) -> VPRecipeBase * {
VPValue *NewMask = GetNewMask(IR->getMask());
- return new VPInterleaveEVLRecipe(*IR, EVL, NewMask);
+ if (!NewMask)
+ return nullptr;
+ return new VPInterleaveEVLRecipe(*IR, EVL,
+ OptimizeAllTrueMask(NewMask));
})
- .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
+ .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) -> VPRecipeBase * {
VPValue *NewMask = GetNewMask(Red->getCondOp());
- return new VPReductionEVLRecipe(*Red, EVL, NewMask);
+ if (!NewMask)
+ return nullptr;
+ return new VPReductionEVLRecipe(*Red, EVL,
+ OptimizeAllTrueMask(NewMask));
})
.Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
VPValue *Cond, *LHS, *RHS;
@@ -2504,7 +2524,7 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
VPValue *NewMask = GetNewMask(Cond);
if (!NewMask)
- NewMask = &AllOneMask;
+ return nullptr;
return new VPWidenIntrinsicRecipe(
Intrinsic::vp_merge, {NewMask, LHS, RHS, &EVL},
TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
More information about the llvm-commits
mailing list