[llvm] [VPlan] Fold safe divisors into VP intrinsics with EVL (PR #148828)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 15 04:27:52 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-risc-v
Author: Luke Lau (lukel97)
<details>
<summary>Changes</summary>
If a udiv/sdiv/urem/srem needs to be predicated, either because of control flow in the loop or because of tail folding, we use a select to replace the divisor in the masked-off lanes with a safe value, i.e. 1.
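As a rough sketch of that pattern (the value names below are hypothetical, not taken from this patch), the predicated form looks like:
```llvm
; Illustrative sketch only: masked-off lanes get a divisor of 1,
; so the division cannot divide by zero in those lanes.
%safe.div = select <vscale x 2 x i1> %mask, <vscale x 2 x i64> %divisor, <vscale x 2 x i64> splat (i64 1)
%res = udiv <vscale x 2 x i64> %dividend, %safe.div
```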
For EVL tail folding we can optimize away the select and use a VP intrinsic directly, which helps cases in SPEC 525.x264_r and the llvm-test-suite on RISC-V, e.g.:
```diff
- vmv.v.i v9, 1
lui a2, 4
- vmv.v.x v10, a2
+ vmv.v.x v9, a2
.Lpcrel_hi387:
auipc a2, %pcrel_hi(_ZL2dt)
addi a2, a2, %pcrel_lo(.Lpcrel_hi387)
.LBB75_4: # %vector.body
# =>This Inner Loop Header: Depth=1
sub a3, a1, a0
- vmv1r.v v11, v9
vsetvli a3, a3, e16, m1, ta, ma
- vadd.vv v12, v8, v8
- vsetvli zero, zero, e16, m1, tu, ma
- vadd.vi v11, v12, 3
- vsetvli zero, zero, e16, m1, ta, ma
- vdivu.vv v11, v10, v11
+ vadd.vv v10, v8, v8
+ vadd.vi v10, v10, 3
+ vdivu.vv v12, v9, v10
sh2add a4, a0, a2
add a0, a0, a3
vsetvli zero, zero, e32, m2, ta, ma
- vzext.vf2 v12, v11
- vse32.v v12, (a4)
+ vzext.vf2 v10, v12
+ vse32.v v10, (a4)
```
It's tempting to try to fold away any arbitrary mask/EVL combination feeding into a divisor operand, but care needs to be taken: this transform replaces lanes that were previously well defined with poison. So we can only do this with the EVL-based IV and an all-ones mask, where we know that the lanes past EVL aren't read.
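As a hedged before/after sketch with hypothetical value names, the fold is essentially:
```llvm
; Before: lanes past %evl divide by 1, so every lane of %res is well defined.
%safe.div = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %divisor, <vscale x 2 x i64> splat (i64 1), i32 %evl)
%res = udiv <vscale x 2 x i64> %dividend, %safe.div

; After: lanes past %evl of %res become poison. This is only sound because,
; with the EVL-based IV and an all-ones mask, nothing reads those lanes.
%res = call <vscale x 2 x i64> @llvm.vp.udiv.nxv2i64(<vscale x 2 x i64> %dividend, <vscale x 2 x i64> %divisor, <vscale x 2 x i1> splat (i1 true), i32 %evl)
```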
This constraint is also why I chose to do this as a VPlan transform rather than in e.g. RISCVCodeGenPrepare: it's much harder to recover the EVL-based IV outside of VPlan.
Also worth noting is that we still avoid transforming non-trapping recipes to VP intrinsics; these division cases only need to be handled because of their trapping behaviour.
I looked into whether we could make the "safe-divisor" case more recognisable, either by adding a new recipe or a new VPInstruction to represent a divisor that can safely be folded into a div (producing poison in the masked-off lanes), but this ended up making things more complicated.
Fixes #129538
---
Full diff: https://github.com/llvm/llvm-project/pull/148828.diff
2 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+48)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-div.ll (+4-8)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6a3b3e6e41955..664e42b7e3318 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2176,6 +2176,52 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
.Default([&](VPRecipeBase *R) { return nullptr; });
}
+/// Try to optimize safe divisors away by converting their users to VP
+/// intrinsics:
+///
+/// udiv x, (vp.merge allones, y, 1, evl) -> vp.udiv x, y, allones, evl
+///
+/// Note the lanes past EVL will be changed from x to poison. This only works
+/// for the EVL-based IV and not any arbitrary EVL, because we know nothing
+/// will read the lanes past the EVL-based IV.
+static void
+optimizeSafeDivisorsToEVL(VPTypeAnalysis &TypeInfo, VPValue &AllOneMask,
+ VPValue &EVL,
+ SmallVectorImpl<VPRecipeBase *> &ToErase) {
+ using namespace VPlanPatternMatch;
+ for (VPUser *U : to_vector(EVL.users())) {
+ VPValue *Y;
+ if (!match(U, m_Intrinsic<Intrinsic::vp_merge>(m_AllOnes(), m_VPValue(Y),
+ m_SpecificInt(1),
+ m_Specific(&EVL))))
+ continue;
+ auto *Merge = cast<VPSingleDefRecipe>(U);
+
+ for (VPUser *User : to_vector(Merge->users())) {
+ auto *WidenR = dyn_cast<VPWidenRecipe>(User);
+ if (!WidenR || WidenR->getOperand(1) != Merge)
+ continue;
+ switch (WidenR->getOpcode()) {
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ break;
+ default:
+ continue;
+ }
+ VPValue *X = WidenR->getOperand(0);
+
+ auto *VPUDiv = new VPWidenIntrinsicRecipe(
+ VPIntrinsic::getForOpcode(WidenR->getOpcode()),
+ {X, Y, &AllOneMask, &EVL}, TypeInfo.inferScalarType(Merge));
+ VPUDiv->insertBefore(WidenR);
+ WidenR->replaceAllUsesWith(VPUDiv);
+ ToErase.push_back(WidenR);
+ }
+ }
+}
+
/// Replace recipes with their EVL variants.
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
@@ -2259,6 +2305,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
}
}
+ optimizeSafeDivisorsToEVL(TypeInfo, *AllOneMask, EVL, ToErase);
+
for (VPRecipeBase *R : reverse(ToErase)) {
SmallVector<VPValue *> PossiblyDead(R->operands());
R->eraseFromParent();
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-div.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-div.ll
index 3e83d8a757b5d..9936b7ef0de54 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-div.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-div.ll
@@ -35,8 +35,7 @@ define void @test_sdiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[B]], i64 [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[VP_OP_LOAD1]], <vscale x 2 x i64> splat (i64 1), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_OP:%.*]] = sdiv <vscale x 2 x i64> [[VP_OP_LOAD]], [[TMP11]]
+; IF-EVL-NEXT: [[VP_OP:%.*]] = call <vscale x 2 x i64> @llvm.vp.sdiv.nxv2i64(<vscale x 2 x i64> [[VP_OP_LOAD]], <vscale x 2 x i64> [[VP_OP_LOAD1]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
@@ -131,8 +130,7 @@ define void @test_udiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[B]], i64 [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[VP_OP_LOAD1]], <vscale x 2 x i64> splat (i64 1), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_OP:%.*]] = udiv <vscale x 2 x i64> [[VP_OP_LOAD]], [[TMP11]]
+; IF-EVL-NEXT: [[VP_OP:%.*]] = call <vscale x 2 x i64> @llvm.vp.udiv.nxv2i64(<vscale x 2 x i64> [[VP_OP_LOAD]], <vscale x 2 x i64> [[VP_OP_LOAD1]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
@@ -226,8 +224,7 @@ define void @test_srem(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[B]], i64 [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[VP_OP_LOAD1]], <vscale x 2 x i64> splat (i64 1), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_OP:%.*]] = srem <vscale x 2 x i64> [[VP_OP_LOAD]], [[TMP11]]
+; IF-EVL-NEXT: [[VP_OP:%.*]] = call <vscale x 2 x i64> @llvm.vp.srem.nxv2i64(<vscale x 2 x i64> [[VP_OP_LOAD]], <vscale x 2 x i64> [[VP_OP_LOAD1]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
@@ -321,8 +318,7 @@ define void @test_urem(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[B]], i64 [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[VP_OP_LOAD1]], <vscale x 2 x i64> splat (i64 1), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_OP:%.*]] = urem <vscale x 2 x i64> [[VP_OP_LOAD]], [[TMP11]]
+; IF-EVL-NEXT: [[VP_OP:%.*]] = call <vscale x 2 x i64> @llvm.vp.urem.nxv2i64(<vscale x 2 x i64> [[VP_OP_LOAD]], <vscale x 2 x i64> [[VP_OP_LOAD1]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
``````````
</details>
https://github.com/llvm/llvm-project/pull/148828