[llvm] [VPlan] Add simplifications to remove header masks in predicated AnyOf select reductions (PR #190196)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 2 08:32:21 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-risc-v
@llvm/pr-subscribers-vectorizers
Author: Luke Lau (lukel97)
<details>
<summary>Changes</summary>
Fixes https://github.com/llvm/llvm-project/issues/189553
This adds two transforms to improve the RISC-V codegen for AnyOf select reductions such as the example below.
The first is `select x, (i1 y | z), y -> y | (x && z)`, [Alive2](https://alive2.llvm.org/ce/z/qcQRn6).
The second is `lhs | (headermask && rhs) -> vp.merge rhs, true, lhs, evl` in optimizeMaskToEVL.
The first transform on its own breaks some vp.merge patterns without the second one, so they are both included in this PR to avoid regressions. I've split it into two commits so reviewers can see the diff.
For example, the following code:
```llvm
define i32 @f(ptr noalias %p, i64 %n) {
entry:
br label %loop
loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
%rdx = phi i32 [ 0, %entry ], [ %phi, %latch ]
%gep = getelementptr i32, ptr %p, i64 %iv
%load = load i32, ptr %gep
%cmp1 = icmp uge i32 %load, 12
br i1 %cmp1, label %if, label %latch
if:
%cmp2 = icmp ult i32 %load, 15
%select = select i1 %cmp2, i32 1, i32 %rdx
br label %latch
latch:
%phi = phi i32 [ %rdx, %loop ], [ %select, %if ]
%iv.next = add i64 %iv, 1
%ec = icmp eq i64 %iv.next, %n
br i1 %ec, label %exit, label %loop
exit:
ret i32 %phi
}
```
previously generated the following on RISC-V:
```asm
vsetvli a3, zero, e64, m4, ta, ma
vmclr.m v12
vid.v v8
.LBB0_1: # %vector.body
# =>This Inner Loop Header: Depth=1
vsetvli a3, a1, e8, mf2, ta, ma
slli a4, a2, 2
add a4, a0, a4
vle32.v v14, (a4)
vsetvli a4, zero, e64, m4, ta, ma
vmv.v.x v16, a3
vmsleu.vv v13, v16, v8
vsetvli zero, zero, e32, m2, ta, ma
vmsleu.vi v16, v14, 11
vmsgtu.vi v17, v14, 11
vmsleu.vi v18, v14, 14
vsetvli zero, zero, e64, m4, ta, ma
vmsltu.vx v14, v8, a3
sub a1, a1, a3
vmand.mm v13, v12, v13
vmor.mm v15, v12, v18
vmand.mm v12, v12, v16
vmand.mm v15, v15, v17
vmor.mm v12, v15, v12
vmand.mm v12, v12, v14
vmor.mm v12, v12, v13
add a2, a3, a2
bnez a1, .LBB0_1
# %bb.2: # %middle.block
vcpop.m a0, v12
snez a0, a0
ret
```
With this patch we can successfully remove the header mask and, more importantly, remove the binary `or` in the chain that prevented RISCVCodeGenPrepare from widening the recurrence:
```asm
vsetvli a3, zero, e8, mf2, ta, ma
vmv.v.i v8, 0
.LBB0_1: # %vector.body
# =>This Inner Loop Header: Depth=1
vsetvli a3, a1, e32, m2, ta, ma
slli a4, a2, 2
add a4, a0, a4
vle32.v v10, (a4)
vmsgtu.vi v9, v10, 11
vmsleu.vi v12, v10, 14
vmand.mm v0, v9, v12
sub a1, a1, a3
vsetvli zero, zero, e8, mf2, tu, ma
vmerge.vim v8, v8, 1, v0
add a2, a3, a2
bnez a1, .LBB0_1
# %bb.2: # %middle.block
vsetvli a0, zero, e8, mf2, ta, ma
vand.vi v8, v8, 1
vmsne.vi v8, v8, 0
vcpop.m a0, v8
snez a0, a0
ret
```
---
Full diff: https://github.com/llvm/llvm-project/pull/190196.diff
4 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+17)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll (+2-2)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll (+2-3)
- (modified) llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll (+7-7)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9944df4778774..0804c0c27a92d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1458,6 +1458,15 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
return;
}
+ // select x, (i1 y | z), y -> y | (x && z)
+ if (CanCreateNewRecipe &&
+ match(Def, m_Select(m_VPValue(X),
+ m_OneUse(m_c_BinaryOr(m_VPValue(Y), m_VPValue(Z))),
+ m_Deferred(Y))) &&
+ TypeInfo.inferScalarType(Y)->isIntegerTy(1))
+ return Def->replaceAllUsesWith(
+ Builder.createOr(Y, Builder.createLogicalAnd(X, Z)));
+
if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
return Def->replaceAllUsesWith(A);
@@ -3141,6 +3150,14 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
VPValue *LHS, *RHS;
+ // lhs | (headermask && rhs) -> vp.merge rhs, true, lhs, evl
+ if (match(&CurRecipe,
+ m_c_BinaryOr(m_VPValue(LHS),
+ m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(RHS)))))
+ return new VPWidenIntrinsicRecipe(
+ Intrinsic::vp_merge, {RHS, Plan->getTrue(), LHS, &EVL},
+ TypeInfo.inferScalarType(LHS), {}, {}, DL);
+
if (match(&CurRecipe,
m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
return new VPWidenIntrinsicRecipe(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
index 1165d88312817..0a94b8bb39c9d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
@@ -169,8 +169,8 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
; CHECK-VF4IC1: [[MASK:%.*]] = icmp sgt <vscale x 4 x i32> [[VEC_LOAD]], splat (i32 35)
; CHECK-VF4IC1: [[MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 {{%.*}}, <vscale x 4 x i1> [[MASK]], <vscale x 4 x i32> poison)
; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <vscale x 4 x i32> [[MASKED_LOAD]], splat (i32 2)
-; CHECK-VF4IC1-NEXT: [[VEC_SEL_TMP:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[VEC_ICMP]]
-; CHECK-VF4IC1: [[VEC_SEL:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x i1> [[VEC_SEL_TMP]], <vscale x 4 x i1> [[VEC_PHI]]
+; CHECK-VF4IC1: [[VEC_SEL_TMP:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i1> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[VEC_SEL:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[VEC_SEL_TMP]]
; CHECK-VF4IC1: middle.block:
; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[VEC_SEL]])
; CHECK-VF4IC1-NEXT: [[FR:%.*]] = freeze i1 [[OR_RDX]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
index d7bab8fda3fe3..7ed567a910bcf 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
@@ -322,9 +322,8 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> [[TMP7]], i32 [[TMP17]])
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 2)
-; CHECK-NEXT: [[TMP10:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP9]]
-; CHECK-NEXT: [[PREDPHI1:%.*]] = select <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[VEC_PHI]]
-; CHECK-NEXT: [[PREDPHI]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[PREDPHI1]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP17]])
+; CHECK-NEXT: [[TMP5:%.*]] = select <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT: [[PREDPHI]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP5]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP17]])
; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP17]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP21]]
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
index 8ab7ea85ea7c7..6e6eea11647df 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
@@ -39,8 +39,8 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
; CHECK-VF2IC1: [[PRED_LOAD_CONTINUE2]]:
; CHECK-VF2IC1-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP9]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ]
; CHECK-VF2IC1-NEXT: [[TMP16:%.*]] = icmp eq <2 x i32> [[TMP15]], splat (i32 2)
-; CHECK-VF2IC1-NEXT: [[TMP17:%.*]] = or <2 x i1> [[VEC_PHI]], [[TMP16]]
-; CHECK-VF2IC1-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP17]], <2 x i1> [[VEC_PHI]]
+; CHECK-VF2IC1-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP16]], <2 x i1> zeroinitializer
+; CHECK-VF2IC1-NEXT: [[PREDPHI]] = or <2 x i1> [[VEC_PHI]], [[TMP17]]
; CHECK-VF2IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-VF2IC1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF2IC1-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -88,7 +88,7 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
; CHECK-VF1IC2: [[VECTOR_BODY]]:
; CHECK-VF1IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE3:.*]] ]
; CHECK-VF1IC2-NEXT: [[VEC_PHI:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[PREDPHI:%.*]], %[[PRED_LOAD_CONTINUE3]] ]
-; CHECK-VF1IC2-NEXT: [[VEC_PHI2:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[PREDPHI5:%.*]], %[[PRED_LOAD_CONTINUE3]] ]
+; CHECK-VF1IC2-NEXT: [[VEC_PHI1:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[PREDPHI5:%.*]], %[[PRED_LOAD_CONTINUE3]] ]
; CHECK-VF1IC2-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 1
; CHECK-VF1IC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[INDEX]]
; CHECK-VF1IC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP17]]
@@ -112,10 +112,10 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
; CHECK-VF1IC2-NEXT: [[TMP11:%.*]] = phi i32 [ poison, %[[PRED_LOAD_CONTINUE]] ], [ [[TMP10]], %[[PRED_LOAD_IF2]] ]
; CHECK-VF1IC2-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 2
; CHECK-VF1IC2-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP11]], 2
-; CHECK-VF1IC2-NEXT: [[TMP14:%.*]] = or i1 [[VEC_PHI]], [[TMP12]]
-; CHECK-VF1IC2-NEXT: [[TMP15:%.*]] = or i1 [[VEC_PHI2]], [[TMP13]]
-; CHECK-VF1IC2-NEXT: [[PREDPHI]] = select i1 [[TMP4]], i1 [[TMP14]], i1 [[VEC_PHI]]
-; CHECK-VF1IC2-NEXT: [[PREDPHI5]] = select i1 [[TMP5]], i1 [[TMP15]], i1 [[VEC_PHI2]]
+; CHECK-VF1IC2-NEXT: [[TMP15:%.*]] = select i1 [[TMP4]], i1 [[TMP12]], i1 false
+; CHECK-VF1IC2-NEXT: [[PREDPHI]] = or i1 [[VEC_PHI]], [[TMP15]]
+; CHECK-VF1IC2-NEXT: [[TMP21:%.*]] = select i1 [[TMP5]], i1 [[TMP13]], i1 false
+; CHECK-VF1IC2-NEXT: [[PREDPHI5]] = or i1 [[VEC_PHI1]], [[TMP21]]
; CHECK-VF1IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-VF1IC2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF1IC2-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
``````````
</details>
https://github.com/llvm/llvm-project/pull/190196
More information about the llvm-commits
mailing list