[llvm] [VPlan] Add simplifications to remove header masks in predicated AnyOf select reductions (PR #190196)

via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 2 08:32:21 PDT 2026


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-risc-v

@llvm/pr-subscribers-vectorizers

Author: Luke Lau (lukel97)

<details>
<summary>Changes</summary>

Fixes https://github.com/llvm/llvm-project/issues/189553
This adds two transforms to improve the RISC-V codegen for an AnyOf select reduction like the one below.

The first is `select x, (i1 y | z), y -> y | (x && z)`, [Alive2](https://alive2.llvm.org/ce/z/qcQRn6).

The second is `lhs | (headermask && rhs) -> vp.merge rhs, true, lhs, evl` in optimizeMaskToEVL. 

The first transform on its own breaks some vp.merge patterns without the second one, so they are both included in this PR to avoid regressions. I've split it into two commits so reviewers can see the diff.

With this patch the below code:

```llvm
define i32 @<!-- -->f(ptr noalias %p, i64 %n) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
  %rdx = phi i32 [ 0, %entry ], [ %phi, %latch ]
  %gep = getelementptr i32, ptr %p, i64 %iv
  %load = load i32, ptr %gep
  %cmp1 = icmp uge i32 %load, 12
  br i1 %cmp1, label %if, label %latch

if:
  %cmp2 = icmp ult i32 %load, 15
  %select = select i1 %cmp2, i32 1, i32 %rdx
  br label %latch

latch:
  %phi = phi i32 [ %rdx, %loop ], [ %select, %if ]
  %iv.next = add i64 %iv, 1
  %ec = icmp eq i64 %iv.next, %n
  br i1 %ec, label %exit, label %loop

exit:
  ret i32 %phi
}
```

Previously used to generate the below on RISC-V:

```asm
	vsetvli	a3, zero, e64, m4, ta, ma
	vmclr.m	v12
	vid.v	v8
.LBB0_1:                                # %vector.body
                                        # =>This Inner Loop Header: Depth=1
	vsetvli	a3, a1, e8, mf2, ta, ma
	slli	a4, a2, 2
	add	a4, a0, a4
	vle32.v	v14, (a4)
	vsetvli	a4, zero, e64, m4, ta, ma
	vmv.v.x	v16, a3
	vmsleu.vv	v13, v16, v8
	vsetvli	zero, zero, e32, m2, ta, ma
	vmsleu.vi	v16, v14, 11
	vmsgtu.vi	v17, v14, 11
	vmsleu.vi	v18, v14, 14
	vsetvli	zero, zero, e64, m4, ta, ma
	vmsltu.vx	v14, v8, a3
	sub	a1, a1, a3
	vmand.mm	v13, v12, v13
	vmor.mm	v15, v12, v18
	vmand.mm	v12, v12, v16
	vmand.mm	v15, v15, v17
	vmor.mm	v12, v15, v12
	vmand.mm	v12, v12, v14
	vmor.mm	v12, v12, v13
	add	a2, a3, a2
	bnez	a1, .LBB0_1
# %bb.2:                                # %middle.block
	vcpop.m	a0, v12
	snez	a0, a0
	ret
```

With this patch we can successfully remove the header mask and, more importantly, remove the binary `or` in the chain which prevents RISCVCodeGenPrepare from widening the recurrence:

```asm
	vsetvli	a3, zero, e8, mf2, ta, ma
	vmv.v.i	v8, 0
.LBB0_1:                                # %vector.body
                                        # =>This Inner Loop Header: Depth=1
	vsetvli	a3, a1, e32, m2, ta, ma
	slli	a4, a2, 2
	add	a4, a0, a4
	vle32.v	v10, (a4)
	vmsgtu.vi	v9, v10, 11
	vmsleu.vi	v12, v10, 14
	vmand.mm	v0, v9, v12
	sub	a1, a1, a3
	vsetvli	zero, zero, e8, mf2, tu, ma
	vmerge.vim	v8, v8, 1, v0
	add	a2, a3, a2
	bnez	a1, .LBB0_1
# %bb.2:                                # %middle.block
	vsetvli	a0, zero, e8, mf2, ta, ma
	vand.vi	v8, v8, 1
	vmsne.vi	v8, v8, 0
	vcpop.m	a0, v8
	snez	a0, a0
	ret
```

---
Full diff: https://github.com/llvm/llvm-project/pull/190196.diff


4 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+17) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll (+2-2) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll (+2-3) 
- (modified) llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll (+7-7) 


``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9944df4778774..0804c0c27a92d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1458,6 +1458,15 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
     return;
   }
 
+  // select x, (i1 y | z), y -> y | (x && z)
+  if (CanCreateNewRecipe &&
+      match(Def, m_Select(m_VPValue(X),
+                          m_OneUse(m_c_BinaryOr(m_VPValue(Y), m_VPValue(Z))),
+                          m_Deferred(Y))) &&
+      TypeInfo.inferScalarType(Y)->isIntegerTy(1))
+    return Def->replaceAllUsesWith(
+        Builder.createOr(Y, Builder.createLogicalAnd(X, Z)));
+
   if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
     return Def->replaceAllUsesWith(A);
 
@@ -3141,6 +3150,14 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
       return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
 
   VPValue *LHS, *RHS;
+  // lhs | (headermask && rhs) -> vp.merge rhs, true, lhs, evl
+  if (match(&CurRecipe,
+            m_c_BinaryOr(m_VPValue(LHS),
+                         m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(RHS)))))
+    return new VPWidenIntrinsicRecipe(
+        Intrinsic::vp_merge, {RHS, Plan->getTrue(), LHS, &EVL},
+        TypeInfo.inferScalarType(LHS), {}, {}, DL);
+
   if (match(&CurRecipe,
             m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
     return new VPWidenIntrinsicRecipe(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
index 1165d88312817..0a94b8bb39c9d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
@@ -169,8 +169,8 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
 ; CHECK-VF4IC1:        [[MASK:%.*]] = icmp sgt <vscale x 4 x i32> [[VEC_LOAD]], splat (i32 35)
 ; CHECK-VF4IC1:        [[MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 {{%.*}}, <vscale x 4 x i1> [[MASK]], <vscale x 4 x i32> poison)
 ; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <vscale x 4 x i32> [[MASKED_LOAD]], splat (i32 2)
-; CHECK-VF4IC1-NEXT:   [[VEC_SEL_TMP:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[VEC_ICMP]]
-; CHECK-VF4IC1:        [[VEC_SEL:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x i1> [[VEC_SEL_TMP]], <vscale x 4 x i1> [[VEC_PHI]]
+; CHECK-VF4IC1:        [[VEC_SEL_TMP:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i1> zeroinitializer
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[VEC_SEL_TMP]]
 ; CHECK-VF4IC1:      middle.block:
 ; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[VEC_SEL]])
 ; CHECK-VF4IC1-NEXT:   [[FR:%.*]] = freeze i1 [[OR_RDX]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
index d7bab8fda3fe3..7ed567a910bcf 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
@@ -322,9 +322,8 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> [[TMP7]], i32 [[TMP17]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 2)
-; CHECK-NEXT:    [[TMP10:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP9]]
-; CHECK-NEXT:    [[PREDPHI1:%.*]] = select <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[VEC_PHI]]
-; CHECK-NEXT:    [[PREDPHI]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[PREDPHI1]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP17]])
+; CHECK-NEXT:    [[TMP5:%.*]] = select <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP5]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP17]])
 ; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP17]] to i64
 ; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[INDEX]]
 ; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP21]]
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
index 8ab7ea85ea7c7..6e6eea11647df 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
@@ -39,8 +39,8 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
 ; CHECK-VF2IC1:       [[PRED_LOAD_CONTINUE2]]:
 ; CHECK-VF2IC1-NEXT:    [[TMP15:%.*]] = phi <2 x i32> [ [[TMP9]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ]
 ; CHECK-VF2IC1-NEXT:    [[TMP16:%.*]] = icmp eq <2 x i32> [[TMP15]], splat (i32 2)
-; CHECK-VF2IC1-NEXT:    [[TMP17:%.*]] = or <2 x i1> [[VEC_PHI]], [[TMP16]]
-; CHECK-VF2IC1-NEXT:    [[PREDPHI]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP17]], <2 x i1> [[VEC_PHI]]
+; CHECK-VF2IC1-NEXT:    [[TMP17:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP16]], <2 x i1> zeroinitializer
+; CHECK-VF2IC1-NEXT:    [[PREDPHI]] = or <2 x i1> [[VEC_PHI]], [[TMP17]]
 ; CHECK-VF2IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
 ; CHECK-VF2IC1-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF2IC1-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -88,7 +88,7 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
 ; CHECK-VF1IC2:       [[VECTOR_BODY]]:
 ; CHECK-VF1IC2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE3:.*]] ]
 ; CHECK-VF1IC2-NEXT:    [[VEC_PHI:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[PREDPHI:%.*]], %[[PRED_LOAD_CONTINUE3]] ]
-; CHECK-VF1IC2-NEXT:    [[VEC_PHI2:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[PREDPHI5:%.*]], %[[PRED_LOAD_CONTINUE3]] ]
+; CHECK-VF1IC2-NEXT:    [[VEC_PHI1:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[PREDPHI5:%.*]], %[[PRED_LOAD_CONTINUE3]] ]
 ; CHECK-VF1IC2-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-VF1IC2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[INDEX]]
 ; CHECK-VF1IC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP17]]
@@ -112,10 +112,10 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
 ; CHECK-VF1IC2-NEXT:    [[TMP11:%.*]] = phi i32 [ poison, %[[PRED_LOAD_CONTINUE]] ], [ [[TMP10]], %[[PRED_LOAD_IF2]] ]
 ; CHECK-VF1IC2-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 2
 ; CHECK-VF1IC2-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP11]], 2
-; CHECK-VF1IC2-NEXT:    [[TMP14:%.*]] = or i1 [[VEC_PHI]], [[TMP12]]
-; CHECK-VF1IC2-NEXT:    [[TMP15:%.*]] = or i1 [[VEC_PHI2]], [[TMP13]]
-; CHECK-VF1IC2-NEXT:    [[PREDPHI]] = select i1 [[TMP4]], i1 [[TMP14]], i1 [[VEC_PHI]]
-; CHECK-VF1IC2-NEXT:    [[PREDPHI5]] = select i1 [[TMP5]], i1 [[TMP15]], i1 [[VEC_PHI2]]
+; CHECK-VF1IC2-NEXT:    [[TMP15:%.*]] = select i1 [[TMP4]], i1 [[TMP12]], i1 false
+; CHECK-VF1IC2-NEXT:    [[PREDPHI]] = or i1 [[VEC_PHI]], [[TMP15]]
+; CHECK-VF1IC2-NEXT:    [[TMP21:%.*]] = select i1 [[TMP5]], i1 [[TMP13]], i1 false
+; CHECK-VF1IC2-NEXT:    [[PREDPHI5]] = or i1 [[VEC_PHI1]], [[TMP21]]
 ; CHECK-VF1IC2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-VF1IC2-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF1IC2-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]

``````````

</details>


https://github.com/llvm/llvm-project/pull/190196


More information about the llvm-commits mailing list