[llvm] [VPlan] Explicitly reassociate header mask in logical and (PR #180898)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 12 04:14:23 PST 2026
https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/180898
>From d68a857e9d542aee34672f08c12c0acb0e3f86ef Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 11 Feb 2026 16:19:32 +0800
Subject: [PATCH 1/5] [VPlan] Explicitly reassociate header mask in logical and
We reassociate ((x && y) && z) -> (x && (y && z)) if x has more than one use, in order to allow simplifying the header mask further. However, this is somewhat unreliable as there are times when it doesn't have more than one use, e.g. see the case we run into in https://github.com/llvm/llvm-project/pull/173265/changes#r2769759907.
This moves it into a separate transformation that always reassociates the header mask regardless of the number of uses, which prevents some fragile test changes in #173265.
We need to run it before both calls to simplifyRecipes in optimize. I considered putting it in simplifyRecipes itself, but simplifyRecipes is also called after unrolling and when the loop region is dissolved, which causes vputils::findHeaderMask to assert.
There isn't really any benefit to reassociating masks that aren't the header mask, so the existing simplification was removed.
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 32 +++++++++++++------
.../AArch64/force-target-instruction-cost.ll | 20 ++++++------
.../RISCV/tail-folding-complex-mask.ll | 3 +-
.../LoopVectorize/X86/predicate-switch.ll | 12 +++----
.../LoopVectorize/reduction-inloop.ll | 12 +++----
5 files changed, 47 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index efea585114947..b9de83c24acb3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1332,15 +1332,6 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
return;
}
- // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
- // tail folding it is likely that x is a header mask and can be simplified
- // further.
- if (match(Def, m_LogicalAnd(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
- m_VPValue(Z))) &&
- X->hasMoreThanOneUniqueUser())
- return Def->replaceAllUsesWith(
- Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
-
if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
return Def->replaceAllUsesWith(A);
@@ -1614,6 +1605,27 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
}
}
+/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
+/// header mask to be simplified further, e.g. in optimizeEVLMasks.
+static void reassociateHeaderMask(VPlan &Plan) {
+ VPValue *HeaderMask = vputils::findHeaderMask(Plan);
+ if (!HeaderMask)
+ return;
+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+ Plan.getEntry());
+ VPValue *X, *Y;
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB))
+ if (match(&R,
+ m_LogicalAnd(m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
+ m_VPValue(Y)))) {
+ VPBuilder Builder(&R);
+ R.getVPSingleValue()->replaceAllUsesWith(Builder.createLogicalAnd(
+ HeaderMask, Builder.createLogicalAnd(X, Y)));
+ R.eraseFromParent();
+ }
+}
+
static void narrowToSingleScalarRecipes(VPlan &Plan) {
if (Plan.hasScalarVFOnly())
return;
@@ -2758,12 +2770,14 @@ void VPlanTransforms::optimize(VPlan &Plan) {
RUN_VPLAN_PASS(removeRedundantCanonicalIVs, Plan);
RUN_VPLAN_PASS(removeRedundantInductionCasts, Plan);
+ RUN_VPLAN_PASS(reassociateHeaderMask, Plan);
RUN_VPLAN_PASS(simplifyRecipes, Plan);
RUN_VPLAN_PASS(removeDeadRecipes, Plan);
RUN_VPLAN_PASS(simplifyBlends, Plan);
RUN_VPLAN_PASS(legalizeAndOptimizeInductions, Plan);
RUN_VPLAN_PASS(narrowToSingleScalarRecipes, Plan);
RUN_VPLAN_PASS(removeRedundantExpandSCEVRecipes, Plan);
+ RUN_VPLAN_PASS(reassociateHeaderMask, Plan);
RUN_VPLAN_PASS(simplifyRecipes, Plan);
RUN_VPLAN_PASS(removeBranchOnConst, Plan);
RUN_VPLAN_PASS(removeDeadRecipes, Plan);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
index 6ea9809dc8ff8..892403415b335 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
@@ -184,9 +184,7 @@ define void @test_exit_branch_cost(ptr %dst, ptr noalias %x.ptr, ptr noalias %y.
; COMMON-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT2]], <2 x i1> poison, <2 x i32> zeroinitializer
; COMMON-NEXT: [[TMP0:%.*]] = select i1 [[C_4]], <2 x i1> [[BROADCAST_SPLAT]], <2 x i1> zeroinitializer
; COMMON-NEXT: [[TMP1:%.*]] = xor <2 x i1> [[TMP0]], splat (i1 true)
-; COMMON-NEXT: [[TMP2:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i1> [[BROADCAST_SPLAT3]], <2 x i1> zeroinitializer
; COMMON-NEXT: [[TMP3:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT3]], splat (i1 true)
-; COMMON-NEXT: [[TMP4:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i1> [[TMP3]], <2 x i1> zeroinitializer
; COMMON-NEXT: br label %[[VECTOR_BODY:.*]]
; COMMON: [[VECTOR_BODY]]:
; COMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE17:.*]] ]
@@ -219,7 +217,8 @@ define void @test_exit_branch_cost(ptr %dst, ptr noalias %x.ptr, ptr noalias %y.
; COMMON-NEXT: store i64 0, ptr [[DST_3]], align 8
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE9]]
; COMMON: [[PRED_STORE_CONTINUE9]]:
-; COMMON-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP7]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer
+; COMMON-NEXT: [[TMP22:%.*]] = select <2 x i1> [[TMP7]], <2 x i1> [[BROADCAST_SPLAT]], <2 x i1> zeroinitializer
+; COMMON-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP22]], <2 x i1> [[BROADCAST_SPLAT3]], <2 x i1> zeroinitializer
; COMMON-NEXT: [[TMP14:%.*]] = or <2 x i1> [[TMP6]], [[TMP13]]
; COMMON-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP13]], <2 x i64> zeroinitializer, <2 x i64> splat (i64 1)
; COMMON-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP14]], i32 0
@@ -236,20 +235,20 @@ define void @test_exit_branch_cost(ptr %dst, ptr noalias %x.ptr, ptr noalias %y.
; COMMON-NEXT: store i64 [[TMP18]], ptr [[DST_2]], align 8
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE13]]
; COMMON: [[PRED_STORE_CONTINUE13]]:
-; COMMON-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP7]], <2 x i1> [[TMP4]], <2 x i1> zeroinitializer
+; COMMON-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP22]], <2 x i1> [[TMP3]], <2 x i1> zeroinitializer
; COMMON-NEXT: [[TMP20:%.*]] = or <2 x i1> [[TMP14]], [[TMP19]]
; COMMON-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP20]], i32 0
; COMMON-NEXT: br i1 [[TMP21]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]]
; COMMON: [[PRED_STORE_IF14]]:
-; COMMON-NEXT: [[TMP22:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META7:![0-9]+]]
-; COMMON-NEXT: store i64 [[TMP22]], ptr [[DST]], align 8, !alias.scope [[META10:![0-9]+]], !noalias [[META7]]
+; COMMON-NEXT: [[TMP24:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META7:![0-9]+]]
+; COMMON-NEXT: store i64 [[TMP24]], ptr [[DST]], align 8, !alias.scope [[META10:![0-9]+]], !noalias [[META7]]
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE15]]
; COMMON: [[PRED_STORE_CONTINUE15]]:
; COMMON-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP20]], i32 1
; COMMON-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17]]
; COMMON: [[PRED_STORE_IF16]]:
-; COMMON-NEXT: [[TMP24:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META7]]
-; COMMON-NEXT: store i64 [[TMP24]], ptr [[DST]], align 8, !alias.scope [[META10]], !noalias [[META7]]
+; COMMON-NEXT: [[TMP26:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META7]]
+; COMMON-NEXT: store i64 [[TMP26]], ptr [[DST]], align 8, !alias.scope [[META10]], !noalias [[META7]]
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE17]]
; COMMON: [[PRED_STORE_CONTINUE17]]:
; COMMON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
@@ -645,9 +644,12 @@ define void @forced_scalar_instr(ptr %gep.dst) {
; COMMON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; COMMON-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
; COMMON-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8
-; COMMON-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; COMMON-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; COMMON: [[MIDDLE_BLOCK]]:
; COMMON-NEXT: br label %[[EXIT:.*]]
+; COMMON: [[EXIT]]:
+; COMMON-NEXT: ret void
+;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-complex-mask.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-complex-mask.ll
index 2ef5f55126c95..1aa53e1ef95a0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-complex-mask.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-complex-mask.ll
@@ -44,8 +44,7 @@ define void @test(i64 %n, ptr noalias %src0, ptr noalias %src1, ptr noalias %src
; IF-EVL-NEXT: [[TMP14:%.*]] = or <vscale x 4 x i1> [[TMP4]], [[TMP6]]
; IF-EVL-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
; IF-EVL-NEXT: [[PREDPHI8:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[TMP12]]
-; IF-EVL-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[BROADCAST_SPLAT4]], <vscale x 4 x i1> zeroinitializer
-; IF-EVL-NEXT: [[TMP17:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> zeroinitializer
+; IF-EVL-NEXT: [[TMP17:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[BROADCAST_SPLAT4]], <vscale x 4 x i1> zeroinitializer
; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i32> poison)
; IF-EVL-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[PREDPHI8]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
index 5a396f88b1a64..a53377af9e502 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
@@ -461,8 +461,8 @@ define void @switch_all_dests_distinct_variant_using_branches(ptr %start, ptr %e
; COST-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], zeroinitializer
; COST-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], splat (i1 true)
; COST-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true)
-; COST-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer
-; COST-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP12]], <4 x i1> zeroinitializer
+; COST-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP11]], <4 x i1> zeroinitializer
+; COST-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP12]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer
; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP13]])
; COST-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP8]], <4 x i1> zeroinitializer
; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP14]])
@@ -540,10 +540,10 @@ define void @switch_all_dests_distinct_variant_using_branches(ptr %start, ptr %e
; FORCED-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP10]], splat (i1 true)
; FORCED-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP11]], splat (i1 true)
; FORCED-NEXT: [[TMP18:%.*]] = xor <4 x i1> [[TMP12]], splat (i1 true)
-; FORCED-NEXT: [[TMP19:%.*]] = select <4 x i1> [[TMP17]], <4 x i1> [[TMP13]], <4 x i1> zeroinitializer
-; FORCED-NEXT: [[TMP20:%.*]] = select <4 x i1> [[TMP18]], <4 x i1> [[TMP14]], <4 x i1> zeroinitializer
-; FORCED-NEXT: [[TMP21:%.*]] = select <4 x i1> [[TMP15]], <4 x i1> [[TMP19]], <4 x i1> zeroinitializer
-; FORCED-NEXT: [[TMP22:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP20]], <4 x i1> zeroinitializer
+; FORCED-NEXT: [[TMP19:%.*]] = select <4 x i1> [[TMP15]], <4 x i1> [[TMP17]], <4 x i1> zeroinitializer
+; FORCED-NEXT: [[TMP20:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP18]], <4 x i1> zeroinitializer
+; FORCED-NEXT: [[TMP21:%.*]] = select <4 x i1> [[TMP19]], <4 x i1> [[TMP13]], <4 x i1> zeroinitializer
+; FORCED-NEXT: [[TMP22:%.*]] = select <4 x i1> [[TMP20]], <4 x i1> [[TMP14]], <4 x i1> zeroinitializer
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP21]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr align 1 [[TMP8]], <4 x i1> [[TMP22]])
; FORCED-NEXT: [[TMP23:%.*]] = select <4 x i1> [[TMP15]], <4 x i1> [[TMP11]], <4 x i1> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
index 43dede0b612f3..d3a8927d52aa9 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -1092,12 +1092,12 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) {
; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP4:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD1]], splat (float 1.000000e+00)
; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true)
+; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD]], splat (float 2.000000e+00)
; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP8]], <4 x i1> [[TMP6]], <4 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP3]], <4 x i1> [[TMP10]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP6]], <4 x i1> zeroinitializer
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]]
; CHECK-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP5]], <4 x float> [[TMP9]], <4 x float> [[PREDPHI]]
; CHECK-NEXT: [[PREDPHI3]] = select <4 x i1> [[TMP3]], <4 x float> [[PREDPHI2]], <4 x float> [[VEC_PHI]]
@@ -1135,6 +1135,8 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) {
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD4]], splat (float 1.000000e+00)
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = xor <4 x i1> [[TMP7]], splat (i1 true)
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true)
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = select <4 x i1> [[TMP5]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = select <4 x i1> [[TMP6]], <4 x i1> [[TMP10]], <4 x i1> zeroinitializer
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD]], splat (float 2.000000e+00)
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD2]], splat (float 2.000000e+00)
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD3]]
@@ -1143,10 +1145,8 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) {
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP6]], <4 x i1> [[TMP8]], <4 x i1> zeroinitializer
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = fadd fast <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = select <4 x i1> [[TMP9]], <4 x i1> [[TMP11]], <4 x i1> zeroinitializer
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP12]], <4 x i1> zeroinitializer
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = select <4 x i1> [[TMP5]], <4 x i1> [[TMP19]], <4 x i1> zeroinitializer
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = select <4 x i1> [[TMP6]], <4 x i1> [[TMP21]], <4 x i1> zeroinitializer
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = select <4 x i1> [[TMP19]], <4 x i1> [[TMP11]], <4 x i1> zeroinitializer
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = select <4 x i1> [[TMP21]], <4 x i1> [[TMP12]], <4 x i1> zeroinitializer
; CHECK-INTERLEAVED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP20]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP13]]
; CHECK-INTERLEAVED-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP15]], <4 x float> [[TMP17]], <4 x float> [[PREDPHI]]
; CHECK-INTERLEAVED-NEXT: [[PREDPHI6]] = select <4 x i1> [[TMP5]], <4 x float> [[PREDPHI5]], <4 x float> [[VEC_PHI]]
>From 26f357a02c6d48c4077f82b85864dce97467db1a Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 11 Feb 2026 17:07:43 +0800
Subject: [PATCH 2/5] Only traverse from vector region, update
-vplan-print-after-all test
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +-
llvm/test/Transforms/LoopVectorize/vplan-print-after-all.ll | 2 ++
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b9de83c24acb3..e53575ccd8e1c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1612,7 +1612,7 @@ static void reassociateHeaderMask(VPlan &Plan) {
if (!HeaderMask)
return;
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
- Plan.getEntry());
+ Plan.getVectorLoopRegion());
VPValue *X, *Y;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))
for (VPRecipeBase &R : make_early_inc_range(*VPBB))
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-print-after-all.ll b/llvm/test/Transforms/LoopVectorize/vplan-print-after-all.ll
index 5b68887f0f7da..bfc0cd143c308 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-print-after-all.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-print-after-all.ll
@@ -19,12 +19,14 @@
; CHECK: VPlan after VPlanTransforms::truncateToMinimalBitwidths
; CHECK: VPlan after removeRedundantCanonicalIVs
; CHECK: VPlan after removeRedundantInductionCasts
+; CHECK: VPlan after reassociateHeaderMask
; CHECK: VPlan after simplifyRecipes
; CHECK: VPlan after removeDeadRecipes
; CHECK: VPlan after simplifyBlends
; CHECK: VPlan after legalizeAndOptimizeInductions
; CHECK: VPlan after narrowToSingleScalarRecipes
; CHECK: VPlan after removeRedundantExpandSCEVRecipes
+; CHECK: VPlan after reassociateHeaderMask
; CHECK: VPlan after simplifyRecipes
; CHECK: VPlan after removeBranchOnConst
; CHECK: VPlan after removeDeadRecipes
>From edf38fd765ce5e51feafb9cc48a65d04daf5a5be Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 11 Feb 2026 18:06:57 +0800
Subject: [PATCH 3/5] Use collectUsersRecursively
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 25 +++++++++----------
1 file changed, 12 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index e53575ccd8e1c..1649b7d49bab7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1611,19 +1611,18 @@ static void reassociateHeaderMask(VPlan &Plan) {
VPValue *HeaderMask = vputils::findHeaderMask(Plan);
if (!HeaderMask)
return;
- ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
- Plan.getVectorLoopRegion());
- VPValue *X, *Y;
- for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))
- for (VPRecipeBase &R : make_early_inc_range(*VPBB))
- if (match(&R,
- m_LogicalAnd(m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
- m_VPValue(Y)))) {
- VPBuilder Builder(&R);
- R.getVPSingleValue()->replaceAllUsesWith(Builder.createLogicalAnd(
- HeaderMask, Builder.createLogicalAnd(X, Y)));
- R.eraseFromParent();
- }
+ for (VPUser *U : collectUsersRecursively(HeaderMask)) {
+ auto *R = cast<VPRecipeBase>(U);
+ VPValue *X, *Y;
+ if (match(R,
+ m_LogicalAnd(m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
+ m_VPValue(Y)))) {
+ VPBuilder Builder(R);
+ R->getVPSingleValue()->replaceAllUsesWith(
+ Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
+ R->eraseFromParent();
+ }
+ }
}
static void narrowToSingleScalarRecipes(VPlan &Plan) {
>From f9816b542ae3e78b6aa376b00d2344ac87b4dd4b Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 11 Feb 2026 19:27:01 +0800
Subject: [PATCH 4/5] Only traverse necessary users
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 15 ++++++++++-----
1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 1649b7d49bab7..3b4141c8bf544 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1611,13 +1611,18 @@ static void reassociateHeaderMask(VPlan &Plan) {
VPValue *HeaderMask = vputils::findHeaderMask(Plan);
if (!HeaderMask)
return;
- for (VPUser *U : collectUsersRecursively(HeaderMask)) {
- auto *R = cast<VPRecipeBase>(U);
+ SmallSetVector<VPUser *, 8> Worklist(HeaderMask->user_begin(),
+ HeaderMask->user_end());
+ while (!Worklist.empty()) {
+ auto *R = cast<VPRecipeBase>(Worklist.pop_back_val());
VPValue *X, *Y;
- if (match(R,
- m_LogicalAnd(m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
- m_VPValue(Y)))) {
+ if (match(R, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
+ Worklist.insert_range(R->getVPSingleValue()->users());
+ else if (match(R, m_LogicalAnd(
+ m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
+ m_VPValue(Y)))) {
VPBuilder Builder(R);
+ Worklist.insert_range(R->getVPSingleValue()->users());
R->getVPSingleValue()->replaceAllUsesWith(
Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
R->eraseFromParent();
>From c93da22694b472344eb3671daa0cb1bd59efa2c7 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 12 Feb 2026 20:13:56 +0800
Subject: [PATCH 5/5] Use VPSingleDefRecipe
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3b4141c8bf544..ca29e5ee726ba 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1614,16 +1614,18 @@ static void reassociateHeaderMask(VPlan &Plan) {
SmallSetVector<VPUser *, 8> Worklist(HeaderMask->user_begin(),
HeaderMask->user_end());
while (!Worklist.empty()) {
- auto *R = cast<VPRecipeBase>(Worklist.pop_back_val());
+ auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
+ if (!R)
+ continue;
VPValue *X, *Y;
if (match(R, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
- Worklist.insert_range(R->getVPSingleValue()->users());
+ Worklist.insert_range(R->users());
else if (match(R, m_LogicalAnd(
m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
m_VPValue(Y)))) {
VPBuilder Builder(R);
- Worklist.insert_range(R->getVPSingleValue()->users());
- R->getVPSingleValue()->replaceAllUsesWith(
+ Worklist.insert_range(R->users());
+ R->replaceAllUsesWith(
Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
R->eraseFromParent();
}
More information about the llvm-commits
mailing list