[llvm] [LLVM][VPlan] Pick more optimal initial value for VPBlend. (PR #104019)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 14 07:38:46 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Paul Walker (paulwalker-arm)
<details>
<summary>Changes</summary>
By choosing an initial value whose mask is only used by the blend we can remove the need for the mask entirely.
When going through my todo list I noticed my VPlanPatternMatch and OR simplification work (https://reviews.llvm.org/D157411) has been landed on main so here I'm updating https://reviews.llvm.org/D157412, which is now reimagined as a VPlan transformation.
This PR contains https://github.com/llvm/llvm-project/pull/104015 so the first should be ignored.
---
Patch is 38.69 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/104019.diff
12 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (-4)
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+11-7)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+1)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+49-6)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll (+21-27)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll (+13-15)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll (+11-12)
- (modified) llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll (+12-15)
- (modified) llvm/test/Transforms/LoopVectorize/if-reduction.ll (+2-3)
- (modified) llvm/test/Transforms/LoopVectorize/phi-cost.ll (+2-2)
- (modified) llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll (+1-3)
- (modified) llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll (+14-16)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f5337b11edc977..0d45c1f509ba54 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8170,8 +8170,6 @@ VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
// builder. At this point we generate the predication tree. There may be
// duplications since this is a simple recursive scan, but future
// optimizations will clean it up.
- // TODO: At the moment the first mask is always skipped, but it would be
- // better to skip the most expensive mask.
SmallVector<VPValue *, 2> OperandsWithMask;
for (unsigned In = 0; In < NumIncoming; In++) {
@@ -8184,8 +8182,6 @@ VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
"Distinct incoming values with one having a full mask");
break;
}
- if (In == 0)
- continue;
OperandsWithMask.push_back(EdgeMask);
}
return new VPBlendRecipe(Phi, OperandsWithMask);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 016ad75c21ceb0..e4c2e76fc381e0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2033,12 +2033,12 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
class VPBlendRecipe : public VPSingleDefRecipe {
public:
/// The blend operation is a User of the incoming values and of their
- /// respective masks, ordered [I0, I1, M1, I2, M2, ...]. Note that the first
- /// incoming value does not have a mask associated.
+ /// respective masks, ordered [I0, M0, I1, M1, I2, M2, ...]. Note that M0 can
+ /// be ommited (implied by passing an odd number of operands) in which case
+ /// all other incoming values are merged into it.
VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands)
: VPSingleDefRecipe(VPDef::VPBlendSC, Operands, Phi, Phi->getDebugLoc()) {
- assert((Operands.size() + 1) % 2 == 0 &&
- "Expected an odd number of operands");
+ assert(Operands.size() > 0 && "Expected at least one operand!");
}
VPBlendRecipe *clone() override {
@@ -2048,19 +2048,23 @@ class VPBlendRecipe : public VPSingleDefRecipe {
VP_CLASSOF_IMPL(VPDef::VPBlendSC)
+ /// A normalized blend is one that has an odd number of operands, whereby the
+ /// first operand does not have an associated mask.
+ bool isNormalized() const { return getNumOperands() % 2; }
+
/// Return the number of incoming values, taking into account that the first
/// incoming value has no mask.
unsigned getNumIncomingValues() const { return (getNumOperands() + 1) / 2; }
/// Return incoming value number \p Idx.
VPValue *getIncomingValue(unsigned Idx) const {
- return Idx == 0 ? getOperand(0) : getOperand(Idx * 2 - 1);
+ return Idx == 0 ? getOperand(0) : getOperand(Idx * 2 - isNormalized());
}
/// Return mask number \p Idx.
VPValue *getMask(unsigned Idx) const {
- assert(Idx > 0 && "First index has no mask associated.");
- return getOperand(Idx * 2);
+ assert((Idx > 0 || !isNormalized()) && "First index has no mask!");
+ return Idx == 0 ? getOperand(1) : getOperand(Idx * 2 + !isNormalized());
}
/// Generate the phi/select nodes.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 911b2fe9e9a1eb..e4d629ef8c27cd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1635,6 +1635,7 @@ void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
#endif
void VPBlendRecipe::execute(VPTransformState &State) {
+ assert(isNormalized() && "Expected blend to be normalized!");
State.setDebugLocFrom(getDebugLoc());
// We know that all PHIs in non-header blocks are converted into
// selects, so we don't have to worry about the insertion order and we
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 045f6c356669fa..9e5b4c3cf30429 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -989,15 +989,57 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
/// Try to simplify recipe \p R.
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
using namespace llvm::VPlanPatternMatch;
- // Try to remove redundant blend recipes.
+
if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
- VPValue *Inc0 = Blend->getIncomingValue(0);
+ // Try to remove redundant blend recipes.
+ SmallSet<VPValue *, 4> UniqueValues;
+ if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
+ UniqueValues.insert(Blend->getIncomingValue(0));
for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
- if (Inc0 != Blend->getIncomingValue(I) &&
- !match(Blend->getMask(I), m_False()))
- return;
- Blend->replaceAllUsesWith(Inc0);
+ if (!match(Blend->getMask(I), m_False()))
+ UniqueValues.insert(Blend->getIncomingValue(I));
+
+ if (UniqueValues.size() == 1) {
+ Blend->replaceAllUsesWith(*UniqueValues.begin());
+ Blend->eraseFromParent();
+ return;
+ }
+
+ if (Blend->isNormalized())
+ return;
+
+ // Normalize the blend so its first incomming value is used as the initial
+ // value with the others blended into it.
+
+ unsigned StartIndex = 0;
+ for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
+ // If a value's mask is only used by the blend then is can be deadcoded.
+ // TODO: Fine the most expensive mask that can be deadcoded.
+ VPValue *Mask = Blend->getMask(I);
+ if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
+ StartIndex = I;
+ break;
+ }
+ }
+
+ SmallVector<VPValue *, 4> OperandsWithMask;
+ OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
+
+ for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
+ if (I == StartIndex)
+ continue;
+ OperandsWithMask.push_back(Blend->getIncomingValue(I));
+ OperandsWithMask.push_back(Blend->getMask(I));
+ }
+
+ auto *NewBlend = new VPBlendRecipe(
+ cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask);
+ NewBlend->insertBefore(&R);
+
+ VPValue *DeadMask = Blend->getMask(StartIndex);
+ Blend->replaceAllUsesWith(NewBlend);
Blend->eraseFromParent();
+ recursivelyDeleteDeadRecipes(DeadMask);
return;
}
@@ -1058,6 +1100,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) &&
X == X1 && Y == Y1) {
R.getVPSingleValue()->replaceAllUsesWith(X);
+ R.eraseFromParent();
return;
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index f467f3cf262d2f..93034f4dbe56ec 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -215,16 +215,14 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
; TFCOMMON-NEXT: [[TMP6:%.*]] = icmp ugt <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 50, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; TFCOMMON-NEXT: [[TMP7:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> zeroinitializer
; TFCOMMON-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP7]])
-; TFCOMMON-NEXT: [[TMP9:%.*]] = xor <vscale x 2 x i1> [[TMP6]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
-; TFCOMMON-NEXT: [[TMP10:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i1> zeroinitializer
-; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP8]]
-; TFCOMMON-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP11]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP7]], <vscale x 2 x i64> [[TMP8]], <vscale x 2 x i64> zeroinitializer
+; TFCOMMON-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
+; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
-; TFCOMMON-NEXT: [[TMP12:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
-; TFCOMMON-NEXT: [[TMP13:%.*]] = extractelement <vscale x 2 x i1> [[TMP12]], i32 0
-; TFCOMMON-NEXT: br i1 [[TMP13]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; TFCOMMON-NEXT: [[TMP10:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
+; TFCOMMON-NEXT: [[TMP11:%.*]] = extractelement <vscale x 2 x i1> [[TMP10]], i32 0
+; TFCOMMON-NEXT: br i1 [[TMP11]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; TFCOMMON: for.cond.cleanup:
; TFCOMMON-NEXT: ret void
;
@@ -259,27 +257,23 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i1> [[TMP12]], <vscale x 2 x i1> zeroinitializer
; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP13]])
; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD3]], <vscale x 2 x i1> [[TMP14]])
-; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = xor <vscale x 2 x i1> [[TMP11]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
-; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = xor <vscale x 2 x i1> [[TMP12]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
-; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> zeroinitializer
-; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i1> [[TMP18]], <vscale x 2 x i1> zeroinitializer
-; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP19]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP15]]
-; TFA_INTERLEAVE-NEXT: [[PREDPHI4:%.*]] = select <vscale x 2 x i1> [[TMP20]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP16]]
-; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 2
-; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i64 [[TMP23]]
-; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP21]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr [[TMP24]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
+; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP13]], <vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> zeroinitializer
+; TFA_INTERLEAVE-NEXT: [[PREDPHI4:%.*]] = select <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i64> [[TMP16]], <vscale x 2 x i64> zeroinitializer
+; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
+; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
+; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[TMP17]], i64 [[TMP19]]
+; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP17]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr [[TMP20]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
-; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; TFA_INTERLEAVE-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 2
-; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = add i64 [[INDEX_NEXT]], [[TMP26]]
+; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
+; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 2
+; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = add i64 [[INDEX_NEXT]], [[TMP22]]
; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
-; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP27]], i64 1025)
-; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
-; TFA_INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <vscale x 2 x i1> [[TMP28]], i32 0
-; TFA_INTERLEAVE-NEXT: br i1 [[TMP29]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP23]], i64 1025)
+; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
+; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <vscale x 2 x i1> [[TMP24]], i32 0
+; TFA_INTERLEAVE-NEXT: br i1 [[TMP25]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; TFA_INTERLEAVE: for.cond.cleanup:
; TFA_INTERLEAVE-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index f922873210b052..66d001498e457b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -1216,7 +1216,7 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
; CHECK-ORDERED-TF: vector.body:
; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP10]]
; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
@@ -1226,41 +1226,39 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP10]]
; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP15]], i32 0
; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP14]], <vscale x 4 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP13]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> zeroinitializer
-; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP18]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[WIDE_MASKED_LOAD1]]
-; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[PREDPHI]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT: [[TMP20]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP19]])
+; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP14]], <vscale x 4 x float> [[WIDE_MASKED_LOAD1]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[PREDPHI]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT: [[TMP18]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP17]])
; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
-; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[TMP21]], i32 0
-; CHECK-ORDERED-TF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[TMP19]], i32 0
+; CHECK-ORDERED-TF-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-ORDERED-TF: scalar.ph:
; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/104019
More information about the llvm-commits
mailing list