[llvm] [VPlan] Don't cost FOR splice if unused in legacy cost model (PR #131486)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Sun Mar 16 21:47:19 PDT 2025
https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/131486
>From 3e3a16732e3ec81ad781e4ac7b1a4d4fc32e0c6c Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Sun, 16 Mar 2025 10:32:57 +0800
Subject: [PATCH 1/2] [VPlan] Don't cost FOR splice if unused in legacy cost
model
Fixes #131359
After #129645, a first-order recurrence will no longer have its splice costed if the VPInstruction::FirstOrderRecurrenceSplice has no users and is dead.
The legacy cost model didn't account for this, so update it to avoid the "VPlan cost model and legacy cost model disagreed" assertion.
Alternatively, we could also account for this in planContainsAdditionalSimplifications.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 5 ++
.../Transforms/LoopVectorize/X86/pr131359.ll | 61 +++++++++++++++++++
2 files changed, 66 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/X86/pr131359.ll
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 08e125eca591e..f6b91000a2e7b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6541,6 +6541,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// TODO: Consider vscale_range info.
if (VF.isScalable() && VF.getKnownMinValue() == 1)
return InstructionCost::getInvalid();
+ // If a FOR has no users inside the loop we won't generate a splice.
+ if (none_of(Phi->users(), [this](User *U) {
+ return TheLoop->contains(cast<Instruction>(U));
+ }))
+ return 0;
SmallVector<int> Mask(VF.getKnownMinValue());
std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr131359.ll b/llvm/test/Transforms/LoopVectorize/X86/pr131359.ll
new file mode 100644
index 0000000000000..2d0c0ce891ae7
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr131359.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -S %s | FileCheck %s
+
+; Make sure the legacy cost model doesn't add a cost for a splice when the
+; first-order recurrence isn't used inside the loop. The VPlan cost model
+; eliminates the dead VPInstruction::FirstOrderRecurrenceSplice so the two cost
+; models would go out of sync otherwise.
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64"
+
+define void @h() {
+; CHECK-LABEL: define void @h() {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, %[[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
+; CHECK-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3
+; CHECK-NEXT: br i1 false, label %[[F_EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 40, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[FOR_COND_I:.*]]
+; CHECK: [[FOR_COND_I]]:
+; CHECK-NEXT: [[D_0_I:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[E_0_I:%.*]], %[[FOR_COND_I]] ]
+; CHECK-NEXT: [[E_0_I]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC_I:%.*]], %[[FOR_COND_I]] ]
+; CHECK-NEXT: [[INC_I]] = add i32 [[E_0_I]], 1
+; CHECK-NEXT: [[EXITCOND_NOT_I:%.*]] = icmp eq i32 [[E_0_I]], 43
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_I]], label %[[F_EXIT]], label %[[FOR_COND_I]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[F_EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.cond.i
+
+for.cond.i:
+ %d.0.i = phi i32 [ 0, %entry ], [ %e.0.i, %for.cond.i ]
+ %e.0.i = phi i32 [ 0, %entry ], [ %inc.i, %for.cond.i ]
+ %inc.i = add i32 %e.0.i, 1
+ %exitcond.not.i = icmp eq i32 %e.0.i, 43
+ br i1 %exitcond.not.i, label %f.exit, label %for.cond.i
+
+f.exit:
+ ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
>From 3952b01c9e7daf0c92064a970b299e84b0b53c85 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 17 Mar 2025 12:46:47 +0800
Subject: [PATCH 2/2] Move check into planContainsAdditionalSimplifications to
account for dead uses of FOR
---
.../Transforms/Vectorize/LoopVectorize.cpp | 15 +++--
.../Transforms/LoopVectorize/X86/pr131359.ll | 62 +++++++++++++++++--
2 files changed, 66 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f6b91000a2e7b..88b58cccdbff2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6541,11 +6541,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// TODO: Consider vscale_range info.
if (VF.isScalable() && VF.getKnownMinValue() == 1)
return InstructionCost::getInvalid();
- // If a FOR has no users inside the loop we won't generate a splice.
- if (none_of(Phi->users(), [this](User *U) {
- return TheLoop->contains(cast<Instruction>(U));
- }))
- return 0;
SmallVector<int> Mask(VF.getKnownMinValue());
std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
@@ -7472,6 +7467,16 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
}
continue;
}
+ // If a FOR's splice wasn't used it will have been removed, so the VPlan
+ // model won't cost it whilst the legacy will.
+ if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
+ if (none_of(FOR->users(), [](VPUser *U) {
+ auto *VPI = dyn_cast<VPInstruction>(U);
+ return VPI && VPI->getOpcode() ==
+ VPInstruction::FirstOrderRecurrenceSplice;
+ }))
+ return true;
+ }
// The VPlan-based cost model is more accurate for partial reduction and
// comparing against the legacy cost isn't desirable.
if (isa<VPPartialReductionRecipe>(&R))
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr131359.ll b/llvm/test/Transforms/LoopVectorize/X86/pr131359.ll
index 2d0c0ce891ae7..a5e1796bdc5fc 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr131359.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr131359.ll
@@ -1,16 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -p loop-vectorize -S %s | FileCheck %s
-; Make sure the legacy cost model doesn't add a cost for a splice when the
-; first-order recurrence isn't used inside the loop. The VPlan cost model
-; eliminates the dead VPInstruction::FirstOrderRecurrenceSplice so the two cost
-; models would go out of sync otherwise.
+; If a FOR isn't used the VPInstruction::FirstOrderRecurrenceSplice will be dead
+; and won't be costed in the VPlan cost model. Make sure we account for this
+; simplification in comparison to the legacy cost model.
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64"
-define void @h() {
-; CHECK-LABEL: define void @h() {
+define void @no_use() {
+; CHECK-LABEL: define void @no_use() {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
@@ -53,9 +52,60 @@ for.cond.i:
f.exit:
ret void
}
+
+
+define void @dead_use() {
+; CHECK-LABEL: define void @dead_use() {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, %[[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
+; CHECK-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3
+; CHECK-NEXT: br i1 false, label %[[F_EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 40, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[FOR_COND_I:.*]]
+; CHECK: [[FOR_COND_I]]:
+; CHECK-NEXT: [[D_0_I:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[E_0_I:%.*]], %[[FOR_COND_I]] ]
+; CHECK-NEXT: [[E_0_I]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC_I:%.*]], %[[FOR_COND_I]] ]
+; CHECK-NEXT: [[DEAD:%.*]] = add i32 [[D_0_I]], 1
+; CHECK-NEXT: [[INC_I]] = add i32 [[E_0_I]], 1
+; CHECK-NEXT: [[EXITCOND_NOT_I:%.*]] = icmp eq i32 [[E_0_I]], 43
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_I]], label %[[F_EXIT]], label %[[FOR_COND_I]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[F_EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.cond.i
+
+for.cond.i:
+ %d.0.i = phi i32 [ 0, %entry ], [ %e.0.i, %for.cond.i ]
+ %e.0.i = phi i32 [ 0, %entry ], [ %inc.i, %for.cond.i ]
+ %dead = add i32 %d.0.i, 1
+ %inc.i = add i32 %e.0.i, 1
+ %exitcond.not.i = icmp eq i32 %e.0.i, 43
+ br i1 %exitcond.not.i, label %f.exit, label %for.cond.i
+
+f.exit:
+ ret void
+}
+
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
;.
More information about the llvm-commits
mailing list