[llvm] [SCEVExpander] Fix addrec cost model (PR #106704)
Nikita Popov via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 30 03:23:59 PDT 2024
https://github.com/nikic created https://github.com/llvm/llvm-project/pull/106704
The current isHighCostExpansion cost model for addrecs computes the cost for some kind of polynomial expansion that does not appear to have any relation to addrec expansion whatsoever.
A literal expansion of an affine addrec is a phi and add (plus the expansion of start and step). For a non-affine addrec, we get another phi+add for each additional addrec nested in the step recurrence.
This partially fixes https://github.com/llvm/llvm-project/issues/53205 (the runtime unroll test case in this PR).
>From 889f113d6cf61d9be5943dcea9da9a6f439434d5 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 30 Aug 2024 12:10:56 +0200
Subject: [PATCH] fix cost model
---
.../Utils/ScalarEvolutionExpander.cpp | 48 +++-------
.../rewrite-loop-exit-values-phi.ll | 13 +--
.../X86/runtime-unroll-addrec-cost.ll | 91 ++++++++++++++++---
3 files changed, 96 insertions(+), 56 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index c7d758aa575e61..0927a3015818fd 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -1911,43 +1911,17 @@ template<typename T> static InstructionCost costAndCollectOperands(
break;
}
case scAddRecExpr: {
- // In this polynominal, we may have some zero operands, and we shouldn't
- // really charge for those. So how many non-zero coefficients are there?
- int NumTerms = llvm::count_if(S->operands(), [](const SCEV *Op) {
- return !Op->isZero();
- });
-
- assert(NumTerms >= 1 && "Polynominal should have at least one term.");
- assert(!(*std::prev(S->operands().end()))->isZero() &&
- "Last operand should not be zero");
-
- // Ignoring constant term (operand 0), how many of the coefficients are u> 1?
- int NumNonZeroDegreeNonOneTerms =
- llvm::count_if(S->operands(), [](const SCEV *Op) {
- auto *SConst = dyn_cast<SCEVConstant>(Op);
- return !SConst || SConst->getAPInt().ugt(1);
- });
-
- // Much like with normal add expr, the polynominal will require
- // one less addition than the number of it's terms.
- InstructionCost AddCost = ArithCost(Instruction::Add, NumTerms - 1,
- /*MinIdx*/ 1, /*MaxIdx*/ 1);
- // Here, *each* one of those will require a multiplication.
- InstructionCost MulCost =
- ArithCost(Instruction::Mul, NumNonZeroDegreeNonOneTerms);
- Cost = AddCost + MulCost;
-
- // What is the degree of this polynominal?
- int PolyDegree = S->getNumOperands() - 1;
- assert(PolyDegree >= 1 && "Should be at least affine.");
-
- // The final term will be:
- // Op_{PolyDegree} * x ^ {PolyDegree}
- // Where x ^ {PolyDegree} will again require PolyDegree-1 mul operations.
- // Note that x ^ {PolyDegree} = x * x ^ {PolyDegree-1} so charging for
- // x ^ {PolyDegree} will give us x ^ {2} .. x ^ {PolyDegree-1} for free.
- // FIXME: this is conservatively correct, but might be overly pessimistic.
- Cost += MulCost * (PolyDegree - 1);
+ // Addrec expands to a phi and add per recurrence.
+ unsigned NumRecurrences = S->getNumOperands() - 1;
+ Cost += TTI.getCFInstrCost(Instruction::PHI, CostKind) * NumRecurrences;
+ Cost +=
+ TTI.getArithmeticInstrCost(Instruction::Add, S->getType(), CostKind) *
+ NumRecurrences;
+ // AR start is used in phi.
+ Worklist.emplace_back(Instruction::PHI, 0, S->getOperand(0));
+ // Other operands are used in add.
+ for (const SCEV *Op : S->operands().drop_front())
+ Worklist.emplace_back(Instruction::Add, 1, Op);
break;
}
}
diff --git a/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-values-phi.ll b/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-values-phi.ll
index 63f3da7af46ecc..37bc67c23adb75 100644
--- a/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-values-phi.ll
+++ b/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-values-phi.ll
@@ -14,27 +14,28 @@ define dso_local void @hoge() local_unnamed_addr {
; CHECK-LABEL: @hoge(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[N:%.*]] = sdiv exact i64 undef, 40
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 undef, [[N]]
; CHECK-NEXT: br label [[HEADER:%.*]]
; CHECK: header:
-; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ [[IDX_NEXT:%.*]], [[LATCH:%.*]] ], [ undef, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[LATCH:%.*]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ [[IDX_NEXT:%.*]], [[LATCH]] ], [ undef, [[ENTRY]] ]
; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[N]], [[IDX]]
; CHECK-NEXT: br i1 [[COND]], label [[END:%.*]], label [[INNER_PREHEADER:%.*]]
; CHECK: inner.preheader:
; CHECK-NEXT: br label [[INNER:%.*]]
; CHECK: inner:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[INNER]] ], [ 0, [[INNER_PREHEADER]] ]
-; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[INNER]] ], [ [[N]], [[INNER_PREHEADER]] ]
-; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
-; CHECK-NEXT: [[J_NEXT]] = add nsw i64 [[J]], 1
+; CHECK-NEXT: [[I_NEXT]] = add nuw i64 [[I]], 1
; CHECK-NEXT: store i64 undef, ptr @ptr, align 8
-; CHECK-NEXT: [[COND1:%.*]] = icmp slt i64 [[J]], [[IDX]]
-; CHECK-NEXT: br i1 [[COND1]], label [[INNER]], label [[INNER_EXIT:%.*]]
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[I_NEXT]], [[INDVARS_IV]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[INNER]], label [[INNER_EXIT:%.*]]
; CHECK: inner_exit:
; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[I_NEXT]], [[INNER]] ]
; CHECK-NEXT: [[INDVAR_USE:%.*]] = add i64 [[INDVAR]], 1
; CHECK-NEXT: br label [[LATCH]]
; CHECK: latch:
; CHECK-NEXT: [[IDX_NEXT]] = add nsw i64 [[IDX]], -1
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], -1
; CHECK-NEXT: br label [[HEADER]]
; CHECK: end:
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/LoopUnroll/X86/runtime-unroll-addrec-cost.ll b/llvm/test/Transforms/LoopUnroll/X86/runtime-unroll-addrec-cost.ll
index 3c2c840aeb8103..367ef90d6a3862 100644
--- a/llvm/test/Transforms/LoopUnroll/X86/runtime-unroll-addrec-cost.ll
+++ b/llvm/test/Transforms/LoopUnroll/X86/runtime-unroll-addrec-cost.ll
@@ -13,17 +13,27 @@ define void @selsort(ptr %array) #0 {
; CHECK-NEXT: br i1 [[CMP21_NOT]], label %[[FOR_END18:.*]], label %[[FOR_BODY_LR_PH:.*]]
; CHECK: [[FOR_BODY_LR_PH]]:
; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAY]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP0]], -1
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP0]], -2
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[BASE_022:%.*]] = phi i64 [ 0, %[[FOR_BODY_LR_PH]] ], [ [[ADD:%.*]], %[[FOR_END:.*]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[BASE_022]], -1
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP7]], [[TMP10]]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP8]], [[TMP10]]
; CHECK-NEXT: [[ADD]] = add nuw i64 [[BASE_022]], 1
; CHECK-NEXT: [[CMP318:%.*]] = icmp ult i64 [[ADD]], [[TMP0]]
; CHECK-NEXT: br i1 [[CMP318]], label %[[FOR_BODY4_PREHEADER:.*]], label %[[FOR_END]]
; CHECK: [[FOR_BODY4_PREHEADER]]:
-; CHECK-NEXT: br label %[[FOR_BODY4:.*]]
-; CHECK: [[FOR_BODY4]]:
-; CHECK-NEXT: [[MIN_020:%.*]] = phi i64 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY4]] ], [ [[BASE_022]], %[[FOR_BODY4_PREHEADER]] ]
-; CHECK-NEXT: [[C_019:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY4]] ], [ [[ADD]], %[[FOR_BODY4_PREHEADER]] ]
+; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP11]], 3
+; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY4_PROL_PREHEADER:.*]], label %[[FOR_BODY4_PROL_LOOPEXIT:.*]]
+; CHECK: [[FOR_BODY4_PROL_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY4_PROL:.*]]
+; CHECK: [[FOR_BODY4_PROL]]:
+; CHECK-NEXT: [[MIN_020:%.*]] = phi i64 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY4_PROL]] ], [ [[BASE_022]], %[[FOR_BODY4_PROL_PREHEADER]] ]
+; CHECK-NEXT: [[C_019:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY4_PROL]] ], [ [[ADD]], %[[FOR_BODY4_PROL_PREHEADER]] ]
+; CHECK-NEXT: [[PROL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY4_PROL_PREHEADER]] ], [ [[PROL_ITER_NEXT:%.*]], %[[FOR_BODY4_PROL]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[C_019]]
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[MIN_020]]
@@ -32,18 +42,69 @@ define void @selsort(ptr %array) #0 {
; CHECK-NEXT: [[SPEC_SELECT]] = select i1 [[CMP7]], i64 [[C_019]], i64 [[MIN_020]]
; CHECK-NEXT: [[INC]] = add nuw i64 [[C_019]], 1
; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i64 [[INC]], [[TMP0]]
-; CHECK-NEXT: br i1 [[CMP3]], label %[[FOR_BODY4]], label %[[FOR_END_LOOPEXIT:.*]]
-; CHECK: [[FOR_END_LOOPEXIT]]:
-; CHECK-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[FOR_BODY4]] ]
-; CHECK-NEXT: br label %[[FOR_END]]
-; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[MIN_0_LCSSA:%.*]] = phi i64 [ [[BASE_022]], %[[FOR_BODY]] ], [ [[SPEC_SELECT_LCSSA]], %[[FOR_END_LOOPEXIT]] ]
+; CHECK-NEXT: [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1
+; CHECK-NEXT: [[PROL_ITER_CMP:%.*]] = icmp ne i64 [[PROL_ITER_NEXT]], [[XTRAITER]]
+; CHECK-NEXT: br i1 [[PROL_ITER_CMP]], label %[[FOR_BODY4_PROL]], label %[[FOR_BODY4_PROL_LOOPEXIT_UNR_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[FOR_BODY4_PROL_LOOPEXIT_UNR_LCSSA]]:
+; CHECK-NEXT: [[MIN_020_UNR_PH:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[FOR_BODY4_PROL]] ]
+; CHECK-NEXT: [[C_019_UNR_PH:%.*]] = phi i64 [ [[INC]], %[[FOR_BODY4_PROL]] ]
+; CHECK-NEXT: [[SPEC_SELECT_LCSSA_UNR_PH:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[FOR_BODY4_PROL]] ]
+; CHECK-NEXT: br label %[[FOR_BODY4_PROL_LOOPEXIT]]
+; CHECK: [[FOR_BODY4_PROL_LOOPEXIT]]:
+; CHECK-NEXT: [[MIN_020_UNR:%.*]] = phi i64 [ [[BASE_022]], %[[FOR_BODY4_PREHEADER]] ], [ [[MIN_020_UNR_PH]], %[[FOR_BODY4_PROL_LOOPEXIT_UNR_LCSSA]] ]
+; CHECK-NEXT: [[C_019_UNR:%.*]] = phi i64 [ [[ADD]], %[[FOR_BODY4_PREHEADER]] ], [ [[C_019_UNR_PH]], %[[FOR_BODY4_PROL_LOOPEXIT_UNR_LCSSA]] ]
+; CHECK-NEXT: [[SPEC_SELECT_LCSSA_UNR:%.*]] = phi i64 [ poison, %[[FOR_BODY4_PREHEADER]] ], [ [[SPEC_SELECT_LCSSA_UNR_PH]], %[[FOR_BODY4_PROL_LOOPEXIT_UNR_LCSSA]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP6]], 3
+; CHECK-NEXT: br i1 [[TMP9]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY4_PREHEADER_NEW:.*]]
+; CHECK: [[FOR_BODY4_PREHEADER_NEW]]:
+; CHECK-NEXT: br label %[[FOR_BODY4:.*]]
+; CHECK: [[FOR_BODY4]]:
+; CHECK-NEXT: [[MIN_20:%.*]] = phi i64 [ [[MIN_020_UNR]], %[[FOR_BODY4_PREHEADER_NEW]] ], [ [[SPEC_SELECT_3:%.*]], %[[FOR_BODY4]] ]
+; CHECK-NEXT: [[MIN_0_LCSSA:%.*]] = phi i64 [ [[C_019_UNR]], %[[FOR_BODY4_PREHEADER_NEW]] ], [ [[INC_3:%.*]], %[[FOR_BODY4]] ]
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[MIN_0_LCSSA]]
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX9]], align 4
-; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[BASE_022]]
+; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[MIN_20]]
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX11]], align 4
-; CHECK-NEXT: store i32 [[TMP5]], ptr [[ARRAYIDX9]], align 4
-; CHECK-NEXT: store i32 [[TMP4]], ptr [[ARRAYIDX11]], align 4
+; CHECK-NEXT: [[CMP8:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[SPEC_SELECT1:%.*]] = select i1 [[CMP8]], i64 [[MIN_0_LCSSA]], i64 [[MIN_20]]
+; CHECK-NEXT: [[INC1:%.*]] = add nuw i64 [[MIN_0_LCSSA]], 1
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[INC1]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX_1]], align 4
+; CHECK-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[SPEC_SELECT1]]
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX6_1]], align 4
+; CHECK-NEXT: [[CMP7_1:%.*]] = icmp ult i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[SPEC_SELECT_1:%.*]] = select i1 [[CMP7_1]], i64 [[INC1]], i64 [[SPEC_SELECT1]]
+; CHECK-NEXT: [[INC_1:%.*]] = add nuw i64 [[MIN_0_LCSSA]], 2
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[INC_1]]
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX_2]], align 4
+; CHECK-NEXT: [[ARRAYIDX6_2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[SPEC_SELECT_1]]
+; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX6_2]], align 4
+; CHECK-NEXT: [[CMP7_2:%.*]] = icmp ult i32 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[SPEC_SELECT_2:%.*]] = select i1 [[CMP7_2]], i64 [[INC_1]], i64 [[SPEC_SELECT_1]]
+; CHECK-NEXT: [[INC_2:%.*]] = add nuw i64 [[MIN_0_LCSSA]], 3
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[INC_2]]
+; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX_3]], align 4
+; CHECK-NEXT: [[ARRAYIDX6_3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[SPEC_SELECT_2]]
+; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX6_3]], align 4
+; CHECK-NEXT: [[CMP7_3:%.*]] = icmp ult i32 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[SPEC_SELECT_3]] = select i1 [[CMP7_3]], i64 [[INC_2]], i64 [[SPEC_SELECT_2]]
+; CHECK-NEXT: [[INC_3]] = add nuw i64 [[MIN_0_LCSSA]], 4
+; CHECK-NEXT: [[CMP3_3:%.*]] = icmp ult i64 [[INC_3]], [[TMP0]]
+; CHECK-NEXT: br i1 [[CMP3_3]], label %[[FOR_BODY4]], label %[[FOR_END_LOOPEXIT_UNR_LCSSA:.*]]
+; CHECK: [[FOR_END_LOOPEXIT_UNR_LCSSA]]:
+; CHECK-NEXT: [[SPEC_SELECT_LCSSA_PH:%.*]] = phi i64 [ [[SPEC_SELECT_3]], %[[FOR_BODY4]] ]
+; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT]]
+; CHECK: [[FOR_END_LOOPEXIT]]:
+; CHECK-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT_LCSSA_UNR]], %[[FOR_BODY4_PROL_LOOPEXIT]] ], [ [[SPEC_SELECT_LCSSA_PH]], %[[FOR_END_LOOPEXIT_UNR_LCSSA]] ]
+; CHECK-NEXT: br label %[[FOR_END]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: [[MIN_0_LCSSA1:%.*]] = phi i64 [ [[BASE_022]], %[[FOR_BODY]] ], [ [[SPEC_SELECT_LCSSA]], %[[FOR_END_LOOPEXIT]] ]
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[MIN_0_LCSSA1]]
+; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[BASE_022]]
+; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4
+; CHECK-NEXT: store i32 [[TMP19]], ptr [[ARRAYIDX10]], align 4
+; CHECK-NEXT: store i32 [[TMP18]], ptr [[ARRAYIDX12]], align 4
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[ADD]], [[TMP0]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END18_LOOPEXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[FOR_END18_LOOPEXIT]]:
@@ -96,3 +157,7 @@ for.end18: ; preds = %for.end, %entry
}
attributes #0 = { "tune-cpu"="generic" }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unroll.disable"}
+;.
More information about the llvm-commits
mailing list