[llvm] e88a83a - [GlobalOpt][FMV] Perform expensive checks when NumVersions < Threshold (#168054)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 2 05:46:43 PST 2025
Author: Alexandros Lamprineas
Date: 2025-12-02T13:46:39Z
New Revision: e88a83acde69b2fc395474c905b9a17c22f61c05
URL: https://github.com/llvm/llvm-project/commit/e88a83acde69b2fc395474c905b9a17c22f61c05
DIFF: https://github.com/llvm/llvm-project/commit/e88a83acde69b2fc395474c905b9a17c22f61c05.diff
LOG: [GlobalOpt][FMV] Perform expensive checks when NumVersions < Threshold (#168054)
Extends the static resolution algorithm to handle cases where we can
infer additional information on why a prior caller version of higher
priority was skipped, based on the features of the current caller
version.
For example let's say the current caller is aes+sve2 and a previous
caller was mops+sve2. Knowing that sve2 is available we could deduce
that mops is unavailable. This would allow us to skip callee versions
which depend on mops.
This comes at the expense of performing more checks. However we can
control the threshold (number of versions) which decides whether the
expensive checks will be performed or not.
Added:
Modified:
llvm/lib/Transforms/IPO/GlobalOpt.cpp
llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 6add1e5c092d3..939071725253f 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -99,6 +99,11 @@ static cl::opt<bool>
"functions from non-versioned callers."),
cl::init(true), cl::Hidden);
+static cl::opt<unsigned> MaxIFuncVersions(
+ "max-ifunc-versions", cl::Hidden, cl::init(5),
+ cl::desc("Maximum number of caller/callee versions that is allowed for "
+ "using the expensive (cubic) static resolution algorithm."));
+
static cl::opt<bool>
EnableColdCCStressTest("enable-coldcc-stress-test",
cl::desc("Enable stress test of coldcc by adding "
@@ -2632,31 +2637,56 @@ static bool OptimizeNonTrivialIFuncs(
LLVM_DEBUG(dbgs() << "Statically resolving calls to function "
<< CalleeIF->getResolverFunction()->getName() << "\n");
- // The complexity of this algorithm is linear: O(NumCallers + NumCallees).
- // TODO
- // A limitation it has is that we are not using information about the
- // current caller to deduce why an earlier caller of higher priority was
- // skipped. For example let's say the current caller is aes+sve2 and a
- // previous caller was mops+sve2. Knowing that sve2 is available we could
- // infer that mops is unavailable. This would allow us to skip callee
- // versions which depend on mops. I tried implementing this but the
- // complexity was cubic :/
+ // The complexity of this algorithm is linear: O(NumCallers + NumCallees)
+ // if NumCallers > MaxIFuncVersions || NumCallees > MaxIFuncVersions,
+ // otherwise it is cubic: O((NumCallers ^ 2) x NumCallees).
auto staticallyResolveCalls = [&](ArrayRef<Function *> Callers,
ArrayRef<Function *> Callees,
bool CallerIsFMV) {
+ bool AllowExpensiveChecks = CallerIsFMV &&
+ Callers.size() <= MaxIFuncVersions &&
+ Callees.size() <= MaxIFuncVersions;
// Index to the highest callee candidate.
- unsigned I = 0;
+ unsigned J = 0;
- for (Function *const &Caller : Callers) {
- if (I == Callees.size())
+ for (unsigned I = 0, E = Callers.size(); I < E; ++I) {
+ // There are no callee candidates left.
+ if (J == Callees.size())
break;
+ Function *Caller = Callers[I];
+ APInt CallerBits = FeatureMask[Caller];
+
+ // Compare the feature bits of the best callee candidate with all the
+ // caller versions preceeding the current one. For each prior caller
+ // discard feature bits that are known to be available in the current
+ // caller. As long as the known missing feature bits are a subset of the
+ // callee feature bits, advance to the next callee and start over.
+ auto eliminateAvailableFeatures = [&](unsigned BestCandidate) {
+ unsigned K = 0;
+ while (K < I && BestCandidate < Callees.size()) {
+ APInt MissingBits = FeatureMask[Callers[K]] & ~CallerBits;
+ if (MissingBits.isSubsetOf(FeatureMask[Callees[BestCandidate]])) {
+ ++BestCandidate;
+ // Start over.
+ K = 0;
+ } else
+ ++K;
+ }
+ return BestCandidate;
+ };
+
+ unsigned BestCandidate =
+ AllowExpensiveChecks ? eliminateAvailableFeatures(J) : J;
+ // No callee candidate was found for this caller.
+ if (BestCandidate == Callees.size())
+ continue;
+
LLVM_DEBUG(dbgs() << " Examining "
<< (CallerIsFMV ? "FMV" : "regular") << " caller "
<< Caller->getName() << "\n");
- Function *Callee = Callees[I];
- APInt CallerBits = FeatureMask[Caller];
+ Function *Callee = Callees[BestCandidate];
APInt CalleeBits = FeatureMask[Callee];
// Statically resolve calls from the current caller to the current
@@ -2682,8 +2712,8 @@ static bool OptimizeNonTrivialIFuncs(
// the callee feature bits, advance to the next callee. This effectively
// prevents considering the current callee as a candidate for static
// resolution by following callers.
- while (CallerBits.isSubsetOf(FeatureMask[Callees[I]]) &&
- ++I < Callees.size())
+ while (CallerBits.isSubsetOf(FeatureMask[Callees[J]]) &&
+ ++J < Callees.size())
;
}
};
diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
index 156c49c8b6677..a7fcc667dbedc 100644
--- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
+++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names|test_unrelated_callers)" --version 4
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names|test_unrelated_callers|test_known_bits)" --version 4
; REQUIRES: aarch64-registered-target
@@ -14,6 +14,7 @@ $test_non_fmv_caller.resolver = comdat any
$test_priority.resolver = comdat any
$test_alternative_names.resolver = comdat any
$test_unrelated_callers.resolver = comdat any
+$test_known_bits.resolver = comdat any
$caller1.resolver = comdat any
$caller2.resolver = comdat any
$caller3.resolver = comdat any
@@ -21,6 +22,7 @@ $caller6.resolver = comdat any
$caller7.resolver = comdat any
$caller8.resolver = comdat any
$caller9.resolver = comdat any
+$caller11.resolver = comdat any
@__aarch64_cpu_features = external local_unnamed_addr global { i64 }
@@ -31,6 +33,7 @@ $caller9.resolver = comdat any
@test_priority = weak_odr ifunc i32 (), ptr @test_priority.resolver
@test_alternative_names = weak_odr ifunc i32 (), ptr @test_alternative_names.resolver
@test_unrelated_callers = weak_odr ifunc i32 (), ptr @test_unrelated_callers.resolver
+@test_known_bits = weak_odr ifunc i32 (), ptr @test_known_bits.resolver
@caller1 = weak_odr ifunc i32 (), ptr @caller1.resolver
@caller2 = weak_odr ifunc i32 (), ptr @caller2.resolver
@caller3 = weak_odr ifunc i32 (), ptr @caller3.resolver
@@ -38,6 +41,7 @@ $caller9.resolver = comdat any
@caller7 = weak_odr ifunc i32 (), ptr @caller7.resolver
@caller8 = weak_odr ifunc i32 (), ptr @caller8.resolver
@caller9 = weak_odr ifunc i32 (), ptr @caller9.resolver
+@caller11 = weak_odr ifunc i32 (), ptr @caller11.resolver
declare void @__init_cpu_features_resolver() local_unnamed_addr
@@ -509,7 +513,7 @@ entry:
define dso_local i32 @caller8._Msve2() #2 {
; CHECK-LABEL: define dso_local i32 @caller8._Msve2(
; CHECK-SAME: ) #[[ATTR2]] {
-; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers()
+; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers._Msve2()
;
entry:
%call = tail call i32 @test_unrelated_callers()
@@ -591,6 +595,89 @@ entry:
ret i32 %call
}
+declare i32 @test_known_bits._Mmops() #3
+declare i32 @test_known_bits._Maes() #6
+declare i32 @test_known_bits.default() #0
+
+define weak_odr ptr @test_known_bits.resolver() comdat {
+; CHECK-LABEL: define weak_odr ptr @test_known_bits.resolver() comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 576460752303423488
+ %.not = icmp eq i64 %1, 0
+ %2 = and i64 %0, 33536
+ %3 = icmp eq i64 %2, 33536
+ %test_known_bits._Maes.test_known_bits.default = select i1 %3, ptr @test_known_bits._Maes, ptr @test_known_bits.default
+ %common.ret.op = select i1 %.not, ptr %test_known_bits._Maes.test_known_bits.default, ptr @test_known_bits._Mmops
+ ret ptr %common.ret.op
+}
+
+define i32 @caller11._MmopsMsve2() #4 {
+; CHECK-LABEL: define i32 @caller11._MmopsMsve2(
+; CHECK-SAME: ) #[[ATTR4]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_known_bits._Mmops()
+;
+entry:
+ %call = tail call i32 @test_known_bits()
+ ret i32 %call
+}
+
+define i32 @caller11._Msme() #5 {
+; CHECK-LABEL: define i32 @caller11._Msme(
+; CHECK-SAME: ) #[[ATTR5:[0-9]+]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_known_bits()
+;
+entry:
+ %call = tail call i32 @test_known_bits()
+ ret i32 %call
+}
+
+define noundef i32 @caller11._MaesMsve2() #19 {
+; CHECK-LABEL: define noundef i32 @caller11._MaesMsve2(
+; CHECK-SAME: ) #[[ATTR19:[0-9]+]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_known_bits._Maes()
+;
+entry:
+ %call = tail call i32 @test_known_bits()
+ ret i32 %call
+}
+
+define i32 @caller11.default() #0 {
+; CHECK-LABEL: define i32 @caller11.default(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_known_bits()
+;
+entry:
+ %call = tail call i32 @test_known_bits()
+ ret i32 %call
+}
+
+define weak_odr ptr @caller11.resolver() comdat {
+; CHECK-LABEL: define weak_odr ptr @caller11.resolver() comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 576460822096707840
+ %2 = icmp eq i64 %1, 576460822096707840
+ br i1 %2, label %common.ret, label %resolver_else
+
+common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry
+ %common.ret.op = phi ptr [ @caller11._MmopsMsve2, %resolver_entry ], [ @caller11._Msme, %resolver_else ], [ %caller11._MaesMsve2.caller11.default, %resolver_else2 ]
+ ret ptr %common.ret.op
+
+resolver_else: ; preds = %resolver_entry
+ %3 = and i64 %0, 4398180795136
+ %4 = icmp eq i64 %3, 4398180795136
+ br i1 %4, label %common.ret, label %resolver_else2
+
+resolver_else2: ; preds = %resolver_else
+ %5 = and i64 %0, 69793317632
+ %6 = icmp eq i64 %5, 69793317632
+ %caller11._MaesMsve2.caller11.default = select i1 %6, ptr @caller11._MaesMsve2, ptr @caller11.default
+ br label %common.ret
+}
+
attributes #0 = { "fmv-features" }
attributes #1 = { "fmv-features"="sve" }
attributes #2 = { "fmv-features"="sve2" }
@@ -610,3 +697,4 @@ attributes #15 = { "fmv-features"="flagm2,frintts" }
attributes #16 = { "fmv-features"="rcpc2" }
attributes #17 = { "fmv-features"="frintts" }
attributes #18 = { "target-features"="+fp-armv8,+mops,+neon,+outline-atomics,+sve,+v8a" }
+attributes #19 = { "fmv-features"="aes,sve2" }
More information about the llvm-commits
mailing list