[llvm] [GlobalOpt][FMV] Perform expensive checks when NumVersions < Threshold (PR #168054)
Alexandros Lamprineas via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 28 01:25:22 PST 2025
https://github.com/labrinea updated https://github.com/llvm/llvm-project/pull/168054
>From bc57d27d36fdb5495593b92f06f8e3d11c9c4b57 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Thu, 6 Nov 2025 18:44:21 +0000
Subject: [PATCH 1/3] [GlobalOpt][FMV] Perform expensive checks when
NumVersions < Threshold
Extends the static resolution algorithm to handle cases where we can infer
additional information on why a prior caller version of higher priority
was skipped, based on the features of the current caller version.
For example let's say the current caller is aes+sve2 and a previous caller
was mops+sve2. Knowing that sve2 is available we could deduce that mops is
unavailable. This would allow us to skip callee versions which depend on
mops.
This comes at the expense of performing more checks. However we can control
the threshold (number of versions) which decides whether the expensive
checks will be performed or not.
---
llvm/lib/Transforms/IPO/GlobalOpt.cpp | 54 +++++++----
.../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 92 ++++++++++++++++++-
2 files changed, 128 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index c3dede31540d6..7edf5120ccccd 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -99,6 +99,11 @@ static cl::opt<bool>
"functions from non-versioned callers."),
cl::init(true), cl::Hidden);
+static cl::opt<unsigned> MaxIFuncVersions(
+ "max-ifunc-versions", cl::Hidden, cl::init(5),
+ cl::desc("Maximum number of caller/callee versions that is allowed for "
+ "using the expensive (cubic) static resolution algorithm."));
+
static cl::opt<bool>
EnableColdCCStressTest("enable-coldcc-stress-test",
cl::desc("Enable stress test of coldcc by adding "
@@ -2629,31 +2634,48 @@ static bool OptimizeNonTrivialIFuncs(
LLVM_DEBUG(dbgs() << "Statically resolving calls to function "
<< CalleeIF->getResolverFunction()->getName() << "\n");
- // The complexity of this algorithm is linear: O(NumCallers + NumCallees).
- // TODO
- // A limitation it has is that we are not using information about the
- // current caller to deduce why an earlier caller of higher priority was
- // skipped. For example let's say the current caller is aes+sve2 and a
- // previous caller was mops+sve2. Knowing that sve2 is available we could
- // infer that mops is unavailable. This would allow us to skip callee
- // versions which depend on mops. I tried implementing this but the
- // complexity was cubic :/
+ // The complexity of this algorithm is linear: O(NumCallers + NumCallees)
+ // if NumCallers > MaxIFuncVersions || NumCallees > MaxIFuncVersions,
+ // otherwise it is cubic: O((NumCallers ^ 2) x NumCallees).
auto staticallyResolveCalls = [&](ArrayRef<Function *> Callers,
ArrayRef<Function *> Callees,
bool CallerIsFMV) {
+ bool AllowExpensiveChecks = CallerIsFMV &&
+ Callers.size() <= MaxIFuncVersions &&
+ Callees.size() <= MaxIFuncVersions;
// Index to the highest callee candidate.
- unsigned I = 0;
+ unsigned J = 0;
- for (Function *const &Caller : Callers) {
- if (I == Callees.size())
+ for (unsigned I = 0, E = Callers.size(); I < E; ++I) {
+ if (J == Callees.size())
break;
+ Function *Caller = Callers[I];
+ APInt CallerBits = FeatureMask[Caller];
+ unsigned BestCandidate = J;
+
+ if (AllowExpensiveChecks) {
+ unsigned K = 0;
+ while (K < I && BestCandidate < Callees.size()) {
+ // Discard feature bits that are known to be available
+ // in the current iteration.
+ APInt KnownBits = FeatureMask[Callers[K]] & ~CallerBits;
+ if (KnownBits.isSubsetOf(FeatureMask[Callees[BestCandidate]])) {
+ ++BestCandidate;
+ // Start over.
+ K = 0;
+ } else
+ ++K;
+ }
+ if (BestCandidate == Callees.size())
+ break;
+ }
+
LLVM_DEBUG(dbgs() << " Examining "
<< (CallerIsFMV ? "FMV" : "regular") << " caller "
<< Caller->getName() << "\n");
- Function *Callee = Callees[I];
- APInt CallerBits = FeatureMask[Caller];
+ Function *Callee = Callees[BestCandidate];
APInt CalleeBits = FeatureMask[Callee];
// Statically resolve calls from the current caller to the current
@@ -2679,8 +2701,8 @@ static bool OptimizeNonTrivialIFuncs(
// the callee feature bits, advance to the next callee. This effectively
// prevents considering the current callee as a candidate for static
// resolution by following callers.
- while (CallerBits.isSubsetOf(FeatureMask[Callees[I]]) &&
- ++I < Callees.size())
+ while (CallerBits.isSubsetOf(FeatureMask[Callees[J]]) &&
+ ++J < Callees.size())
;
}
};
diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
index 156c49c8b6677..a7fcc667dbedc 100644
--- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
+++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names|test_unrelated_callers)" --version 4
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names|test_unrelated_callers|test_known_bits)" --version 4
; REQUIRES: aarch64-registered-target
@@ -14,6 +14,7 @@ $test_non_fmv_caller.resolver = comdat any
$test_priority.resolver = comdat any
$test_alternative_names.resolver = comdat any
$test_unrelated_callers.resolver = comdat any
+$test_known_bits.resolver = comdat any
$caller1.resolver = comdat any
$caller2.resolver = comdat any
$caller3.resolver = comdat any
@@ -21,6 +22,7 @@ $caller6.resolver = comdat any
$caller7.resolver = comdat any
$caller8.resolver = comdat any
$caller9.resolver = comdat any
+$caller11.resolver = comdat any
@__aarch64_cpu_features = external local_unnamed_addr global { i64 }
@@ -31,6 +33,7 @@ $caller9.resolver = comdat any
@test_priority = weak_odr ifunc i32 (), ptr @test_priority.resolver
@test_alternative_names = weak_odr ifunc i32 (), ptr @test_alternative_names.resolver
@test_unrelated_callers = weak_odr ifunc i32 (), ptr @test_unrelated_callers.resolver
+@test_known_bits = weak_odr ifunc i32 (), ptr @test_known_bits.resolver
@caller1 = weak_odr ifunc i32 (), ptr @caller1.resolver
@caller2 = weak_odr ifunc i32 (), ptr @caller2.resolver
@caller3 = weak_odr ifunc i32 (), ptr @caller3.resolver
@@ -38,6 +41,7 @@ $caller9.resolver = comdat any
@caller7 = weak_odr ifunc i32 (), ptr @caller7.resolver
@caller8 = weak_odr ifunc i32 (), ptr @caller8.resolver
@caller9 = weak_odr ifunc i32 (), ptr @caller9.resolver
+@caller11 = weak_odr ifunc i32 (), ptr @caller11.resolver
declare void @__init_cpu_features_resolver() local_unnamed_addr
@@ -509,7 +513,7 @@ entry:
define dso_local i32 @caller8._Msve2() #2 {
; CHECK-LABEL: define dso_local i32 @caller8._Msve2(
; CHECK-SAME: ) #[[ATTR2]] {
-; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers()
+; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers._Msve2()
;
entry:
%call = tail call i32 @test_unrelated_callers()
@@ -591,6 +595,89 @@ entry:
ret i32 %call
}
+declare i32 @test_known_bits._Mmops() #3
+declare i32 @test_known_bits._Maes() #6
+declare i32 @test_known_bits.default() #0
+
+define weak_odr ptr @test_known_bits.resolver() comdat {
+; CHECK-LABEL: define weak_odr ptr @test_known_bits.resolver() comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 576460752303423488
+ %.not = icmp eq i64 %1, 0
+ %2 = and i64 %0, 33536
+ %3 = icmp eq i64 %2, 33536
+ %test_known_bits._Maes.test_known_bits.default = select i1 %3, ptr @test_known_bits._Maes, ptr @test_known_bits.default
+ %common.ret.op = select i1 %.not, ptr %test_known_bits._Maes.test_known_bits.default, ptr @test_known_bits._Mmops
+ ret ptr %common.ret.op
+}
+
+define i32 @caller11._MmopsMsve2() #4 {
+; CHECK-LABEL: define i32 @caller11._MmopsMsve2(
+; CHECK-SAME: ) #[[ATTR4]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_known_bits._Mmops()
+;
+entry:
+ %call = tail call i32 @test_known_bits()
+ ret i32 %call
+}
+
+define i32 @caller11._Msme() #5 {
+; CHECK-LABEL: define i32 @caller11._Msme(
+; CHECK-SAME: ) #[[ATTR5:[0-9]+]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_known_bits()
+;
+entry:
+ %call = tail call i32 @test_known_bits()
+ ret i32 %call
+}
+
+define noundef i32 @caller11._MaesMsve2() #19 {
+; CHECK-LABEL: define noundef i32 @caller11._MaesMsve2(
+; CHECK-SAME: ) #[[ATTR19:[0-9]+]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_known_bits._Maes()
+;
+entry:
+ %call = tail call i32 @test_known_bits()
+ ret i32 %call
+}
+
+define i32 @caller11.default() #0 {
+; CHECK-LABEL: define i32 @caller11.default(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_known_bits()
+;
+entry:
+ %call = tail call i32 @test_known_bits()
+ ret i32 %call
+}
+
+define weak_odr ptr @caller11.resolver() comdat {
+; CHECK-LABEL: define weak_odr ptr @caller11.resolver() comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 576460822096707840
+ %2 = icmp eq i64 %1, 576460822096707840
+ br i1 %2, label %common.ret, label %resolver_else
+
+common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry
+ %common.ret.op = phi ptr [ @caller11._MmopsMsve2, %resolver_entry ], [ @caller11._Msme, %resolver_else ], [ %caller11._MaesMsve2.caller11.default, %resolver_else2 ]
+ ret ptr %common.ret.op
+
+resolver_else: ; preds = %resolver_entry
+ %3 = and i64 %0, 4398180795136
+ %4 = icmp eq i64 %3, 4398180795136
+ br i1 %4, label %common.ret, label %resolver_else2
+
+resolver_else2: ; preds = %resolver_else
+ %5 = and i64 %0, 69793317632
+ %6 = icmp eq i64 %5, 69793317632
+ %caller11._MaesMsve2.caller11.default = select i1 %6, ptr @caller11._MaesMsve2, ptr @caller11.default
+ br label %common.ret
+}
+
attributes #0 = { "fmv-features" }
attributes #1 = { "fmv-features"="sve" }
attributes #2 = { "fmv-features"="sve2" }
@@ -610,3 +697,4 @@ attributes #15 = { "fmv-features"="flagm2,frintts" }
attributes #16 = { "fmv-features"="rcpc2" }
attributes #17 = { "fmv-features"="frintts" }
attributes #18 = { "target-features"="+fp-armv8,+mops,+neon,+outline-atomics,+sve,+v8a" }
+attributes #19 = { "fmv-features"="aes,sve2" }
>From 3719ab88f59aac1a3d844aeb33fe299480f3dd1d Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Wed, 26 Nov 2025 14:46:34 +0000
Subject: [PATCH 2/3] move code into lambda and explain what it does in comment
---
llvm/lib/Transforms/IPO/GlobalOpt.cpp | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index d2db07da25f70..6cfd537887784 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2650,18 +2650,17 @@ static bool OptimizeNonTrivialIFuncs(
unsigned J = 0;
for (unsigned I = 0, E = Callers.size(); I < E; ++I) {
- if (J == Callees.size())
- break;
-
Function *Caller = Callers[I];
APInt CallerBits = FeatureMask[Caller];
- unsigned BestCandidate = J;
- if (AllowExpensiveChecks) {
+ // Compare the feature bits of the best callee candidate with all the
+ // caller versions preceding the current one. For each prior caller
+ // discard feature bits that are known to be available in the current
+ // caller. As long as the known missing feature bits are a subset of the
+ // callee feature bits, advance to the next callee and start over.
+ auto computeKnownBits = [&](unsigned BestCandidate) {
unsigned K = 0;
while (K < I && BestCandidate < Callees.size()) {
- // Discard feature bits that are known to be available
- // in the current iteration.
APInt KnownBits = FeatureMask[Callers[K]] & ~CallerBits;
if (KnownBits.isSubsetOf(FeatureMask[Callees[BestCandidate]])) {
++BestCandidate;
@@ -2670,9 +2669,12 @@ static bool OptimizeNonTrivialIFuncs(
} else
++K;
}
- if (BestCandidate == Callees.size())
- break;
- }
+ return BestCandidate;
+ };
+
+ unsigned BestCandidate = AllowExpensiveChecks ? computeKnownBits(J) : J;
+ if (BestCandidate == Callees.size())
+ break;
LLVM_DEBUG(dbgs() << " Examining "
<< (CallerIsFMV ? "FMV" : "regular") << " caller "
>From b821d743f20e182fd2fb872da5252d48a48221c6 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Fri, 28 Nov 2025 09:23:32 +0000
Subject: [PATCH 3/3] Continue with next caller if no callee candidate was
found for the current caller.
---
llvm/lib/Transforms/IPO/GlobalOpt.cpp | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 6cfd537887784..537314c145b30 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2650,6 +2650,10 @@ static bool OptimizeNonTrivialIFuncs(
unsigned J = 0;
for (unsigned I = 0, E = Callers.size(); I < E; ++I) {
+ // There are no callee candidates left.
+ if (J == Callees.size())
+ break;
+
Function *Caller = Callers[I];
APInt CallerBits = FeatureMask[Caller];
@@ -2673,8 +2677,9 @@ static bool OptimizeNonTrivialIFuncs(
};
unsigned BestCandidate = AllowExpensiveChecks ? computeKnownBits(J) : J;
+ // No callee candidate was found for this caller.
if (BestCandidate == Callees.size())
- break;
+ continue;
LLVM_DEBUG(dbgs() << " Examining "
<< (CallerIsFMV ? "FMV" : "regular") << " caller "
More information about the llvm-commits
mailing list