[llvm] [GlobalOpt][FMV] Fix static resolution of calls. (PR #160011)
Alexandros Lamprineas via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 7 03:19:59 PST 2025
https://github.com/labrinea updated https://github.com/llvm/llvm-project/pull/160011
>From 4bac313f935dab1ef9731d22f6e7cfbf5b6a7bb0 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Fri, 19 Sep 2025 18:26:55 +0100
Subject: [PATCH 01/11] [GlobalOpt][FMV] Fix static resolution of calls.
Addresses the issues found on the review of
https://github.com/llvm/llvm-project/pull/150267/files#r2356936355
Currently when collecting the users of an IFunc symbol to determine the callers,
we incorrectly mix versions of different functions together, alongside non-FMV
callers all in the same bag. That is problematic because we incorrectly deduce
which features are unavailable as we iterate the callers.
I have updated the unit tests to require a resolver function for the callers,
and regenerated the resolvers since some FMV features have been removed, making
the detection bitmasks different. I've replaced the deleted FMV feature ls64
with cssc. I've added a new test to cover unrelated callers.
---
llvm/lib/Transforms/IPO/GlobalOpt.cpp | 190 ++++++----
.../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 355 +++++++++++++++---
2 files changed, 416 insertions(+), 129 deletions(-)
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index f88d51f443bcf..0707eb5eacf5d 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2482,20 +2482,21 @@ DeleteDeadIFuncs(Module &M,
// Follows the use-def chain of \p V backwards until it finds a Function,
// in which case it collects in \p Versions. Return true on successful
// use-def chain traversal, false otherwise.
-static bool collectVersions(TargetTransformInfo &TTI, Value *V,
- SmallVectorImpl<Function *> &Versions) {
+static bool
+collectVersions(Value *V, SmallVectorImpl<Function *> &Versions,
+ function_ref<TargetTransformInfo &(Function &)> GetTTI) {
if (auto *F = dyn_cast<Function>(V)) {
- if (!TTI.isMultiversionedFunction(*F))
+ if (!GetTTI(*F).isMultiversionedFunction(*F))
return false;
Versions.push_back(F);
} else if (auto *Sel = dyn_cast<SelectInst>(V)) {
- if (!collectVersions(TTI, Sel->getTrueValue(), Versions))
+ if (!collectVersions(Sel->getTrueValue(), Versions, GetTTI))
return false;
- if (!collectVersions(TTI, Sel->getFalseValue(), Versions))
+ if (!collectVersions(Sel->getFalseValue(), Versions, GetTTI))
return false;
} else if (auto *Phi = dyn_cast<PHINode>(V)) {
for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I)
- if (!collectVersions(TTI, Phi->getIncomingValue(I), Versions))
+ if (!collectVersions(Phi->getIncomingValue(I), Versions, GetTTI))
return false;
} else {
// Unknown instruction type. Bail.
@@ -2525,8 +2526,14 @@ static bool OptimizeNonTrivialIFuncs(
Module &M, function_ref<TargetTransformInfo &(Function &)> GetTTI) {
bool Changed = false;
- // Cache containing the mask constructed from a function's target features.
+ // Map containing the feature bits for a given function.
DenseMap<Function *, APInt> FeatureMask;
+ // Map containing all the versions corresponding to an IFunc symbol.
+ DenseMap<GlobalIFunc *, SmallVector<Function *>> VersionedFuncs;
+ // Map containing the IFunc symbol a function is version of.
+ DenseMap<Function *, GlobalIFunc *> VersionOf;
+ // List of all the interesting IFuncs found in the module.
+ SmallVector<GlobalIFunc *> IFuncs;
for (GlobalIFunc &IF : M.ifuncs()) {
if (IF.isInterposable())
@@ -2539,107 +2546,140 @@ static bool OptimizeNonTrivialIFuncs(
if (Resolver->isInterposable())
continue;
- TargetTransformInfo &TTI = GetTTI(*Resolver);
-
- // Discover the callee versions.
- SmallVector<Function *> Callees;
- if (any_of(*Resolver, [&TTI, &Callees](BasicBlock &BB) {
+ SmallVector<Function *> Versions;
+ // Discover the versioned functions.
+ if (any_of(*Resolver, [&](BasicBlock &BB) {
if (auto *Ret = dyn_cast_or_null<ReturnInst>(BB.getTerminator()))
- if (!collectVersions(TTI, Ret->getReturnValue(), Callees))
+ if (!collectVersions(Ret->getReturnValue(), Versions, GetTTI))
return true;
return false;
}))
continue;
- if (Callees.empty())
+ if (Versions.empty())
continue;
- LLVM_DEBUG(dbgs() << "Statically resolving calls to function "
- << Resolver->getName() << "\n");
-
- // Cache the feature mask for each callee.
- for (Function *Callee : Callees) {
- auto [It, Inserted] = FeatureMask.try_emplace(Callee);
+ for (Function *V : Versions) {
+ VersionOf.insert({V, &IF});
+ auto [It, Inserted] = FeatureMask.try_emplace(V);
if (Inserted)
- It->second = TTI.getFeatureMask(*Callee);
+ It->second = GetTTI(*V).getFeatureMask(*V);
}
- // Sort the callee versions in decreasing priority order.
- sort(Callees, [&](auto *LHS, auto *RHS) {
+ // Sort function versions in decreasing priority order.
+ sort(Versions, [&](auto *LHS, auto *RHS) {
return FeatureMask[LHS].ugt(FeatureMask[RHS]);
});
- // Find the callsites and cache the feature mask for each caller.
- SmallVector<Function *> Callers;
+ IFuncs.push_back(&IF);
+ VersionedFuncs.try_emplace(&IF, std::move(Versions));
+ }
+
+ for (GlobalIFunc *CalleeIF : IFuncs) {
+ SmallVector<Function *> NonFMVCallers;
+ SmallVector<GlobalIFunc *> CallerIFuncs;
DenseMap<Function *, SmallVector<CallBase *>> CallSites;
- for (User *U : IF.users()) {
+
+ // Find the callsites.
+ for (User *U : CalleeIF->users()) {
if (auto *CB = dyn_cast<CallBase>(U)) {
- if (CB->getCalledOperand() == &IF) {
+ if (CB->getCalledOperand() == CalleeIF) {
Function *Caller = CB->getFunction();
- auto [FeatIt, FeatInserted] = FeatureMask.try_emplace(Caller);
- if (FeatInserted)
- FeatIt->second = TTI.getFeatureMask(*Caller);
- auto [CallIt, CallInserted] = CallSites.try_emplace(Caller);
- if (CallInserted)
- Callers.push_back(Caller);
- CallIt->second.push_back(CB);
+ GlobalIFunc *CallerIFunc = nullptr;
+ TargetTransformInfo &TTI = GetTTI(*Caller);
+ bool CallerIsFMV = TTI.isMultiversionedFunction(*Caller);
+ // The caller is a version of a known IFunc.
+ if (auto It = VersionOf.find(Caller); It != VersionOf.end())
+ CallerIFunc = It->second;
+ else if (!CallerIsFMV && OptimizeNonFMVCallers) {
+ // The caller is non-FMV.
+ auto [It, Inserted] = FeatureMask.try_emplace(Caller);
+ if (Inserted)
+ It->second = TTI.getFeatureMask(*Caller);
+ } else
+ // The caller is none of the above, skip.
+ continue;
+ auto [It, Inserted] = CallSites.try_emplace(Caller);
+ if (Inserted) {
+ if (CallerIsFMV)
+ CallerIFuncs.push_back(CallerIFunc);
+ else
+ NonFMVCallers.push_back(Caller);
+ }
+ It->second.push_back(CB);
}
}
}
- // Sort the caller versions in decreasing priority order.
- sort(Callers, [&](auto *LHS, auto *RHS) {
- return FeatureMask[LHS].ugt(FeatureMask[RHS]);
- });
-
- auto implies = [](APInt A, APInt B) { return B.isSubsetOf(A); };
+ LLVM_DEBUG(dbgs() << "Statically resolving calls to function "
+ << CalleeIF->getResolverFunction()->getName() << "\n");
- // Index to the highest priority candidate.
- unsigned I = 0;
- // Now try to redirect calls starting from higher priority callers.
- for (Function *Caller : Callers) {
- assert(I < Callees.size() && "Found callers of equal priority");
+ auto redirectCalls = [&](SmallVectorImpl<Function *> &Callers,
+ SmallVectorImpl<Function *> &Callees) {
+ // Index to the current callee candidate.
+ unsigned I = 0;
- Function *Callee = Callees[I];
- APInt CallerBits = FeatureMask[Caller];
- APInt CalleeBits = FeatureMask[Callee];
+ // Try to redirect calls starting from higher priority callers.
+ for (Function *Caller : Callers) {
+ if (I == Callees.size())
+ break;
- // In the case of FMV callers, we know that all higher priority callers
- // than the current one did not get selected at runtime, which helps
- // reason about the callees (if they have versions that mandate presence
- // of the features which we already know are unavailable on this target).
- if (TTI.isMultiversionedFunction(*Caller)) {
+ bool CallerIsFMV = GetTTI(*Caller).isMultiversionedFunction(*Caller);
+ // In the case of FMV callers, we know that all higher priority callers
+ // than the current one did not get selected at runtime, which helps
+ // reason about the callees (if they have versions that mandate presence
+ // of the features which we already know are unavailable on this
+ // target).
+ if (!CallerIsFMV)
+ // We can't reason much about non-FMV callers. Just pick the highest
+ // priority callee if it matches, otherwise bail.
+ assert(I == 0 && "Should only select the highest priority candidate");
+
+ Function *Callee = Callees[I];
+ APInt CallerBits = FeatureMask[Caller];
+ APInt CalleeBits = FeatureMask[Callee];
// If the feature set of the caller implies the feature set of the
- // highest priority candidate then it shall be picked. In case of
- // identical sets advance the candidate index one position.
- if (CallerBits == CalleeBits)
- ++I;
- else if (!implies(CallerBits, CalleeBits)) {
- // Keep advancing the candidate index as long as the caller's
- // features are a subset of the current candidate's.
- while (implies(CalleeBits, CallerBits)) {
+ // highest priority candidate then it shall be picked.
+ if (CalleeBits.isSubsetOf(CallerBits)) {
+ // If there are no records of call sites for this particular function
+ // version, then it is not actually a caller, in which case skip.
+ if (auto It = CallSites.find(Caller); It != CallSites.end()) {
+ for (CallBase *CS : It->second) {
+ LLVM_DEBUG(dbgs() << "Redirecting call " << Caller->getName()
+ << " -> " << Callee->getName() << "\n");
+ CS->setCalledOperand(Callee);
+ }
+ Changed = true;
+ }
+ }
+ // Keep advancing the candidate index as long as the caller's
+ // features are a subset of the current candidate's.
+ if (CallerIsFMV) {
+ while (CallerBits.isSubsetOf(CalleeBits)) {
if (++I == Callees.size())
break;
CalleeBits = FeatureMask[Callees[I]];
}
- continue;
}
- } else {
- // We can't reason much about non-FMV callers. Just pick the highest
- // priority callee if it matches, otherwise bail.
- if (!OptimizeNonFMVCallers || I > 0 || !implies(CallerBits, CalleeBits))
- continue;
}
- auto &Calls = CallSites[Caller];
- for (CallBase *CS : Calls) {
- LLVM_DEBUG(dbgs() << "Redirecting call " << Caller->getName() << " -> "
- << Callee->getName() << "\n");
- CS->setCalledOperand(Callee);
+ };
+
+ auto &Callees = VersionedFuncs[CalleeIF];
+
+ // Optimize non-FMV calls.
+ if (!NonFMVCallers.empty() && OptimizeNonFMVCallers)
+ redirectCalls(NonFMVCallers, Callees);
+
+ // Optimize FMV calls.
+ if (!CallerIFuncs.empty()) {
+ for (GlobalIFunc *CallerIF : CallerIFuncs) {
+ auto &Callers = VersionedFuncs[CallerIF];
+ redirectCalls(Callers, Callees);
}
- Changed = true;
}
- if (IF.use_empty() ||
- all_of(IF.users(), [](User *U) { return isa<GlobalAlias>(U); }))
+
+ if (CalleeIF->use_empty() ||
+ all_of(CalleeIF->users(), [](User *U) { return isa<GlobalAlias>(U); }))
NumIFuncsResolved++;
}
return Changed;
diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
index 4b6a19d3f05cf..7ace67e3857ff 100644
--- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
+++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names)" --version 4
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names|test_unrelated_callers)" --version 4
; REQUIRES: aarch64-registered-target
@@ -13,6 +13,14 @@ $test_caller_feats_not_implied.resolver = comdat any
$test_non_fmv_caller.resolver = comdat any
$test_priority.resolver = comdat any
$test_alternative_names.resolver = comdat any
+$test_unrelated_callers.resolver = comdat any
+$caller1.resolver = comdat any
+$caller2.resolver = comdat any
+$caller3.resolver = comdat any
+$caller6.resolver = comdat any
+$caller7.resolver = comdat any
+$caller8.resolver = comdat any
+$caller9.resolver = comdat any
@__aarch64_cpu_features = external local_unnamed_addr global { i64 }
@@ -22,6 +30,14 @@ $test_alternative_names.resolver = comdat any
@test_non_fmv_caller = weak_odr ifunc i32 (), ptr @test_non_fmv_caller.resolver
@test_priority = weak_odr ifunc i32 (), ptr @test_priority.resolver
@test_alternative_names = weak_odr ifunc i32 (), ptr @test_alternative_names.resolver
+@test_unrelated_callers = weak_odr ifunc i32 (), ptr @test_unrelated_callers.resolver
+@caller1 = weak_odr ifunc i32 (), ptr @caller1.resolver
+@caller2 = weak_odr ifunc i32 (), ptr @caller2.resolver
+@caller3 = weak_odr ifunc i32 (), ptr @caller3.resolver
+@caller6 = weak_odr ifunc i32 (), ptr @caller6.resolver
+@caller7 = weak_odr ifunc i32 (), ptr @caller7.resolver
+@caller8 = weak_odr ifunc i32 (), ptr @caller8.resolver
+@caller9 = weak_odr ifunc i32 (), ptr @caller9.resolver
declare void @__init_cpu_features_resolver() local_unnamed_addr
@@ -34,18 +50,18 @@ define weak_odr ptr @test_single_bb_resolver.resolver() comdat {
resolver_entry:
tail call void @__init_cpu_features_resolver()
%0 = load i64, ptr @__aarch64_cpu_features, align 8
- %1 = and i64 %0, 68719476736
- %.not = icmp eq i64 %1, 0
- %2 = and i64 %0, 1073741824
- %.not3 = icmp eq i64 %2, 0
- %test_single_bb_resolver._Msve.test_single_bb_resolver.default = select i1 %.not3, ptr @test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve
- %common.ret.op = select i1 %.not, ptr %test_single_bb_resolver._Msve.test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve2
+ %1 = and i64 %0, 69793284352
+ %2 = icmp eq i64 %1, 69793284352
+ %3 = and i64 %0, 1073807616
+ %4 = icmp eq i64 %3, 1073807616
+ %test_single_bb_resolver._Msve.test_single_bb_resolver.default = select i1 %4, ptr @test_single_bb_resolver._Msve, ptr @test_single_bb_resolver.default
+ %common.ret.op = select i1 %2, ptr @test_single_bb_resolver._Msve2, ptr %test_single_bb_resolver._Msve.test_single_bb_resolver.default
ret ptr %common.ret.op
}
define i32 @caller1._Msve() #1 {
; CHECK-LABEL: define i32 @caller1._Msve(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve()
;
entry:
@@ -55,7 +71,7 @@ entry:
define i32 @caller1._Msve2() #2 {
; CHECK-LABEL: define i32 @caller1._Msve2(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve2()
;
entry:
@@ -65,7 +81,7 @@ entry:
define i32 @caller1.default() #0 {
; CHECK-LABEL: define i32 @caller1.default(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver.default()
;
entry:
@@ -73,6 +89,20 @@ entry:
ret i32 %call
}
+define weak_odr ptr @caller1.resolver() comdat {
+; CHECK-LABEL: define weak_odr ptr @caller1.resolver() comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 69793284352
+ %2 = icmp eq i64 %1, 69793284352
+ %3 = and i64 %0, 1073807616
+ %4 = icmp eq i64 %3, 1073807616
+ %caller1._Msve.caller1.default = select i1 %4, ptr @caller1._Msve, ptr @caller1.default
+ %common.ret.op = select i1 %2, ptr @caller1._Msve2, ptr %caller1._Msve.caller1.default
+ ret ptr %common.ret.op
+}
+
declare i32 @test_multi_bb_resolver._Mmops() #3
declare i32 @test_multi_bb_resolver._Msve2() #2
declare i32 @test_multi_bb_resolver._Msve() #1
@@ -92,20 +122,20 @@ common.ret: ; preds = %resolver_else2, %re
ret ptr %common.ret.op
resolver_else: ; preds = %resolver_entry
- %2 = and i64 %0, 68719476736
- %.not5 = icmp eq i64 %2, 0
- br i1 %.not5, label %resolver_else2, label %common.ret
+ %2 = and i64 %0, 69793284352
+ %3 = icmp eq i64 %2, 69793284352
+ br i1 %3, label %common.ret, label %resolver_else2
resolver_else2: ; preds = %resolver_else
- %3 = and i64 %0, 1073741824
- %.not6 = icmp eq i64 %3, 0
- %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default = select i1 %.not6, ptr @test_multi_bb_resolver.default, ptr @test_multi_bb_resolver._Msve
+ %4 = and i64 %0, 1073807616
+ %5 = icmp eq i64 %4, 1073807616
+ %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default = select i1 %5, ptr @test_multi_bb_resolver._Msve, ptr @test_multi_bb_resolver.default
br label %common.ret
}
define i32 @caller2._MmopsMsve2() #4 {
; CHECK-LABEL: define i32 @caller2._MmopsMsve2(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR4:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops()
;
entry:
@@ -115,7 +145,7 @@ entry:
define i32 @caller2._Mmops() #3 {
; CHECK-LABEL: define i32 @caller2._Mmops(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR3:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops()
;
entry:
@@ -125,7 +155,7 @@ entry:
define i32 @caller2._Msve() #1 {
; CHECK-LABEL: define i32 @caller2._Msve(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR1]] {
; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver()
;
entry:
@@ -135,7 +165,7 @@ entry:
define i32 @caller2.default() #0 {
; CHECK-LABEL: define i32 @caller2.default(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver.default()
;
entry:
@@ -143,6 +173,31 @@ entry:
ret i32 %call
}
+define weak_odr ptr @caller2.resolver() comdat {
+; CHECK-LABEL: define weak_odr ptr @caller2.resolver() comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 576460822096707840
+ %2 = icmp eq i64 %1, 576460822096707840
+ br i1 %2, label %common.ret, label %resolver_else
+
+common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry
+ %common.ret.op = phi ptr [ @caller2._MmopsMsve2, %resolver_entry ], [ @caller2._Mmops, %resolver_else ], [ %caller2._Msve.caller2.default, %resolver_else2 ]
+ ret ptr %common.ret.op
+
+resolver_else: ; preds = %resolver_entry
+ %3 = and i64 %0, 576460752303423488
+ %.not = icmp eq i64 %3, 0
+ br i1 %.not, label %resolver_else2, label %common.ret
+
+resolver_else2: ; preds = %resolver_else
+ %4 = and i64 %0, 1073807616
+ %5 = icmp eq i64 %4, 1073807616
+ %caller2._Msve.caller2.default = select i1 %5, ptr @caller2._Msve, ptr @caller2.default
+ br label %common.ret
+}
+
declare i32 @test_caller_feats_not_implied._Mmops() #3
declare i32 @test_caller_feats_not_implied._Msme() #5
declare i32 @test_caller_feats_not_implied._Msve() #1
@@ -162,20 +217,20 @@ common.ret: ; preds = %resolver_else2, %re
ret ptr %common.ret.op
resolver_else: ; preds = %resolver_entry
- %2 = and i64 %0, 4398046511104
- %.not5 = icmp eq i64 %2, 0
- br i1 %.not5, label %resolver_else2, label %common.ret
+ %2 = and i64 %0, 4398180795136
+ %3 = icmp eq i64 %2, 4398180795136
+ br i1 %3, label %common.ret, label %resolver_else2
resolver_else2: ; preds = %resolver_else
- %3 = and i64 %0, 1073741824
- %.not6 = icmp eq i64 %3, 0
- %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default = select i1 %.not6, ptr @test_caller_feats_not_implied.default, ptr @test_caller_feats_not_implied._Msve
+ %4 = and i64 %0, 1073807616
+ %5 = icmp eq i64 %4, 1073807616
+ %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default = select i1 %5, ptr @test_caller_feats_not_implied._Msve, ptr @test_caller_feats_not_implied.default
br label %common.ret
}
define i32 @caller3._Mmops() #3 {
; CHECK-LABEL: define i32 @caller3._Mmops(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR3]] {
+; CHECK-SAME: ) #[[ATTR3]] {
; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied._Mmops()
;
entry:
@@ -185,7 +240,7 @@ entry:
define i32 @caller3._Msve() #1 {
; CHECK-LABEL: define i32 @caller3._Msve(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR1]] {
; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied()
;
entry:
@@ -195,7 +250,7 @@ entry:
define i32 @caller3.default() #0 {
; CHECK-LABEL: define i32 @caller3.default(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied()
;
entry:
@@ -203,6 +258,20 @@ entry:
ret i32 %call
}
+define weak_odr ptr @caller3.resolver() comdat {
+; CHECK-LABEL: define weak_odr ptr @caller3.resolver() comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 576460752303423488
+ %.not = icmp eq i64 %1, 0
+ %2 = and i64 %0, 1073807616
+ %3 = icmp eq i64 %2, 1073807616
+ %caller3._Msve.caller3.default = select i1 %3, ptr @caller3._Msve, ptr @caller3.default
+ %common.ret.op = select i1 %.not, ptr %caller3._Msve.caller3.default, ptr @caller3._Mmops
+ ret ptr %common.ret.op
+}
+
declare i32 @test_non_fmv_caller._Maes() #6
declare i32 @test_non_fmv_caller._Msm4() #7
declare i32 @test_non_fmv_caller.default() #0
@@ -212,15 +281,18 @@ define weak_odr ptr @test_non_fmv_caller.resolver() comdat {
resolver_entry:
tail call void @__init_cpu_features_resolver()
%0 = load i64, ptr @__aarch64_cpu_features, align 8
- %1 = and i64 %0, 32768
- %.not = icmp eq i64 %1, 0
- %test_non_fmv_caller._Maes.test_non_fmv_caller.default = select i1 %.not, ptr @test_non_fmv_caller.default, ptr @test_non_fmv_caller._Maes
- ret ptr %test_non_fmv_caller._Maes.test_non_fmv_caller.default
+ %1 = and i64 %0, 33536
+ %2 = icmp eq i64 %1, 33536
+ %3 = and i64 %0, 800
+ %4 = icmp eq i64 %3, 800
+ %test_non_fmv_caller._Msm4.test_non_fmv_caller.default = select i1 %4, ptr @test_non_fmv_caller._Msm4, ptr @test_non_fmv_caller.default
+ %common.ret.op = select i1 %2, ptr @test_non_fmv_caller._Maes, ptr %test_non_fmv_caller._Msm4.test_non_fmv_caller.default
+ ret ptr %common.ret.op
}
define i32 @caller4() #8 {
; CHECK-LABEL: define i32 @caller4(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR7:[0-9]+]] {
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR8:[0-9]+]] {
; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller._Maes()
;
entry:
@@ -230,7 +302,7 @@ entry:
define i32 @caller5() #9 {
; CHECK-LABEL: define i32 @caller5(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR8:[0-9]+]] {
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR9:[0-9]+]] {
; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller()
;
entry:
@@ -239,7 +311,7 @@ entry:
}
declare i32 @test_priority._Msve2-sha3() #10
-declare i32 @test_priority._Mls64Mssbs() #11
+declare i32 @test_priority._McsscMssbs() #11
declare i32 @test_priority._MflagmMlseMrng() #12
declare i32 @test_priority.default() #0
@@ -248,36 +320,57 @@ define weak_odr ptr @test_priority.resolver() comdat {
resolver_entry:
tail call void @__init_cpu_features_resolver()
%0 = load i64, ptr @__aarch64_cpu_features, align 8
- %1 = and i64 %0, 131
- %2 = icmp eq i64 %1, 131
+ %1 = and i64 %0, 562949953423360
+ %2 = icmp eq i64 %1, 562949953423360
br i1 %2, label %common.ret, label %resolver_else
common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry
- %common.ret.op = phi ptr [ @test_priority._MflagmMlseMrng, %resolver_entry ], [ @test_priority._Mls64Mssbs, %resolver_else ], [ %test_priority._Msve2-sha3.test_priority.default, %resolver_else2 ]
+ %common.ret.op = phi ptr [ @test_priority._McsscMssbs, %resolver_entry ], [ @test_priority._Msve2-sha3, %resolver_else ], [ %test_priority._MflagmMlseMrng.test_priority.default, %resolver_else2 ]
ret ptr %common.ret.op
resolver_else: ; preds = %resolver_entry
- %3 = and i64 %0, 9570149208162304
- %4 = icmp eq i64 %3, 9570149208162304
+ %3 = and i64 %0, 1169304924928
+ %4 = icmp eq i64 %3, 1169304924928
br i1 %4, label %common.ret, label %resolver_else2
resolver_else2: ; preds = %resolver_else
- %5 = and i64 %0, 1099511627776
- %.not = icmp eq i64 %5, 0
- %test_priority._Msve2-sha3.test_priority.default = select i1 %.not, ptr @test_priority.default, ptr @test_priority._Msve2-sha3
+ %5 = and i64 %0, 131
+ %6 = icmp eq i64 %5, 131
+ %test_priority._MflagmMlseMrng.test_priority.default = select i1 %6, ptr @test_priority._MflagmMlseMrng, ptr @test_priority.default
br label %common.ret
}
-define i32 @caller6._MflagmMls64MlseMrngMssbsMsve2-sha3() #13 {
-; CHECK-LABEL: define i32 @caller6._MflagmMls64MlseMrngMssbsMsve2-sha3(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR12:[0-9]+]] {
-; CHECK: [[CALL:%.*]] = tail call i32 @test_priority._Mls64Mssbs()
+define i32 @caller6._McsscMflagmMlseMrngMssbsMsve2-sha3() #13 {
+; CHECK-LABEL: define i32 @caller6._McsscMflagmMlseMrngMssbsMsve2-sha3(
+; CHECK-SAME: ) #[[ATTR13:[0-9]+]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_priority._McsscMssbs()
;
entry:
%call = tail call i32 @test_priority()
ret i32 %call
}
+define i32 @caller6.default() #0 {
+; CHECK-LABEL: define i32 @caller6.default(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_priority()
+;
+entry:
+ %call = tail call i32 @test_priority()
+ ret i32 %call
+}
+
+define weak_odr ptr @caller6.resolver() comdat {
+; CHECK-LABEL: define weak_odr ptr @caller6.resolver() comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 564119258348419
+ %2 = icmp eq i64 %1, 564119258348419
+ %caller6._McsscMflagmMlseMrngMssbsMsve2-sha3.caller6.default = select i1 %2, ptr @caller6._McsscMflagmMlseMrngMssbsMsve2-sha3, ptr @caller6.default
+ ret ptr %caller6._McsscMflagmMlseMrngMssbsMsve2-sha3.caller6.default
+}
+
declare i32 @test_alternative_names._Mdpb2Mfrintts() #14
declare i32 @test_alternative_names._Mflagm2Mfrintts() #15
declare i32 @test_alternative_names._Mrcpc2() #16
@@ -310,7 +403,7 @@ resolver_else2: ; preds = %resolver_else
define i32 @caller7._Mdpb2Mfrintts() #14 {
; CHECK-LABEL: define i32 @caller7._Mdpb2Mfrintts(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR13:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR14:[0-9]+]] {
; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names._Mdpb2Mfrintts()
;
entry:
@@ -320,7 +413,7 @@ entry:
define i32 @caller7._Mfrintts() #17 {
; CHECK-LABEL: define i32 @caller7._Mfrintts(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR16:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR17:[0-9]+]] {
; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names()
;
entry:
@@ -330,7 +423,7 @@ entry:
define i32 @caller7._Mrcpc2() #16 {
; CHECK-LABEL: define i32 @caller7._Mrcpc2(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR15:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR16:[0-9]+]] {
; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names._Mrcpc2()
;
entry:
@@ -340,7 +433,7 @@ entry:
define i32 @caller7.default() #0 {
; CHECK-LABEL: define i32 @caller7.default(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names.default()
;
entry:
@@ -348,6 +441,159 @@ entry:
ret i32 %call
}
+define weak_odr ptr @caller7.resolver() comdat {
+; CHECK-LABEL: define weak_odr ptr @caller7.resolver() comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 17563904
+ %2 = icmp eq i64 %1, 17563904
+ br i1 %2, label %common.ret, label %resolver_else
+
+common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry
+ %common.ret.op = phi ptr [ @caller7._Mdpb2Mfrintts, %resolver_entry ], [ @caller7._Mfrintts, %resolver_else ], [ %caller7._Mrcpc2.caller7.default, %resolver_else2 ]
+ ret ptr %common.ret.op
+
+resolver_else: ; preds = %resolver_entry
+ %3 = and i64 %0, 16777472
+ %4 = icmp eq i64 %3, 16777472
+ br i1 %4, label %common.ret, label %resolver_else2
+
+resolver_else2: ; preds = %resolver_else
+ %5 = and i64 %0, 12582912
+ %6 = icmp eq i64 %5, 12582912
+ %caller7._Mrcpc2.caller7.default = select i1 %6, ptr @caller7._Mrcpc2, ptr @caller7.default
+ br label %common.ret
+}
+
+declare i32 @test_unrelated_callers._Mmops() #3
+declare i32 @test_unrelated_callers._Msve2() #2
+declare i32 @test_unrelated_callers._Msve() #1
+declare i32 @test_unrelated_callers.default() #0
+
+define weak_odr ptr @test_unrelated_callers.resolver() comdat {
+; CHECK-LABEL: define weak_odr ptr @test_unrelated_callers.resolver() comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 576460752303423488
+ %.not = icmp eq i64 %1, 0
+ br i1 %.not, label %resolver_else, label %common.ret
+
+common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry
+ %common.ret.op = phi ptr [ @test_unrelated_callers._Mmops, %resolver_entry ], [ @test_unrelated_callers._Msve2, %resolver_else ], [ %test_unrelated_callers._Msve.test_unrelated_callers.default, %resolver_else2 ]
+ ret ptr %common.ret.op
+
+resolver_else: ; preds = %resolver_entry
+ %2 = and i64 %0, 69793284352
+ %3 = icmp eq i64 %2, 69793284352
+ br i1 %3, label %common.ret, label %resolver_else2
+
+resolver_else2: ; preds = %resolver_else
+ %4 = and i64 %0, 1073807616
+ %5 = icmp eq i64 %4, 1073807616
+ %test_unrelated_callers._Msve.test_unrelated_callers.default = select i1 %5, ptr @test_unrelated_callers._Msve, ptr @test_unrelated_callers.default
+ br label %common.ret
+}
+
+define i32 @caller8._MmopsMsve2() #4 {
+; CHECK-LABEL: define i32 @caller8._MmopsMsve2(
+; CHECK-SAME: ) #[[ATTR4]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers._Mmops()
+;
+entry:
+ %call = tail call i32 @test_unrelated_callers()
+ ret i32 %call
+}
+
+define i32 @caller8._Msve() #1 {
+; CHECK-LABEL: define i32 @caller8._Msve(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers()
+;
+entry:
+ %call = tail call i32 @test_unrelated_callers()
+ ret i32 %call
+}
+
+define i32 @caller8.default() #0 {
+; CHECK-LABEL: define i32 @caller8.default(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers()
+;
+entry:
+ %call = tail call i32 @test_unrelated_callers()
+ ret i32 %call
+}
+
+define weak_odr ptr @caller8.resolver() comdat {
+; CHECK-LABEL: define weak_odr ptr @caller8.resolver() comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 576460822096707840
+ %2 = icmp eq i64 %1, 576460822096707840
+ %3 = and i64 %0, 1073807616
+ %4 = icmp eq i64 %3, 1073807616
+ %caller8._Msve.caller8.default = select i1 %4, ptr @caller8._Msve, ptr @caller8.default
+ %common.ret.op = select i1 %2, ptr @caller8._MmopsMsve2, ptr %caller8._Msve.caller8.default
+ ret ptr %common.ret.op
+}
+
+define i32 @caller9._Mmops() #3 {
+; CHECK-LABEL: define i32 @caller9._Mmops(
+; CHECK-SAME: ) #[[ATTR3]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers._Mmops()
+;
+entry:
+ %call = tail call i32 @test_unrelated_callers()
+ ret i32 %call
+}
+
+define i32 @caller9._Msve2() #2 {
+; CHECK-LABEL: define i32 @caller9._Msve2(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers._Msve2()
+;
+entry:
+ %call = tail call i32 @test_unrelated_callers()
+ ret i32 %call
+}
+
+define i32 @caller9.default() #0 {
+; CHECK-LABEL: define i32 @caller9.default(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers()
+;
+entry:
+ %call = tail call i32 @test_unrelated_callers()
+ ret i32 %call
+}
+
+define weak_odr ptr @caller9.resolver() comdat {
+; CHECK-LABEL: define weak_odr ptr @caller9.resolver() comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 576460752303423488
+ %.not = icmp eq i64 %1, 0
+ %2 = and i64 %0, 69793284352
+ %3 = icmp eq i64 %2, 69793284352
+ %caller9._Msve2.caller9.default = select i1 %3, ptr @caller9._Msve2, ptr @caller9.default
+ %common.ret.op = select i1 %.not, ptr %caller9._Msve2.caller9.default, ptr @caller9._Mmops
+ ret ptr %common.ret.op
+}
+
+define i32 @caller10() #18 {
+; CHECK-LABEL: define i32 @caller10(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR18:[0-9]+]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers._Mmops()
+;
+entry:
+ %call = tail call i32 @test_unrelated_callers()
+ ret i32 %call
+}
+
attributes #0 = { "fmv-features" }
attributes #1 = { "fmv-features"="sve" }
attributes #2 = { "fmv-features"="sve2" }
@@ -359,10 +605,11 @@ attributes #7 = { "fmv-features"="sm4" }
attributes #8 = { "target-features"="+aes,+fp-armv8,+neon,+outline-atomics,+v8a" }
attributes #9 = { "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,+sm4" }
attributes #10 = { "fmv-features"="sve2-sha3" }
-attributes #11 = { "fmv-features"="ls64,ssbs" }
+attributes #11 = { "fmv-features"="cssc,ssbs" }
attributes #12 = { "fmv-features"="flagm,lse,rng" }
-attributes #13 = { "fmv-features"="flagm,ls64,lse,rng,ssbs,sve2-sha3" }
+attributes #13 = { "fmv-features"="cssc,flagm,lse,rng,ssbs,sve2-sha3" }
attributes #14 = { "fmv-features"="dpb2,frintts" }
attributes #15 = { "fmv-features"="flagm2,frintts" }
attributes #16 = { "fmv-features"="rcpc2" }
attributes #17 = { "fmv-features"="frintts" }
+attributes #18 = { "target-features"="+fp-armv8,+mops,+neon,+outline-atomics,+sve,+v8a" }
>From 28991c037b268788e3ca292d84fce4196ab3bd48 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Mon, 22 Sep 2025 09:50:18 +0100
Subject: [PATCH 02/11] rename var
---
llvm/lib/Transforms/IPO/GlobalOpt.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 0707eb5eacf5d..cf01936ff5611 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2585,12 +2585,12 @@ static bool OptimizeNonTrivialIFuncs(
if (auto *CB = dyn_cast<CallBase>(U)) {
if (CB->getCalledOperand() == CalleeIF) {
Function *Caller = CB->getFunction();
- GlobalIFunc *CallerIFunc = nullptr;
+ GlobalIFunc *CallerIF = nullptr;
TargetTransformInfo &TTI = GetTTI(*Caller);
bool CallerIsFMV = TTI.isMultiversionedFunction(*Caller);
// The caller is a version of a known IFunc.
if (auto It = VersionOf.find(Caller); It != VersionOf.end())
- CallerIFunc = It->second;
+ CallerIF = It->second;
else if (!CallerIsFMV && OptimizeNonFMVCallers) {
// The caller is non-FMV.
auto [It, Inserted] = FeatureMask.try_emplace(Caller);
@@ -2602,7 +2602,7 @@ static bool OptimizeNonTrivialIFuncs(
auto [It, Inserted] = CallSites.try_emplace(Caller);
if (Inserted) {
if (CallerIsFMV)
- CallerIFuncs.push_back(CallerIFunc);
+ CallerIFuncs.push_back(CallerIF);
else
NonFMVCallers.push_back(Caller);
}
>From 704fbe455a0f0fc5441d8ff03588284d9fa9c03f Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Mon, 22 Sep 2025 13:26:08 +0100
Subject: [PATCH 03/11] Change caller8._Msve -> caller8._Msve2 and
caller9._Msve2 -> caller9._Msve, and remove callsite from caller9._Msve.
---
.../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 35 +++++++++----------
1 file changed, 16 insertions(+), 19 deletions(-)
diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
index 7ace67e3857ff..156c49c8b6677 100644
--- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
+++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
@@ -506,9 +506,9 @@ entry:
ret i32 %call
}
-define i32 @caller8._Msve() #1 {
-; CHECK-LABEL: define i32 @caller8._Msve(
-; CHECK-SAME: ) #[[ATTR1]] {
+define dso_local i32 @caller8._Msve2() #2 {
+; CHECK-LABEL: define dso_local i32 @caller8._Msve2(
+; CHECK-SAME: ) #[[ATTR2]] {
; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers()
;
entry:
@@ -533,10 +533,10 @@ resolver_entry:
%0 = load i64, ptr @__aarch64_cpu_features, align 8
%1 = and i64 %0, 576460822096707840
%2 = icmp eq i64 %1, 576460822096707840
- %3 = and i64 %0, 1073807616
- %4 = icmp eq i64 %3, 1073807616
- %caller8._Msve.caller8.default = select i1 %4, ptr @caller8._Msve, ptr @caller8.default
- %common.ret.op = select i1 %2, ptr @caller8._MmopsMsve2, ptr %caller8._Msve.caller8.default
+ %3 = and i64 %0, 69793284352
+ %4 = icmp eq i64 %3, 69793284352
+ %caller8._Msve2.caller8.default = select i1 %4, ptr @caller8._Msve2, ptr @caller8.default
+ %common.ret.op = select i1 %2, ptr @caller8._MmopsMsve2, ptr %caller8._Msve2.caller8.default
ret ptr %common.ret.op
}
@@ -550,20 +550,17 @@ entry:
ret i32 %call
}
-define i32 @caller9._Msve2() #2 {
-; CHECK-LABEL: define i32 @caller9._Msve2(
-; CHECK-SAME: ) #[[ATTR2]] {
-; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers._Msve2()
-;
+define i32 @caller9._Msve() #1 {
+; CHECK-LABEL: define i32 @caller9._Msve(
+; CHECK-SAME: ) #[[ATTR1]] {
entry:
- %call = tail call i32 @test_unrelated_callers()
- ret i32 %call
+ ret i32 1
}
define i32 @caller9.default() #0 {
; CHECK-LABEL: define i32 @caller9.default(
; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers()
+; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers.default()
;
entry:
%call = tail call i32 @test_unrelated_callers()
@@ -577,10 +574,10 @@ resolver_entry:
%0 = load i64, ptr @__aarch64_cpu_features, align 8
%1 = and i64 %0, 576460752303423488
%.not = icmp eq i64 %1, 0
- %2 = and i64 %0, 69793284352
- %3 = icmp eq i64 %2, 69793284352
- %caller9._Msve2.caller9.default = select i1 %3, ptr @caller9._Msve2, ptr @caller9.default
- %common.ret.op = select i1 %.not, ptr %caller9._Msve2.caller9.default, ptr @caller9._Mmops
+ %2 = and i64 %0, 1073807616
+ %3 = icmp eq i64 %2, 1073807616
+ %caller9._Msve.caller9.default = select i1 %3, ptr @caller9._Msve, ptr @caller9.default
+ %common.ret.op = select i1 %.not, ptr %caller9._Msve.caller9.default, ptr @caller9._Mmops
ret ptr %common.ret.op
}
>From 94ec4ab8755c4d75c00dcff9523ac838c3cb2fda Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Tue, 23 Sep 2025 16:29:50 +0100
Subject: [PATCH 04/11] Keep track of unavailable features from previous
callers. This allows smarter elimination of candidates at the expense of more
comparisons.
---
llvm/lib/Transforms/IPO/GlobalOpt.cpp | 59 +++++++++++--------
.../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 4 +-
2 files changed, 38 insertions(+), 25 deletions(-)
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index cf01936ff5611..a071ebc2fd628 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2618,6 +2618,8 @@ static bool OptimizeNonTrivialIFuncs(
SmallVectorImpl<Function *> &Callees) {
// Index to the current callee candidate.
unsigned I = 0;
+ // Feature bits from callers of previous iterations.
+ SmallVector<APInt> KnownBits;
// Try to redirect calls starting from higher priority callers.
for (Function *Caller : Callers) {
@@ -2625,19 +2627,41 @@ static bool OptimizeNonTrivialIFuncs(
break;
bool CallerIsFMV = GetTTI(*Caller).isMultiversionedFunction(*Caller);
+ // We can't reason much about non-FMV callers. Just pick the highest
+ // priority callee if it matches, otherwise bail.
+ if (!CallerIsFMV)
+ assert(I == 0 && "Should only select the highest priority candidate");
+
+ APInt CallerBits = FeatureMask[Caller];
+ APInt CalleeBits = FeatureMask[Callees[I]];
// In the case of FMV callers, we know that all higher priority callers
// than the current one did not get selected at runtime, which helps
// reason about the callees (if they have versions that mandate presence
// of the features which we already know are unavailable on this
- // target).
- if (!CallerIsFMV)
- // We can't reason much about non-FMV callers. Just pick the highest
- // priority callee if it matches, otherwise bail.
- assert(I == 0 && "Should only select the highest priority candidate");
-
+ // target, then we can skip over those versions/candidates).
+ if (CallerIsFMV) {
+ // Discard feature bits that are known to be available
+ // in the current iteration.
+ for (APInt &Version: KnownBits)
+ if (CallerBits.isSubsetOf(Version))
+ Version &= ~CallerBits;
+ // Keep advancing the candidate index as long as the unavailable
+ // features are a subset of the current candidate's.
+ unsigned J = 0;
+ while (J < KnownBits.size()) {
+ APInt Version = KnownBits[J];
+ if (Version.isSubsetOf(CalleeBits)) {
+ if (++I == Callees.size())
+ break;
+ CalleeBits = FeatureMask[Callees[I]];
+ // Start over.
+ J = 0;
+ } else
+ ++J;
+ }
+ KnownBits.push_back(CallerBits);
+ }
Function *Callee = Callees[I];
- APInt CallerBits = FeatureMask[Caller];
- APInt CalleeBits = FeatureMask[Callee];
// If the feature set of the caller implies the feature set of the
// highest priority candidate then it shall be picked.
if (CalleeBits.isSubsetOf(CallerBits)) {
@@ -2652,30 +2676,19 @@ static bool OptimizeNonTrivialIFuncs(
Changed = true;
}
}
- // Keep advancing the candidate index as long as the caller's
- // features are a subset of the current candidate's.
- if (CallerIsFMV) {
- while (CallerBits.isSubsetOf(CalleeBits)) {
- if (++I == Callees.size())
- break;
- CalleeBits = FeatureMask[Callees[I]];
- }
- }
}
};
auto &Callees = VersionedFuncs[CalleeIF];
// Optimize non-FMV calls.
- if (!NonFMVCallers.empty() && OptimizeNonFMVCallers)
+ if (OptimizeNonFMVCallers)
redirectCalls(NonFMVCallers, Callees);
// Optimize FMV calls.
- if (!CallerIFuncs.empty()) {
- for (GlobalIFunc *CallerIF : CallerIFuncs) {
- auto &Callers = VersionedFuncs[CallerIF];
- redirectCalls(Callers, Callees);
- }
+ for (GlobalIFunc *CallerIF : CallerIFuncs) {
+ auto &Callers = VersionedFuncs[CallerIF];
+ redirectCalls(Callers, Callees);
}
if (CalleeIF->use_empty() ||
diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
index 156c49c8b6677..e6706b1ced217 100644
--- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
+++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
@@ -156,7 +156,7 @@ entry:
define i32 @caller2._Msve() #1 {
; CHECK-LABEL: define i32 @caller2._Msve(
; CHECK-SAME: ) #[[ATTR1]] {
-; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver()
+; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Msve()
;
entry:
%call = tail call i32 @test_multi_bb_resolver()
@@ -509,7 +509,7 @@ entry:
define dso_local i32 @caller8._Msve2() #2 {
; CHECK-LABEL: define dso_local i32 @caller8._Msve2(
; CHECK-SAME: ) #[[ATTR2]] {
-; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers()
+; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers._Msve2()
;
entry:
%call = tail call i32 @test_unrelated_callers()
>From bd9b454e6a3a8a1746a9752ca49ced1f7a1734b2 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Tue, 23 Sep 2025 19:18:18 +0100
Subject: [PATCH 05/11] clang format
---
llvm/lib/Transforms/IPO/GlobalOpt.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index a071ebc2fd628..ab9ed9efad317 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2642,7 +2642,7 @@ static bool OptimizeNonTrivialIFuncs(
if (CallerIsFMV) {
// Discard feature bits that are known to be available
// in the current iteration.
- for (APInt &Version: KnownBits)
+ for (APInt &Version : KnownBits)
if (CallerBits.isSubsetOf(Version))
Version &= ~CallerBits;
// Keep advancing the candidate index as long as the unavailable
>From 98f9197a4da5144a690524d0f098ca2008a7ea3b Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Wed, 24 Sep 2025 16:04:59 +0100
Subject: [PATCH 06/11] When disregarding feature bits that are known to be
available in the current iteration, the lifespan of this knowledge should
expire in the next iteration. Therefore we should not clear those bits from
KnownBits.
---
llvm/lib/Transforms/IPO/GlobalOpt.cpp | 9 +-
.../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 92 ++++++++++++++++++-
2 files changed, 93 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index ab9ed9efad317..4449c1e74a612 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2640,16 +2640,13 @@ static bool OptimizeNonTrivialIFuncs(
// of the features which we already know are unavailable on this
// target, then we can skip over those versions/candidates).
if (CallerIsFMV) {
- // Discard feature bits that are known to be available
- // in the current iteration.
- for (APInt &Version : KnownBits)
- if (CallerBits.isSubsetOf(Version))
- Version &= ~CallerBits;
// Keep advancing the candidate index as long as the unavailable
// features are a subset of the current candidate's.
unsigned J = 0;
while (J < KnownBits.size()) {
- APInt Version = KnownBits[J];
+ // Discard feature bits that are known to be available
+ // in the current iteration.
+ APInt Version = KnownBits[J] & ~CallerBits;
if (Version.isSubsetOf(CalleeBits)) {
if (++I == Callees.size())
break;
diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
index e6706b1ced217..3a6866c4e16a4 100644
--- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
+++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names|test_unrelated_callers)" --version 4
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names|test_unrelated_callers|test_clear_known_bits)" --version 4
; REQUIRES: aarch64-registered-target
@@ -14,6 +14,7 @@ $test_non_fmv_caller.resolver = comdat any
$test_priority.resolver = comdat any
$test_alternative_names.resolver = comdat any
$test_unrelated_callers.resolver = comdat any
+$test_clear_known_bits.resolver = comdat any
$caller1.resolver = comdat any
$caller2.resolver = comdat any
$caller3.resolver = comdat any
@@ -21,6 +22,7 @@ $caller6.resolver = comdat any
$caller7.resolver = comdat any
$caller8.resolver = comdat any
$caller9.resolver = comdat any
+$caller11.resolver = comdat any
@__aarch64_cpu_features = external local_unnamed_addr global { i64 }
@@ -31,6 +33,7 @@ $caller9.resolver = comdat any
@test_priority = weak_odr ifunc i32 (), ptr @test_priority.resolver
@test_alternative_names = weak_odr ifunc i32 (), ptr @test_alternative_names.resolver
@test_unrelated_callers = weak_odr ifunc i32 (), ptr @test_unrelated_callers.resolver
+ at test_clear_known_bits = weak_odr ifunc i32 (), ptr @test_clear_known_bits.resolver
@caller1 = weak_odr ifunc i32 (), ptr @caller1.resolver
@caller2 = weak_odr ifunc i32 (), ptr @caller2.resolver
@caller3 = weak_odr ifunc i32 (), ptr @caller3.resolver
@@ -38,6 +41,7 @@ $caller9.resolver = comdat any
@caller7 = weak_odr ifunc i32 (), ptr @caller7.resolver
@caller8 = weak_odr ifunc i32 (), ptr @caller8.resolver
@caller9 = weak_odr ifunc i32 (), ptr @caller9.resolver
+ at caller11 = weak_odr ifunc i32 (), ptr @caller11.resolver
declare void @__init_cpu_features_resolver() local_unnamed_addr
@@ -156,7 +160,7 @@ entry:
define i32 @caller2._Msve() #1 {
; CHECK-LABEL: define i32 @caller2._Msve(
; CHECK-SAME: ) #[[ATTR1]] {
-; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Msve()
+; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver()
;
entry:
%call = tail call i32 @test_multi_bb_resolver()
@@ -591,6 +595,89 @@ entry:
ret i32 %call
}
+declare i32 @test_clear_known_bits._Mmops() #3
+declare i32 @test_clear_known_bits._Maes() #6
+declare i32 @test_clear_known_bits.default() #0
+
+define weak_odr ptr @test_clear_known_bits.resolver() comdat {
+; CHECK-LABEL: define weak_odr ptr @test_clear_known_bits.resolver() comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 576460752303423488
+ %.not = icmp eq i64 %1, 0
+ %2 = and i64 %0, 33536
+ %3 = icmp eq i64 %2, 33536
+ %test_clear_known_bits._Maes.test_clear_known_bits.default = select i1 %3, ptr @test_clear_known_bits._Maes, ptr @test_clear_known_bits.default
+ %common.ret.op = select i1 %.not, ptr %test_clear_known_bits._Maes.test_clear_known_bits.default, ptr @test_clear_known_bits._Mmops
+ ret ptr %common.ret.op
+}
+
+define i32 @caller11._MmopsMsve2() #4 {
+; CHECK-LABEL: define i32 @caller11._MmopsMsve2(
+; CHECK-SAME: ) #[[ATTR4]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_clear_known_bits._Mmops()
+;
+entry:
+ %call = tail call i32 @test_clear_known_bits()
+ ret i32 %call
+}
+
+define i32 @caller11._Msme() #5 {
+; CHECK-LABEL: define i32 @caller11._Msme(
+; CHECK-SAME: ) #[[ATTR5:[0-9]+]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_clear_known_bits()
+;
+entry:
+ %call = tail call i32 @test_clear_known_bits()
+ ret i32 %call
+}
+
+define noundef i32 @caller11._MaesMsve2() #19 {
+; CHECK-LABEL: define noundef i32 @caller11._MaesMsve2(
+; CHECK-SAME: ) #[[ATTR19:[0-9]+]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_clear_known_bits._Maes()
+;
+entry:
+ %call = tail call i32 @test_clear_known_bits()
+ ret i32 %call
+}
+
+define i32 @caller11.default() #0 {
+; CHECK-LABEL: define i32 @caller11.default(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_clear_known_bits()
+;
+entry:
+ %call = tail call i32 @test_clear_known_bits()
+ ret i32 %call
+}
+
+define weak_odr ptr @caller11.resolver() comdat {
+; CHECK-LABEL: define weak_odr ptr @caller11.resolver() comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 576460822096707840
+ %2 = icmp eq i64 %1, 576460822096707840
+ br i1 %2, label %common.ret, label %resolver_else
+
+common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry
+ %common.ret.op = phi ptr [ @caller11._MmopsMsve2, %resolver_entry ], [ @caller11._Msme, %resolver_else ], [ %caller11._MaesMsve2.caller11.default, %resolver_else2 ]
+ ret ptr %common.ret.op
+
+resolver_else: ; preds = %resolver_entry
+ %3 = and i64 %0, 4398180795136
+ %4 = icmp eq i64 %3, 4398180795136
+ br i1 %4, label %common.ret, label %resolver_else2
+
+resolver_else2: ; preds = %resolver_else
+ %5 = and i64 %0, 69793317632
+ %6 = icmp eq i64 %5, 69793317632
+ %caller11._MaesMsve2.caller11.default = select i1 %6, ptr @caller11._MaesMsve2, ptr @caller11.default
+ br label %common.ret
+}
+
attributes #0 = { "fmv-features" }
attributes #1 = { "fmv-features"="sve" }
attributes #2 = { "fmv-features"="sve2" }
@@ -610,3 +697,4 @@ attributes #15 = { "fmv-features"="flagm2,frintts" }
attributes #16 = { "fmv-features"="rcpc2" }
attributes #17 = { "fmv-features"="frintts" }
attributes #18 = { "target-features"="+fp-armv8,+mops,+neon,+outline-atomics,+sve,+v8a" }
+attributes #19 = { "fmv-features"="aes,sve2" }
>From 6805b403f7ba13de6c212007522632f36c595d58 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Mon, 3 Nov 2025 13:29:44 +0000
Subject: [PATCH 07/11] Revert "When disregarding feature bits that are known
to be available" Revert "clang format" Revert "Keep track of unavailable
features from previous callers. This allows"
---
llvm/lib/Transforms/IPO/GlobalOpt.cpp | 56 +++++------
.../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 92 +------------------
2 files changed, 25 insertions(+), 123 deletions(-)
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 4449c1e74a612..cf01936ff5611 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2618,8 +2618,6 @@ static bool OptimizeNonTrivialIFuncs(
SmallVectorImpl<Function *> &Callees) {
// Index to the current callee candidate.
unsigned I = 0;
- // Feature bits from callers of previous iterations.
- SmallVector<APInt> KnownBits;
// Try to redirect calls starting from higher priority callers.
for (Function *Caller : Callers) {
@@ -2627,38 +2625,19 @@ static bool OptimizeNonTrivialIFuncs(
break;
bool CallerIsFMV = GetTTI(*Caller).isMultiversionedFunction(*Caller);
- // We can't reason much about non-FMV callers. Just pick the highest
- // priority callee if it matches, otherwise bail.
- if (!CallerIsFMV)
- assert(I == 0 && "Should only select the highest priority candidate");
-
- APInt CallerBits = FeatureMask[Caller];
- APInt CalleeBits = FeatureMask[Callees[I]];
// In the case of FMV callers, we know that all higher priority callers
// than the current one did not get selected at runtime, which helps
// reason about the callees (if they have versions that mandate presence
// of the features which we already know are unavailable on this
- // target, then we can skip over those versions/candidates).
- if (CallerIsFMV) {
- // Keep advancing the candidate index as long as the unavailable
- // features are a subset of the current candidate's.
- unsigned J = 0;
- while (J < KnownBits.size()) {
- // Discard feature bits that are known to be available
- // in the current iteration.
- APInt Version = KnownBits[J] & ~CallerBits;
- if (Version.isSubsetOf(CalleeBits)) {
- if (++I == Callees.size())
- break;
- CalleeBits = FeatureMask[Callees[I]];
- // Start over.
- J = 0;
- } else
- ++J;
- }
- KnownBits.push_back(CallerBits);
- }
+ // target).
+ if (!CallerIsFMV)
+ // We can't reason much about non-FMV callers. Just pick the highest
+ // priority callee if it matches, otherwise bail.
+ assert(I == 0 && "Should only select the highest priority candidate");
+
Function *Callee = Callees[I];
+ APInt CallerBits = FeatureMask[Caller];
+ APInt CalleeBits = FeatureMask[Callee];
// If the feature set of the caller implies the feature set of the
// highest priority candidate then it shall be picked.
if (CalleeBits.isSubsetOf(CallerBits)) {
@@ -2673,19 +2652,30 @@ static bool OptimizeNonTrivialIFuncs(
Changed = true;
}
}
+ // Keep advancing the candidate index as long as the caller's
+ // features are a subset of the current candidate's.
+ if (CallerIsFMV) {
+ while (CallerBits.isSubsetOf(CalleeBits)) {
+ if (++I == Callees.size())
+ break;
+ CalleeBits = FeatureMask[Callees[I]];
+ }
+ }
}
};
auto &Callees = VersionedFuncs[CalleeIF];
// Optimize non-FMV calls.
- if (OptimizeNonFMVCallers)
+ if (!NonFMVCallers.empty() && OptimizeNonFMVCallers)
redirectCalls(NonFMVCallers, Callees);
// Optimize FMV calls.
- for (GlobalIFunc *CallerIF : CallerIFuncs) {
- auto &Callers = VersionedFuncs[CallerIF];
- redirectCalls(Callers, Callees);
+ if (!CallerIFuncs.empty()) {
+ for (GlobalIFunc *CallerIF : CallerIFuncs) {
+ auto &Callers = VersionedFuncs[CallerIF];
+ redirectCalls(Callers, Callees);
+ }
}
if (CalleeIF->use_empty() ||
diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
index 3a6866c4e16a4..156c49c8b6677 100644
--- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
+++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names|test_unrelated_callers|test_clear_known_bits)" --version 4
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names|test_unrelated_callers)" --version 4
; REQUIRES: aarch64-registered-target
@@ -14,7 +14,6 @@ $test_non_fmv_caller.resolver = comdat any
$test_priority.resolver = comdat any
$test_alternative_names.resolver = comdat any
$test_unrelated_callers.resolver = comdat any
-$test_clear_known_bits.resolver = comdat any
$caller1.resolver = comdat any
$caller2.resolver = comdat any
$caller3.resolver = comdat any
@@ -22,7 +21,6 @@ $caller6.resolver = comdat any
$caller7.resolver = comdat any
$caller8.resolver = comdat any
$caller9.resolver = comdat any
-$caller11.resolver = comdat any
@__aarch64_cpu_features = external local_unnamed_addr global { i64 }
@@ -33,7 +31,6 @@ $caller11.resolver = comdat any
@test_priority = weak_odr ifunc i32 (), ptr @test_priority.resolver
@test_alternative_names = weak_odr ifunc i32 (), ptr @test_alternative_names.resolver
@test_unrelated_callers = weak_odr ifunc i32 (), ptr @test_unrelated_callers.resolver
- at test_clear_known_bits = weak_odr ifunc i32 (), ptr @test_clear_known_bits.resolver
@caller1 = weak_odr ifunc i32 (), ptr @caller1.resolver
@caller2 = weak_odr ifunc i32 (), ptr @caller2.resolver
@caller3 = weak_odr ifunc i32 (), ptr @caller3.resolver
@@ -41,7 +38,6 @@ $caller11.resolver = comdat any
@caller7 = weak_odr ifunc i32 (), ptr @caller7.resolver
@caller8 = weak_odr ifunc i32 (), ptr @caller8.resolver
@caller9 = weak_odr ifunc i32 (), ptr @caller9.resolver
- at caller11 = weak_odr ifunc i32 (), ptr @caller11.resolver
declare void @__init_cpu_features_resolver() local_unnamed_addr
@@ -513,7 +509,7 @@ entry:
define dso_local i32 @caller8._Msve2() #2 {
; CHECK-LABEL: define dso_local i32 @caller8._Msve2(
; CHECK-SAME: ) #[[ATTR2]] {
-; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers._Msve2()
+; CHECK: [[CALL:%.*]] = tail call i32 @test_unrelated_callers()
;
entry:
%call = tail call i32 @test_unrelated_callers()
@@ -595,89 +591,6 @@ entry:
ret i32 %call
}
-declare i32 @test_clear_known_bits._Mmops() #3
-declare i32 @test_clear_known_bits._Maes() #6
-declare i32 @test_clear_known_bits.default() #0
-
-define weak_odr ptr @test_clear_known_bits.resolver() comdat {
-; CHECK-LABEL: define weak_odr ptr @test_clear_known_bits.resolver() comdat {
-resolver_entry:
- tail call void @__init_cpu_features_resolver()
- %0 = load i64, ptr @__aarch64_cpu_features, align 8
- %1 = and i64 %0, 576460752303423488
- %.not = icmp eq i64 %1, 0
- %2 = and i64 %0, 33536
- %3 = icmp eq i64 %2, 33536
- %test_clear_known_bits._Maes.test_clear_known_bits.default = select i1 %3, ptr @test_clear_known_bits._Maes, ptr @test_clear_known_bits.default
- %common.ret.op = select i1 %.not, ptr %test_clear_known_bits._Maes.test_clear_known_bits.default, ptr @test_clear_known_bits._Mmops
- ret ptr %common.ret.op
-}
-
-define i32 @caller11._MmopsMsve2() #4 {
-; CHECK-LABEL: define i32 @caller11._MmopsMsve2(
-; CHECK-SAME: ) #[[ATTR4]] {
-; CHECK: [[CALL:%.*]] = tail call i32 @test_clear_known_bits._Mmops()
-;
-entry:
- %call = tail call i32 @test_clear_known_bits()
- ret i32 %call
-}
-
-define i32 @caller11._Msme() #5 {
-; CHECK-LABEL: define i32 @caller11._Msme(
-; CHECK-SAME: ) #[[ATTR5:[0-9]+]] {
-; CHECK: [[CALL:%.*]] = tail call i32 @test_clear_known_bits()
-;
-entry:
- %call = tail call i32 @test_clear_known_bits()
- ret i32 %call
-}
-
-define noundef i32 @caller11._MaesMsve2() #19 {
-; CHECK-LABEL: define noundef i32 @caller11._MaesMsve2(
-; CHECK-SAME: ) #[[ATTR19:[0-9]+]] {
-; CHECK: [[CALL:%.*]] = tail call i32 @test_clear_known_bits._Maes()
-;
-entry:
- %call = tail call i32 @test_clear_known_bits()
- ret i32 %call
-}
-
-define i32 @caller11.default() #0 {
-; CHECK-LABEL: define i32 @caller11.default(
-; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK: [[CALL:%.*]] = tail call i32 @test_clear_known_bits()
-;
-entry:
- %call = tail call i32 @test_clear_known_bits()
- ret i32 %call
-}
-
-define weak_odr ptr @caller11.resolver() comdat {
-; CHECK-LABEL: define weak_odr ptr @caller11.resolver() comdat {
-resolver_entry:
- tail call void @__init_cpu_features_resolver()
- %0 = load i64, ptr @__aarch64_cpu_features, align 8
- %1 = and i64 %0, 576460822096707840
- %2 = icmp eq i64 %1, 576460822096707840
- br i1 %2, label %common.ret, label %resolver_else
-
-common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry
- %common.ret.op = phi ptr [ @caller11._MmopsMsve2, %resolver_entry ], [ @caller11._Msme, %resolver_else ], [ %caller11._MaesMsve2.caller11.default, %resolver_else2 ]
- ret ptr %common.ret.op
-
-resolver_else: ; preds = %resolver_entry
- %3 = and i64 %0, 4398180795136
- %4 = icmp eq i64 %3, 4398180795136
- br i1 %4, label %common.ret, label %resolver_else2
-
-resolver_else2: ; preds = %resolver_else
- %5 = and i64 %0, 69793317632
- %6 = icmp eq i64 %5, 69793317632
- %caller11._MaesMsve2.caller11.default = select i1 %6, ptr @caller11._MaesMsve2, ptr @caller11.default
- br label %common.ret
-}
-
attributes #0 = { "fmv-features" }
attributes #1 = { "fmv-features"="sve" }
attributes #2 = { "fmv-features"="sve2" }
@@ -697,4 +610,3 @@ attributes #15 = { "fmv-features"="flagm2,frintts" }
attributes #16 = { "fmv-features"="rcpc2" }
attributes #17 = { "fmv-features"="frintts" }
attributes #18 = { "target-features"="+fp-armv8,+mops,+neon,+outline-atomics,+sve,+v8a" }
-attributes #19 = { "fmv-features"="aes,sve2" }
>From ab1d64a0e28d8bb36f6baec7e082ef0bb4714633 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Mon, 3 Nov 2025 17:42:21 +0000
Subject: [PATCH 08/11] [NFC] minor refactoring and rewording
---
llvm/lib/Transforms/IPO/GlobalOpt.cpp | 109 ++++++++++++--------------
1 file changed, 52 insertions(+), 57 deletions(-)
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 9742104fb89e3..50478902406e8 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2511,30 +2511,28 @@ collectVersions(Value *V, SmallVectorImpl<Function *> &Versions,
return true;
}
-// Bypass the IFunc Resolver of MultiVersioned functions when possible. To
-// deduce whether the optimization is legal we need to compare the target
-// features between caller and callee versions. The criteria for bypassing
-// the resolver are the following:
-//
-// * If the callee's feature set is a subset of the caller's feature set,
-// then the callee is a candidate for direct call.
-//
-// * Among such candidates the one of highest priority is the best match
-// and it shall be picked, unless there is a version of the callee with
-// higher priority than the best match which cannot be picked from a
-// higher priority caller (directly or through the resolver).
-//
-// * For every higher priority callee version than the best match, there
-// is a higher priority caller version whose feature set availability
-// is implied by the callee's feature set.
+// Try to statically resolve calls to versioned functions when possible. First
+// we identify the function versions which are associated with an IFUNC symbol.
+// We do that by examining the resolver function of the IFUNC. Once we have
+// collected all the function versions, we sort them in decreasing priority
+// order. This is necessary for identifying the highest priority callee version
+// for a given caller version. We then collect all the callsites to versioned
+// functions. The static resolution is performed by comparing the feature sets
+// between callers and callees. Versions of the callee may be skipped if they
+// depend on features we already know are unavailable. This information can
+// be deduced on each subsequent iteration of the set of caller versions: prior
+// iterations correspond to higher priority caller versions which would not have
+// been selected in a hypothetical runtime execution.
//
+// Presentation in EuroLLVM2025:
+// https://www.youtube.com/watch?v=k54MFimPz-A&t=867s
static bool OptimizeNonTrivialIFuncs(
Module &M, function_ref<TargetTransformInfo &(Function &)> GetTTI) {
bool Changed = false;
// Map containing the feature bits for a given function.
DenseMap<Function *, APInt> FeatureMask;
- // Map containing all the versions corresponding to an IFunc symbol.
+ // Map containing all the function versions corresponding to an IFunc symbol.
DenseMap<GlobalIFunc *, SmallVector<Function *>> VersionedFuncs;
// Map containing the IFunc symbol a function is version of.
DenseMap<Function *, GlobalIFunc *> VersionOf;
@@ -2620,52 +2618,51 @@ static bool OptimizeNonTrivialIFuncs(
LLVM_DEBUG(dbgs() << "Statically resolving calls to function "
<< CalleeIF->getResolverFunction()->getName() << "\n");
- auto redirectCalls = [&](SmallVectorImpl<Function *> &Callers,
- SmallVectorImpl<Function *> &Callees) {
- // Index to the current callee candidate.
+ // The complexity of this algorithm is linear: O(NumCallers + NumCallees).
+ // TODO
+ // A limitation it has is that we are not using information about the
+ // current caller to deduce why an earlier caller of higher priority was
+ // skipped. For example let's say the current caller is aes+sve2 and a
+ // previous caller was mops+sve2. Knowing that sve2 is available we could
+ // infer that mops is unavailable. This would allow us to skip callee
+ // versions which depend on mops. I tried implementing this but the
+ // complexity was cubic :/
+ auto redirectCalls = [&](ArrayRef<Function *> Callers,
+ ArrayRef<Function *> Callees) {
+ // Index to the highest callee candidate.
unsigned I = 0;
- // Try to redirect calls starting from higher priority callers.
- for (Function *Caller : Callers) {
+ for (Function *const &Caller : Callers) {
if (I == Callees.size())
break;
- bool CallerIsFMV = GetTTI(*Caller).isMultiversionedFunction(*Caller);
- // In the case of FMV callers, we know that all higher priority callers
- // than the current one did not get selected at runtime, which helps
- // reason about the callees (if they have versions that mandate presence
- // of the features which we already know are unavailable on this
- // target).
- if (!CallerIsFMV)
- // We can't reason much about non-FMV callers. Just pick the highest
- // priority callee if it matches, otherwise bail.
- assert(I == 0 && "Should only select the highest priority candidate");
-
Function *Callee = Callees[I];
APInt CallerBits = FeatureMask[Caller];
APInt CalleeBits = FeatureMask[Callee];
+
// If the feature set of the caller implies the feature set of the
- // highest priority candidate then it shall be picked.
+ // callee then all the callsites can be statically resolved.
if (CalleeBits.isSubsetOf(CallerBits)) {
- // If there are no records of call sites for this particular function
- // version, then it is not actually a caller, in which case skip.
- if (auto It = CallSites.find(Caller); It != CallSites.end()) {
- for (CallBase *CS : It->second) {
- LLVM_DEBUG(dbgs() << "Redirecting call " << Caller->getName()
- << " -> " << Callee->getName() << "\n");
- CS->setCalledOperand(Callee);
- }
- Changed = true;
+ auto &Calls = CallSites[Caller];
+ for (CallBase *CS : Calls) {
+ LLVM_DEBUG(dbgs() << "Redirecting call " << Caller->getName()
+ << " -> " << Callee->getName() << "\n");
+ CS->setCalledOperand(Callee);
}
+ Changed = true;
}
- // Keep advancing the candidate index as long as the caller's
- // features are a subset of the current candidate's.
- if (CallerIsFMV) {
- while (CallerBits.isSubsetOf(CalleeBits)) {
- if (++I == Callees.size())
- break;
- CalleeBits = FeatureMask[Callees[I]];
- }
+
+ // Nothing else to do about non-FMV callers.
+ if (!GetTTI(*Caller).isMultiversionedFunction(*Caller))
+ continue;
+
+ // Subsequent iterations of the outermost loop (set of callers)
+ // will consider the caller of the current iteration unavailable.
+ // Therefore we can skip all those callees which depend on it.
+ while (CallerBits.isSubsetOf(CalleeBits)) {
+ if (++I == Callees.size())
+ break;
+ CalleeBits = FeatureMask[Callees[I]];
}
}
};
@@ -2673,15 +2670,13 @@ static bool OptimizeNonTrivialIFuncs(
auto &Callees = VersionedFuncs[CalleeIF];
// Optimize non-FMV calls.
- if (!NonFMVCallers.empty() && OptimizeNonFMVCallers)
+ if (OptimizeNonFMVCallers)
redirectCalls(NonFMVCallers, Callees);
// Optimize FMV calls.
- if (!CallerIFuncs.empty()) {
- for (GlobalIFunc *CallerIF : CallerIFuncs) {
- auto &Callers = VersionedFuncs[CallerIF];
- redirectCalls(Callers, Callees);
- }
+ for (GlobalIFunc *CallerIF : CallerIFuncs) {
+ auto &Callers = VersionedFuncs[CallerIF];
+ redirectCalls(Callers, Callees);
}
if (CalleeIF->use_empty() ||
>From 17c09218136da044ee35232f784e7b2e6d041fc7 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Mon, 3 Nov 2025 22:48:42 +0000
Subject: [PATCH 09/11] * Add debug messages * Keep the caller IFUNCs in a set
to avoid examining them multiple times * Skip caller versions without
callsites (fixes infinite loop)
---
llvm/lib/Transforms/IPO/GlobalOpt.cpp | 31 +++++++++++++++++++--------
1 file changed, 22 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 50478902406e8..a3bb3f213f816 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2540,6 +2540,8 @@ static bool OptimizeNonTrivialIFuncs(
SmallVector<GlobalIFunc *> IFuncs;
for (GlobalIFunc &IF : M.ifuncs()) {
+ LLVM_DEBUG(dbgs() << "Examining IFUNC " << IF.getName() << "\n");
+
if (IF.isInterposable())
continue;
@@ -2581,7 +2583,7 @@ static bool OptimizeNonTrivialIFuncs(
for (GlobalIFunc *CalleeIF : IFuncs) {
SmallVector<Function *> NonFMVCallers;
- SmallVector<GlobalIFunc *> CallerIFuncs;
+ DenseSet<GlobalIFunc *> CallerIFuncs;
DenseMap<Function *, SmallVector<CallBase *>> CallSites;
// Find the callsites.
@@ -2606,7 +2608,7 @@ static bool OptimizeNonTrivialIFuncs(
auto [It, Inserted] = CallSites.try_emplace(Caller);
if (Inserted) {
if (CallerIsFMV)
- CallerIFuncs.push_back(CallerIF);
+ CallerIFuncs.insert(CallerIF);
else
NonFMVCallers.push_back(Caller);
}
@@ -2615,6 +2617,9 @@ static bool OptimizeNonTrivialIFuncs(
}
}
+ if (CallSites.empty())
+ continue;
+
LLVM_DEBUG(dbgs() << "Statically resolving calls to function "
<< CalleeIF->getResolverFunction()->getName() << "\n");
@@ -2633,6 +2638,12 @@ static bool OptimizeNonTrivialIFuncs(
unsigned I = 0;
for (Function *const &Caller : Callers) {
+ bool CallerIsFMV = GetTTI(*Caller).isMultiversionedFunction(*Caller);
+
+ LLVM_DEBUG(dbgs() << " Examining "
+ << (CallerIsFMV ? "FMV" : "regular") << " caller "
+ << Caller->getName() << "\n");
+
if (I == Callees.size())
break;
@@ -2643,17 +2654,19 @@ static bool OptimizeNonTrivialIFuncs(
// If the feature set of the caller implies the feature set of the
// callee then all the callsites can be statically resolved.
if (CalleeBits.isSubsetOf(CallerBits)) {
- auto &Calls = CallSites[Caller];
- for (CallBase *CS : Calls) {
- LLVM_DEBUG(dbgs() << "Redirecting call " << Caller->getName()
- << " -> " << Callee->getName() << "\n");
- CS->setCalledOperand(Callee);
+ // Not all caller versions are necessarily users of the callee IFUNC.
+ if (auto It = CallSites.find(Caller); It != CallSites.end()) {
+ for (CallBase *CS : It->second) {
+ LLVM_DEBUG(dbgs() << " Redirecting call " << Caller->getName()
+ << " -> " << Callee->getName() << "\n");
+ CS->setCalledOperand(Callee);
+ }
+ Changed = true;
}
- Changed = true;
}
// Nothing else to do about non-FMV callers.
- if (!GetTTI(*Caller).isMultiversionedFunction(*Caller))
+ if (!CallerIsFMV)
continue;
// Subsequent iterations of the outermost loop (set of callers)
>From 6f402c0b66404d1502cf6ac40526ee9fe1dc630a Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Thu, 6 Nov 2025 16:09:08 +0000
Subject: [PATCH 10/11] Changes: * rewrite comments * move early exit to the
beginning of the loop * rename lambda * pass constant arg instead of
recomputing * rewrite loop for advancing the callee index
---
llvm/lib/Transforms/IPO/GlobalOpt.cpp | 54 ++++++++++++++-------------
1 file changed, 29 insertions(+), 25 deletions(-)
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index a3bb3f213f816..9bcc986d99571 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2515,14 +2515,20 @@ collectVersions(Value *V, SmallVectorImpl<Function *> &Versions,
// we identify the function versions which are associated with an IFUNC symbol.
// We do that by examining the resolver function of the IFUNC. Once we have
// collected all the function versions, we sort them in decreasing priority
-// order. This is necessary for identifying the highest priority callee version
-// for a given caller version. We then collect all the callsites to versioned
+// order. This is necessary for determining the most suitable callee version
+// for each caller version. We then collect all the callsites to versioned
// functions. The static resolution is performed by comparing the feature sets
-// between callers and callees. Versions of the callee may be skipped if they
-// depend on features we already know are unavailable. This information can
-// be deduced on each subsequent iteration of the set of caller versions: prior
-// iterations correspond to higher priority caller versions which would not have
-// been selected in a hypothetical runtime execution.
+// between callers and callees. Specifically:
+// * Start a walk over caller and callee lists simultaneously in order of
+// decreasing priority.
+// * Statically resolve calls from the current caller to the current callee,
+// iff the caller feature bits are a superset of the callee feature bits.
+// * For FMV callers, as long as the caller feature bits are a subset of the
+// callee feature bits, advance to the next callee. This effectively prevents
+// considering the current callee as a candidate for static resolution by
+// following callers (explanation: preceding callers would not have been
+// selected in a hypothetical runtime execution).
+// * Advance to the next caller.
//
// Presentation in EuroLLVM2025:
// https://www.youtube.com/watch?v=k54MFimPz-A&t=867s
@@ -2632,27 +2638,27 @@ static bool OptimizeNonTrivialIFuncs(
// infer that mops is unavailable. This would allow us to skip callee
// versions which depend on mops. I tried implementing this but the
// complexity was cubic :/
- auto redirectCalls = [&](ArrayRef<Function *> Callers,
- ArrayRef<Function *> Callees) {
+ auto staticallyResolveCalls = [&](ArrayRef<Function *> Callers,
+ ArrayRef<Function *> Callees,
+ bool CallerIsFMV) {
// Index to the highest callee candidate.
unsigned I = 0;
for (Function *const &Caller : Callers) {
- bool CallerIsFMV = GetTTI(*Caller).isMultiversionedFunction(*Caller);
+ if (I == Callees.size())
+ break;
LLVM_DEBUG(dbgs() << " Examining "
<< (CallerIsFMV ? "FMV" : "regular") << " caller "
<< Caller->getName() << "\n");
- if (I == Callees.size())
- break;
-
Function *Callee = Callees[I];
APInt CallerBits = FeatureMask[Caller];
APInt CalleeBits = FeatureMask[Callee];
- // If the feature set of the caller implies the feature set of the
- // callee then all the callsites can be statically resolved.
+ // Statically resolve calls from the current caller to the current
+ // callee, iff the caller feature bits are a superset of the callee
+ // feature bits.
if (CalleeBits.isSubsetOf(CallerBits)) {
// Not all caller versions are necessarily users of the callee IFUNC.
if (auto It = CallSites.find(Caller); It != CallSites.end()) {
@@ -2669,14 +2675,12 @@ static bool OptimizeNonTrivialIFuncs(
if (!CallerIsFMV)
continue;
- // Subsequent iterations of the outermost loop (set of callers)
- // will consider the caller of the current iteration unavailable.
- // Therefore we can skip all those callees which depend on it.
- while (CallerBits.isSubsetOf(CalleeBits)) {
- if (++I == Callees.size())
- break;
- CalleeBits = FeatureMask[Callees[I]];
- }
+ // For FMV callers, as long as the caller feature bits are a subset of
+ // the callee feature bits, advance to the next callee. This effectively
+ // prevents considering the current callee as a candidate for static
+ // resolution by following callers.
+ while (CallerBits.isSubsetOf(FeatureMask[Callees[I]]) &&
+ ++I < Callees.size());
}
};
@@ -2684,12 +2688,12 @@ static bool OptimizeNonTrivialIFuncs(
// Optimize non-FMV calls.
if (OptimizeNonFMVCallers)
- redirectCalls(NonFMVCallers, Callees);
+ staticallyResolveCalls(NonFMVCallers, Callees, /*CallerIsFMV=*/false);
// Optimize FMV calls.
for (GlobalIFunc *CallerIF : CallerIFuncs) {
auto &Callers = VersionedFuncs[CallerIF];
- redirectCalls(Callers, Callees);
+ staticallyResolveCalls(Callers, Callees, /*CallerIsFMV=*/true);
}
if (CalleeIF->use_empty() ||
>From e8596338d51de3a63daa5669a6d94a4faa0a8485 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Thu, 6 Nov 2025 16:17:53 +0000
Subject: [PATCH 11/11] clang format
---
llvm/lib/Transforms/IPO/GlobalOpt.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 9bcc986d99571..c3dede31540d6 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2680,7 +2680,8 @@ static bool OptimizeNonTrivialIFuncs(
// prevents considering the current callee as a candidate for static
// resolution by following callers.
while (CallerBits.isSubsetOf(FeatureMask[Callees[I]]) &&
- ++I < Callees.size());
+ ++I < Callees.size())
+ ;
}
};
More information about the llvm-commits
mailing list