[llvm] [FMV][GlobalOpt] Statically resolve calls to versioned functions. (PR #87939)
Alexandros Lamprineas via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 13 12:04:22 PST 2024
https://github.com/labrinea updated https://github.com/llvm/llvm-project/pull/87939
>From 02bd5a7013c558f1e5220fc89bafa68f40276549 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Sun, 7 Apr 2024 21:06:47 +0100
Subject: [PATCH 1/2] [FMV][GlobalOpt] Bypass the IFunc Resolver of
MultiVersioned functions.
To deduce whether the optimization is legal we need to compare the target
features between caller and callee versions. The criteria for bypassing
the resolver are the following:
* If the callee's feature set is a subset of the caller's feature set,
then the callee is a candidate for direct call.
* Among such candidates the one of highest priority is the best match
and it shall be picked, unless there is a version of the callee with
higher priority than the best match which cannot be picked from a
higher priority caller (directly or through the resolver).
* For every higher priority callee version than the best match, there
is a higher priority caller version whose feature set availability
is implied by the callee's feature set.
Example:
Callers and Callees are ordered in decreasing priority.
The arrows indicate successful call redirections.
    Caller       Callee     Explanation
=========================================================================
mops+sve2 --+--> mops       all the callee versions are subsets of the
            |               caller but mops has the highest priority
            |
     mops --+    sve2       between mops and default callees, mops wins
      sve        sve        between sve and default callees, sve wins
                            but sve2 does not have a high priority caller
  default -----> default    sve (callee) implies sve (caller),
                            sve2(callee) implies sve (caller),
                            mops(callee) implies mops(caller)
---
.../llvm/Analysis/TargetTransformInfo.h | 14 +
.../llvm/Analysis/TargetTransformInfoImpl.h | 4 +
.../llvm/TargetParser/AArch64TargetParser.h | 4 +-
llvm/lib/Analysis/TargetTransformInfo.cpp | 6 +
.../AArch64/AArch64TargetTransformInfo.cpp | 8 +
.../AArch64/AArch64TargetTransformInfo.h | 4 +
llvm/lib/TargetParser/AArch64TargetParser.cpp | 17 +-
llvm/lib/Transforms/IPO/GlobalOpt.cpp | 141 +++++-
.../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 412 ++++++++++++++++++
9 files changed, 604 insertions(+), 6 deletions(-)
create mode 100644 llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index fa9392b86c15b9..49adecbc81e2bb 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1762,6 +1762,12 @@ class TargetTransformInfo {
/// false, but it shouldn't matter what it returns anyway.
bool hasArmWideBranch(bool Thumb) const;
+ /// Returns true if the target supports Function MultiVersioning.
+ bool hasFMV() const;
+
+ /// Returns a bitmask constructed from the target features of a function.
+ uint64_t getFeatureMask(Function &F) const;
+
/// \return The maximum number of function arguments the target supports.
unsigned getMaxNumArgs() const;
@@ -2152,6 +2158,8 @@ class TargetTransformInfo::Concept {
virtual VPLegalization
getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
virtual bool hasArmWideBranch(bool Thumb) const = 0;
+ virtual bool hasFMV() const = 0;
+ virtual uint64_t getFeatureMask(Function &F) const = 0;
virtual unsigned getMaxNumArgs() const = 0;
};
@@ -2904,6 +2912,12 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
return Impl.hasArmWideBranch(Thumb);
}
+ bool hasFMV() const override { return Impl.hasFMV(); }
+
+ uint64_t getFeatureMask(Function &F) const override {
+ return Impl.getFeatureMask(F);
+ }
+
unsigned getMaxNumArgs() const override {
return Impl.getMaxNumArgs();
}
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 63c2ef8912b29c..6b8cae928ff6e9 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -941,6 +941,10 @@ class TargetTransformInfoImplBase {
bool hasArmWideBranch(bool) const { return false; }
+ bool hasFMV() const { return false; }
+
+ uint64_t getFeatureMask(Function &F) const { return 0; }
+
unsigned getMaxNumArgs() const { return UINT_MAX; }
protected:
diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index 805b963a7a13c7..152cfee8cf373d 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -846,6 +846,7 @@ const ArchInfo *getArchForCpu(StringRef CPU);
// Parser
const ArchInfo *parseArch(StringRef Arch);
std::optional<ExtensionInfo> parseArchExtension(StringRef Extension);
+std::optional<ExtensionInfo> parseTargetFeature(StringRef Feature);
// Given the name of a CPU or alias, return the correponding CpuInfo.
std::optional<CpuInfo> parseCpu(StringRef Name);
// Used by target parser tests
@@ -856,7 +857,8 @@ bool isX18ReservedByDefault(const Triple &TT);
// For given feature names, return a bitmask corresponding to the entries of
// AArch64::CPUFeatures. The values in CPUFeatures are not bitmasks
// themselves, they are sequential (0, 1, 2, 3, ...).
-uint64_t getCpuSupportsMask(ArrayRef<StringRef> FeatureStrs);
+uint64_t getCpuSupportsMask(ArrayRef<StringRef> FeatureStrs,
+ bool IsBackEndFeature = false);
void PrintSupportedExtensions(StringMap<StringRef> DescMap);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 5f933b4587843c..3caca8a417d3ee 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1296,6 +1296,12 @@ bool TargetTransformInfo::hasArmWideBranch(bool Thumb) const {
return TTIImpl->hasArmWideBranch(Thumb);
}
+bool TargetTransformInfo::hasFMV() const { return TTIImpl->hasFMV(); }
+
+uint64_t TargetTransformInfo::getFeatureMask(Function &F) const {
+ return TTIImpl->getFeatureMask(F);
+}
+
unsigned TargetTransformInfo::getMaxNumArgs() const {
return TTIImpl->getMaxNumArgs();
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index ee7137b92445bb..e68565ed16f06f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -21,6 +21,7 @@
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
+#include "llvm/TargetParser/AArch64TargetParser.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
@@ -231,6 +232,13 @@ static bool hasPossibleIncompatibleOps(const Function *F) {
return false;
}
+uint64_t AArch64TTIImpl::getFeatureMask(Function &F) const {
+ StringRef FeatureStr = F.getFnAttribute("target-features").getValueAsString();
+ SmallVector<StringRef, 8> Features;
+ FeatureStr.split(Features, ",");
+ return AArch64::getCpuSupportsMask(Features, /*IsBackEndFeature = */ true);
+}
+
bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index de39dea2be43e1..fe275341930ba5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -83,6 +83,10 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call,
unsigned DefaultCallPenalty) const;
+ bool hasFMV() const { return ST->hasFMV(); }
+
+ uint64_t getFeatureMask(Function &F) const;
+
/// \name Scalar TTI Implementations
/// @{
diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp
index 71099462d5ecff..5eecde791a0336 100644
--- a/llvm/lib/TargetParser/AArch64TargetParser.cpp
+++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp
@@ -47,12 +47,13 @@ std::optional<AArch64::ArchInfo> AArch64::ArchInfo::findBySubArch(StringRef SubA
return {};
}
-uint64_t AArch64::getCpuSupportsMask(ArrayRef<StringRef> FeatureStrs) {
+uint64_t AArch64::getCpuSupportsMask(ArrayRef<StringRef> FeatureStrs,
+ bool IsBackEndFeature) {
uint64_t FeaturesMask = 0;
- for (const StringRef &FeatureStr : FeatureStrs) {
- if (auto Ext = parseArchExtension(FeatureStr))
+ for (const StringRef FeatureStr : FeatureStrs)
+ if (auto Ext = IsBackEndFeature ? parseTargetFeature(FeatureStr)
+ : parseArchExtension(FeatureStr))
FeaturesMask |= (1ULL << Ext->CPUFeature);
- }
return FeaturesMask;
}
@@ -132,6 +133,14 @@ std::optional<AArch64::ExtensionInfo> AArch64::parseArchExtension(StringRef Arch
return {};
}
+std::optional<AArch64::ExtensionInfo>
+AArch64::parseTargetFeature(StringRef Feature) {
+ for (const auto &E : Extensions)
+ if (Feature == E.Feature)
+ return E;
+ return {};
+}
+
std::optional<AArch64::CpuInfo> AArch64::parseCpu(StringRef Name) {
// Resolve aliases first.
Name = resolveCPUAlias(Name);
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index da714c9a75701b..159362058ef42c 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -89,7 +89,7 @@ STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
STATISTIC(NumInternalFunc, "Number of internal functions");
STATISTIC(NumColdCC, "Number of functions marked coldcc");
-STATISTIC(NumIFuncsResolved, "Number of statically resolved IFuncs");
+STATISTIC(NumIFuncsResolved, "Number of resolved IFuncs");
STATISTIC(NumIFuncsDeleted, "Number of IFuncs removed");
static cl::opt<bool>
@@ -2462,6 +2462,142 @@ DeleteDeadIFuncs(Module &M,
return Changed;
}
+// Follows the use-def chain of \p V backwards, looking through select and phi
+// instructions, and collects every Function it reaches in \p Versions.
+static void collectVersions(Value *V, SmallVectorImpl<Function *> &Versions) {
+ if (auto *F = dyn_cast<Function>(V)) {
+ Versions.push_back(F);
+ } else if (auto *Sel = dyn_cast<SelectInst>(V)) {
+ collectVersions(Sel->getTrueValue(), Versions);
+ collectVersions(Sel->getFalseValue(), Versions);
+ } else if (auto *Phi = dyn_cast<PHINode>(V)) {
+ for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I)
+ collectVersions(Phi->getIncomingValue(I), Versions);
+ }
+}
+
+// Bypass the IFunc Resolver of MultiVersioned functions when possible. To
+// deduce whether the optimization is legal we need to compare the target
+// features between caller and callee versions. The criteria for bypassing
+// the resolver are the following:
+//
+// * If the callee's feature set is a subset of the caller's feature set,
+// then the callee is a candidate for direct call.
+//
+// * Among such candidates the one of highest priority is the best match
+// and it shall be picked, unless there is a version of the callee with
+// higher priority than the best match which cannot be picked from a
+// higher priority caller (directly or through the resolver).
+//
+// * For every higher priority callee version than the best match, there
+// is a higher priority caller version whose feature set availability
+// is implied by the callee's feature set.
+//
+static bool OptimizeNonTrivialIFuncs(
+ Module &M, function_ref<TargetTransformInfo &(Function &)> GetTTI) {
+ bool Changed = false;
+
+ // Cache containing the mask constructed from a function's target features.
+ DenseMap<Function *, uint64_t> FeatureMask;
+
+ for (GlobalIFunc &IF : M.ifuncs()) {
+ if (IF.isInterposable())
+ continue;
+
+ Function *Resolver = IF.getResolverFunction();
+ if (!Resolver)
+ continue;
+
+ if (Resolver->isInterposable())
+ continue;
+
+ TargetTransformInfo &TTI = GetTTI(*Resolver);
+ if (!TTI.hasFMV())
+ return false;
+
+ // Discover the callee versions.
+ SmallVector<Function *> Callees;
+ for (BasicBlock &BB : *Resolver)
+ if (auto *Ret = dyn_cast_or_null<ReturnInst>(BB.getTerminator()))
+ collectVersions(Ret->getReturnValue(), Callees);
+
+ if (Callees.empty())
+ continue;
+
+ // Cache the feature mask for each callee.
+ for (Function *Callee : Callees) {
+ auto [It, Inserted] = FeatureMask.try_emplace(Callee);
+ if (Inserted)
+ It->second = TTI.getFeatureMask(*Callee);
+ }
+
+ // Sort the callee versions in decreasing priority order.
+ sort(Callees, [&](auto *LHS, auto *RHS) {
+ return FeatureMask[LHS] > FeatureMask[RHS];
+ });
+
+ // Find the callsites and cache the feature mask for each caller.
+ SmallVector<Function *> Callers;
+ DenseMap<Function *, SmallVector<CallBase *>> CallSites;
+ for (User *U : IF.users()) {
+ if (auto *CB = dyn_cast<CallBase>(U)) {
+ if (CB->getCalledOperand() == &IF) {
+ Function *Caller = CB->getFunction();
+ auto [FeatIt, FeatInserted] = FeatureMask.try_emplace(Caller);
+ if (FeatInserted)
+ FeatIt->second = TTI.getFeatureMask(*Caller);
+ auto [CallIt, CallInserted] = CallSites.try_emplace(Caller);
+ if (CallInserted)
+ Callers.push_back(Caller);
+ CallIt->second.push_back(CB);
+ }
+ }
+ }
+
+ // Sort the caller versions in decreasing priority order.
+ sort(Callers, [&](auto *LHS, auto *RHS) {
+ return FeatureMask[LHS] > FeatureMask[RHS];
+ });
+
+ auto implies = [](uint64_t A, uint64_t B) { return (A & B) == B; };
+
+ // Index to the highest priority candidate.
+ unsigned I = 0;
+ // Now try to redirect calls starting from higher priority callers.
+ for (Function *Caller : Callers) {
+ // Getting here means we found callers of equal priority.
+ if (I == Callees.size())
+ break;
+ Function *Callee = Callees[I];
+ uint64_t CallerBits = FeatureMask[Caller];
+ uint64_t CalleeBits = FeatureMask[Callee];
+ // If the feature set of the caller implies the feature set of the
+ // highest priority candidate then it shall be picked. In case of
+ // identical sets advance the candidate index one position.
+ if (CallerBits == CalleeBits)
+ ++I;
+ else if (!implies(CallerBits, CalleeBits)) {
+ // Keep advancing the candidate index as long as the caller's
+ // features are a subset of the current candidate's.
+ while (implies(CalleeBits, CallerBits)) {
+ if (++I == Callees.size())
+ break;
+ CalleeBits = FeatureMask[Callees[I]];
+ }
+ continue;
+ }
+ auto &Calls = CallSites[Caller];
+ for (CallBase *CS : Calls)
+ CS->setCalledOperand(Callee);
+ Changed = true;
+ }
+ if (IF.use_empty() ||
+ all_of(IF.users(), [](User *U) { return isa<GlobalAlias>(U); }))
+ NumIFuncsResolved++;
+ }
+ return Changed;
+}
+
static bool
optimizeGlobalsInModule(Module &M, const DataLayout &DL,
function_ref<TargetLibraryInfo &(Function &)> GetTLI,
@@ -2525,6 +2661,9 @@ optimizeGlobalsInModule(Module &M, const DataLayout &DL,
// Optimize IFuncs whose callee's are statically known.
LocalChange |= OptimizeStaticIFuncs(M);
+ // Optimize IFuncs based on the target features of the caller.
+ LocalChange |= OptimizeNonTrivialIFuncs(M, GetTTI);
+
// Remove any IFuncs that are now dead.
LocalChange |= DeleteDeadIFuncs(M, NotDiscardableComdats);
diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
new file mode 100644
index 00000000000000..2805ce6fb2a3dc
--- /dev/null
+++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
@@ -0,0 +1,412 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_same_priority_callers)" --version 4
+; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+$test_single_bb_resolver.resolver = comdat any
+$test_multi_bb_resolver.resolver = comdat any
+$test_caller_feats_not_implied.resolver = comdat any
+$test_same_priority_callers.resolver = comdat any
+$foo.resolver = comdat any
+$bar.resolver = comdat any
+$goo.resolver = comdat any
+$baz.resolver = comdat any
+
+ at __aarch64_cpu_features = external local_unnamed_addr global { i64 }
+
+ at test_single_bb_resolver.ifunc = weak_odr alias i32 (), ptr @test_single_bb_resolver
+ at test_multi_bb_resolver.ifunc = weak_odr alias i32 (), ptr @test_multi_bb_resolver
+ at test_caller_feats_not_implied.ifunc = weak_odr alias i32 (), ptr @test_caller_feats_not_implied
+ at test_same_priority_callers.ifunc = weak_odr alias i32 (), ptr @test_same_priority_callers
+ at foo.ifunc = weak_odr alias i32 (), ptr @foo
+ at bar.ifunc = weak_odr alias i32 (), ptr @bar
+ at goo.ifunc = weak_odr alias i32 (), ptr @goo
+ at baz.ifunc = weak_odr alias i32 (), ptr @baz
+
+ at test_single_bb_resolver = weak_odr ifunc i32 (), ptr @test_single_bb_resolver.resolver
+ at test_multi_bb_resolver = weak_odr ifunc i32 (), ptr @test_multi_bb_resolver.resolver
+ at test_caller_feats_not_implied = weak_odr ifunc i32 (), ptr @test_caller_feats_not_implied.resolver
+ at test_same_priority_callers = weak_odr ifunc i32 (), ptr @test_same_priority_callers.resolver
+ at foo = weak_odr ifunc i32 (), ptr @foo.resolver
+ at bar = weak_odr ifunc i32 (), ptr @bar.resolver
+ at goo = weak_odr ifunc i32 (), ptr @goo.resolver
+ at baz = weak_odr ifunc i32 (), ptr @baz.resolver
+
+declare void @__init_cpu_features_resolver() local_unnamed_addr
+
+declare i32 @test_single_bb_resolver._Msve() #2
+
+declare i32 @test_single_bb_resolver._Msve2() #3
+
+define i32 @test_single_bb_resolver.default() #1 {
+; CHECK-LABEL: define i32 @test_single_bb_resolver.default(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+entry:
+ ret i32 0
+}
+
+define weak_odr ptr @test_single_bb_resolver.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @test_single_bb_resolver.resolver(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 68719476736
+ %.not = icmp eq i64 %1, 0
+ %2 = and i64 %0, 1073741824
+ %.not3 = icmp eq i64 %2, 0
+ %test_single_bb_resolver._Msve.test_single_bb_resolver.default = select i1 %.not3, ptr @test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve
+ %common.ret.op = select i1 %.not, ptr %test_single_bb_resolver._Msve.test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve2
+ ret ptr %common.ret.op
+}
+
+define i32 @foo._Msve() #2 {
+; CHECK-LABEL: define i32 @foo._Msve(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve()
+;
+entry:
+ %call = tail call i32 @test_single_bb_resolver()
+ %add = add nsw i32 %call, 30
+ ret i32 %add
+}
+
+define i32 @foo._Msve2() #3 {
+; CHECK-LABEL: define i32 @foo._Msve2(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK: [[CALL1:%.*]] = tail call i32 @test_single_bb_resolver._Msve2()
+; CHECK: [[CALL2:%.*]] = tail call i32 @test_single_bb_resolver._Msve2()
+;
+entry:
+ %call1 = tail call i32 @test_single_bb_resolver()
+ %call2 = tail call i32 @test_single_bb_resolver()
+ %added = add nsw i32 %call1, %call2
+ %add = add nsw i32 %added, 20
+ ret i32 %add
+}
+
+define i32 @foo.default() #1 {
+; CHECK-LABEL: define i32 @foo.default(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver.default()
+;
+entry:
+ %call = tail call i32 @test_single_bb_resolver()
+ %add = add nsw i32 %call, 10
+ ret i32 %add
+}
+
+define weak_odr ptr @foo.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @foo.resolver(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 68719476736
+ %.not = icmp eq i64 %1, 0
+ %2 = and i64 %0, 1073741824
+ %.not3 = icmp eq i64 %2, 0
+ %foo._Msve.foo.default = select i1 %.not3, ptr @foo.default, ptr @foo._Msve
+ %common.ret.op = select i1 %.not, ptr %foo._Msve.foo.default, ptr @foo._Msve2
+ ret ptr %common.ret.op
+}
+
+define i32 @test_multi_bb_resolver._Mmops() #4 {
+; CHECK-LABEL: define i32 @test_multi_bb_resolver._Mmops(
+; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
+entry:
+ ret i32 3
+}
+
+define i32 @test_multi_bb_resolver._Msve2() #3 {
+; CHECK-LABEL: define i32 @test_multi_bb_resolver._Msve2(
+; CHECK-SAME: ) #[[ATTR1]] {
+entry:
+ ret i32 2
+}
+
+define i32 @test_multi_bb_resolver._Msve() #2 {
+; CHECK-LABEL: define i32 @test_multi_bb_resolver._Msve(
+; CHECK-SAME: ) #[[ATTR0]] {
+entry:
+ ret i32 1
+}
+
+define i32 @test_multi_bb_resolver.default() #1 {
+; CHECK-LABEL: define i32 @test_multi_bb_resolver.default(
+; CHECK-SAME: ) #[[ATTR2]] {
+entry:
+ ret i32 0
+}
+
+define weak_odr ptr @test_multi_bb_resolver.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @test_multi_bb_resolver.resolver(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 576460752303423488
+ %.not = icmp eq i64 %1, 0
+ br i1 %.not, label %resolver_else, label %common.ret
+
+common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry
+ %common.ret.op = phi ptr [ @test_multi_bb_resolver._Mmops, %resolver_entry ], [ @test_multi_bb_resolver._Msve2, %resolver_else ], [ %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default, %resolver_else2 ]
+ ret ptr %common.ret.op
+
+resolver_else: ; preds = %resolver_entry
+ %2 = and i64 %0, 68719476736
+ %.not5 = icmp eq i64 %2, 0
+ br i1 %.not5, label %resolver_else2, label %common.ret
+
+resolver_else2: ; preds = %resolver_else
+ %3 = and i64 %0, 1073741824
+ %.not6 = icmp eq i64 %3, 0
+ %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default = select i1 %.not6, ptr @test_multi_bb_resolver.default, ptr @test_multi_bb_resolver._Msve
+ br label %common.ret
+}
+
+define i32 @bar._MmopsMsve2() #5 {
+; CHECK-LABEL: define i32 @bar._MmopsMsve2(
+; CHECK-SAME: ) #[[ATTR5:[0-9]+]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops()
+;
+entry:
+ %call = tail call i32 @test_multi_bb_resolver()
+ %add = add nsw i32 %call, 40
+ ret i32 %add
+}
+
+define i32 @bar._Mmops() #4 {
+; CHECK-LABEL: define i32 @bar._Mmops(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR4:[0-9]+]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops()
+;
+entry:
+ %call = tail call i32 @test_multi_bb_resolver()
+ %add = add nsw i32 %call, 30
+ ret i32 %add
+}
+
+define i32 @bar._Msve() #2 {
+; CHECK-LABEL: define i32 @bar._Msve(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver()
+;
+entry:
+ %call = tail call i32 @test_multi_bb_resolver()
+ %add = add nsw i32 %call, 20
+ ret i32 %add
+}
+
+define i32 @bar.default() #1 {
+; CHECK-LABEL: define i32 @bar.default(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver.default()
+;
+entry:
+ %call = tail call i32 @test_multi_bb_resolver()
+ %add = add nsw i32 %call, 10
+ ret i32 %add
+}
+
+define weak_odr ptr @bar.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @bar.resolver(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 576460821022900224
+ %2 = icmp eq i64 %1, 576460821022900224
+ %3 = and i64 %0, 1073741824
+ %.not = icmp eq i64 %3, 0
+ %bar._Msve.bar.default = select i1 %.not, ptr @bar.default, ptr @bar._Msve
+ %common.ret.op = select i1 %2, ptr @bar._MmopsMsve2, ptr %bar._Msve.bar.default
+ ret ptr %common.ret.op
+}
+
+define i32 @test_caller_feats_not_implied._Mmops() #4 {
+; CHECK-LABEL: define i32 @test_caller_feats_not_implied._Mmops(
+; CHECK-SAME: ) #[[ATTR4]] {
+entry:
+ ret i32 3
+}
+
+define i32 @test_caller_feats_not_implied._Msme() #6 {
+; CHECK-LABEL: define i32 @test_caller_feats_not_implied._Msme(
+; CHECK-SAME: ) #[[ATTR6:[0-9]+]] {
+entry:
+ ret i32 2
+}
+
+define i32 @test_caller_feats_not_implied._Msve() #2 {
+; CHECK-LABEL: define i32 @test_caller_feats_not_implied._Msve(
+; CHECK-SAME: ) #[[ATTR0]] {
+entry:
+ ret i32 1
+}
+
+define i32 @test_caller_feats_not_implied.default() #1 {
+; CHECK-LABEL: define i32 @test_caller_feats_not_implied.default(
+; CHECK-SAME: ) #[[ATTR2]] {
+entry:
+ ret i32 0
+}
+
+define weak_odr ptr @test_caller_feats_not_implied.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @test_caller_feats_not_implied.resolver(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 576460752303423488
+ %.not = icmp eq i64 %1, 0
+ br i1 %.not, label %resolver_else, label %common.ret
+
+common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry
+ %common.ret.op = phi ptr [ @test_caller_feats_not_implied._Mmops, %resolver_entry ], [ @test_caller_feats_not_implied._Msme, %resolver_else ], [ %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default, %resolver_else2 ]
+ ret ptr %common.ret.op
+
+resolver_else: ; preds = %resolver_entry
+ %2 = and i64 %0, 4398046511104
+ %.not5 = icmp eq i64 %2, 0
+ br i1 %.not5, label %resolver_else2, label %common.ret
+
+resolver_else2: ; preds = %resolver_else
+ %3 = and i64 %0, 1073741824
+ %.not6 = icmp eq i64 %3, 0
+ %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default = select i1 %.not6, ptr @test_caller_feats_not_implied.default, ptr @test_caller_feats_not_implied._Msve
+ br label %common.ret
+}
+
+define i32 @goo._Mmops() #4 {
+; CHECK-LABEL: define i32 @goo._Mmops(
+; CHECK-SAME: ) #[[ATTR4]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied._Mmops()
+;
+entry:
+ %call = tail call i32 @test_caller_feats_not_implied()
+ ret i32 %call
+}
+
+define i32 @goo._Msve() #2 {
+; CHECK-LABEL: define i32 @goo._Msve(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied()
+;
+entry:
+ %call = tail call i32 @test_caller_feats_not_implied()
+ ret i32 %call
+}
+
+define i32 @goo.default() #1 {
+; CHECK-LABEL: define i32 @goo.default(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied()
+;
+entry:
+ %call = tail call i32 @test_caller_feats_not_implied()
+ ret i32 %call
+}
+
+define weak_odr ptr @goo.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @goo.resolver(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 576460752303423488
+ %.not = icmp eq i64 %1, 0
+ %2 = and i64 %0, 1073741824
+ %.not3 = icmp eq i64 %2, 0
+ %goo._Msve.goo.default = select i1 %.not3, ptr @goo.default, ptr @goo._Msve
+ %common.ret.op = select i1 %.not, ptr %goo._Msve.goo.default, ptr @goo._Mmops
+ ret ptr %common.ret.op
+}
+
+define i32 @test_same_priority_callers._Msve() #2 {
+; CHECK-LABEL: define i32 @test_same_priority_callers._Msve(
+; CHECK-SAME: ) #[[ATTR0]] {
+entry:
+ ret i32 1
+}
+
+define i32 @test_same_priority_callers.default() #1 {
+; CHECK-LABEL: define i32 @test_same_priority_callers.default(
+; CHECK-SAME: ) #[[ATTR2]] {
+entry:
+ ret i32 0
+}
+
+define weak_odr ptr @test_same_priority_callers.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @test_same_priority_callers.resolver(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 1073741824
+ %.not = icmp eq i64 %1, 0
+ %test_same_priority_callers._Msve.test_same_priority_callers.default = select i1 %.not, ptr @test_same_priority_callers.default, ptr @test_same_priority_callers._Msve
+ ret ptr %test_same_priority_callers._Msve.test_same_priority_callers.default
+}
+
+define dso_local i32 @baz._Msve() #2 {
+; CHECK-LABEL: define dso_local i32 @baz._Msve(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_same_priority_callers._Msve()
+;
+entry:
+ %call = tail call i32 @test_same_priority_callers()
+ ret i32 %call
+}
+
+define i32 @baz._Maes() #1 {
+; CHECK-LABEL: define i32 @baz._Maes(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_same_priority_callers()
+;
+entry:
+ %call = tail call i32 @test_same_priority_callers()
+ ret i32 %call
+}
+
+; This isn't a bug in globalopt, but rather a problematic input.
+; The 'aes' extension does not add any target features on top
+; of what is inherited from the command line.
+;
+; What happens is that, since baz._Maes and baz.default have the same priority,
+; globalopt tries to optimize the call in baz.default first and succeeds, leaving
+; the remaining call in baz._Maes pointing to the resolver.
+;
+define dso_local i32 @baz.default() #1 {
+; CHECK-LABEL: define dso_local i32 @baz.default(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK: [[CALL:%.*]] = tail call i32 @test_same_priority_callers.default()
+;
+entry:
+ %call = tail call i32 @test_same_priority_callers()
+ ret i32 %call
+}
+
+define weak_odr ptr @baz.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @baz.resolver(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat {
+resolver_entry:
+ tail call void @__init_cpu_features_resolver()
+ %0 = load i64, ptr @__aarch64_cpu_features, align 8
+ %1 = and i64 %0, 1073741824
+ %.not = icmp eq i64 %1, 0
+ %2 = and i64 %0, 16384
+ %.not3 = icmp eq i64 %2, 0
+ %baz._Maes.baz.default = select i1 %.not3, ptr @baz.default, ptr @baz._Maes
+ %common.ret.op = select i1 %.not, ptr %baz._Maes.baz.default, ptr @baz._Msve
+ ret ptr %common.ret.op
+}
+
+attributes #0 = { "target-features"="+fmv" }
+attributes #1 = { "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+v8a" }
+attributes #2 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a" }
+attributes #3 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a" }
+attributes #4 = { "target-features"="+fmv,+fp-armv8,+mops,+neon,+outline-atomics,+v8a" }
+attributes #5 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+mops,+neon,+outline-atomics,+sve,+sve2,+v8a" }
+attributes #6 = { "target-features"="+bf16,+fp-armv8,+neon,+outline-atomics,+sme,+v8a" }
>From 16aa3baf9d0c354611b9d270a85c9d458302a92a Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Wed, 13 Nov 2024 19:48:27 +0000
Subject: [PATCH 2/2] Changes from last revision:
* clang format
* remove leftover target hook hasFMV after rebase
* remove filter in regression test after rebase
---
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 2 --
llvm/lib/TargetParser/AArch64TargetParser.cpp | 2 +-
llvm/lib/Transforms/IPO/GlobalOpt.cpp | 2 +-
llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 2 +-
4 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index c4cfd5bfe82cf1..5d6663a4a0c146 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -88,8 +88,6 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call,
unsigned DefaultCallPenalty) const;
- bool hasFMV() const { return ST->hasFMV(); }
-
uint64_t getFeatureMask(Function &F) const;
/// \name Scalar TTI Implementations
diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp
index 0b1a7bacdaa5ab..588ea9a5dba42a 100644
--- a/llvm/lib/TargetParser/AArch64TargetParser.cpp
+++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp
@@ -52,7 +52,7 @@ std::optional<AArch64::FMVInfo>
lookupFMVByID(llvm::AArch64::ArchExtKind ExtID) {
for (const auto &I : llvm::AArch64::getFMVInfo())
if (I.ID && *I.ID == ExtID)
- return I;
+ return I;
return {};
}
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 3c2aba774e69c2..a427fbc2f7ea9b 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2753,7 +2753,7 @@ static bool OptimizeNonTrivialIFuncs(
unsigned I = 0;
// Now try to redirect calls starting from higher priority callers.
for (Function *Caller : Callers) {
- assert (I < Callees.size() && "Found callers of equal priority");
+ assert(I < Callees.size() && "Found callers of equal priority");
Function *Callee = Callees[I];
uint64_t CallerBits = FeatureMask[Caller];
diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
index 91e991a778fa11..8e0072c3416b5f 100644
--- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
+++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_same_priority_callers)" --version 4
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied)" --version 4
; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
More information about the llvm-commits
mailing list