[llvm] [FMV][GlobalOpt] Statically resolve calls to versioned functions. (PR #87939)

Alexandros Lamprineas via llvm-commits llvm-commits at lists.llvm.org
Sat Nov 30 07:04:41 PST 2024


https://github.com/labrinea updated https://github.com/llvm/llvm-project/pull/87939

>From 02bd5a7013c558f1e5220fc89bafa68f40276549 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Sun, 7 Apr 2024 21:06:47 +0100
Subject: [PATCH 1/5] [FMV][GlobalOpt] Bypass the IFunc Resolver of
 MultiVersioned functions.

To deduce whether the optimization is legal we need to compare the target
features between caller and callee versions. The criteria for bypassing
the resolver are the following:

 * If the callee's feature set is a subset of the caller's feature set,
   then the callee is a candidate for direct call.

 * Among such candidates the one of highest priority is the best match,
   and it shall be picked unless there is a callee version with higher
   priority than the best match that cannot be reached from a
   higher-priority caller (either directly or through the resolver).

 * For every callee version with higher priority than the best match,
   there is a higher-priority caller version whose feature set
   availability is implied by the callee's feature set.

Example:

Callers and Callees are ordered in decreasing priority.
The arrows indicate successful call redirections.

  Caller        Callee      Explanation
=========================================================================
mops+sve2 --+--> mops       all the callee versions are subsets of the
            |               caller but mops has the highest priority
            |
     mops --+    sve2       between mops and default callees, mops wins

      sve        sve        between sve and default callees, sve wins
                            but sve2 does not have a high priority caller

  default -----> default    sve (callee) implies sve (caller),
                            sve2(callee) implies sve (caller),
                            mops(callee) implies mops(caller)
---
 .../llvm/Analysis/TargetTransformInfo.h       |  14 +
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   4 +
 .../llvm/TargetParser/AArch64TargetParser.h   |   4 +-
 llvm/lib/Analysis/TargetTransformInfo.cpp     |   6 +
 .../AArch64/AArch64TargetTransformInfo.cpp    |   8 +
 .../AArch64/AArch64TargetTransformInfo.h      |   4 +
 llvm/lib/TargetParser/AArch64TargetParser.cpp |  17 +-
 llvm/lib/Transforms/IPO/GlobalOpt.cpp         | 141 +++++-
 .../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 412 ++++++++++++++++++
 9 files changed, 604 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index fa9392b86c15b9..49adecbc81e2bb 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1762,6 +1762,12 @@ class TargetTransformInfo {
   /// false, but it shouldn't matter what it returns anyway.
   bool hasArmWideBranch(bool Thumb) const;
 
+  /// Returns true if the target supports Function MultiVersioning.
+  bool hasFMV() const;
+
+  /// Returns a bitmask constructed from the target features of a function.
+  uint64_t getFeatureMask(Function &F) const;
+
   /// \return The maximum number of function arguments the target supports.
   unsigned getMaxNumArgs() const;
 
@@ -2152,6 +2158,8 @@ class TargetTransformInfo::Concept {
   virtual VPLegalization
   getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
   virtual bool hasArmWideBranch(bool Thumb) const = 0;
+  virtual bool hasFMV() const = 0;
+  virtual uint64_t getFeatureMask(Function &F) const = 0;
   virtual unsigned getMaxNumArgs() const = 0;
 };
 
@@ -2904,6 +2912,12 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
     return Impl.hasArmWideBranch(Thumb);
   }
 
+  bool hasFMV() const override { return Impl.hasFMV(); }
+
+  uint64_t getFeatureMask(Function &F) const override {
+    return Impl.getFeatureMask(F);
+  }
+
   unsigned getMaxNumArgs() const override {
     return Impl.getMaxNumArgs();
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 63c2ef8912b29c..6b8cae928ff6e9 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -941,6 +941,10 @@ class TargetTransformInfoImplBase {
 
   bool hasArmWideBranch(bool) const { return false; }
 
+  bool hasFMV() const { return false; }
+
+  uint64_t getFeatureMask(Function &F) const { return 0; }
+
   unsigned getMaxNumArgs() const { return UINT_MAX; }
 
 protected:
diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index 805b963a7a13c7..152cfee8cf373d 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -846,6 +846,7 @@ const ArchInfo *getArchForCpu(StringRef CPU);
 // Parser
 const ArchInfo *parseArch(StringRef Arch);
 std::optional<ExtensionInfo> parseArchExtension(StringRef Extension);
+std::optional<ExtensionInfo> parseTargetFeature(StringRef Feature);
 // Given the name of a CPU or alias, return the correponding CpuInfo.
 std::optional<CpuInfo> parseCpu(StringRef Name);
 // Used by target parser tests
@@ -856,7 +857,8 @@ bool isX18ReservedByDefault(const Triple &TT);
 // For given feature names, return a bitmask corresponding to the entries of
 // AArch64::CPUFeatures. The values in CPUFeatures are not bitmasks
 // themselves, they are sequential (0, 1, 2, 3, ...).
-uint64_t getCpuSupportsMask(ArrayRef<StringRef> FeatureStrs);
+uint64_t getCpuSupportsMask(ArrayRef<StringRef> FeatureStrs,
+                            bool IsBackEndFeature = false);
 
 void PrintSupportedExtensions(StringMap<StringRef> DescMap);
 
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 5f933b4587843c..3caca8a417d3ee 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1296,6 +1296,12 @@ bool TargetTransformInfo::hasArmWideBranch(bool Thumb) const {
   return TTIImpl->hasArmWideBranch(Thumb);
 }
 
+bool TargetTransformInfo::hasFMV() const { return TTIImpl->hasFMV(); }
+
+uint64_t TargetTransformInfo::getFeatureMask(Function &F) const {
+  return TTIImpl->getFeatureMask(F);
+}
+
 unsigned TargetTransformInfo::getMaxNumArgs() const {
   return TTIImpl->getMaxNumArgs();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index ee7137b92445bb..e68565ed16f06f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -21,6 +21,7 @@
 #include "llvm/IR/IntrinsicsAArch64.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/TargetParser/AArch64TargetParser.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 #include <algorithm>
@@ -231,6 +232,13 @@ static bool hasPossibleIncompatibleOps(const Function *F) {
   return false;
 }
 
+uint64_t AArch64TTIImpl::getFeatureMask(Function &F) const {
+  StringRef FeatureStr = F.getFnAttribute("target-features").getValueAsString();
+  SmallVector<StringRef, 8> Features;
+  FeatureStr.split(Features, ",");
+  return AArch64::getCpuSupportsMask(Features, /*IsBackEndFeature = */ true);
+}
+
 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                          const Function *Callee) const {
   SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index de39dea2be43e1..fe275341930ba5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -83,6 +83,10 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
   unsigned getInlineCallPenalty(const Function *F, const CallBase &Call,
                                 unsigned DefaultCallPenalty) const;
 
+  bool hasFMV() const { return ST->hasFMV(); }
+
+  uint64_t getFeatureMask(Function &F) const;
+
   /// \name Scalar TTI Implementations
   /// @{
 
diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp
index 71099462d5ecff..5eecde791a0336 100644
--- a/llvm/lib/TargetParser/AArch64TargetParser.cpp
+++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp
@@ -47,12 +47,13 @@ std::optional<AArch64::ArchInfo> AArch64::ArchInfo::findBySubArch(StringRef SubA
   return {};
 }
 
-uint64_t AArch64::getCpuSupportsMask(ArrayRef<StringRef> FeatureStrs) {
+uint64_t AArch64::getCpuSupportsMask(ArrayRef<StringRef> FeatureStrs,
+                                     bool IsBackEndFeature) {
   uint64_t FeaturesMask = 0;
-  for (const StringRef &FeatureStr : FeatureStrs) {
-    if (auto Ext = parseArchExtension(FeatureStr))
+  for (const StringRef FeatureStr : FeatureStrs)
+    if (auto Ext = IsBackEndFeature ? parseTargetFeature(FeatureStr)
+                                    : parseArchExtension(FeatureStr))
       FeaturesMask |= (1ULL << Ext->CPUFeature);
-  }
   return FeaturesMask;
 }
 
@@ -132,6 +133,14 @@ std::optional<AArch64::ExtensionInfo> AArch64::parseArchExtension(StringRef Arch
   return {};
 }
 
+std::optional<AArch64::ExtensionInfo>
+AArch64::parseTargetFeature(StringRef Feature) {
+  for (const auto &E : Extensions)
+    if (Feature == E.Feature)
+      return E;
+  return {};
+}
+
 std::optional<AArch64::CpuInfo> AArch64::parseCpu(StringRef Name) {
   // Resolve aliases first.
   Name = resolveCPUAlias(Name);
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index da714c9a75701b..159362058ef42c 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -89,7 +89,7 @@ STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
 STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
 STATISTIC(NumInternalFunc, "Number of internal functions");
 STATISTIC(NumColdCC, "Number of functions marked coldcc");
-STATISTIC(NumIFuncsResolved, "Number of statically resolved IFuncs");
+STATISTIC(NumIFuncsResolved, "Number of resolved IFuncs");
 STATISTIC(NumIFuncsDeleted, "Number of IFuncs removed");
 
 static cl::opt<bool>
@@ -2462,6 +2462,142 @@ DeleteDeadIFuncs(Module &M,
   return Changed;
 }
 
+// Follows the use-def chain of \p V backwards until it finds a Function,
+// in which case it is collected in \p Versions.
+static void collectVersions(Value *V, SmallVectorImpl<Function *> &Versions) {
+  if (auto *F = dyn_cast<Function>(V)) {
+    Versions.push_back(F);
+  } else if (auto *Sel = dyn_cast<SelectInst>(V)) {
+    collectVersions(Sel->getTrueValue(), Versions);
+    collectVersions(Sel->getFalseValue(), Versions);
+  } else if (auto *Phi = dyn_cast<PHINode>(V)) {
+    for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I)
+      collectVersions(Phi->getIncomingValue(I), Versions);
+  }
+}
+
+// Bypass the IFunc Resolver of MultiVersioned functions when possible. To
+// deduce whether the optimization is legal we need to compare the target
+// features between caller and callee versions. The criteria for bypassing
+// the resolver are the following:
+//
+// * If the callee's feature set is a subset of the caller's feature set,
+//   then the callee is a candidate for direct call.
+//
+// * Among such candidates the one of highest priority is the best match,
+//   and it shall be picked unless there is a callee version with higher
+//   priority than the best match that cannot be reached from a
+//   higher-priority caller (either directly or through the resolver).
+//
+// * For every callee version with higher priority than the best match,
+//   there is a higher-priority caller version whose feature set
+//   availability is implied by the callee's feature set.
+//
+static bool OptimizeNonTrivialIFuncs(
+    Module &M, function_ref<TargetTransformInfo &(Function &)> GetTTI) {
+  bool Changed = false;
+
+  // Cache containing the mask constructed from a function's target features.
+  DenseMap<Function *, uint64_t> FeatureMask;
+
+  for (GlobalIFunc &IF : M.ifuncs()) {
+    if (IF.isInterposable())
+      continue;
+
+    Function *Resolver = IF.getResolverFunction();
+    if (!Resolver)
+      continue;
+
+    if (Resolver->isInterposable())
+      continue;
+
+    TargetTransformInfo &TTI = GetTTI(*Resolver);
+    if (!TTI.hasFMV())
+      return false;
+
+    // Discover the callee versions.
+    SmallVector<Function *> Callees;
+    for (BasicBlock &BB : *Resolver)
+      if (auto *Ret = dyn_cast_or_null<ReturnInst>(BB.getTerminator()))
+        collectVersions(Ret->getReturnValue(), Callees);
+
+    if (Callees.empty())
+      continue;
+
+    // Cache the feature mask for each callee.
+    for (Function *Callee : Callees) {
+      auto [It, Inserted] = FeatureMask.try_emplace(Callee);
+      if (Inserted)
+        It->second = TTI.getFeatureMask(*Callee);
+    }
+
+    // Sort the callee versions in decreasing priority order.
+    sort(Callees, [&](auto *LHS, auto *RHS) {
+      return FeatureMask[LHS] > FeatureMask[RHS];
+    });
+
+    // Find the callsites and cache the feature mask for each caller.
+    SmallVector<Function *> Callers;
+    DenseMap<Function *, SmallVector<CallBase *>> CallSites;
+    for (User *U : IF.users()) {
+      if (auto *CB = dyn_cast<CallBase>(U)) {
+        if (CB->getCalledOperand() == &IF) {
+          Function *Caller = CB->getFunction();
+          auto [FeatIt, FeatInserted] = FeatureMask.try_emplace(Caller);
+          if (FeatInserted)
+            FeatIt->second = TTI.getFeatureMask(*Caller);
+          auto [CallIt, CallInserted] = CallSites.try_emplace(Caller);
+          if (CallInserted)
+            Callers.push_back(Caller);
+          CallIt->second.push_back(CB);
+        }
+      }
+    }
+
+    // Sort the caller versions in decreasing priority order.
+    sort(Callers, [&](auto *LHS, auto *RHS) {
+      return FeatureMask[LHS] > FeatureMask[RHS];
+    });
+
+    auto implies = [](uint64_t A, uint64_t B) { return (A & B) == B; };
+
+    // Index to the highest priority candidate.
+    unsigned I = 0;
+    // Now try to redirect calls starting from higher priority callers.
+    for (Function *Caller : Callers) {
+      // Getting here means we found callers of equal priority.
+      if (I == Callees.size())
+        break;
+      Function *Callee = Callees[I];
+      uint64_t CallerBits = FeatureMask[Caller];
+      uint64_t CalleeBits = FeatureMask[Callee];
+      // If the feature set of the caller implies the feature set of the
+      // highest priority candidate then it shall be picked. In case of
+      // identical sets advance the candidate index one position.
+      if (CallerBits == CalleeBits)
+        ++I;
+      else if (!implies(CallerBits, CalleeBits)) {
+        // Keep advancing the candidate index as long as the caller's
+        // features are a subset of the current candidate's.
+        while (implies(CalleeBits, CallerBits)) {
+          if (++I == Callees.size())
+            break;
+          CalleeBits = FeatureMask[Callees[I]];
+        }
+        continue;
+      }
+      auto &Calls = CallSites[Caller];
+      for (CallBase *CS : Calls)
+        CS->setCalledOperand(Callee);
+      Changed = true;
+    }
+    if (IF.use_empty() ||
+        all_of(IF.users(), [](User *U) { return isa<GlobalAlias>(U); }))
+      NumIFuncsResolved++;
+  }
+  return Changed;
+}
+
 static bool
 optimizeGlobalsInModule(Module &M, const DataLayout &DL,
                         function_ref<TargetLibraryInfo &(Function &)> GetTLI,
@@ -2525,6 +2661,9 @@ optimizeGlobalsInModule(Module &M, const DataLayout &DL,
     // Optimize IFuncs whose callee's are statically known.
     LocalChange |= OptimizeStaticIFuncs(M);
 
+    // Optimize IFuncs based on the target features of the caller.
+    LocalChange |= OptimizeNonTrivialIFuncs(M, GetTTI);
+
     // Remove any IFuncs that are now dead.
     LocalChange |= DeleteDeadIFuncs(M, NotDiscardableComdats);
 
diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
new file mode 100644
index 00000000000000..2805ce6fb2a3dc
--- /dev/null
+++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
@@ -0,0 +1,412 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_same_priority_callers)" --version 4
+; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+$test_single_bb_resolver.resolver = comdat any
+$test_multi_bb_resolver.resolver = comdat any
+$test_caller_feats_not_implied.resolver = comdat any
+$test_same_priority_callers.resolver = comdat any
+$foo.resolver = comdat any
+$bar.resolver = comdat any
+$goo.resolver = comdat any
+$baz.resolver = comdat any
+
+ at __aarch64_cpu_features = external local_unnamed_addr global { i64 }
+
+ at test_single_bb_resolver.ifunc = weak_odr alias i32 (), ptr @test_single_bb_resolver
+ at test_multi_bb_resolver.ifunc = weak_odr alias i32 (), ptr @test_multi_bb_resolver
+ at test_caller_feats_not_implied.ifunc = weak_odr alias i32 (), ptr @test_caller_feats_not_implied
+ at test_same_priority_callers.ifunc = weak_odr alias i32 (), ptr @test_same_priority_callers
+ at foo.ifunc = weak_odr alias i32 (), ptr @foo
+ at bar.ifunc = weak_odr alias i32 (), ptr @bar
+ at goo.ifunc = weak_odr alias i32 (), ptr @goo
+ at baz.ifunc = weak_odr alias i32 (), ptr @baz
+
+ at test_single_bb_resolver = weak_odr ifunc i32 (), ptr @test_single_bb_resolver.resolver
+ at test_multi_bb_resolver = weak_odr ifunc i32 (), ptr @test_multi_bb_resolver.resolver
+ at test_caller_feats_not_implied = weak_odr ifunc i32 (), ptr @test_caller_feats_not_implied.resolver
+ at test_same_priority_callers = weak_odr ifunc i32 (), ptr @test_same_priority_callers.resolver
+ at foo = weak_odr ifunc i32 (), ptr @foo.resolver
+ at bar = weak_odr ifunc i32 (), ptr @bar.resolver
+ at goo = weak_odr ifunc i32 (), ptr @goo.resolver
+ at baz = weak_odr ifunc i32 (), ptr @baz.resolver
+
+declare void @__init_cpu_features_resolver() local_unnamed_addr
+
+declare i32 @test_single_bb_resolver._Msve() #2
+
+declare i32 @test_single_bb_resolver._Msve2() #3
+
+define i32 @test_single_bb_resolver.default() #1 {
+; CHECK-LABEL: define i32 @test_single_bb_resolver.default(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+entry:
+  ret i32 0
+}
+
+define weak_odr ptr @test_single_bb_resolver.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @test_single_bb_resolver.resolver(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat {
+resolver_entry:
+  tail call void @__init_cpu_features_resolver()
+  %0 = load i64, ptr @__aarch64_cpu_features, align 8
+  %1 = and i64 %0, 68719476736
+  %.not = icmp eq i64 %1, 0
+  %2 = and i64 %0, 1073741824
+  %.not3 = icmp eq i64 %2, 0
+  %test_single_bb_resolver._Msve.test_single_bb_resolver.default = select i1 %.not3, ptr @test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve
+  %common.ret.op = select i1 %.not, ptr %test_single_bb_resolver._Msve.test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve2
+  ret ptr %common.ret.op
+}
+
+define i32 @foo._Msve() #2 {
+; CHECK-LABEL: define i32 @foo._Msve(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK:    [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve()
+;
+entry:
+  %call = tail call i32 @test_single_bb_resolver()
+  %add = add nsw i32 %call, 30
+  ret i32 %add
+}
+
+define i32 @foo._Msve2() #3 {
+; CHECK-LABEL: define i32 @foo._Msve2(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK:    [[CALL1:%.*]] = tail call i32 @test_single_bb_resolver._Msve2()
+; CHECK:    [[CALL2:%.*]] = tail call i32 @test_single_bb_resolver._Msve2()
+;
+entry:
+  %call1 = tail call i32 @test_single_bb_resolver()
+  %call2 = tail call i32 @test_single_bb_resolver()
+  %added = add nsw i32 %call1, %call2
+  %add = add nsw i32 %added, 20
+  ret i32 %add
+}
+
+define i32 @foo.default() #1 {
+; CHECK-LABEL: define i32 @foo.default(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK:    [[CALL:%.*]] = tail call i32 @test_single_bb_resolver.default()
+;
+entry:
+  %call = tail call i32 @test_single_bb_resolver()
+  %add = add nsw i32 %call, 10
+  ret i32 %add
+}
+
+define weak_odr ptr @foo.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @foo.resolver(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat {
+resolver_entry:
+  tail call void @__init_cpu_features_resolver()
+  %0 = load i64, ptr @__aarch64_cpu_features, align 8
+  %1 = and i64 %0, 68719476736
+  %.not = icmp eq i64 %1, 0
+  %2 = and i64 %0, 1073741824
+  %.not3 = icmp eq i64 %2, 0
+  %foo._Msve.foo.default = select i1 %.not3, ptr @foo.default, ptr @foo._Msve
+  %common.ret.op = select i1 %.not, ptr %foo._Msve.foo.default, ptr @foo._Msve2
+  ret ptr %common.ret.op
+}
+
+define i32 @test_multi_bb_resolver._Mmops() #4 {
+; CHECK-LABEL: define i32 @test_multi_bb_resolver._Mmops(
+; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
+entry:
+  ret i32 3
+}
+
+define i32 @test_multi_bb_resolver._Msve2() #3 {
+; CHECK-LABEL: define i32 @test_multi_bb_resolver._Msve2(
+; CHECK-SAME: ) #[[ATTR1]] {
+entry:
+  ret i32 2
+}
+
+define i32 @test_multi_bb_resolver._Msve() #2 {
+; CHECK-LABEL: define i32 @test_multi_bb_resolver._Msve(
+; CHECK-SAME: ) #[[ATTR0]] {
+entry:
+  ret i32 1
+}
+
+define i32 @test_multi_bb_resolver.default() #1 {
+; CHECK-LABEL: define i32 @test_multi_bb_resolver.default(
+; CHECK-SAME: ) #[[ATTR2]] {
+entry:
+  ret i32 0
+}
+
+define weak_odr ptr @test_multi_bb_resolver.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @test_multi_bb_resolver.resolver(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat {
+resolver_entry:
+  tail call void @__init_cpu_features_resolver()
+  %0 = load i64, ptr @__aarch64_cpu_features, align 8
+  %1 = and i64 %0, 576460752303423488
+  %.not = icmp eq i64 %1, 0
+  br i1 %.not, label %resolver_else, label %common.ret
+
+common.ret:                                       ; preds = %resolver_else2, %resolver_else, %resolver_entry
+  %common.ret.op = phi ptr [ @test_multi_bb_resolver._Mmops, %resolver_entry ], [ @test_multi_bb_resolver._Msve2, %resolver_else ], [ %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default, %resolver_else2 ]
+  ret ptr %common.ret.op
+
+resolver_else:                                    ; preds = %resolver_entry
+  %2 = and i64 %0, 68719476736
+  %.not5 = icmp eq i64 %2, 0
+  br i1 %.not5, label %resolver_else2, label %common.ret
+
+resolver_else2:                                   ; preds = %resolver_else
+  %3 = and i64 %0, 1073741824
+  %.not6 = icmp eq i64 %3, 0
+  %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default = select i1 %.not6, ptr @test_multi_bb_resolver.default, ptr @test_multi_bb_resolver._Msve
+  br label %common.ret
+}
+
+define i32 @bar._MmopsMsve2() #5 {
+; CHECK-LABEL: define i32 @bar._MmopsMsve2(
+; CHECK-SAME: ) #[[ATTR5:[0-9]+]] {
+; CHECK:    [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops()
+;
+entry:
+  %call = tail call i32 @test_multi_bb_resolver()
+  %add = add nsw i32 %call, 40
+  ret i32 %add
+}
+
+define i32 @bar._Mmops() #4 {
+; CHECK-LABEL: define i32 @bar._Mmops(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR4:[0-9]+]] {
+; CHECK:    [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops()
+;
+entry:
+  %call = tail call i32 @test_multi_bb_resolver()
+  %add = add nsw i32 %call, 30
+  ret i32 %add
+}
+
+define i32 @bar._Msve() #2 {
+; CHECK-LABEL: define i32 @bar._Msve(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK:    [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver()
+;
+entry:
+  %call = tail call i32 @test_multi_bb_resolver()
+  %add = add nsw i32 %call, 20
+  ret i32 %add
+}
+
+define i32 @bar.default() #1 {
+; CHECK-LABEL: define i32 @bar.default(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK:    [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver.default()
+;
+entry:
+  %call = tail call i32 @test_multi_bb_resolver()
+  %add = add nsw i32 %call, 10
+  ret i32 %add
+}
+
+define weak_odr ptr @bar.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @bar.resolver(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat {
+resolver_entry:
+  tail call void @__init_cpu_features_resolver()
+  %0 = load i64, ptr @__aarch64_cpu_features, align 8
+  %1 = and i64 %0, 576460821022900224
+  %2 = icmp eq i64 %1, 576460821022900224
+  %3 = and i64 %0, 1073741824
+  %.not = icmp eq i64 %3, 0
+  %bar._Msve.bar.default = select i1 %.not, ptr @bar.default, ptr @bar._Msve
+  %common.ret.op = select i1 %2, ptr @bar._MmopsMsve2, ptr %bar._Msve.bar.default
+  ret ptr %common.ret.op
+}
+
+define i32 @test_caller_feats_not_implied._Mmops() #4 {
+; CHECK-LABEL: define i32 @test_caller_feats_not_implied._Mmops(
+; CHECK-SAME: ) #[[ATTR4]] {
+entry:
+  ret i32 3
+}
+
+define i32 @test_caller_feats_not_implied._Msme() #6 {
+; CHECK-LABEL: define i32 @test_caller_feats_not_implied._Msme(
+; CHECK-SAME: ) #[[ATTR6:[0-9]+]] {
+entry:
+  ret i32 2
+}
+
+define i32 @test_caller_feats_not_implied._Msve() #2 {
+; CHECK-LABEL: define i32 @test_caller_feats_not_implied._Msve(
+; CHECK-SAME: ) #[[ATTR0]] {
+entry:
+  ret i32 1
+}
+
+define i32 @test_caller_feats_not_implied.default() #1 {
+; CHECK-LABEL: define i32 @test_caller_feats_not_implied.default(
+; CHECK-SAME: ) #[[ATTR2]] {
+entry:
+  ret i32 0
+}
+
+define weak_odr ptr @test_caller_feats_not_implied.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @test_caller_feats_not_implied.resolver(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat {
+resolver_entry:
+  tail call void @__init_cpu_features_resolver()
+  %0 = load i64, ptr @__aarch64_cpu_features, align 8
+  %1 = and i64 %0, 576460752303423488
+  %.not = icmp eq i64 %1, 0
+  br i1 %.not, label %resolver_else, label %common.ret
+
+common.ret:                                       ; preds = %resolver_else2, %resolver_else, %resolver_entry
+  %common.ret.op = phi ptr [ @test_caller_feats_not_implied._Mmops, %resolver_entry ], [ @test_caller_feats_not_implied._Msme, %resolver_else ], [ %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default, %resolver_else2 ]
+  ret ptr %common.ret.op
+
+resolver_else:                                    ; preds = %resolver_entry
+  %2 = and i64 %0, 4398046511104
+  %.not5 = icmp eq i64 %2, 0
+  br i1 %.not5, label %resolver_else2, label %common.ret
+
+resolver_else2:                                   ; preds = %resolver_else
+  %3 = and i64 %0, 1073741824
+  %.not6 = icmp eq i64 %3, 0
+  %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default = select i1 %.not6, ptr @test_caller_feats_not_implied.default, ptr @test_caller_feats_not_implied._Msve
+  br label %common.ret
+}
+
+define i32 @goo._Mmops() #4 {
+; CHECK-LABEL: define i32 @goo._Mmops(
+; CHECK-SAME: ) #[[ATTR4]] {
+; CHECK:    [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied._Mmops()
+;
+entry:
+  %call = tail call i32 @test_caller_feats_not_implied()
+  ret i32 %call
+}
+
+define i32 @goo._Msve() #2 {
+; CHECK-LABEL: define i32 @goo._Msve(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK:    [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied()
+;
+entry:
+  %call = tail call i32 @test_caller_feats_not_implied()
+  ret i32 %call
+}
+
+define i32 @goo.default() #1 {
+; CHECK-LABEL: define i32 @goo.default(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK:    [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied()
+;
+entry:
+  %call = tail call i32 @test_caller_feats_not_implied()
+  ret i32 %call
+}
+
+define weak_odr ptr @goo.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @goo.resolver(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat {
+resolver_entry:
+  tail call void @__init_cpu_features_resolver()
+  %0 = load i64, ptr @__aarch64_cpu_features, align 8
+  %1 = and i64 %0, 576460752303423488
+  %.not = icmp eq i64 %1, 0
+  %2 = and i64 %0, 1073741824
+  %.not3 = icmp eq i64 %2, 0
+  %goo._Msve.goo.default = select i1 %.not3, ptr @goo.default, ptr @goo._Msve
+  %common.ret.op = select i1 %.not, ptr %goo._Msve.goo.default, ptr @goo._Mmops
+  ret ptr %common.ret.op
+}
+
+define i32 @test_same_priority_callers._Msve() #2 {
+; CHECK-LABEL: define i32 @test_same_priority_callers._Msve(
+; CHECK-SAME: ) #[[ATTR0]] {
+entry:
+  ret i32 1
+}
+
+define i32 @test_same_priority_callers.default() #1 {
+; CHECK-LABEL: define i32 @test_same_priority_callers.default(
+; CHECK-SAME: ) #[[ATTR2]] {
+entry:
+  ret i32 0
+}
+
+define weak_odr ptr @test_same_priority_callers.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @test_same_priority_callers.resolver(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat {
+resolver_entry:
+  tail call void @__init_cpu_features_resolver()
+  %0 = load i64, ptr @__aarch64_cpu_features, align 8
+  %1 = and i64 %0, 1073741824
+  %.not = icmp eq i64 %1, 0
+  %test_same_priority_callers._Msve.test_same_priority_callers.default = select i1 %.not, ptr @test_same_priority_callers.default, ptr @test_same_priority_callers._Msve
+  ret ptr %test_same_priority_callers._Msve.test_same_priority_callers.default
+}
+
+define dso_local i32 @baz._Msve() #2 {
+; CHECK-LABEL: define dso_local i32 @baz._Msve(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK:    [[CALL:%.*]] = tail call i32 @test_same_priority_callers._Msve()
+;
+entry:
+  %call = tail call i32 @test_same_priority_callers()
+  ret i32 %call
+}
+
+define i32 @baz._Maes() #1 {
+; CHECK-LABEL: define i32 @baz._Maes(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK:    [[CALL:%.*]] = tail call i32 @test_same_priority_callers()
+;
+entry:
+  %call = tail call i32 @test_same_priority_callers()
+  ret i32 %call
+}
+
+; This isn't a bug in globalopt, but rather a problematic input.
+; The 'aes' extension does not add any target features on top
+; of what is inherited from the command line.
+;
+; What happens is that since baz._Maes and baz.default have the same priority,
+; globalopt tries to optimize the call in baz.default first and succeeds leaving
+; the remaining call in baz._Maes pointing to the resolver.
+;
+define dso_local i32 @baz.default() #1 {
+; CHECK-LABEL: define dso_local i32 @baz.default(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK:    [[CALL:%.*]] = tail call i32 @test_same_priority_callers.default()
+;
+entry:
+  %call = tail call i32 @test_same_priority_callers()
+  ret i32 %call
+}
+
+define weak_odr ptr @baz.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @baz.resolver(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat {
+resolver_entry:
+  tail call void @__init_cpu_features_resolver()
+  %0 = load i64, ptr @__aarch64_cpu_features, align 8
+  %1 = and i64 %0, 1073741824
+  %.not = icmp eq i64 %1, 0
+  %2 = and i64 %0, 16384
+  %.not3 = icmp eq i64 %2, 0
+  %baz._Maes.baz.default = select i1 %.not3, ptr @baz.default, ptr @baz._Maes
+  %common.ret.op = select i1 %.not, ptr %baz._Maes.baz.default, ptr @baz._Msve
+  ret ptr %common.ret.op
+}
+
+attributes #0 = { "target-features"="+fmv" }
+attributes #1 = { "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+v8a" }
+attributes #2 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a" }
+attributes #3 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a" }
+attributes #4 = { "target-features"="+fmv,+fp-armv8,+mops,+neon,+outline-atomics,+v8a" }
+attributes #5 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+mops,+neon,+outline-atomics,+sve,+sve2,+v8a" }
+attributes #6 = { "target-features"="+bf16,+fp-armv8,+neon,+outline-atomics,+sme,+v8a" }

>From 16aa3baf9d0c354611b9d270a85c9d458302a92a Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Wed, 13 Nov 2024 19:48:27 +0000
Subject: [PATCH 2/5] Changes from last revision:

* clang format
* remove leftover target hook hasFMV after rebase
* remove filter in regression test after rebase
---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 2 --
 llvm/lib/TargetParser/AArch64TargetParser.cpp        | 2 +-
 llvm/lib/Transforms/IPO/GlobalOpt.cpp                | 2 +-
 llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll  | 2 +-
 4 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index c4cfd5bfe82cf1..5d6663a4a0c146 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -88,8 +88,6 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
   unsigned getInlineCallPenalty(const Function *F, const CallBase &Call,
                                 unsigned DefaultCallPenalty) const;
 
-  bool hasFMV() const { return ST->hasFMV(); }
-
   uint64_t getFeatureMask(Function &F) const;
 
   /// \name Scalar TTI Implementations
diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp
index 0b1a7bacdaa5ab..588ea9a5dba42a 100644
--- a/llvm/lib/TargetParser/AArch64TargetParser.cpp
+++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp
@@ -52,7 +52,7 @@ std::optional<AArch64::FMVInfo>
 lookupFMVByID(llvm::AArch64::ArchExtKind ExtID) {
   for (const auto &I : llvm::AArch64::getFMVInfo())
     if (I.ID && *I.ID == ExtID)
-     return I;
+      return I;
   return {};
 }
 
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 3c2aba774e69c2..a427fbc2f7ea9b 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2753,7 +2753,7 @@ static bool OptimizeNonTrivialIFuncs(
     unsigned I = 0;
     // Now try to redirect calls starting from higher priority callers.
     for (Function *Caller : Callers) {
-      assert (I < Callees.size() && "Found callers of equal priority");
+      assert(I < Callees.size() && "Found callers of equal priority");
 
       Function *Callee = Callees[I];
       uint64_t CallerBits = FeatureMask[Caller];
diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
index 91e991a778fa11..8e0072c3416b5f 100644
--- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
+++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_same_priority_callers)" --version 4
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied)" --version 4
 ; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"

>From 052cef872c87a18794a7f243e995a63e4b012b78 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Thu, 28 Nov 2024 23:06:28 +0000
Subject: [PATCH 3/5] Changes from last revision:

Use FMV priority mask when sorting candidates
---
 .../TargetParser/AArch64FeatPriorities.inc    |  66 +++++++++++
 .../llvm/TargetParser/AArch64TargetParser.h   |  15 ++-
 llvm/lib/Target/AArch64/AArch64FMV.td         | 105 +++++++++---------
 .../AArch64/AArch64TargetTransformInfo.cpp    |   2 +-
 llvm/lib/TargetParser/AArch64TargetParser.cpp |  33 ++++--
 .../Transforms/GlobalOpt/resolve-fmv-ifunc.ll |  90 +++------------
 llvm/utils/TableGen/ARMTargetDefEmitter.cpp   |   4 +-
 7 files changed, 169 insertions(+), 146 deletions(-)
 create mode 100644 llvm/include/llvm/TargetParser/AArch64FeatPriorities.inc

diff --git a/llvm/include/llvm/TargetParser/AArch64FeatPriorities.inc b/llvm/include/llvm/TargetParser/AArch64FeatPriorities.inc
new file mode 100644
index 00000000000000..96af618032aea3
--- /dev/null
+++ b/llvm/include/llvm/TargetParser/AArch64FeatPriorities.inc
@@ -0,0 +1,66 @@
+//===- AArch64FeatPriorities.inc - AArch64 FMV Priorities enum --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file enumerates the AArch64 FMV features sorted in ascending priority.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AARCH64_FEAT_PRIORITIES_INC_H
+#define AARCH64_FEAT_PRIORITIES_INC_H
+
+// Function Multi Versioning feature priorities.
+enum FeatPriorities {
+  PRIOR_RNG,
+  PRIOR_FLAGM,
+  PRIOR_FLAGM2,
+  PRIOR_LSE,
+  PRIOR_FP,
+  PRIOR_SIMD,
+  PRIOR_DOTPROD,
+  PRIOR_SM4,
+  PRIOR_RDM,
+  PRIOR_CRC,
+  PRIOR_SHA2,
+  PRIOR_SHA3,
+  PRIOR_PMULL,
+  PRIOR_FP16,
+  PRIOR_FP16FML,
+  PRIOR_DIT,
+  PRIOR_DPB,
+  PRIOR_DPB2,
+  PRIOR_JSCVT,
+  PRIOR_FCMA,
+  PRIOR_RCPC,
+  PRIOR_RCPC2,
+  PRIOR_RCPC3,
+  PRIOR_FRINTTS,
+  PRIOR_I8MM,
+  PRIOR_BF16,
+  PRIOR_SVE,
+  PRIOR_SVE_F32MM,
+  PRIOR_SVE_F64MM,
+  PRIOR_SVE2,
+  PRIOR_SVE_PMULL128,
+  PRIOR_SVE_BITPERM,
+  PRIOR_SVE_SHA3,
+  PRIOR_SVE_SM4,
+  PRIOR_SME,
+  PRIOR_MEMTAG2,
+  PRIOR_SB,
+  PRIOR_PREDRES,
+  PRIOR_SSBS2,
+  PRIOR_BTI,
+  PRIOR_LS64_ACCDATA,
+  PRIOR_WFXT,
+  PRIOR_SME_F64,
+  PRIOR_SME_I64,
+  PRIOR_SME2,
+  PRIOR_MOPS
+};
+
+#endif
diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index 1311329821828f..bd9354b4e7fa1f 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -35,6 +35,7 @@ struct ArchInfo;
 struct CpuInfo;
 
 #include "llvm/TargetParser/AArch64CPUFeatures.inc"
+#include "llvm/TargetParser/AArch64FeatPriorities.inc"
 
 static_assert(FEAT_MAX < 62,
               "Number of features in CPUFeatures are limited to 62 entries");
@@ -69,12 +70,12 @@ struct ExtensionInfo {
 
 struct FMVInfo {
   StringRef Name;                // The target_version/target_clones spelling.
-  CPUFeatures Bit;               // Index of the bit in the FMV feature bitset.
+  CPUFeatures FeatureBit;        // Index of the bit in the FMV feature bitset.
   std::optional<ArchExtKind> ID; // The architecture extension to enable.
-  unsigned Priority;             // FMV priority.
-  FMVInfo(StringRef Name, CPUFeatures Bit, std::optional<ArchExtKind> ID,
-          unsigned Priority)
-      : Name(Name), Bit(Bit), ID(ID), Priority(Priority) {};
+  FeatPriorities PriorityBit;    // FMV priority.
+  FMVInfo(StringRef Name, CPUFeatures FeatureBit, std::optional<ArchExtKind> ID,
+          FeatPriorities PriorityBit)
+      : Name(Name), FeatureBit(FeatureBit), ID(ID), PriorityBit(PriorityBit){};
 };
 
 const std::vector<FMVInfo> &getFMVInfo();
@@ -271,6 +272,10 @@ bool isX18ReservedByDefault(const Triple &TT);
 // Return the priority for a given set of FMV features.
 unsigned getFMVPriority(ArrayRef<StringRef> Features);
 
+// For given feature names, return a bitmask corresponding to the entries of
+// AArch64::FeatPriorities.
+uint64_t getPriorityMask(ArrayRef<StringRef> Features);
+
 // For given feature names, return a bitmask corresponding to the entries of
 // AArch64::CPUFeatures. The values in CPUFeatures are not bitmasks
 // themselves, they are sequential (0, 1, 2, 3, ...).
diff --git a/llvm/lib/Target/AArch64/AArch64FMV.td b/llvm/lib/Target/AArch64/AArch64FMV.td
index fc7a94a5fe475f..e0f56fd5556196 100644
--- a/llvm/lib/Target/AArch64/AArch64FMV.td
+++ b/llvm/lib/Target/AArch64/AArch64FMV.td
@@ -22,64 +22,65 @@
 
 
 // Something you can add to target_version or target_clones.
-class FMVExtension<string n, string b, int p> {
+class FMVExtension<string name, string enumeration> {
     // Name, as spelled in target_version or target_clones. e.g. "memtag".
-    string Name = n;
+    string Name = name;
 
     // A C++ expression giving the number of the bit in the FMV ABI.
     // Currently this is given as a value from the enum "CPUFeatures".
-    string Bit = b;
+    string FeatureBit = "FEAT_" # enumeration;
 
     // SubtargetFeature enabled for codegen when this FMV feature is present.
-    string BackendFeature = n;
+    string BackendFeature = name;
 
-    // The FMV priority.
-    int Priority = p;
+    // A C++ expression giving the number of the priority bit.
+    // Currently this is given as a value from the enum "FeatPriorities".
+    string PriorityBit = "PRIOR_" # enumeration;
 }
 
-def : FMVExtension<"aes", "FEAT_PMULL", 150>;
-def : FMVExtension<"bf16", "FEAT_BF16", 280>;
-def : FMVExtension<"bti", "FEAT_BTI", 510>;
-def : FMVExtension<"crc", "FEAT_CRC", 110>;
-def : FMVExtension<"dit", "FEAT_DIT", 180>;
-def : FMVExtension<"dotprod", "FEAT_DOTPROD", 104>;
-let BackendFeature = "ccpp" in def : FMVExtension<"dpb", "FEAT_DPB", 190>;
-let BackendFeature = "ccdp" in def : FMVExtension<"dpb2", "FEAT_DPB2", 200>;
-def : FMVExtension<"f32mm", "FEAT_SVE_F32MM", 350>;
-def : FMVExtension<"f64mm", "FEAT_SVE_F64MM", 360>;
-def : FMVExtension<"fcma", "FEAT_FCMA", 220>;
-def : FMVExtension<"flagm", "FEAT_FLAGM", 20>;
-let BackendFeature = "altnzcv" in def : FMVExtension<"flagm2", "FEAT_FLAGM2", 30>;
-def : FMVExtension<"fp", "FEAT_FP", 90>;
-def : FMVExtension<"fp16", "FEAT_FP16", 170>;
-def : FMVExtension<"fp16fml", "FEAT_FP16FML", 175>;
-let BackendFeature = "fptoint" in def : FMVExtension<"frintts", "FEAT_FRINTTS", 250>;
-def : FMVExtension<"i8mm", "FEAT_I8MM", 270>;
-def : FMVExtension<"jscvt", "FEAT_JSCVT", 210>;
-def : FMVExtension<"ls64", "FEAT_LS64_ACCDATA", 520>;
-def : FMVExtension<"lse", "FEAT_LSE", 80>;
-def : FMVExtension<"memtag", "FEAT_MEMTAG2", 440>;
-def : FMVExtension<"mops", "FEAT_MOPS", 650>;
-def : FMVExtension<"predres", "FEAT_PREDRES", 480>;
-def : FMVExtension<"rcpc", "FEAT_RCPC", 230>;
-let BackendFeature = "rcpc-immo" in def : FMVExtension<"rcpc2", "FEAT_RCPC2", 240>;
-def : FMVExtension<"rcpc3", "FEAT_RCPC3", 241>;
-def : FMVExtension<"rdm", "FEAT_RDM", 108>;
-def : FMVExtension<"rng", "FEAT_RNG", 10>;
-def : FMVExtension<"sb", "FEAT_SB", 470>;
-def : FMVExtension<"sha2", "FEAT_SHA2", 130>;
-def : FMVExtension<"sha3", "FEAT_SHA3", 140>;
-def : FMVExtension<"simd", "FEAT_SIMD", 100>;
-def : FMVExtension<"sm4", "FEAT_SM4", 106>;
-def : FMVExtension<"sme", "FEAT_SME", 430>;
-def : FMVExtension<"sme-f64f64", "FEAT_SME_F64", 560>;
-def : FMVExtension<"sme-i16i64", "FEAT_SME_I64", 570>;
-def : FMVExtension<"sme2", "FEAT_SME2", 580>;
-def : FMVExtension<"ssbs", "FEAT_SSBS2", 490>;
-def : FMVExtension<"sve", "FEAT_SVE", 310>;
-def : FMVExtension<"sve2", "FEAT_SVE2", 370>;
-def : FMVExtension<"sve2-aes", "FEAT_SVE_PMULL128", 380>;
-def : FMVExtension<"sve2-bitperm", "FEAT_SVE_BITPERM", 400>;
-def : FMVExtension<"sve2-sha3", "FEAT_SVE_SHA3", 410>;
-def : FMVExtension<"sve2-sm4", "FEAT_SVE_SM4", 420>;
-def : FMVExtension<"wfxt", "FEAT_WFXT", 550>;
+def : FMVExtension<"aes", "PMULL">;
+def : FMVExtension<"bf16", "BF16">;
+def : FMVExtension<"bti", "BTI">;
+def : FMVExtension<"crc", "CRC">;
+def : FMVExtension<"dit", "DIT">;
+def : FMVExtension<"dotprod", "DOTPROD">;
+let BackendFeature = "ccpp" in def : FMVExtension<"dpb", "DPB">;
+let BackendFeature = "ccdp" in def : FMVExtension<"dpb2", "DPB2">;
+def : FMVExtension<"f32mm", "SVE_F32MM">;
+def : FMVExtension<"f64mm", "SVE_F64MM">;
+def : FMVExtension<"fcma", "FCMA">;
+def : FMVExtension<"flagm", "FLAGM">;
+let BackendFeature = "altnzcv" in def : FMVExtension<"flagm2", "FLAGM2">;
+def : FMVExtension<"fp", "FP">;
+def : FMVExtension<"fp16", "FP16">;
+def : FMVExtension<"fp16fml", "FP16FML">;
+let BackendFeature = "fptoint" in def : FMVExtension<"frintts", "FRINTTS">;
+def : FMVExtension<"i8mm", "I8MM">;
+def : FMVExtension<"jscvt", "JSCVT">;
+def : FMVExtension<"ls64", "LS64_ACCDATA">;
+def : FMVExtension<"lse", "LSE">;
+def : FMVExtension<"memtag", "MEMTAG2">;
+def : FMVExtension<"mops", "MOPS">;
+def : FMVExtension<"predres", "PREDRES">;
+def : FMVExtension<"rcpc", "RCPC">;
+let BackendFeature = "rcpc-immo" in def : FMVExtension<"rcpc2", "RCPC2">;
+def : FMVExtension<"rcpc3", "RCPC3">;
+def : FMVExtension<"rdm", "RDM">;
+def : FMVExtension<"rng", "RNG">;
+def : FMVExtension<"sb", "SB">;
+def : FMVExtension<"sha2", "SHA2">;
+def : FMVExtension<"sha3", "SHA3">;
+def : FMVExtension<"simd", "SIMD">;
+def : FMVExtension<"sm4", "SM4">;
+def : FMVExtension<"sme", "SME">;
+def : FMVExtension<"sme-f64f64", "SME_F64">;
+def : FMVExtension<"sme-i16i64", "SME_I64">;
+def : FMVExtension<"sme2", "SME2">;
+def : FMVExtension<"ssbs", "SSBS2">;
+def : FMVExtension<"sve", "SVE">;
+def : FMVExtension<"sve2", "SVE2">;
+def : FMVExtension<"sve2-aes", "SVE_PMULL128">;
+def : FMVExtension<"sve2-bitperm", "SVE_BITPERM">;
+def : FMVExtension<"sve2-sha3", "SVE_SHA3">;
+def : FMVExtension<"sve2-sm4", "SVE_SM4">;
+def : FMVExtension<"wfxt", "WFXT">;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a81e7935e59685..56b9a40557baaf 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -252,7 +252,7 @@ uint64_t AArch64TTIImpl::getFeatureMask(Function &F) const {
   FeatureStr.split(Features, ",");
   if (none_of(Features, [](StringRef Feat) { return Feat == "+fmv"; }))
     return 0;
-  return AArch64::getCpuSupportsMask(Features);
+  return AArch64::getPriorityMask(Features);
 }
 
 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp
index 5313cdced8981b..83d177afd904b6 100644
--- a/llvm/lib/TargetParser/AArch64TargetParser.cpp
+++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp
@@ -57,26 +57,35 @@ lookupFMVByID(llvm::AArch64::ArchExtKind ExtID) {
 }
 
 unsigned AArch64::getFMVPriority(ArrayRef<StringRef> Features) {
-  constexpr unsigned MaxFMVPriority = 1000;
-  unsigned Priority = 0;
-  unsigned NumFeatures = 0;
+  constexpr unsigned MaxFMVPriority = 100;
+  uint64_t Priority = 0;
+  FeatPriorities TopBit = static_cast<FeatPriorities>(0);
   for (StringRef Feature : Features) {
-    if (auto Ext = parseFMVExtension(Feature)) {
-      Priority = std::max(Priority, Ext->Priority);
-      NumFeatures++;
+    if (auto FMVExt = parseFMVExtension(Feature)) {
+      TopBit = std::max(TopBit, FMVExt->PriorityBit);
+      Priority |= (1ULL << FMVExt->PriorityBit);
     }
   }
-  return Priority + MaxFMVPriority * NumFeatures;
+  return TopBit + MaxFMVPriority * popcount(Priority);
+}
+
+uint64_t AArch64::getPriorityMask(ArrayRef<StringRef> Features) {
+  uint64_t PriorityMask = 0;
+  for (StringRef Feature : Features) {
+    if (auto FMVExt = parseFMVExtension(Feature))
+      PriorityMask |= (1ULL << FMVExt->PriorityBit);
+    else if (auto ArchExt = targetFeatureToExtension(Feature))
+      if (auto FMVExt = lookupFMVByID(ArchExt->ID))
+        PriorityMask |= (1ULL << FMVExt->PriorityBit);
+  }
+  return PriorityMask;
 }
 
 uint64_t AArch64::getCpuSupportsMask(ArrayRef<StringRef> FeatureStrs) {
   uint64_t FeaturesMask = 0;
   for (const StringRef &FeatureStr : FeatureStrs) {
-    if (auto FMVExt = parseFMVExtension(FeatureStr))
-      FeaturesMask |= (1ULL << FMVExt->Bit);
-    else if (auto ArchExt = targetFeatureToExtension(FeatureStr))
-      if (auto FMVExt = lookupFMVByID(ArchExt->ID))
-        FeaturesMask |= (1ULL << FMVExt->Bit);
+    if (auto Ext = parseFMVExtension(FeatureStr))
+      FeaturesMask |= (1ULL << Ext->FeatureBit);
   }
   return FeaturesMask;
 }
diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
index 8e0072c3416b5f..fb89a7c06489df 100644
--- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
+++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
@@ -7,18 +7,12 @@ target triple = "aarch64-unknown-linux-gnu"
 $test_single_bb_resolver.resolver = comdat any
 $test_multi_bb_resolver.resolver = comdat any
 $test_caller_feats_not_implied.resolver = comdat any
-$foo.resolver = comdat any
-$bar.resolver = comdat any
-$goo.resolver = comdat any
 
 @__aarch64_cpu_features = external local_unnamed_addr global { i64 }
 
 @test_single_bb_resolver = weak_odr ifunc i32 (), ptr @test_single_bb_resolver.resolver
 @test_multi_bb_resolver = weak_odr ifunc i32 (), ptr @test_multi_bb_resolver.resolver
 @test_caller_feats_not_implied = weak_odr ifunc i32 (), ptr @test_caller_feats_not_implied.resolver
- at foo = weak_odr ifunc i32 (), ptr @foo.resolver
- at bar = weak_odr ifunc i32 (), ptr @bar.resolver
- at goo = weak_odr ifunc i32 (), ptr @goo.resolver
 
 declare void @__init_cpu_features_resolver() local_unnamed_addr
 
@@ -45,50 +39,32 @@ resolver_entry:
 
 define i32 @foo._Msve() #1 {
 ; CHECK-LABEL: define i32 @foo._Msve(
-; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR1:[0-9]+]] {
 ; CHECK:    [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve()
 ;
 entry:
   %call = tail call i32 @test_single_bb_resolver()
-  %add = add nsw i32 %call, 30
-  ret i32 %add
+  ret i32 %call
 }
 
 define i32 @foo._Msve2() #2 {
 ; CHECK-LABEL: define i32 @foo._Msve2(
-; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] {
 ; CHECK:    [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve2()
 ;
 entry:
   %call = tail call i32 @test_single_bb_resolver()
-  %add = add nsw i32 %call, 20
-  ret i32 %add
+  ret i32 %call
 }
 
 define i32 @foo.default() #0 {
 ; CHECK-LABEL: define i32 @foo.default(
-; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; CHECK:    [[CALL:%.*]] = tail call i32 @test_single_bb_resolver.default()
 ;
 entry:
   %call = tail call i32 @test_single_bb_resolver()
-  %add = add nsw i32 %call, 10
-  ret i32 %add
-}
-
-define weak_odr ptr @foo.resolver() #0 comdat {
-; CHECK-LABEL: define weak_odr ptr @foo.resolver(
-; CHECK-SAME: ) #[[ATTR0]] comdat {
-resolver_entry:
-  tail call void @__init_cpu_features_resolver()
-  %0 = load i64, ptr @__aarch64_cpu_features, align 8
-  %1 = and i64 %0, 68719476736
-  %.not = icmp eq i64 %1, 0
-  %2 = and i64 %0, 1073741824
-  %.not3 = icmp eq i64 %2, 0
-  %foo._Msve.foo.default = select i1 %.not3, ptr @foo.default, ptr @foo._Msve
-  %common.ret.op = select i1 %.not, ptr %foo._Msve.foo.default, ptr @foo._Msve2
-  ret ptr %common.ret.op
+  ret i32 %call
 }
 
 declare i32 @test_multi_bb_resolver._Mmops() #3
@@ -127,13 +103,12 @@ resolver_else2:                                   ; preds = %resolver_else
 
 define i32 @bar._MmopsMsve2() #4 {
 ; CHECK-LABEL: define i32 @bar._MmopsMsve2(
-; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR4:[0-9]+]] {
 ; CHECK:    [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops()
 ;
 entry:
   %call = tail call i32 @test_multi_bb_resolver()
-  %add = add nsw i32 %call, 40
-  ret i32 %add
+  ret i32 %call
 }
 
 define i32 @bar._Mmops() #3 {
@@ -143,45 +118,27 @@ define i32 @bar._Mmops() #3 {
 ;
 entry:
   %call = tail call i32 @test_multi_bb_resolver()
-  %add = add nsw i32 %call, 30
-  ret i32 %add
+  ret i32 %call
 }
 
 define i32 @bar._Msve() #1 {
 ; CHECK-LABEL: define i32 @bar._Msve(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] {
 ; CHECK:    [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver()
 ;
 entry:
   %call = tail call i32 @test_multi_bb_resolver()
-  %add = add nsw i32 %call, 20
-  ret i32 %add
+  ret i32 %call
 }
 
 define i32 @bar.default() #0 {
 ; CHECK-LABEL: define i32 @bar.default(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
 ; CHECK:    [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver.default()
 ;
 entry:
   %call = tail call i32 @test_multi_bb_resolver()
-  %add = add nsw i32 %call, 10
-  ret i32 %add
-}
-
-define weak_odr ptr @bar.resolver() #0 comdat {
-; CHECK-LABEL: define weak_odr ptr @bar.resolver(
-; CHECK-SAME: ) #[[ATTR0]] comdat {
-resolver_entry:
-  tail call void @__init_cpu_features_resolver()
-  %0 = load i64, ptr @__aarch64_cpu_features, align 8
-  %1 = and i64 %0, 576460821022900224
-  %2 = icmp eq i64 %1, 576460821022900224
-  %3 = and i64 %0, 1073741824
-  %.not = icmp eq i64 %3, 0
-  %bar._Msve.bar.default = select i1 %.not, ptr @bar.default, ptr @bar._Msve
-  %common.ret.op = select i1 %2, ptr @bar._MmopsMsve2, ptr %bar._Msve.bar.default
-  ret ptr %common.ret.op
+  ret i32 %call
 }
 
 declare i32 @test_caller_feats_not_implied._Mmops() #3
@@ -220,7 +177,7 @@ resolver_else2:                                   ; preds = %resolver_else
 
 define i32 @goo._Mmops() #3 {
 ; CHECK-LABEL: define i32 @goo._Mmops(
-; CHECK-SAME: ) #[[ATTR3]] {
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR3]] {
 ; CHECK:    [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied._Mmops()
 ;
 entry:
@@ -230,7 +187,7 @@ entry:
 
 define i32 @goo._Msve() #1 {
 ; CHECK-LABEL: define i32 @goo._Msve(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] {
 ; CHECK:    [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied()
 ;
 entry:
@@ -240,7 +197,7 @@ entry:
 
 define i32 @goo.default() #0 {
 ; CHECK-LABEL: define i32 @goo.default(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
 ; CHECK:    [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied()
 ;
 entry:
@@ -248,21 +205,6 @@ entry:
   ret i32 %call
 }
 
-define weak_odr ptr @goo.resolver() #0 comdat {
-; CHECK-LABEL: define weak_odr ptr @goo.resolver(
-; CHECK-SAME: ) #[[ATTR0]] comdat {
-resolver_entry:
-  tail call void @__init_cpu_features_resolver()
-  %0 = load i64, ptr @__aarch64_cpu_features, align 8
-  %1 = and i64 %0, 576460752303423488
-  %.not = icmp eq i64 %1, 0
-  %2 = and i64 %0, 1073741824
-  %.not3 = icmp eq i64 %2, 0
-  %goo._Msve.goo.default = select i1 %.not3, ptr @goo.default, ptr @goo._Msve
-  %common.ret.op = select i1 %.not, ptr %goo._Msve.goo.default, ptr @goo._Mmops
-  ret ptr %common.ret.op
-}
-
 attributes #0 = { "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+v8a" }
 attributes #1 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a" }
 attributes #2 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a" }
diff --git a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
index 3b02f63e9490b1..a8c7acbcd1dd1f 100644
--- a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
+++ b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
@@ -162,14 +162,14 @@ static void emitARMTargetDef(const RecordKeeper &RK, raw_ostream &OS) {
   for (const Record *Rec : FMVExts) {
     OS << "  I.emplace_back(";
     OS << "\"" << Rec->getValueAsString("Name") << "\"";
-    OS << ", " << Rec->getValueAsString("Bit");
+    OS << ", " << Rec->getValueAsString("FeatureBit");
     auto FeatName = Rec->getValueAsString("BackendFeature");
     const Record *FeatRec = ExtensionMap[FeatName];
     if (FeatRec)
       OS << ", " << FeatRec->getValueAsString("ArchExtKindSpelling").upper();
     else
       OS << ", std::nullopt";
-    OS << ", " << (uint64_t)Rec->getValueAsInt("Priority");
+    OS << ", " << Rec->getValueAsString("PriorityBit");
     OS << ");\n";
   };
   OS << "  return I;\n"

>From 5314bc2e3de316eae1a69c3ab0e48f5c9fe7d010 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Sat, 30 Nov 2024 13:34:51 +0000
Subject: [PATCH 4/5] Changes from last revision

Allow the optimization when the caller is non FMV but the attributes match.
---
 .../llvm/Analysis/TargetTransformInfo.h       |  8 +++++
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  2 ++
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  4 +++
 .../AArch64/AArch64TargetTransformInfo.cpp    |  9 ++++--
 .../AArch64/AArch64TargetTransformInfo.h      |  2 ++
 llvm/lib/Transforms/IPO/GlobalOpt.cpp         | 19 ++++-------
 .../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 32 ++++++++++++++++++-
 7 files changed, 61 insertions(+), 15 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 12601e39294869..ee3163fd9a599d 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1844,6 +1844,9 @@ class TargetTransformInfo {
   /// Returns a bitmask constructed from the target features of a function.
   uint64_t getFeatureMask(Function &F) const;
 
+  /// Returns true if this is an instance of a function with multiple versions.
+  bool isMultiversionedFunction(Function &F) const;
+
   /// \return The maximum number of function arguments the target supports.
   unsigned getMaxNumArgs() const;
 
@@ -2270,6 +2273,7 @@ class TargetTransformInfo::Concept {
   getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
   virtual bool hasArmWideBranch(bool Thumb) const = 0;
   virtual uint64_t getFeatureMask(Function &F) const = 0;
+  virtual bool isMultiversionedFunction(Function &F) const = 0;
   virtual unsigned getMaxNumArgs() const = 0;
   virtual unsigned getNumBytesToPadGlobalArray(unsigned Size,
                                                Type *ArrayType) const = 0;
@@ -3090,6 +3094,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
     return Impl.getFeatureMask(F);
   }
 
+  bool isMultiversionedFunction(Function &F) const override {
+    return Impl.isMultiversionedFunction(F);
+  }
+
   unsigned getMaxNumArgs() const override {
     return Impl.getMaxNumArgs();
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 6034c3c84135cb..ecedb9c52cb26c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1026,6 +1026,8 @@ class TargetTransformInfoImplBase {
 
   uint64_t getFeatureMask(Function &F) const { return 0; }
 
+  bool isMultiversionedFunction(Function &F) const { return false; }
+
   unsigned getMaxNumArgs() const { return UINT_MAX; }
 
   unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index e0ced05ecf10f2..bc0557d721de8d 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1364,6 +1364,10 @@ uint64_t TargetTransformInfo::getFeatureMask(Function &F) const {
   return TTIImpl->getFeatureMask(F);
 }
 
+bool TargetTransformInfo::isMultiversionedFunction(Function &F) const {
+  return TTIImpl->isMultiversionedFunction(F);
+}
+
 unsigned TargetTransformInfo::getMaxNumArgs() const {
   return TTIImpl->getMaxNumArgs();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 56b9a40557baaf..82fb90cbf8d905 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -250,11 +250,16 @@ uint64_t AArch64TTIImpl::getFeatureMask(Function &F) const {
   StringRef FeatureStr = F.getFnAttribute("target-features").getValueAsString();
   SmallVector<StringRef, 8> Features;
   FeatureStr.split(Features, ",");
-  if (none_of(Features, [](StringRef Feat) { return Feat == "+fmv"; }))
-    return 0;
   return AArch64::getPriorityMask(Features);
 }
 
+bool AArch64TTIImpl::isMultiversionedFunction(Function &F) const {
+  StringRef FeatureStr = F.getFnAttribute("target-features").getValueAsString();
+  SmallVector<StringRef, 8> Features;
+  FeatureStr.split(Features, ",");
+  return any_of(Features, [](StringRef Feat) { return Feat == "+fmv"; });
+}
+
 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                          const Function *Callee) const {
   SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 1a26a55282a028..cb0e7cd496f250 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -90,6 +90,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
 
   uint64_t getFeatureMask(Function &F) const;
 
+  bool isMultiversionedFunction(Function &F) const;
+
   /// \name Scalar TTI Implementations
   /// @{
 
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index f6fc0a9c2f1675..442487d242664a 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2702,24 +2702,19 @@ static bool OptimizeNonTrivialIFuncs(
 
     TargetTransformInfo &TTI = GetTTI(*Resolver);
 
+    // This IFunc is not FMV.
+    if (any_of(Callees, [&TTI](Function *F) {
+          return !TTI.isMultiversionedFunction(*F);
+        }))
+      continue;
+
     // Cache the feature mask for each callee.
-    bool IsFMV = true;
     for (Function *Callee : Callees) {
       auto [It, Inserted] = FeatureMask.try_emplace(Callee);
-      if (Inserted) {
+      if (Inserted)
         It->second = TTI.getFeatureMask(*Callee);
-        // Empty mask means this isn't an FMV callee.
-        if (It->second == 0) {
-          IsFMV = false;
-          break;
-        }
-      }
     }
 
-    // This IFunc is not FMV.
-    if (!IsFMV)
-      continue;
-
     // Sort the callee versions in decreasing priority order.
     sort(Callees, [&](auto *LHS, auto *RHS) {
       return FeatureMask[LHS] > FeatureMask[RHS];
diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
index fb89a7c06489df..c6fca86a796394 100644
--- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
+++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied)" --version 4
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller)" --version 4
 ; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
@@ -7,12 +7,14 @@ target triple = "aarch64-unknown-linux-gnu"
 $test_single_bb_resolver.resolver = comdat any
 $test_multi_bb_resolver.resolver = comdat any
 $test_caller_feats_not_implied.resolver = comdat any
+$test_non_fmv_caller.resolver = comdat any
 
 @__aarch64_cpu_features = external local_unnamed_addr global { i64 }
 
 @test_single_bb_resolver = weak_odr ifunc i32 (), ptr @test_single_bb_resolver.resolver
 @test_multi_bb_resolver = weak_odr ifunc i32 (), ptr @test_multi_bb_resolver.resolver
 @test_caller_feats_not_implied = weak_odr ifunc i32 (), ptr @test_caller_feats_not_implied.resolver
+ at test_non_fmv_caller = weak_odr ifunc i32 (), ptr @test_non_fmv_caller.resolver
 
 declare void @__init_cpu_features_resolver() local_unnamed_addr
 
@@ -205,9 +207,37 @@ entry:
   ret i32 %call
 }
 
+declare i32 @test_non_fmv_caller._Maes() #6
+
+declare i32 @test_non_fmv_caller.default() #0
+
+define weak_odr ptr @test_non_fmv_caller.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @test_non_fmv_caller.resolver(
+; CHECK-SAME: ) #[[ATTR0]] comdat {
+resolver_entry:
+  tail call void @__init_cpu_features_resolver()
+  %0 = load i64, ptr @__aarch64_cpu_features, align 8
+  %1 = and i64 %0, 32768
+  %.not = icmp eq i64 %1, 0
+  %test_non_fmv_caller._Maes.test_non_fmv_caller.default = select i1 %.not, ptr @test_non_fmv_caller.default, ptr @test_non_fmv_caller._Maes
+  ret ptr %test_non_fmv_caller._Maes.test_non_fmv_caller.default
+}
+
+define i32 @baz() #7 {
+; CHECK-LABEL: define i32 @baz(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR7:[0-9]+]] {
+; CHECK:    [[CALL:%.*]] = tail call i32 @test_non_fmv_caller._Maes()
+;
+entry:
+  %call = tail call i32 @test_non_fmv_caller()
+  ret i32 %call
+}
+
 attributes #0 = { "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+v8a" }
 attributes #1 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a" }
 attributes #2 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a" }
 attributes #3 = { "target-features"="+fmv,+fp-armv8,+mops,+neon,+outline-atomics,+v8a" }
 attributes #4 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+mops,+neon,+outline-atomics,+sve,+sve2,+v8a" }
 attributes #5 = { "target-features"="+bf16,+fmv,+fp-armv8,+neon,+outline-atomics,+sme,+v8a" }
+attributes #6 = { "target-features"="+aes,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a" }
+attributes #7 = { "target-features"="+aes,+fp-armv8,+neon,+outline-atomics,+v8a" }

>From 2c3b4d17cd81f75b6ebd2081bcfdc441408e691a Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
Date: Sat, 30 Nov 2024 14:51:04 +0000
Subject: [PATCH 5/5] Changes from last revision

Add a problematic test case
---
 .../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 64 ++++++++++++++++++-
 1 file changed, 63 insertions(+), 1 deletion(-)

diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
index c6fca86a796394..4284756ec939f5 100644
--- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
+++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller)" --version 4
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority)" --version 4
 ; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
@@ -8,6 +8,7 @@ $test_single_bb_resolver.resolver = comdat any
 $test_multi_bb_resolver.resolver = comdat any
 $test_caller_feats_not_implied.resolver = comdat any
 $test_non_fmv_caller.resolver = comdat any
+$test_priority.resolver = comdat any
 
 @__aarch64_cpu_features = external local_unnamed_addr global { i64 }
 
@@ -15,6 +16,7 @@ $test_non_fmv_caller.resolver = comdat any
 @test_multi_bb_resolver = weak_odr ifunc i32 (), ptr @test_multi_bb_resolver.resolver
 @test_caller_feats_not_implied = weak_odr ifunc i32 (), ptr @test_caller_feats_not_implied.resolver
 @test_non_fmv_caller = weak_odr ifunc i32 (), ptr @test_non_fmv_caller.resolver
+ at test_priority = weak_odr ifunc i32 (), ptr @test_priority.resolver
 
 declare void @__init_cpu_features_resolver() local_unnamed_addr
 
@@ -233,6 +235,62 @@ entry:
   ret i32 %call
 }
 
+declare i32 @test_priority._Msve2-sha3() #8
+
+declare i32 @test_priority._Mls64Mssbs() #9
+
+declare i32 @test_priority._MflagmMlseMrng() #10
+
+declare i32 @test_priority.default() #0
+
+define weak_odr ptr @test_priority.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @test_priority.resolver(
+; CHECK-SAME: ) #[[ATTR0]] comdat {
+resolver_entry:
+  tail call void @__init_cpu_features_resolver()
+  %0 = load i64, ptr @__aarch64_cpu_features, align 8
+  %1 = and i64 %0, 131
+  %2 = icmp eq i64 %1, 131
+  br i1 %2, label %common.ret, label %resolver_else
+
+common.ret:                                       ; preds = %resolver_else2, %resolver_else, %resolver_entry
+  %common.ret.op = phi ptr [ @test_priority._MflagmMlseMrng, %resolver_entry ], [ @test_priority._Mls64Mssbs, %resolver_else ], [ %test_priority._Msve2-sha3.test_priority.default, %resolver_else2 ]
+  ret ptr %common.ret.op
+
+resolver_else:                                    ; preds = %resolver_entry
+  %3 = and i64 %0, 9570149208162304
+  %4 = icmp eq i64 %3, 9570149208162304
+  br i1 %4, label %common.ret, label %resolver_else2
+
+resolver_else2:                                   ; preds = %resolver_else
+  %5 = and i64 %0, 1099511627776
+  %.not = icmp eq i64 %5, 0
+  %test_priority._Msve2-sha3.test_priority.default = select i1 %.not, ptr @test_priority.default, ptr @test_priority._Msve2-sha3
+  br label %common.ret
+}
+
+; FIXME: This is wrong. When generating the resolver the selection algorithm
+; orders the function versions according to the number of specified features,
+; from highest to lowest. (Note: I am going on a tangent here but in the case
+; of a tie, the version with the highest priority feature is preferred. This
+; is non-deterministic if the highest priority feature is common. For example
+; mops+sve vs mops+sve2).
+;
+; In this example the problem is slightly different. Once in IR we can't know
+; what the features were before their dependencies got expanded. Therefore
+; we can select based on highest priority feature, then second, then third,
+; etc... That's what we should be doing in the front-end too if you ask me.
+;
+define i32 @hoo._MflagmMls64MlseMrngMssbsMsve2-sha3() #11 {
+; CHECK-LABEL: define i32 @hoo._MflagmMls64MlseMrngMssbsMsve2-sha3(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR11:[0-9]+]] {
+; CHECK:    [[CALL:%.*]] = tail call i32 @test_priority._Mls64Mssbs()
+;
+entry:
+  %call = tail call i32 @test_priority()
+  ret i32 %call
+}
+
 attributes #0 = { "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+v8a" }
 attributes #1 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a" }
 attributes #2 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a" }
@@ -241,3 +299,7 @@ attributes #4 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+mops,+neon,+outli
 attributes #5 = { "target-features"="+bf16,+fmv,+fp-armv8,+neon,+outline-atomics,+sme,+v8a" }
 attributes #6 = { "target-features"="+aes,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a" }
 attributes #7 = { "target-features"="+aes,+fp-armv8,+neon,+outline-atomics,+v8a" }
+attributes #8 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sha2,+sha3,+sve,+sve2,+sve2-sha3,+v8a" }
+attributes #9 = { "target-features"="+fmv,+fp-armv8,+ls64,+neon,+outline-atomics,+ssbs,+v8a" }
+attributes #10 = { "target-features"="+flagm,+fmv,+fp-armv8,+lse,+neon,+outline-atomics,+rand,+v8a" }
+attributes #11 = { "target-features"="+flagm,+fmv,+fp-armv8,+fullfp16,+ls64,+lse,+neon,+outline-atomics,+rand,+sha2,+sha3,+ssbs,+sve,+sve2,+sve2-sha3,+v8a" }



More information about the llvm-commits mailing list