[clang] [clang][FMV] Direct-call FMV callees from FMV callers (PR #80093)

Jon Roelofs via cfe-commits cfe-commits at lists.llvm.org
Wed Jan 31 09:19:28 PST 2024


https://github.com/jroelofs updated https://github.com/llvm/llvm-project/pull/80093

>From ed52ee4424459ebc046a625341ad8dbbd38bcbe3 Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs at apple.com>
Date: Tue, 30 Jan 2024 19:13:42 -0800
Subject: [PATCH 1/4] [clang][FMV] Direct-call multi-versioned callees from
 multi-versioned callers

... when there is a callee with a matching feature set, and no other
higher-priority callee.  This optimization helps the inliner see past the
ifunc+resolver to the callee that we know the call will always land on.

This is a conservative implementation of: https://github.com/llvm/llvm-project/issues/71714
---
 clang/lib/CodeGen/CGCall.cpp                  |  72 +++++
 clang/lib/CodeGen/CodeGenModule.cpp           |   2 +-
 .../test/CodeGen/attr-target-mv-direct-call.c | 245 ++++++++++++++++++
 3 files changed, 318 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGen/attr-target-mv-direct-call.c

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 28c211aa631e4..84a04e3ccddd8 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -4966,6 +4966,11 @@ static unsigned getMaxVectorWidth(const llvm::Type *Ty) {
   return MaxVectorWidth;
 }
 
+// FIXME: put this somewhere nicer to share
+unsigned
+TargetMVPriority(const TargetInfo &TI,
+                 const CodeGenFunction::MultiVersionResolverOption &RO);
+
 RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
                                  const CGCallee &Callee,
                                  ReturnValueSlot ReturnValue,
@@ -5437,6 +5442,73 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   const CGCallee &ConcreteCallee = Callee.prepareConcreteCallee(*this);
   llvm::Value *CalleePtr = ConcreteCallee.getFunctionPointer();
 
+  // If a multi-versioned caller calls a multi-versioned callee, skip the
+  // resolver when there is a precise match on the feature sets, and no
+  // possibility of a better match at runtime.
+  if (const auto *CallerFD = dyn_cast_or_null<FunctionDecl>(CurGD.getDecl()))
+    if (const auto *CallerTVA = CallerFD->getAttr<TargetVersionAttr>())
+      if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(TargetDecl))
+        // FIXME: do the same where either the caller or callee are
+        // target_clones.
+        if (FD->isTargetMultiVersion()) {
+          llvm::SmallVector<StringRef, 8> CallerFeats;
+          CallerTVA->getFeatures(CallerFeats);
+          MultiVersionResolverOption CallerMVRO(nullptr, "", CallerFeats);
+
+          bool HasHigherPriorityCallee = false;
+          llvm::Constant *FoundMatchingCallee = nullptr;
+          getContext().forEachMultiversionedFunctionVersion(
+              FD, [this, FD, &CallerMVRO, &HasHigherPriorityCallee,
+                   &FoundMatchingCallee](const FunctionDecl *CurFD) {
+                const auto *CalleeTVA = CurFD->getAttr<TargetVersionAttr>();
+
+                GlobalDecl CurGD{
+                    (CurFD->isDefined() ? CurFD->getDefinition() : CurFD)};
+                StringRef MangledName = CGM.getMangledName(CurFD);
+
+                llvm::SmallVector<StringRef, 8> CalleeFeats;
+                CalleeTVA->getFeatures(CalleeFeats);
+                MultiVersionResolverOption CalleeMVRO(nullptr, "", CalleeFeats);
+
+                const TargetInfo &TI = getTarget();
+
+                // If there is a higher priority callee, we can't do the
+                // optimization at all, as it would be a valid choice at
+                // runtime.
+                if (TargetMVPriority(TI, CalleeMVRO) >
+                    TargetMVPriority(TI, CallerMVRO)) {
+                  HasHigherPriorityCallee = true;
+                  return;
+                }
+
+                // FIXME: we could allow a lower-priority match when the
+                // features are a proper subset. But for now, to keep things
+                // simpler, we only care about a precise match.
+                if (TargetMVPriority(TI, CalleeMVRO) <
+                    TargetMVPriority(TI, CallerMVRO))
+                  return;
+
+                if (llvm::Constant *Func = CGM.GetGlobalValue(MangledName)) {
+                  FoundMatchingCallee = Func;
+                  return;
+                }
+
+                if (CurFD->isDefined()) {
+                  // FIXME: not sure how to get the address
+                } else {
+                  const CGFunctionInfo &FI =
+                      getTypes().arrangeGlobalDeclaration(FD);
+                  llvm::FunctionType *Ty = getTypes().GetFunctionType(FI);
+                  FoundMatchingCallee =
+                      CGM.GetAddrOfFunction(CurGD, Ty, /*ForVTable=*/false,
+                                            /*DontDefer=*/false, ForDefinition);
+                }
+              });
+
+          if (FoundMatchingCallee && !HasHigherPriorityCallee)
+            CalleePtr = FoundMatchingCallee;
+        }
+
   // If we're using inalloca, set up that argument.
   if (ArgMemory.isValid()) {
     llvm::Value *Arg = ArgMemory.getPointer();
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 6ec54cc01c923..c334e4a3a40f3 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -4092,7 +4092,7 @@ void CodeGenModule::EmitGlobalDefinition(GlobalDecl GD, llvm::GlobalValue *GV) {
 static void ReplaceUsesOfNonProtoTypeWithRealFunction(llvm::GlobalValue *Old,
                                                       llvm::Function *NewFn);
 
-static unsigned
+unsigned
 TargetMVPriority(const TargetInfo &TI,
                  const CodeGenFunction::MultiVersionResolverOption &RO) {
   unsigned Priority = 0;
diff --git a/clang/test/CodeGen/attr-target-mv-direct-call.c b/clang/test/CodeGen/attr-target-mv-direct-call.c
new file mode 100644
index 0000000000000..687fdd1ca3c24
--- /dev/null
+++ b/clang/test/CodeGen/attr-target-mv-direct-call.c
@@ -0,0 +1,245 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -emit-llvm -o - %s | FileCheck %s
+
+// Check that we make a direct call from direct_caller._Msimd to
+// direct_callee._Msimd when there is no better option.
+__attribute__((target_version("simd"))) void direct_callee(void) {}
+__attribute__((target_version("default"))) void direct_callee(void) {}
+__attribute__((target_version("simd"))) void direct_caller(void) { direct_callee(); }
+__attribute__((target_version("default"))) void direct_caller(void) { direct_callee(); }
+
+// ... and that we go through the ifunc+resolver when there is a better option
+// that might be chosen at runtime.
+__attribute__((target_version("simd"))) void resolved_callee1(void) {}
+__attribute__((target_version("fcma"))) void resolved_callee1(void) {}
+__attribute__((target_version("default"))) void resolved_callee1(void) {}
+__attribute__((target_version("simd"))) void resolved_caller1(void) { resolved_callee1(); }
+__attribute__((target_version("default"))) void resolved_caller1(void) { resolved_callee1(); }
+
+// FIXME: we could direct call in cases like this:
+__attribute__((target_version("fp"))) void resolved_callee2(void) {}
+__attribute__((target_version("default"))) void resolved_callee2(void) {}
+__attribute__((target_version("simd+fp"))) void resolved_caller2(void) { resolved_callee2(); }
+__attribute__((target_version("default"))) void resolved_caller2(void) { resolved_callee2(); }
+
+void source() {
+    direct_caller();
+    resolved_caller1();
+    resolved_caller2();
+}
+
+//.
+// CHECK: @__aarch64_cpu_features = external dso_local global { i64 }
+// CHECK: @direct_callee.ifunc = weak_odr ifunc void (), ptr @direct_callee.resolver
+// CHECK: @direct_caller.ifunc = weak_odr ifunc void (), ptr @direct_caller.resolver
+// CHECK: @resolved_callee1.ifunc = weak_odr ifunc void (), ptr @resolved_callee1.resolver
+// CHECK: @resolved_caller1.ifunc = weak_odr ifunc void (), ptr @resolved_caller1.resolver
+// CHECK: @resolved_callee2.ifunc = weak_odr ifunc void (), ptr @resolved_callee2.resolver
+// CHECK: @resolved_caller2.ifunc = weak_odr ifunc void (), ptr @resolved_caller2.resolver
+//.
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@direct_callee._Msimd
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@direct_callee.resolver() comdat {
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 512
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 512
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK:       resolver_return:
+// CHECK-NEXT:    ret ptr @direct_callee._Msimd
+// CHECK:       resolver_else:
+// CHECK-NEXT:    ret ptr @direct_callee.default
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@direct_caller._Msimd
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @direct_callee._Msimd()
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@direct_caller.resolver() comdat {
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 512
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 512
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK:       resolver_return:
+// CHECK-NEXT:    ret ptr @direct_caller._Msimd
+// CHECK:       resolver_else:
+// CHECK-NEXT:    ret ptr @direct_caller.default
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee1._Msimd
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee1.resolver() comdat {
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 2097152
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 2097152
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK:       resolver_return:
+// CHECK-NEXT:    ret ptr @resolved_callee1._Mfcma
+// CHECK:       resolver_else:
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 512
+// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 512
+// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
+// CHECK:       resolver_return1:
+// CHECK-NEXT:    ret ptr @resolved_callee1._Msimd
+// CHECK:       resolver_else2:
+// CHECK-NEXT:    ret ptr @resolved_callee1.default
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_caller1._Msimd
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @resolved_callee1.ifunc()
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@resolved_caller1.resolver() comdat {
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 512
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 512
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK:       resolver_return:
+// CHECK-NEXT:    ret ptr @resolved_caller1._Msimd
+// CHECK:       resolver_else:
+// CHECK-NEXT:    ret ptr @resolved_caller1.default
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee2._Mfp
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee2.resolver() comdat {
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 256
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 256
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK:       resolver_return:
+// CHECK-NEXT:    ret ptr @resolved_callee2._Mfp
+// CHECK:       resolver_else:
+// CHECK-NEXT:    ret ptr @resolved_callee2.default
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_caller2._MfpMsimd
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @resolved_callee2.ifunc()
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@resolved_caller2.resolver() comdat {
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 768
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 768
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK:       resolver_return:
+// CHECK-NEXT:    ret ptr @resolved_caller2._MfpMsimd
+// CHECK:       resolver_else:
+// CHECK-NEXT:    ret ptr @resolved_caller2.default
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@source
+// CHECK-SAME: () #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @direct_caller.ifunc()
+// CHECK-NEXT:    call void @resolved_caller1.ifunc()
+// CHECK-NEXT:    call void @resolved_caller2.ifunc()
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@direct_callee.default
+// CHECK-SAME: () #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@direct_caller.default
+// CHECK-SAME: () #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @direct_callee.ifunc()
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee1._Mfcma
+// CHECK-SAME: () #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee1.default
+// CHECK-SAME: () #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_caller1.default
+// CHECK-SAME: () #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @resolved_callee1.ifunc()
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee2.default
+// CHECK-SAME: () #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_caller2.default
+// CHECK-SAME: () #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @resolved_callee2.ifunc()
+// CHECK-NEXT:    ret void
+//
+//.
+// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" }
+// CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+complxnum,+fp-armv8,+neon" }
+//.
+// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.

>From 712643868509f64ee820688579b0278b8783456c Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs at apple.com>
Date: Wed, 31 Jan 2024 08:53:36 -0800
Subject: [PATCH 2/4] don't do it at -O0

---
 clang/lib/CodeGen/CGCall.cpp                  | 127 ++---
 .../test/CodeGen/attr-target-mv-direct-call.c | 453 +++++++++---------
 2 files changed, 284 insertions(+), 296 deletions(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 84a04e3ccddd8..fe69b0ab133ac 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -5446,68 +5446,71 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   // resolver when there is a precise match on the feature sets, and no
   // possibility of a better match at runtime.
   if (const auto *CallerFD = dyn_cast_or_null<FunctionDecl>(CurGD.getDecl()))
-    if (const auto *CallerTVA = CallerFD->getAttr<TargetVersionAttr>())
-      if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(TargetDecl))
-        // FIXME: do the same where either the caller or callee are
-        // target_clones.
-        if (FD->isTargetMultiVersion()) {
-          llvm::SmallVector<StringRef, 8> CallerFeats;
-          CallerTVA->getFeatures(CallerFeats);
-          MultiVersionResolverOption CallerMVRO(nullptr, "", CallerFeats);
-
-          bool HasHigherPriorityCallee = false;
-          llvm::Constant *FoundMatchingCallee = nullptr;
-          getContext().forEachMultiversionedFunctionVersion(
-              FD, [this, FD, &CallerMVRO, &HasHigherPriorityCallee,
-                   &FoundMatchingCallee](const FunctionDecl *CurFD) {
-                const auto *CalleeTVA = CurFD->getAttr<TargetVersionAttr>();
-
-                GlobalDecl CurGD{
-                    (CurFD->isDefined() ? CurFD->getDefinition() : CurFD)};
-                StringRef MangledName = CGM.getMangledName(CurFD);
-
-                llvm::SmallVector<StringRef, 8> CalleeFeats;
-                CalleeTVA->getFeatures(CalleeFeats);
-                MultiVersionResolverOption CalleeMVRO(nullptr, "", CalleeFeats);
-
-                const TargetInfo &TI = getTarget();
-
-                // If there is a higher priority callee, we can't do the
-                // optimization at all, as it would be a valid choice at
-                // runtime.
-                if (TargetMVPriority(TI, CalleeMVRO) >
-                    TargetMVPriority(TI, CallerMVRO)) {
-                  HasHigherPriorityCallee = true;
-                  return;
-                }
-
-                // FIXME: we could allow a lower-priority match when the
-                // features are a proper subset. But for now, to keep things
-                // simpler, we only care about a precise match.
-                if (TargetMVPriority(TI, CalleeMVRO) <
-                    TargetMVPriority(TI, CallerMVRO))
-                  return;
-
-                if (llvm::Constant *Func = CGM.GetGlobalValue(MangledName)) {
-                  FoundMatchingCallee = Func;
-                  return;
-                }
-
-                if (CurFD->isDefined()) {
-                  // FIXME: not sure how to get the address
-                } else {
-                  const CGFunctionInfo &FI =
-                      getTypes().arrangeGlobalDeclaration(FD);
-                  llvm::FunctionType *Ty = getTypes().GetFunctionType(FI);
-                  FoundMatchingCallee =
-                      CGM.GetAddrOfFunction(CurGD, Ty, /*ForVTable=*/false,
-                                            /*DontDefer=*/false, ForDefinition);
-                }
-              });
-
-          if (FoundMatchingCallee && !HasHigherPriorityCallee)
-            CalleePtr = FoundMatchingCallee;
-        }
+    if (CGM.getCodeGenOpts().OptimizationLevel > 0 &&
+        !CallerFD->hasAttr<OptimizeNoneAttr>())
+      if (const auto *CallerTVA = CallerFD->getAttr<TargetVersionAttr>())
+        if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(TargetDecl))
+          // FIXME: do the same where either the caller or callee are
+          // target_clones.
+          if (FD->isTargetMultiVersion()) {
+            llvm::SmallVector<StringRef, 8> CallerFeats;
+            CallerTVA->getFeatures(CallerFeats);
+            MultiVersionResolverOption CallerMVRO(nullptr, "", CallerFeats);
+
+            bool HasHigherPriorityCallee = false;
+            llvm::Constant *FoundMatchingCallee = nullptr;
+            getContext().forEachMultiversionedFunctionVersion(
+                FD, [this, FD, &CallerMVRO, &HasHigherPriorityCallee,
+                     &FoundMatchingCallee](const FunctionDecl *CurFD) {
+                  const auto *CalleeTVA = CurFD->getAttr<TargetVersionAttr>();
+
+                  GlobalDecl CurGD{
+                      (CurFD->isDefined() ? CurFD->getDefinition() : CurFD)};
+                  StringRef MangledName = CGM.getMangledName(CurFD);
+
+                  llvm::SmallVector<StringRef, 8> CalleeFeats;
+                  CalleeTVA->getFeatures(CalleeFeats);
+                  MultiVersionResolverOption CalleeMVRO(nullptr, "",
+                                                        CalleeFeats);
+
+                  const TargetInfo &TI = getTarget();
+
+                  // If there is a higher priority callee, we can't do the
+                  // optimization at all, as it would be a valid choice at
+                  // runtime.
+                  if (TargetMVPriority(TI, CalleeMVRO) >
+                      TargetMVPriority(TI, CallerMVRO)) {
+                    HasHigherPriorityCallee = true;
+                    return;
+                  }
+
+                  // FIXME: we could allow a lower-priority match when the
+                  // features are a proper subset. But for now, to keep things
+                  // simpler, we only care about a precise match.
+                  if (TargetMVPriority(TI, CalleeMVRO) <
+                      TargetMVPriority(TI, CallerMVRO))
+                    return;
+
+                  if (llvm::Constant *Func = CGM.GetGlobalValue(MangledName)) {
+                    FoundMatchingCallee = Func;
+                    return;
+                  }
+
+                  if (CurFD->isDefined()) {
+                    // FIXME: not sure how to get the address
+                  } else {
+                    const CGFunctionInfo &FI =
+                        getTypes().arrangeGlobalDeclaration(FD);
+                    llvm::FunctionType *Ty = getTypes().GetFunctionType(FI);
+                    FoundMatchingCallee = CGM.GetAddrOfFunction(
+                        CurGD, Ty, /*ForVTable=*/false,
+                        /*DontDefer=*/false, ForDefinition);
+                  }
+                });
+
+            if (FoundMatchingCallee && !HasHigherPriorityCallee)
+              CalleePtr = FoundMatchingCallee;
+          }
 
   // If we're using inalloca, set up that argument.
   if (ArgMemory.isValid()) {
diff --git a/clang/test/CodeGen/attr-target-mv-direct-call.c b/clang/test/CodeGen/attr-target-mv-direct-call.c
index 687fdd1ca3c24..9a465dba52d25 100644
--- a/clang/test/CodeGen/attr-target-mv-direct-call.c
+++ b/clang/test/CodeGen/attr-target-mv-direct-call.c
@@ -1,245 +1,230 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -emit-llvm -o - %s | FileCheck %s
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --filter "call i32" --include-generated-funcs
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -O0 -S -emit-llvm -disable-llvm-optzns -o - %s | FileCheck %s --check-prefixes=CHECK,O0
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -O2 -S -emit-llvm -disable-llvm-optzns -o - %s | FileCheck %s --check-prefixes=CHECK,O2
 
 // Check that we make a direct call from direct_caller._Msimd to
 // direct_callee._Msimd when there is no better option.
-__attribute__((target_version("simd"))) void direct_callee(void) {}
-__attribute__((target_version("default"))) void direct_callee(void) {}
-__attribute__((target_version("simd"))) void direct_caller(void) { direct_callee(); }
-__attribute__((target_version("default"))) void direct_caller(void) { direct_callee(); }
+__attribute__((target_version("simd"))) int direct_callee(void) { return 1; }
+__attribute__((target_version("default"))) int direct_callee(void) { return 2; }
+__attribute__((target_version("simd"))) int direct_caller(void) { return direct_callee(); }
+__attribute__((target_version("default"))) int direct_caller(void) { return direct_callee(); }
+
+__attribute__((target_version("simd"), optnone)) int optnone_caller(void) { return direct_callee(); }
+__attribute__((target_version("default"), optnone)) int optnone_caller(void) { return direct_callee(); }
 
 // ... and that we go through the ifunc+resolver when there is a better option
 // that might be chosen at runtime.
-__attribute__((target_version("simd"))) void resolved_callee1(void) {}
-__attribute__((target_version("fcma"))) void resolved_callee1(void) {}
-__attribute__((target_version("default"))) void resolved_callee1(void) {}
-__attribute__((target_version("simd"))) void resolved_caller1(void) { resolved_callee1(); }
-__attribute__((target_version("default"))) void resolved_caller1(void) { resolved_callee1(); }
+__attribute__((target_version("simd"))) int resolved_callee1(void) { return 3; }
+__attribute__((target_version("fcma"))) int resolved_callee1(void) { return 4; }
+__attribute__((target_version("default"))) int resolved_callee1(void) { return 5; }
+__attribute__((target_version("simd"))) int resolved_caller1(void) { return resolved_callee1(); }
+__attribute__((target_version("default"))) int resolved_caller1(void) { return resolved_callee1(); }
 
 // FIXME: we could direct call in cases like this:
-__attribute__((target_version("fp"))) void resolved_callee2(void) {}
-__attribute__((target_version("default"))) void resolved_callee2(void) {}
-__attribute__((target_version("simd+fp"))) void resolved_caller2(void) { resolved_callee2(); }
-__attribute__((target_version("default"))) void resolved_caller2(void) { resolved_callee2(); }
+__attribute__((target_version("fp"))) int resolved_callee2(void) { return 6; }
+__attribute__((target_version("default"))) int resolved_callee2(void) { return 7; }
+__attribute__((target_version("simd+fp"))) int resolved_caller2(void) { return resolved_callee2(); }
+__attribute__((target_version("default"))) int resolved_caller2(void) { return resolved_callee2(); }
 
-void source() {
-    direct_caller();
-    resolved_caller1();
-    resolved_caller2();
+int source() {
+    return direct_caller() + optnone_caller() + resolved_caller1() + resolved_caller2();
 }
 
-//.
-// CHECK: @__aarch64_cpu_features = external dso_local global { i64 }
-// CHECK: @direct_callee.ifunc = weak_odr ifunc void (), ptr @direct_callee.resolver
-// CHECK: @direct_caller.ifunc = weak_odr ifunc void (), ptr @direct_caller.resolver
-// CHECK: @resolved_callee1.ifunc = weak_odr ifunc void (), ptr @resolved_callee1.resolver
-// CHECK: @resolved_caller1.ifunc = weak_odr ifunc void (), ptr @resolved_caller1.resolver
-// CHECK: @resolved_callee2.ifunc = weak_odr ifunc void (), ptr @resolved_callee2.resolver
-// CHECK: @resolved_caller2.ifunc = weak_odr ifunc void (), ptr @resolved_caller2.resolver
-//.
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@direct_callee._Msimd
-// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK-LABEL: define {{[^@]+}}@direct_callee.resolver() comdat {
-// CHECK-NEXT:  resolver_entry:
-// CHECK-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 512
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 512
-// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @direct_callee._Msimd
-// CHECK:       resolver_else:
-// CHECK-NEXT:    ret ptr @direct_callee.default
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@direct_caller._Msimd
-// CHECK-SAME: () #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @direct_callee._Msimd()
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK-LABEL: define {{[^@]+}}@direct_caller.resolver() comdat {
-// CHECK-NEXT:  resolver_entry:
-// CHECK-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 512
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 512
-// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @direct_caller._Msimd
-// CHECK:       resolver_else:
-// CHECK-NEXT:    ret ptr @direct_caller.default
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@resolved_callee1._Msimd
-// CHECK-SAME: () #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK-LABEL: define {{[^@]+}}@resolved_callee1.resolver() comdat {
-// CHECK-NEXT:  resolver_entry:
-// CHECK-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 2097152
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 2097152
-// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @resolved_callee1._Mfcma
-// CHECK:       resolver_else:
-// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 512
-// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 512
-// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
-// CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
-// CHECK:       resolver_return1:
-// CHECK-NEXT:    ret ptr @resolved_callee1._Msimd
-// CHECK:       resolver_else2:
-// CHECK-NEXT:    ret ptr @resolved_callee1.default
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@resolved_caller1._Msimd
-// CHECK-SAME: () #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @resolved_callee1.ifunc()
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK-LABEL: define {{[^@]+}}@resolved_caller1.resolver() comdat {
-// CHECK-NEXT:  resolver_entry:
-// CHECK-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 512
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 512
-// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @resolved_caller1._Msimd
-// CHECK:       resolver_else:
-// CHECK-NEXT:    ret ptr @resolved_caller1.default
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@resolved_callee2._Mfp
-// CHECK-SAME: () #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK-LABEL: define {{[^@]+}}@resolved_callee2.resolver() comdat {
-// CHECK-NEXT:  resolver_entry:
-// CHECK-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 256
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 256
-// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @resolved_callee2._Mfp
-// CHECK:       resolver_else:
-// CHECK-NEXT:    ret ptr @resolved_callee2.default
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@resolved_caller2._MfpMsimd
-// CHECK-SAME: () #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @resolved_callee2.ifunc()
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK-LABEL: define {{[^@]+}}@resolved_caller2.resolver() comdat {
-// CHECK-NEXT:  resolver_entry:
-// CHECK-NEXT:    call void @__init_cpu_features_resolver()
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 768
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 768
-// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
-// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @resolved_caller2._MfpMsimd
-// CHECK:       resolver_else:
-// CHECK-NEXT:    ret ptr @resolved_caller2.default
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@source
-// CHECK-SAME: () #[[ATTR1:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @direct_caller.ifunc()
-// CHECK-NEXT:    call void @resolved_caller1.ifunc()
-// CHECK-NEXT:    call void @resolved_caller2.ifunc()
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@direct_callee.default
-// CHECK-SAME: () #[[ATTR1]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@direct_caller.default
-// CHECK-SAME: () #[[ATTR1]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @direct_callee.ifunc()
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@resolved_callee1._Mfcma
-// CHECK-SAME: () #[[ATTR2:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@resolved_callee1.default
-// CHECK-SAME: () #[[ATTR1]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@resolved_caller1.default
-// CHECK-SAME: () #[[ATTR1]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @resolved_callee1.ifunc()
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@resolved_callee2.default
-// CHECK-SAME: () #[[ATTR1]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret void
-//
-//
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: define {{[^@]+}}@resolved_caller2.default
-// CHECK-SAME: () #[[ATTR1]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void @resolved_callee2.ifunc()
-// CHECK-NEXT:    ret void
-//
-//.
-// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" }
-// CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+complxnum,+fp-armv8,+neon" }
-//.
-// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
-// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
-//.
+// CHECK-LABEL: @direct_callee._Msimd(
+//
+// CHECK-LABEL: @direct_callee.resolver(
+//
+//
+// CHECK-LABEL: @direct_caller.resolver(
+//
+// CHECK-LABEL: @optnone_caller._Msimd(
+// CHECK:    [[CALL:%.*]] = call i32 @direct_callee.ifunc()
+//
+//
+// CHECK-LABEL: @optnone_caller.resolver(
+//
+// CHECK-LABEL: @resolved_callee1._Msimd(
+//
+// CHECK-LABEL: @resolved_callee1.resolver(
+//
+// CHECK-LABEL: @resolved_caller1._Msimd(
+// CHECK:    [[CALL:%.*]] = call i32 @resolved_callee1.ifunc()
+//
+//
+// CHECK-LABEL: @resolved_caller1.resolver(
+//
+// CHECK-LABEL: @resolved_callee2._Mfp(
+//
+// CHECK-LABEL: @resolved_callee2.resolver(
+//
+// CHECK-LABEL: @resolved_caller2._MfpMsimd(
+// CHECK:    [[CALL:%.*]] = call i32 @resolved_callee2.ifunc()
+//
+//
+// CHECK-LABEL: @resolved_caller2.resolver(
+//
+// CHECK-LABEL: @source(
+// CHECK:    [[CALL:%.*]] = call i32 @direct_caller.ifunc()
+// CHECK:    [[CALL1:%.*]] = call i32 @optnone_caller.ifunc()
+// CHECK:    [[CALL2:%.*]] = call i32 @resolved_caller1.ifunc()
+// CHECK:    [[CALL4:%.*]] = call i32 @resolved_caller2.ifunc()
+//
+//
+// CHECK-LABEL: @direct_callee.default(
+//
+// CHECK-LABEL: @direct_caller.default(
+// CHECK:    [[CALL:%.*]] = call i32 @direct_callee.ifunc()
+//
+//
+// CHECK-LABEL: @optnone_caller.default(
+// CHECK:    [[CALL:%.*]] = call i32 @direct_callee.ifunc()
+//
+//
+// CHECK-LABEL: @resolved_callee1._Mfcma(
+//
+// CHECK-LABEL: @resolved_callee1.default(
+//
+// CHECK-LABEL: @resolved_caller1.default(
+// CHECK:    [[CALL:%.*]] = call i32 @resolved_callee1.ifunc()
+//
+//
+// CHECK-LABEL: @resolved_callee2.default(
+//
+// CHECK-LABEL: @resolved_caller2.default(
+// CHECK:    [[CALL:%.*]] = call i32 @resolved_callee2.ifunc()
+//
+//
+// O0-LABEL: @direct_callee._Msimd(
+//
+// O0-LABEL: @direct_callee.resolver(
+//
+// O0-LABEL: @direct_caller._Msimd(
+// O0:    [[CALL:%.*]] = call i32 @direct_callee.ifunc()
+//
+//
+// O0-LABEL: @direct_caller.resolver(
+//
+// O0-LABEL: @optnone_caller._Msimd(
+// O0:    [[CALL:%.*]] = call i32 @direct_callee.ifunc()
+//
+//
+// O0-LABEL: @optnone_caller.resolver(
+//
+// O0-LABEL: @resolved_callee1._Msimd(
+//
+// O0-LABEL: @resolved_callee1.resolver(
+//
+// O0-LABEL: @resolved_caller1._Msimd(
+// O0:    [[CALL:%.*]] = call i32 @resolved_callee1.ifunc()
+//
+//
+// O0-LABEL: @resolved_caller1.resolver(
+//
+// O0-LABEL: @resolved_callee2._Mfp(
+//
+// O0-LABEL: @resolved_callee2.resolver(
+//
+// O0-LABEL: @resolved_caller2._MfpMsimd(
+// O0:    [[CALL:%.*]] = call i32 @resolved_callee2.ifunc()
+//
+//
+// O0-LABEL: @resolved_caller2.resolver(
+//
+// O0-LABEL: @source(
+// O0:    [[CALL:%.*]] = call i32 @direct_caller.ifunc()
+// O0:    [[CALL1:%.*]] = call i32 @optnone_caller.ifunc()
+// O0:    [[CALL2:%.*]] = call i32 @resolved_caller1.ifunc()
+// O0:    [[CALL4:%.*]] = call i32 @resolved_caller2.ifunc()
+//
+//
+// O0-LABEL: @direct_callee.default(
+//
+// O0-LABEL: @direct_caller.default(
+// O0:    [[CALL:%.*]] = call i32 @direct_callee.ifunc()
+//
+//
+// O0-LABEL: @optnone_caller.default(
+// O0:    [[CALL:%.*]] = call i32 @direct_callee.ifunc()
+//
+//
+// O0-LABEL: @resolved_callee1._Mfcma(
+//
+// O0-LABEL: @resolved_callee1.default(
+//
+// O0-LABEL: @resolved_caller1.default(
+// O0:    [[CALL:%.*]] = call i32 @resolved_callee1.ifunc()
+//
+//
+// O0-LABEL: @resolved_callee2.default(
+//
+// O0-LABEL: @resolved_caller2.default(
+// O0:    [[CALL:%.*]] = call i32 @resolved_callee2.ifunc()
+//
+//
+//
+// O2-LABEL: @direct_callee._Msimd(
+//
+// O2-LABEL: @direct_callee.resolver(
+//
+// O2-LABEL: @direct_caller._Msimd(
+// O2:    [[CALL:%.*]] = call i32 @direct_callee._Msimd()
+//
+//
+// O2-LABEL: @direct_caller.resolver(
+//
+// O2-LABEL: @optnone_caller._Msimd(
+// O2:    [[CALL:%.*]] = call i32 @direct_callee.ifunc()
+//
+//
+// O2-LABEL: @optnone_caller.resolver(
+//
+// O2-LABEL: @resolved_callee1._Msimd(
+//
+// O2-LABEL: @resolved_callee1.resolver(
+//
+// O2-LABEL: @resolved_caller1._Msimd(
+// O2:    [[CALL:%.*]] = call i32 @resolved_callee1.ifunc()
+//
+//
+// O2-LABEL: @resolved_caller1.resolver(
+//
+// O2-LABEL: @resolved_callee2._Mfp(
+//
+// O2-LABEL: @resolved_callee2.resolver(
+//
+// O2-LABEL: @resolved_caller2._MfpMsimd(
+// O2:    [[CALL:%.*]] = call i32 @resolved_callee2.ifunc()
+//
+//
+// O2-LABEL: @resolved_caller2.resolver(
+//
+// O2-LABEL: @source(
+// O2:    [[CALL:%.*]] = call i32 @direct_caller.ifunc()
+// O2:    [[CALL1:%.*]] = call i32 @optnone_caller.ifunc()
+// O2:    [[CALL2:%.*]] = call i32 @resolved_caller1.ifunc()
+// O2:    [[CALL4:%.*]] = call i32 @resolved_caller2.ifunc()
+//
+//
+// O2-LABEL: @direct_callee.default(
+//
+// O2-LABEL: @direct_caller.default(
+// O2:    [[CALL:%.*]] = call i32 @direct_callee.ifunc()
+//
+//
+// O2-LABEL: @optnone_caller.default(
+// O2:    [[CALL:%.*]] = call i32 @direct_callee.ifunc()
+//
+//
+// O2-LABEL: @resolved_callee1._Mfcma(
+//
+// O2-LABEL: @resolved_callee1.default(
+//
+// O2-LABEL: @resolved_caller1.default(
+// O2:    [[CALL:%.*]] = call i32 @resolved_callee1.ifunc()
+//
+//
+// O2-LABEL: @resolved_callee2.default(
+//
+// O2-LABEL: @resolved_caller2.default(
+// O2:    [[CALL:%.*]] = call i32 @resolved_callee2.ifunc()
+//

>From cadeb37acd2786f29984645420e8a715e8ccfe8b Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs at apple.com>
Date: Wed, 31 Jan 2024 08:55:43 -0800
Subject: [PATCH 3/4] only care about caller => callee calls in the test

---
 clang/test/CodeGen/attr-target-mv-direct-call.c | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/clang/test/CodeGen/attr-target-mv-direct-call.c b/clang/test/CodeGen/attr-target-mv-direct-call.c
index 9a465dba52d25..88b4568d0c084 100644
--- a/clang/test/CodeGen/attr-target-mv-direct-call.c
+++ b/clang/test/CodeGen/attr-target-mv-direct-call.c
@@ -1,4 +1,4 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --filter "call i32" --include-generated-funcs
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --filter "call i32.*callee" --include-generated-funcs
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -O0 -S -emit-llvm -disable-llvm-optzns -o - %s | FileCheck %s --check-prefixes=CHECK,O0
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -O2 -S -emit-llvm -disable-llvm-optzns -o - %s | FileCheck %s --check-prefixes=CHECK,O2
 
@@ -64,11 +64,6 @@ int source() {
 // CHECK-LABEL: @resolved_caller2.resolver(
 //
 // CHECK-LABEL: @source(
-// CHECK:    [[CALL:%.*]] = call i32 @direct_caller.ifunc()
-// CHECK:    [[CALL1:%.*]] = call i32 @optnone_caller.ifunc()
-// CHECK:    [[CALL2:%.*]] = call i32 @resolved_caller1.ifunc()
-// CHECK:    [[CALL4:%.*]] = call i32 @resolved_caller2.ifunc()
-//
 //
 // CHECK-LABEL: @direct_callee.default(
 //
@@ -131,11 +126,6 @@ int source() {
 // O0-LABEL: @resolved_caller2.resolver(
 //
 // O0-LABEL: @source(
-// O0:    [[CALL:%.*]] = call i32 @direct_caller.ifunc()
-// O0:    [[CALL1:%.*]] = call i32 @optnone_caller.ifunc()
-// O0:    [[CALL2:%.*]] = call i32 @resolved_caller1.ifunc()
-// O0:    [[CALL4:%.*]] = call i32 @resolved_caller2.ifunc()
-//
 //
 // O0-LABEL: @direct_callee.default(
 //
@@ -199,11 +189,6 @@ int source() {
 // O2-LABEL: @resolved_caller2.resolver(
 //
 // O2-LABEL: @source(
-// O2:    [[CALL:%.*]] = call i32 @direct_caller.ifunc()
-// O2:    [[CALL1:%.*]] = call i32 @optnone_caller.ifunc()
-// O2:    [[CALL2:%.*]] = call i32 @resolved_caller1.ifunc()
-// O2:    [[CALL4:%.*]] = call i32 @resolved_caller2.ifunc()
-//
 //
 // O2-LABEL: @direct_callee.default(
 //

>From 4129daf8de38949970bb00b226a2d8df1662a0db Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs at apple.com>
Date: Wed, 31 Jan 2024 09:18:50 -0800
Subject: [PATCH 4/4] address a fixme: move priority calculation to a member
 function

---
 clang/lib/CodeGen/CGCall.cpp          | 11 ++---------
 clang/lib/CodeGen/CodeGenFunction.cpp | 18 ++++++++++++++++++
 clang/lib/CodeGen/CodeGenFunction.h   |  2 ++
 clang/lib/CodeGen/CodeGenModule.cpp   | 21 +--------------------
 4 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index fe69b0ab133ac..3f6e171c67345 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -4966,11 +4966,6 @@ static unsigned getMaxVectorWidth(const llvm::Type *Ty) {
   return MaxVectorWidth;
 }
 
-// FIXME: put this somewhere nicer to share
-unsigned
-TargetMVPriority(const TargetInfo &TI,
-                 const CodeGenFunction::MultiVersionResolverOption &RO);
-
 RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
                                  const CGCallee &Callee,
                                  ReturnValueSlot ReturnValue,
@@ -5478,8 +5473,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
                   // If there is a higher priority callee, we can't do the
                   // optimization at all, as it would be a valid choice at
                   // runtime.
-                  if (TargetMVPriority(TI, CalleeMVRO) >
-                      TargetMVPriority(TI, CallerMVRO)) {
+                  if (CalleeMVRO.priority(TI) > CallerMVRO.priority(TI)) {
                     HasHigherPriorityCallee = true;
                     return;
                   }
@@ -5487,8 +5481,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
                   // FIXME: we could allow a lower-priority match when the
                   // features are a proper subset. But for now, to keep things
                   // simpler, we only care about a precise match.
-                  if (TargetMVPriority(TI, CalleeMVRO) <
-                      TargetMVPriority(TI, CallerMVRO))
+                  if (CalleeMVRO.priority(TI) < CallerMVRO.priority(TI))
                     return;
 
                   if (llvm::Constant *Func = CGM.GetGlobalValue(MangledName)) {
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 1ad905078d349..f2c93b5e5398b 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -2697,6 +2697,24 @@ void CodeGenFunction::EmitSanitizerStatReport(llvm::SanitizerStatKind SSK) {
   CGM.getSanStats().create(IRB, SSK);
 }
 
+unsigned CodeGenFunction::MultiVersionResolverOption::priority(
+    const TargetInfo &TI) const { // Returns this option's priority value.
+  unsigned Priority = 0; // Running max of the per-condition sort priorities.
+  unsigned NumFeatures = 0; // Number of entries seen in Conditions.Features.
+  for (StringRef Feat : Conditions.Features) {
+    Priority = std::max(Priority, TI.multiVersionSortPriority(Feat));
+    NumFeatures++;
+  }
+
+  if (!Conditions.Architecture.empty()) // Architecture also feeds the max.
+    Priority = std::max(Priority,
+                        TI.multiVersionSortPriority(Conditions.Architecture));
+
+  Priority += TI.multiVersionFeatureCost() * NumFeatures; // Cost per feature.
+
+  return Priority;
+}
+
 void CodeGenFunction::EmitKCFIOperandBundle(
     const CGCallee &Callee, SmallVectorImpl<llvm::OperandBundleDef> &Bundles) {
   const FunctionProtoType *FP =
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 143ad64e8816b..525852437dbb8 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4965,6 +4965,8 @@ class CodeGenFunction : public CodeGenTypeCache {
     MultiVersionResolverOption(llvm::Function *F, StringRef Arch,
                                ArrayRef<StringRef> Feats)
         : Function(F), Conditions(Arch, Feats) {}
+
+    unsigned priority(const TargetInfo &TI) const;
   };
 
   // Emits the body of a multiversion function's resolver. Assumes that the
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index c334e4a3a40f3..d6abd4cc9454d 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -4092,25 +4092,6 @@ void CodeGenModule::EmitGlobalDefinition(GlobalDecl GD, llvm::GlobalValue *GV) {
 static void ReplaceUsesOfNonProtoTypeWithRealFunction(llvm::GlobalValue *Old,
                                                       llvm::Function *NewFn);
 
-unsigned
-TargetMVPriority(const TargetInfo &TI,
-                 const CodeGenFunction::MultiVersionResolverOption &RO) {
-  unsigned Priority = 0;
-  unsigned NumFeatures = 0;
-  for (StringRef Feat : RO.Conditions.Features) {
-    Priority = std::max(Priority, TI.multiVersionSortPriority(Feat));
-    NumFeatures++;
-  }
-
-  if (!RO.Conditions.Architecture.empty())
-    Priority = std::max(
-        Priority, TI.multiVersionSortPriority(RO.Conditions.Architecture));
-
-  Priority += TI.multiVersionFeatureCost() * NumFeatures;
-
-  return Priority;
-}
-
 // Multiversion functions should be at most 'WeakODRLinkage' so that a different
 // TU can forward declare the function without causing problems.  Particularly
 // in the cases of CPUDispatch, this causes issues. This also makes sure we
@@ -4244,7 +4225,7 @@ void CodeGenModule::emitMultiVersionFunctions() {
     llvm::stable_sort(
         Options, [&TI](const CodeGenFunction::MultiVersionResolverOption &LHS,
                        const CodeGenFunction::MultiVersionResolverOption &RHS) {
-          return TargetMVPriority(TI, LHS) > TargetMVPriority(TI, RHS);
+          return LHS.priority(TI) > RHS.priority(TI);
         });
     CodeGenFunction CGF(*this);
     CGF.EmitMultiVersionResolver(ResolverFunc, Options);



More information about the cfe-commits mailing list