[clang] [clang][FMV] Direct-call multi-versioned callees from multi-versioned callers (PR #80093)

Jon Roelofs via cfe-commits cfe-commits at lists.llvm.org
Tue Jan 30 19:20:39 PST 2024


https://github.com/jroelofs created https://github.com/llvm/llvm-project/pull/80093

Direct-call multi-versioned callees from multi-versioned callers when there is a callee with a matching feature set and no other higher-priority callee. This optimization helps the inliner see past the ifunc+resolver to the callee that the call is known to always land on.

This is a conservative implementation of: https://github.com/llvm/llvm-project/issues/71714

>From ed52ee4424459ebc046a625341ad8dbbd38bcbe3 Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs at apple.com>
Date: Tue, 30 Jan 2024 19:13:42 -0800
Subject: [PATCH] [clang][FMV] Direct-call multi-versioned callees from
 multi-versioned callers

... when there is a callee with a matching feature set, and no other higher
priority callee.  This optimization helps the inliner see past the
ifunc+resolver to the callee that we know it will always land on.

This is a conservative implementation of: https://github.com/llvm/llvm-project/issues/71714
---
 clang/lib/CodeGen/CGCall.cpp                  |  72 +++++
 clang/lib/CodeGen/CodeGenModule.cpp           |   2 +-
 .../test/CodeGen/attr-target-mv-direct-call.c | 245 ++++++++++++++++++
 3 files changed, 318 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGen/attr-target-mv-direct-call.c

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 28c211aa631e4..84a04e3ccddd8 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -4966,6 +4966,11 @@ static unsigned getMaxVectorWidth(const llvm::Type *Ty) {
   return MaxVectorWidth;
 }
 
+// FIXME: put this somewhere nicer to share
+unsigned
+TargetMVPriority(const TargetInfo &TI,
+                 const CodeGenFunction::MultiVersionResolverOption &RO);
+
 RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
                                  const CGCallee &Callee,
                                  ReturnValueSlot ReturnValue,
@@ -5437,6 +5442,73 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   const CGCallee &ConcreteCallee = Callee.prepareConcreteCallee(*this);
   llvm::Value *CalleePtr = ConcreteCallee.getFunctionPointer();
 
+  // If a multi-versioned caller calls a multi-versioned callee, skip the
+  // resolver when there is a precise match on the feature sets, and no
+  // possibility of a better match at runtime.
+  if (const auto *CallerFD = dyn_cast_or_null<FunctionDecl>(CurGD.getDecl()))
+    if (const auto *CallerTVA = CallerFD->getAttr<TargetVersionAttr>())
+      if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(TargetDecl))
+        // FIXME: do the same where either the caller or callee are
+        // target_clones.
+        if (FD->isTargetMultiVersion()) {
+          llvm::SmallVector<StringRef, 8> CallerFeats;
+          CallerTVA->getFeatures(CallerFeats);
+          MultiVersionResolverOption CallerMVRO(nullptr, "", CallerFeats);
+
+          bool HasHigherPriorityCallee = false;
+          llvm::Constant *FoundMatchingCallee = nullptr;
+          getContext().forEachMultiversionedFunctionVersion(
+              FD, [this, FD, &CallerMVRO, &HasHigherPriorityCallee,
+                   &FoundMatchingCallee](const FunctionDecl *CurFD) {
+                const auto *CalleeTVA = CurFD->getAttr<TargetVersionAttr>();
+
+                GlobalDecl CurGD{
+                    (CurFD->isDefined() ? CurFD->getDefinition() : CurFD)};
+                StringRef MangledName = CGM.getMangledName(CurFD);
+
+                llvm::SmallVector<StringRef, 8> CalleeFeats;
+                CalleeTVA->getFeatures(CalleeFeats);
+                MultiVersionResolverOption CalleeMVRO(nullptr, "", CalleeFeats);
+
+                const TargetInfo &TI = getTarget();
+
+                // If there is a higher priority callee, we can't do the
+                // optimization at all, as it would be a valid choice at
+                // runtime.
+                if (TargetMVPriority(TI, CalleeMVRO) >
+                    TargetMVPriority(TI, CallerMVRO)) {
+                  HasHigherPriorityCallee = true;
+                  return;
+                }
+
+                // FIXME: we could allow a lower-priority match when the
+                // features are a proper subset. But for now, to keep things
+                // simpler, we only care about a precise match.
+                if (TargetMVPriority(TI, CalleeMVRO) <
+                    TargetMVPriority(TI, CallerMVRO))
+                  return;
+
+                if (llvm::Constant *Func = CGM.GetGlobalValue(MangledName)) {
+                  FoundMatchingCallee = Func;
+                  return;
+                }
+
+                if (CurFD->isDefined()) {
+                  // FIXME: not sure how to get the address
+                } else {
+                  const CGFunctionInfo &FI =
+                      getTypes().arrangeGlobalDeclaration(FD);
+                  llvm::FunctionType *Ty = getTypes().GetFunctionType(FI);
+                  FoundMatchingCallee =
+                      CGM.GetAddrOfFunction(CurGD, Ty, /*ForVTable=*/false,
+                                            /*DontDefer=*/false, ForDefinition);
+                }
+              });
+
+          if (FoundMatchingCallee && !HasHigherPriorityCallee)
+            CalleePtr = FoundMatchingCallee;
+        }
+
   // If we're using inalloca, set up that argument.
   if (ArgMemory.isValid()) {
     llvm::Value *Arg = ArgMemory.getPointer();
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 6ec54cc01c923..c334e4a3a40f3 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -4092,7 +4092,7 @@ void CodeGenModule::EmitGlobalDefinition(GlobalDecl GD, llvm::GlobalValue *GV) {
 static void ReplaceUsesOfNonProtoTypeWithRealFunction(llvm::GlobalValue *Old,
                                                       llvm::Function *NewFn);
 
-static unsigned
+unsigned
 TargetMVPriority(const TargetInfo &TI,
                  const CodeGenFunction::MultiVersionResolverOption &RO) {
   unsigned Priority = 0;
diff --git a/clang/test/CodeGen/attr-target-mv-direct-call.c b/clang/test/CodeGen/attr-target-mv-direct-call.c
new file mode 100644
index 0000000000000..687fdd1ca3c24
--- /dev/null
+++ b/clang/test/CodeGen/attr-target-mv-direct-call.c
@@ -0,0 +1,245 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -emit-llvm -o - %s | FileCheck %s
+
+// Check that we make a direct call from direct_caller._Msimd to
+// direct_callee._Msimd when there is no better option.
+__attribute__((target_version("simd"))) void direct_callee(void) {}
+__attribute__((target_version("default"))) void direct_callee(void) {}
+__attribute__((target_version("simd"))) void direct_caller(void) { direct_callee(); }
+__attribute__((target_version("default"))) void direct_caller(void) { direct_callee(); }
+
+// ... and that we go through the ifunc+resolver when there is a better option
+// that might be chosen at runtime.
+__attribute__((target_version("simd"))) void resolved_callee1(void) {}
+__attribute__((target_version("fcma"))) void resolved_callee1(void) {}
+__attribute__((target_version("default"))) void resolved_callee1(void) {}
+__attribute__((target_version("simd"))) void resolved_caller1(void) { resolved_callee1(); }
+__attribute__((target_version("default"))) void resolved_caller1(void) { resolved_callee1(); }
+
+// FIXME: we could direct call in cases like this:
+__attribute__((target_version("fp"))) void resolved_callee2(void) {}
+__attribute__((target_version("default"))) void resolved_callee2(void) {}
+__attribute__((target_version("simd+fp"))) void resolved_caller2(void) { resolved_callee2(); }
+__attribute__((target_version("default"))) void resolved_caller2(void) { resolved_callee2(); }
+
+void source() {
+    direct_caller();
+    resolved_caller1();
+    resolved_caller2();
+}
+
+//.
+// CHECK: @__aarch64_cpu_features = external dso_local global { i64 }
+// CHECK: @direct_callee.ifunc = weak_odr ifunc void (), ptr @direct_callee.resolver
+// CHECK: @direct_caller.ifunc = weak_odr ifunc void (), ptr @direct_caller.resolver
+// CHECK: @resolved_callee1.ifunc = weak_odr ifunc void (), ptr @resolved_callee1.resolver
+// CHECK: @resolved_caller1.ifunc = weak_odr ifunc void (), ptr @resolved_caller1.resolver
+// CHECK: @resolved_callee2.ifunc = weak_odr ifunc void (), ptr @resolved_callee2.resolver
+// CHECK: @resolved_caller2.ifunc = weak_odr ifunc void (), ptr @resolved_caller2.resolver
+//.
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@direct_callee._Msimd
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@direct_callee.resolver() comdat {
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 512
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 512
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK:       resolver_return:
+// CHECK-NEXT:    ret ptr @direct_callee._Msimd
+// CHECK:       resolver_else:
+// CHECK-NEXT:    ret ptr @direct_callee.default
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@direct_caller._Msimd
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @direct_callee._Msimd()
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@direct_caller.resolver() comdat {
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 512
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 512
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK:       resolver_return:
+// CHECK-NEXT:    ret ptr @direct_caller._Msimd
+// CHECK:       resolver_else:
+// CHECK-NEXT:    ret ptr @direct_caller.default
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee1._Msimd
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee1.resolver() comdat {
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 2097152
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 2097152
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK:       resolver_return:
+// CHECK-NEXT:    ret ptr @resolved_callee1._Mfcma
+// CHECK:       resolver_else:
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 512
+// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 512
+// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
+// CHECK:       resolver_return1:
+// CHECK-NEXT:    ret ptr @resolved_callee1._Msimd
+// CHECK:       resolver_else2:
+// CHECK-NEXT:    ret ptr @resolved_callee1.default
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_caller1._Msimd
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @resolved_callee1.ifunc()
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@resolved_caller1.resolver() comdat {
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 512
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 512
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK:       resolver_return:
+// CHECK-NEXT:    ret ptr @resolved_caller1._Msimd
+// CHECK:       resolver_else:
+// CHECK-NEXT:    ret ptr @resolved_caller1.default
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee2._Mfp
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee2.resolver() comdat {
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 256
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 256
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK:       resolver_return:
+// CHECK-NEXT:    ret ptr @resolved_callee2._Mfp
+// CHECK:       resolver_else:
+// CHECK-NEXT:    ret ptr @resolved_callee2.default
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_caller2._MfpMsimd
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @resolved_callee2.ifunc()
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@resolved_caller2.resolver() comdat {
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 768
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 768
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK:       resolver_return:
+// CHECK-NEXT:    ret ptr @resolved_caller2._MfpMsimd
+// CHECK:       resolver_else:
+// CHECK-NEXT:    ret ptr @resolved_caller2.default
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@source
+// CHECK-SAME: () #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @direct_caller.ifunc()
+// CHECK-NEXT:    call void @resolved_caller1.ifunc()
+// CHECK-NEXT:    call void @resolved_caller2.ifunc()
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@direct_callee.default
+// CHECK-SAME: () #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@direct_caller.default
+// CHECK-SAME: () #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @direct_callee.ifunc()
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee1._Mfcma
+// CHECK-SAME: () #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee1.default
+// CHECK-SAME: () #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_caller1.default
+// CHECK-SAME: () #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @resolved_callee1.ifunc()
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee2.default
+// CHECK-SAME: () #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_caller2.default
+// CHECK-SAME: () #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @resolved_callee2.ifunc()
+// CHECK-NEXT:    ret void
+//
+//.
+// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" }
+// CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+complxnum,+fp-armv8,+neon" }
+//.
+// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.



More information about the cfe-commits mailing list