[clang] [clang][FMV] Direct-call multi-versioned callees from multi-versioned (PR #80093)
Jon Roelofs via cfe-commits
cfe-commits at lists.llvm.org
Tue Jan 30 19:20:39 PST 2024
https://github.com/jroelofs created https://github.com/llvm/llvm-project/pull/80093
… callers when there is a callee with a matching feature set, and no other higher priority callee. This optimization helps the inliner see past the ifunc+resolver to the callee that we know it will always land on.
This is a conservative implementation of: https://github.com/llvm/llvm-project/issues/71714
From ed52ee4424459ebc046a625341ad8dbbd38bcbe3 Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs at apple.com>
Date: Tue, 30 Jan 2024 19:13:42 -0800
Subject: [PATCH] [clang][FMV] Direct-call multi-versioned callees from
multi-versioned callers
... when there is a callee with a matching feature set, and no other higher
priority callee. This optimization helps the inliner see past the
ifunc+resolver to the callee that we know it will always land on.
This is a conservative implementation of: https://github.com/llvm/llvm-project/issues/71714
---
clang/lib/CodeGen/CGCall.cpp | 72 +++++
clang/lib/CodeGen/CodeGenModule.cpp | 2 +-
.../test/CodeGen/attr-target-mv-direct-call.c | 245 ++++++++++++++++++
3 files changed, 318 insertions(+), 1 deletion(-)
create mode 100644 clang/test/CodeGen/attr-target-mv-direct-call.c
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 28c211aa631e4..84a04e3ccddd8 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -4966,6 +4966,11 @@ static unsigned getMaxVectorWidth(const llvm::Type *Ty) {
return MaxVectorWidth;
}
+// FIXME: put this somewhere nicer to share
+unsigned
+TargetMVPriority(const TargetInfo &TI,
+ const CodeGenFunction::MultiVersionResolverOption &RO);
+
RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
const CGCallee &Callee,
ReturnValueSlot ReturnValue,
@@ -5437,6 +5442,73 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
const CGCallee &ConcreteCallee = Callee.prepareConcreteCallee(*this);
llvm::Value *CalleePtr = ConcreteCallee.getFunctionPointer();
+ // If a multi-versioned caller calls a multi-versioned callee, skip the
+ // resolver when there is a precise match on the feature sets, and no
+ // possibility of a better match at runtime.
+ if (const auto *CallerFD = dyn_cast_or_null<FunctionDecl>(CurGD.getDecl()))
+ if (const auto *CallerTVA = CallerFD->getAttr<TargetVersionAttr>())
+ if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(TargetDecl))
+ // FIXME: do the same where either the caller or callee is
+ // target_clones.
+ if (FD->isTargetMultiVersion()) {
+ llvm::SmallVector<StringRef, 8> CallerFeats;
+ CallerTVA->getFeatures(CallerFeats);
+ MultiVersionResolverOption CallerMVRO(nullptr, "", CallerFeats);
+
+ bool HasHigherPriorityCallee = false;
+ llvm::Constant *FoundMatchingCallee = nullptr;
+ getContext().forEachMultiversionedFunctionVersion(
+ FD, [this, FD, &CallerMVRO, &HasHigherPriorityCallee,
+ &FoundMatchingCallee](const FunctionDecl *CurFD) {
+ const auto *CalleeTVA = CurFD->getAttr<TargetVersionAttr>();
+
+ GlobalDecl CurGD{
+ (CurFD->isDefined() ? CurFD->getDefinition() : CurFD)};
+ StringRef MangledName = CGM.getMangledName(CurFD);
+
+ llvm::SmallVector<StringRef, 8> CalleeFeats;
+ CalleeTVA->getFeatures(CalleeFeats);
+ MultiVersionResolverOption CalleeMVRO(nullptr, "", CalleeFeats);
+
+ const TargetInfo &TI = getTarget();
+
+ // If there is a higher priority callee, we can't do the
+ // optimization at all, as it would be a valid choice at
+ // runtime.
+ if (TargetMVPriority(TI, CalleeMVRO) >
+ TargetMVPriority(TI, CallerMVRO)) {
+ HasHigherPriorityCallee = true;
+ return;
+ }
+
+ // FIXME: we could allow a lower-priority match when the
+ // features are a proper subset. But for now, to keep things
+ // simpler, we only care about a precise match.
+ if (TargetMVPriority(TI, CalleeMVRO) <
+ TargetMVPriority(TI, CallerMVRO))
+ return;
+
+ if (llvm::Constant *Func = CGM.GetGlobalValue(MangledName)) {
+ FoundMatchingCallee = Func;
+ return;
+ }
+
+ if (CurFD->isDefined()) {
+ // FIXME: not sure how to get the address
+ } else {
+ const CGFunctionInfo &FI =
+ getTypes().arrangeGlobalDeclaration(FD);
+ llvm::FunctionType *Ty = getTypes().GetFunctionType(FI);
+ FoundMatchingCallee =
+ CGM.GetAddrOfFunction(CurGD, Ty, /*ForVTable=*/false,
+ /*DontDefer=*/false, ForDefinition);
+ }
+ });
+
+ if (FoundMatchingCallee && !HasHigherPriorityCallee)
+ CalleePtr = FoundMatchingCallee;
+ }
+
// If we're using inalloca, set up that argument.
if (ArgMemory.isValid()) {
llvm::Value *Arg = ArgMemory.getPointer();
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 6ec54cc01c923..c334e4a3a40f3 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -4092,7 +4092,7 @@ void CodeGenModule::EmitGlobalDefinition(GlobalDecl GD, llvm::GlobalValue *GV) {
static void ReplaceUsesOfNonProtoTypeWithRealFunction(llvm::GlobalValue *Old,
llvm::Function *NewFn);
-static unsigned
+unsigned
TargetMVPriority(const TargetInfo &TI,
const CodeGenFunction::MultiVersionResolverOption &RO) {
unsigned Priority = 0;
diff --git a/clang/test/CodeGen/attr-target-mv-direct-call.c b/clang/test/CodeGen/attr-target-mv-direct-call.c
new file mode 100644
index 0000000000000..687fdd1ca3c24
--- /dev/null
+++ b/clang/test/CodeGen/attr-target-mv-direct-call.c
@@ -0,0 +1,245 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -emit-llvm -o - %s | FileCheck %s
+
+// Check that we make a direct call from direct_caller._Msimd to
+// direct_callee._Msimd when there is no better option.
+__attribute__((target_version("simd"))) void direct_callee(void) {}
+__attribute__((target_version("default"))) void direct_callee(void) {}
+__attribute__((target_version("simd"))) void direct_caller(void) { direct_callee(); }
+__attribute__((target_version("default"))) void direct_caller(void) { direct_callee(); }
+
+// ... and that we go through the ifunc+resolver when there is a better option
+// that might be chosen at runtime.
+__attribute__((target_version("simd"))) void resolved_callee1(void) {}
+__attribute__((target_version("fcma"))) void resolved_callee1(void) {}
+__attribute__((target_version("default"))) void resolved_callee1(void) {}
+__attribute__((target_version("simd"))) void resolved_caller1(void) { resolved_callee1(); }
+__attribute__((target_version("default"))) void resolved_caller1(void) { resolved_callee1(); }
+
+// FIXME: we could direct call in cases like this:
+__attribute__((target_version("fp"))) void resolved_callee2(void) {}
+__attribute__((target_version("default"))) void resolved_callee2(void) {}
+__attribute__((target_version("simd+fp"))) void resolved_caller2(void) { resolved_callee2(); }
+__attribute__((target_version("default"))) void resolved_caller2(void) { resolved_callee2(); }
+
+void source() {
+ direct_caller();
+ resolved_caller1();
+ resolved_caller2();
+}
+
+//.
+// CHECK: @__aarch64_cpu_features = external dso_local global { i64 }
+// CHECK: @direct_callee.ifunc = weak_odr ifunc void (), ptr @direct_callee.resolver
+// CHECK: @direct_caller.ifunc = weak_odr ifunc void (), ptr @direct_caller.resolver
+// CHECK: @resolved_callee1.ifunc = weak_odr ifunc void (), ptr @resolved_callee1.resolver
+// CHECK: @resolved_caller1.ifunc = weak_odr ifunc void (), ptr @resolved_caller1.resolver
+// CHECK: @resolved_callee2.ifunc = weak_odr ifunc void (), ptr @resolved_callee2.resolver
+// CHECK: @resolved_caller2.ifunc = weak_odr ifunc void (), ptr @resolved_caller2.resolver
+//.
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@direct_callee._Msimd
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@direct_callee.resolver() comdat {
+// CHECK-NEXT: resolver_entry:
+// CHECK-NEXT: call void @__init_cpu_features_resolver()
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 512
+// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 512
+// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK: resolver_return:
+// CHECK-NEXT: ret ptr @direct_callee._Msimd
+// CHECK: resolver_else:
+// CHECK-NEXT: ret ptr @direct_callee.default
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@direct_caller._Msimd
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @direct_callee._Msimd()
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@direct_caller.resolver() comdat {
+// CHECK-NEXT: resolver_entry:
+// CHECK-NEXT: call void @__init_cpu_features_resolver()
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 512
+// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 512
+// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK: resolver_return:
+// CHECK-NEXT: ret ptr @direct_caller._Msimd
+// CHECK: resolver_else:
+// CHECK-NEXT: ret ptr @direct_caller.default
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee1._Msimd
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee1.resolver() comdat {
+// CHECK-NEXT: resolver_entry:
+// CHECK-NEXT: call void @__init_cpu_features_resolver()
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 2097152
+// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 2097152
+// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK: resolver_return:
+// CHECK-NEXT: ret ptr @resolved_callee1._Mfcma
+// CHECK: resolver_else:
+// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 512
+// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 512
+// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
+// CHECK: resolver_return1:
+// CHECK-NEXT: ret ptr @resolved_callee1._Msimd
+// CHECK: resolver_else2:
+// CHECK-NEXT: ret ptr @resolved_callee1.default
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_caller1._Msimd
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @resolved_callee1.ifunc()
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@resolved_caller1.resolver() comdat {
+// CHECK-NEXT: resolver_entry:
+// CHECK-NEXT: call void @__init_cpu_features_resolver()
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 512
+// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 512
+// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK: resolver_return:
+// CHECK-NEXT: ret ptr @resolved_caller1._Msimd
+// CHECK: resolver_else:
+// CHECK-NEXT: ret ptr @resolved_caller1.default
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee2._Mfp
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee2.resolver() comdat {
+// CHECK-NEXT: resolver_entry:
+// CHECK-NEXT: call void @__init_cpu_features_resolver()
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 256
+// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 256
+// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK: resolver_return:
+// CHECK-NEXT: ret ptr @resolved_callee2._Mfp
+// CHECK: resolver_else:
+// CHECK-NEXT: ret ptr @resolved_callee2.default
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_caller2._MfpMsimd
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @resolved_callee2.ifunc()
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@resolved_caller2.resolver() comdat {
+// CHECK-NEXT: resolver_entry:
+// CHECK-NEXT: call void @__init_cpu_features_resolver()
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 768
+// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 768
+// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK: resolver_return:
+// CHECK-NEXT: ret ptr @resolved_caller2._MfpMsimd
+// CHECK: resolver_else:
+// CHECK-NEXT: ret ptr @resolved_caller2.default
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@source
+// CHECK-SAME: () #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @direct_caller.ifunc()
+// CHECK-NEXT: call void @resolved_caller1.ifunc()
+// CHECK-NEXT: call void @resolved_caller2.ifunc()
+// CHECK-NEXT: ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@direct_callee.default
+// CHECK-SAME: () #[[ATTR1]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@direct_caller.default
+// CHECK-SAME: () #[[ATTR1]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @direct_callee.ifunc()
+// CHECK-NEXT: ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee1._Mfcma
+// CHECK-SAME: () #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee1.default
+// CHECK-SAME: () #[[ATTR1]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_caller1.default
+// CHECK-SAME: () #[[ATTR1]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @resolved_callee1.ifunc()
+// CHECK-NEXT: ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_callee2.default
+// CHECK-SAME: () #[[ATTR1]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret void
+//
+//
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@resolved_caller2.default
+// CHECK-SAME: () #[[ATTR1]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @resolved_callee2.ifunc()
+// CHECK-NEXT: ret void
+//
+//.
+// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" }
+// CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+complxnum,+fp-armv8,+neon" }
+//.
+// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
More information about the cfe-commits
mailing list