[llvm] 7a41af8 - [X86] Support arch=x86-64{,-v2,-v3,-v4} for target_clones attribute
Fangrui Song via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 23 22:09:01 PDT 2023
Author: Fangrui Song
Date: 2023-08-23T22:08:55-07:00
New Revision: 7a41af86041bd757b7f380d7f645403d4e1725ca
URL: https://github.com/llvm/llvm-project/commit/7a41af86041bd757b7f380d7f645403d4e1725ca
DIFF: https://github.com/llvm/llvm-project/commit/7a41af86041bd757b7f380d7f645403d4e1725ca.diff
LOG: [X86] Support arch=x86-64{,-v2,-v3,-v4} for target_clones attribute
GCC 12 (https://gcc.gnu.org/PR101696) allows `arch=x86-64`, `arch=x86-64-v2`,
`arch=x86-64-v3`, and `arch=x86-64-v4` in the target_clones function
attribute. This patch ports the feature to Clang.
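For illustration, a function versioned across the new micro-architecture
levels (mirroring the test added to clang/test/CodeGen/attr-target-clones.c
below) can be declared as:

  // The emitted ifunc resolver calls __cpu_indicator_init() and dispatches to
  // the clone for the highest level whose FEATURE_X86_64_* bit is set.
  __attribute__((target_clones("default", "arch=x86-64", "arch=x86-64-v2",
                               "arch=x86-64-v3", "arch=x86-64-v4")))
  int isa_level(int x) { return 0; }
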
* Set KeyFeature for the `x86-64{,-v2,-v3,-v4}` entries in `Processors[]`, to
be used by X86TargetInfo::multiVersionSortPriority.
* builtins: change `__cpu_features2` to an array, matching libgcc. Define
`FEATURE_X86_64_{BASELINE,V2,V3,V4}` and the ISA feature bits they depend on.
* CGBuiltin.cpp: update EmitX86CpuSupports to handle `arch=x86-64*`.
Close https://github.com/llvm/llvm-project/issues/55830
Reviewed By: pengfei
Differential Revision: https://reviews.llvm.org/D158329
Added:
Modified:
clang/lib/CodeGen/CGBuiltin.cpp
clang/lib/CodeGen/CodeGenFunction.cpp
clang/lib/CodeGen/CodeGenFunction.h
clang/test/CodeGen/attr-target-clones.c
clang/test/CodeGen/builtin-cpu-supports.c
clang/test/Sema/attr-target-clones.c
compiler-rt/lib/builtins/cpu_model.c
llvm/lib/TargetParser/X86TargetParser.cpp
Removed:
################################################################################
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a99cf3d82aaed5..a395c4979081e3 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -54,6 +54,7 @@
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/MatrixBuilder.h"
#include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ScopedPrinter.h"
#include "llvm/TargetParser/AArch64TargetParser.h"
#include "llvm/TargetParser/X86TargetParser.h"
@@ -13324,16 +13325,15 @@ Value *CodeGenFunction::EmitX86CpuSupports(const CallExpr *E) {
}
Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) {
- return EmitX86CpuSupports(llvm::X86::getCpuSupportsMask(FeatureStrs));
+ uint64_t Mask = llvm::X86::getCpuSupportsMask(FeatureStrs);
+ std::array<uint32_t, 4> FeatureMask{Lo_32(Mask), Hi_32(Mask), 0, 0};
+ return EmitX86CpuSupports(FeatureMask);
}
-llvm::Value *CodeGenFunction::EmitX86CpuSupports(uint64_t FeaturesMask) {
- uint32_t Features1 = Lo_32(FeaturesMask);
- uint32_t Features2 = Hi_32(FeaturesMask);
-
+llvm::Value *
+CodeGenFunction::EmitX86CpuSupports(std::array<uint32_t, 4> FeatureMask) {
Value *Result = Builder.getTrue();
-
- if (Features1 != 0) {
+ if (FeatureMask[0] != 0) {
// Matching the struct layout from the compiler-rt/libgcc structure that is
// filled in:
// unsigned int __cpu_vendor;
@@ -13356,22 +13356,26 @@ llvm::Value *CodeGenFunction::EmitX86CpuSupports(uint64_t FeaturesMask) {
CharUnits::fromQuantity(4));
// Check the value of the bit corresponding to the feature requested.
- Value *Mask = Builder.getInt32(Features1);
+ Value *Mask = Builder.getInt32(FeatureMask[0]);
Value *Bitset = Builder.CreateAnd(Features, Mask);
Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
Result = Builder.CreateAnd(Result, Cmp);
}
- if (Features2 != 0) {
- llvm::Constant *CpuFeatures2 = CGM.CreateRuntimeVariable(Int32Ty,
- "__cpu_features2");
- cast<llvm::GlobalValue>(CpuFeatures2)->setDSOLocal(true);
-
- Value *Features = Builder.CreateAlignedLoad(Int32Ty, CpuFeatures2,
- CharUnits::fromQuantity(4));
-
+ llvm::Type *ATy = llvm::ArrayType::get(Int32Ty, 3);
+ llvm::Constant *CpuFeatures2 =
+ CGM.CreateRuntimeVariable(ATy, "__cpu_features2");
+ cast<llvm::GlobalValue>(CpuFeatures2)->setDSOLocal(true);
+ for (int i = 1; i != 4; ++i) {
+ const uint32_t M = FeatureMask[i];
+ if (!M)
+ continue;
+ Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(i - 1)};
+ Value *Features = Builder.CreateAlignedLoad(
+ Int32Ty, Builder.CreateGEP(ATy, CpuFeatures2, Idxs),
+ CharUnits::fromQuantity(4));
// Check the value of the bit corresponding to the feature requested.
- Value *Mask = Builder.getInt32(Features2);
+ Value *Mask = Builder.getInt32(M);
Value *Bitset = Builder.CreateAnd(Features, Mask);
Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
Result = Builder.CreateAnd(Result, Cmp);
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 89138ddbdc0bc0..7b456edf5a3623 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -2681,8 +2681,27 @@ llvm::Value *CodeGenFunction::FormX86ResolverCondition(
const MultiVersionResolverOption &RO) {
llvm::Value *Condition = nullptr;
- if (!RO.Conditions.Architecture.empty())
- Condition = EmitX86CpuIs(RO.Conditions.Architecture);
+ if (!RO.Conditions.Architecture.empty()) {
+ StringRef Arch = RO.Conditions.Architecture;
+ std::array<uint32_t, 4> Mask{};
+ // If arch= specifies an x86-64 micro-architecture level, test a special
+ // feature named FEATURE_X86_64_*, otherwise we use __builtin_cpu_is.
+ if (Arch.consume_front("x86-64")) {
+ if (Arch.empty()) // FEATURE_X86_64_BASELINE 95=2*32+31
+ Mask[2] = 1u << 31;
+ else if (Arch == "-v2") // FEATURE_X86_64_V2 96==3*32+0
+ Mask[3] = 1u << 0;
+ else if (Arch == "-v3") // FEATURE_X86_64_V3 97==3*32+1
+ Mask[3] = 1u << 1;
+    else if (Arch == "-v4") // FEATURE_X86_64_V4 98==3*32+2
+ Mask[3] = 1u << 2;
+ else
+ llvm_unreachable("invalid x86-64 micro-architecture level");
+ Condition = EmitX86CpuSupports(Mask);
+ } else {
+ Condition = EmitX86CpuIs(Arch);
+ }
+ }
if (!RO.Conditions.Features.empty()) {
llvm::Value *FeatureCond = EmitX86CpuSupports(RO.Conditions.Features);
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index af9ab55cbabd92..0741e3c40fcce6 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4902,7 +4902,7 @@ class CodeGenFunction : public CodeGenTypeCache {
llvm::Value *EmitX86CpuIs(StringRef CPUStr);
llvm::Value *EmitX86CpuSupports(const CallExpr *E);
llvm::Value *EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs);
- llvm::Value *EmitX86CpuSupports(uint64_t Mask);
+ llvm::Value *EmitX86CpuSupports(std::array<uint32_t, 4> FeatureMask);
llvm::Value *EmitX86CpuInit();
llvm::Value *FormX86ResolverCondition(const MultiVersionResolverOption &RO);
llvm::Value *EmitAArch64CpuInit();
diff --git a/clang/test/CodeGen/attr-target-clones.c b/clang/test/CodeGen/attr-target-clones.c
index 3f0cf97fe75b0b..5da9548067831a 100644
--- a/clang/test/CodeGen/attr-target-clones.c
+++ b/clang/test/CodeGen/attr-target-clones.c
@@ -13,6 +13,9 @@
// WINDOWS: $foo_inline = comdat any
// WINDOWS: $foo_inline2 = comdat any
+// LINUX: @__cpu_model = external dso_local global { i32, i32, i32, [1 x i32] }
+// LINUX: @__cpu_features2 = external dso_local global [3 x i32]
+
// LINUX: @foo.ifunc = weak_odr ifunc i32 (), ptr @foo.resolver
// LINUX: @foo_dupes.ifunc = weak_odr ifunc void (), ptr @foo_dupes.resolver
// LINUX: @unused.ifunc = weak_odr ifunc void (), ptr @unused.resolver
@@ -137,6 +140,28 @@ int test_foo_used_no_defn(void) {
// WINDOWS: musttail call i32 @foo_used_no_defn.sse4.2.0
// WINDOWS: musttail call i32 @foo_used_no_defn.default.1
+__attribute__((target_clones("default", "arch=x86-64", "arch=x86-64-v2", "arch=x86-64-v3", "arch=x86-64-v4")))
+int isa_level(int) { return 0; }
+// LINUX: define{{.*}} i32 @isa_level.default.4(
+// LINUX: define{{.*}} i32 @isa_level.arch_x86-64.0(
+// LINUX: define{{.*}} i32 @isa_level.arch_x86-64-v2.1(
+// LINUX: define{{.*}} i32 @isa_level.arch_x86-64-v3.2(
+// LINUX: define{{.*}} i32 @isa_level.arch_x86-64-v4.3(
+// LINUX: define weak_odr ptr @isa_level.resolver() comdat
+// LINUX: call void @__cpu_indicator_init()
+// LINUX-NEXT: load i32, ptr getelementptr inbounds ([3 x i32], ptr @__cpu_features2, i32 0, i32 2)
+// LINUX-NEXT: and i32 %[[#]], 4
+// LINUX: ret ptr @isa_level.arch_x86-64-v4.3
+// LINUX: load i32, ptr getelementptr inbounds ([3 x i32], ptr @__cpu_features2, i32 0, i32 2)
+// LINUX-NEXT: and i32 %[[#]], 2
+// LINUX: ret ptr @isa_level.arch_x86-64-v3.2
+// LINUX: load i32, ptr getelementptr inbounds ([3 x i32], ptr @__cpu_features2, i32 0, i32 2)
+// LINUX-NEXT: and i32 %[[#]], 1
+// LINUX: ret ptr @isa_level.arch_x86-64-v2.1
+// LINUX: load i32, ptr getelementptr inbounds ([3 x i32], ptr @__cpu_features2, i32 0, i32 1)
+// LINUX-NEXT: and i32 %[[#]], -2147483648
+// LINUX: ret ptr @isa_level.arch_x86-64.0
+// LINUX: ret ptr @isa_level.default.4
// Deferred emission of inline definitions.
diff --git a/clang/test/CodeGen/builtin-cpu-supports.c b/clang/test/CodeGen/builtin-cpu-supports.c
index 5df16028c9a26a..59a82f89b03797 100644
--- a/clang/test/CodeGen/builtin-cpu-supports.c
+++ b/clang/test/CodeGen/builtin-cpu-supports.c
@@ -5,7 +5,7 @@
extern void a(const char *);
// CHECK: @__cpu_model = external dso_local global { i32, i32, i32, [1 x i32] }
-// CHECK: @__cpu_features2 = external dso_local global i32
+// CHECK: @__cpu_features2 = external dso_local global [3 x i32]
int main(void) {
__builtin_cpu_init();
diff --git a/clang/test/Sema/attr-target-clones.c b/clang/test/Sema/attr-target-clones.c
index e9ddecad5727f3..e287fce7699b77 100644
--- a/clang/test/Sema/attr-target-clones.c
+++ b/clang/test/Sema/attr-target-clones.c
@@ -118,3 +118,7 @@ void __attribute__((__overloadable__)) good_overload4(int) __attribute__((target
// expected-error@+1 {{attribute 'target_clones' multiversioning cannot be combined with attribute 'overloadable'}}
void __attribute__((__overloadable__)) good_overload5(void) __attribute__((target_clones("mmx", "sse4.2", "default")));
void good_overload5(int) __attribute__((target_clones("mmx", "sse4.2", "default")));
+
+void good_isa_level(int) __attribute__((target_clones("default", "arch=x86-64", "arch=x86-64-v2", "arch=x86-64-v3", "arch=x86-64-v4")));
+// expected-warning@+1 {{unsupported CPU 'x86-64-v5' in the 'target_clones' attribute string; 'target_clones' attribute ignored}}
+void bad_isa_level(int) __attribute__((target_clones("default", "arch=x86-64-v5")));
diff --git a/compiler-rt/lib/builtins/cpu_model.c b/compiler-rt/lib/builtins/cpu_model.c
index 0f804af8730b0e..6a0cc8f7a402da 100644
--- a/compiler-rt/lib/builtins/cpu_model.c
+++ b/compiler-rt/lib/builtins/cpu_model.c
@@ -158,6 +158,19 @@ enum ProcessorFeatures {
FEATURE_AVX512BITALG,
FEATURE_AVX512BF16,
FEATURE_AVX512VP2INTERSECT,
+
+ FEATURE_CMPXCHG16B = 46,
+ FEATURE_F16C = 49,
+ FEATURE_LAHF_LM = 54,
+ FEATURE_LM,
+ FEATURE_WP,
+ FEATURE_LZCNT,
+ FEATURE_MOVBE,
+
+ FEATURE_X86_64_BASELINE = 95,
+ FEATURE_X86_64_V2,
+ FEATURE_X86_64_V3,
+ FEATURE_X86_64_V4,
CPU_FEATURE_MAX
};
@@ -677,6 +690,7 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
unsigned *Features) {
unsigned EAX = 0, EBX = 0;
+#define hasFeature(F) ((Features[F / 32] >> (F % 32)) & 1)
#define setFeature(F) \
Features[F / 32] |= 1U << (F % 32)
@@ -697,14 +711,20 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
setFeature(FEATURE_SSSE3);
if ((ECX >> 12) & 1)
setFeature(FEATURE_FMA);
+ if ((ECX >> 13) & 1)
+ setFeature(FEATURE_CMPXCHG16B);
if ((ECX >> 19) & 1)
setFeature(FEATURE_SSE4_1);
if ((ECX >> 20) & 1)
setFeature(FEATURE_SSE4_2);
+ if ((ECX >> 22) & 1)
+ setFeature(FEATURE_MOVBE);
if ((ECX >> 23) & 1)
setFeature(FEATURE_POPCNT);
if ((ECX >> 25) & 1)
setFeature(FEATURE_AES);
+ if ((ECX >> 29) & 1)
+ setFeature(FEATURE_F16C);
// If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV
// indicates that the AVX registers will be saved and restored on context
@@ -786,12 +806,39 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
bool HasExtLeaf1 = MaxExtLevel >= 0x80000001 &&
!getX86CpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
- if (HasExtLeaf1 && ((ECX >> 6) & 1))
- setFeature(FEATURE_SSE4_A);
- if (HasExtLeaf1 && ((ECX >> 11) & 1))
- setFeature(FEATURE_XOP);
- if (HasExtLeaf1 && ((ECX >> 16) & 1))
- setFeature(FEATURE_FMA4);
+ if (HasExtLeaf1) {
+ if (ECX & 1)
+ setFeature(FEATURE_LAHF_LM);
+ if ((ECX >> 5) & 1)
+ setFeature(FEATURE_LZCNT);
+ if (((ECX >> 6) & 1))
+ setFeature(FEATURE_SSE4_A);
+ if (((ECX >> 11) & 1))
+ setFeature(FEATURE_XOP);
+ if (((ECX >> 16) & 1))
+ setFeature(FEATURE_FMA4);
+ if (((EDX >> 29) & 1))
+ setFeature(FEATURE_LM);
+ }
+
+ if (hasFeature(FEATURE_LM) && hasFeature(FEATURE_SSE2)) {
+ setFeature(FEATURE_X86_64_BASELINE);
+ if (hasFeature(FEATURE_CMPXCHG16B) && hasFeature(FEATURE_POPCNT) &&
+ hasFeature(FEATURE_LAHF_LM) && hasFeature(FEATURE_SSE4_2)) {
+ setFeature(FEATURE_X86_64_V2);
+ if (hasFeature(FEATURE_AVX2) && hasFeature(FEATURE_BMI) &&
+ hasFeature(FEATURE_BMI2) && hasFeature(FEATURE_F16C) &&
+ hasFeature(FEATURE_FMA) && hasFeature(FEATURE_LZCNT) &&
+ hasFeature(FEATURE_MOVBE)) {
+ setFeature(FEATURE_X86_64_V3);
+ if (hasFeature(FEATURE_AVX512BW) && hasFeature(FEATURE_AVX512CD) &&
+ hasFeature(FEATURE_AVX512DQ) && hasFeature(FEATURE_AVX512VL))
+ setFeature(FEATURE_X86_64_V4);
+ }
+ }
+ }
+
+#undef hasFeature
#undef setFeature
}
@@ -813,7 +860,7 @@ struct __processor_model {
#ifndef _WIN32
__attribute__((visibility("hidden")))
#endif
-unsigned int __cpu_features2 = 0;
+unsigned __cpu_features2[(CPU_FEATURE_MAX - 1) / 32];
// A constructor function that sets __cpu_model and __cpu_features2 with
// the right values. This needs to run only once. This constructor is
@@ -827,6 +874,8 @@ int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) {
unsigned Vendor;
unsigned Model, Family;
unsigned Features[(CPU_FEATURE_MAX + 31) / 32] = {0};
+ static_assert(sizeof(Features) / sizeof(Features[0]) == 4, "");
+ static_assert(sizeof(__cpu_features2) / sizeof(__cpu_features2[0]) == 3, "");
// This function needs to run just once.
if (__cpu_model.__cpu_vendor)
@@ -844,9 +893,10 @@ int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) {
// Find available features.
getAvailableFeatures(ECX, EDX, MaxLeaf, &Features[0]);
- assert((sizeof(Features)/sizeof(Features[0])) == 2);
__cpu_model.__cpu_features[0] = Features[0];
- __cpu_features2 = Features[1];
+ __cpu_features2[0] = Features[1];
+ __cpu_features2[1] = Features[2];
+ __cpu_features2[2] = Features[3];
if (Vendor == SIG_INTEL) {
// Get CPU type.
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index 8d2401e69f9df5..3a36289044fad0 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -237,6 +237,7 @@ static constexpr FeatureBitset FeaturesZNVER4 =
// listed here before, which means it doesn't support -march, -mtune and so on.
// FIXME: Remove OnlyForCPUDispatchSpecific after all CPUs here support both
// cpu_dispatch/specific() feature and -march, -mtune, and so on.
+// clang-format off
constexpr ProcInfo Processors[] = {
// Empty processor. Include X87 and CMPXCHG8 for backwards compatibility.
{ {""}, CK_None, ~0U, FeatureX87 | FeatureCMPXCHG8B, '\0', false },
@@ -404,13 +405,14 @@ constexpr ProcInfo Processors[] = {
{ {"znver3"}, CK_ZNVER3, FEATURE_AVX2, FeaturesZNVER3, '\0', false },
{ {"znver4"}, CK_ZNVER4, FEATURE_AVX512VBMI2, FeaturesZNVER4, '\0', false },
// Generic 64-bit processor.
- { {"x86-64"}, CK_x86_64, ~0U, FeaturesX86_64, '\0', false },
- { {"x86-64-v2"}, CK_x86_64_v2, ~0U, FeaturesX86_64_V2, '\0', false },
- { {"x86-64-v3"}, CK_x86_64_v3, ~0U, FeaturesX86_64_V3, '\0', false },
- { {"x86-64-v4"}, CK_x86_64_v4, ~0U, FeaturesX86_64_V4, '\0', false },
+ { {"x86-64"}, CK_x86_64, FEATURE_SSE2 , FeaturesX86_64, '\0', false },
+ { {"x86-64-v2"}, CK_x86_64_v2, FEATURE_SSE4_2 , FeaturesX86_64_V2, '\0', false },
+ { {"x86-64-v3"}, CK_x86_64_v3, FEATURE_AVX2, FeaturesX86_64_V3, '\0', false },
+ { {"x86-64-v4"}, CK_x86_64_v4, FEATURE_AVX512VL, FeaturesX86_64_V4, '\0', false },
// Geode processors.
{ {"geode"}, CK_Geode, ~0U, FeaturesGeode, '\0', false },
};
+// clang-format on
constexpr const char *NoTuneList[] = {"x86-64-v2", "x86-64-v3", "x86-64-v4"};
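
As a rough sketch (not part of the patch; the helper name below is made up
for illustration, and it assumes the program links against the updated
compiler-rt/libgcc runtime), here is how a FEATURE_X86_64_* index maps onto
the array that __cpu_indicator_init() now fills in:

  // Feature word 0 lives in __cpu_model.__cpu_features[0]; words 1..3 live in
  // the new 3-element __cpu_features2 array.
  extern unsigned __cpu_features2[3];

  static int has_high_feature(unsigned Feature) {
    // Only meaningful for Feature >= 32, i.e. bits stored in __cpu_features2.
    return (__cpu_features2[Feature / 32 - 1] >> (Feature % 32)) & 1;
  }

  // has_high_feature(95) (FEATURE_X86_64_BASELINE) reads bit 31 of
  // __cpu_features2[1], matching the `and i32 %..., -2147483648` check in the
  // generated isa_level.resolver above; bits 96..98 (V2..V4) read bits 0..2
  // of __cpu_features2[2].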