[clang] [llvm] [LLVM] Introduce late-resolved LLVM intrinsics and Clang builtins (PR #185147)
Joseph Huber via cfe-commits
cfe-commits at lists.llvm.org
Fri Mar 6 18:25:37 PST 2026
https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/185147
>From 58461c7344ba2555923fe2cafdb0482985cbbc27 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 5 Mar 2026 20:36:38 -0600
Subject: [PATCH] [LLVM] Introduce late-resolved LLVM intrinsics and Clang
builtins
Summary:
This PR introduces two new LLVM intrinsics to handle resolving features
in the backend similar to https://github.com/llvm/llvm-project/pull/134016.
These two intrinsics are lowered in a new pass
called `LowerTargetIntrinsics`.
```llvm
llvm.target.has.feature(metadata !"feature-name") -> i1
llvm.target.is.cpu(metadata !"cpu-name") -> i1
```
These are intended to behave similarly to `llvm.is.constant` where they
are guaranteed to be lowered to a constant value before the backend
runs. This is handled by an intrinsic handling hook in
`PreISelIntrinsicLowering`.
These intrinsics are intended to guard potentially incompatible code by
only emitting the branches they dominate if the target features match,
essentially pushing feature dispatch into the backend. To facilitate
this, these must be lowered even at 'O0' and all branches must be
removed.
This means the pass unconditionally does the following:
1. Promotes allocas to SSA
2. Resolves each intrinsic to i1 using the TargetMachine
3. Folds constant terminators and removes dead blocks
On the clang side, we expose this with two builtins similar to the
linked PR.
```c
__builtin_is_invocable(builtin_fn)
__builtin_target_is_cpu("cpu-name")
```
These are intended to be used to create portable GPU feature dispatch;
this requires suppressing the normal incompatible target feature
testing.
```c
void kernel(unsigned *out, unsigned a, unsigned b,
unsigned c, unsigned d) {
if (__builtin_is_invocable(__builtin_amdgcn_permlane16))
*out = __builtin_amdgcn_permlane16(a, b, c, d, 0, 0);
else
fallback(out, a, b, c, d);
}
```
This isn't a complete implementation, I believe we would need to be more
explicit about what is allowed. The LLVM intrinsic is fundamentally
limited to a single intra-procedural pass, meaning any uses that escape
the current function will not be folded reliably.
This does not provide SPIR-V stubs like the linked PR needs.
Targets that pass a lot of default features even without a CPU will
still override if identified later (like x64).
---
clang/include/clang/Basic/Builtins.td | 12 +
.../clang/Basic/DiagnosticSemaKinds.td | 1 +
clang/lib/CodeGen/CGBuiltin.cpp | 33 ++
clang/lib/CodeGen/CodeGenFunction.cpp | 14 +-
clang/lib/CodeGen/CodeGenFunction.h | 5 +
clang/lib/Sema/SemaChecking.cpp | 14 +
clang/lib/Sema/SemaExpr.cpp | 28 ++
.../CodeGen/amdgpu-builtin-is-invocable.c | 68 ++++
clang/test/CodeGen/builtin-target-feature.c | 14 +
clang/test/Sema/builtin-target-feature.c | 31 ++
.../llvm/CodeGen/LowerTargetIntrinsics.h | 45 +++
llvm/include/llvm/IR/Intrinsics.td | 14 +
llvm/include/llvm/MC/MCSubtargetInfo.h | 4 +
llvm/lib/Analysis/InlineCost.cpp | 3 +
llvm/lib/Analysis/InstructionSimplify.cpp | 5 +
llvm/lib/CodeGen/CMakeLists.txt | 1 +
llvm/lib/CodeGen/LowerTargetIntrinsics.cpp | 161 +++++++++
llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp | 13 +
llvm/lib/MC/MCSubtargetInfo.cpp | 7 +
llvm/lib/Passes/PassRegistry.def | 1 +
.../LowerTargetIntrinsics/amdgpu.ll | 39 ++
.../Transforms/LowerTargetIntrinsics/basic.ll | 52 +++
.../LowerTargetIntrinsics/complex-cfg.ll | 334 ++++++++++++++++++
.../LowerTargetIntrinsics/isel-lowering.ll | 28 ++
.../LowerTargetIntrinsics/mem2reg.ll | 92 +++++
.../per-function-attrs.ll | 37 ++
.../LowerTargetIntrinsics/pipeline.ll | 58 +++
27 files changed, 1111 insertions(+), 3 deletions(-)
create mode 100644 clang/test/CodeGen/amdgpu-builtin-is-invocable.c
create mode 100644 clang/test/CodeGen/builtin-target-feature.c
create mode 100644 clang/test/Sema/builtin-target-feature.c
create mode 100644 llvm/include/llvm/CodeGen/LowerTargetIntrinsics.h
create mode 100644 llvm/lib/CodeGen/LowerTargetIntrinsics.cpp
create mode 100644 llvm/test/Transforms/LowerTargetIntrinsics/amdgpu.ll
create mode 100644 llvm/test/Transforms/LowerTargetIntrinsics/basic.ll
create mode 100644 llvm/test/Transforms/LowerTargetIntrinsics/complex-cfg.ll
create mode 100644 llvm/test/Transforms/LowerTargetIntrinsics/isel-lowering.ll
create mode 100644 llvm/test/Transforms/LowerTargetIntrinsics/mem2reg.ll
create mode 100644 llvm/test/Transforms/LowerTargetIntrinsics/per-function-attrs.ll
create mode 100644 llvm/test/Transforms/LowerTargetIntrinsics/pipeline.ll
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 7063d7c06c4ca..37467385a0e26 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -821,6 +821,18 @@ def BuiltinCPUInit : Builtin {
let Prototype = "void()";
}
+def BuiltinTargetIsCPU : Builtin {
+ let Spellings = ["__builtin_target_is_cpu"];
+ let Attributes = [NoThrow, Const];
+ let Prototype = "bool(char const*)";
+}
+
+def BuiltinIsInvocable : Builtin {
+ let Spellings = ["__builtin_is_invocable"];
+ let Attributes = [NoThrow, Const, CustomTypeChecking, UnevaluatedArguments];
+ let Prototype = "bool(...)";
+}
+
def BuiltinCalloc : Builtin {
let Spellings = ["__builtin_calloc"];
let Attributes = [FunctionWithBuiltinPrefix, NoThrow];
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 58e15a89c2373..59531da637d15 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -163,6 +163,7 @@ def err_ice_too_large : Error<
"integer constant expression evaluates to value %0 that cannot be "
"represented in a %1-bit %select{signed|unsigned}2 integer type">;
def err_expr_not_string_literal : Error<"expression is not a string literal">;
+def err_expr_not_builtin : Error<"expression must be a valid builtin function for the target">;
def note_constexpr_assert_failed : Note<
"assertion failed during evaluation of constant expression">;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 6fb43d5cb0fbf..01e6a97e5a5c6 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -3764,6 +3764,39 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/false);
return RValue::get(Result);
}
+ case Builtin::BI__builtin_target_is_cpu: {
+ StringRef Str =
+ cast<clang::StringLiteral>(E->getArg(0)->IgnoreImpCasts())->getString();
+ llvm::LLVMContext &Ctx = CGM.getLLVMContext();
+ llvm::Value *MDArg =
+ llvm::MetadataAsValue::get(Ctx, llvm::MDString::get(Ctx, Str));
+ Function *F = CGM.getIntrinsic(Intrinsic::target_is_cpu);
+ Value *Result = Builder.CreateCall(F, MDArg);
+ llvm::Type *ResultType = ConvertType(E->getType());
+ return RValue::get(
+ Builder.CreateIntCast(Result, ResultType, /*isSigned=*/false));
+ }
+ case Builtin::BI__builtin_is_invocable: {
+ unsigned BID =
+ E->getArg(0)->EvaluateKnownConstInt(getContext()).getZExtValue();
+ StringRef Features(getContext().BuiltinInfo.getRequiredFeatures(BID));
+ for (StringRef Feat : llvm::split(Features, ','))
+ CheckedTargetFeatures.insert(Feat);
+ llvm::Type *ResultType = ConvertType(E->getType());
+ if (Features.empty())
+ return RValue::get(llvm::ConstantInt::getTrue(ResultType));
+ llvm::LLVMContext &Ctx = CGM.getLLVMContext();
+ Function *F = CGM.getIntrinsic(Intrinsic::target_has_feature);
+ Value *Result = nullptr;
+ for (StringRef Feat : llvm::split(Features, ',')) {
+ llvm::Value *MDArg =
+ llvm::MetadataAsValue::get(Ctx, llvm::MDString::get(Ctx, Feat));
+ Value *Has = Builder.CreateCall(F, MDArg);
+ Result = Result ? Builder.CreateAnd(Result, Has) : Has;
+ }
+ return RValue::get(Builder.CreateIntCast(Result, ResultType,
+ /*isSigned=*/false));
+ }
case Builtin::BI__builtin_dynamic_object_size:
case Builtin::BI__builtin_object_size: {
unsigned Type =
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 788599bc8746f..132032c5ed3a2 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -2913,9 +2913,17 @@ void CodeGenFunction::checkTargetFeatures(SourceLocation Loc,
StringRef FeatureList(CGM.getContext().BuiltinInfo.getRequiredFeatures(BuiltinID));
if (!Builtin::evaluateRequiredTargetFeatures(
FeatureList, CallerFeatureMap) && !IsHipStdPar) {
- CGM.getDiags().Report(Loc, diag::err_builtin_needs_feature)
- << TargetDecl->getDeclName()
- << FeatureList;
+ // Suppress the diagnostic if every required feature has been checked
+ // via a late-resolved intrinsic in this function.
+ bool AllFeaturesChecked =
+ !FeatureList.empty() &&
+ llvm::all_of(llvm::split(FeatureList, ','), [this](StringRef F) {
+ return CheckedTargetFeatures.contains(F);
+ });
+ if (!AllFeaturesChecked) {
+ CGM.getDiags().Report(Loc, diag::err_builtin_needs_feature)
+ << TargetDecl->getDeclName() << FeatureList;
+ }
}
} else if (!TargetDecl->isMultiVersion() &&
TargetDecl->hasAttr<TargetAttr>()) {
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index ae2956eeac57a..4bbcab0c80588 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -590,6 +590,11 @@ class CodeGenFunction : public CodeGenTypeCache {
/// potentially set the return value.
bool SawAsmBlock = false;
+ /// List of features to accept when the current function emits a late-resolved
+ /// target feature intrinsic like __builtin_is_invocable(). This suppresses
+ /// errors when the required feature is missing as it will be handled by LLVM.
+ llvm::StringSet<> CheckedTargetFeatures;
+
GlobalDecl CurSEHParent;
/// True if the current function is an outlined SEH helper. This can be a
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index aea5b722738aa..4208287f8086a 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2865,6 +2865,20 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
Context.getAuxTargetInfo(), BuiltinID))
return ExprError();
break;
+ case Builtin::BI__builtin_is_invocable:
+ if (checkArgCount(TheCall, 1))
+ return ExprError();
+ break;
+ case Builtin::BI__builtin_target_is_cpu: {
+ if (checkArgCount(TheCall, 1))
+ return ExprError();
+ Expr *Arg = TheCall->getArg(0)->IgnoreImpCasts();
+ if (!isa<StringLiteral>(Arg))
+ return ExprError(
+ Diag(TheCall->getBeginLoc(), diag::err_expr_not_string_literal)
+ << Arg->getSourceRange());
+ break;
+ }
case Builtin::BI__builtin_cpu_init:
if (!Context.getTargetInfo().supportsCpuInit()) {
Diag(TheCall->getBeginLoc(), diag::err_builtin_target_unsupported)
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 53d215f5c5e3e..0723ba738987f 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -6758,6 +6758,34 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc,
if (Result.isInvalid()) return ExprError();
Fn = Result.get();
+ // __builtin_is_invocable takes an unevaluated builtin function reference.
+ // We must resolve it before CheckArgsForPlaceholders rejects BuiltinFnTy.
+ // Replace the argument with an integer literal holding the builtin ID;
+ // CodeGen resolves the required features from the ID at emission time.
+ if (Fn->getType() == Context.BuiltinFnTy && ArgExprs.size() == 1) {
+ if (auto *DRE = dyn_cast<DeclRefExpr>(Fn->IgnoreParenImpCasts())) {
+ auto *CalleeFD = dyn_cast<FunctionDecl>(DRE->getDecl());
+ if (CalleeFD &&
+ CalleeFD->getBuiltinID() == Builtin::BI__builtin_is_invocable) {
+ if (ArgExprs[0]->getType() != Context.BuiltinFnTy) {
+ Diag(ArgExprs[0]->getExprLoc(), diag::err_expr_not_builtin)
+ << ArgExprs[0]->getSourceRange();
+ return ExprError();
+ }
+ auto *ArgDecl = ArgExprs[0]->getReferencedDeclOfCallee();
+ auto *ArgFD = dyn_cast_or_null<FunctionDecl>(ArgDecl);
+ if (!ArgFD || !ArgFD->getBuiltinID()) {
+ Diag(ArgExprs[0]->getExprLoc(), diag::err_expr_not_builtin)
+ << ArgExprs[0]->getSourceRange();
+ return ExprError();
+ }
+ ArgExprs[0] = IntegerLiteral::Create(
+ Context, llvm::APInt(32, ArgFD->getBuiltinID()),
+ Context.UnsignedIntTy, ArgExprs[0]->getExprLoc());
+ }
+ }
+ }
+
if (CheckArgsForPlaceholders(ArgExprs))
return ExprError();
diff --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
new file mode 100644
index 0000000000000..85535fdba86df
--- /dev/null
+++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
@@ -0,0 +1,68 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1030 -O2 -emit-llvm -o - %s | FileCheck %s --check-prefix=GFX1030
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -O2 -emit-llvm -o - %s | FileCheck %s --check-prefix=GFX900
+
+void permlane16_path(void);
+void permlane64_path(void);
+void popcount_path(void);
+void fallback_path(void);
+
+// __builtin_amdgcn_permlane16 requires "gfx10-insts".
+void test_is_invocable_permlane16(void) {
+// GFX1030-LABEL: @test_is_invocable_permlane16
+// GFX1030: call i1 @llvm.target.has.feature(metadata !"gfx10-insts")
+//
+// GFX900-LABEL: @test_is_invocable_permlane16
+// GFX900: call i1 @llvm.target.has.feature(metadata !"gfx10-insts")
+ if (__builtin_is_invocable(__builtin_amdgcn_permlane16))
+ permlane16_path();
+}
+
+// __builtin_amdgcn_permlane64 requires "gfx11-insts".
+void test_is_invocable_permlane64(void) {
+// GFX1030-LABEL: @test_is_invocable_permlane64
+// GFX1030: call i1 @llvm.target.has.feature(metadata !"gfx11-insts")
+//
+// GFX900-LABEL: @test_is_invocable_permlane64
+// GFX900: call i1 @llvm.target.has.feature(metadata !"gfx11-insts")
+ if (__builtin_is_invocable(__builtin_amdgcn_permlane64))
+ permlane64_path();
+}
+
+// __builtin_popcount has no required features, fold to constant.
+void test_is_invocable_popcount(void) {
+// GFX1030-LABEL: @test_is_invocable_popcount
+// GFX1030-NOT: call i1 @llvm.target.has.feature
+// GFX1030: call void @popcount_path
+//
+// GFX900-LABEL: @test_is_invocable_popcount
+// GFX900-NOT: call i1 @llvm.target.has.feature
+// GFX900: call void @popcount_path
+ if (__builtin_is_invocable(__builtin_popcount))
+ popcount_path();
+}
+
+void test_dispatch(void) {
+// GFX1030-LABEL: @test_dispatch
+// GFX1030: call i1 @llvm.target.has.feature(metadata !"gfx10-insts")
+//
+// GFX900-LABEL: @test_dispatch
+// GFX900: call i1 @llvm.target.has.feature(metadata !"gfx10-insts")
+ if (__builtin_is_invocable(__builtin_amdgcn_permlane16))
+ permlane16_path();
+ else
+ fallback_path();
+}
+
+// Calling a feature-gated builtin inside a guard must not produce an error.
+void test_guarded_builtin_call(unsigned *out, unsigned a, unsigned b,
+ unsigned c, unsigned d) {
+// GFX1030-LABEL: @test_guarded_builtin_call
+// GFX1030: call i1 @llvm.target.has.feature(metadata !"gfx10-insts")
+// GFX1030: call{{.*}} i32 @llvm.amdgcn.permlane16
+//
+// GFX900-LABEL: @test_guarded_builtin_call
+// GFX900: call i1 @llvm.target.has.feature(metadata !"gfx10-insts")
+// GFX900: call{{.*}} i32 @llvm.amdgcn.permlane16
+ if (__builtin_is_invocable(__builtin_amdgcn_permlane16))
+ *out = __builtin_amdgcn_permlane16(a, b, c, d, 0, 0);
+}
diff --git a/clang/test/CodeGen/builtin-target-feature.c b/clang/test/CodeGen/builtin-target-feature.c
new file mode 100644
index 0000000000000..aea8e23c0e6ca
--- /dev/null
+++ b/clang/test/CodeGen/builtin-target-feature.c
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -triple x86_64-linux -target-cpu haswell -O2 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux -target-cpu haswell -O0 -emit-llvm -o - %s | FileCheck %s
+
+// The intrinsics survive -emit-llvm (they are only resolved in the
+// backend pipeline, not the mid-end). Verify they are emitted correctly.
+
+void avx2_path(void);
+
+void test_is_haswell(void) {
+// CHECK-LABEL: @test_is_haswell
+// CHECK: call i1 @llvm.target.is.cpu(metadata !"haswell")
+ if (__builtin_target_is_cpu("haswell"))
+ avx2_path();
+}
diff --git a/clang/test/Sema/builtin-target-feature.c b/clang/test/Sema/builtin-target-feature.c
new file mode 100644
index 0000000000000..4289239ecb8df
--- /dev/null
+++ b/clang/test/Sema/builtin-target-feature.c
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -triple x86_64-linux -fsyntax-only -verify %s
+
+void test_good_cpu_condition(void) {
+ if (__builtin_target_is_cpu("haswell"))
+ ;
+}
+
+void test_good_is_invocable(void) {
+ if (__builtin_is_invocable(__builtin_popcount))
+ ;
+}
+
+const char *str = "avx2";
+
+_Bool test_not_literal_cpu(void) {
+ return __builtin_target_is_cpu(str); // expected-error {{expression is not a string literal}}
+}
+
+void not_a_builtin(void);
+
+_Bool test_is_invocable_string_literal(void) {
+ return __builtin_is_invocable("avx2"); // expected-error {{expression must be a valid builtin function for the target}}
+}
+
+_Bool test_is_invocable_non_builtin(void) {
+ return __builtin_is_invocable(not_a_builtin); // expected-error {{expression must be a valid builtin function for the target}}
+}
+
+_Bool test_is_invocable_variable(void) {
+ return __builtin_is_invocable(str); // expected-error {{expression must be a valid builtin function for the target}}
+}
diff --git a/llvm/include/llvm/CodeGen/LowerTargetIntrinsics.h b/llvm/include/llvm/CodeGen/LowerTargetIntrinsics.h
new file mode 100644
index 0000000000000..fdc9156f0d712
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/LowerTargetIntrinsics.h
@@ -0,0 +1,45 @@
+//===- LowerTargetIntrinsics.h - Lower feature intrinsics -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers 'llvm.target.has.feature' and 'llvm.target.is.cpu'
+// intrinsics into constants by querying the TargetMachine's subtarget for the
+// enclosing function. It then propagates the resulting constants, folds
+// branches, and removes dead blocks.
+//
+// This is a correctness requirement, code guarded by these intrinsics may
+// contain instructions illegal on the current target. The dead code must be
+// eliminated before ISel. The pass must run even on optnone functions at -O0.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_LOWERTARGETINTRINSICS_H
+#define LLVM_CODEGEN_LOWERTARGETINTRINSICS_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class TargetMachine;
+
+/// Lower all llvm.target.has.feature and llvm.target.is.cpu calls in \p F to
+/// constants by querying \p TM.
+bool lowerTargetIntrinsics(Function &F, const TargetMachine &TM);
+
+class LowerTargetIntrinsicsPass
+ : public PassInfoMixin<LowerTargetIntrinsicsPass> {
+ const TargetMachine *TM;
+
+public:
+ LowerTargetIntrinsicsPass(const TargetMachine *TM) : TM(TM) {}
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+ static bool isRequired() { return true; }
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_LOWERTARGETINTRINSICS_H
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 5b5fffaa48951..1e9b083354f3c 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1980,6 +1980,20 @@ def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty],
def int_is_constant : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty],
[IntrNoMem, IntrConvergent]>;
+// Late-resolved intrinsic to query whether the enclosing function's resolved
+// target features include a named feature. The argument is a metadata string
+// naming the feature. Resolved against the TargetMachine's subtarget for the
+// function. Must be lowered before ISel.
+def int_target_has_feature : DefaultAttrsIntrinsic<[llvm_i1_ty],
+ [llvm_metadata_ty],
+ [IntrNoMem, IntrWillReturn]>;
+
+// Intrinsic to query whether the enclosing function's target CPU exactly
+// matches a named CPU. Same contract as llvm.target.has.feature.
+def int_target_is_cpu : DefaultAttrsIntrinsic<[llvm_i1_ty],
+ [llvm_metadata_ty],
+ [IntrNoMem, IntrWillReturn]>;
+
// Introduce a use of the argument without generating any code.
def int_fake_use : DefaultAttrsIntrinsic<[], [llvm_vararg_ty],
[IntrHasSideEffects, IntrInaccessibleMemOnly]>;
diff --git a/llvm/include/llvm/MC/MCSubtargetInfo.h b/llvm/include/llvm/MC/MCSubtargetInfo.h
index 708de6d98f40b..52050ef686477 100644
--- a/llvm/include/llvm/MC/MCSubtargetInfo.h
+++ b/llvm/include/llvm/MC/MCSubtargetInfo.h
@@ -159,6 +159,10 @@ class LLVM_ABI MCSubtargetInfo {
/// the provided string, ignoring all other features.
bool checkFeatures(StringRef FS) const;
+ /// Check if a named feature is enabled in this subtarget.
+ /// Returns false for unknown/unrecognized feature names.
+ bool hasFeatureString(StringRef FeatureName) const;
+
/// Get the machine model of a CPU.
const MCSchedModel &getSchedModelForCPU(StringRef CPU) const;
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index 2ba82cbcdde30..811994ddcdf38 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -2534,6 +2534,9 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) {
return true;
case Intrinsic::is_constant:
return simplifyIntrinsicCallIsConstant(Call);
+ case Intrinsic::target_has_feature:
+ case Intrinsic::target_is_cpu:
+ return true;
case Intrinsic::objectsize:
return simplifyIntrinsicCallObjectSize(Call);
}
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 4da15280029d9..96a12cead5b7b 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -4429,6 +4429,11 @@ static Value *simplifyWithOpsReplaced(Value *V,
if (match(I, m_Intrinsic<Intrinsic::is_constant>()))
return nullptr;
+ // Don't fold target reflection intrinsics based on assumptions.
+ if (match(I, m_Intrinsic<Intrinsic::target_has_feature>()) ||
+ match(I, m_Intrinsic<Intrinsic::target_is_cpu>()))
+ return nullptr;
+
// Don't simplify freeze.
if (isa<FreezeInst>(I))
return nullptr;
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index 7620d546091c8..d497b43cc6d03 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -108,6 +108,7 @@ add_llvm_component_library(LLVMCodeGen
LoopTraversal.cpp
LowLevelTypeUtils.cpp
LowerEmuTLS.cpp
+ LowerTargetIntrinsics.cpp
MachineBasicBlock.cpp
MachineBlockFrequencyInfo.cpp
MachineBlockHashInfo.cpp
diff --git a/llvm/lib/CodeGen/LowerTargetIntrinsics.cpp b/llvm/lib/CodeGen/LowerTargetIntrinsics.cpp
new file mode 100644
index 0000000000000..4daadfdb76d98
--- /dev/null
+++ b/llvm/lib/CodeGen/LowerTargetIntrinsics.cpp
@@ -0,0 +1,161 @@
+//===- LowerTargetIntrinsics.cpp - Lower target feature intrinsics --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers llvm.target.has.feature and llvm.target.is.cpu intrinsics to
+// constants using the TargetMachine. It then propagates the constants, folds
+// branches, and removes dead blocks. This is a correctness requirement.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LowerTargetIntrinsics.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lower-target-intrinsics"
+
+STATISTIC(NumHasFeatureLowered, "Number of llvm.target.has.feature lowered");
+STATISTIC(NumIsCpuLowered, "Number of llvm.target.is.cpu lowered");
+
+// Extract a metadata string from an intrinsic argument that is a
+// MetadataAsValue wrapping an MDString. Returns empty StringRef on failure.
+static StringRef extractMetadataStringArg(const CallInst *CI, unsigned ArgNo) {
+ auto *MAV = dyn_cast<MetadataAsValue>(CI->getArgOperand(ArgNo));
+ if (!MAV)
+ return StringRef();
+ auto *MDS = dyn_cast<MDString>(MAV->getMetadata());
+ return MDS ? MDS->getString() : StringRef();
+}
+
+static Constant *resolveTargetIntrinsic(IntrinsicInst *II,
+ const TargetMachine *TM) {
+ StringRef Name = extractMetadataStringArg(II, 0);
+ Function *F = II->getFunction();
+ LLVMContext &Ctx = II->getContext();
+
+ if (Name.empty())
+ return ConstantInt::getFalse(Ctx);
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::target_has_feature: {
+ const TargetSubtargetInfo *STI = TM->getSubtargetImpl(*F);
+ bool Has = STI ? STI->hasFeatureString(Name) : false;
+ LLVM_DEBUG(dbgs() << " has.feature(\"" << Name << "\") -> " << Has
+ << "\n");
+ ++NumHasFeatureLowered;
+ return ConstantInt::getBool(Ctx, Has);
+ }
+ case Intrinsic::target_is_cpu: {
+ const TargetSubtargetInfo *STI = TM->getSubtargetImpl(*F);
+ StringRef CPU = STI ? STI->getCPU() : "";
+ bool Match = (CPU == Name);
+ LLVM_DEBUG(dbgs() << " is.cpu(\"" << Name << "\") [actual=\"" << CPU
+ << "\"] -> " << Match << "\n");
+ ++NumIsCpuLowered;
+ return ConstantInt::getBool(Ctx, Match);
+ }
+ default:
+ llvm_unreachable("Not a target reflection intrinsic");
+ }
+}
+
+static void promoteAllocasToSSA(Function &F, DominatorTree &DT) {
+ SmallVector<AllocaInst *, 8> Promotable;
+ BasicBlock &Entry = F.getEntryBlock();
+ for (Instruction &I : Entry)
+ if (auto *AI = dyn_cast<AllocaInst>(&I))
+ if (isAllocaPromotable(AI))
+ Promotable.push_back(AI);
+
+ if (!Promotable.empty())
+ PromoteMemToReg(Promotable, DT);
+}
+
+static SmallVector<IntrinsicInst *, 8> collectTargetIntrinsics(Function &F) {
+ SmallVector<IntrinsicInst *, 8> Calls;
+ for (Instruction &I : instructions(&F))
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::target_has_feature ||
+ II->getIntrinsicID() == Intrinsic::target_is_cpu)
+ Calls.push_back(II);
+ return Calls;
+}
+
+bool llvm::lowerTargetIntrinsics(Function &F, const TargetMachine &TM) {
+ SmallVector<IntrinsicInst *, 8> TargetCalls = collectTargetIntrinsics(F);
+ if (TargetCalls.empty())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "LowerTargetIntrinsics: processing " << F.getName()
+ << " (" << TargetCalls.size() << " calls)\n");
+
+ // Promote allocas to SSA so constant propagation works through
+ // the alloca/store/load patterns Clang emits at -O0.
+ DominatorTree DT(F);
+ promoteAllocasToSSA(F, DT);
+
+ // Re-collect after mem2reg may have changed things.
+ TargetCalls = collectTargetIntrinsics(F);
+ if (TargetCalls.empty())
+ return true;
+
+ // Resolve each intrinsic to a constant and propagate.
+ for (IntrinsicInst *II : TargetCalls) {
+ Constant *Val = resolveTargetIntrinsic(II, &TM);
+ replaceAndRecursivelySimplify(II, Val);
+ }
+
+ // Fold now-constant terminators and remove dead blocks.
+ for (BasicBlock &BB : make_early_inc_range(F))
+ ConstantFoldTerminator(&BB, true);
+ removeUnreachableBlocks(F);
+
+ return true;
+}
+
+PreservedAnalyses LowerTargetIntrinsicsPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ Function *HasFeatureDecl =
+ Intrinsic::getDeclarationIfExists(&M, Intrinsic::target_has_feature);
+ Function *IsCpuDecl =
+ Intrinsic::getDeclarationIfExists(&M, Intrinsic::target_is_cpu);
+ if (!HasFeatureDecl && !IsCpuDecl)
+ return PreservedAnalyses::all();
+
+ // Collect the set of functions that contain calls to these intrinsics.
+ SmallPtrSet<Function *, 8> AffectedFunctions;
+ for (Function *Decl : {HasFeatureDecl, IsCpuDecl}) {
+ if (!Decl)
+ continue;
+ for (User *U : Decl->users())
+ if (auto *CI = dyn_cast<CallInst>(U))
+ AffectedFunctions.insert(CI->getFunction());
+ }
+
+ bool Changed = false;
+ for (Function *F : AffectedFunctions)
+ Changed |= lowerTargetIntrinsics(*F, *TM);
+
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 0544995f979f7..1a7434ca15590 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -19,9 +19,11 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/ExpandVectorPredication.h"
#include "llvm/CodeGen/LibcallLoweringInfo.h"
+#include "llvm/CodeGen/LowerTargetIntrinsics.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
@@ -702,6 +704,17 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
return Changed;
});
break;
+ case Intrinsic::target_has_feature:
+ case Intrinsic::target_is_cpu:
+ if (TM) {
+ SmallPtrSet<Function *, 4> Visited;
+ for (User *U : F.users())
+ if (auto *CI = dyn_cast<CallInst>(U))
+ Visited.insert(CI->getFunction());
+ for (Function *Fn : Visited)
+ Changed |= lowerTargetIntrinsics(*Fn, *TM);
+ }
+ break;
#define BEGIN_REGISTER_VP_INTRINSIC(VPID, MASKPOS, VLENPOS) \
case Intrinsic::VPID:
#include "llvm/IR/VPIntrinsics.def"
diff --git a/llvm/lib/MC/MCSubtargetInfo.cpp b/llvm/lib/MC/MCSubtargetInfo.cpp
index 7ad6c76978793..f705fece8278a 100644
--- a/llvm/lib/MC/MCSubtargetInfo.cpp
+++ b/llvm/lib/MC/MCSubtargetInfo.cpp
@@ -332,6 +332,13 @@ bool MCSubtargetInfo::checkFeatures(StringRef FS) const {
});
}
+bool MCSubtargetInfo::hasFeatureString(StringRef FeatureName) const {
+ auto It = llvm::lower_bound(ProcFeatures, FeatureName);
+ if (It != ProcFeatures.end() && StringRef(It->Key) == FeatureName)
+ return FeatureBits.test(It->Value);
+ return false;
+}
+
const MCSchedModel &MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const {
assert(llvm::is_sorted(ProcDesc) &&
"Processor machine model table is not sorted");
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 879713f4d6e57..9054db4fdc850 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -107,6 +107,7 @@ MODULE_PASS("jmc-instrumenter", JMCInstrumenterPass())
MODULE_PASS("lower-emutls", LowerEmuTLSPass())
MODULE_PASS("lower-global-dtors", LowerGlobalDtorsPass())
MODULE_PASS("lower-ifunc", LowerIFuncPass())
+MODULE_PASS("lower-target-intrinsics", LowerTargetIntrinsicsPass(TM))
MODULE_PASS("simplify-type-tests", SimplifyTypeTestsPass())
MODULE_PASS("lowertypetests", LowerTypeTestsPass())
MODULE_PASS("fatlto-cleanup", FatLtoCleanup())
diff --git a/llvm/test/Transforms/LowerTargetIntrinsics/amdgpu.ll b/llvm/test/Transforms/LowerTargetIntrinsics/amdgpu.ll
new file mode 100644
index 0000000000000..c9f78d38d8009
--- /dev/null
+++ b/llvm/test/Transforms/LowerTargetIntrinsics/amdgpu.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S -passes=lower-target-intrinsics -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s --check-prefix=GFX1030
+; RUN: opt -S -passes=lower-target-intrinsics -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck %s --check-prefix=GFX90A
+
+declare void @rdna_path()
+declare void @cdna_path()
+declare void @generic_path()
+
+; Cascaded llvm.target.is.cpu dispatch: each RUN line selects a different
+; -mcpu, and the NOT checks require that the calls guarded by the
+; non-matching CPU names do not survive folding.
+define void @test_amdgpu_cpu_dispatch() {
+; GFX1030-LABEL: @test_amdgpu_cpu_dispatch(
+; GFX1030-NOT: @cdna_path
+; GFX1030-NOT: @generic_path
+; GFX1030: call void @rdna_path()
+; GFX1030: ret void
+;
+; GFX90A-LABEL: @test_amdgpu_cpu_dispatch(
+; GFX90A-NOT: @rdna_path
+; GFX90A-NOT: @generic_path
+; GFX90A: call void @cdna_path()
+; GFX90A: ret void
+entry:
+ %is_gfx1030 = call i1 @llvm.target.is.cpu(metadata !"gfx1030")
+ br i1 %is_gfx1030, label %rdna, label %check_cdna
+
+rdna:
+ call void @rdna_path()
+ ret void
+
+check_cdna:
+ %is_gfx90a = call i1 @llvm.target.is.cpu(metadata !"gfx90a")
+ br i1 %is_gfx90a, label %cdna, label %generic
+
+cdna:
+ call void @cdna_path()
+ ret void
+
+generic:
+ call void @generic_path()
+ ret void
+}
diff --git a/llvm/test/Transforms/LowerTargetIntrinsics/basic.ll b/llvm/test/Transforms/LowerTargetIntrinsics/basic.ll
new file mode 100644
index 0000000000000..a1a384befc79d
--- /dev/null
+++ b/llvm/test/Transforms/LowerTargetIntrinsics/basic.ll
@@ -0,0 +1,52 @@
+; RUN: opt -S -passes=lower-target-intrinsics -mtriple=x86_64-unknown-linux -mcpu=haswell < %s | FileCheck %s --check-prefix=HASWELL
+; RUN: opt -S -passes=lower-target-intrinsics -mtriple=x86_64-unknown-linux -mcpu=x86-64 < %s | FileCheck %s --check-prefix=GENERIC
+
+; A feature present on the selected CPU (avx2 on haswell) folds to true;
+; the same query folds to false when -mcpu=x86-64 lacks it.
+define i1 @test_has_avx2() {
+; HASWELL-LABEL: @test_has_avx2(
+; HASWELL-NEXT: ret i1 true
+;
+; GENERIC-LABEL: @test_has_avx2(
+; GENERIC-NEXT: ret i1 false
+ %1 = call i1 @llvm.target.has.feature(metadata !"avx2")
+ ret i1 %1
+}
+
+; sse2 is available on both tested CPUs, so the query folds to true either way.
+define i1 @test_has_sse2() {
+; HASWELL-LABEL: @test_has_sse2(
+; HASWELL-NEXT: ret i1 true
+;
+; GENERIC-LABEL: @test_has_sse2(
+; GENERIC-NEXT: ret i1 true
+ %1 = call i1 @llvm.target.has.feature(metadata !"sse2")
+ ret i1 %1
+}
+
+; An unrecognized feature name folds to false on every CPU instead of
+; producing an error.
+define i1 @test_has_bogus() {
+; HASWELL-LABEL: @test_has_bogus(
+; HASWELL-NEXT: ret i1 false
+;
+; GENERIC-LABEL: @test_has_bogus(
+; GENERIC-NEXT: ret i1 false
+ %1 = call i1 @llvm.target.has.feature(metadata !"bogus")
+ ret i1 %1
+}
+
+; llvm.target.is.cpu folds to true only when the name matches the -mcpu value.
+define i1 @test_is_haswell() {
+; HASWELL-LABEL: @test_is_haswell(
+; HASWELL-NEXT: ret i1 true
+;
+; GENERIC-LABEL: @test_is_haswell(
+; GENERIC-NEXT: ret i1 false
+ %1 = call i1 @llvm.target.is.cpu(metadata !"haswell")
+ ret i1 %1
+}
+
+; The generic "x86-64" CPU name is matched exactly, like any other CPU name.
+define i1 @test_is_generic() {
+; HASWELL-LABEL: @test_is_generic(
+; HASWELL-NEXT: ret i1 false
+;
+; GENERIC-LABEL: @test_is_generic(
+; GENERIC-NEXT: ret i1 true
+ %1 = call i1 @llvm.target.is.cpu(metadata !"x86-64")
+ ret i1 %1
+}
diff --git a/llvm/test/Transforms/LowerTargetIntrinsics/complex-cfg.ll b/llvm/test/Transforms/LowerTargetIntrinsics/complex-cfg.ll
new file mode 100644
index 0000000000000..9cab6492f680f
--- /dev/null
+++ b/llvm/test/Transforms/LowerTargetIntrinsics/complex-cfg.ll
@@ -0,0 +1,334 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=lower-target-intrinsics -mtriple=x86_64-unknown-linux -mcpu=haswell < %s | FileCheck %s --check-prefix=HASWELL
+; RUN: opt -S -passes=lower-target-intrinsics -mtriple=x86_64-unknown-linux < %s | FileCheck %s --check-prefix=DEFAULT
+
+; Complex control flow tests to ensure all dead blocks are removed.
+
+declare void @avx2_path()
+declare void @avx512_path()
+declare void @fallback_path()
+declare void @common_work()
+declare i32 @get_val()
+declare void @sse2_only_path()
+declare void @avx2_and_sse2_path()
+
+; The feature check sits inside a loop body: folding must rewire the body
+; straight to the surviving arm while preserving the phi/latch structure.
+define void @test_loop_with_check(i32 %n) {
+; HASWELL-LABEL: define void @test_loop_with_check(
+; HASWELL-SAME: i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; HASWELL-NEXT: [[ENTRY:.*]]:
+; HASWELL-NEXT: br label %[[LOOP_HEADER:.*]]
+; HASWELL: [[LOOP_HEADER]]:
+; HASWELL-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; HASWELL-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]]
+; HASWELL-NEXT: br i1 [[CMP]], label %[[LOOP_BODY:.*]], label %[[EXIT:.*]]
+; HASWELL: [[LOOP_BODY]]:
+; HASWELL-NEXT: br label %[[AVX2_BB:.*]]
+; HASWELL: [[AVX2_BB]]:
+; HASWELL-NEXT: call void @avx2_path()
+; HASWELL-NEXT: br label %[[LOOP_LATCH]]
+; HASWELL: [[LOOP_LATCH]]:
+; HASWELL-NEXT: [[I_NEXT]] = add i32 [[I]], 1
+; HASWELL-NEXT: br label %[[LOOP_HEADER]]
+; HASWELL: [[EXIT]]:
+; HASWELL-NEXT: ret void
+;
+; DEFAULT-LABEL: define void @test_loop_with_check(
+; DEFAULT-SAME: i32 [[N:%.*]]) {
+; DEFAULT-NEXT: [[ENTRY:.*]]:
+; DEFAULT-NEXT: br label %[[LOOP_HEADER:.*]]
+; DEFAULT: [[LOOP_HEADER]]:
+; DEFAULT-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; DEFAULT-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]]
+; DEFAULT-NEXT: br i1 [[CMP]], label %[[LOOP_BODY:.*]], label %[[EXIT:.*]]
+; DEFAULT: [[LOOP_BODY]]:
+; DEFAULT-NEXT: br label %[[FALLBACK_BB:.*]]
+; DEFAULT: [[FALLBACK_BB]]:
+; DEFAULT-NEXT: call void @fallback_path()
+; DEFAULT-NEXT: br label %[[LOOP_LATCH]]
+; DEFAULT: [[LOOP_LATCH]]:
+; DEFAULT-NEXT: [[I_NEXT]] = add i32 [[I]], 1
+; DEFAULT-NEXT: br label %[[LOOP_HEADER]]
+; DEFAULT: [[EXIT]]:
+; DEFAULT-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %i = phi i32 [ 0, %entry ], [ %i.next, %loop.latch ]
+ %cmp = icmp slt i32 %i, %n
+ br i1 %cmp, label %loop.body, label %exit
+
+loop.body:
+ %has_avx2 = call i1 @llvm.target.has.feature(metadata !"avx2")
+ br i1 %has_avx2, label %avx2.bb, label %fallback.bb
+
+avx2.bb:
+ call void @avx2_path()
+ br label %loop.latch
+
+fallback.bb:
+ call void @fallback_path()
+ br label %loop.latch
+
+loop.latch:
+ %i.next = add i32 %i, 1
+ br label %loop.header
+
+exit:
+ ret void
+}
+
+; Two independent feature checks in sequence: each one is folded on its own,
+; so on haswell (avx2 but no avx512f) the first branch keeps its taken arm
+; while the second keeps its skip arm.
+define void @test_multiple_checks() {
+; HASWELL-LABEL: define void @test_multiple_checks(
+; HASWELL-SAME: ) #[[ATTR0]] {
+; HASWELL-NEXT: [[ENTRY:.*:]]
+; HASWELL-NEXT: br label %[[DO_AVX2:.*]]
+; HASWELL: [[DO_AVX2]]:
+; HASWELL-NEXT: call void @avx2_path()
+; HASWELL-NEXT: br label %[[CHECK_AVX512:.*]]
+; HASWELL: [[CHECK_AVX512]]:
+; HASWELL-NEXT: br label %[[SKIP_AVX512:.*]]
+; HASWELL: [[SKIP_AVX512]]:
+; HASWELL-NEXT: call void @common_work()
+; HASWELL-NEXT: br label %[[DONE:.*]]
+; HASWELL: [[DONE]]:
+; HASWELL-NEXT: ret void
+;
+; DEFAULT-LABEL: define void @test_multiple_checks() {
+; DEFAULT-NEXT: [[ENTRY:.*:]]
+; DEFAULT-NEXT: br label %[[SKIP_AVX2:.*]]
+; DEFAULT: [[SKIP_AVX2]]:
+; DEFAULT-NEXT: br label %[[CHECK_AVX512:.*]]
+; DEFAULT: [[CHECK_AVX512]]:
+; DEFAULT-NEXT: br label %[[SKIP_AVX512:.*]]
+; DEFAULT: [[SKIP_AVX512]]:
+; DEFAULT-NEXT: call void @common_work()
+; DEFAULT-NEXT: br label %[[DONE:.*]]
+; DEFAULT: [[DONE]]:
+; DEFAULT-NEXT: ret void
+;
+entry:
+ %has_avx2 = call i1 @llvm.target.has.feature(metadata !"avx2")
+ br i1 %has_avx2, label %do_avx2, label %skip_avx2
+
+do_avx2:
+ call void @avx2_path()
+ br label %check_avx512
+
+skip_avx2:
+ br label %check_avx512
+
+check_avx512:
+ %has_avx512 = call i1 @llvm.target.has.feature(metadata !"avx512f")
+ br i1 %has_avx512, label %do_avx512, label %skip_avx512
+
+do_avx512:
+ call void @avx512_path()
+ br label %done
+
+skip_avx512:
+ call void @common_work()
+ br label %done
+
+done:
+ ret void
+}
+
+; The folded i1 result feeds a zext, so the constant must propagate into a
+; plain integer return. (The stale CHECK-prefixed lines are removed: no RUN
+; line in this file defines a CHECK prefix, so they never executed.)
+define i32 @test_phi_propagation() {
+; HASWELL-LABEL: define i32 @test_phi_propagation(
+; HASWELL-SAME: ) #[[ATTR0]] {
+; HASWELL-NEXT: [[ENTRY:.*:]]
+; HASWELL-NEXT: ret i32 1
+;
+; DEFAULT-LABEL: define i32 @test_phi_propagation() {
+; DEFAULT-NEXT: [[ENTRY:.*:]]
+; DEFAULT-NEXT: ret i32 0
+;
+entry:
+ %has_avx2 = call i1 @llvm.target.has.feature(metadata !"avx2")
+ %val = zext i1 %has_avx2 to i32
+ ret i32 %val
+}
+
+; A chained if/else-if cascade: haswell fails the avx512f check but passes the
+; avx2 check, so only the middle arm survives; without a CPU both checks fold
+; to false and only the fallback remains.
+define void @test_cascade() {
+; HASWELL-LABEL: define void @test_cascade(
+; HASWELL-SAME: ) #[[ATTR0]] {
+; HASWELL-NEXT: [[ENTRY:.*:]]
+; HASWELL-NEXT: br label %[[TRY_AVX2:.*]]
+; HASWELL: [[TRY_AVX2]]:
+; HASWELL-NEXT: br label %[[USE_AVX2:.*]]
+; HASWELL: [[USE_AVX2]]:
+; HASWELL-NEXT: call void @avx2_path()
+; HASWELL-NEXT: br label %[[DONE:.*]]
+; HASWELL: [[DONE]]:
+; HASWELL-NEXT: ret void
+;
+; DEFAULT-LABEL: define void @test_cascade() {
+; DEFAULT-NEXT: [[ENTRY:.*:]]
+; DEFAULT-NEXT: br label %[[TRY_AVX2:.*]]
+; DEFAULT: [[TRY_AVX2]]:
+; DEFAULT-NEXT: br label %[[USE_FALLBACK:.*]]
+; DEFAULT: [[USE_FALLBACK]]:
+; DEFAULT-NEXT: call void @fallback_path()
+; DEFAULT-NEXT: br label %[[DONE:.*]]
+; DEFAULT: [[DONE]]:
+; DEFAULT-NEXT: ret void
+;
+entry:
+ %has_avx512 = call i1 @llvm.target.has.feature(metadata !"avx512f")
+ br i1 %has_avx512, label %use_avx512, label %try_avx2
+
+use_avx512:
+ call void @avx512_path()
+ br label %done
+
+try_avx2:
+ %has_avx2 = call i1 @llvm.target.has.feature(metadata !"avx2")
+ br i1 %has_avx2, label %use_avx2, label %use_fallback
+
+use_avx2:
+ call void @avx2_path()
+ br label %done
+
+use_fallback:
+ call void @fallback_path()
+ br label %done
+
+done:
+ ret void
+}
+
+; A simple two-way branch on a folded feature test: only the selected arm may
+; remain. (The stale GENERIC-prefixed lines are removed: no RUN line in this
+; file defines a GENERIC prefix, so those checks never executed.)
+define void @test_branch_fold() {
+; HASWELL-LABEL: define void @test_branch_fold(
+; HASWELL-SAME: ) #[[ATTR0]] {
+; HASWELL-NEXT: [[ENTRY:.*:]]
+; HASWELL-NEXT: br label %[[AVX2_BB:.*]]
+; HASWELL: [[AVX2_BB]]:
+; HASWELL-NEXT: call void @avx2_path()
+; HASWELL-NEXT: ret void
+;
+; DEFAULT-LABEL: define void @test_branch_fold() {
+; DEFAULT-NEXT: [[ENTRY:.*:]]
+; DEFAULT-NEXT: br label %[[FALLBACK_BB:.*]]
+; DEFAULT: [[FALLBACK_BB]]:
+; DEFAULT-NEXT: call void @fallback_path()
+; DEFAULT-NEXT: ret void
+;
+entry:
+ %has_avx2 = call i1 @llvm.target.has.feature(metadata !"avx2")
+ br i1 %has_avx2, label %avx2.bb, label %fallback.bb
+
+avx2.bb:
+ call void @avx2_path()
+ ret void
+
+fallback.bb:
+ call void @fallback_path()
+ ret void
+}
+
+; A diamond with a phi in the merge block: removing the dead arm must also
+; collapse the phi to the surviving incoming constant. (Stale GENERIC-prefixed
+; lines removed: no RUN line in this file defines a GENERIC prefix.)
+define i32 @test_diamond_phi(i32 %x) {
+; HASWELL-LABEL: define i32 @test_diamond_phi(
+; HASWELL-SAME: i32 [[X:%.*]]) #[[ATTR0]] {
+; HASWELL-NEXT: [[ENTRY:.*:]]
+; HASWELL-NEXT: br label %[[AVX2_BB:.*]]
+; HASWELL: [[AVX2_BB]]:
+; HASWELL-NEXT: call void @avx2_path()
+; HASWELL-NEXT: br label %[[MERGE:.*]]
+; HASWELL: [[MERGE]]:
+; HASWELL-NEXT: ret i32 42
+;
+; DEFAULT-LABEL: define i32 @test_diamond_phi(
+; DEFAULT-SAME: i32 [[X:%.*]]) {
+; DEFAULT-NEXT: [[ENTRY:.*:]]
+; DEFAULT-NEXT: br label %[[FALLBACK_BB:.*]]
+; DEFAULT: [[FALLBACK_BB]]:
+; DEFAULT-NEXT: call void @fallback_path()
+; DEFAULT-NEXT: br label %[[MERGE:.*]]
+; DEFAULT: [[MERGE]]:
+; DEFAULT-NEXT: ret i32 99
+;
+entry:
+ %has_avx2 = call i1 @llvm.target.has.feature(metadata !"avx2")
+ br i1 %has_avx2, label %avx2.bb, label %fallback.bb
+
+avx2.bb:
+ call void @avx2_path()
+ br label %merge
+
+fallback.bb:
+ call void @fallback_path()
+ br label %merge
+
+merge:
+ %result = phi i32 [ 42, %avx2.bb ], [ 99, %fallback.bb ]
+ ret i32 %result
+}
+
+; Nested feature branches: both levels must fold, leaving a single path per
+; CPU. (Stale GENERIC-prefixed lines removed: no RUN line in this file
+; defines a GENERIC prefix, so those checks never executed.)
+define void @test_nested_branch() {
+; HASWELL-LABEL: define void @test_nested_branch(
+; HASWELL-SAME: ) #[[ATTR0]] {
+; HASWELL-NEXT: [[ENTRY:.*:]]
+; HASWELL-NEXT: br label %[[CHECK_SSE2:.*]]
+; HASWELL: [[CHECK_SSE2]]:
+; HASWELL-NEXT: br label %[[BOTH:.*]]
+; HASWELL: [[BOTH]]:
+; HASWELL-NEXT: call void @avx2_and_sse2_path()
+; HASWELL-NEXT: ret void
+;
+; DEFAULT-LABEL: define void @test_nested_branch() {
+; DEFAULT-NEXT: [[ENTRY:.*:]]
+; DEFAULT-NEXT: br label %[[NO_AVX2:.*]]
+; DEFAULT: [[NO_AVX2]]:
+; DEFAULT-NEXT: br label %[[SSE2_ONLY:.*]]
+; DEFAULT: [[SSE2_ONLY]]:
+; DEFAULT-NEXT: call void @sse2_only_path()
+; DEFAULT-NEXT: ret void
+;
+entry:
+ %has_avx2 = call i1 @llvm.target.has.feature(metadata !"avx2")
+ br i1 %has_avx2, label %check.sse2, label %no.avx2
+
+check.sse2:
+ %has_sse2 = call i1 @llvm.target.has.feature(metadata !"sse2")
+ br i1 %has_sse2, label %both, label %avx2.only
+
+both:
+ call void @avx2_and_sse2_path()
+ ret void
+
+avx2.only:
+ call void @avx2_path()
+ ret void
+
+no.avx2:
+ %has_sse2_alt = call i1 @llvm.target.has.feature(metadata !"sse2")
+ br i1 %has_sse2_alt, label %sse2.only, label %generic
+
+sse2.only:
+ call void @sse2_only_path()
+ ret void
+
+generic:
+ call void @fallback_path()
+ ret void
+}
diff --git a/llvm/test/Transforms/LowerTargetIntrinsics/isel-lowering.ll b/llvm/test/Transforms/LowerTargetIntrinsics/isel-lowering.ll
new file mode 100644
index 0000000000000..fe3a341547b58
--- /dev/null
+++ b/llvm/test/Transforms/LowerTargetIntrinsics/isel-lowering.ll
@@ -0,0 +1,28 @@
+; RUN: opt -S -passes=pre-isel-intrinsic-lowering -mtriple=x86_64-unknown-linux -mcpu=haswell < %s | FileCheck %s --check-prefix=HASWELL
+; RUN: opt -S -passes=pre-isel-intrinsic-lowering -mtriple=x86_64-unknown-linux -mcpu=x86-64 < %s | FileCheck %s --check-prefix=GENERIC
+
+; Verify that PreISelIntrinsicLowering acts as a safety net and replaces
+; any surviving target intrinsics with constants.
+
+; Run through pre-isel-intrinsic-lowering (not the dedicated pass) to verify
+; the intrinsic is still replaced by a constant there.
+define i1 @test_safety_has_feature() {
+; HASWELL-LABEL: @test_safety_has_feature(
+; HASWELL: ret i1 true
+;
+; GENERIC-LABEL: @test_safety_has_feature(
+; GENERIC: ret i1 false
+ %1 = call i1 @llvm.target.has.feature(metadata !"avx2")
+ ret i1 %1
+}
+
+; Same safety-net check for llvm.target.is.cpu under
+; pre-isel-intrinsic-lowering.
+define i1 @test_safety_is_cpu() {
+; HASWELL-LABEL: @test_safety_is_cpu(
+; HASWELL: ret i1 true
+;
+; GENERIC-LABEL: @test_safety_is_cpu(
+; GENERIC: ret i1 false
+ %1 = call i1 @llvm.target.is.cpu(metadata !"haswell")
+ ret i1 %1
+}
+
+declare i1 @llvm.target.has.feature(metadata)
+declare i1 @llvm.target.is.cpu(metadata)
diff --git a/llvm/test/Transforms/LowerTargetIntrinsics/mem2reg.ll b/llvm/test/Transforms/LowerTargetIntrinsics/mem2reg.ll
new file mode 100644
index 0000000000000..dbdb9d6c59ee6
--- /dev/null
+++ b/llvm/test/Transforms/LowerTargetIntrinsics/mem2reg.ll
@@ -0,0 +1,92 @@
+; RUN: opt -S -passes=lower-target-intrinsics -mtriple=x86_64-unknown-linux -mcpu=haswell < %s | FileCheck %s --check-prefix=HASWELL
+; RUN: opt -S -passes=lower-target-intrinsics -mtriple=x86_64-unknown-linux -mcpu=x86-64 < %s | FileCheck %s --check-prefix=GENERIC
+
+; This test simulates -O0 codegen where Clang emits alloca/store/load instead
+; of SSA values. The pass must promote these to SSA before resolution.
+
+declare void @avx2_path()
+declare void @fallback_path()
+
+; -O0-style pattern: the intrinsic result round-trips through an i1 alloca
+; before the branch, so the pass must promote the alloca to SSA before it can
+; fold the conditional.
+define void @test_o0_alloca_i1() {
+; HASWELL-LABEL: @test_o0_alloca_i1(
+; HASWELL-NOT: @fallback_path
+; HASWELL: call void @avx2_path()
+; HASWELL: ret void
+;
+; GENERIC-LABEL: @test_o0_alloca_i1(
+; GENERIC-NOT: @avx2_path
+; GENERIC: call void @fallback_path()
+; GENERIC: ret void
+entry:
+ %has_avx2.addr = alloca i1
+ %result = call i1 @llvm.target.has.feature(metadata !"avx2")
+ store i1 %result, ptr %has_avx2.addr
+ %loaded = load i1, ptr %has_avx2.addr
+ br i1 %loaded, label %avx2.bb, label %fallback.bb
+
+avx2.bb:
+ call void @avx2_path()
+ ret void
+
+fallback.bb:
+ call void @fallback_path()
+ ret void
+}
+
+; Same -O0 pattern but with the bool widened to i8 (zext before the store,
+; trunc after the load), as front ends commonly emit for bool storage.
+define void @test_o0_alloca_i8_bool() {
+; HASWELL-LABEL: @test_o0_alloca_i8_bool(
+; HASWELL-NOT: @fallback_path
+; HASWELL: call void @avx2_path()
+; HASWELL: ret void
+;
+; GENERIC-LABEL: @test_o0_alloca_i8_bool(
+; GENERIC-NOT: @avx2_path
+; GENERIC: call void @fallback_path()
+; GENERIC: ret void
+entry:
+ %has_avx2.addr = alloca i8
+ %result = call i1 @llvm.target.has.feature(metadata !"avx2")
+ %ext = zext i1 %result to i8
+ store i8 %ext, ptr %has_avx2.addr
+ %loaded = load i8, ptr %has_avx2.addr
+ %tobool = trunc i8 %loaded to i1
+ br i1 %tobool, label %avx2.bb, label %fallback.bb
+
+avx2.bb:
+ call void @avx2_path()
+ ret void
+
+fallback.bb:
+ call void @fallback_path()
+ ret void
+}
+
+; The stored result is loaded twice in different blocks; both uses must fold
+; to the same constant so the two chained branches collapse consistently.
+define void @test_o0_multi_load() {
+; HASWELL-LABEL: @test_o0_multi_load(
+; HASWELL-NOT: @fallback_path
+; HASWELL: call void @avx2_path()
+; HASWELL: ret void
+;
+; GENERIC-LABEL: @test_o0_multi_load(
+; GENERIC-NOT: @avx2_path
+; GENERIC: call void @fallback_path()
+; GENERIC: ret void
+entry:
+ %has_avx2.addr = alloca i1
+ %result = call i1 @llvm.target.has.feature(metadata !"avx2")
+ store i1 %result, ptr %has_avx2.addr
+ %loaded1 = load i1, ptr %has_avx2.addr
+ br i1 %loaded1, label %check2, label %fallback.bb
+
+check2:
+ %loaded2 = load i1, ptr %has_avx2.addr
+ br i1 %loaded2, label %avx2.bb, label %fallback.bb
+
+avx2.bb:
+ call void @avx2_path()
+ ret void
+
+fallback.bb:
+ call void @fallback_path()
+ ret void
+}
diff --git a/llvm/test/Transforms/LowerTargetIntrinsics/per-function-attrs.ll b/llvm/test/Transforms/LowerTargetIntrinsics/per-function-attrs.ll
new file mode 100644
index 0000000000000..a8a4c845f53f8
--- /dev/null
+++ b/llvm/test/Transforms/LowerTargetIntrinsics/per-function-attrs.ll
@@ -0,0 +1,37 @@
+; RUN: opt -S -passes=lower-target-intrinsics -mtriple=x86_64-unknown-linux -mcpu=x86-64 < %s | FileCheck %s
+
+; Test that per-function target-cpu/target-features attributes are respected.
+; The module default is x86-64, but individual functions override it.
+
+; This function has haswell attributes — it should see avx2.
+define i1 @func_with_haswell() #0 {
+; CHECK-LABEL: @func_with_haswell(
+; CHECK-NEXT: ret i1 true
+ %1 = call i1 @llvm.target.has.feature(metadata !"avx2")
+ ret i1 %1
+}
+
+; This function has no overrides — uses module default (x86-64).
+define i1 @func_with_default() {
+; CHECK-LABEL: @func_with_default(
+; CHECK-NEXT: ret i1 false
+ %1 = call i1 @llvm.target.has.feature(metadata !"avx2")
+ ret i1 %1
+}
+
+; is.cpu respects per-function attributes too.
+define i1 @func_is_haswell() #0 {
+; CHECK-LABEL: @func_is_haswell(
+; CHECK-NEXT: ret i1 true
+ %1 = call i1 @llvm.target.is.cpu(metadata !"haswell")
+ ret i1 %1
+}
+
+define i1 @func_is_haswell_default() {
+; CHECK-LABEL: @func_is_haswell_default(
+; CHECK-NEXT: ret i1 false
+ %1 = call i1 @llvm.target.is.cpu(metadata !"haswell")
+ ret i1 %1
+}
+
+attributes #0 = { "target-cpu"="haswell" }
diff --git a/llvm/test/Transforms/LowerTargetIntrinsics/pipeline.ll b/llvm/test/Transforms/LowerTargetIntrinsics/pipeline.ll
new file mode 100644
index 0000000000000..d2f11a4f2454a
--- /dev/null
+++ b/llvm/test/Transforms/LowerTargetIntrinsics/pipeline.ll
@@ -0,0 +1,58 @@
+; REQUIRES: x86-registered-target
+; RUN: llc -mtriple=x86_64-unknown-linux -mcpu=haswell -O0 -o - %s | FileCheck %s --check-prefix=HASWELL
+; RUN: llc -mtriple=x86_64-unknown-linux -mcpu=haswell -O2 -o - %s | FileCheck %s --check-prefix=HASWELL
+; RUN: llc -mtriple=x86_64-unknown-linux -mcpu=x86-64 -O0 -o - %s | FileCheck %s --check-prefix=GENERIC
+; RUN: llc -mtriple=x86_64-unknown-linux -mcpu=x86-64 -O2 -o - %s | FileCheck %s --check-prefix=GENERIC
+
+; Verify the intrinsics are resolved in the backend pipeline.
+
+declare void @avx2_path()
+declare void @fallback_path()
+
+; End-to-end llc run (both -O0 and -O2 per the RUN lines): the intrinsic must
+; be resolved before instruction selection, so only one call reaches the
+; emitted assembly.
+define void @test_pipeline_o0() {
+; HASWELL-LABEL: test_pipeline_o0:
+; HASWELL: callq avx2_path
+; HASWELL-NOT: callq fallback_path
+;
+; GENERIC-LABEL: test_pipeline_o0:
+; GENERIC: callq fallback_path
+; GENERIC-NOT: callq avx2_path
+entry:
+ %has_avx2 = call i1 @llvm.target.has.feature(metadata !"avx2")
+ br i1 %has_avx2, label %avx2.bb, label %fallback.bb
+
+avx2.bb:
+ call void @avx2_path()
+ ret void
+
+fallback.bb:
+ call void @fallback_path()
+ ret void
+}
+
+; O0 with alloca pattern
+; End-to-end llc run of the -O0 alloca/store/load pattern: the backend
+; pipeline must promote the slot and fold the branch before selection.
+define void @test_pipeline_o0_alloca() {
+; HASWELL-LABEL: test_pipeline_o0_alloca:
+; HASWELL: callq avx2_path
+; HASWELL-NOT: callq fallback_path
+;
+; GENERIC-LABEL: test_pipeline_o0_alloca:
+; GENERIC: callq fallback_path
+; GENERIC-NOT: callq avx2_path
+entry:
+ %has_avx2.addr = alloca i1
+ %result = call i1 @llvm.target.has.feature(metadata !"avx2")
+ store i1 %result, ptr %has_avx2.addr
+ %loaded = load i1, ptr %has_avx2.addr
+ br i1 %loaded, label %avx2.bb, label %fallback.bb
+
+avx2.bb:
+ call void @avx2_path()
+ ret void
+
+fallback.bb:
+ call void @fallback_path()
+ ret void
+}
+
+declare i1 @llvm.target.has.feature(metadata)
More information about the cfe-commits
mailing list