[clang] [llvm] [AMDGPU][clang][CodeGen][opt] Add late-resolved feature identifying predicates (PR #134016)
Alex Voicu via llvm-commits
llvm-commits at lists.llvm.org
Wed May 7 16:53:42 PDT 2025
https://github.com/AlexVlx updated https://github.com/llvm/llvm-project/pull/134016
>From 91eeaf02336e539f14dcb0a79ff15dbe8befe6f1 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 02:47:42 +0100
Subject: [PATCH 01/17] Add the functional identity and feature queries.
---
clang/docs/LanguageExtensions.rst | 110 ++++++
clang/include/clang/Basic/BuiltinsAMDGPU.def | 5 +
.../clang/Basic/DiagnosticSemaKinds.td | 10 +
clang/lib/Basic/Targets/SPIR.cpp | 4 +
clang/lib/Basic/Targets/SPIR.h | 4 +
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 29 ++
clang/lib/Sema/SemaExpr.cpp | 157 ++++++++
clang/test/CodeGen/amdgpu-builtin-cpu-is.c | 65 ++++
.../CodeGen/amdgpu-builtin-is-invocable.c | 64 ++++
.../amdgpu-feature-builtins-invalid-use.cpp | 43 +++
llvm/lib/Target/AMDGPU/AMDGPU.h | 9 +
 .../AMDGPU/AMDGPUExpandFeaturePredicates.cpp  | 207 ++++++++++
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 +-
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
...pu-expand-feature-predicates-unfoldable.ll | 28 ++
.../amdgpu-expand-feature-predicates.ll | 359 ++++++++++++++++++
17 files changed, 1099 insertions(+), 1 deletion(-)
create mode 100644 clang/test/CodeGen/amdgpu-builtin-cpu-is.c
create mode 100644 clang/test/CodeGen/amdgpu-builtin-is-invocable.c
create mode 100644 clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 3b8a9cac6587a..8a7cb75af13e5 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -4920,6 +4920,116 @@ If no address spaces names are provided, all address spaces are fenced.
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local")
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local", "global")
+__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``__builtin_amdgcn_processor_is`` and ``__builtin_amdgcn_is_invocable`` provide
+a functional mechanism for programmatically querying:
+
+* the identity of the current target processor;
+* the capability of the current target processor to invoke a particular builtin.
+
+**Syntax**:
+
+.. code-block:: c
+
+ // When used as the predicate for a control structure
+ bool __builtin_amdgcn_processor_is(const char*);
+ bool __builtin_amdgcn_is_invocable(builtin_name);
+ // Otherwise
+ void __builtin_amdgcn_processor_is(const char*);
+ void __builtin_amdgcn_is_invocable(void);
+
+**Example of use**:
+
+.. code-block:: c++
+
+ if (__builtin_amdgcn_processor_is("gfx1201") ||
+ __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var))
+ __builtin_amdgcn_s_sleep_var(x);
+
+ if (!__builtin_amdgcn_processor_is("gfx906"))
+ __builtin_amdgcn_s_wait_event_export_ready();
+ else if (__builtin_amdgcn_processor_is("gfx1010") ||
+ __builtin_amdgcn_processor_is("gfx1101"))
+ __builtin_amdgcn_s_ttracedata_imm(1);
+
+ while (__builtin_amdgcn_processor_is("gfx1101")) *p += x;
+
+ do { *p -= x; } while (__builtin_amdgcn_processor_is("gfx1010"));
+
+ for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break;
+
+ if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready))
+ __builtin_amdgcn_s_wait_event_export_ready();
+ else if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_ttracedata_imm))
+ __builtin_amdgcn_s_ttracedata_imm(1);
+
+ do {
+ *p -= x;
+ } while (__builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32));
+
+ for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p) break;
+
+**Description**:
+
+When used as the predicate value of the following control structures:
+
+.. code-block:: c++
+
+ if (...)
+ while (...)
+ do { } while (...)
+ for (...)
+
+be it directly, or as an operand of logical operators such as ``!``, ``||`` and
+``&&``, the builtins return a boolean value that:
+
+* indicates whether the current target matches the argument; the argument MUST
+  be a string literal naming a valid AMDGPU target
+* indicates whether the builtin function passed as the argument can be invoked
+  by the current target; the argument MUST be either a generic or an AMDGPU
+  specific builtin name
+
+Outside of these contexts, the builtins have a ``void`` return type, which
+prevents their misuse.
+
+**Example of invalid use**:
+
+.. code-block:: c++
+
+ void kernel(int* p, int x, bool (*pfn)(bool), const char* str) {
+ if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return;
+ else if (__builtin_amdgcn_processor_is(str)) __builtin_trap();
+
+ bool a = __builtin_amdgcn_processor_is("gfx906");
+ const bool b = !__builtin_amdgcn_processor_is("gfx906");
+ const bool c = !__builtin_amdgcn_processor_is("gfx906");
+ bool d = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+ bool e = !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+ const auto f =
+ !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready)
+ || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+ const auto g =
+ !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready)
+ || !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+ __builtin_amdgcn_processor_is("gfx1201")
+ ? __builtin_amdgcn_s_sleep_var(x) : __builtin_amdgcn_s_sleep(42);
+ if (pfn(__builtin_amdgcn_processor_is("gfx1200")))
+ __builtin_amdgcn_s_sleep_var(x);
+
+ if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return;
+ else if (__builtin_amdgcn_is_invocable(x)) __builtin_trap();
+ }
+
+When invoked while compiling for a concrete target, the builtins are evaluated
+early by Clang, and have no observable side-effects in the generated IR.
+Conversely, when compiling for AMDGCN flavoured SPIR-V, which is an abstract
+target, a set of predicate values is implicitly created. These predicates get
+resolved when finalizing the compilation process for a concrete target, and
+reflect the latter's identity and features. Thus, it is possible to author
+high-level code, in e.g. HIP, that is target-adaptive in a dynamic fashion,
+unlike macro-based mechanisms.
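+
+For illustration, when compiling for AMDGCN flavoured SPIR-V the
+``__builtin_amdgcn_processor_is("gfx1201")`` check from the example above
+lowers to IR of roughly the following shape (a simplified sketch; value and
+block names are illustrative):
+
+.. code-block:: llvm
+
+  @llvm.amdgcn.is.gfx1201 = external addrspace(1) externally_initialized constant i1
+
+  ; ...
+  %cond = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1201, align 1
+  br i1 %cond, label %if.then, label %if.end
+
+When the compilation is finalised for a concrete target, the
+``amdgpu-expand-feature-predicates`` pass initialises each such predicate with
+the target's answer and constant folds its users.
+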
ARM/AArch64 Language Extensions
-------------------------------
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 44ef404aee72f..5d01a7e75f7e7 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -346,6 +346,11 @@ BUILTIN(__builtin_amdgcn_endpgm, "v", "nr")
BUILTIN(__builtin_amdgcn_get_fpenv, "WUi", "n")
BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n")
+// These are special frontend-only builtins intended for forwarding the
+// requirements to the middle end.
+BUILTIN(__builtin_amdgcn_processor_is, "vcC*", "nctu")
+BUILTIN(__builtin_amdgcn_is_invocable, "v", "nctu")
+
//===----------------------------------------------------------------------===//
// R600-NI only builtins.
//===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 5e45482584946..45f0f9eb88e55 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13054,4 +13054,14 @@ def err_acc_decl_for_routine
// AMDGCN builtins diagnostics
def err_amdgcn_global_load_lds_size_invalid_value : Error<"invalid size value">;
def note_amdgcn_global_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">;
+def err_amdgcn_processor_is_arg_not_literal
+ : Error<"the argument to __builtin_amdgcn_processor_is must be a string "
+ "literal">;
+def err_amdgcn_processor_is_arg_invalid_value
+ : Error<"the argument to __builtin_amdgcn_processor_is must be a valid "
+ "AMDGCN processor identifier; '%0' is not valid">;
+def err_amdgcn_is_invocable_arg_invalid_value
+ : Error<"the argument to __builtin_amdgcn_is_invocable must be either a "
+ "target agnostic builtin or an AMDGCN target specific builtin; `%0`"
+ " is not valid">;
} // end of sema component.
diff --git a/clang/lib/Basic/Targets/SPIR.cpp b/clang/lib/Basic/Targets/SPIR.cpp
index 5b5f47f9647a2..eb43d9b0be283 100644
--- a/clang/lib/Basic/Targets/SPIR.cpp
+++ b/clang/lib/Basic/Targets/SPIR.cpp
@@ -152,3 +152,7 @@ void SPIRV64AMDGCNTargetInfo::setAuxTarget(const TargetInfo *Aux) {
Float128Format = DoubleFormat;
}
}
+
+bool SPIRV64AMDGCNTargetInfo::isValidCPUName(StringRef CPU) const {
+ return AMDGPUTI.isValidCPUName(CPU);
+}
diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h
index 78505d66d6f2f..7aa13cbeb89fd 100644
--- a/clang/lib/Basic/Targets/SPIR.h
+++ b/clang/lib/Basic/Targets/SPIR.h
@@ -432,6 +432,10 @@ class LLVM_LIBRARY_VISIBILITY SPIRV64AMDGCNTargetInfo final
}
bool hasInt128Type() const override { return TargetInfo::hasInt128Type(); }
+
+ // This is only needed for validating arguments passed to
+ // __builtin_amdgcn_processor_is
+ bool isValidCPUName(StringRef Name) const override;
};
} // namespace targets
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index b56b739094ff3..7b1a3815144b4 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -284,6 +284,18 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
}
+static Value *GetOrInsertAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) {
+ auto PTy = IntegerType::getInt1Ty(CGF.getLLVMContext());
+
+ auto P = cast<GlobalVariable>(
+ CGF.CGM.getModule().getOrInsertGlobal(Name.str(), PTy));
+ P->setConstant(true);
+ P->setExternallyInitialized(true);
+
+ return CGF.Builder.CreateLoad(RawAddress(P, PTy, CharUnits::One(),
+ KnownNonNull));
+}
+
Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
const CallExpr *E) {
llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
@@ -585,6 +597,23 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Value *Env = EmitScalarExpr(E->getArg(0));
return Builder.CreateCall(F, {Env});
}
+ case AMDGPU::BI__builtin_amdgcn_processor_is: {
+ assert(CGM.getTriple().isSPIRV() &&
+ "__builtin_amdgcn_processor_is should never reach CodeGen for "
+ "concrete targets!");
+ StringRef Proc = cast<clang::StringLiteral>(E->getArg(0))->getString();
+ return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.is." + Proc);
+ }
+ case AMDGPU::BI__builtin_amdgcn_is_invocable: {
+ assert(CGM.getTriple().isSPIRV() &&
+ "__builtin_amdgcn_is_invocable should never reach CodeGen for "
+ "concrete targets!");
+ auto FD = cast<FunctionDecl>(
+ cast<DeclRefExpr>(E->getArg(0))->getReferencedDeclOfCallee());
+ StringRef RF =
+ getContext().BuiltinInfo.getRequiredFeatures(FD->getBuiltinID());
+ return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.has." + RF);
+ }
case AMDGPU::BI__builtin_amdgcn_read_exec:
return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 7cc8374e69d73..24f5262ab3cf4 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -6541,6 +6541,22 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc,
if (Result.isInvalid()) return ExprError();
Fn = Result.get();
+ // The __builtin_amdgcn_is_invocable builtin is special and will be resolved
+ // later, when we check boolean conditions; for now we merely forward it
+ // without any additional checking.
+ if (Fn->getType() == Context.BuiltinFnTy && ArgExprs.size() == 1 &&
+ ArgExprs[0]->getType() == Context.BuiltinFnTy) {
+ auto FD = cast<FunctionDecl>(Fn->getReferencedDeclOfCallee());
+
+ if (FD->getName() == "__builtin_amdgcn_is_invocable") {
+ auto FnPtrTy = Context.getPointerType(FD->getType());
+ auto R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get();
+ return CallExpr::Create(Context, R, ArgExprs, Context.VoidTy,
+ ExprValueKind::VK_PRValue, RParenLoc,
+ FPOptionsOverride());
+ }
+ }
+
if (CheckArgsForPlaceholders(ArgExprs))
return ExprError();
@@ -13234,6 +13250,20 @@ inline QualType Sema::CheckBitwiseOperands(ExprResult &LHS, ExprResult &RHS,
return InvalidOperands(Loc, LHS, RHS);
}
+static inline bool IsAMDGPUPredicateBI(Expr *E) {
+ if (!E || !E->getType()->isVoidType())
+ return false;
+
+ if (auto CE = dyn_cast<CallExpr>(E)) {
+ if (auto BI = CE->getDirectCallee())
+ if (BI->getName() == "__builtin_amdgcn_processor_is" ||
+ BI->getName() == "__builtin_amdgcn_is_invocable")
+ return true;
+ }
+
+ return false;
+}
+
// C99 6.5.[13,14]
inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS,
SourceLocation Loc,
@@ -13329,6 +13359,9 @@ inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS,
// The following is safe because we only use this method for
// non-overloadable operands.
+ if (IsAMDGPUPredicateBI(LHS.get()) && IsAMDGPUPredicateBI(RHS.get()))
+ return Context.VoidTy;
+
// C++ [expr.log.and]p1
// C++ [expr.log.or]p1
// The operands are both contextually converted to type bool.
@@ -15576,6 +15609,38 @@ static bool isOverflowingIntegerType(ASTContext &Ctx, QualType T) {
return Ctx.getIntWidth(T) >= Ctx.getIntWidth(Ctx.IntTy);
}
+static Expr *ExpandAMDGPUPredicateBI(ASTContext &Ctx, CallExpr *CE) {
+ if (!CE->getBuiltinCallee())
+ return CXXBoolLiteralExpr::Create(Ctx, false, Ctx.BoolTy, CE->getExprLoc());
+
+ if (Ctx.getTargetInfo().getTriple().isSPIRV()) {
+ CE->setType(Ctx.getLogicalOperationType());
+ return CE;
+ }
+
+ bool P = false;
+ auto &TI = Ctx.getTargetInfo();
+
+ if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") {
+ auto GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts());
+ auto TID = TI.getTargetID();
+ if (GFX && TID) {
+ auto N = GFX->getString();
+ P = TI.isValidCPUName(GFX->getString()) && TID->find(N) == 0;
+ }
+ } else {
+ auto FD = cast<FunctionDecl>(CE->getArg(0)->getReferencedDeclOfCallee());
+
+ StringRef RF = Ctx.BuiltinInfo.getRequiredFeatures(FD->getBuiltinID());
+ llvm::StringMap<bool> CF;
+ Ctx.getFunctionFeatureMap(CF, FD);
+
+ P = Builtin::evaluateRequiredTargetFeatures(RF, CF);
+ }
+
+ return CXXBoolLiteralExpr::Create(Ctx, P, Ctx.BoolTy, CE->getExprLoc());
+}
+
ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc,
UnaryOperatorKind Opc, Expr *InputExpr,
bool IsAfterAmp) {
@@ -15753,6 +15818,8 @@ ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc,
// Vector logical not returns the signed variant of the operand type.
resultType = GetSignedVectorType(resultType);
break;
+ } else if (IsAMDGPUPredicateBI(InputExpr)) {
+ break;
} else {
return ExprError(Diag(OpLoc, diag::err_typecheck_unary_expr)
<< resultType << Input.get()->getSourceRange());
@@ -20469,6 +20536,88 @@ void Sema::DiagnoseEqualityWithExtraParens(ParenExpr *ParenE) {
}
}
+static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) {
+ if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") {
+ auto GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts());
+ if (!GFX) {
+ Sema.Diag(CE->getExprLoc(),
+ diag::err_amdgcn_processor_is_arg_not_literal);
+ return false;
+ }
+ auto N = GFX->getString();
+ if (!Sema.getASTContext().getTargetInfo().isValidCPUName(N) &&
+ (!Sema.getASTContext().getAuxTargetInfo() ||
+ !Sema.getASTContext().getAuxTargetInfo()->isValidCPUName(N))) {
+ Sema.Diag(CE->getExprLoc(),
+ diag::err_amdgcn_processor_is_arg_invalid_value) << N;
+ return false;
+ }
+ } else {
+ auto Arg = CE->getArg(0);
+ if (!Arg || Arg->getType() != Sema.getASTContext().BuiltinFnTy) {
+ Sema.Diag(CE->getExprLoc(),
+ diag::err_amdgcn_is_invocable_arg_invalid_value) << Arg;
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) {
+ if (auto UO = dyn_cast<UnaryOperator>(E)) {
+ auto SE = dyn_cast<CallExpr>(UO->getSubExpr());
+ if (IsAMDGPUPredicateBI(SE)) {
+ assert(
+ UO->getOpcode() == UnaryOperator::Opcode::UO_LNot &&
+ "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
+ "can only be used as operands of logical ops!");
+
+ if (!ValidateAMDGPUPredicateBI(Sema, SE)) {
+ Invalid = true;
+ return nullptr;
+ }
+
+ UO->setSubExpr(ExpandAMDGPUPredicateBI(Sema.getASTContext(), SE));
+ UO->setType(Sema.getASTContext().getLogicalOperationType());
+
+ return UO;
+ }
+ }
+ if (auto BO = dyn_cast<BinaryOperator>(E)) {
+ auto LHS = dyn_cast<CallExpr>(BO->getLHS());
+ auto RHS = dyn_cast<CallExpr>(BO->getRHS());
+ if (IsAMDGPUPredicateBI(LHS) && IsAMDGPUPredicateBI(RHS)) {
+ assert(
+ BO->isLogicalOp() &&
+ "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
+ "can only be used as operands of logical ops!");
+
+ if (!ValidateAMDGPUPredicateBI(Sema, LHS) ||
+ !ValidateAMDGPUPredicateBI(Sema, RHS)) {
+ Invalid = true;
+ return nullptr;
+ }
+
+ BO->setLHS(ExpandAMDGPUPredicateBI(Sema.getASTContext(), LHS));
+ BO->setRHS(ExpandAMDGPUPredicateBI(Sema.getASTContext(), RHS));
+ BO->setType(Sema.getASTContext().getLogicalOperationType());
+
+ return BO;
+ }
+ }
+ if (auto CE = dyn_cast<CallExpr>(E))
+ if (IsAMDGPUPredicateBI(CE)) {
+ if (!ValidateAMDGPUPredicateBI(Sema, CE)) {
+ Invalid = true;
+ return nullptr;
+ }
+ return ExpandAMDGPUPredicateBI(Sema.getASTContext(), CE);
+ }
+
+ return nullptr;
+}
+
ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E,
bool IsConstexpr) {
DiagnoseAssignmentAsCondition(E);
@@ -20480,6 +20629,14 @@ ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E,
E = result.get();
if (!E->isTypeDependent()) {
+ if (E->getType()->isVoidType()) {
+ bool IsInvalidPredicate = false;
+ if (auto BIC = MaybeHandleAMDGPUPredicateBI(*this, E, IsInvalidPredicate))
+ return BIC;
+ else if (IsInvalidPredicate)
+ return ExprError();
+ }
+
if (getLangOpts().CPlusPlus)
return CheckCXXBooleanCondition(E, IsConstexpr); // C++ 6.4p4
diff --git a/clang/test/CodeGen/amdgpu-builtin-cpu-is.c b/clang/test/CodeGen/amdgpu-builtin-cpu-is.c
new file mode 100644
index 0000000000000..6e261d9f5d239
--- /dev/null
+++ b/clang/test/CodeGen/amdgpu-builtin-cpu-is.c
@@ -0,0 +1,65 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX900 %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1010 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX1010 %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCNSPIRV %s
+
+// Test that, depending on triple and, if applicable, target-cpu, one of three
+// things happens:
+// 1) for gfx900 we emit a call to trap (concrete target, matches)
+// 2) for gfx1010 we emit an empty kernel (concrete target, does not match)
+// 3) for AMDGCNSPIRV we emit llvm.amdgcn.is.gfx900 as a bool global, and
+// load from it to provide the condition to a br (abstract target)
+//.
+// AMDGCN-GFX900: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600
+//.
+// AMDGCN-GFX1010: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600
+//.
+// AMDGCNSPIRV: @llvm.amdgcn.is.gfx900 = external addrspace(1) externally_initialized constant i1
+//.
+// AMDGCN-GFX900-LABEL: define dso_local void @foo(
+// AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGCN-GFX900-NEXT: [[ENTRY:.*:]]
+// AMDGCN-GFX900-NEXT: call void @llvm.trap()
+// AMDGCN-GFX900-NEXT: ret void
+//
+// AMDGCN-GFX1010-LABEL: define dso_local void @foo(
+// AMDGCN-GFX1010-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGCN-GFX1010-NEXT: [[ENTRY:.*:]]
+// AMDGCN-GFX1010-NEXT: ret void
+//
+// AMDGCNSPIRV-LABEL: define spir_func void @foo(
+// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] {
+// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]]
+// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx900, align 1
+// AMDGCNSPIRV-NEXT: br i1 [[TMP0]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// AMDGCNSPIRV: [[IF_THEN]]:
+// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.trap()
+// AMDGCNSPIRV-NEXT: br label %[[IF_END]]
+// AMDGCNSPIRV: [[IF_END]]:
+// AMDGCNSPIRV-NEXT: ret void
+//
+void foo() {
+ if (__builtin_cpu_is("gfx900"))
+ return __builtin_trap();
+}
+//.
+// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
+// AMDGCN-GFX900: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
+//.
+// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
+//.
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" }
+// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
+//.
+// AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCN-GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// AMDGCN-GFX900: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
+// AMDGCN-GFX1010: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCN-GFX1010: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// AMDGCN-GFX1010: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
+// AMDGCNSPIRV: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCNSPIRV: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// AMDGCNSPIRV: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
diff --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
new file mode 100644
index 0000000000000..6d2690cb75b7c
--- /dev/null
+++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
@@ -0,0 +1,64 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX900 %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1010 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX1010 %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCNSPIRV %s
+
+// Test that, depending on triple and, if applicable, target-cpu, one of three
+// things happens:
+// 1) for gfx900 we emit an empty kernel (concrete target, lacks feature)
+// 2) for gfx1010 we emit a call to trap (concrete target, has feature)
+// 3) for AMDGCNSPIRV we emit llvm.amdgcn.has.gfx10-insts as a constant
+// externally initialised bool global, and load from it to provide the
+// condition to a br (abstract target)
+
+//.
+// AMDGCNSPIRV: @llvm.amdgcn.has.gfx10-insts = external addrspace(1) externally_initialized constant i1
+//.
+// AMDGCN-GFX900-LABEL: define dso_local void @foo(
+// AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGCN-GFX900-NEXT: [[ENTRY:.*:]]
+// AMDGCN-GFX900-NEXT: ret void
+//
+// AMDGCN-GFX1010-LABEL: define dso_local void @foo(
+// AMDGCN-GFX1010-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGCN-GFX1010-NEXT: [[ENTRY:.*:]]
+// AMDGCN-GFX1010-NEXT: call void @llvm.trap()
+// AMDGCN-GFX1010-NEXT: ret void
+//
+// AMDGCNSPIRV-LABEL: define spir_func void @foo(
+// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] {
+// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]]
+// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx10-insts, align 1
+// AMDGCNSPIRV-NEXT: [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false
+// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// AMDGCNSPIRV: [[IF_THEN]]:
+// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.trap()
+// AMDGCNSPIRV-NEXT: br label %[[IF_END]]
+// AMDGCNSPIRV: [[IF_END]]:
+// AMDGCNSPIRV-NEXT: ret void
+//
+void foo() {
+ if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16))
+ return __builtin_trap();
+}
+//.
+// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
+//.
+// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
+// AMDGCN-GFX1010: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
+//.
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" }
+// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
+//.
+// AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCN-GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// AMDGCN-GFX900: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
+// AMDGCN-GFX1010: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCN-GFX1010: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// AMDGCN-GFX1010: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
+// AMDGCNSPIRV: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCNSPIRV: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// AMDGCNSPIRV: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
diff --git a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
new file mode 100644
index 0000000000000..f618f54909b00
--- /dev/null
+++ b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
@@ -0,0 +1,43 @@
+// RUN: not %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - 2>&1 | FileCheck %s
+// RUN: not %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - 2>&1 | FileCheck %s
+
+bool predicate(bool x) { return x; }
+
+void invalid_uses(int* p, int x, bool (*pfn)(bool)) {
+ // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void'
+ bool invalid_use_in_init_0 = __builtin_amdgcn_processor_is("gfx906");
+ // CHECK: error: cannot initialize a variable of type 'const bool' with an rvalue of type 'void'
+ const bool invalid_use_in_init_1 = !__builtin_amdgcn_processor_is("gfx906");
+ // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void'
+ bool invalid_use_in_init_2 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+ // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void'
+ bool invalid_use_in_init_3 = !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+ // CHECK: error: variable has incomplete type 'const void'
+ const auto invalid_use_in_init_4 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready) || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+ // CHECK: error: variable has incomplete type 'const void'
+ const auto invalid_use_in_init_5 = __builtin_amdgcn_processor_is("gfx906") || __builtin_amdgcn_processor_is("gfx900");
+ // CHECK: error: variable has incomplete type 'const void'
+ const auto invalid_use_in_init_6 = __builtin_amdgcn_processor_is("gfx906") || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep);
+ // CHECK: error: value of type 'void' is not contextually convertible to 'bool'
+ __builtin_amdgcn_processor_is("gfx1201")
+ ? __builtin_amdgcn_s_sleep_var(x) : __builtin_amdgcn_s_sleep(42);
+ // CHECK: error: no matching function for call to 'predicate'
+ if (predicate(__builtin_amdgcn_processor_is("gfx1200"))) __builtin_amdgcn_s_sleep_var(x);
+ // CHECK: note: candidate function not viable: cannot convert argument of incomplete type 'void' to 'bool' for 1st argument
+}
+
+void invalid_invocations(int x, const char* str) {
+ // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a valid AMDGCN processor identifier; 'not_an_amdgcn_gfx_id' is not valid
+ if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return;
+ // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a string literal
+ if (__builtin_amdgcn_processor_is(str)) return;
+
+ // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `"__builtin_amdgcn_s_sleep_var"` is not valid
+ if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return;
+ // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `str` is not valid
+ else if (__builtin_amdgcn_is_invocable(str)) return;
+ // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `x` is not valid
+ else if (__builtin_amdgcn_is_invocable(x)) return;
+ // CHECK: error: use of undeclared identifier '__builtin_ia32_pause'
+ else if (__builtin_amdgcn_is_invocable(__builtin_ia32_pause)) return;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index a8e4ea9429f50..1fe0016723a30 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -408,6 +408,15 @@ extern char &AMDGPUPrintfRuntimeBindingID;
void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &);
extern char &AMDGPUResourceUsageAnalysisID;
+struct AMDGPUExpandFeaturePredicatesPass
+ : PassInfoMixin<AMDGPUExpandFeaturePredicatesPass> {
+ const AMDGPUTargetMachine &TM;
+ AMDGPUExpandFeaturePredicatesPass(const AMDGPUTargetMachine &ATM) : TM(ATM) {}
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+ static bool isRequired() { return true; }
+};
+
struct AMDGPUPrintfRuntimeBindingPass
: PassInfoMixin<AMDGPUPrintfRuntimeBindingPass> {
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
new file mode 100644
index 0000000000000..125051c6aa0cf
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -0,0 +1,207 @@
+//===- AMDGPUExpandFeaturePredicates.cpp - Pseudo Intrinsic Expander Pass -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file implements a pass that expands AMDGCN generic pseudo-intrinsics
+// into target specific quantities / sequences. In this context, a
+// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a
+// specific instruction, but rather is intended as a mechanism for abstractly
+// conveying target specific info to an HLL / the FE, without concretely
+// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize.
+// This pass should run as early as possible, immediately after Clang CodeGen,
+// so that the optimisation pipeline and the BE operate with concrete target
+// data.
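+//
+// As an illustrative example (using the naming scheme employed by the
+// __builtin_amdgcn_processor_is / __builtin_amdgcn_is_invocable builtins):
+// when compiling for gfx906, the predicate global @llvm.amdgcn.is.gfx906 gets
+// initialised to true and @llvm.amdgcn.is.gfx1010 to false, after which loads
+// from them, and the users of those loads, are constant folded; predicates
+// that cannot be folded away completely are diagnosed as errors.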
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#include <string>
+#include <utility>
+
+using namespace llvm;
+
+namespace {
+inline Function *getCloneForInlining(Function *OldF) {
+ assert(OldF && "Must pass an existing Function!");
+
+ // TODO - Alias Value to clone arg.
+ ValueToValueMapTy VMap;
+
+ auto NewF = CloneFunction(OldF, VMap);
+
+ NewF->removeFnAttr(Attribute::OptimizeNone);
+ NewF->removeFnAttr(Attribute::NoInline);
+ NewF->addFnAttr(Attribute::AlwaysInline);
+
+ return NewF;
+}
+
+template <typename C>
+inline void collectUsers(Value *V, ModulePassManager &AlwaysInliner,
+ ModuleAnalysisManager &MAM,
+ SmallDenseMap<Function *, Function *> &InlinableClones,
+ C &Container) {
+ assert(V && "Must pass an existing Value!");
+
+ auto A = PreservedAnalyses::all();
+
+ constexpr auto IsValidCall = [](auto &&U) {
+ if (auto CB = dyn_cast<CallBase>(U))
+ if (auto F = CB->getCalledFunction())
+ if (!F->isIntrinsic() && !F->isDeclaration())
+ return true;
+ return false;
+ };
+
+ SmallVector<User *> Calls{};
+ copy_if(V->users(), std::back_inserter(Calls), IsValidCall);
+
+ while (!Calls.empty()) {
+ for (auto &&Call : Calls) {
+ auto CB = cast<CallBase>(Call);
+ auto &TempF = InlinableClones[CB->getCalledFunction()];
+
+ if (!TempF)
+ TempF = getCloneForInlining(CB->getCalledFunction());
+
+ CB->setCalledFunction(TempF);
+ CB->removeFnAttr(Attribute::NoInline);
+ CB->addFnAttr(Attribute::AlwaysInline);
+
+ AlwaysInliner.run(*TempF->getParent(), MAM);
+ }
+
+ Calls.clear();
+
+ copy_if(V->users(), std::back_inserter(Calls), IsValidCall);
+ }
+
+ for (auto &&U : V->users())
+ if (auto I = dyn_cast<Instruction>(U)) {
+ if (auto CB = dyn_cast<CallBase>(I)) {
+ if (CB->getCalledFunction() && !CB->getCalledFunction()->isIntrinsic())
+ Container.insert(Container.end(), I);
+ } else {
+ Container.insert(Container.end(), I);
+ }
+ }
+}
+
+std::pair<PreservedAnalyses, bool>
+handlePredicate(const GCNSubtarget &ST, ModuleAnalysisManager &MAM,
+ SmallDenseMap<Function *, Function *>& InlinableClones,
+ GlobalVariable *P) {
+ auto PV = P->getName().substr(P->getName().rfind('.') + 1).str();
+ auto Dx = PV.find(',');
+ while (Dx != std::string::npos) {
+ PV.insert(++Dx, {'+'});
+
+ Dx = PV.find(',', Dx);
+ }
+
+ auto PTy = P->getValueType();
+ P->setLinkage(GlobalValue::PrivateLinkage);
+ P->setExternallyInitialized(false);
+
+ if (P->getName().starts_with("llvm.amdgcn.is"))
+ P->setInitializer(ConstantInt::getBool(PTy, PV == ST.getCPU()));
+ else
+ P->setInitializer(ConstantInt::getBool(PTy, ST.checkFeatures('+' + PV)));
+
+ ModulePassManager MPM;
+ MPM.addPass(AlwaysInlinerPass());
+
+ SmallPtrSet<Instruction *, 32> ToFold;
+ collectUsers(P, MPM, MAM, InlinableClones, ToFold);
+
+ if (ToFold.empty())
+ return {PreservedAnalyses::all(), true};
+
+ do {
+ auto I = *ToFold.begin();
+ ToFold.erase(I);
+
+ if (auto C = ConstantFoldInstruction(I, P->getDataLayout())) {
+ collectUsers(I, MPM, MAM, InlinableClones, ToFold);
+ I->replaceAllUsesWith(C);
+ I->eraseFromParent();
+ continue;
+ } else if (I->isTerminator() && ConstantFoldTerminator(I->getParent())) {
+ continue;
+ } else if (I->users().empty()) {
+ continue;
+ }
+
+ std::string W;
+ raw_string_ostream OS(W);
+
+ auto Caller = I->getParent()->getParent();
+
+ OS << "Impossible to constant fold feature predicate: " << P->getName()
+ << ", please simplify.\n";
+
+ Caller->getContext().diagnose(
+ DiagnosticInfoUnsupported(*Caller, W, I->getDebugLoc(), DS_Error));
+
+ return {PreservedAnalyses::none(), false};
+ } while (!ToFold.empty());
+
+ return {PreservedAnalyses::none(), true};
+}
+} // Unnamed namespace.
+
+PreservedAnalyses
+AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) {
+ if (M.empty())
+ return PreservedAnalyses::all();
+
+ SmallVector<GlobalVariable *> Predicates;
+ for (auto &&G : M.globals()) {
+ if (!G.isDeclaration() || !G.hasName())
+ continue;
+ if (G.getName().starts_with("llvm.amdgcn."))
+ Predicates.push_back(&G);
+ }
+
+ if (Predicates.empty())
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses Ret = PreservedAnalyses::all();
+
+ SmallDenseMap<Function *, Function *> InlinableClones;
+ const auto &ST = TM.getSubtarget<GCNSubtarget>(
+ *find_if(M, [](auto &&F) { return !F.isIntrinsic(); }));
+
+ for (auto &&P : Predicates) {
+ auto R = handlePredicate(ST, MAM, InlinableClones, P);
+
+ if (!R.second)
+ return PreservedAnalyses::none();
+
+ Ret.intersect(R.first);
+ }
+
+ for (auto &&C : InlinableClones)
+ C.second->eraseFromParent();
+
+ return Ret;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 6a45392b5f099..c3c9e24c2efa4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -29,6 +29,8 @@ MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
+MODULE_PASS("amdgpu-expand-feature-predicates",
+ AMDGPUExpandFeaturePredicatesPass(*this))
#undef MODULE_PASS
#ifndef MODULE_PASS_WITH_PARAMS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 4937b434bc955..8e8a6e1eda437 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -803,7 +803,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
#include "llvm/Passes/TargetPassRegistry.inc"
PB.registerPipelineStartEPCallback(
- [](ModulePassManager &PM, OptimizationLevel Level) {
+ [this](ModulePassManager &PM, OptimizationLevel Level) {
+ PM.addPass(AMDGPUExpandFeaturePredicatesPass(*this));
if (EnableHipStdPar)
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
});
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 09a3096602fc3..a389200f0db8e 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -54,6 +54,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUCodeGenPrepare.cpp
AMDGPUCombinerHelper.cpp
AMDGPUCtorDtorLowering.cpp
+ AMDGPUExpandFeaturePredicates.cpp
AMDGPUExportClustering.cpp
AMDGPUExportKernelRuntimeHandles.cpp
AMDGPUFrameLowering.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll
new file mode 100644
index 0000000000000..bfc35d8c76e37
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll
@@ -0,0 +1,28 @@
+; REQUIRES: amdgpu-registered-target
+
+; RUN: not opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' < %s 2>&1 | FileCheck %s
+
+; CHECK: error:{{.*}}in function kernel void (ptr addrspace(1), i32, ptr addrspace(1)): Impossible to constant fold feature predicate: @llvm.amdgcn.is.gfx803 = private addrspace(1) constant i1 false used by %call = call i1 %1(i1 zeroext false), please simplify.
+
+ at llvm.amdgcn.is.gfx803 = external addrspace(1) externally_initialized constant i1
+
+declare void @llvm.amdgcn.s.sleep(i32 immarg) #1
+
+define amdgpu_kernel void @kernel(ptr addrspace(1) readnone captures(none) %p.coerce, i32 %x, ptr addrspace(1) %pfn.coerce) {
+entry:
+ %0 = ptrtoint ptr addrspace(1) %pfn.coerce to i64
+ %1 = inttoptr i64 %0 to ptr
+ %2 = ptrtoint ptr addrspace(1) %pfn.coerce to i64
+ %3 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx803, align 1
+ %call = call i1 %1(i1 zeroext %3)
+ br i1 %call, label %if.gfx803, label %if.end
+
+if.gfx803:
+ call void @llvm.amdgcn.s.sleep(i32 0)
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+attributes #1 = { nocallback nofree nosync nounwind willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll
new file mode 100644
index 0000000000000..277323c353260
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll
@@ -0,0 +1,359 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; REQUIRES: amdgpu-registered-target
+
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX906 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1010 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1101 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1101 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -mattr=+wavefrontsize64 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201-W64 %s
+
+;; The IR was derived from the following source:
+;; extern "C" __global__ void kernel(int* p, int x)
+;; {
+;; if (__builtin_amdgcn_processor_is("gfx1201") ||
+;; __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var))
+;; __builtin_amdgcn_s_sleep_var(x);
+;; if (!__builtin_amdgcn_processor_is("gfx906"))
+;; __builtin_amdgcn_s_wait_event_export_ready();
+;; else if (__builtin_amdgcn_processor_is("gfx1010") ||
+;; __builtin_amdgcn_processor_is("gfx1101"))
+;; __builtin_amdgcn_s_ttracedata_imm(1);
+;; while (__builtin_amdgcn_processor_is("gfx1101")) *p += x;
+;; do {
+;; *p -= x;
+;; } while (__builtin_amdgcn_processor_is("gfx1010"));
+;; for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break;
+;;
+;; if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready))
+;; __builtin_amdgcn_s_wait_event_export_ready();
+;; else if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_ttracedata_imm))
+;; __builtin_amdgcn_s_ttracedata_imm(1);
+;;
+;; do {
+;; *p -= x;
+;; } while (__builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32));
+;; for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p) break;
+;; }
+
+ at llvm.amdgcn.is.gfx1201 = external addrspace(1) externally_initialized constant i1
+ at llvm.amdgcn.has.gfx12-insts = external addrspace(1) externally_initialized constant i1
+ at llvm.amdgcn.is.gfx906 = external addrspace(1) externally_initialized constant i1
+ at llvm.amdgcn.is.gfx1010 = external addrspace(1) externally_initialized constant i1
+ at llvm.amdgcn.is.gfx1101 = external addrspace(1) externally_initialized constant i1
+ at llvm.amdgcn.has.gfx11-insts = external addrspace(1) externally_initialized constant i1
+ at llvm.amdgcn.has.gfx10-insts = external addrspace(1) externally_initialized constant i1
+@"llvm.amdgcn.has.gfx12-insts,wavefrontsize64" = external addrspace(1) externally_initialized constant i1
+
+declare void @llvm.amdgcn.s.sleep.var(i32)
+declare void @llvm.amdgcn.s.wait.event.export.ready()
+declare void @llvm.amdgcn.s.ttracedata.imm(i16 immarg)
+
+define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) {
+; GFX906-LABEL: define amdgpu_kernel void @kernel(
+; GFX906-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX906-NEXT: [[ENTRY:.*:]]
+; GFX906-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
+; GFX906-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; GFX906-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS1:.*]]
+; GFX906: [[IF_GFX1201_OR_GFX12_INSTS1]]:
+; GFX906-NEXT: br label %[[IF_NOT_GFX906:.*]]
+; GFX906: [[IF_GFX1201_OR_GFX12_INSTS:.*:]]
+; GFX906-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
+; GFX906-NEXT: br label %[[IF_NOT_GFX906]]
+; GFX906: [[IF_NOT_GFX906]]:
+; GFX906-NEXT: br label %[[IF_GFX1010_OR_GFX1102:.*]]
+; GFX906: [[IF_NOT_GFX907:.*:]]
+; GFX906-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX906-NEXT: br label %[[IF_END6:.*]]
+; GFX906: [[IF_GFX1010_OR_GFX1102]]:
+; GFX906-NEXT: br label %[[LOR_NOT_GFX1010:.*]]
+; GFX906: [[LOR_NOT_GFX1010]]:
+; GFX906-NEXT: br label %[[FOR_COND:.*]]
+; GFX906: [[IF_GFX1010_OR_GFX1101:.*:]]
+; GFX906-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX906-NEXT: br label %[[IF_END6]]
+; GFX906: [[IF_END6]]:
+; GFX906-NEXT: call void @llvm.assume(i1 true)
+; GFX906-NEXT: call void @llvm.assume(i1 true)
+; GFX906-NEXT: br label %[[FOR_COND]]
+; GFX906: [[FOR_COND]]:
+; GFX906-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX906-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
+; GFX906-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
+; GFX906-NEXT: br label %[[IF_GFX10_INSTS1:.*]]
+; GFX906: [[IF_GFX11_INSTS:.*:]]
+; GFX906-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX906-NEXT: br label %[[IF_END11:.*]]
+; GFX906: [[IF_GFX10_INSTS1]]:
+; GFX906-NEXT: br label %[[IF_END11]]
+; GFX906: [[IF_GFX10_INSTS:.*:]]
+; GFX906-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX906-NEXT: br label %[[IF_END11]]
+; GFX906: [[IF_END11]]:
+; GFX906-NEXT: call void @llvm.assume(i1 true)
+; GFX906-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX906-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
+; GFX906-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4
+; GFX906-NEXT: ret void
+;
+; GFX1010-LABEL: define amdgpu_kernel void @kernel(
+; GFX1010-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX1010-NEXT: [[ENTRY:.*:]]
+; GFX1010-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
+; GFX1010-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; GFX1010-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS1:.*]]
+; GFX1010: [[IF_GFX1201_OR_GFX12_INSTS1]]:
+; GFX1010-NEXT: br label %[[IF_END:.*]]
+; GFX1010: [[IF_GFX1201_OR_GFX12_INSTS:.*:]]
+; GFX1010-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
+; GFX1010-NEXT: br label %[[IF_END]]
+; GFX1010: [[IF_END]]:
+; GFX1010-NEXT: br label %[[IF_NOT_GFX907:.*]]
+; GFX1010: [[IF_NOT_GFX907]]:
+; GFX1010-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1010-NEXT: br label %[[IF_END6:.*]]
+; GFX1010: [[IF_NOT_GFX906:.*:]]
+; GFX1010-NEXT: br label %[[IF_GFX1010_OR_GFX1101:.*]]
+; GFX1010: [[LOR_NOT_GFX1010:.*:]]
+; GFX1010-NEXT: br label %[[FOR_COND:.*]]
+; GFX1010: [[IF_GFX1010_OR_GFX1101]]:
+; GFX1010-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1010-NEXT: br label %[[IF_END6]]
+; GFX1010: [[IF_END6]]:
+; GFX1010-NEXT: call void @llvm.assume(i1 true)
+; GFX1010-NEXT: call void @llvm.assume(i1 false)
+; GFX1010-NEXT: br label %[[FOR_COND]]
+; GFX1010: [[FOR_COND]]:
+; GFX1010-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1010-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
+; GFX1010-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
+; GFX1010-NEXT: br label %[[IF_ELSE8:.*]]
+; GFX1010: [[IF_GFX11_INSTS:.*:]]
+; GFX1010-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1010-NEXT: br label %[[IF_END11:.*]]
+; GFX1010: [[IF_ELSE8]]:
+; GFX1010-NEXT: br label %[[IF_GFX10_INSTS:.*]]
+; GFX1010: [[IF_GFX10_INSTS]]:
+; GFX1010-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1010-NEXT: br label %[[IF_END11]]
+; GFX1010: [[IF_END11]]:
+; GFX1010-NEXT: call void @llvm.assume(i1 true)
+; GFX1010-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1010-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
+; GFX1010-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4
+; GFX1010-NEXT: ret void
+;
+; GFX1101-LABEL: define amdgpu_kernel void @kernel(
+; GFX1101-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX1101-NEXT: [[ENTRY:.*:]]
+; GFX1101-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
+; GFX1101-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; GFX1101-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS1:.*]]
+; GFX1101: [[IF_GFX1201_OR_GFX12_INSTS1]]:
+; GFX1101-NEXT: br label %[[IF_END:.*]]
+; GFX1101: [[IF_GFX1201_OR_GFX12_INSTS:.*:]]
+; GFX1101-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
+; GFX1101-NEXT: br label %[[IF_END]]
+; GFX1101: [[IF_END]]:
+; GFX1101-NEXT: br label %[[IF_NOT_GFX907:.*]]
+; GFX1101: [[IF_NOT_GFX907]]:
+; GFX1101-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1101-NEXT: br label %[[IF_END6:.*]]
+; GFX1101: [[IF_NOT_GFX906:.*:]]
+; GFX1101-NEXT: br label %[[LOR_NOT_GFX1010:.*]]
+; GFX1101: [[LOR_NOT_GFX1010]]:
+; GFX1101-NEXT: br label %[[IF_GFX1010_OR_GFX1101:.*]]
+; GFX1101: [[IF_GFX1010_OR_GFX1101]]:
+; GFX1101-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1101-NEXT: br label %[[IF_END6]]
+; GFX1101: [[IF_END6]]:
+; GFX1101-NEXT: call void @llvm.assume(i1 false)
+; GFX1101-NEXT: call void @llvm.assume(i1 true)
+; GFX1101-NEXT: br label %[[FOR_COND:.*]]
+; GFX1101: [[FOR_COND]]:
+; GFX1101-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1101-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
+; GFX1101-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
+; GFX1101-NEXT: br label %[[IF_GFX11_INSTS:.*]]
+; GFX1101: [[IF_GFX11_INSTS]]:
+; GFX1101-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1101-NEXT: br label %[[IF_END11:.*]]
+; GFX1101: [[IF_ELSE8:.*:]]
+; GFX1101-NEXT: br label %[[IF_GFX10_INSTS:.*]]
+; GFX1101: [[IF_GFX10_INSTS]]:
+; GFX1101-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1101-NEXT: br label %[[IF_END11]]
+; GFX1101: [[IF_END11]]:
+; GFX1101-NEXT: call void @llvm.assume(i1 true)
+; GFX1101-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1101-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
+; GFX1101-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4
+; GFX1101-NEXT: ret void
+;
+; GFX1201-LABEL: define amdgpu_kernel void @kernel(
+; GFX1201-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX1201-NEXT: [[ENTRY:.*:]]
+; GFX1201-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
+; GFX1201-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; GFX1201-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]]
+; GFX1201: [[LOR_NOT_GFX1201:.*:]]
+; GFX1201-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS]]
+; GFX1201: [[IF_GFX1201_OR_GFX12_INSTS]]:
+; GFX1201-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
+; GFX1201-NEXT: br label %[[IF_END:.*]]
+; GFX1201: [[IF_END]]:
+; GFX1201-NEXT: br label %[[IF_NOT_GFX907:.*]]
+; GFX1201: [[IF_NOT_GFX907]]:
+; GFX1201-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1201-NEXT: br label %[[IF_END6:.*]]
+; GFX1201: [[IF_NOT_GFX906:.*:]]
+; GFX1201-NEXT: br label %[[IF_GFX1010_OR_GFX1102:.*]]
+; GFX1201: [[IF_GFX1010_OR_GFX1102]]:
+; GFX1201-NEXT: br label %[[FOR_COND:.*]]
+; GFX1201: [[IF_GFX1010_OR_GFX1101:.*:]]
+; GFX1201-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1201-NEXT: br label %[[IF_END6]]
+; GFX1201: [[IF_END6]]:
+; GFX1201-NEXT: call void @llvm.assume(i1 true)
+; GFX1201-NEXT: call void @llvm.assume(i1 true)
+; GFX1201-NEXT: br label %[[FOR_COND]]
+; GFX1201: [[FOR_COND]]:
+; GFX1201-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1201-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
+; GFX1201-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
+; GFX1201-NEXT: br label %[[IF_GFX11_INSTS:.*]]
+; GFX1201: [[IF_GFX11_INSTS]]:
+; GFX1201-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1201-NEXT: br label %[[IF_END11:.*]]
+; GFX1201: [[IF_ELSE8:.*:]]
+; GFX1201-NEXT: br label %[[IF_GFX10_INSTS:.*]]
+; GFX1201: [[IF_GFX10_INSTS]]:
+; GFX1201-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1201-NEXT: br label %[[IF_END11]]
+; GFX1201: [[IF_END11]]:
+; GFX1201-NEXT: call void @llvm.assume(i1 true)
+; GFX1201-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1201-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
+; GFX1201-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4
+; GFX1201-NEXT: ret void
+;
+; GFX1201-W64-LABEL: define amdgpu_kernel void @kernel(
+; GFX1201-W64-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX1201-W64-NEXT: [[ENTRY:.*:]]
+; GFX1201-W64-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
+; GFX1201-W64-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; GFX1201-W64-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]]
+; GFX1201-W64: [[LOR_NOT_GFX1201:.*:]]
+; GFX1201-W64-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS]]
+; GFX1201-W64: [[IF_GFX1201_OR_GFX12_INSTS]]:
+; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
+; GFX1201-W64-NEXT: br label %[[IF_END:.*]]
+; GFX1201-W64: [[IF_END]]:
+; GFX1201-W64-NEXT: br label %[[IF_NOT_GFX907:.*]]
+; GFX1201-W64: [[IF_NOT_GFX907]]:
+; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1201-W64-NEXT: br label %[[IF_END6:.*]]
+; GFX1201-W64: [[IF_NOT_GFX906:.*:]]
+; GFX1201-W64-NEXT: br label %[[IF_GFX1010_OR_GFX1102:.*]]
+; GFX1201-W64: [[IF_GFX1010_OR_GFX1102]]:
+; GFX1201-W64-NEXT: br label %[[FOR_COND:.*]]
+; GFX1201-W64: [[IF_GFX1010_OR_GFX1101:.*:]]
+; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1201-W64-NEXT: br label %[[IF_END6]]
+; GFX1201-W64: [[IF_END6]]:
+; GFX1201-W64-NEXT: call void @llvm.assume(i1 true)
+; GFX1201-W64-NEXT: call void @llvm.assume(i1 true)
+; GFX1201-W64-NEXT: br label %[[FOR_COND]]
+; GFX1201-W64: [[FOR_COND]]:
+; GFX1201-W64-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1201-W64-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
+; GFX1201-W64-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
+; GFX1201-W64-NEXT: br label %[[IF_GFX11_INSTS:.*]]
+; GFX1201-W64: [[IF_GFX11_INSTS]]:
+; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1201-W64-NEXT: br label %[[IF_END11:.*]]
+; GFX1201-W64: [[IF_ELSE8:.*:]]
+; GFX1201-W64-NEXT: br label %[[IF_GFX10_INSTS:.*]]
+; GFX1201-W64: [[IF_GFX10_INSTS]]:
+; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1201-W64-NEXT: br label %[[IF_END11]]
+; GFX1201-W64: [[IF_END11]]:
+; GFX1201-W64-NEXT: call void @llvm.assume(i1 false)
+; GFX1201-W64-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1201-W64-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
+; GFX1201-W64-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4
+; GFX1201-W64-NEXT: ret void
+;
+entry:
+ %0 = ptrtoint ptr addrspace(1) %p.coerce to i64
+ %1 = inttoptr i64 %0 to ptr
+ %2 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1201, align 1
+ br i1 %2, label %if.gfx1201.or.gfx12-insts, label %lor.not.gfx1201
+
+lor.not.gfx1201:
+ %3 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx12-insts, align 1
+ br i1 %3, label %if.gfx1201.or.gfx12-insts, label %if.end
+
+if.gfx1201.or.gfx12-insts:
+ call void @llvm.amdgcn.s.sleep.var(i32 %x)
+ br label %if.end
+
+if.end:
+ %4 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx906, align 1
+ br i1 %4, label %if.gfx906, label %if.not.gfx906
+
+if.not.gfx906:
+ call void @llvm.amdgcn.s.wait.event.export.ready()
+ br label %if.end6
+
+if.gfx906:
+ %5 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1010, align 1
+ br i1 %5, label %if.gfx1010.or.gfx1101, label %lor.not.gfx1010
+
+lor.not.gfx1010:
+ %6 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1101, align 1
+ br i1 %6, label %if.gfx1010.or.gfx1101, label %for.cond
+
+if.gfx1010.or.gfx1101:
+ call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+ br label %if.end6
+
+if.end6:
+ %.pr.pr = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1101, align 1
+ %7 = icmp ne i1 %.pr.pr, true
+ call void @llvm.assume(i1 %7)
+ %.pr6.pr = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1010, align 1
+ %8 = icmp ne i1 %.pr6.pr, true
+ call void @llvm.assume(i1 %8)
+ br label %for.cond
+
+for.cond:
+ %.promoted = load i32, ptr %1, align 4
+ %sub.peel = sub nsw i32 %.promoted, %x
+ store i32 %sub.peel, ptr %1, align 4
+ %9 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx11-insts, align 1
+ br i1 %9, label %if.gfx11-insts, label %if.else8
+
+if.gfx11-insts:
+ call void @llvm.amdgcn.s.wait.event.export.ready()
+ br label %if.end11
+
+if.else8:
+ %10 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx10-insts, align 1
+ br i1 %10, label %if.gfx10-insts, label %if.end11
+
+if.gfx10-insts:
+ call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+ br label %if.end11
+
+if.end11:
+ %.pr7 = load i1, ptr addrspace(1) @"llvm.amdgcn.has.gfx12-insts,wavefrontsize64", align 1
+ %11 = icmp ne i1 %.pr7, true
+ call void @llvm.assume(i1 %11)
+ %.promoted9 = load i32, ptr %1, align 4
+ %sub13.peel = sub nsw i32 %.promoted9, %x
+ store i32 %sub13.peel, ptr %1, align 4
+ ret void
+}
+
+declare void @llvm.assume(i1 noundef)
>From 8bf116837e2bd77ff5906d025fdb80bfa5507382 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 03:39:32 +0100
Subject: [PATCH 02/17] Fix format.
---
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 8 ++++----
clang/lib/Sema/SemaExpr.cpp | 20 ++++++++++----------
2 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 7b1a3815144b4..8ad1ab74f221d 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -292,8 +292,8 @@ static Value *GetOrInsertAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) {
P->setConstant(true);
P->setExternallyInitialized(true);
- return CGF.Builder.CreateLoad(RawAddress(P, PTy, CharUnits::One(),
- KnownNonNull));
+ return CGF.Builder.CreateLoad(
+ RawAddress(P, PTy, CharUnits::One(), KnownNonNull));
}
Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
@@ -600,7 +600,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_processor_is: {
assert(CGM.getTriple().isSPIRV() &&
"__builtin_amdgcn_processor_is should never reach CodeGen for "
- "concrete targets!");
+ "concrete targets!");
StringRef Proc = cast<clang::StringLiteral>(E->getArg(0))->getString();
return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.is." + Proc);
}
@@ -609,7 +609,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
"__builtin_amdgcn_is_invocable should never reach CodeGen for "
"concrete targets!");
auto FD = cast<FunctionDecl>(
- cast<DeclRefExpr>(E->getArg(0))->getReferencedDeclOfCallee());
+ cast<DeclRefExpr>(E->getArg(0))->getReferencedDeclOfCallee());
StringRef RF =
getContext().BuiltinInfo.getRequiredFeatures(FD->getBuiltinID());
return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.has." + RF);
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 24f5262ab3cf4..bd0183ae4fb82 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -20549,14 +20549,16 @@ static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) {
(!Sema.getASTContext().getAuxTargetInfo() ||
!Sema.getASTContext().getAuxTargetInfo()->isValidCPUName(N))) {
Sema.Diag(CE->getExprLoc(),
- diag::err_amdgcn_processor_is_arg_invalid_value) << N;
+ diag::err_amdgcn_processor_is_arg_invalid_value)
+ << N;
return false;
}
} else {
auto Arg = CE->getArg(0);
if (!Arg || Arg->getType() != Sema.getASTContext().BuiltinFnTy) {
Sema.Diag(CE->getExprLoc(),
- diag::err_amdgcn_is_invocable_arg_invalid_value) << Arg;
+ diag::err_amdgcn_is_invocable_arg_invalid_value)
+ << Arg;
return false;
}
}
@@ -20568,10 +20570,9 @@ static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) {
if (auto UO = dyn_cast<UnaryOperator>(E)) {
auto SE = dyn_cast<CallExpr>(UO->getSubExpr());
if (IsAMDGPUPredicateBI(SE)) {
- assert(
- UO->getOpcode() == UnaryOperator::Opcode::UO_LNot &&
- "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
- "can only be used as operands of logical ops!");
+ assert(UO->getOpcode() == UnaryOperator::Opcode::UO_LNot &&
+ "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
+ "can only be used as operands of logical ops!");
if (!ValidateAMDGPUPredicateBI(Sema, SE)) {
Invalid = true;
@@ -20588,10 +20589,9 @@ static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) {
auto LHS = dyn_cast<CallExpr>(BO->getLHS());
auto RHS = dyn_cast<CallExpr>(BO->getRHS());
if (IsAMDGPUPredicateBI(LHS) && IsAMDGPUPredicateBI(RHS)) {
- assert(
- BO->isLogicalOp() &&
- "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
- "can only be used as operands of logical ops!");
+ assert(BO->isLogicalOp() &&
+ "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
+ "can only be used as operands of logical ops!");
if (!ValidateAMDGPUPredicateBI(Sema, LHS) ||
!ValidateAMDGPUPredicateBI(Sema, RHS)) {
>From 3421292b6e3261410734fb5a324f7dec79080fc1 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 03:42:24 +0100
Subject: [PATCH 03/17] Fix broken patch merge.
---
.../AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 159 ++++++++++++++
.../AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp | 207 ------------------
2 files changed, 159 insertions(+), 207 deletions(-)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
new file mode 100644
index 0000000000000..17357c452b6d3
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -0,0 +1,159 @@
+//===- AMDGPUExpandFeaturePredicates.cpp - Feature Predicate Expander Pass ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file implements a pass that expands AMDGCN generic feature predicates
+// into target specific quantities / sequences. In this context, a generic
+// feature predicate is an implementation detail global variable that is
+// inserted by the FE as a consequence of using either the __builtin_cpu_is
+// or the __builtin_amdgcn_is_invocable special builtins on an abstract target
+// (AMDGCNSPIRV). Once the concrete target is known, these placeholder globals
+// guide target specific lowering: their value is constant folded all the way
+// into a terminator (i.e. a controlled block) or until it has no live uses.
+// The pass makes a best effort attempt to look through calls, i.e. a constant
+// evaluatable passthrough of a predicate value will generally work; however,
+// we hard fail if the folding fails, in order to avoid obtuse BE errors or
+// opaque run time errors. This pass should run as early as possible /
+// immediately after Clang CodeGen, so that the optimisation pipeline and the
+// BE operate with concrete target data.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#include <string>
+#include <utility>
+
+using namespace llvm;
+
+namespace {
+template <typename C>
+void collectUsers(Value *V, C &Container) {
+ assert(V && "Must pass an existing Value!");
+
+ for (auto &&U : V->users())
+ if (auto I = dyn_cast<Instruction>(U))
+ Container.insert(Container.end(), I);
+}
+
+inline void setPredicate(const GCNSubtarget &ST, GlobalVariable *P) {
+ const auto IsFeature = P->getName().starts_with("llvm.amdgcn.has");
+ const auto Offset =
+ IsFeature ? sizeof("llvm.amdgcn.has") : sizeof("llvm.amdgcn.is");
+
+ auto PV = P->getName().substr(Offset).str();
+ if (IsFeature) {
+ auto Dx = PV.find(',');
+ while (Dx != std::string::npos) {
+ PV.insert(++Dx, {'+'});
+
+ Dx = PV.find(',', Dx);
+ }
+ PV.insert(PV.cbegin(), '+');
+ }
+
+ auto PTy = P->getValueType();
+ P->setLinkage(GlobalValue::PrivateLinkage);
+ P->setExternallyInitialized(false);
+
+ if (IsFeature)
+ P->setInitializer(ConstantInt::getBool(PTy, ST.checkFeatures(PV)));
+ else
+ P->setInitializer(ConstantInt::getBool(PTy, PV == ST.getCPU()));
+}
+
+std::pair<PreservedAnalyses, bool>
+unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) {
+ std::string W;
+ raw_string_ostream OS(W);
+
+ OS << "Impossible to constant fold feature predicate: " << *P
+ << " used by " << *NoFold << ", please simplify.\n";
+
+ Caller->getContext().diagnose(
+ DiagnosticInfoUnsupported(*Caller, W, NoFold->getDebugLoc(), DS_Error));
+
+ return {PreservedAnalyses::none(), false};
+}
+
+std::pair<PreservedAnalyses, bool>
+handlePredicate(const GCNSubtarget &ST, GlobalVariable *P) {
+ setPredicate(ST, P);
+
+ SmallPtrSet<Instruction *, 32> ToFold;
+ collectUsers(P, ToFold);
+
+ if (ToFold.empty())
+ return {PreservedAnalyses::all(), true};
+
+ do {
+ auto I = *ToFold.begin();
+ ToFold.erase(I);
+
+ if (auto C = ConstantFoldInstruction(I, P->getDataLayout())) {
+ collectUsers(I, ToFold);
+ I->replaceAllUsesWith(C);
+ I->eraseFromParent();
+ continue;
+ } else if (I->isTerminator() && ConstantFoldTerminator(I->getParent())) {
+ continue;
+ } else if (I->users().empty()) {
+ continue;
+ }
+
+ return unfoldableFound(I->getParent()->getParent(), P, I);
+ } while (!ToFold.empty());
+
+ return {PreservedAnalyses::none(), true};
+}
+} // Unnamed namespace.
+
+PreservedAnalyses
+AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) {
+ if (M.empty())
+ return PreservedAnalyses::all();
+
+ SmallVector<GlobalVariable *> Predicates;
+ for (auto &&G : M.globals()) {
+ if (!G.isDeclaration() || !G.hasName())
+ continue;
+ if (G.getName().starts_with("llvm.amdgcn."))
+ Predicates.push_back(&G);
+ }
+
+ if (Predicates.empty())
+ return PreservedAnalyses::all();
+
+ const auto &ST = TM.getSubtarget<GCNSubtarget>(
+ *find_if(M, [](auto &&F) { return !F.isIntrinsic(); }));
+
+ auto Ret = PreservedAnalyses::all();
+ for (auto &&P : Predicates) {
+ auto R = handlePredicate(ST, P);
+
+ if (!R.second)
+ break;
+
+ Ret.intersect(R.first);
+ }
+
+ for (auto &&P : Predicates)
+ P->eraseFromParent();
+
+ return Ret;
+}
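
For illustration, a minimal before/after IR sketch of the expansion this new
pass performs; this is distilled from the tests earlier in the series, is not
taken from the patch itself, and assumes compilation for a gfx1201 subtarget:

  ; Before: the FE has emitted the placeholder predicate global and a guard.
  @llvm.amdgcn.is.gfx1201 = external addrspace(1) externally_initialized constant i1

  declare void @llvm.amdgcn.s.sleep.var(i32)

  define void @f(i32 %x) {
  entry:
    %p = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1201, align 1
    br i1 %p, label %if.then, label %if.end

  if.then:
    call void @llvm.amdgcn.s.sleep.var(i32 %x)
    br label %if.end

  if.end:
    ret void
  }

  ; After: for the matching subtarget the load folds to true, the conditional
  ; branch is folded by ConstantFoldTerminator, and the placeholder global is
  ; erased; for a non-matching subtarget the branch would instead fold to
  ; %if.end and the guarded block would go dead.
  define void @f(i32 %x) {
  entry:
    br label %if.then

  if.then:
    call void @llvm.amdgcn.s.sleep.var(i32 %x)
    br label %if.end

  if.end:
    ret void
  }
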
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
deleted file mode 100644
index 125051c6aa0cf..0000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
+++ /dev/null
@@ -1,207 +0,0 @@
-//===- AMDGPUExpandPseudoIntrinsics.cpp - Pseudo Intrinsic Expander Pass --===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// This file implements a pass that deals with expanding AMDGCN generic pseudo-
-// intrinsics into target specific quantities / sequences. In this context, a
-// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a
-// specific instruction, but rather is intended as a mechanism for abstractly
-// conveying target specific info to a HLL / the FE, without concretely
-// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize.
-// This pass should run as early as possible / immediately after Clang CodeGen,
-// so that the optimisation pipeline and the BE operate with concrete target
-// data.
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
-#include "GCNSubtarget.h"
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Target/TargetIntrinsicInfo.h"
-#include "llvm/Transforms/IPO/AlwaysInliner.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-#include <string>
-#include <utility>
-
-using namespace llvm;
-
-namespace {
-inline Function *getCloneForInlining(Function *OldF) {
- assert(OldF && "Must pass an existing Function!");
-
- // TODO - Alias Value to clone arg.
- ValueToValueMapTy VMap;
-
- auto NewF = CloneFunction(OldF, VMap);
-
- NewF->removeFnAttr(Attribute::OptimizeNone);
- NewF->removeFnAttr(Attribute::NoInline);
- NewF->addFnAttr(Attribute::AlwaysInline);
-
- return NewF;
-}
-
-template <typename C>
-inline void collectUsers(Value *V, ModulePassManager &AlwaysInliner,
- ModuleAnalysisManager &MAM,
- SmallDenseMap<Function *, Function *> &InlinableClones,
- C &Container) {
- assert(V && "Must pass an existing Value!");
-
- auto A = PreservedAnalyses::all();
-
- constexpr auto IsValidCall = [](auto &&U) {
- if (auto CB = dyn_cast<CallBase>(U))
- if (auto F = CB->getCalledFunction())
- if (!F->isIntrinsic() && !F->isDeclaration())
- return true;
- return false;
- };
-
- SmallVector<User *> Calls{};
- copy_if(V->users(), std::back_inserter(Calls), IsValidCall);
-
- while (!Calls.empty()) {
- for (auto &&Call : Calls) {
- auto CB = cast<CallBase>(Call);
- auto &TempF = InlinableClones[CB->getCalledFunction()];
-
- if (!TempF)
- TempF = getCloneForInlining(CB->getCalledFunction());
-
- CB->setCalledFunction(TempF);
- CB->removeFnAttr(Attribute::NoInline);
- CB->addFnAttr(Attribute::AlwaysInline);
-
- AlwaysInliner.run(*TempF->getParent(), MAM);
- }
-
- Calls.clear();
-
- copy_if(V->users(), std::back_inserter(Calls), IsValidCall);
- }
-
- for (auto &&U : V->users())
- if (auto I = dyn_cast<Instruction>(U)) {
- if (auto CB = dyn_cast<CallBase>(I)) {
- if (CB->getCalledFunction() && !CB->getCalledFunction()->isIntrinsic())
- Container.insert(Container.end(), I);
- } else {
- Container.insert(Container.end(), I);
- }
- }
-}
-
-std::pair<PreservedAnalyses, bool>
-handlePredicate(const GCNSubtarget &ST, ModuleAnalysisManager &MAM,
- SmallDenseMap<Function *, Function *>& InlinableClones,
- GlobalVariable *P) {
- auto PV = P->getName().substr(P->getName().rfind('.') + 1).str();
- auto Dx = PV.find(',');
- while (Dx != std::string::npos) {
- PV.insert(++Dx, {'+'});
-
- Dx = PV.find(',', Dx);
- }
-
- auto PTy = P->getValueType();
- P->setLinkage(GlobalValue::PrivateLinkage);
- P->setExternallyInitialized(false);
-
- if (P->getName().starts_with("llvm.amdgcn.is"))
- P->setInitializer(ConstantInt::getBool(PTy, PV == ST.getCPU()));
- else
- P->setInitializer(ConstantInt::getBool(PTy, ST.checkFeatures('+' + PV)));
-
- ModulePassManager MPM;
- MPM.addPass(AlwaysInlinerPass());
-
- SmallPtrSet<Instruction *, 32> ToFold;
- collectUsers(P, MPM, MAM, InlinableClones, ToFold);
-
- if (ToFold.empty())
- return {PreservedAnalyses::all(), true};
-
- do {
- auto I = *ToFold.begin();
- ToFold.erase(I);
-
- if (auto C = ConstantFoldInstruction(I, P->getDataLayout())) {
- collectUsers(I, MPM, MAM, InlinableClones, ToFold);
- I->replaceAllUsesWith(C);
- I->eraseFromParent();
- continue;
- } else if (I->isTerminator() && ConstantFoldTerminator(I->getParent())) {
- continue;
- } else if (I->users().empty()) {
- continue;
- }
-
- std::string W;
- raw_string_ostream OS(W);
-
- auto Caller = I->getParent()->getParent();
-
- OS << "Impossible to constant fold feature predicate: " << P->getName()
- << ", please simplify.\n";
-
- Caller->getContext().diagnose(
- DiagnosticInfoUnsupported(*Caller, W, I->getDebugLoc(), DS_Error));
-
- return {PreservedAnalyses::none(), false};
- } while (!ToFold.empty());
-
- return {PreservedAnalyses::none(), true};
-}
-} // Unnamed namespace.
-
-PreservedAnalyses
-AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &MAM) {
- if (M.empty())
- return PreservedAnalyses::all();
-
- SmallVector<GlobalVariable *> Predicates;
- for (auto &&G : M.globals()) {
- if (!G.isDeclaration() || !G.hasName())
- continue;
- if (G.getName().starts_with("llvm.amdgcn."))
- Predicates.push_back(&G);
- }
-
- if (Predicates.empty())
- return PreservedAnalyses::all();
-
- PreservedAnalyses Ret = PreservedAnalyses::all();
-
- SmallDenseMap<Function *, Function *> InlinableClones;
- const auto &ST = TM.getSubtarget<GCNSubtarget>(
- *find_if(M, [](auto &&F) { return !F.isIntrinsic(); }));
-
- for (auto &&P : Predicates) {
- auto R = handlePredicate(ST, MAM, InlinableClones, P);
-
- if (!R.second)
- return PreservedAnalyses::none();
-
- Ret.intersect(R.first);
- }
-
- for (auto &&C : InlinableClones)
- C.second->eraseFromParent();
-
- return Ret;
-}
>From 539c7e6c6357fa7330de9e23fa13cf795061b85b Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 03:51:08 +0100
Subject: [PATCH 04/17] Add release notes.
---
clang/docs/ReleaseNotes.rst | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c4e82678949ff..005b33da29d2d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -418,6 +418,10 @@ AMDGPU Support
^^^^^^^^^^^^^^
- Bump the default code object version to 6. ROCm 6.3 is required to run any program compiled with COV6.
+- Introduced a new target specific builtin ``__builtin_amdgcn_processor_is``,
+  a late / deferred query for the current target processor.
+- Introduced a new target specific builtin ``__builtin_amdgcn_is_invocable``,
+  which enables fine-grained, per-builtin feature availability queries.
NVPTX Support
^^^^^^^^^^^^^^
>From 5926b9f715fce59e753756f5330f311e3f916667 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 03:55:39 +0100
Subject: [PATCH 05/17] (Hopefully) Final format fix.
---
.../Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
index 17357c452b6d3..8d38508eda74b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -42,8 +42,7 @@
using namespace llvm;
namespace {
-template <typename C>
-void collectUsers(Value *V, C &Container) {
+template <typename C> void collectUsers(Value *V, C &Container) {
assert(V && "Must pass an existing Value!");
for (auto &&U : V->users())
@@ -82,8 +81,8 @@ unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) {
std::string W;
raw_string_ostream OS(W);
- OS << "Impossible to constant fold feature predicate: " << *P
- << " used by " << *NoFold << ", please simplify.\n";
+ OS << "Impossible to constant fold feature predicate: " << *P << " used by "
+ << *NoFold << ", please simplify.\n";
Caller->getContext().diagnose(
DiagnosticInfoUnsupported(*Caller, W, NoFold->getDebugLoc(), DS_Error));
@@ -91,8 +90,8 @@ unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) {
return {PreservedAnalyses::none(), false};
}
-std::pair<PreservedAnalyses, bool>
-handlePredicate(const GCNSubtarget &ST, GlobalVariable *P) {
+std::pair<PreservedAnalyses, bool> handlePredicate(const GCNSubtarget &ST,
+ GlobalVariable *P) {
setPredicate(ST, P);
SmallPtrSet<Instruction *, 32> ToFold;
>From 4381d930084f38d9e4099d8c8fbea0e4267556a9 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 04:01:27 +0100
Subject: [PATCH 06/17] Remove stray space.
---
llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
index 8d38508eda74b..6d6c457170f8c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -81,7 +81,7 @@ unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) {
std::string W;
raw_string_ostream OS(W);
- OS << "Impossible to constant fold feature predicate: " << *P << " used by "
+ OS << "Impossible to constant fold feature predicate: " << *P << " used by "
<< *NoFold << ", please simplify.\n";
Caller->getContext().diagnose(
>From d18f64e455f0d3b91c013bd0d99e895adc57fcad Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 11:01:59 +0100
Subject: [PATCH 07/17] Remove unused header, fix borked test.
---
...pu-builtin-cpu-is.c => amdgpu-builtin-processor-is.c} | 9 +++------
llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 1 -
2 files changed, 3 insertions(+), 7 deletions(-)
rename clang/test/CodeGen/{amdgpu-builtin-cpu-is.c => amdgpu-builtin-processor-is.c} (92%)
diff --git a/clang/test/CodeGen/amdgpu-builtin-cpu-is.c b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
similarity index 92%
rename from clang/test/CodeGen/amdgpu-builtin-cpu-is.c
rename to clang/test/CodeGen/amdgpu-builtin-processor-is.c
index 6e261d9f5d239..f5d80bff1c51e 100644
--- a/clang/test/CodeGen/amdgpu-builtin-cpu-is.c
+++ b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
@@ -10,10 +10,6 @@
// 3) for AMDGCNSPIRV we emit llvm.amdgcn.is.gfx900 as a bool global, and
// load from it to provide the condition a br (abstract target)
//.
-// AMDGCN-GFX900: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600
-//.
-// AMDGCN-GFX1010: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600
-//.
// AMDGCNSPIRV: @llvm.amdgcn.is.gfx900 = external addrspace(1) externally_initialized constant i1
//.
// AMDGCN-GFX900-LABEL: define dso_local void @foo(
@@ -31,7 +27,8 @@
// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] {
// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]]
// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx900, align 1
-// AMDGCNSPIRV-NEXT: br i1 [[TMP0]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// AMDGCNSPIRV-NEXT: [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false
+// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
// AMDGCNSPIRV: [[IF_THEN]]:
// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.trap()
// AMDGCNSPIRV-NEXT: br label %[[IF_END]]
@@ -39,7 +36,7 @@
// AMDGCNSPIRV-NEXT: ret void
//
void foo() {
- if (__builtin_cpu_is("gfx900"))
+ if (__builtin_amdgcn_processor_is("gfx900"))
return __builtin_trap();
}
//.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
index 6d6c457170f8c..ae100e2f5b213 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -33,7 +33,6 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
-#include "llvm/Target/TargetIntrinsicInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include <string>
>From 7880ff498495511c70952c0a135b5e9f9b837889 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 15:09:48 +0100
Subject: [PATCH 08/17] Stars everywhere.
---
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 4 +--
clang/lib/Sema/SemaExpr.cpp | 30 +++++++++----------
.../AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 8 ++---
3 files changed, 21 insertions(+), 21 deletions(-)
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 8ad1ab74f221d..179b9ad02177b 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -287,7 +287,7 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
static Value *GetOrInsertAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) {
auto PTy = IntegerType::getInt1Ty(CGF.getLLVMContext());
- auto P = cast<GlobalVariable>(
+ auto *P = cast<GlobalVariable>(
CGF.CGM.getModule().getOrInsertGlobal(Name.str(), PTy));
P->setConstant(true);
P->setExternallyInitialized(true);
@@ -608,7 +608,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
assert(CGM.getTriple().isSPIRV() &&
"__builtin_amdgcn_is_invocable should never reach CodeGen for "
"concrete targets!");
- auto FD = cast<FunctionDecl>(
+ auto *FD = cast<FunctionDecl>(
cast<DeclRefExpr>(E->getArg(0))->getReferencedDeclOfCallee());
StringRef RF =
getContext().BuiltinInfo.getRequiredFeatures(FD->getBuiltinID());
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index bd0183ae4fb82..44fd9aa1f1834 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -6546,11 +6546,11 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc,
// without any additional checking.
if (Fn->getType() == Context.BuiltinFnTy && ArgExprs.size() == 1 &&
ArgExprs[0]->getType() == Context.BuiltinFnTy) {
- auto FD = cast<FunctionDecl>(Fn->getReferencedDeclOfCallee());
+ auto *FD = cast<FunctionDecl>(Fn->getReferencedDeclOfCallee());
if (FD->getName() == "__builtin_amdgcn_is_invocable") {
auto FnPtrTy = Context.getPointerType(FD->getType());
- auto R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get();
+ auto *R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get();
return CallExpr::Create(Context, R, ArgExprs, Context.VoidTy,
ExprValueKind::VK_PRValue, RParenLoc,
FPOptionsOverride());
@@ -13254,8 +13254,8 @@ static inline bool IsAMDGPUPredicateBI(Expr *E) {
if (!E->getType()->isVoidType())
return false;
- if (auto CE = dyn_cast<CallExpr>(E)) {
- if (auto BI = CE->getDirectCallee())
+ if (auto *CE = dyn_cast<CallExpr>(E)) {
+ if (auto *BI = CE->getDirectCallee())
if (BI->getName() == "__builtin_amdgcn_processor_is" ||
BI->getName() == "__builtin_amdgcn_is_invocable")
return true;
@@ -15622,14 +15622,14 @@ static Expr *ExpandAMDGPUPredicateBI(ASTContext &Ctx, CallExpr *CE) {
auto &TI = Ctx.getTargetInfo();
if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") {
- auto GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts());
+ auto *GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts());
auto TID = TI.getTargetID();
if (GFX && TID) {
auto N = GFX->getString();
P = TI.isValidCPUName(GFX->getString()) && TID->find(N) == 0;
}
} else {
- auto FD = cast<FunctionDecl>(CE->getArg(0)->getReferencedDeclOfCallee());
+ auto *FD = cast<FunctionDecl>(CE->getArg(0)->getReferencedDeclOfCallee());
StringRef RF = Ctx.BuiltinInfo.getRequiredFeatures(FD->getBuiltinID());
llvm::StringMap<bool> CF;
@@ -20538,7 +20538,7 @@ void Sema::DiagnoseEqualityWithExtraParens(ParenExpr *ParenE) {
static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) {
if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") {
- auto GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts());
+ auto *GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts());
if (!GFX) {
Sema.Diag(CE->getExprLoc(),
diag::err_amdgcn_processor_is_arg_not_literal);
@@ -20554,7 +20554,7 @@ static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) {
return false;
}
} else {
- auto Arg = CE->getArg(0);
+ auto *Arg = CE->getArg(0);
if (!Arg || Arg->getType() != Sema.getASTContext().BuiltinFnTy) {
Sema.Diag(CE->getExprLoc(),
diag::err_amdgcn_is_invocable_arg_invalid_value)
@@ -20567,8 +20567,8 @@ static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) {
}
static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) {
- if (auto UO = dyn_cast<UnaryOperator>(E)) {
- auto SE = dyn_cast<CallExpr>(UO->getSubExpr());
+ if (auto *UO = dyn_cast<UnaryOperator>(E)) {
+ auto *SE = dyn_cast<CallExpr>(UO->getSubExpr());
if (IsAMDGPUPredicateBI(SE)) {
assert(UO->getOpcode() == UnaryOperator::Opcode::UO_LNot &&
"__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
@@ -20585,9 +20585,9 @@ static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) {
return UO;
}
}
- if (auto BO = dyn_cast<BinaryOperator>(E)) {
- auto LHS = dyn_cast<CallExpr>(BO->getLHS());
- auto RHS = dyn_cast<CallExpr>(BO->getRHS());
+ if (auto *BO = dyn_cast<BinaryOperator>(E)) {
+ auto *LHS = dyn_cast<CallExpr>(BO->getLHS());
+ auto *RHS = dyn_cast<CallExpr>(BO->getRHS());
if (IsAMDGPUPredicateBI(LHS) && IsAMDGPUPredicateBI(RHS)) {
assert(BO->isLogicalOp() &&
"__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
@@ -20606,7 +20606,7 @@ static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) {
return BO;
}
}
- if (auto CE = dyn_cast<CallExpr>(E))
+ if (auto *CE = dyn_cast<CallExpr>(E))
if (IsAMDGPUPredicateBI(CE)) {
if (!ValidateAMDGPUPredicateBI(Sema, CE)) {
Invalid = true;
@@ -20631,7 +20631,7 @@ ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E,
if (!E->isTypeDependent()) {
if (E->getType()->isVoidType()) {
bool IsInvalidPredicate = false;
- if (auto BIC = MaybeHandleAMDGPUPredicateBI(*this, E, IsInvalidPredicate))
+ if (auto *BIC = MaybeHandleAMDGPUPredicateBI(*this, E, IsInvalidPredicate))
return BIC;
else if (IsInvalidPredicate)
return ExprError();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
index ae100e2f5b213..f1c73e86fb4a0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -45,7 +45,7 @@ template <typename C> void collectUsers(Value *V, C &Container) {
assert(V && "Must pass an existing Value!");
for (auto &&U : V->users())
- if (auto I = dyn_cast<Instruction>(U))
+ if (auto *I = dyn_cast<Instruction>(U))
Container.insert(Container.end(), I);
}
@@ -65,7 +65,7 @@ inline void setPredicate(const GCNSubtarget &ST, GlobalVariable *P) {
PV.insert(PV.cbegin(), '+');
}
- auto PTy = P->getValueType();
+ auto *PTy = P->getValueType();
P->setLinkage(GlobalValue::PrivateLinkage);
P->setExternallyInitialized(false);
@@ -100,10 +100,10 @@ std::pair<PreservedAnalyses, bool> handlePredicate(const GCNSubtarget &ST,
return {PreservedAnalyses::all(), true};
do {
- auto I = *ToFold.begin();
+ auto *I = *ToFold.begin();
ToFold.erase(I);
- if (auto C = ConstantFoldInstruction(I, P->getDataLayout())) {
+ if (auto *C = ConstantFoldInstruction(I, P->getDataLayout())) {
collectUsers(I, ToFold);
I->replaceAllUsesWith(C);
I->eraseFromParent();
>From 719dfdea50ae31ac54040a95d499dae98f714a52 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 15:33:31 +0100
Subject: [PATCH 09/17] Fix format without line break.
---
clang/lib/Sema/SemaExpr.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 44fd9aa1f1834..889d54be8d91b 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -20630,10 +20630,10 @@ ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E,
if (!E->isTypeDependent()) {
if (E->getType()->isVoidType()) {
- bool IsInvalidPredicate = false;
- if (auto *BIC = MaybeHandleAMDGPUPredicateBI(*this, E, IsInvalidPredicate))
+ bool InvalidPredicate = false;
+ if (auto *BIC = MaybeHandleAMDGPUPredicateBI(*this, E, InvalidPredicate))
return BIC;
- else if (IsInvalidPredicate)
+ else if (InvalidPredicate)
return ExprError();
}
>From 36b69b41f9d92901b1799bd8515ef4d8c9a41f51 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 15:40:37 +0100
Subject: [PATCH 10/17] Add host tests.
---
clang/test/SemaHIP/amdgpu-is-invocable.hip | 21 +++++++++++++++++++++
clang/test/SemaHIP/amdgpu-processor-is.hip | 21 +++++++++++++++++++++
2 files changed, 42 insertions(+)
create mode 100644 clang/test/SemaHIP/amdgpu-is-invocable.hip
create mode 100644 clang/test/SemaHIP/amdgpu-processor-is.hip
diff --git a/clang/test/SemaHIP/amdgpu-is-invocable.hip b/clang/test/SemaHIP/amdgpu-is-invocable.hip
new file mode 100644
index 0000000000000..214d7769a595f
--- /dev/null
+++ b/clang/test/SemaHIP/amdgpu-is-invocable.hip
@@ -0,0 +1,21 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: spirv-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s
+
+// expected-no-diagnostics
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ void foo() {
+ if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16))
+ return __builtin_trap();
+}
+
+__global__ void bar() {
+ if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16))
+ return __builtin_trap();
+}
diff --git a/clang/test/SemaHIP/amdgpu-processor-is.hip b/clang/test/SemaHIP/amdgpu-processor-is.hip
new file mode 100644
index 0000000000000..0f7211fd75d90
--- /dev/null
+++ b/clang/test/SemaHIP/amdgpu-processor-is.hip
@@ -0,0 +1,21 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: spirv-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s
+
+// expected-no-diagnostics
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ void foo() {
+ if (__builtin_amdgcn_processor_is("gfx900"))
+ return __builtin_trap();
+}
+
+__global__ void bar() {
+ if (__builtin_amdgcn_processor_is("gfx900"))
+ return __builtin_trap();
+}
>From e327e1520b2453e69d888d1be3d5c68c40a0456a Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 16:48:04 +0100
Subject: [PATCH 11/17] Fit code examples within 80-char limit.
---
clang/docs/LanguageExtensions.rst | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 8a7cb75af13e5..817f6a62f6a41 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -4956,7 +4956,9 @@ a functional mechanism for programatically querying:
while (__builtin_amdgcn_processor_is("gfx1101")) *p += x;
- do { *p -= x; } while (__builtin_amdgcn_processor_is("gfx1010"));
+ do {
+ *p -= x;
+ } while (__builtin_amdgcn_processor_is("gfx1010"));
for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break;
@@ -4967,9 +4969,11 @@ a functional mechanism for programatically querying:
do {
*p -= x;
- } while (__builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32));
+ } while (
+ __builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32));
- for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p) break;
+ for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p)
+ break;
**Description**:
>From 888a0803db90e38a6d912b7d019b27196eee3bf3 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 16 Apr 2025 03:35:14 +0300
Subject: [PATCH 12/17] Fix tests.
---
clang/test/CodeGen/amdgpu-builtin-is-invocable.c | 2 +-
clang/test/CodeGen/amdgpu-builtin-processor-is.c | 2 +-
clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp | 6 +++---
3 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
index 6d2690cb75b7c..12f283707308e 100644
--- a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
+++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
@@ -47,7 +47,7 @@ void foo() {
// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
// AMDGCN-GFX1010: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
//.
-// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" }
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" }
// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
//.
// AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
diff --git a/clang/test/CodeGen/amdgpu-builtin-processor-is.c b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
index f5d80bff1c51e..76dead8ebbe89 100644
--- a/clang/test/CodeGen/amdgpu-builtin-processor-is.c
+++ b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
@@ -45,7 +45,7 @@ void foo() {
//.
// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
//.
-// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" }
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" }
// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
//.
// AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
diff --git a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
index f618f54909b00..26cc8b4c7631d 100644
--- a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
+++ b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
@@ -32,11 +32,11 @@ void invalid_invocations(int x, const char* str) {
// CHECK: error: the argument to __builtin_amdgcn_processor_is must be a string literal
if (__builtin_amdgcn_processor_is(str)) return;
- // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `"__builtin_amdgcn_s_sleep_var"` is not valid
+ // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}__builtin_amdgcn_s_sleep_var{{.*}} is not valid
if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return;
- // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `str` is not valid
+ // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}str{{.*}} is not valid
else if (__builtin_amdgcn_is_invocable(str)) return;
- // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `x` is not valid
+ // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}x{{.*}} is not valid
else if (__builtin_amdgcn_is_invocable(x)) return;
// CHECK: error: use of undeclared identifier '__builtin_ia32_pause'
else if (__builtin_amdgcn_is_invocable(__builtin_ia32_pause)) return;
>From e35ac6281f1b22539e4771dfd2893bdabeb452b6 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 16 Apr 2025 15:31:30 +0300
Subject: [PATCH 13/17] Fix test.
---
.../CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp b/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp
index 44557284fc581..cffd3c7a5fb1f 100644
--- a/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp
+++ b/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp
@@ -7,7 +7,7 @@
// HIPSTDPAR-PRE: Running pass: EntryExitInstrumenterPass
// HIPSTDPAR-PRE-NEXT: Running pass: EntryExitInstrumenterPass
// HIPSTDPAR-PRE-NOT: Running pass: HipStdParAcceleratorCodeSelectionPass
-// HIPSTDPAR-PRE-NEXT: Running pass: AlwaysInlinerPass
+// HIPSTDPAR-PRE-NEXT: Running pass: AMDGPUExpandFeaturePredicatesPass
// Ensure Pass HipStdParAcceleratorCodeSelectionPass is invoked in PostLink.
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -mllvm -amdgpu-enable-hipstdpar -fcuda-is-device -fdebug-pass-manager -emit-llvm \
>From a8bca2fe2c054187981afcfca155e95efde26447 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Tue, 6 May 2025 01:47:53 +0100
Subject: [PATCH 14/17] Re-work implementation to return a target specific
type.
---
clang/docs/LanguageExtensions.rst | 61 ++------
clang/include/clang/Basic/AMDGPUTypes.def | 8 +
clang/include/clang/Basic/Builtins.def | 1 +
clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 +-
.../clang/Basic/DiagnosticSemaKinds.td | 9 +-
clang/include/clang/Sema/SemaAMDGPU.h | 4 +
clang/lib/AST/ASTContext.cpp | 11 +-
clang/lib/CodeGen/CGDebugInfo.cpp | 7 +
clang/lib/CodeGen/CGExprScalar.cpp | 4 +
clang/lib/CodeGen/CodeGenTypes.cpp | 3 +
clang/lib/Sema/Sema.cpp | 7 +-
clang/lib/Sema/SemaAMDGPU.cpp | 60 +++++++
clang/lib/Sema/SemaCast.cpp | 9 ++
clang/lib/Sema/SemaDecl.cpp | 15 ++
clang/lib/Sema/SemaExpr.cpp | 147 +-----------------
clang/lib/Sema/SemaInit.cpp | 16 ++
clang/lib/Sema/SemaOverload.cpp | 14 +-
.../amdgpu-feature-builtins-invalid-use.cpp | 41 +++--
18 files changed, 209 insertions(+), 212 deletions(-)
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 0c8dd564aed4a..da8b16501d00a 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -4950,12 +4950,8 @@ a functional mechanism for programatically querying:
.. code-block:: c
- // When used as the predicate for a control structure
- bool __builtin_amdgcn_processor_is(const char*);
- bool __builtin_amdgcn_is_invocable(builtin_name);
- // Otherwise
- void __builtin_amdgcn_processor_is(const char*);
- void __builtin_amdgcn_is_invocable(void);
+ __amdgpu_feature_predicate_t __builtin_amdgcn_processor_is(const char*);
+ __amdgpu_feature_predicate_t __builtin_amdgcn_is_invocable(builtin_name);
**Example of use**:
@@ -4974,7 +4970,7 @@ a functional mechanism for programatically querying:
while (__builtin_amdgcn_processor_is("gfx1101")) *p += x;
do {
- *p -= x;
+ break;
} while (__builtin_amdgcn_processor_is("gfx1010"));
for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break;
@@ -4985,7 +4981,7 @@ a functional mechanism for programatically querying:
__builtin_amdgcn_s_ttracedata_imm(1);
do {
- *p -= x;
+ break;
} while (
__builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32));
@@ -4994,17 +4990,21 @@ a functional mechanism for programatically querying:
**Description**:
-When used as the predicate value of the following control structures:
+The builtins return a value of type ``__amdgpu_feature_predicate_t``, which is
+a target specific type that behaves as if its C++ definition were the following:
.. code-block:: c++
- if (...)
- while (...)
- do { } while (...)
- for (...)
+ struct __amdgpu_feature_predicate_t {
+ __amdgpu_feature_predicate_t() = delete;
+ __amdgpu_feature_predicate_t(const __amdgpu_feature_predicate_t&) = delete;
+ __amdgpu_feature_predicate_t(__amdgpu_feature_predicate_t&&) = delete;
+
+ explicit
+ operator bool() const noexcept;
+ };
-be it directly, or as arguments to logical operators such as ``!, ||, &&``, the
-builtins return a boolean value that:
+The boolean interpretation of the predicate values returned by the builtins:
* indicates whether the current target matches the argument; the argument MUST
be a string literal and a valid AMDGPU target
@@ -5012,37 +5012,6 @@ builtins return a boolean value that:
by the current target; the argument MUST be either a generic or AMDGPU
specific builtin name
-Outside of these contexts, the builtins have a ``void`` returning signature
-which prevents their misuse.
-
-**Example of invalid use**:
-
-.. code-block:: c++
-
- void kernel(int* p, int x, bool (*pfn)(bool), const char* str) {
- if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return;
- else if (__builtin_amdgcn_processor_is(str)) __builtin_trap();
-
- bool a = __builtin_amdgcn_processor_is("gfx906");
- const bool b = !__builtin_amdgcn_processor_is("gfx906");
- const bool c = !__builtin_amdgcn_processor_is("gfx906");
- bool d = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
- bool e = !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
- const auto f =
- !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready)
- || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
- const auto g =
- !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready)
- || !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
- __builtin_amdgcn_processor_is("gfx1201")
- ? __builtin_amdgcn_s_sleep_var(x) : __builtin_amdgcn_s_sleep(42);
- if (pfn(__builtin_amdgcn_processor_is("gfx1200")))
- __builtin_amdgcn_s_sleep_var(x);
-
- if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return;
- else if (__builtin_amdgcn_is_invocable(x)) __builtin_trap();
- }
-
When invoked while compiling for a concrete target, the builtins are evaluated
early by Clang, and never produce any CodeGen effects / have no observable
side-effects in IR. Conversely, when compiling for AMDGCN flavoured SPIR-v,
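
A minimal sketch of the semantics the rewritten documentation describes, as I
read the ``__amdgpu_feature_predicate_t`` definition above (not part of the
patch; the guarded builtin is only an example):

  void guarded(int x) {
    // OK: the guard relies on the contextual conversion provided by the
    // explicit operator bool().
    if (__builtin_amdgcn_processor_is("gfx1201"))
      __builtin_amdgcn_s_sleep_var(x);

    // Ill-formed: copy-initialisation would require an implicit conversion,
    // which the deleted constructors and explicit conversion rule out.
    // bool b = __builtin_amdgcn_processor_is("gfx1201");
  }
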
diff --git a/clang/include/clang/Basic/AMDGPUTypes.def b/clang/include/clang/Basic/AMDGPUTypes.def
index d3dff446f9edf..a0574c640184b 100644
--- a/clang/include/clang/Basic/AMDGPUTypes.def
+++ b/clang/include/clang/Basic/AMDGPUTypes.def
@@ -20,10 +20,18 @@
AMDGPU_TYPE(Name, Id, SingletonId, Width, Align)
#endif
+#ifndef AMDGPU_FEATURE_PREDICATE_TYPE
+#define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align) \
+ AMDGPU_TYPE(Name, Id, SingletonId, Width, Align)
+#endif
+
AMDGPU_OPAQUE_PTR_TYPE("__amdgpu_buffer_rsrc_t", AMDGPUBufferRsrc, AMDGPUBufferRsrcTy, 128, 128, 8)
AMDGPU_NAMED_BARRIER_TYPE("__amdgpu_named_workgroup_barrier_t", AMDGPUNamedWorkgroupBarrier, AMDGPUNamedWorkgroupBarrierTy, 128, 32, 0)
+AMDGPU_FEATURE_PREDICATE_TYPE("__amdgpu_feature_predicate_t", AMDGPUFeaturePredicate, AMDGPUFeaturePredicateTy, 1, 1)
+
#undef AMDGPU_TYPE
#undef AMDGPU_OPAQUE_PTR_TYPE
#undef AMDGPU_NAMED_BARRIER_TYPE
+#undef AMDGPU_FEATURE_PREDICATE_TYPE
diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def
index 48437c9397570..27f78af16fe06 100644
--- a/clang/include/clang/Basic/Builtins.def
+++ b/clang/include/clang/Basic/Builtins.def
@@ -34,6 +34,7 @@
// Q -> target builtin type, followed by a character to distinguish the builtin type
// Qa -> AArch64 svcount_t builtin type.
// Qb -> AMDGPU __amdgpu_buffer_rsrc_t builtin type.
+// Qc -> AMDGPU __amdgpu_feature_predicate_t builtin type.
// E -> ext_vector, followed by the number of elements and the base type.
// X -> _Complex, followed by the base type.
// Y -> ptrdiff_t
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 3d53223e3a5a4..b57b315b87790 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -351,8 +351,8 @@ BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n")
// These are special FE only builtins intended for forwarding the requirements
// to the ME.
-BUILTIN(__builtin_amdgcn_processor_is, "vcC*", "nctu")
-BUILTIN(__builtin_amdgcn_is_invocable, "v", "nctu")
+BUILTIN(__builtin_amdgcn_processor_is, "QccC*", "nctu")
+BUILTIN(__builtin_amdgcn_is_invocable, "Qc", "nctu")
//===----------------------------------------------------------------------===//
// R600-NI only builtins.
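
As an aside (annotation added for this review, not part of the patch), the new
prototype strings decode as follows, per the ``Q`` convention documented in
Builtins.def above:

  BUILTIN(__builtin_amdgcn_processor_is, "QccC*", "nctu")
  //                                      ^^ ^^^
  //                                      |  `--- one parameter: 'cC*' == const char *
  //                                      `------ return type: 'Qc' == __amdgpu_feature_predicate_t

  BUILTIN(__builtin_amdgcn_is_invocable, "Qc", "nctu")
  //                                      ^^
  //                                      `------ return type only; the builtin's
  //                                              argument is handled via custom
  //                                              type checking in Sema
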
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 5f118d744a6cf..e92e8cdee4b63 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -6820,7 +6820,7 @@ def err_counted_by_on_incomplete_type_on_use : Error <
def note_counted_by_consider_completing_pointee_ty : Note<
"consider providing a complete definition for %0">;
-
+
def note_counted_by_consider_using_sized_by : Note<
"consider using '__sized_by%select{|_or_null}0' instead of "
"'__counted_by%select{|_or_null}0'">;
@@ -13292,4 +13292,11 @@ def err_amdgcn_is_invocable_arg_invalid_value
: Error<"the argument to __builtin_amdgcn_is_invocable must be either a "
"target agnostic builtin or an AMDGCN target specific builtin; `%0`"
" is not valid">;
+def err_amdgcn_predicate_type_is_not_constructible
+ : Error<"%0 has type __amdgpu_feature_predicate_t, which is not"
+ " constructible">;
+def err_amdgcn_predicate_type_needs_explicit_bool_cast
+ : Error<"%0 must be explicitly cast to %1; however, please note that this "
+ "is almost always an error and that it prevents the effective "
+ "guarding of target dependent code, and thus should be avoided">;
} // end of sema component.
diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h
index d62c9bb65fadb..843a146243eae 100644
--- a/clang/include/clang/Sema/SemaAMDGPU.h
+++ b/clang/include/clang/Sema/SemaAMDGPU.h
@@ -64,6 +64,10 @@ class SemaAMDGPU : public SemaBase {
void handleAMDGPUNumVGPRAttr(Decl *D, const ParsedAttr &AL);
void handleAMDGPUMaxNumWorkGroupsAttr(Decl *D, const ParsedAttr &AL);
void handleAMDGPUFlatWorkGroupSizeAttr(Decl *D, const ParsedAttr &AL);
+
+ /// Expand a valid use of the feature identification builtins into its
+ /// corresponding sequence of instructions.
+ Expr *ExpandAMDGPUPredicateBI(CallExpr *CE);
};
} // namespace clang
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index ae136ae271882..28bdb1d90bbbd 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -1477,7 +1477,12 @@ void ASTContext::InitBuiltinTypes(const TargetInfo &Target,
}
if (Target.getTriple().isAMDGPU() ||
- (AuxTarget && AuxTarget->getTriple().isAMDGPU())) {
+ (Target.getTriple().isSPIRV() &&
+ Target.getTriple().getVendor() == llvm::Triple::AMD) ||
+ (AuxTarget &&
+ (AuxTarget->getTriple().isAMDGPU() ||
+ ((AuxTarget->getTriple().isSPIRV() &&
+ AuxTarget->getTriple().getVendor() == llvm::Triple::AMD))))) {
#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \
InitBuiltinType(SingletonId, BuiltinType::Id);
#include "clang/Basic/AMDGPUTypes.def"
@@ -12379,6 +12384,10 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context,
Type = Context.AMDGPUBufferRsrcTy;
break;
}
+ case 'c': {
+ Type = Context.AMDGPUFeaturePredicateTy;
+ break;
+ }
default:
llvm_unreachable("Unexpected target builtin type");
}
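The widened check accepts native AMDGPU triples, AMD-vendored SPIR-V, and either of the two as the aux target. Purely as an illustration of the intent (the patch keeps the condition inline), it amounts to calling a hypothetical helper along these lines on the main target triple and, when present, on the aux target triple:

    #include "llvm/TargetParser/Triple.h"

    // Sketch only; not part of the patch.
    static bool isAMDGPUFlavoured(const llvm::Triple &T) {
      return T.isAMDGPU() ||
             (T.isSPIRV() && T.getVendor() == llvm::Triple::AMD);
    }

The same condition is repeated verbatim in Sema::Initialize further below.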
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index f3ec498d4064b..c68b1ce1f643d 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -919,6 +919,13 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) {
DBuilder.createBasicType(Name, Width, llvm::dwarf::DW_ATE_unsigned); \
return SingletonId; \
}
+#define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align) \
+ case BuiltinType::Id: { \
+ if (!SingletonId) \
+ SingletonId = \
+ DBuilder.createBasicType(Name, Width, llvm::dwarf::DW_ATE_boolean); \
+ return SingletonId; \
+ }
#include "clang/Basic/AMDGPUTypes.def"
case BuiltinType::UChar:
case BuiltinType::Char_U:
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 15a6177746403..ad543b8f713b4 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -980,6 +980,10 @@ Value *ScalarExprEmitter::EmitConversionToBool(Value *Src, QualType SrcType) {
if (const MemberPointerType *MPT = dyn_cast<MemberPointerType>(SrcType))
return CGF.CGM.getCXXABI().EmitMemberPointerIsNotNull(CGF, Src, MPT);
+ // The conversion is a no-op and will be completed when CodeGen'ing the builtin.
+ if (SrcType == CGF.getContext().AMDGPUFeaturePredicateTy)
+ return Src;
+
assert((SrcType->isIntegerType() || isa<llvm::PointerType>(Src->getType())) &&
"Unknown scalar type to convert");
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
index d1b292f23c2d2..61013242d3a08 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -584,6 +584,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
case BuiltinType::Id: \
return llvm::TargetExtType::get(getLLVMContext(), "amdgcn.named.barrier", \
{}, {Scope});
+#define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align) \
+ case BuiltinType::Id: \
+ return llvm::IntegerType::getInt1Ty(getLLVMContext());
#include "clang/Basic/AMDGPUTypes.def"
#define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
#include "clang/Basic/HLSLIntangibleTypes.def"
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 1901d19b14dfc..c4ed83cc8d50a 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -546,8 +546,13 @@ void Sema::Initialize() {
}
if (Context.getTargetInfo().getTriple().isAMDGPU() ||
+ (Context.getTargetInfo().getTriple().isSPIRV() &&
+ Context.getTargetInfo().getTriple().getVendor() == llvm::Triple::AMD) ||
(Context.getAuxTargetInfo() &&
- Context.getAuxTargetInfo()->getTriple().isAMDGPU())) {
+ (Context.getAuxTargetInfo()->getTriple().isAMDGPU() ||
+ (Context.getAuxTargetInfo()->getTriple().isSPIRV() &&
+ Context.getAuxTargetInfo()->getTriple().getVendor() ==
+ llvm::Triple::AMD)))) {
#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \
addImplicitTypedef(Name, Context.SingletonId);
#include "clang/Basic/AMDGPUTypes.def"
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index a6366aceec2a6..7bf88c5c6a9a0 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -12,6 +12,7 @@
#include "clang/Sema/SemaAMDGPU.h"
#include "clang/Basic/DiagnosticSema.h"
+#include "clang/Basic/TargetInfo.h"
#include "clang/Basic/TargetBuiltins.h"
#include "clang/Sema/Ownership.h"
#include "clang/Sema/Sema.h"
@@ -366,4 +367,63 @@ void SemaAMDGPU::handleAMDGPUMaxNumWorkGroupsAttr(Decl *D,
addAMDGPUMaxNumWorkGroupsAttr(D, AL, AL.getArgAsExpr(0), YExpr, ZExpr);
}
+Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) {
+ auto &Ctx = getASTContext();
+ auto BoolTy = Ctx.getLogicalOperationType();
+ auto False = llvm::APInt::getZero(Ctx.getIntWidth(BoolTy));
+ auto True = llvm::APInt::getAllOnes(Ctx.getIntWidth(BoolTy));
+ auto Loc = CE->getExprLoc();
+
+ if (!CE->getBuiltinCallee())
+ return IntegerLiteral::Create(Ctx, False, BoolTy, Loc);
+
+ auto P = false;
+ auto BI = CE->getBuiltinCallee();
+ if (Ctx.BuiltinInfo.isAuxBuiltinID(BI))
+ BI = Ctx.BuiltinInfo.getAuxBuiltinID(BI);
+
+ if (BI == AMDGPU::BI__builtin_amdgcn_processor_is) {
+ auto *GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts());
+ if (!GFX) {
+ Diag(Loc, diag::err_amdgcn_processor_is_arg_not_literal);
+ return nullptr;
+ }
+
+ auto N = GFX->getString();
+ if (!Ctx.getTargetInfo().isValidCPUName(N) &&
+ (!Ctx.getAuxTargetInfo() ||
+ !Ctx.getAuxTargetInfo()->isValidCPUName(N))) {
+ Diag(Loc, diag::err_amdgcn_processor_is_arg_invalid_value) << N;
+ return nullptr;
+ }
+ if (Ctx.getTargetInfo().getTriple().isSPIRV()) {
+ CE->setType(BoolTy);
+ return CE;
+ }
+
+ if (auto TID = Ctx.getTargetInfo().getTargetID())
+ P = TID->find(N) == 0;
+ } else {
+ auto *Arg = CE->getArg(0);
+ if (!Arg || Arg->getType() != Ctx.BuiltinFnTy) {
+ Diag(Loc, diag::err_amdgcn_is_invocable_arg_invalid_value) << Arg;
+ return nullptr;
+ }
+
+ if (Ctx.getTargetInfo().getTriple().isSPIRV()) {
+ CE->setType(BoolTy);
+ return CE;
+ }
+
+ auto *FD = cast<FunctionDecl>(Arg->getReferencedDeclOfCallee());
+
+ StringRef RF = Ctx.BuiltinInfo.getRequiredFeatures(FD->getBuiltinID());
+ llvm::StringMap<bool> CF;
+ Ctx.getFunctionFeatureMap(CF, FD);
+
+ P = Builtin::evaluateRequiredTargetFeatures(RF, CF);
+ }
+
+ return IntegerLiteral::Create(Ctx, P ? True : False, BoolTy, Loc);
+}
} // namespace clang
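In short, ExpandAMDGPUPredicateBI either folds a predicate at Sema time, using the target ID and feature map of a concrete amdgcn target, or, for AMDGCN-flavoured SPIR-V, retypes the call to bool and defers resolution to the middle end. A minimal sketch of the observable behaviour, assuming -triple amdgcn-amd-amdhsa -target-cpu gfx906:

    void expansion_sketch() {
      // Folded by Sema to the boolean literal 'true' when targeting gfx906 ...
      if (__builtin_amdgcn_processor_is("gfx906"))
        __builtin_amdgcn_s_sleep(1);
      // ... and to 'false' for any other processor identifier.
      if (__builtin_amdgcn_processor_is("gfx1201"))
        __builtin_amdgcn_s_sleep(1);
      // When targeting spirv64-amd-amdhsa, both conditions are instead retyped
      // to bool and expanded later by AMDGPUExpandPseudoIntrinsics.
    }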
diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp
index 14e16bc39eb3a..2a6f167296239 100644
--- a/clang/lib/Sema/SemaCast.cpp
+++ b/clang/lib/Sema/SemaCast.cpp
@@ -23,6 +23,7 @@
#include "clang/Basic/TargetInfo.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Sema/Initialization.h"
+#include "clang/Sema/SemaAMDGPU.h"
#include "clang/Sema/SemaHLSL.h"
#include "clang/Sema/SemaObjC.h"
#include "clang/Sema/SemaRISCV.h"
@@ -1563,6 +1564,14 @@ static TryCastResult TryStaticCast(Sema &Self, ExprResult &SrcExpr,
return TC_Success;
}
+ if (SrcType == Self.Context.AMDGPUFeaturePredicateTy &&
+ DestType == Self.Context.getLogicalOperationType()) {
+ SrcExpr =
+ Self.AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast<CallExpr>(SrcExpr.get()));
+ Kind = CK_NoOp;
+ return TC_Success;
+ }
+
// We tried everything. Everything! Nothing works! :-(
return TC_NotApplicable;
}
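This is the deliberate, if discouraged, escape hatch that err_amdgcn_predicate_type_needs_explicit_bool_cast points users towards: an explicit cast to bool is expanded here rather than rejected. A minimal sketch, assuming an AMDGCN target:

    // Accepted: TryStaticCast expands the predicate to bool (CK_NoOp), whereas
    // the equivalent implicit conversion is diagnosed.
    bool explicit_cast_sketch() {
      return static_cast<bool>(
          __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var));
    }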
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 63937ddc3e386..89e49645863c9 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -13617,6 +13617,15 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
return;
}
+ // __amdgpu_feature_predicate_t cannot be initialised
+ if (VDecl->getType().getDesugaredType(Context) ==
+ Context.AMDGPUFeaturePredicateTy) {
+ Diag(VDecl->getLocation(),
+ diag::err_amdgcn_predicate_type_is_not_constructible) << VDecl;
+ VDecl->setInvalidDecl();
+ return;
+ }
+
// WebAssembly tables can't be used to initialise a variable.
if (!Init->getType().isNull() && Init->getType()->isWebAssemblyTableType()) {
Diag(Init->getExprLoc(), diag::err_wasm_table_art) << 0;
@@ -14151,6 +14160,12 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) {
if (VarDecl *Var = dyn_cast<VarDecl>(RealDecl)) {
QualType Type = Var->getType();
+ if (Type.getDesugaredType(Context) == Context.AMDGPUFeaturePredicateTy) {
+ Diag(Var->getLocation(),
+ diag::err_amdgcn_predicate_type_is_not_constructible) << Var;
+ Var->setInvalidDecl();
+ return;
+ }
// C++1z [dcl.dcl]p1 grammar implies that an initializer is mandatory.
if (isa<DecompositionDecl>(RealDecl)) {
Diag(Var->getLocation(), diag::err_decomp_decl_requires_init) << Var;
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 7e36efa727072..99fdcc89429a5 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -50,6 +50,7 @@
#include "clang/Sema/ParsedTemplate.h"
#include "clang/Sema/Scope.h"
#include "clang/Sema/ScopeInfo.h"
+#include "clang/Sema/SemaAMDGPU.h"
#include "clang/Sema/SemaCUDA.h"
#include "clang/Sema/SemaFixItUtils.h"
#include "clang/Sema/SemaHLSL.h"
@@ -6556,7 +6557,8 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc,
if (FD->getName() == "__builtin_amdgcn_is_invocable") {
auto FnPtrTy = Context.getPointerType(FD->getType());
auto *R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get();
- return CallExpr::Create(Context, R, ArgExprs, Context.VoidTy,
+ return CallExpr::Create(Context, R, ArgExprs,
+ Context.AMDGPUFeaturePredicateTy,
ExprValueKind::VK_PRValue, RParenLoc,
FPOptionsOverride());
}
@@ -13365,20 +13367,6 @@ inline QualType Sema::CheckBitwiseOperands(ExprResult &LHS, ExprResult &RHS,
return InvalidOperands(Loc, LHS, RHS);
}
-static inline bool IsAMDGPUPredicateBI(Expr *E) {
- if (!E->getType()->isVoidType())
- return false;
-
- if (auto *CE = dyn_cast<CallExpr>(E)) {
- if (auto *BI = CE->getDirectCallee())
- if (BI->getName() == "__builtin_amdgcn_processor_is" ||
- BI->getName() == "__builtin_amdgcn_is_invocable")
- return true;
- }
-
- return false;
-}
-
// C99 6.5.[13,14]
inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS,
SourceLocation Loc,
@@ -13474,9 +13462,6 @@ inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS,
// The following is safe because we only use this method for
// non-overloadable operands.
- if (IsAMDGPUPredicateBI(LHS.get()) && IsAMDGPUPredicateBI(RHS.get()))
- return Context.VoidTy;
-
// C++ [expr.log.and]p1
// C++ [expr.log.or]p1
// The operands are both contextually converted to type bool.
@@ -15706,37 +15691,6 @@ static bool isOverflowingIntegerType(ASTContext &Ctx, QualType T) {
return Ctx.getIntWidth(T) >= Ctx.getIntWidth(Ctx.IntTy);
}
-static Expr *ExpandAMDGPUPredicateBI(ASTContext &Ctx, CallExpr *CE) {
- if (!CE->getBuiltinCallee())
- return CXXBoolLiteralExpr::Create(Ctx, false, Ctx.BoolTy, CE->getExprLoc());
-
- if (Ctx.getTargetInfo().getTriple().isSPIRV()) {
- CE->setType(Ctx.getLogicalOperationType());
- return CE;
- }
-
- bool P = false;
- auto &TI = Ctx.getTargetInfo();
-
- if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") {
- auto *GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts());
- auto TID = TI.getTargetID();
- if (GFX && TID) {
- auto N = GFX->getString();
- P = TI.isValidCPUName(GFX->getString()) && TID->find(N) == 0;
- }
- } else {
- auto *FD = cast<FunctionDecl>(CE->getArg(0)->getReferencedDeclOfCallee());
-
- StringRef RF = Ctx.BuiltinInfo.getRequiredFeatures(FD->getBuiltinID());
- llvm::StringMap<bool> CF;
- Ctx.getFunctionFeatureMap(CF, FD);
-
- P = Builtin::evaluateRequiredTargetFeatures(RF, CF);
- }
-
- return CXXBoolLiteralExpr::Create(Ctx, P, Ctx.BoolTy, CE->getExprLoc());
-}
ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc,
UnaryOperatorKind Opc, Expr *InputExpr,
@@ -15915,7 +15869,9 @@ ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc,
// Vector logical not returns the signed variant of the operand type.
resultType = GetSignedVectorType(resultType);
break;
- } else if (IsAMDGPUPredicateBI(InputExpr)) {
+ } else if (resultType == Context.AMDGPUFeaturePredicateTy) {
+ resultType = Context.getLogicalOperationType();
+ Input = AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast<CallExpr>(InputExpr));
break;
} else {
return ExprError(Diag(OpLoc, diag::err_typecheck_unary_expr)
@@ -20661,88 +20617,6 @@ void Sema::DiagnoseEqualityWithExtraParens(ParenExpr *ParenE) {
}
}
-static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) {
- if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") {
- auto *GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts());
- if (!GFX) {
- Sema.Diag(CE->getExprLoc(),
- diag::err_amdgcn_processor_is_arg_not_literal);
- return false;
- }
- auto N = GFX->getString();
- if (!Sema.getASTContext().getTargetInfo().isValidCPUName(N) &&
- (!Sema.getASTContext().getAuxTargetInfo() ||
- !Sema.getASTContext().getAuxTargetInfo()->isValidCPUName(N))) {
- Sema.Diag(CE->getExprLoc(),
- diag::err_amdgcn_processor_is_arg_invalid_value)
- << N;
- return false;
- }
- } else {
- auto *Arg = CE->getArg(0);
- if (!Arg || Arg->getType() != Sema.getASTContext().BuiltinFnTy) {
- Sema.Diag(CE->getExprLoc(),
- diag::err_amdgcn_is_invocable_arg_invalid_value)
- << Arg;
- return false;
- }
- }
-
- return true;
-}
-
-static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) {
- if (auto *UO = dyn_cast<UnaryOperator>(E)) {
- auto *SE = dyn_cast<CallExpr>(UO->getSubExpr());
- if (IsAMDGPUPredicateBI(SE)) {
- assert(UO->getOpcode() == UnaryOperator::Opcode::UO_LNot &&
- "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
- "can only be used as operands of logical ops!");
-
- if (!ValidateAMDGPUPredicateBI(Sema, SE)) {
- Invalid = true;
- return nullptr;
- }
-
- UO->setSubExpr(ExpandAMDGPUPredicateBI(Sema.getASTContext(), SE));
- UO->setType(Sema.getASTContext().getLogicalOperationType());
-
- return UO;
- }
- }
- if (auto *BO = dyn_cast<BinaryOperator>(E)) {
- auto *LHS = dyn_cast<CallExpr>(BO->getLHS());
- auto *RHS = dyn_cast<CallExpr>(BO->getRHS());
- if (IsAMDGPUPredicateBI(LHS) && IsAMDGPUPredicateBI(RHS)) {
- assert(BO->isLogicalOp() &&
- "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
- "can only be used as operands of logical ops!");
-
- if (!ValidateAMDGPUPredicateBI(Sema, LHS) ||
- !ValidateAMDGPUPredicateBI(Sema, RHS)) {
- Invalid = true;
- return nullptr;
- }
-
- BO->setLHS(ExpandAMDGPUPredicateBI(Sema.getASTContext(), LHS));
- BO->setRHS(ExpandAMDGPUPredicateBI(Sema.getASTContext(), RHS));
- BO->setType(Sema.getASTContext().getLogicalOperationType());
-
- return BO;
- }
- }
- if (auto *CE = dyn_cast<CallExpr>(E))
- if (IsAMDGPUPredicateBI(CE)) {
- if (!ValidateAMDGPUPredicateBI(Sema, CE)) {
- Invalid = true;
- return nullptr;
- }
- return ExpandAMDGPUPredicateBI(Sema.getASTContext(), CE);
- }
-
- return nullptr;
-}
-
ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E,
bool IsConstexpr) {
DiagnoseAssignmentAsCondition(E);
@@ -20754,13 +20628,8 @@ ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E,
E = result.get();
if (!E->isTypeDependent()) {
- if (E->getType()->isVoidType()) {
- bool InvalidPredicate = false;
- if (auto *BIC = MaybeHandleAMDGPUPredicateBI(*this, E, InvalidPredicate))
- return BIC;
- else if (InvalidPredicate)
- return ExprError();
- }
+ if (E->getType() == Context.AMDGPUFeaturePredicateTy)
+ return AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast_or_null<CallExpr>(E));
if (getLangOpts().CPlusPlus)
return CheckCXXBooleanCondition(E, IsConstexpr); // C++ 6.4p4
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index e5670dab03cb0..4e6feb871b725 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -9103,6 +9103,15 @@ bool InitializationSequence::Diagnose(Sema &S,
case FK_ConversionFailed: {
QualType FromType = OnlyArg->getType();
+ // __amdgpu_feature_predicate_t can be explicitly cast to the logical op
+ // type, although this is almost always an error and we advise against it
+ if (FromType == S.Context.AMDGPUFeaturePredicateTy &&
+ DestType == S.Context.getLogicalOperationType()) {
+ S.Diag(OnlyArg->getExprLoc(),
+ diag::err_amdgcn_predicate_type_needs_explicit_bool_cast)
+ << OnlyArg << DestType;
+ break;
+ }
PartialDiagnostic PDiag = S.PDiag(diag::err_init_conversion_failed)
<< (int)Entity.getKind()
<< DestType
@@ -9907,6 +9916,13 @@ Sema::PerformCopyInitialization(const InitializedEntity &Entity,
if (EqualLoc.isInvalid())
EqualLoc = InitE->getBeginLoc();
+ if (Entity.getType().getDesugaredType(Context) ==
+ Context.AMDGPUFeaturePredicateTy) {
+ Diag(EqualLoc, diag::err_amdgcn_predicate_type_is_not_constructible)
+ << Entity.getDecl();
+ return ExprError();
+ }
+
InitializationKind Kind = InitializationKind::CreateCopy(
InitE->getBeginLoc(), EqualLoc, AllowExplicit);
InitializationSequence Seq(*this, Entity, Kind, InitE, TopLevelOfInitList);
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index d3ee9989c73ed..39693055c2106 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -30,6 +30,7 @@
#include "clang/Sema/Initialization.h"
#include "clang/Sema/Lookup.h"
#include "clang/Sema/Overload.h"
+#include "clang/Sema/SemaAMDGPU.h"
#include "clang/Sema/SemaCUDA.h"
#include "clang/Sema/SemaObjC.h"
#include "clang/Sema/Template.h"
@@ -6137,12 +6138,13 @@ TryContextuallyConvertToBool(Sema &S, Expr *From) {
ExprResult Sema::PerformContextuallyConvertToBool(Expr *From) {
if (checkPlaceholderForOverload(*this, From))
return ExprError();
+ if (From->getType() == Context.AMDGPUFeaturePredicateTy)
+ return AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast<CallExpr>(From));
ImplicitConversionSequence ICS = TryContextuallyConvertToBool(*this, From);
if (!ICS.isBad())
return PerformImplicitConversion(From, Context.BoolTy, ICS,
AssignmentAction::Converting);
-
if (!DiagnoseMultipleUserDefinedConversion(From, Context.BoolTy))
return Diag(From->getBeginLoc(), diag::err_typecheck_bool_condition)
<< From->getType() << From->getSourceRange();
@@ -11921,6 +11923,16 @@ static void DiagnoseBadConversion(Sema &S, OverloadCandidate *Cand,
if (TakingCandidateAddress && !checkAddressOfCandidateIsAvailable(S, Fn))
return;
+ // __amdgpu_feature_predicate_t can be explicitly cast to the logical op type,
+ // although this is almost always an error and we advise against it.
+ if (FromTy == S.Context.AMDGPUFeaturePredicateTy &&
+ ToTy == S.Context.getLogicalOperationType()) {
+ S.Diag(Conv.Bad.FromExpr->getExprLoc(),
+ diag::err_amdgcn_predicate_type_needs_explicit_bool_cast)
+ << Conv.Bad.FromExpr << ToTy;
+ return;
+ }
+
// Emit the generic diagnostic and, optionally, add the hints to it.
PartialDiagnostic FDiag = S.PDiag(diag::note_ovl_candidate_bad_conv);
FDiag << (unsigned)FnKindPair.first << (unsigned)FnKindPair.second << FnDesc
diff --git a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
index 26cc8b4c7631d..43d657d25d013 100644
--- a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
+++ b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
@@ -1,29 +1,29 @@
// RUN: not %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - 2>&1 | FileCheck %s
// RUN: not %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - 2>&1 | FileCheck %s
-bool predicate(bool x) { return x; }
+bool predicate(bool x);
+void pass_by_value(__amdgpu_feature_predicate_t x);
-void invalid_uses(int* p, int x, bool (*pfn)(bool)) {
- // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void'
+void invalid_uses(int *p, int x, const __amdgpu_feature_predicate_t &lv,
+ __amdgpu_feature_predicate_t &&rv) {
+ // CHECK: error: 'a' has type __amdgpu_feature_predicate_t, which is not constructible
+ __amdgpu_feature_predicate_t a;
+ // CHECK: error: 'b' has type __amdgpu_feature_predicate_t, which is not constructible
+ __amdgpu_feature_predicate_t b = __builtin_amdgcn_processor_is("gfx906");
+ // CHECK: error: 'c' has type __amdgpu_feature_predicate_t, which is not constructible
+ __amdgpu_feature_predicate_t c = lv;
+ // CHECK: error: 'd' has type __amdgpu_feature_predicate_t, which is not constructible
+ __amdgpu_feature_predicate_t d = rv;
+ // CHECK: error: '__builtin_amdgcn_processor_is("gfx906")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided
bool invalid_use_in_init_0 = __builtin_amdgcn_processor_is("gfx906");
- // CHECK: error: cannot initialize a variable of type 'const bool' with an rvalue of type 'void'
- const bool invalid_use_in_init_1 = !__builtin_amdgcn_processor_is("gfx906");
- // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void'
- bool invalid_use_in_init_2 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
- // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void'
- bool invalid_use_in_init_3 = !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
- // CHECK: error: variable has incomplete type 'const void'
- const auto invalid_use_in_init_4 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready) || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
- // CHECK: error: variable has incomplete type 'const void'
- const auto invalid_use_in_init_5 = __builtin_amdgcn_processor_is("gfx906") || __builtin_amdgcn_processor_is("gfx900");
- // CHECK: error: variable has incomplete type 'const void'
- const auto invalid_use_in_init_6 = __builtin_amdgcn_processor_is("gfx906") || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep);
- // CHECK: error: value of type 'void' is not contextually convertible to 'bool'
- __builtin_amdgcn_processor_is("gfx1201")
- ? __builtin_amdgcn_s_sleep_var(x) : __builtin_amdgcn_s_sleep(42);
- // CHECK: error: no matching function for call to 'predicate'
+ // CHECK: error: 'x' has type __amdgpu_feature_predicate_t, which is not constructible
+ pass_by_value(__builtin_amdgcn_processor_is("gfx906"));
+ // CHECK: error: '__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided
+ bool invalid_use_in_init_1 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+ // CHECK: error: '__builtin_amdgcn_processor_is("gfx906")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided
+ if (bool invalid_use_in_init_2 = __builtin_amdgcn_processor_is("gfx906")) return;
+ // CHECK: error: '__builtin_amdgcn_processor_is("gfx1200")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided
if (predicate(__builtin_amdgcn_processor_is("gfx1200"))) __builtin_amdgcn_s_sleep_var(x);
- // CHECK: note: candidate function not viable: cannot convert argument of incomplete type 'void' to 'bool' for 1st argument
}
void invalid_invocations(int x, const char* str) {
@@ -31,7 +31,6 @@ void invalid_invocations(int x, const char* str) {
if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return;
// CHECK: error: the argument to __builtin_amdgcn_processor_is must be a string literal
if (__builtin_amdgcn_processor_is(str)) return;
-
// CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}__builtin_amdgcn_s_sleep_var{{.*}} is not valid
if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return;
// CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}str{{.*}} is not valid
>From 716cc1fe760b9a56655a3334c333876dc2b0bfb3 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Tue, 6 May 2025 13:02:25 +0100
Subject: [PATCH 15/17] Fix formatting.
---
clang/lib/Sema/Sema.cpp | 2 +-
clang/lib/Sema/SemaAMDGPU.cpp | 2 +-
clang/lib/Sema/SemaCast.cpp | 4 ++--
clang/lib/Sema/SemaDecl.cpp | 8 +++++---
clang/lib/Sema/SemaExpr.cpp | 7 +++----
clang/lib/Sema/SemaInit.cpp | 2 +-
clang/lib/Sema/SemaOverload.cpp | 2 +-
7 files changed, 14 insertions(+), 13 deletions(-)
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index c4ed83cc8d50a..3e55b5da3c027 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -552,7 +552,7 @@ void Sema::Initialize() {
(Context.getAuxTargetInfo()->getTriple().isAMDGPU() ||
(Context.getAuxTargetInfo()->getTriple().isSPIRV() &&
Context.getAuxTargetInfo()->getTriple().getVendor() ==
- llvm::Triple::AMD)))) {
+ llvm::Triple::AMD)))) {
#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \
addImplicitTypedef(Name, Context.SingletonId);
#include "clang/Basic/AMDGPUTypes.def"
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 7bf88c5c6a9a0..df4b3237a7844 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -12,8 +12,8 @@
#include "clang/Sema/SemaAMDGPU.h"
#include "clang/Basic/DiagnosticSema.h"
-#include "clang/Basic/TargetInfo.h"
#include "clang/Basic/TargetBuiltins.h"
+#include "clang/Basic/TargetInfo.h"
#include "clang/Sema/Ownership.h"
#include "clang/Sema/Sema.h"
#include "llvm/Support/AtomicOrdering.h"
diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp
index 2a6f167296239..8d47b2747f47d 100644
--- a/clang/lib/Sema/SemaCast.cpp
+++ b/clang/lib/Sema/SemaCast.cpp
@@ -1566,8 +1566,8 @@ static TryCastResult TryStaticCast(Sema &Self, ExprResult &SrcExpr,
if (SrcType == Self.Context.AMDGPUFeaturePredicateTy &&
DestType == Self.Context.getLogicalOperationType()) {
- SrcExpr =
- Self.AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast<CallExpr>(SrcExpr.get()));
+ SrcExpr = Self.AMDGPU().ExpandAMDGPUPredicateBI(
+ dyn_cast<CallExpr>(SrcExpr.get()));
Kind = CK_NoOp;
return TC_Success;
}
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 89e49645863c9..f932b069479c7 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -13619,9 +13619,10 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
// __amdgpu_feature_predicate_t cannot be initialised
if (VDecl->getType().getDesugaredType(Context) ==
- Context.AMDGPUFeaturePredicateTy) {
+ Context.AMDGPUFeaturePredicateTy) {
Diag(VDecl->getLocation(),
- diag::err_amdgcn_predicate_type_is_not_constructible) << VDecl;
+ diag::err_amdgcn_predicate_type_is_not_constructible)
+ << VDecl;
VDecl->setInvalidDecl();
return;
}
@@ -14162,7 +14163,8 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) {
if (Type.getDesugaredType(Context) == Context.AMDGPUFeaturePredicateTy) {
Diag(Var->getLocation(),
- diag::err_amdgcn_predicate_type_is_not_constructible) << Var;
+ diag::err_amdgcn_predicate_type_is_not_constructible)
+ << Var;
Var->setInvalidDecl();
return;
}
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 99fdcc89429a5..8247f3da58280 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -6557,10 +6557,9 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc,
if (FD->getName() == "__builtin_amdgcn_is_invocable") {
auto FnPtrTy = Context.getPointerType(FD->getType());
auto *R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get();
- return CallExpr::Create(Context, R, ArgExprs,
- Context.AMDGPUFeaturePredicateTy,
- ExprValueKind::VK_PRValue, RParenLoc,
- FPOptionsOverride());
+ return CallExpr::Create(
+ Context, R, ArgExprs, Context.AMDGPUFeaturePredicateTy,
+ ExprValueKind::VK_PRValue, RParenLoc, FPOptionsOverride());
}
}
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 4e6feb871b725..dafd1eee196e8 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -9109,7 +9109,7 @@ bool InitializationSequence::Diagnose(Sema &S,
DestType == S.Context.getLogicalOperationType()) {
S.Diag(OnlyArg->getExprLoc(),
diag::err_amdgcn_predicate_type_needs_explicit_bool_cast)
- << OnlyArg << DestType;
+ << OnlyArg << DestType;
break;
}
PartialDiagnostic PDiag = S.PDiag(diag::err_init_conversion_failed)
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 39693055c2106..92e7d76d064c3 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -11929,7 +11929,7 @@ static void DiagnoseBadConversion(Sema &S, OverloadCandidate *Cand,
ToTy == S.Context.getLogicalOperationType()) {
S.Diag(Conv.Bad.FromExpr->getExprLoc(),
diag::err_amdgcn_predicate_type_needs_explicit_bool_cast)
- << Conv.Bad.FromExpr << ToTy;
+ << Conv.Bad.FromExpr << ToTy;
return;
}
>From 79035a9624ae3d769adb5eeb91f00081021f51cd Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Tue, 6 May 2025 20:09:38 +0100
Subject: [PATCH 16/17] Delete spurious whitespace.
---
clang/lib/Sema/SemaExpr.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 8247f3da58280..85a924f5b5805 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -15690,7 +15690,6 @@ static bool isOverflowingIntegerType(ASTContext &Ctx, QualType T) {
return Ctx.getIntWidth(T) >= Ctx.getIntWidth(Ctx.IntTy);
}
-
ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc,
UnaryOperatorKind Opc, Expr *InputExpr,
bool IsAfterAmp) {
>From 0f04dbc4ca49a627290b758db34654a0ad62601e Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Thu, 8 May 2025 00:53:21 +0100
Subject: [PATCH 17/17] Handle jumps into controlled sequences.
---
.../clang/Basic/DiagnosticSemaKinds.td | 2 +
clang/include/clang/Sema/SemaAMDGPU.h | 4 ++
clang/lib/Sema/JumpDiagnostics.cpp | 7 ++-
clang/lib/Sema/SemaAMDGPU.cpp | 14 +++--
.../amdgpu-feature-builtins-cant-jump.hip | 62 +++++++++++++++++++
5 files changed, 84 insertions(+), 5 deletions(-)
create mode 100644 clang/test/SemaHIP/amdgpu-feature-builtins-cant-jump.hip
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index f2604f052512f..14880adf8e4ad 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13312,4 +13312,6 @@ def err_amdgcn_predicate_type_needs_explicit_bool_cast
: Error<"%0 must be explicitly cast to %1; however, please note that this "
"is almost always an error and that it prevents the effective "
"guarding of target dependent code, and thus should be avoided">;
+def note_amdgcn_protected_by_predicate
+ : Note<"jump enters statement controlled by AMDGPU feature predicate">;
} // end of sema component.
diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h
index 843a146243eae..0d11d799946b5 100644
--- a/clang/include/clang/Sema/SemaAMDGPU.h
+++ b/clang/include/clang/Sema/SemaAMDGPU.h
@@ -15,12 +15,15 @@
#include "clang/AST/ASTFwd.h"
#include "clang/Sema/SemaBase.h"
+#include "llvm/ADT/SmallPtrSet.h"
namespace clang {
class AttributeCommonInfo;
+class Expr;
class ParsedAttr;
class SemaAMDGPU : public SemaBase {
+ llvm::SmallPtrSet<Expr *, 32> ExpandedPredicates;
public:
SemaAMDGPU(Sema &S);
@@ -68,6 +71,7 @@ class SemaAMDGPU : public SemaBase {
/// Expand a valid use of a feature identification builtin into its
/// corresponding boolean expression.
Expr *ExpandAMDGPUPredicateBI(CallExpr *CE);
+ bool IsPredicate(Expr *E) const;
};
} // namespace clang
diff --git a/clang/lib/Sema/JumpDiagnostics.cpp b/clang/lib/Sema/JumpDiagnostics.cpp
index a852a950b47f4..718d8b461805c 100644
--- a/clang/lib/Sema/JumpDiagnostics.cpp
+++ b/clang/lib/Sema/JumpDiagnostics.cpp
@@ -19,6 +19,7 @@
#include "clang/AST/StmtOpenACC.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/Basic/SourceLocation.h"
+#include "clang/Sema/SemaAMDGPU.h"
#include "clang/Sema/SemaInternal.h"
#include "llvm/ADT/BitVector.h"
using namespace clang;
@@ -367,8 +368,10 @@ void JumpScopeChecker::BuildScopeInformation(Stmt *S,
case Stmt::IfStmtClass: {
IfStmt *IS = cast<IfStmt>(S);
+ bool AMDGPUPredicate = false;
if (!(IS->isConstexpr() || IS->isConsteval() ||
- IS->isObjCAvailabilityCheck()))
+ IS->isObjCAvailabilityCheck() ||
+ (AMDGPUPredicate = this->S.AMDGPU().IsPredicate(IS->getCond()))))
break;
unsigned Diag = diag::note_protected_by_if_available;
@@ -376,6 +379,8 @@ void JumpScopeChecker::BuildScopeInformation(Stmt *S,
Diag = diag::note_protected_by_constexpr_if;
else if (IS->isConsteval())
Diag = diag::note_protected_by_consteval_if;
+ else if (AMDGPUPredicate)
+ Diag = diag::note_amdgcn_protected_by_predicate;
if (VarDecl *Var = IS->getConditionVariable())
BuildScopeInformation(Var, ParentScope);
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index df4b3237a7844..6833a2678c791 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -375,7 +375,8 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) {
auto Loc = CE->getExprLoc();
if (!CE->getBuiltinCallee())
- return IntegerLiteral::Create(Ctx, False, BoolTy, Loc);
+ return *ExpandedPredicates.insert(
+ IntegerLiteral::Create(Ctx, False, BoolTy, Loc)).first;
auto P = false;
auto BI = CE->getBuiltinCallee();
@@ -398,7 +399,7 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) {
}
if (Ctx.getTargetInfo().getTriple().isSPIRV()) {
CE->setType(BoolTy);
- return CE;
+ return *ExpandedPredicates.insert(CE).first;
}
if (auto TID = Ctx.getTargetInfo().getTargetID())
@@ -412,7 +413,7 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) {
if (Ctx.getTargetInfo().getTriple().isSPIRV()) {
CE->setType(BoolTy);
- return CE;
+ return *ExpandedPredicates.insert(CE).first;
}
auto *FD = cast<FunctionDecl>(Arg->getReferencedDeclOfCallee());
@@ -424,6 +425,11 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) {
P = Builtin::evaluateRequiredTargetFeatures(RF, CF);
}
- return IntegerLiteral::Create(Ctx, P ? True : False, BoolTy, Loc);
+ return *ExpandedPredicates.insert(
+ IntegerLiteral::Create(Ctx, P ? True : False, BoolTy, Loc)).first;
+}
+
+bool SemaAMDGPU::IsPredicate(Expr *E) const {
+ return ExpandedPredicates.contains(E);
}
} // namespace clang
diff --git a/clang/test/SemaHIP/amdgpu-feature-builtins-cant-jump.hip b/clang/test/SemaHIP/amdgpu-feature-builtins-cant-jump.hip
new file mode 100644
index 0000000000000..a7f1abcdcd8fe
--- /dev/null
+++ b/clang/test/SemaHIP/amdgpu-feature-builtins-cant-jump.hip
@@ -0,0 +1,62 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: spirv-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -target-cpu gfx900 -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -target-cpu gfx1201 -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ void f(int *ptr, int size, bool f) {
+ int i = 0;
+ if (f)
+ goto label; // expected-error {{cannot jump from this goto statement to its label}}
+
+ if (__builtin_amdgcn_processor_is("gfx900")) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}}
+ for (i = 0; i < size; ++i) {
+ label:
+ ptr[i] = i;
+ }
+ }
+}
+
+__device__ void g(int *ptr, int size, bool f) {
+ int i = 0;
+ if (f)
+ goto label; // expected-error {{cannot jump from this goto statement to its label}}
+
+ if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}}
+ for (i = 0; i < size; ++i) {
+ label:
+ ptr[i] = i;
+ }
+ }
+}
+
+__global__ void h(int *ptr, int size, bool f) {
+ int i = 0;
+ if (f)
+ goto label; // expected-error {{cannot jump from this goto statement to its label}}
+
+ if (__builtin_amdgcn_processor_is("gfx900")) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}}
+ for (i = 0; i < size; ++i) {
+ label:
+ ptr[i] = i;
+ }
+ }
+}
+
+__global__ void i(int *ptr, int size, bool f) {
+ int i = 0;
+ if (f)
+ goto label; // expected-error {{cannot jump from this goto statement to its label}}
+
+ if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}}
+ for (i = 0; i < size; ++i) {
+ label:
+ ptr[i] = i;
+ }
+ }
+}