[clang] [llvm] [SPIRV][AMDGPU][clang][CodeGen][opt] Add late-resolved feature identifying predicates (PR #134016)

Alex Voicu via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 9 16:48:06 PST 2026


https://github.com/AlexVlx updated https://github.com/llvm/llvm-project/pull/134016

>From 91eeaf02336e539f14dcb0a79ff15dbe8befe6f1 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 02:47:42 +0100
Subject: [PATCH 01/69] Add the functional identity and feature queries.

---
 clang/docs/LanguageExtensions.rst             | 110 ++++++
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   5 +
 .../clang/Basic/DiagnosticSemaKinds.td        |  10 +
 clang/lib/Basic/Targets/SPIR.cpp              |   4 +
 clang/lib/Basic/Targets/SPIR.h                |   4 +
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp   |  29 ++
 clang/lib/Sema/SemaExpr.cpp                   | 157 ++++++++
 clang/test/CodeGen/amdgpu-builtin-cpu-is.c    |  65 ++++
 .../CodeGen/amdgpu-builtin-is-invocable.c     |  64 ++++
 .../amdgpu-feature-builtins-invalid-use.cpp   |  43 +++
 llvm/lib/Target/AMDGPU/AMDGPU.h               |   9 +
 .../AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp   | 207 ++++++++++
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |   2 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   3 +-
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 +
 ...pu-expand-feature-predicates-unfoldable.ll |  28 ++
 .../amdgpu-expand-feature-predicates.ll       | 359 ++++++++++++++++++
 17 files changed, 1099 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGen/amdgpu-builtin-cpu-is.c
 create mode 100644 clang/test/CodeGen/amdgpu-builtin-is-invocable.c
 create mode 100644 clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 3b8a9cac6587a..8a7cb75af13e5 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -4920,6 +4920,116 @@ If no address spaces names are provided, all address spaces are fenced.
   __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local")
   __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local", "global")
 
+__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``__builtin_amdgcn_processor_is`` and ``__builtin_amdgcn_is_invocable`` provide
+a functional mechanism for programmatically querying:
+
+* the identity of the current target processor;
+* the capability of the current target processor to invoke a particular builtin.
+
+**Syntax**:
+
+.. code-block:: c
+
+  // When used as the predicate for a control structure
+  bool __builtin_amdgcn_processor_is(const char*);
+  bool __builtin_amdgcn_is_invocable(builtin_name);
+  // Otherwise
+  void __builtin_amdgcn_processor_is(const char*);
+  void __builtin_amdgcn_is_invocable(void);
+
+**Example of use**:
+
+.. code-block:: c++
+
+  if (__builtin_amdgcn_processor_is("gfx1201") ||
+      __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var))
+    __builtin_amdgcn_s_sleep_var(x);
+
+  if (!__builtin_amdgcn_processor_is("gfx906"))
+    __builtin_amdgcn_s_wait_event_export_ready();
+  else if (__builtin_amdgcn_processor_is("gfx1010") ||
+           __builtin_amdgcn_processor_is("gfx1101"))
+    __builtin_amdgcn_s_ttracedata_imm(1);
+
+  while (__builtin_amdgcn_processor_is("gfx1101")) *p += x;
+
+  do { *p -= x; } while (__builtin_amdgcn_processor_is("gfx1010"));
+
+  for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break;
+
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready))
+    __builtin_amdgcn_s_wait_event_export_ready();
+  else if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_ttracedata_imm))
+    __builtin_amdgcn_s_ttracedata_imm(1);
+
+  do {
+    *p -= x;
+  } while (__builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32));
+
+  for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p) break;
+
+**Description**:
+
+When used as the predicate value of the following control structures:
+
+.. code-block:: c++
+
+  if (...)
+  while (...)
+  do { } while (...)
+  for (...)
+
+be it directly, or as arguments to logical operators such as ``!, ||, &&``, the
+builtins return a boolean value that:
+
+* indicates whether the current target matches the argument; the argument MUST
+  be a string literal and a valid AMDGPU target
+* indicates whether the builtin function passed as the argument can be invoked
+  by the current target; the argument MUST be either a generic or AMDGPU
+  specific builtin name
+
+Outside of these contexts, the builtins have a ``void`` returning signature
+which prevents their misuse.
+
+**Example of invalid use**:
+
+.. code-block:: c++
+
+  void kernel(int* p, int x, bool (*pfn)(bool), const char* str) {
+    if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return;
+    else if (__builtin_amdgcn_processor_is(str)) __builtin_trap();
+
+    bool a = __builtin_amdgcn_processor_is("gfx906");
+    const bool b = !__builtin_amdgcn_processor_is("gfx906");
+    const bool c = !__builtin_amdgcn_processor_is("gfx906");
+    bool d = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+    bool e = !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+    const auto f =
+        !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready)
+        || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+    const auto g =
+        !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready)
+        || !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+    __builtin_amdgcn_processor_is("gfx1201")
+      ? __builtin_amdgcn_s_sleep_var(x) : __builtin_amdgcn_s_sleep(42);
+    if (pfn(__builtin_amdgcn_processor_is("gfx1200")))
+      __builtin_amdgcn_s_sleep_var(x);
+
+    if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return;
+    else if (__builtin_amdgcn_is_invocable(x)) __builtin_trap();
+  }
+
+When invoked while compiling for a concrete target, the builtins are evaluated
+early by Clang, and never produce any CodeGen effects / have no observable
+side-effects in IR. Conversely, when compiling for AMDGCN flavoured SPIR-V,
+which is an abstract target, a series of predicate values are implicitly
+created. These predicates get resolved when finalizing the compilation process
+for a concrete target, and shall reflect the latter's identity and features.
+Thus, it is possible to author high-level code, in e.g. HIP, that is target
+adaptive in a dynamic fashion, contrary to macro-based mechanisms.
 
 ARM/AArch64 Language Extensions
 -------------------------------
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 44ef404aee72f..5d01a7e75f7e7 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -346,6 +346,11 @@ BUILTIN(__builtin_amdgcn_endpgm, "v", "nr")
 BUILTIN(__builtin_amdgcn_get_fpenv, "WUi", "n")
 BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n")
 
+// These are special FE only builtins intended for forwarding the requirements
+// to the ME.
+BUILTIN(__builtin_amdgcn_processor_is, "vcC*", "nctu")
+BUILTIN(__builtin_amdgcn_is_invocable, "v", "nctu")
+
 //===----------------------------------------------------------------------===//
 // R600-NI only builtins.
 //===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 5e45482584946..45f0f9eb88e55 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13054,4 +13054,14 @@ def err_acc_decl_for_routine
 // AMDGCN builtins diagnostics
 def err_amdgcn_global_load_lds_size_invalid_value : Error<"invalid size value">;
 def note_amdgcn_global_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">;
+def err_amdgcn_processor_is_arg_not_literal
+    : Error<"the argument to __builtin_amdgcn_processor_is must be a string "
+            "literal">;
+def err_amdgcn_processor_is_arg_invalid_value
+    : Error<"the argument to __builtin_amdgcn_processor_is must be a valid "
+            "AMDGCN processor identifier; '%0' is not valid">;
+def err_amdgcn_is_invocable_arg_invalid_value
+    : Error<"the argument to __builtin_amdgcn_is_invocable must be either a "
+            "target agnostic builtin or an AMDGCN target specific builtin; `%0`"
+            " is not valid">;
 } // end of sema component.
diff --git a/clang/lib/Basic/Targets/SPIR.cpp b/clang/lib/Basic/Targets/SPIR.cpp
index 5b5f47f9647a2..eb43d9b0be283 100644
--- a/clang/lib/Basic/Targets/SPIR.cpp
+++ b/clang/lib/Basic/Targets/SPIR.cpp
@@ -152,3 +152,7 @@ void SPIRV64AMDGCNTargetInfo::setAuxTarget(const TargetInfo *Aux) {
     Float128Format = DoubleFormat;
   }
 }
+
+bool SPIRV64AMDGCNTargetInfo::isValidCPUName(StringRef CPU) const {
+  return AMDGPUTI.isValidCPUName(CPU);
+}
diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h
index 78505d66d6f2f..7aa13cbeb89fd 100644
--- a/clang/lib/Basic/Targets/SPIR.h
+++ b/clang/lib/Basic/Targets/SPIR.h
@@ -432,6 +432,10 @@ class LLVM_LIBRARY_VISIBILITY SPIRV64AMDGCNTargetInfo final
   }
 
   bool hasInt128Type() const override { return TargetInfo::hasInt128Type(); }
+
+  // This is only needed for validating arguments passed to
+  // __builtin_amdgcn_processor_is
+  bool isValidCPUName(StringRef Name) const override;
 };
 
 } // namespace targets
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index b56b739094ff3..7b1a3815144b4 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -284,6 +284,18 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
   Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
 }
 
+static Value *GetOrInsertAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) {
+  auto PTy = IntegerType::getInt1Ty(CGF.getLLVMContext());
+
+  auto P = cast<GlobalVariable>(
+      CGF.CGM.getModule().getOrInsertGlobal(Name.str(), PTy));
+  P->setConstant(true);
+  P->setExternallyInitialized(true);
+
+  return CGF.Builder.CreateLoad(RawAddress(P, PTy, CharUnits::One(),
+                                           KnownNonNull));
+}
+
 Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
                                               const CallExpr *E) {
   llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
@@ -585,6 +597,23 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     llvm::Value *Env = EmitScalarExpr(E->getArg(0));
     return Builder.CreateCall(F, {Env});
   }
+  case AMDGPU::BI__builtin_amdgcn_processor_is: {
+    assert(CGM.getTriple().isSPIRV() &&
+           "__builtin_amdgcn_processor_is should never reach CodeGen for "
+             "concrete targets!");
+    StringRef Proc = cast<clang::StringLiteral>(E->getArg(0))->getString();
+    return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.is." + Proc);
+  }
+  case AMDGPU::BI__builtin_amdgcn_is_invocable: {
+    assert(CGM.getTriple().isSPIRV() &&
+           "__builtin_amdgcn_is_invocable should never reach CodeGen for "
+           "concrete targets!");
+    auto FD = cast<FunctionDecl>(
+      cast<DeclRefExpr>(E->getArg(0))->getReferencedDeclOfCallee());
+    StringRef RF =
+        getContext().BuiltinInfo.getRequiredFeatures(FD->getBuiltinID());
+    return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.has." + RF);
+  }
   case AMDGPU::BI__builtin_amdgcn_read_exec:
     return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
   case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 7cc8374e69d73..24f5262ab3cf4 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -6541,6 +6541,22 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc,
   if (Result.isInvalid()) return ExprError();
   Fn = Result.get();
 
+  // The __builtin_amdgcn_is_invocable builtin is special, and will be resolved
+  // later, when we check boolean conditions, for now we merely forward it
+  // without any additional checking.
+  if (Fn->getType() == Context.BuiltinFnTy && ArgExprs.size() == 1 &&
+      ArgExprs[0]->getType() == Context.BuiltinFnTy) {
+    auto FD = cast<FunctionDecl>(Fn->getReferencedDeclOfCallee());
+
+    if (FD->getName() == "__builtin_amdgcn_is_invocable") {
+      auto FnPtrTy = Context.getPointerType(FD->getType());
+      auto R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get();
+      return CallExpr::Create(Context, R, ArgExprs, Context.VoidTy,
+                              ExprValueKind::VK_PRValue, RParenLoc,
+                              FPOptionsOverride());
+    }
+  }
+
   if (CheckArgsForPlaceholders(ArgExprs))
     return ExprError();
 
@@ -13234,6 +13250,20 @@ inline QualType Sema::CheckBitwiseOperands(ExprResult &LHS, ExprResult &RHS,
   return InvalidOperands(Loc, LHS, RHS);
 }
 
+static inline bool IsAMDGPUPredicateBI(Expr *E) {
+  if (!E->getType()->isVoidType())
+    return false;
+
+  if (auto CE = dyn_cast<CallExpr>(E)) {
+    if (auto BI = CE->getDirectCallee())
+      if (BI->getName() == "__builtin_amdgcn_processor_is" ||
+          BI->getName() == "__builtin_amdgcn_is_invocable")
+        return true;
+  }
+
+  return false;
+}
+
 // C99 6.5.[13,14]
 inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS,
                                            SourceLocation Loc,
@@ -13329,6 +13359,9 @@ inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS,
   // The following is safe because we only use this method for
   // non-overloadable operands.
 
+  if (IsAMDGPUPredicateBI(LHS.get()) && IsAMDGPUPredicateBI(RHS.get()))
+    return Context.VoidTy;
+
   // C++ [expr.log.and]p1
   // C++ [expr.log.or]p1
   // The operands are both contextually converted to type bool.
@@ -15576,6 +15609,38 @@ static bool isOverflowingIntegerType(ASTContext &Ctx, QualType T) {
   return Ctx.getIntWidth(T) >= Ctx.getIntWidth(Ctx.IntTy);
 }
 
+static Expr *ExpandAMDGPUPredicateBI(ASTContext &Ctx, CallExpr *CE) {
+  if (!CE->getBuiltinCallee())
+    return CXXBoolLiteralExpr::Create(Ctx, false, Ctx.BoolTy, CE->getExprLoc());
+
+  if (Ctx.getTargetInfo().getTriple().isSPIRV()) {
+    CE->setType(Ctx.getLogicalOperationType());
+    return CE;
+  }
+
+  bool P = false;
+  auto &TI = Ctx.getTargetInfo();
+
+  if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") {
+    auto GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts());
+    auto TID = TI.getTargetID();
+    if (GFX && TID) {
+      auto N = GFX->getString();
+      P = TI.isValidCPUName(GFX->getString()) && TID->find(N) == 0;
+    }
+  } else {
+    auto FD = cast<FunctionDecl>(CE->getArg(0)->getReferencedDeclOfCallee());
+
+    StringRef RF = Ctx.BuiltinInfo.getRequiredFeatures(FD->getBuiltinID());
+    llvm::StringMap<bool> CF;
+    Ctx.getFunctionFeatureMap(CF, FD);
+
+    P = Builtin::evaluateRequiredTargetFeatures(RF, CF);
+  }
+
+  return CXXBoolLiteralExpr::Create(Ctx, P, Ctx.BoolTy, CE->getExprLoc());
+}
+
 ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc,
                                       UnaryOperatorKind Opc, Expr *InputExpr,
                                       bool IsAfterAmp) {
@@ -15753,6 +15818,8 @@ ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc,
         // Vector logical not returns the signed variant of the operand type.
         resultType = GetSignedVectorType(resultType);
         break;
+      } else if (IsAMDGPUPredicateBI(InputExpr)) {
+        break;
       } else {
         return ExprError(Diag(OpLoc, diag::err_typecheck_unary_expr)
                          << resultType << Input.get()->getSourceRange());
@@ -20469,6 +20536,88 @@ void Sema::DiagnoseEqualityWithExtraParens(ParenExpr *ParenE) {
     }
 }
 
+static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) {
+  if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") {
+    auto GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts());
+    if (!GFX) {
+      Sema.Diag(CE->getExprLoc(),
+                diag::err_amdgcn_processor_is_arg_not_literal);
+      return false;
+    }
+    auto N = GFX->getString();
+    if (!Sema.getASTContext().getTargetInfo().isValidCPUName(N) &&
+        (!Sema.getASTContext().getAuxTargetInfo() ||
+         !Sema.getASTContext().getAuxTargetInfo()->isValidCPUName(N))) {
+      Sema.Diag(CE->getExprLoc(),
+                diag::err_amdgcn_processor_is_arg_invalid_value) << N;
+      return false;
+    }
+  } else {
+    auto Arg = CE->getArg(0);
+    if (!Arg || Arg->getType() != Sema.getASTContext().BuiltinFnTy) {
+      Sema.Diag(CE->getExprLoc(),
+                diag::err_amdgcn_is_invocable_arg_invalid_value) << Arg;
+      return false;
+    }
+  }
+
+  return true;
+}
+
+static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) {
+  if (auto UO = dyn_cast<UnaryOperator>(E)) {
+    auto SE = dyn_cast<CallExpr>(UO->getSubExpr());
+    if (IsAMDGPUPredicateBI(SE)) {
+      assert(
+        UO->getOpcode() == UnaryOperator::Opcode::UO_LNot &&
+        "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
+          "can only be used as operands of logical ops!");
+
+      if (!ValidateAMDGPUPredicateBI(Sema, SE)) {
+        Invalid = true;
+        return nullptr;
+      }
+
+      UO->setSubExpr(ExpandAMDGPUPredicateBI(Sema.getASTContext(), SE));
+      UO->setType(Sema.getASTContext().getLogicalOperationType());
+
+      return UO;
+    }
+  }
+  if (auto BO = dyn_cast<BinaryOperator>(E)) {
+    auto LHS = dyn_cast<CallExpr>(BO->getLHS());
+    auto RHS = dyn_cast<CallExpr>(BO->getRHS());
+    if (IsAMDGPUPredicateBI(LHS) && IsAMDGPUPredicateBI(RHS)) {
+      assert(
+          BO->isLogicalOp() &&
+          "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
+            "can only be used as operands of logical ops!");
+
+      if (!ValidateAMDGPUPredicateBI(Sema, LHS) ||
+          !ValidateAMDGPUPredicateBI(Sema, RHS)) {
+        Invalid = true;
+        return nullptr;
+      }
+
+      BO->setLHS(ExpandAMDGPUPredicateBI(Sema.getASTContext(), LHS));
+      BO->setRHS(ExpandAMDGPUPredicateBI(Sema.getASTContext(), RHS));
+      BO->setType(Sema.getASTContext().getLogicalOperationType());
+
+      return BO;
+    }
+  }
+  if (auto CE = dyn_cast<CallExpr>(E))
+    if (IsAMDGPUPredicateBI(CE)) {
+      if (!ValidateAMDGPUPredicateBI(Sema, CE)) {
+        Invalid = true;
+        return nullptr;
+      }
+      return ExpandAMDGPUPredicateBI(Sema.getASTContext(), CE);
+    }
+
+  return nullptr;
+}
+
 ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E,
                                        bool IsConstexpr) {
   DiagnoseAssignmentAsCondition(E);
@@ -20480,6 +20629,14 @@ ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E,
   E = result.get();
 
   if (!E->isTypeDependent()) {
+    if (E->getType()->isVoidType()) {
+      bool IsInvalidPredicate = false;
+      if (auto BIC = MaybeHandleAMDGPUPredicateBI(*this, E, IsInvalidPredicate))
+        return BIC;
+      else if (IsInvalidPredicate)
+        return ExprError();
+    }
+
     if (getLangOpts().CPlusPlus)
       return CheckCXXBooleanCondition(E, IsConstexpr); // C++ 6.4p4
 
diff --git a/clang/test/CodeGen/amdgpu-builtin-cpu-is.c b/clang/test/CodeGen/amdgpu-builtin-cpu-is.c
new file mode 100644
index 0000000000000..6e261d9f5d239
--- /dev/null
+++ b/clang/test/CodeGen/amdgpu-builtin-cpu-is.c
@@ -0,0 +1,65 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX900 %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1010 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX1010 %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCNSPIRV %s
+
+// Test that, depending on triple and, if applicable, target-cpu, one of three
+// things happens:
+//    1) for gfx900 we emit a call to trap (concrete target, matches)
+//    2) for gfx1010 we emit an empty kernel (concrete target, does not match)
+//    3) for AMDGCNSPIRV we emit llvm.amdgcn.is.gfx900 as a bool global, and
+//       load from it to provide the condition a br (abstract target)
+//.
+// AMDGCN-GFX900: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600
+//.
+// AMDGCN-GFX1010: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600
+//.
+// AMDGCNSPIRV: @llvm.amdgcn.is.gfx900 = external addrspace(1) externally_initialized constant i1
+//.
+// AMDGCN-GFX900-LABEL: define dso_local void @foo(
+// AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGCN-GFX900-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-GFX900-NEXT:    call void @llvm.trap()
+// AMDGCN-GFX900-NEXT:    ret void
+//
+// AMDGCN-GFX1010-LABEL: define dso_local void @foo(
+// AMDGCN-GFX1010-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGCN-GFX1010-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-GFX1010-NEXT:    ret void
+//
+// AMDGCNSPIRV-LABEL: define spir_func void @foo(
+// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] {
+// AMDGCNSPIRV-NEXT:  [[ENTRY:.*:]]
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx900, align 1
+// AMDGCNSPIRV-NEXT:    br i1 [[TMP0]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// AMDGCNSPIRV:       [[IF_THEN]]:
+// AMDGCNSPIRV-NEXT:    call addrspace(4) void @llvm.trap()
+// AMDGCNSPIRV-NEXT:    br label %[[IF_END]]
+// AMDGCNSPIRV:       [[IF_END]]:
+// AMDGCNSPIRV-NEXT:    ret void
+//
+void foo() {
+    if (__builtin_cpu_is("gfx900"))
+        return __builtin_trap();
+}
+//.
+// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
+// AMDGCN-GFX900: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
+//.
+// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
+//.
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" }
+// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
+//.
+// AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCN-GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// AMDGCN-GFX900: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
+// AMDGCN-GFX1010: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCN-GFX1010: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// AMDGCN-GFX1010: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
+// AMDGCNSPIRV: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCNSPIRV: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// AMDGCNSPIRV: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
diff --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
new file mode 100644
index 0000000000000..6d2690cb75b7c
--- /dev/null
+++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
@@ -0,0 +1,64 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX900 %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1010 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX1010 %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCNSPIRV %s
+
+// Test that, depending on triple and, if applicable, target-cpu, one of three
+// things happens:
+//    1) for gfx900 we emit an empty kernel (concrete target, lacks feature)
+//    2) for gfx1010 we emit a call to trap (concrete target, has feature)
+//    3) for AMDGCNSPIRV we emit llvm.amdgcn.has.gfx10-insts as a constant
+//       externally initialised bool global, and load from it to provide the
+//       condition to a br (abstract target)
+
+//.
+// AMDGCNSPIRV: @llvm.amdgcn.has.gfx10-insts = external addrspace(1) externally_initialized constant i1
+//.
+// AMDGCN-GFX900-LABEL: define dso_local void @foo(
+// AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGCN-GFX900-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-GFX900-NEXT:    ret void
+//
+// AMDGCN-GFX1010-LABEL: define dso_local void @foo(
+// AMDGCN-GFX1010-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGCN-GFX1010-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-GFX1010-NEXT:    call void @llvm.trap()
+// AMDGCN-GFX1010-NEXT:    ret void
+//
+// AMDGCNSPIRV-LABEL: define spir_func void @foo(
+// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] {
+// AMDGCNSPIRV-NEXT:  [[ENTRY:.*:]]
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx10-insts, align 1
+// AMDGCNSPIRV-NEXT:    [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false
+// AMDGCNSPIRV-NEXT:    br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// AMDGCNSPIRV:       [[IF_THEN]]:
+// AMDGCNSPIRV-NEXT:    call addrspace(4) void @llvm.trap()
+// AMDGCNSPIRV-NEXT:    br label %[[IF_END]]
+// AMDGCNSPIRV:       [[IF_END]]:
+// AMDGCNSPIRV-NEXT:    ret void
+//
+void foo() {
+    if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16))
+        return __builtin_trap();
+}
+//.
+// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
+//.
+// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
+// AMDGCN-GFX1010: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
+//.
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" }
+// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
+//.
+// AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCN-GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// AMDGCN-GFX900: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
+// AMDGCN-GFX1010: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCN-GFX1010: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// AMDGCN-GFX1010: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
+// AMDGCNSPIRV: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCNSPIRV: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// AMDGCNSPIRV: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
diff --git a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
new file mode 100644
index 0000000000000..f618f54909b00
--- /dev/null
+++ b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
@@ -0,0 +1,43 @@
+// RUN: not %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - 2>&1 | FileCheck %s
+// RUN: not %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - 2>&1 | FileCheck %s
+
+bool predicate(bool x) { return x; }
+
+void invalid_uses(int* p, int x, bool (*pfn)(bool)) {
+    // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void'
+    bool invalid_use_in_init_0 = __builtin_amdgcn_processor_is("gfx906");
+    // CHECK: error: cannot initialize a variable of type 'const bool' with an rvalue of type 'void'
+    const bool invalid_use_in_init_1 = !__builtin_amdgcn_processor_is("gfx906");
+    // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void'
+    bool invalid_use_in_init_2 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+    // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void'
+    bool invalid_use_in_init_3 = !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+    // CHECK: error: variable has incomplete type 'const void'
+    const auto invalid_use_in_init_4 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready) || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+    // CHECK: error: variable has incomplete type 'const void'
+    const auto invalid_use_in_init_5 = __builtin_amdgcn_processor_is("gfx906") || __builtin_amdgcn_processor_is("gfx900");
+    // CHECK: error: variable has incomplete type 'const void'
+    const auto invalid_use_in_init_6 = __builtin_amdgcn_processor_is("gfx906") || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep);
+    // CHECK: error: value of type 'void' is not contextually convertible to 'bool'
+    __builtin_amdgcn_processor_is("gfx1201")
+        ? __builtin_amdgcn_s_sleep_var(x) : __builtin_amdgcn_s_sleep(42);
+    // CHECK: error: no matching function for call to 'predicate'
+    if (predicate(__builtin_amdgcn_processor_is("gfx1200"))) __builtin_amdgcn_s_sleep_var(x);
+    // CHECK: note: candidate function not viable: cannot convert argument of incomplete type 'void' to 'bool' for 1st argument
+}
+
+void invalid_invocations(int x, const char* str) {
+    // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a valid AMDGCN processor identifier; 'not_an_amdgcn_gfx_id' is not valid
+    if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return;
+    // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a string literal
+    if (__builtin_amdgcn_processor_is(str)) return;
+
+    // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `"__builtin_amdgcn_s_sleep_var"` is not valid
+    if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return;
+    // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `str` is not valid
+    else if (__builtin_amdgcn_is_invocable(str)) return;
+    // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `x` is not valid
+    else if (__builtin_amdgcn_is_invocable(x)) return;
+    // CHECK: error: use of undeclared identifier '__builtin_ia32_pause'
+    else if (__builtin_amdgcn_is_invocable(__builtin_ia32_pause)) return;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index a8e4ea9429f50..1fe0016723a30 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -408,6 +408,15 @@ extern char &AMDGPUPrintfRuntimeBindingID;
 void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &);
 extern char &AMDGPUResourceUsageAnalysisID;
 
+struct AMDGPUExpandFeaturePredicatesPass
+    : PassInfoMixin<AMDGPUExpandFeaturePredicatesPass> {
+  const AMDGPUTargetMachine &TM;
+  AMDGPUExpandFeaturePredicatesPass(const AMDGPUTargetMachine &ATM) : TM(ATM) {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+  static bool isRequired() { return true; }
+};
+
 struct AMDGPUPrintfRuntimeBindingPass
     : PassInfoMixin<AMDGPUPrintfRuntimeBindingPass> {
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
new file mode 100644
index 0000000000000..125051c6aa0cf
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -0,0 +1,207 @@
+//===- AMDGPUExpandFeaturePredicates.cpp - Feature Predicate Expander -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file implements a pass that deals with expanding AMDGCN generic pseudo-
+// intrinsics into target specific quantities / sequences. In this context, a
+// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a
+// specific instruction, but rather is intended as a mechanism for abstractly
+// conveying target specific info to a HLL / the FE, without concretely
+// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize.
+// This pass should run as early as possible / immediately after Clang CodeGen,
+// so that the optimisation pipeline and the BE operate with concrete target
+// data.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#include <string>
+#include <utility>
+
+using namespace llvm;
+
+namespace {
+inline Function *getCloneForInlining(Function *OldF) {
+  assert(OldF && "Must pass an existing Function!");
+
+  // TODO - Alias Value to clone arg.
+  ValueToValueMapTy VMap;
+
+  auto NewF = CloneFunction(OldF, VMap);
+
+  NewF->removeFnAttr(Attribute::OptimizeNone);
+  NewF->removeFnAttr(Attribute::NoInline);
+  NewF->addFnAttr(Attribute::AlwaysInline);
+
+  return NewF;
+}
+
+template <typename C>
+inline void collectUsers(Value *V, ModulePassManager &AlwaysInliner,
+                         ModuleAnalysisManager &MAM,
+                         SmallDenseMap<Function *, Function *> &InlinableClones,
+                         C &Container) {
+  assert(V && "Must pass an existing Value!");
+
+  auto A = PreservedAnalyses::all();
+
+  constexpr auto IsValidCall = [](auto &&U) {
+    if (auto CB = dyn_cast<CallBase>(U))
+      if (auto F = CB->getCalledFunction())
+        if (!F->isIntrinsic() && !F->isDeclaration())
+          return true;
+    return false;
+  };
+
+  SmallVector<User *> Calls{};
+  copy_if(V->users(), std::back_inserter(Calls), IsValidCall);
+
+  while (!Calls.empty()) {
+    for (auto &&Call : Calls) {
+      auto CB = cast<CallBase>(Call);
+      auto &TempF = InlinableClones[CB->getCalledFunction()];
+
+      if (!TempF)
+        TempF = getCloneForInlining(CB->getCalledFunction());
+
+      CB->setCalledFunction(TempF);
+      CB->removeFnAttr(Attribute::NoInline);
+      CB->addFnAttr(Attribute::AlwaysInline);
+
+      AlwaysInliner.run(*TempF->getParent(), MAM);
+    }
+
+    Calls.clear();
+
+    copy_if(V->users(), std::back_inserter(Calls), IsValidCall);
+  }
+
+  for (auto &&U : V->users())
+    if (auto I = dyn_cast<Instruction>(U)) {
+      if (auto CB = dyn_cast<CallBase>(I)) {
+        if (CB->getCalledFunction() && !CB->getCalledFunction()->isIntrinsic())
+          Container.insert(Container.end(), I);
+      } else {
+        Container.insert(Container.end(), I);
+      }
+    }
+}
+
+std::pair<PreservedAnalyses, bool>
+handlePredicate(const GCNSubtarget &ST, ModuleAnalysisManager &MAM,
+                SmallDenseMap<Function *, Function *>& InlinableClones,
+                GlobalVariable *P) {
+  auto PV = P->getName().substr(P->getName().rfind('.') + 1).str();
+  auto Dx = PV.find(',');
+  while (Dx != std::string::npos) {
+    PV.insert(++Dx, {'+'});
+
+    Dx = PV.find(',', Dx);
+  }
+
+  auto PTy = P->getValueType();
+  P->setLinkage(GlobalValue::PrivateLinkage);
+  P->setExternallyInitialized(false);
+
+  if (P->getName().starts_with("llvm.amdgcn.is"))
+    P->setInitializer(ConstantInt::getBool(PTy, PV == ST.getCPU()));
+  else
+    P->setInitializer(ConstantInt::getBool(PTy, ST.checkFeatures('+' + PV)));
+
+  ModulePassManager MPM;
+  MPM.addPass(AlwaysInlinerPass());
+
+  SmallPtrSet<Instruction *, 32> ToFold;
+  collectUsers(P, MPM, MAM, InlinableClones, ToFold);
+
+  if (ToFold.empty())
+    return {PreservedAnalyses::all(), true};
+
+  do {
+    auto I = *ToFold.begin();
+    ToFold.erase(I);
+
+    if (auto C = ConstantFoldInstruction(I, P->getDataLayout())) {
+      collectUsers(I, MPM, MAM, InlinableClones, ToFold);
+      I->replaceAllUsesWith(C);
+      I->eraseFromParent();
+      continue;
+    } else if (I->isTerminator() && ConstantFoldTerminator(I->getParent())) {
+      continue;
+    } else if (I->users().empty()) {
+      continue;
+    }
+
+    std::string W;
+    raw_string_ostream OS(W);
+
+    auto Caller = I->getParent()->getParent();
+
+    OS << "Impossible to constant fold feature predicate: " << *P
+       << " used by " << *I << ", please simplify.\n";
+
+    Caller->getContext().diagnose(
+        DiagnosticInfoUnsupported(*Caller, W, I->getDebugLoc(), DS_Error));
+
+    return {PreservedAnalyses::none(), false};
+  } while (!ToFold.empty());
+
+  return {PreservedAnalyses::none(), true};
+}
+} // Unnamed namespace.
+
+PreservedAnalyses
+AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) {
+  if (M.empty())
+    return PreservedAnalyses::all();
+
+  SmallVector<GlobalVariable *> Predicates;
+  for (auto &&G : M.globals()) {
+    if (!G.isDeclaration() || !G.hasName())
+      continue;
+    if (G.getName().starts_with("llvm.amdgcn."))
+      Predicates.push_back(&G);
+  }
+
+  if (Predicates.empty())
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses Ret = PreservedAnalyses::all();
+
+  SmallDenseMap<Function *, Function *> InlinableClones;
+  const auto &ST = TM.getSubtarget<GCNSubtarget>(
+      *find_if(M, [](auto &&F) { return !F.isIntrinsic(); }));
+
+  for (auto &&P : Predicates) {
+    auto R = handlePredicate(ST, MAM, InlinableClones, P);
+
+    if (!R.second)
+      return PreservedAnalyses::none();
+
+    Ret.intersect(R.first);
+  }
+
+  for (auto &&C : InlinableClones)
+    C.second->eraseFromParent();
+
+  return Ret;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 6a45392b5f099..c3c9e24c2efa4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -29,6 +29,8 @@ MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
 MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
 MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
+MODULE_PASS("amdgpu-expand-feature-predicates",
+            AMDGPUExpandFeaturePredicatesPass(*this))
 #undef MODULE_PASS
 
 #ifndef MODULE_PASS_WITH_PARAMS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 4937b434bc955..8e8a6e1eda437 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -803,7 +803,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
 #include "llvm/Passes/TargetPassRegistry.inc"
 
   PB.registerPipelineStartEPCallback(
-      [](ModulePassManager &PM, OptimizationLevel Level) {
+      [this](ModulePassManager &PM, OptimizationLevel Level) {
+        PM.addPass(AMDGPUExpandFeaturePredicatesPass(*this));
         if (EnableHipStdPar)
           PM.addPass(HipStdParAcceleratorCodeSelectionPass());
       });
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 09a3096602fc3..a389200f0db8e 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -54,6 +54,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUCodeGenPrepare.cpp
   AMDGPUCombinerHelper.cpp
   AMDGPUCtorDtorLowering.cpp
+  AMDGPUExpandFeaturePredicates.cpp
   AMDGPUExportClustering.cpp
   AMDGPUExportKernelRuntimeHandles.cpp
   AMDGPUFrameLowering.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll
new file mode 100644
index 0000000000000..bfc35d8c76e37
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll
@@ -0,0 +1,28 @@
+; REQUIRES: amdgpu-registered-target
+
+; RUN: not opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' < %s 2>&1 | FileCheck %s
+
+; CHECK: error:{{.*}}in function kernel void (ptr addrspace(1), i32, ptr addrspace(1)): Impossible to constant fold feature predicate: @llvm.amdgcn.is.gfx803 = private addrspace(1) constant i1 false used by   %call = call i1 %1(i1 zeroext false), please simplify.
+
+ at llvm.amdgcn.is.gfx803 = external addrspace(1) externally_initialized constant i1
+
+declare void @llvm.amdgcn.s.sleep(i32 immarg) #1
+
+define amdgpu_kernel void @kernel(ptr addrspace(1) readnone captures(none) %p.coerce, i32 %x, ptr addrspace(1) %pfn.coerce) {
+entry:
+  %0 = ptrtoint ptr addrspace(1) %pfn.coerce to i64
+  %1 = inttoptr i64 %0 to ptr
+  %2 = ptrtoint ptr addrspace(1) %pfn.coerce to i64
+  %3 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx803, align 1
+  %call = call i1 %1(i1 zeroext %3)
+  br i1 %call, label %if.gfx803, label %if.end
+
+if.gfx803:
+  call void @llvm.amdgcn.s.sleep(i32 0)
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+attributes #1 = { nocallback nofree nosync nounwind willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll
new file mode 100644
index 0000000000000..277323c353260
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll
@@ -0,0 +1,359 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; REQUIRES: amdgpu-registered-target
+
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX906 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1010 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1101 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1101 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -mattr=+wavefrontsize64 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201-W64 %s
+
+;; The IR was derived from the following source:
+;; extern "C" __global__ void kernel(int* p, int x)
+;; {
+;;     if (__builtin_amdgcn_processor_is("gfx1201") ||
+;;         __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var))
+;;         __builtin_amdgcn_s_sleep_var(x);
+;;     if (!__builtin_amdgcn_processor_is("gfx906"))
+;;         __builtin_amdgcn_s_wait_event_export_ready();
+;;     else if (__builtin_amdgcn_processor_is("gfx1010") ||
+;;         __builtin_amdgcn_processor_is("gfx1101"))
+;;         __builtin_amdgcn_s_ttracedata_imm(1);
+;;     while (__builtin_amdgcn_processor_is("gfx1101")) *p += x;
+;;     do {
+;;         *p -= x;
+;;     } while (__builtin_amdgcn_processor_is("gfx1010"));
+;;     for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break;
+;;
+;;     if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready))
+;;         __builtin_amdgcn_s_wait_event_export_ready();
+;;     else if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_ttracedata_imm))
+;;         __builtin_amdgcn_s_ttracedata_imm(1);
+;;
+;;     do {
+;;         *p -= x;
+;;     } while (__builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32));
+;;     for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p) break;
+;; }
+
+ at llvm.amdgcn.is.gfx1201 = external addrspace(1) externally_initialized constant i1
+ at llvm.amdgcn.has.gfx12-insts = external addrspace(1) externally_initialized constant i1
+ at llvm.amdgcn.is.gfx906 = external addrspace(1) externally_initialized constant i1
+ at llvm.amdgcn.is.gfx1010 = external addrspace(1) externally_initialized constant i1
+ at llvm.amdgcn.is.gfx1101 = external addrspace(1) externally_initialized constant i1
+ at llvm.amdgcn.has.gfx11-insts = external addrspace(1) externally_initialized constant i1
+ at llvm.amdgcn.has.gfx10-insts = external addrspace(1) externally_initialized constant i1
+@"llvm.amdgcn.has.gfx12-insts,wavefrontsize64" = external addrspace(1) externally_initialized constant i1
+
+declare void @llvm.amdgcn.s.sleep.var(i32)
+declare void @llvm.amdgcn.s.wait.event.export.ready()
+declare void @llvm.amdgcn.s.ttracedata.imm(i16 immarg)
+
+define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) {
+; GFX906-LABEL: define amdgpu_kernel void @kernel(
+; GFX906-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX906-NEXT:  [[ENTRY:.*:]]
+; GFX906-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
+; GFX906-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; GFX906-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS1:.*]]
+; GFX906:       [[IF_GFX1201_OR_GFX12_INSTS1]]:
+; GFX906-NEXT:    br label %[[IF_NOT_GFX906:.*]]
+; GFX906:       [[IF_GFX1201_OR_GFX12_INSTS:.*:]]
+; GFX906-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
+; GFX906-NEXT:    br label %[[IF_NOT_GFX906]]
+; GFX906:       [[IF_NOT_GFX906]]:
+; GFX906-NEXT:    br label %[[IF_GFX1010_OR_GFX1102:.*]]
+; GFX906:       [[IF_NOT_GFX907:.*:]]
+; GFX906-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX906-NEXT:    br label %[[IF_END6:.*]]
+; GFX906:       [[IF_GFX1010_OR_GFX1102]]:
+; GFX906-NEXT:    br label %[[LOR_NOT_GFX1010:.*]]
+; GFX906:       [[LOR_NOT_GFX1010]]:
+; GFX906-NEXT:    br label %[[FOR_COND:.*]]
+; GFX906:       [[IF_GFX1010_OR_GFX1101:.*:]]
+; GFX906-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX906-NEXT:    br label %[[IF_END6]]
+; GFX906:       [[IF_END6]]:
+; GFX906-NEXT:    call void @llvm.assume(i1 true)
+; GFX906-NEXT:    call void @llvm.assume(i1 true)
+; GFX906-NEXT:    br label %[[FOR_COND]]
+; GFX906:       [[FOR_COND]]:
+; GFX906-NEXT:    [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX906-NEXT:    [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
+; GFX906-NEXT:    store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
+; GFX906-NEXT:    br label %[[IF_GFX10_INSTS1:.*]]
+; GFX906:       [[IF_GFX11_INSTS:.*:]]
+; GFX906-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX906-NEXT:    br label %[[IF_END11:.*]]
+; GFX906:       [[IF_GFX10_INSTS1]]:
+; GFX906-NEXT:    br label %[[IF_END11]]
+; GFX906:       [[IF_GFX10_INSTS:.*:]]
+; GFX906-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX906-NEXT:    br label %[[IF_END11]]
+; GFX906:       [[IF_END11]]:
+; GFX906-NEXT:    call void @llvm.assume(i1 true)
+; GFX906-NEXT:    [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX906-NEXT:    [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
+; GFX906-NEXT:    store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4
+; GFX906-NEXT:    ret void
+;
+; GFX1010-LABEL: define amdgpu_kernel void @kernel(
+; GFX1010-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX1010-NEXT:  [[ENTRY:.*:]]
+; GFX1010-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
+; GFX1010-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; GFX1010-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS1:.*]]
+; GFX1010:       [[IF_GFX1201_OR_GFX12_INSTS1]]:
+; GFX1010-NEXT:    br label %[[IF_END:.*]]
+; GFX1010:       [[IF_GFX1201_OR_GFX12_INSTS:.*:]]
+; GFX1010-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
+; GFX1010-NEXT:    br label %[[IF_END]]
+; GFX1010:       [[IF_END]]:
+; GFX1010-NEXT:    br label %[[IF_NOT_GFX907:.*]]
+; GFX1010:       [[IF_NOT_GFX907]]:
+; GFX1010-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1010-NEXT:    br label %[[IF_END6:.*]]
+; GFX1010:       [[IF_NOT_GFX906:.*:]]
+; GFX1010-NEXT:    br label %[[IF_GFX1010_OR_GFX1101:.*]]
+; GFX1010:       [[LOR_NOT_GFX1010:.*:]]
+; GFX1010-NEXT:    br label %[[FOR_COND:.*]]
+; GFX1010:       [[IF_GFX1010_OR_GFX1101]]:
+; GFX1010-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1010-NEXT:    br label %[[IF_END6]]
+; GFX1010:       [[IF_END6]]:
+; GFX1010-NEXT:    call void @llvm.assume(i1 true)
+; GFX1010-NEXT:    call void @llvm.assume(i1 false)
+; GFX1010-NEXT:    br label %[[FOR_COND]]
+; GFX1010:       [[FOR_COND]]:
+; GFX1010-NEXT:    [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1010-NEXT:    [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
+; GFX1010-NEXT:    store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
+; GFX1010-NEXT:    br label %[[IF_ELSE8:.*]]
+; GFX1010:       [[IF_GFX11_INSTS:.*:]]
+; GFX1010-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1010-NEXT:    br label %[[IF_END11:.*]]
+; GFX1010:       [[IF_ELSE8]]:
+; GFX1010-NEXT:    br label %[[IF_GFX10_INSTS:.*]]
+; GFX1010:       [[IF_GFX10_INSTS]]:
+; GFX1010-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1010-NEXT:    br label %[[IF_END11]]
+; GFX1010:       [[IF_END11]]:
+; GFX1010-NEXT:    call void @llvm.assume(i1 true)
+; GFX1010-NEXT:    [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1010-NEXT:    [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
+; GFX1010-NEXT:    store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4
+; GFX1010-NEXT:    ret void
+;
+; GFX1101-LABEL: define amdgpu_kernel void @kernel(
+; GFX1101-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX1101-NEXT:  [[ENTRY:.*:]]
+; GFX1101-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
+; GFX1101-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; GFX1101-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS1:.*]]
+; GFX1101:       [[IF_GFX1201_OR_GFX12_INSTS1]]:
+; GFX1101-NEXT:    br label %[[IF_END:.*]]
+; GFX1101:       [[IF_GFX1201_OR_GFX12_INSTS:.*:]]
+; GFX1101-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
+; GFX1101-NEXT:    br label %[[IF_END]]
+; GFX1101:       [[IF_END]]:
+; GFX1101-NEXT:    br label %[[IF_NOT_GFX907:.*]]
+; GFX1101:       [[IF_NOT_GFX907]]:
+; GFX1101-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1101-NEXT:    br label %[[IF_END6:.*]]
+; GFX1101:       [[IF_NOT_GFX906:.*:]]
+; GFX1101-NEXT:    br label %[[LOR_NOT_GFX1010:.*]]
+; GFX1101:       [[LOR_NOT_GFX1010]]:
+; GFX1101-NEXT:    br label %[[IF_GFX1010_OR_GFX1101:.*]]
+; GFX1101:       [[IF_GFX1010_OR_GFX1101]]:
+; GFX1101-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1101-NEXT:    br label %[[IF_END6]]
+; GFX1101:       [[IF_END6]]:
+; GFX1101-NEXT:    call void @llvm.assume(i1 false)
+; GFX1101-NEXT:    call void @llvm.assume(i1 true)
+; GFX1101-NEXT:    br label %[[FOR_COND:.*]]
+; GFX1101:       [[FOR_COND]]:
+; GFX1101-NEXT:    [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1101-NEXT:    [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
+; GFX1101-NEXT:    store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
+; GFX1101-NEXT:    br label %[[IF_GFX11_INSTS:.*]]
+; GFX1101:       [[IF_GFX11_INSTS]]:
+; GFX1101-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1101-NEXT:    br label %[[IF_END11:.*]]
+; GFX1101:       [[IF_ELSE8:.*:]]
+; GFX1101-NEXT:    br label %[[IF_GFX10_INSTS:.*]]
+; GFX1101:       [[IF_GFX10_INSTS]]:
+; GFX1101-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1101-NEXT:    br label %[[IF_END11]]
+; GFX1101:       [[IF_END11]]:
+; GFX1101-NEXT:    call void @llvm.assume(i1 true)
+; GFX1101-NEXT:    [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1101-NEXT:    [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
+; GFX1101-NEXT:    store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4
+; GFX1101-NEXT:    ret void
+;
+; GFX1201-LABEL: define amdgpu_kernel void @kernel(
+; GFX1201-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX1201-NEXT:  [[ENTRY:.*:]]
+; GFX1201-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
+; GFX1201-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; GFX1201-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]]
+; GFX1201:       [[LOR_NOT_GFX1201:.*:]]
+; GFX1201-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS]]
+; GFX1201:       [[IF_GFX1201_OR_GFX12_INSTS]]:
+; GFX1201-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
+; GFX1201-NEXT:    br label %[[IF_END:.*]]
+; GFX1201:       [[IF_END]]:
+; GFX1201-NEXT:    br label %[[IF_NOT_GFX907:.*]]
+; GFX1201:       [[IF_NOT_GFX907]]:
+; GFX1201-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1201-NEXT:    br label %[[IF_END6:.*]]
+; GFX1201:       [[IF_NOT_GFX906:.*:]]
+; GFX1201-NEXT:    br label %[[IF_GFX1010_OR_GFX1102:.*]]
+; GFX1201:       [[IF_GFX1010_OR_GFX1102]]:
+; GFX1201-NEXT:    br label %[[FOR_COND:.*]]
+; GFX1201:       [[IF_GFX1010_OR_GFX1101:.*:]]
+; GFX1201-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1201-NEXT:    br label %[[IF_END6]]
+; GFX1201:       [[IF_END6]]:
+; GFX1201-NEXT:    call void @llvm.assume(i1 true)
+; GFX1201-NEXT:    call void @llvm.assume(i1 true)
+; GFX1201-NEXT:    br label %[[FOR_COND]]
+; GFX1201:       [[FOR_COND]]:
+; GFX1201-NEXT:    [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1201-NEXT:    [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
+; GFX1201-NEXT:    store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
+; GFX1201-NEXT:    br label %[[IF_GFX11_INSTS:.*]]
+; GFX1201:       [[IF_GFX11_INSTS]]:
+; GFX1201-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1201-NEXT:    br label %[[IF_END11:.*]]
+; GFX1201:       [[IF_ELSE8:.*:]]
+; GFX1201-NEXT:    br label %[[IF_GFX10_INSTS:.*]]
+; GFX1201:       [[IF_GFX10_INSTS]]:
+; GFX1201-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1201-NEXT:    br label %[[IF_END11]]
+; GFX1201:       [[IF_END11]]:
+; GFX1201-NEXT:    call void @llvm.assume(i1 true)
+; GFX1201-NEXT:    [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1201-NEXT:    [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
+; GFX1201-NEXT:    store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4
+; GFX1201-NEXT:    ret void
+;
+; GFX1201-W64-LABEL: define amdgpu_kernel void @kernel(
+; GFX1201-W64-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX1201-W64-NEXT:  [[ENTRY:.*:]]
+; GFX1201-W64-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
+; GFX1201-W64-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; GFX1201-W64-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]]
+; GFX1201-W64:       [[LOR_NOT_GFX1201:.*:]]
+; GFX1201-W64-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS]]
+; GFX1201-W64:       [[IF_GFX1201_OR_GFX12_INSTS]]:
+; GFX1201-W64-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
+; GFX1201-W64-NEXT:    br label %[[IF_END:.*]]
+; GFX1201-W64:       [[IF_END]]:
+; GFX1201-W64-NEXT:    br label %[[IF_NOT_GFX907:.*]]
+; GFX1201-W64:       [[IF_NOT_GFX907]]:
+; GFX1201-W64-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1201-W64-NEXT:    br label %[[IF_END6:.*]]
+; GFX1201-W64:       [[IF_NOT_GFX906:.*:]]
+; GFX1201-W64-NEXT:    br label %[[IF_GFX1010_OR_GFX1102:.*]]
+; GFX1201-W64:       [[IF_GFX1010_OR_GFX1102]]:
+; GFX1201-W64-NEXT:    br label %[[FOR_COND:.*]]
+; GFX1201-W64:       [[IF_GFX1010_OR_GFX1101:.*:]]
+; GFX1201-W64-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1201-W64-NEXT:    br label %[[IF_END6]]
+; GFX1201-W64:       [[IF_END6]]:
+; GFX1201-W64-NEXT:    call void @llvm.assume(i1 true)
+; GFX1201-W64-NEXT:    call void @llvm.assume(i1 true)
+; GFX1201-W64-NEXT:    br label %[[FOR_COND]]
+; GFX1201-W64:       [[FOR_COND]]:
+; GFX1201-W64-NEXT:    [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1201-W64-NEXT:    [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
+; GFX1201-W64-NEXT:    store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
+; GFX1201-W64-NEXT:    br label %[[IF_GFX11_INSTS:.*]]
+; GFX1201-W64:       [[IF_GFX11_INSTS]]:
+; GFX1201-W64-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1201-W64-NEXT:    br label %[[IF_END11:.*]]
+; GFX1201-W64:       [[IF_ELSE8:.*:]]
+; GFX1201-W64-NEXT:    br label %[[IF_GFX10_INSTS:.*]]
+; GFX1201-W64:       [[IF_GFX10_INSTS]]:
+; GFX1201-W64-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1201-W64-NEXT:    br label %[[IF_END11]]
+; GFX1201-W64:       [[IF_END11]]:
+; GFX1201-W64-NEXT:    call void @llvm.assume(i1 false)
+; GFX1201-W64-NEXT:    [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1201-W64-NEXT:    [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
+; GFX1201-W64-NEXT:    store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4
+; GFX1201-W64-NEXT:    ret void
+;
+entry:
+  %0 = ptrtoint ptr addrspace(1) %p.coerce to i64
+  %1 = inttoptr i64 %0 to ptr
+  %2 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1201, align 1
+  br i1 %2, label %if.gfx1201.or.gfx12-insts, label %lor.not.gfx1201
+
+lor.not.gfx1201:
+  %3 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx12-insts, align 1
+  br i1 %3, label %if.gfx1201.or.gfx12-insts, label %if.end
+
+if.gfx1201.or.gfx12-insts:
+  call void @llvm.amdgcn.s.sleep.var(i32 %x)
+  br label %if.end
+
+if.end:
+  %4 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx906, align 1
+  br i1 %4, label %if.gfx906, label %if.not.gfx906
+
+if.not.gfx906:
+  call void @llvm.amdgcn.s.wait.event.export.ready()
+  br label %if.end6
+
+if.gfx906:
+  %5 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1010, align 1
+  br i1 %5, label %if.gfx1010.or.gfx1101, label %lor.not.gfx1010
+
+lor.not.gfx1010:
+  %6 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1101, align 1
+  br i1 %6, label %if.gfx1010.or.gfx1101, label %for.cond
+
+if.gfx1010.or.gfx1101:
+  call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+  br label %if.end6
+
+if.end6:
+  %.pr.pr = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1101, align 1
+  %7 = icmp ne i1 %.pr.pr, true
+  call void @llvm.assume(i1 %7)
+  %.pr6.pr = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1010, align 1
+  %8 = icmp ne i1 %.pr6.pr, true
+  call void @llvm.assume(i1 %8)
+  br label %for.cond
+
+for.cond:
+  %.promoted = load i32, ptr %1, align 4
+  %sub.peel = sub nsw i32 %.promoted, %x
+  store i32 %sub.peel, ptr %1, align 4
+  %9 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx11-insts, align 1
+  br i1 %9, label %if.gfx11-insts, label %if.else8
+
+if.gfx11-insts:
+  call void @llvm.amdgcn.s.wait.event.export.ready()
+  br label %if.end11
+
+if.else8:
+  %10 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx10-insts, align 1
+  br i1 %10, label %if.gfx10-insts, label %if.end11
+
+if.gfx10-insts:
+  call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+  br label %if.end11
+
+if.end11:
+  %.pr7 = load i1, ptr addrspace(1) @"llvm.amdgcn.has.gfx12-insts,wavefrontsize64", align 1
+  %11 = icmp ne i1 %.pr7, true
+  call void @llvm.assume(i1 %11)
+  %.promoted9 = load i32, ptr %1, align 4
+  %sub13.peel = sub nsw i32 %.promoted9, %x
+  store i32 %sub13.peel, ptr %1, align 4
+  ret void
+}
+
+declare void @llvm.assume(i1 noundef)

>From 8bf116837e2bd77ff5906d025fdb80bfa5507382 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 03:39:32 +0100
Subject: [PATCH 02/69] Fix format.

---
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp |  8 ++++----
 clang/lib/Sema/SemaExpr.cpp                 | 20 ++++++++++----------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 7b1a3815144b4..8ad1ab74f221d 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -292,8 +292,8 @@ static Value *GetOrInsertAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) {
   P->setConstant(true);
   P->setExternallyInitialized(true);
 
-  return CGF.Builder.CreateLoad(RawAddress(P, PTy, CharUnits::One(),
-                                           KnownNonNull));
+  return CGF.Builder.CreateLoad(
+      RawAddress(P, PTy, CharUnits::One(), KnownNonNull));
 }
 
 Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
@@ -600,7 +600,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_processor_is: {
     assert(CGM.getTriple().isSPIRV() &&
            "__builtin_amdgcn_processor_is should never reach CodeGen for "
-             "concrete targets!");
+           "concrete targets!");
     StringRef Proc = cast<clang::StringLiteral>(E->getArg(0))->getString();
     return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.is." + Proc);
   }
@@ -609,7 +609,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
            "__builtin_amdgcn_is_invocable should never reach CodeGen for "
            "concrete targets!");
     auto FD = cast<FunctionDecl>(
-      cast<DeclRefExpr>(E->getArg(0))->getReferencedDeclOfCallee());
+        cast<DeclRefExpr>(E->getArg(0))->getReferencedDeclOfCallee());
     StringRef RF =
         getContext().BuiltinInfo.getRequiredFeatures(FD->getBuiltinID());
     return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.has." + RF);
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 24f5262ab3cf4..bd0183ae4fb82 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -20549,14 +20549,16 @@ static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) {
         (!Sema.getASTContext().getAuxTargetInfo() ||
          !Sema.getASTContext().getAuxTargetInfo()->isValidCPUName(N))) {
       Sema.Diag(CE->getExprLoc(),
-                diag::err_amdgcn_processor_is_arg_invalid_value) << N;
+                diag::err_amdgcn_processor_is_arg_invalid_value)
+          << N;
       return false;
     }
   } else {
     auto Arg = CE->getArg(0);
     if (!Arg || Arg->getType() != Sema.getASTContext().BuiltinFnTy) {
       Sema.Diag(CE->getExprLoc(),
-                diag::err_amdgcn_is_invocable_arg_invalid_value) << Arg;
+                diag::err_amdgcn_is_invocable_arg_invalid_value)
+          << Arg;
       return false;
     }
   }
@@ -20568,10 +20570,9 @@ static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) {
   if (auto UO = dyn_cast<UnaryOperator>(E)) {
     auto SE = dyn_cast<CallExpr>(UO->getSubExpr());
     if (IsAMDGPUPredicateBI(SE)) {
-      assert(
-        UO->getOpcode() == UnaryOperator::Opcode::UO_LNot &&
-        "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
-          "can only be used as operands of logical ops!");
+      assert(UO->getOpcode() == UnaryOperator::Opcode::UO_LNot &&
+             "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
+             "can only be used as operands of logical ops!");
 
       if (!ValidateAMDGPUPredicateBI(Sema, SE)) {
         Invalid = true;
@@ -20588,10 +20589,9 @@ static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) {
     auto LHS = dyn_cast<CallExpr>(BO->getLHS());
     auto RHS = dyn_cast<CallExpr>(BO->getRHS());
     if (IsAMDGPUPredicateBI(LHS) && IsAMDGPUPredicateBI(RHS)) {
-      assert(
-          BO->isLogicalOp() &&
-          "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
-            "can only be used as operands of logical ops!");
+      assert(BO->isLogicalOp() &&
+             "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
+             "can only be used as operands of logical ops!");
 
       if (!ValidateAMDGPUPredicateBI(Sema, LHS) ||
           !ValidateAMDGPUPredicateBI(Sema, RHS)) {

>From 3421292b6e3261410734fb5a324f7dec79080fc1 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 03:42:24 +0100
Subject: [PATCH 03/69] Fix broken patch merge.

---
 .../AMDGPU/AMDGPUExpandFeaturePredicates.cpp  | 159 ++++++++++++++
 .../AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp   | 207 ------------------
 2 files changed, 159 insertions(+), 207 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
 delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
new file mode 100644
index 0000000000000..17357c452b6d3
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -0,0 +1,159 @@
+//===- AMDGPUExpandFeaturePredicates.cpp - Feature Predicate Expander Pass ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file implements a pass that deals with expanding AMDGCN generic feature
+// predicates into target specific quantities / sequences. In this context, a
+// generic feature predicate is an implementation detail global variable that
+// is inserted by the FE as a consequence of using either the __builtin_cpu_is
+// or the __builtin_amdgcn_is_invocable special builtins on an abstract target
+// (AMDGCNSPIRV). These placeholder globals are used to guide target specific
+// lowering, once the concrete target is known, by way of constant folding their
+// value all the way into a terminator (i.e. a controlled block) or into a no
+// live use scenario. The pass makes a best effort attempt to look through
+// calls, i.e. a constant evaluatable passthrough of a predicate value will
+// generally work, however we hard fail if the folding fails, to avoid obtuse
+// BE errors or opaque run time errors. This pass should run as early as
+// possible / immediately after Clang CodeGen, so that the optimisation pipeline
+// and the BE operate with concrete target data.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#include <string>
+#include <utility>
+
+using namespace llvm;
+
+namespace {
+template <typename C>
+void collectUsers(Value *V, C &Container) {
+  assert(V && "Must pass an existing Value!");
+
+  for (auto &&U : V->users())
+    if (auto I = dyn_cast<Instruction>(U))
+      Container.insert(Container.end(), I);
+}
+
+inline void setPredicate(const GCNSubtarget &ST, GlobalVariable *P) {
+  const auto IsFeature = P->getName().starts_with("llvm.amdgcn.has");
+  const auto Offset =
+      IsFeature ? sizeof("llvm.amdgcn.has") : sizeof("llvm.amdgcn.is");
+
+  auto PV = P->getName().substr(Offset).str();
+  if (IsFeature) {
+    auto Dx = PV.find(',');
+    while (Dx != std::string::npos) {
+      PV.insert(++Dx, {'+'});
+
+      Dx = PV.find(',', Dx);
+    }
+    PV.insert(PV.cbegin(), '+');
+  }
+
+  auto PTy = P->getValueType();
+  P->setLinkage(GlobalValue::PrivateLinkage);
+  P->setExternallyInitialized(false);
+
+  if (IsFeature)
+    P->setInitializer(ConstantInt::getBool(PTy, ST.checkFeatures(PV)));
+  else
+    P->setInitializer(ConstantInt::getBool(PTy, PV == ST.getCPU()));
+}
+
+std::pair<PreservedAnalyses, bool>
+unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) {
+  std::string W;
+  raw_string_ostream OS(W);
+
+  OS << "Impossible to constant fold feature predicate: " << *P
+     << " used by " << *NoFold << ", please simplify.\n";
+
+  Caller->getContext().diagnose(
+      DiagnosticInfoUnsupported(*Caller, W, NoFold->getDebugLoc(), DS_Error));
+
+  return {PreservedAnalyses::none(), false};
+}
+
+std::pair<PreservedAnalyses, bool>
+handlePredicate(const GCNSubtarget &ST, GlobalVariable *P) {
+  setPredicate(ST, P);
+
+  SmallPtrSet<Instruction *, 32> ToFold;
+  collectUsers(P, ToFold);
+
+  if (ToFold.empty())
+    return {PreservedAnalyses::all(), true};
+
+  do {
+    auto I = *ToFold.begin();
+    ToFold.erase(I);
+
+    if (auto C = ConstantFoldInstruction(I, P->getDataLayout())) {
+      collectUsers(I, ToFold);
+      I->replaceAllUsesWith(C);
+      I->eraseFromParent();
+      continue;
+    } else if (I->isTerminator() && ConstantFoldTerminator(I->getParent())) {
+      continue;
+    } else if (I->users().empty()) {
+      continue;
+    }
+
+    return unfoldableFound(I->getParent()->getParent(), P, I);
+  } while (!ToFold.empty());
+
+  return {PreservedAnalyses::none(), true};
+}
+} // Unnamed namespace.
+
+PreservedAnalyses
+AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) {
+  if (M.empty())
+    return PreservedAnalyses::all();
+
+  SmallVector<GlobalVariable *> Predicates;
+  for (auto &&G : M.globals()) {
+    if (!G.isDeclaration() || !G.hasName())
+      continue;
+    if (G.getName().starts_with("llvm.amdgcn."))
+      Predicates.push_back(&G);
+  }
+
+  if (Predicates.empty())
+    return PreservedAnalyses::all();
+
+  const auto &ST = TM.getSubtarget<GCNSubtarget>(
+      *find_if(M, [](auto &&F) { return !F.isIntrinsic(); }));
+
+  auto Ret = PreservedAnalyses::all();
+  for (auto &&P : Predicates) {
+    auto R = handlePredicate(ST, P);
+
+    if (!R.second)
+      break;
+
+    Ret.intersect(R.first);
+  }
+
+  for (auto &&P : Predicates)
+    P->eraseFromParent();
+
+  return Ret;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
deleted file mode 100644
index 125051c6aa0cf..0000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
+++ /dev/null
@@ -1,207 +0,0 @@
-//===- AMDGPUExpandPseudoIntrinsics.cpp - Pseudo Intrinsic Expander Pass --===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// This file implements a pass that deals with expanding AMDGCN generic pseudo-
-// intrinsics into target specific quantities / sequences. In this context, a
-// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a
-// specific instruction, but rather is intended as a mechanism for abstractly
-// conveying target specific info to a HLL / the FE, without concretely
-// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize.
-// This pass should run as early as possible / immediately after Clang CodeGen,
-// so that the optimisation pipeline and the BE operate with concrete target
-// data.
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
-#include "GCNSubtarget.h"
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Target/TargetIntrinsicInfo.h"
-#include "llvm/Transforms/IPO/AlwaysInliner.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-#include <string>
-#include <utility>
-
-using namespace llvm;
-
-namespace {
-inline Function *getCloneForInlining(Function *OldF) {
-  assert(OldF && "Must pass an existing Function!");
-
-  // TODO - Alias Value to clone arg.
-  ValueToValueMapTy VMap;
-
-  auto NewF = CloneFunction(OldF, VMap);
-
-  NewF->removeFnAttr(Attribute::OptimizeNone);
-  NewF->removeFnAttr(Attribute::NoInline);
-  NewF->addFnAttr(Attribute::AlwaysInline);
-
-  return NewF;
-}
-
-template <typename C>
-inline void collectUsers(Value *V, ModulePassManager &AlwaysInliner,
-                         ModuleAnalysisManager &MAM,
-                         SmallDenseMap<Function *, Function *> &InlinableClones,
-                         C &Container) {
-  assert(V && "Must pass an existing Value!");
-
-  auto A = PreservedAnalyses::all();
-
-  constexpr auto IsValidCall = [](auto &&U) {
-    if (auto CB = dyn_cast<CallBase>(U))
-      if (auto F = CB->getCalledFunction())
-        if (!F->isIntrinsic() && !F->isDeclaration())
-          return true;
-    return false;
-  };
-
-  SmallVector<User *> Calls{};
-  copy_if(V->users(), std::back_inserter(Calls), IsValidCall);
-
-  while (!Calls.empty()) {
-    for (auto &&Call : Calls) {
-      auto CB = cast<CallBase>(Call);
-      auto &TempF = InlinableClones[CB->getCalledFunction()];
-
-      if (!TempF)
-        TempF = getCloneForInlining(CB->getCalledFunction());
-
-      CB->setCalledFunction(TempF);
-      CB->removeFnAttr(Attribute::NoInline);
-      CB->addFnAttr(Attribute::AlwaysInline);
-
-      AlwaysInliner.run(*TempF->getParent(), MAM);
-    }
-
-    Calls.clear();
-
-    copy_if(V->users(), std::back_inserter(Calls), IsValidCall);
-  }
-
-  for (auto &&U : V->users())
-    if (auto I = dyn_cast<Instruction>(U)) {
-      if (auto CB = dyn_cast<CallBase>(I)) {
-        if (CB->getCalledFunction() && !CB->getCalledFunction()->isIntrinsic())
-          Container.insert(Container.end(), I);
-      } else {
-        Container.insert(Container.end(), I);
-      }
-    }
-}
-
-std::pair<PreservedAnalyses, bool>
-handlePredicate(const GCNSubtarget &ST, ModuleAnalysisManager &MAM,
-                SmallDenseMap<Function *, Function *>& InlinableClones,
-                GlobalVariable *P) {
-  auto PV = P->getName().substr(P->getName().rfind('.') + 1).str();
-  auto Dx = PV.find(',');
-  while (Dx != std::string::npos) {
-    PV.insert(++Dx, {'+'});
-
-    Dx = PV.find(',', Dx);
-  }
-
-  auto PTy = P->getValueType();
-  P->setLinkage(GlobalValue::PrivateLinkage);
-  P->setExternallyInitialized(false);
-
-  if (P->getName().starts_with("llvm.amdgcn.is"))
-    P->setInitializer(ConstantInt::getBool(PTy, PV == ST.getCPU()));
-  else
-    P->setInitializer(ConstantInt::getBool(PTy, ST.checkFeatures('+' + PV)));
-
-  ModulePassManager MPM;
-  MPM.addPass(AlwaysInlinerPass());
-
-  SmallPtrSet<Instruction *, 32> ToFold;
-  collectUsers(P, MPM, MAM, InlinableClones, ToFold);
-
-  if (ToFold.empty())
-    return {PreservedAnalyses::all(), true};
-
-  do {
-    auto I = *ToFold.begin();
-    ToFold.erase(I);
-
-    if (auto C = ConstantFoldInstruction(I, P->getDataLayout())) {
-      collectUsers(I, MPM, MAM, InlinableClones, ToFold);
-      I->replaceAllUsesWith(C);
-      I->eraseFromParent();
-      continue;
-    } else if (I->isTerminator() && ConstantFoldTerminator(I->getParent())) {
-      continue;
-    } else if (I->users().empty()) {
-      continue;
-    }
-
-    std::string W;
-    raw_string_ostream OS(W);
-
-    auto Caller = I->getParent()->getParent();
-
-    OS << "Impossible to constant fold feature predicate: " << P->getName()
-       << ", please simplify.\n";
-
-    Caller->getContext().diagnose(
-        DiagnosticInfoUnsupported(*Caller, W, I->getDebugLoc(), DS_Error));
-
-    return {PreservedAnalyses::none(), false};
-  } while (!ToFold.empty());
-
-  return {PreservedAnalyses::none(), true};
-}
-} // Unnamed namespace.
-
-PreservedAnalyses
-AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &MAM) {
-  if (M.empty())
-    return PreservedAnalyses::all();
-
-  SmallVector<GlobalVariable *> Predicates;
-  for (auto &&G : M.globals()) {
-    if (!G.isDeclaration() || !G.hasName())
-      continue;
-    if (G.getName().starts_with("llvm.amdgcn."))
-      Predicates.push_back(&G);
-  }
-
-  if (Predicates.empty())
-    return PreservedAnalyses::all();
-
-  PreservedAnalyses Ret = PreservedAnalyses::all();
-
-  SmallDenseMap<Function *, Function *> InlinableClones;
-  const auto &ST = TM.getSubtarget<GCNSubtarget>(
-      *find_if(M, [](auto &&F) { return !F.isIntrinsic(); }));
-
-  for (auto &&P : Predicates) {
-    auto R = handlePredicate(ST, MAM, InlinableClones, P);
-
-    if (!R.second)
-      return PreservedAnalyses::none();
-
-    Ret.intersect(R.first);
-  }
-
-  for (auto &&C : InlinableClones)
-    C.second->eraseFromParent();
-
-  return Ret;
-}

>From 539c7e6c6357fa7330de9e23fa13cf795061b85b Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 03:51:08 +0100
Subject: [PATCH 04/69] Add release notes.

---
 clang/docs/ReleaseNotes.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c4e82678949ff..005b33da29d2d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -418,6 +418,10 @@ AMDGPU Support
 ^^^^^^^^^^^^^^
 
 - Bump the default code object version to 6. ROCm 6.3 is required to run any program compiled with COV6.
+- Introduced a new target specific builtin ``__builtin_amdgcn_processor_is``,
+  a late / deferred query for the current target processor
+- Introduced a new target specific builtin ``__builtin_amdgcn_is_invocable``,
+  which enables fine-grained, per-builtin, feature availability
 
 NVPTX Support
 ^^^^^^^^^^^^^^

>From 5926b9f715fce59e753756f5330f311e3f916667 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 03:55:39 +0100
Subject: [PATCH 05/69] (Hopefully) Final format fix.

---
 .../Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp   | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
index 17357c452b6d3..8d38508eda74b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -42,8 +42,7 @@
 using namespace llvm;
 
 namespace {
-template <typename C>
-void collectUsers(Value *V, C &Container) {
+template <typename C> void collectUsers(Value *V, C &Container) {
   assert(V && "Must pass an existing Value!");
 
   for (auto &&U : V->users())
@@ -82,8 +81,8 @@ unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) {
   std::string W;
   raw_string_ostream OS(W);
 
-  OS << "Impossible to constant fold feature predicate: " << *P
-     << " used by " << *NoFold << ", please simplify.\n";
+  OS << "Impossible to constant fold feature predicate: " << *P  << " used by "
+     << *NoFold << ", please simplify.\n";
 
   Caller->getContext().diagnose(
       DiagnosticInfoUnsupported(*Caller, W, NoFold->getDebugLoc(), DS_Error));
@@ -91,8 +90,8 @@ unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) {
   return {PreservedAnalyses::none(), false};
 }
 
-std::pair<PreservedAnalyses, bool>
-handlePredicate(const GCNSubtarget &ST, GlobalVariable *P) {
+std::pair<PreservedAnalyses, bool> handlePredicate(const GCNSubtarget &ST,
+                                                   GlobalVariable *P) {
   setPredicate(ST, P);
 
   SmallPtrSet<Instruction *, 32> ToFold;

>From 4381d930084f38d9e4099d8c8fbea0e4267556a9 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 04:01:27 +0100
Subject: [PATCH 06/69] Remove stray space.

---
 llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
index 8d38508eda74b..6d6c457170f8c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -81,7 +81,7 @@ unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) {
   std::string W;
   raw_string_ostream OS(W);
 
-  OS << "Impossible to constant fold feature predicate: " << *P  << " used by "
+  OS << "Impossible to constant fold feature predicate: " << *P << " used by "
      << *NoFold << ", please simplify.\n";
 
   Caller->getContext().diagnose(

>From d18f64e455f0d3b91c013bd0d99e895adc57fcad Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 11:01:59 +0100
Subject: [PATCH 07/69] Remove unused header, fix borked test.

---
 ...pu-builtin-cpu-is.c => amdgpu-builtin-processor-is.c} | 9 +++------
 llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 1 -
 2 files changed, 3 insertions(+), 7 deletions(-)
 rename clang/test/CodeGen/{amdgpu-builtin-cpu-is.c => amdgpu-builtin-processor-is.c} (92%)

diff --git a/clang/test/CodeGen/amdgpu-builtin-cpu-is.c b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
similarity index 92%
rename from clang/test/CodeGen/amdgpu-builtin-cpu-is.c
rename to clang/test/CodeGen/amdgpu-builtin-processor-is.c
index 6e261d9f5d239..f5d80bff1c51e 100644
--- a/clang/test/CodeGen/amdgpu-builtin-cpu-is.c
+++ b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
@@ -10,10 +10,6 @@
 //    3) for AMDGCNSPIRV we emit llvm.amdgcn.is.gfx900 as a bool global, and
 //       load from it to provide the condition a br (abstract target)
 //.
-// AMDGCN-GFX900: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600
-//.
-// AMDGCN-GFX1010: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600
-//.
 // AMDGCNSPIRV: @llvm.amdgcn.is.gfx900 = external addrspace(1) externally_initialized constant i1
 //.
 // AMDGCN-GFX900-LABEL: define dso_local void @foo(
@@ -31,7 +27,8 @@
 // AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] {
 // AMDGCNSPIRV-NEXT:  [[ENTRY:.*:]]
 // AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx900, align 1
-// AMDGCNSPIRV-NEXT:    br i1 [[TMP0]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// AMDGCNSPIRV-NEXT:    [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false
+// AMDGCNSPIRV-NEXT:    br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
 // AMDGCNSPIRV:       [[IF_THEN]]:
 // AMDGCNSPIRV-NEXT:    call addrspace(4) void @llvm.trap()
 // AMDGCNSPIRV-NEXT:    br label %[[IF_END]]
@@ -39,7 +36,7 @@
 // AMDGCNSPIRV-NEXT:    ret void
 //
 void foo() {
-    if (__builtin_cpu_is("gfx900"))
+    if (__builtin_amdgcn_processor_is("gfx900"))
         return __builtin_trap();
 }
 //.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
index 6d6c457170f8c..ae100e2f5b213 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -33,7 +33,6 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
-#include "llvm/Target/TargetIntrinsicInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
 
 #include <string>

>From 7880ff498495511c70952c0a135b5e9f9b837889 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 15:09:48 +0100
Subject: [PATCH 08/69] Stars everywhere.

---
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp   |  4 +--
 clang/lib/Sema/SemaExpr.cpp                   | 30 +++++++++----------
 .../AMDGPU/AMDGPUExpandFeaturePredicates.cpp  |  8 ++---
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 8ad1ab74f221d..179b9ad02177b 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -287,7 +287,7 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
 static Value *GetOrInsertAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) {
   auto PTy = IntegerType::getInt1Ty(CGF.getLLVMContext());
 
-  auto P = cast<GlobalVariable>(
+  auto *P = cast<GlobalVariable>(
       CGF.CGM.getModule().getOrInsertGlobal(Name.str(), PTy));
   P->setConstant(true);
   P->setExternallyInitialized(true);
@@ -608,7 +608,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     assert(CGM.getTriple().isSPIRV() &&
            "__builtin_amdgcn_is_invocable should never reach CodeGen for "
            "concrete targets!");
-    auto FD = cast<FunctionDecl>(
+    auto *FD = cast<FunctionDecl>(
         cast<DeclRefExpr>(E->getArg(0))->getReferencedDeclOfCallee());
     StringRef RF =
         getContext().BuiltinInfo.getRequiredFeatures(FD->getBuiltinID());
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index bd0183ae4fb82..44fd9aa1f1834 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -6546,11 +6546,11 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc,
   // without any additional checking.
   if (Fn->getType() == Context.BuiltinFnTy && ArgExprs.size() == 1 &&
       ArgExprs[0]->getType() == Context.BuiltinFnTy) {
-    auto FD = cast<FunctionDecl>(Fn->getReferencedDeclOfCallee());
+    auto *FD = cast<FunctionDecl>(Fn->getReferencedDeclOfCallee());
 
     if (FD->getName() == "__builtin_amdgcn_is_invocable") {
       auto FnPtrTy = Context.getPointerType(FD->getType());
-      auto R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get();
+      auto *R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get();
       return CallExpr::Create(Context, R, ArgExprs, Context.VoidTy,
                               ExprValueKind::VK_PRValue, RParenLoc,
                               FPOptionsOverride());
@@ -13254,8 +13254,8 @@ static inline bool IsAMDGPUPredicateBI(Expr *E) {
   if (!E->getType()->isVoidType())
     return false;
 
-  if (auto CE = dyn_cast<CallExpr>(E)) {
-    if (auto BI = CE->getDirectCallee())
+  if (auto *CE = dyn_cast<CallExpr>(E)) {
+    if (auto *BI = CE->getDirectCallee())
       if (BI->getName() == "__builtin_amdgcn_processor_is" ||
           BI->getName() == "__builtin_amdgcn_is_invocable")
         return true;
@@ -15622,14 +15622,14 @@ static Expr *ExpandAMDGPUPredicateBI(ASTContext &Ctx, CallExpr *CE) {
   auto &TI = Ctx.getTargetInfo();
 
   if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") {
-    auto GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts());
+    auto *GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts());
     auto TID = TI.getTargetID();
     if (GFX && TID) {
       auto N = GFX->getString();
       P = TI.isValidCPUName(GFX->getString()) && TID->find(N) == 0;
     }
   } else {
-    auto FD = cast<FunctionDecl>(CE->getArg(0)->getReferencedDeclOfCallee());
+    auto *FD = cast<FunctionDecl>(CE->getArg(0)->getReferencedDeclOfCallee());
 
     StringRef RF = Ctx.BuiltinInfo.getRequiredFeatures(FD->getBuiltinID());
     llvm::StringMap<bool> CF;
@@ -20538,7 +20538,7 @@ void Sema::DiagnoseEqualityWithExtraParens(ParenExpr *ParenE) {
 
 static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) {
   if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") {
-    auto GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts());
+    auto *GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts());
     if (!GFX) {
       Sema.Diag(CE->getExprLoc(),
                 diag::err_amdgcn_processor_is_arg_not_literal);
@@ -20554,7 +20554,7 @@ static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) {
       return false;
     }
   } else {
-    auto Arg = CE->getArg(0);
+    auto *Arg = CE->getArg(0);
     if (!Arg || Arg->getType() != Sema.getASTContext().BuiltinFnTy) {
       Sema.Diag(CE->getExprLoc(),
                 diag::err_amdgcn_is_invocable_arg_invalid_value)
@@ -20567,8 +20567,8 @@ static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) {
 }
 
 static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) {
-  if (auto UO = dyn_cast<UnaryOperator>(E)) {
-    auto SE = dyn_cast<CallExpr>(UO->getSubExpr());
+  if (auto *UO = dyn_cast<UnaryOperator>(E)) {
+    auto *SE = dyn_cast<CallExpr>(UO->getSubExpr());
     if (IsAMDGPUPredicateBI(SE)) {
       assert(UO->getOpcode() == UnaryOperator::Opcode::UO_LNot &&
              "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
@@ -20585,9 +20585,9 @@ static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) {
       return UO;
     }
   }
-  if (auto BO = dyn_cast<BinaryOperator>(E)) {
-    auto LHS = dyn_cast<CallExpr>(BO->getLHS());
-    auto RHS = dyn_cast<CallExpr>(BO->getRHS());
+  if (auto *BO = dyn_cast<BinaryOperator>(E)) {
+    auto *LHS = dyn_cast<CallExpr>(BO->getLHS());
+    auto *RHS = dyn_cast<CallExpr>(BO->getRHS());
     if (IsAMDGPUPredicateBI(LHS) && IsAMDGPUPredicateBI(RHS)) {
       assert(BO->isLogicalOp() &&
              "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
@@ -20606,7 +20606,7 @@ static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) {
       return BO;
     }
   }
-  if (auto CE = dyn_cast<CallExpr>(E))
+  if (auto *CE = dyn_cast<CallExpr>(E))
     if (IsAMDGPUPredicateBI(CE)) {
       if (!ValidateAMDGPUPredicateBI(Sema, CE)) {
         Invalid = true;
@@ -20631,7 +20631,7 @@ ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E,
   if (!E->isTypeDependent()) {
     if (E->getType()->isVoidType()) {
       bool IsInvalidPredicate = false;
-      if (auto BIC = MaybeHandleAMDGPUPredicateBI(*this, E, IsInvalidPredicate))
+      if (auto *BIC = MaybeHandleAMDGPUPredicateBI(*this, E, IsInvalidPredicate))
         return BIC;
       else if (IsInvalidPredicate)
         return ExprError();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
index ae100e2f5b213..f1c73e86fb4a0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -45,7 +45,7 @@ template <typename C> void collectUsers(Value *V, C &Container) {
   assert(V && "Must pass an existing Value!");
 
   for (auto &&U : V->users())
-    if (auto I = dyn_cast<Instruction>(U))
+    if (auto *I = dyn_cast<Instruction>(U))
       Container.insert(Container.end(), I);
 }
 
@@ -65,7 +65,7 @@ inline void setPredicate(const GCNSubtarget &ST, GlobalVariable *P) {
     PV.insert(PV.cbegin(), '+');
   }
 
-  auto PTy = P->getValueType();
+  auto *PTy = P->getValueType();
   P->setLinkage(GlobalValue::PrivateLinkage);
   P->setExternallyInitialized(false);
 
@@ -100,10 +100,10 @@ std::pair<PreservedAnalyses, bool> handlePredicate(const GCNSubtarget &ST,
     return {PreservedAnalyses::all(), true};
 
   do {
-    auto I = *ToFold.begin();
+    auto *I = *ToFold.begin();
     ToFold.erase(I);
 
-    if (auto C = ConstantFoldInstruction(I, P->getDataLayout())) {
+    if (auto *C = ConstantFoldInstruction(I, P->getDataLayout())) {
       collectUsers(I, ToFold);
       I->replaceAllUsesWith(C);
       I->eraseFromParent();

>From 719dfdea50ae31ac54040a95d499dae98f714a52 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 15:33:31 +0100
Subject: [PATCH 09/69] Fix format without line break.

---
 clang/lib/Sema/SemaExpr.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 44fd9aa1f1834..889d54be8d91b 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -20630,10 +20630,10 @@ ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E,
 
   if (!E->isTypeDependent()) {
     if (E->getType()->isVoidType()) {
-      bool IsInvalidPredicate = false;
-      if (auto *BIC = MaybeHandleAMDGPUPredicateBI(*this, E, IsInvalidPredicate))
+      bool InvalidPredicate = false;
+      if (auto *BIC = MaybeHandleAMDGPUPredicateBI(*this, E, InvalidPredicate))
         return BIC;
-      else if (IsInvalidPredicate)
+      else if (InvalidPredicate)
         return ExprError();
     }
 

>From 36b69b41f9d92901b1799bd8515ef4d8c9a41f51 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 15:40:37 +0100
Subject: [PATCH 10/69] Add host tests.

---
 clang/test/SemaHIP/amdgpu-is-invocable.hip | 21 +++++++++++++++++++++
 clang/test/SemaHIP/amdgpu-processor-is.hip | 21 +++++++++++++++++++++
 2 files changed, 42 insertions(+)
 create mode 100644 clang/test/SemaHIP/amdgpu-is-invocable.hip
 create mode 100644 clang/test/SemaHIP/amdgpu-processor-is.hip

diff --git a/clang/test/SemaHIP/amdgpu-is-invocable.hip b/clang/test/SemaHIP/amdgpu-is-invocable.hip
new file mode 100644
index 0000000000000..214d7769a595f
--- /dev/null
+++ b/clang/test/SemaHIP/amdgpu-is-invocable.hip
@@ -0,0 +1,21 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: spirv-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s
+
+// expected-no-diagnostics
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ void foo() {
+    if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16))
+        return __builtin_trap();
+}
+
+__global__ void bar() {
+    if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16))
+        return __builtin_trap();
+}
diff --git a/clang/test/SemaHIP/amdgpu-processor-is.hip b/clang/test/SemaHIP/amdgpu-processor-is.hip
new file mode 100644
index 0000000000000..0f7211fd75d90
--- /dev/null
+++ b/clang/test/SemaHIP/amdgpu-processor-is.hip
@@ -0,0 +1,21 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: spirv-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s
+
+// expected-no-diagnostics
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ void foo() {
+    if (__builtin_amdgcn_processor_is("gfx900"))
+        return __builtin_trap();
+}
+
+__global__ void bar() {
+    if (__builtin_amdgcn_processor_is("gfx900"))
+        return __builtin_trap();
+}

>From e327e1520b2453e69d888d1be3d5c68c40a0456a Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 16:48:04 +0100
Subject: [PATCH 11/69] Fit code examples within 80-char limit.

---
 clang/docs/LanguageExtensions.rst | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 8a7cb75af13e5..817f6a62f6a41 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -4956,7 +4956,9 @@ a functional mechanism for programmatically querying:
 
   while (__builtin_amdgcn_processor_is("gfx1101")) *p += x;
 
-  do { *p -= x; } while (__builtin_amdgcn_processor_is("gfx1010"));
+  do {
+    *p -= x;
+  } while (__builtin_amdgcn_processor_is("gfx1010"));
 
   for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break;
 
@@ -4967,9 +4969,11 @@ a functional mechanism for programatically querying:
 
   do {
     *p -= x;
-  } while (__builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32));
+  } while (
+      __builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32));
 
-  for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p) break;
+  for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p)
+    break;
 
 **Description**:
 

>From 888a0803db90e38a6d912b7d019b27196eee3bf3 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 16 Apr 2025 03:35:14 +0300
Subject: [PATCH 12/69] Fix tests.

---
 clang/test/CodeGen/amdgpu-builtin-is-invocable.c           | 2 +-
 clang/test/CodeGen/amdgpu-builtin-processor-is.c           | 2 +-
 clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
index 6d2690cb75b7c..12f283707308e 100644
--- a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
+++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
@@ -47,7 +47,7 @@ void foo() {
 // AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
 // AMDGCN-GFX1010: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
 //.
-// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" }
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" }
 // AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
 //.
 // AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
diff --git a/clang/test/CodeGen/amdgpu-builtin-processor-is.c b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
index f5d80bff1c51e..76dead8ebbe89 100644
--- a/clang/test/CodeGen/amdgpu-builtin-processor-is.c
+++ b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
@@ -45,7 +45,7 @@ void foo() {
 //.
 // AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
 //.
-// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" }
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" }
 // AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
 //.
 // AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
diff --git a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
index f618f54909b00..26cc8b4c7631d 100644
--- a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
+++ b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
@@ -32,11 +32,11 @@ void invalid_invocations(int x, const char* str) {
     // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a string literal
     if (__builtin_amdgcn_processor_is(str)) return;
 
-    // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `"__builtin_amdgcn_s_sleep_var"` is not valid
+    // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}__builtin_amdgcn_s_sleep_var{{.*}} is not valid
     if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return;
-    // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `str` is not valid
+    // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}str{{.*}} is not valid
     else if (__builtin_amdgcn_is_invocable(str)) return;
-    // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `x` is not valid
+    // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}x{{.*}} is not valid
     else if (__builtin_amdgcn_is_invocable(x)) return;
     // CHECK: error: use of undeclared identifier '__builtin_ia32_pause'
     else if (__builtin_amdgcn_is_invocable(__builtin_ia32_pause)) return;

>From e35ac6281f1b22539e4771dfd2893bdabeb452b6 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 16 Apr 2025 15:31:30 +0300
Subject: [PATCH 13/69] Fix test.

---
 .../CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp b/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp
index 44557284fc581..cffd3c7a5fb1f 100644
--- a/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp
+++ b/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp
@@ -7,7 +7,7 @@
 // HIPSTDPAR-PRE: Running pass: EntryExitInstrumenterPass
 // HIPSTDPAR-PRE-NEXT: Running pass: EntryExitInstrumenterPass
 // HIPSTDPAR-PRE-NOT: Running pass: HipStdParAcceleratorCodeSelectionPass
-// HIPSTDPAR-PRE-NEXT: Running pass: AlwaysInlinerPass
+// HIPSTDPAR-PRE-NEXT: Running pass: AMDGPUExpandFeaturePredicatesPass
 
 // Ensure Pass HipStdParAcceleratorCodeSelectionPass is invoked in PostLink.
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -mllvm -amdgpu-enable-hipstdpar -fcuda-is-device -fdebug-pass-manager -emit-llvm \

>From a8bca2fe2c054187981afcfca155e95efde26447 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Tue, 6 May 2025 01:47:53 +0100
Subject: [PATCH 14/69] Re-work implementation to return a target specific
 type.

---
 clang/docs/LanguageExtensions.rst             |  61 ++------
 clang/include/clang/Basic/AMDGPUTypes.def     |   8 +
 clang/include/clang/Basic/Builtins.def        |   1 +
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   4 +-
 .../clang/Basic/DiagnosticSemaKinds.td        |   9 +-
 clang/include/clang/Sema/SemaAMDGPU.h         |   4 +
 clang/lib/AST/ASTContext.cpp                  |  11 +-
 clang/lib/CodeGen/CGDebugInfo.cpp             |   7 +
 clang/lib/CodeGen/CGExprScalar.cpp            |   4 +
 clang/lib/CodeGen/CodeGenTypes.cpp            |   3 +
 clang/lib/Sema/Sema.cpp                       |   7 +-
 clang/lib/Sema/SemaAMDGPU.cpp                 |  60 +++++++
 clang/lib/Sema/SemaCast.cpp                   |   9 ++
 clang/lib/Sema/SemaDecl.cpp                   |  15 ++
 clang/lib/Sema/SemaExpr.cpp                   | 147 +-----------------
 clang/lib/Sema/SemaInit.cpp                   |  16 ++
 clang/lib/Sema/SemaOverload.cpp               |  14 +-
 .../amdgpu-feature-builtins-invalid-use.cpp   |  41 +++--
 18 files changed, 209 insertions(+), 212 deletions(-)

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 0c8dd564aed4a..da8b16501d00a 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -4950,12 +4950,8 @@ a functional mechanism for programmatically querying:
 
 .. code-block:: c
 
-  // When used as the predicate for a control structure
-  bool __builtin_amdgcn_processor_is(const char*);
-  bool __builtin_amdgcn_is_invocable(builtin_name);
-  // Otherwise
-  void __builtin_amdgcn_processor_is(const char*);
-  void __builtin_amdgcn_is_invocable(void);
+  __amdgpu_feature_predicate_t __builtin_amdgcn_processor_is(const char*);
+  __amdgpu_feature_predicate_t __builtin_amdgcn_is_invocable(builtin_name);
 
 **Example of use**:
 
@@ -4974,7 +4970,7 @@ a functional mechanism for programatically querying:
   while (__builtin_amdgcn_processor_is("gfx1101")) *p += x;
 
   do {
-    *p -= x;
+    break;
   } while (__builtin_amdgcn_processor_is("gfx1010"));
 
   for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break;
@@ -4985,7 +4981,7 @@ a functional mechanism for programatically querying:
     __builtin_amdgcn_s_ttracedata_imm(1);
 
   do {
-    *p -= x;
+    break;
   } while (
       __builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32));
 
@@ -4994,17 +4990,21 @@ a functional mechanism for programatically querying:
 
 **Description**:
 
-When used as the predicate value of the following control structures:
+The builtins return a value of type ``__amdgpu_feature_predicate_t``, which is a
+target-specific type that behaves as if its C++ definition were the following:
 
 .. code-block:: c++
 
-  if (...)
-  while (...)
-  do { } while (...)
-  for (...)
+  struct __amdgpu_feature_predicate_t {
+    __amdgpu_feature_predicate_t() = delete;
+    __amdgpu_feature_predicate_t(const __amdgpu_feature_predicate_t&) = delete;
+    __amdgpu_feature_predicate_t(__amdgpu_feature_predicate_t&&) = delete;
+
+    explicit
+    operator bool() const noexcept;
+  };
 
-be it directly, or as arguments to logical operators such as ``!, ||, &&``, the
-builtins return a boolean value that:
+The boolean interpretation of the predicate values returned by the builtins:
 
 * indicates whether the current target matches the argument; the argument MUST
   be a string literal and a valid AMDGPU target
@@ -5012,37 +5012,6 @@ builtins return a boolean value that:
   by the current target; the argument MUST be either a generic or AMDGPU
   specific builtin name
 
-Outside of these contexts, the builtins have a ``void`` returning signature
-which prevents their misuse.
-
-**Example of invalid use**:
-
-.. code-block:: c++
-
-  void kernel(int* p, int x, bool (*pfn)(bool), const char* str) {
-    if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return;
-    else if (__builtin_amdgcn_processor_is(str)) __builtin_trap();
-
-    bool a = __builtin_amdgcn_processor_is("gfx906");
-    const bool b = !__builtin_amdgcn_processor_is("gfx906");
-    const bool c = !__builtin_amdgcn_processor_is("gfx906");
-    bool d = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
-    bool e = !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
-    const auto f =
-        !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready)
-        || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
-    const auto g =
-        !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready)
-        || !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
-    __builtin_amdgcn_processor_is("gfx1201")
-      ? __builtin_amdgcn_s_sleep_var(x) : __builtin_amdgcn_s_sleep(42);
-    if (pfn(__builtin_amdgcn_processor_is("gfx1200")))
-      __builtin_amdgcn_s_sleep_var(x);
-
-    if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return;
-    else if (__builtin_amdgcn_is_invocable(x)) __builtin_trap();
-  }
-
 When invoked while compiling for a concrete target, the builtins are evaluated
 early by Clang, and never produce any CodeGen effects / have no observable
 side-effects in IR. Conversely, when compiling for AMDGCN flavoured SPIR-v,
diff --git a/clang/include/clang/Basic/AMDGPUTypes.def b/clang/include/clang/Basic/AMDGPUTypes.def
index d3dff446f9edf..a0574c640184b 100644
--- a/clang/include/clang/Basic/AMDGPUTypes.def
+++ b/clang/include/clang/Basic/AMDGPUTypes.def
@@ -20,10 +20,18 @@
   AMDGPU_TYPE(Name, Id, SingletonId, Width, Align)
 #endif
 
+#ifndef AMDGPU_FEATURE_PREDICATE_TYPE
+#define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align) \
+  AMDGPU_TYPE(Name, Id, SingletonId, Width, Align)
+#endif
+
 AMDGPU_OPAQUE_PTR_TYPE("__amdgpu_buffer_rsrc_t", AMDGPUBufferRsrc, AMDGPUBufferRsrcTy, 128, 128, 8)
 
 AMDGPU_NAMED_BARRIER_TYPE("__amdgpu_named_workgroup_barrier_t", AMDGPUNamedWorkgroupBarrier, AMDGPUNamedWorkgroupBarrierTy, 128, 32, 0)
 
+AMDGPU_FEATURE_PREDICATE_TYPE("__amdgpu_feature_predicate_t", AMDGPUFeaturePredicate, AMDGPUFeaturePredicateTy, 1, 1)
+
 #undef AMDGPU_TYPE
 #undef AMDGPU_OPAQUE_PTR_TYPE
 #undef AMDGPU_NAMED_BARRIER_TYPE
+#undef AMDGPU_FEATURE_PREDICATE_TYPE
diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def
index 48437c9397570..27f78af16fe06 100644
--- a/clang/include/clang/Basic/Builtins.def
+++ b/clang/include/clang/Basic/Builtins.def
@@ -34,6 +34,7 @@
 //  Q -> target builtin type, followed by a character to distinguish the builtin type
 //    Qa -> AArch64 svcount_t builtin type.
 //    Qb -> AMDGPU __amdgpu_buffer_rsrc_t builtin type.
+//    Qc -> AMDGPU __amdgpu_feature_predicate_t builtin type.
 //  E -> ext_vector, followed by the number of elements and the base type.
 //  X -> _Complex, followed by the base type.
 //  Y -> ptrdiff_t
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 3d53223e3a5a4..b57b315b87790 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -351,8 +351,8 @@ BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n")
 
 // These are special FE only builtins intended for forwarding the requirements
 // to the ME.
-BUILTIN(__builtin_amdgcn_processor_is, "vcC*", "nctu")
-BUILTIN(__builtin_amdgcn_is_invocable, "v", "nctu")
+BUILTIN(__builtin_amdgcn_processor_is, "QccC*", "nctu")
+BUILTIN(__builtin_amdgcn_is_invocable, "Qc", "nctu")
 
 //===----------------------------------------------------------------------===//
 // R600-NI only builtins.
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 5f118d744a6cf..e92e8cdee4b63 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -6820,7 +6820,7 @@ def err_counted_by_on_incomplete_type_on_use : Error <
 
 def note_counted_by_consider_completing_pointee_ty : Note<
   "consider providing a complete definition for %0">;
-  
+
 def note_counted_by_consider_using_sized_by : Note<
   "consider using '__sized_by%select{|_or_null}0' instead of "
   "'__counted_by%select{|_or_null}0'">;
@@ -13292,4 +13292,11 @@ def err_amdgcn_is_invocable_arg_invalid_value
     : Error<"the argument to __builtin_amdgcn_is_invocable must be either a "
             "target agnostic builtin or an AMDGCN target specific builtin; `%0`"
             " is not valid">;
+def err_amdgcn_predicate_type_is_not_constructible
+    : Error<"%0 has type __amdgpu_feature_predicate_t, which is not"
+            " constructible">;
+def err_amdgcn_predicate_type_needs_explicit_bool_cast
+    : Error<"%0 must be explicitly cast to %1; however, please note that this "
+            "is almost always an error and that it prevents the effective "
+            "guarding of target dependent code, and thus should be avoided">;
 } // end of sema component.
diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h
index d62c9bb65fadb..843a146243eae 100644
--- a/clang/include/clang/Sema/SemaAMDGPU.h
+++ b/clang/include/clang/Sema/SemaAMDGPU.h
@@ -64,6 +64,10 @@ class SemaAMDGPU : public SemaBase {
   void handleAMDGPUNumVGPRAttr(Decl *D, const ParsedAttr &AL);
   void handleAMDGPUMaxNumWorkGroupsAttr(Decl *D, const ParsedAttr &AL);
   void handleAMDGPUFlatWorkGroupSizeAttr(Decl *D, const ParsedAttr &AL);
+
+  /// Expand a valid use of the feature identification builtins into its
+  /// corresponding sequence of instructions.
+  Expr *ExpandAMDGPUPredicateBI(CallExpr *CE);
 };
 } // namespace clang
 
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index ae136ae271882..28bdb1d90bbbd 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -1477,7 +1477,12 @@ void ASTContext::InitBuiltinTypes(const TargetInfo &Target,
   }
 
   if (Target.getTriple().isAMDGPU() ||
-      (AuxTarget && AuxTarget->getTriple().isAMDGPU())) {
+      (Target.getTriple().isSPIRV() &&
+       Target.getTriple().getVendor() == llvm::Triple::AMD) ||
+      (AuxTarget &&
+       (AuxTarget->getTriple().isAMDGPU() ||
+        ((AuxTarget->getTriple().isSPIRV() &&
+          AuxTarget->getTriple().getVendor() == llvm::Triple::AMD))))) {
 #define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align)                       \
   InitBuiltinType(SingletonId, BuiltinType::Id);
 #include "clang/Basic/AMDGPUTypes.def"
@@ -12379,6 +12384,10 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context,
       Type = Context.AMDGPUBufferRsrcTy;
       break;
     }
+    case 'c': {
+      Type = Context.AMDGPUFeaturePredicateTy;
+      break;
+    }
     default:
       llvm_unreachable("Unexpected target builtin type");
     }
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index f3ec498d4064b..c68b1ce1f643d 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -919,6 +919,13 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) {
           DBuilder.createBasicType(Name, Width, llvm::dwarf::DW_ATE_unsigned); \
     return SingletonId;                                                        \
   }
+#define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align)     \
+  case BuiltinType::Id: {                                                      \
+    if (!SingletonId)                                                          \
+      SingletonId =                                                            \
+          DBuilder.createBasicType(Name, Width, llvm::dwarf::DW_ATE_boolean);  \
+    return SingletonId;                                                        \
+  }
 #include "clang/Basic/AMDGPUTypes.def"
   case BuiltinType::UChar:
   case BuiltinType::Char_U:
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 15a6177746403..ad543b8f713b4 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -980,6 +980,10 @@ Value *ScalarExprEmitter::EmitConversionToBool(Value *Src, QualType SrcType) {
   if (const MemberPointerType *MPT = dyn_cast<MemberPointerType>(SrcType))
     return CGF.CGM.getCXXABI().EmitMemberPointerIsNotNull(CGF, Src, MPT);
 
+  // The conversion is a NOP, and will be done when CodeGening the builtin.
+  if (SrcType == CGF.getContext().AMDGPUFeaturePredicateTy)
+    return Src;
+
   assert((SrcType->isIntegerType() || isa<llvm::PointerType>(Src->getType())) &&
          "Unknown scalar type to convert");
 
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
index d1b292f23c2d2..61013242d3a08 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -584,6 +584,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
   case BuiltinType::Id:                                                        \
     return llvm::TargetExtType::get(getLLVMContext(), "amdgcn.named.barrier",  \
                                     {}, {Scope});
+#define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align)     \
+  case BuiltinType::Id:                                                        \
+    return llvm::IntegerType::getInt1Ty(getLLVMContext());
 #include "clang/Basic/AMDGPUTypes.def"
 #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/HLSLIntangibleTypes.def"
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 1901d19b14dfc..c4ed83cc8d50a 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -546,8 +546,13 @@ void Sema::Initialize() {
   }
 
   if (Context.getTargetInfo().getTriple().isAMDGPU() ||
+      (Context.getTargetInfo().getTriple().isSPIRV() &&
+       Context.getTargetInfo().getTriple().getVendor() == llvm::Triple::AMD) ||
       (Context.getAuxTargetInfo() &&
-       Context.getAuxTargetInfo()->getTriple().isAMDGPU())) {
+       (Context.getAuxTargetInfo()->getTriple().isAMDGPU() ||
+        (Context.getAuxTargetInfo()->getTriple().isSPIRV() &&
+         Context.getAuxTargetInfo()->getTriple().getVendor() ==
+            llvm::Triple::AMD)))) {
 #define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align)                       \
   addImplicitTypedef(Name, Context.SingletonId);
 #include "clang/Basic/AMDGPUTypes.def"
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index a6366aceec2a6..7bf88c5c6a9a0 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -12,6 +12,7 @@
 
 #include "clang/Sema/SemaAMDGPU.h"
 #include "clang/Basic/DiagnosticSema.h"
+#include "clang/Basic/TargetInfo.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/Sema/Ownership.h"
 #include "clang/Sema/Sema.h"
@@ -366,4 +367,63 @@ void SemaAMDGPU::handleAMDGPUMaxNumWorkGroupsAttr(Decl *D,
   addAMDGPUMaxNumWorkGroupsAttr(D, AL, AL.getArgAsExpr(0), YExpr, ZExpr);
 }
 
+Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) {
+  auto &Ctx = getASTContext();
+  auto BoolTy = Ctx.getLogicalOperationType();
+  auto False = llvm::APInt::getZero(Ctx.getIntWidth(BoolTy));
+  auto True = llvm::APInt::getAllOnes(Ctx.getIntWidth(BoolTy));
+  auto Loc = CE->getExprLoc();
+
+  if (!CE->getBuiltinCallee())
+    return IntegerLiteral::Create(Ctx, False, BoolTy, Loc);
+
+  auto P = false;
+  auto BI = CE->getBuiltinCallee();
+  if (Ctx.BuiltinInfo.isAuxBuiltinID(BI))
+    BI = Ctx.BuiltinInfo.getAuxBuiltinID(BI);
+
+  if (BI == AMDGPU::BI__builtin_amdgcn_processor_is) {
+    auto *GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts());
+    if (!GFX) {
+      Diag(Loc, diag::err_amdgcn_processor_is_arg_not_literal);
+      return nullptr;
+    }
+
+    auto N = GFX->getString();
+    if (!Ctx.getTargetInfo().isValidCPUName(N) &&
+        (!Ctx.getAuxTargetInfo() ||
+         !Ctx.getAuxTargetInfo()->isValidCPUName(N))) {
+      Diag(Loc, diag::err_amdgcn_processor_is_arg_invalid_value) << N;
+      return nullptr;
+    }
+    if (Ctx.getTargetInfo().getTriple().isSPIRV()) {
+      CE->setType(BoolTy);
+      return CE;
+    }
+
+    if (auto TID = Ctx.getTargetInfo().getTargetID())
+      P = TID->find(N) == 0;
+  } else {
+    auto *Arg = CE->getArg(0);
+    if (!Arg || Arg->getType() != Ctx.BuiltinFnTy) {
+      Diag(Loc, diag::err_amdgcn_is_invocable_arg_invalid_value) << Arg;
+      return nullptr;
+    }
+
+    if (Ctx.getTargetInfo().getTriple().isSPIRV()) {
+      CE->setType(BoolTy);
+      return CE;
+    }
+
+    auto *FD = cast<FunctionDecl>(Arg->getReferencedDeclOfCallee());
+
+    StringRef RF = Ctx.BuiltinInfo.getRequiredFeatures(FD->getBuiltinID());
+    llvm::StringMap<bool> CF;
+    Ctx.getFunctionFeatureMap(CF, FD);
+
+    P = Builtin::evaluateRequiredTargetFeatures(RF, CF);
+  }
+
+  return IntegerLiteral::Create(Ctx, P ? True : False, BoolTy, Loc);
+}
 } // namespace clang
diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp
index 14e16bc39eb3a..2a6f167296239 100644
--- a/clang/lib/Sema/SemaCast.cpp
+++ b/clang/lib/Sema/SemaCast.cpp
@@ -23,6 +23,7 @@
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/Initialization.h"
+#include "clang/Sema/SemaAMDGPU.h"
 #include "clang/Sema/SemaHLSL.h"
 #include "clang/Sema/SemaObjC.h"
 #include "clang/Sema/SemaRISCV.h"
@@ -1563,6 +1564,14 @@ static TryCastResult TryStaticCast(Sema &Self, ExprResult &SrcExpr,
     return TC_Success;
   }
 
+  if (SrcType == Self.Context.AMDGPUFeaturePredicateTy &&
+      DestType == Self.Context.getLogicalOperationType()) {
+    SrcExpr =
+      Self.AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast<CallExpr>(SrcExpr.get()));
+    Kind = CK_NoOp;
+    return TC_Success;
+  }
+
   // We tried everything. Everything! Nothing works! :-(
   return TC_NotApplicable;
 }
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 63937ddc3e386..89e49645863c9 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -13617,6 +13617,15 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
     return;
   }
 
+  // __amdgpu_feature_predicate_t cannot be initialised
+  if (VDecl->getType().getDesugaredType(Context) ==
+        Context.AMDGPUFeaturePredicateTy) {
+    Diag(VDecl->getLocation(),
+         diag::err_amdgcn_predicate_type_is_not_constructible) << VDecl;
+    VDecl->setInvalidDecl();
+    return;
+  }
+
   // WebAssembly tables can't be used to initialise a variable.
   if (!Init->getType().isNull() && Init->getType()->isWebAssemblyTableType()) {
     Diag(Init->getExprLoc(), diag::err_wasm_table_art) << 0;
@@ -14151,6 +14160,12 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) {
   if (VarDecl *Var = dyn_cast<VarDecl>(RealDecl)) {
     QualType Type = Var->getType();
 
+    if (Type.getDesugaredType(Context) == Context.AMDGPUFeaturePredicateTy) {
+      Diag(Var->getLocation(),
+           diag::err_amdgcn_predicate_type_is_not_constructible) << Var;
+      Var->setInvalidDecl();
+      return;
+    }
     // C++1z [dcl.dcl]p1 grammar implies that an initializer is mandatory.
     if (isa<DecompositionDecl>(RealDecl)) {
       Diag(Var->getLocation(), diag::err_decomp_decl_requires_init) << Var;
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 7e36efa727072..99fdcc89429a5 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -50,6 +50,7 @@
 #include "clang/Sema/ParsedTemplate.h"
 #include "clang/Sema/Scope.h"
 #include "clang/Sema/ScopeInfo.h"
+#include "clang/Sema/SemaAMDGPU.h"
 #include "clang/Sema/SemaCUDA.h"
 #include "clang/Sema/SemaFixItUtils.h"
 #include "clang/Sema/SemaHLSL.h"
@@ -6556,7 +6557,8 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc,
     if (FD->getName() == "__builtin_amdgcn_is_invocable") {
       auto FnPtrTy = Context.getPointerType(FD->getType());
       auto *R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get();
-      return CallExpr::Create(Context, R, ArgExprs, Context.VoidTy,
+      return CallExpr::Create(Context, R, ArgExprs,
+                              Context.AMDGPUFeaturePredicateTy,
                               ExprValueKind::VK_PRValue, RParenLoc,
                               FPOptionsOverride());
     }
@@ -13365,20 +13367,6 @@ inline QualType Sema::CheckBitwiseOperands(ExprResult &LHS, ExprResult &RHS,
   return InvalidOperands(Loc, LHS, RHS);
 }
 
-static inline bool IsAMDGPUPredicateBI(Expr *E) {
-  if (!E->getType()->isVoidType())
-    return false;
-
-  if (auto *CE = dyn_cast<CallExpr>(E)) {
-    if (auto *BI = CE->getDirectCallee())
-      if (BI->getName() == "__builtin_amdgcn_processor_is" ||
-          BI->getName() == "__builtin_amdgcn_is_invocable")
-        return true;
-  }
-
-  return false;
-}
-
 // C99 6.5.[13,14]
 inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS,
                                            SourceLocation Loc,
@@ -13474,9 +13462,6 @@ inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS,
   // The following is safe because we only use this method for
   // non-overloadable operands.
 
-  if (IsAMDGPUPredicateBI(LHS.get()) && IsAMDGPUPredicateBI(RHS.get()))
-    return Context.VoidTy;
-
   // C++ [expr.log.and]p1
   // C++ [expr.log.or]p1
   // The operands are both contextually converted to type bool.
@@ -15706,37 +15691,6 @@ static bool isOverflowingIntegerType(ASTContext &Ctx, QualType T) {
   return Ctx.getIntWidth(T) >= Ctx.getIntWidth(Ctx.IntTy);
 }
 
-static Expr *ExpandAMDGPUPredicateBI(ASTContext &Ctx, CallExpr *CE) {
-  if (!CE->getBuiltinCallee())
-    return CXXBoolLiteralExpr::Create(Ctx, false, Ctx.BoolTy, CE->getExprLoc());
-
-  if (Ctx.getTargetInfo().getTriple().isSPIRV()) {
-    CE->setType(Ctx.getLogicalOperationType());
-    return CE;
-  }
-
-  bool P = false;
-  auto &TI = Ctx.getTargetInfo();
-
-  if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") {
-    auto *GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts());
-    auto TID = TI.getTargetID();
-    if (GFX && TID) {
-      auto N = GFX->getString();
-      P = TI.isValidCPUName(GFX->getString()) && TID->find(N) == 0;
-    }
-  } else {
-    auto *FD = cast<FunctionDecl>(CE->getArg(0)->getReferencedDeclOfCallee());
-
-    StringRef RF = Ctx.BuiltinInfo.getRequiredFeatures(FD->getBuiltinID());
-    llvm::StringMap<bool> CF;
-    Ctx.getFunctionFeatureMap(CF, FD);
-
-    P = Builtin::evaluateRequiredTargetFeatures(RF, CF);
-  }
-
-  return CXXBoolLiteralExpr::Create(Ctx, P, Ctx.BoolTy, CE->getExprLoc());
-}
 
 ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc,
                                       UnaryOperatorKind Opc, Expr *InputExpr,
@@ -15915,7 +15869,9 @@ ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc,
         // Vector logical not returns the signed variant of the operand type.
         resultType = GetSignedVectorType(resultType);
         break;
-      } else if (IsAMDGPUPredicateBI(InputExpr)) {
+      } else if (resultType == Context.AMDGPUFeaturePredicateTy) {
+        resultType = Context.getLogicalOperationType();
+        Input = AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast<CallExpr>(InputExpr));
         break;
       } else {
         return ExprError(Diag(OpLoc, diag::err_typecheck_unary_expr)
@@ -20661,88 +20617,6 @@ void Sema::DiagnoseEqualityWithExtraParens(ParenExpr *ParenE) {
     }
 }
 
-static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) {
-  if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") {
-    auto *GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts());
-    if (!GFX) {
-      Sema.Diag(CE->getExprLoc(),
-                diag::err_amdgcn_processor_is_arg_not_literal);
-      return false;
-    }
-    auto N = GFX->getString();
-    if (!Sema.getASTContext().getTargetInfo().isValidCPUName(N) &&
-        (!Sema.getASTContext().getAuxTargetInfo() ||
-         !Sema.getASTContext().getAuxTargetInfo()->isValidCPUName(N))) {
-      Sema.Diag(CE->getExprLoc(),
-                diag::err_amdgcn_processor_is_arg_invalid_value)
-          << N;
-      return false;
-    }
-  } else {
-    auto *Arg = CE->getArg(0);
-    if (!Arg || Arg->getType() != Sema.getASTContext().BuiltinFnTy) {
-      Sema.Diag(CE->getExprLoc(),
-                diag::err_amdgcn_is_invocable_arg_invalid_value)
-          << Arg;
-      return false;
-    }
-  }
-
-  return true;
-}
-
-static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) {
-  if (auto *UO = dyn_cast<UnaryOperator>(E)) {
-    auto *SE = dyn_cast<CallExpr>(UO->getSubExpr());
-    if (IsAMDGPUPredicateBI(SE)) {
-      assert(UO->getOpcode() == UnaryOperator::Opcode::UO_LNot &&
-             "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
-             "can only be used as operands of logical ops!");
-
-      if (!ValidateAMDGPUPredicateBI(Sema, SE)) {
-        Invalid = true;
-        return nullptr;
-      }
-
-      UO->setSubExpr(ExpandAMDGPUPredicateBI(Sema.getASTContext(), SE));
-      UO->setType(Sema.getASTContext().getLogicalOperationType());
-
-      return UO;
-    }
-  }
-  if (auto *BO = dyn_cast<BinaryOperator>(E)) {
-    auto *LHS = dyn_cast<CallExpr>(BO->getLHS());
-    auto *RHS = dyn_cast<CallExpr>(BO->getRHS());
-    if (IsAMDGPUPredicateBI(LHS) && IsAMDGPUPredicateBI(RHS)) {
-      assert(BO->isLogicalOp() &&
-             "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
-             "can only be used as operands of logical ops!");
-
-      if (!ValidateAMDGPUPredicateBI(Sema, LHS) ||
-          !ValidateAMDGPUPredicateBI(Sema, RHS)) {
-        Invalid = true;
-        return nullptr;
-      }
-
-      BO->setLHS(ExpandAMDGPUPredicateBI(Sema.getASTContext(), LHS));
-      BO->setRHS(ExpandAMDGPUPredicateBI(Sema.getASTContext(), RHS));
-      BO->setType(Sema.getASTContext().getLogicalOperationType());
-
-      return BO;
-    }
-  }
-  if (auto *CE = dyn_cast<CallExpr>(E))
-    if (IsAMDGPUPredicateBI(CE)) {
-      if (!ValidateAMDGPUPredicateBI(Sema, CE)) {
-        Invalid = true;
-        return nullptr;
-      }
-      return ExpandAMDGPUPredicateBI(Sema.getASTContext(), CE);
-    }
-
-  return nullptr;
-}
-
 ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E,
                                        bool IsConstexpr) {
   DiagnoseAssignmentAsCondition(E);
@@ -20754,13 +20628,8 @@ ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E,
   E = result.get();
 
   if (!E->isTypeDependent()) {
-    if (E->getType()->isVoidType()) {
-      bool InvalidPredicate = false;
-      if (auto *BIC = MaybeHandleAMDGPUPredicateBI(*this, E, InvalidPredicate))
-        return BIC;
-      else if (InvalidPredicate)
-        return ExprError();
-    }
+    if (E->getType() == Context.AMDGPUFeaturePredicateTy)
+      return AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast_or_null<CallExpr>(E));
 
     if (getLangOpts().CPlusPlus)
       return CheckCXXBooleanCondition(E, IsConstexpr); // C++ 6.4p4
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index e5670dab03cb0..4e6feb871b725 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -9103,6 +9103,15 @@ bool InitializationSequence::Diagnose(Sema &S,
 
   case FK_ConversionFailed: {
     QualType FromType = OnlyArg->getType();
+    // __amdgpu_feature_predicate_t can be explicitly cast to the logical op
+    // type, although this is almost always an error and we advise against it.
+    if (FromType == S.Context.AMDGPUFeaturePredicateTy &&
+        DestType == S.Context.getLogicalOperationType()) {
+      S.Diag(OnlyArg->getExprLoc(),
+             diag::err_amdgcn_predicate_type_needs_explicit_bool_cast)
+      << OnlyArg << DestType;
+      break;
+    }
     PartialDiagnostic PDiag = S.PDiag(diag::err_init_conversion_failed)
       << (int)Entity.getKind()
       << DestType
@@ -9907,6 +9916,13 @@ Sema::PerformCopyInitialization(const InitializedEntity &Entity,
   if (EqualLoc.isInvalid())
     EqualLoc = InitE->getBeginLoc();
 
+  if (Entity.getType().getDesugaredType(Context) ==
+      Context.AMDGPUFeaturePredicateTy) {
+    Diag(EqualLoc, diag::err_amdgcn_predicate_type_is_not_constructible)
+        << Entity.getDecl();
+    return ExprError();
+  }
+
   InitializationKind Kind = InitializationKind::CreateCopy(
       InitE->getBeginLoc(), EqualLoc, AllowExplicit);
   InitializationSequence Seq(*this, Entity, Kind, InitE, TopLevelOfInitList);
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index d3ee9989c73ed..39693055c2106 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -30,6 +30,7 @@
 #include "clang/Sema/Initialization.h"
 #include "clang/Sema/Lookup.h"
 #include "clang/Sema/Overload.h"
+#include "clang/Sema/SemaAMDGPU.h"
 #include "clang/Sema/SemaCUDA.h"
 #include "clang/Sema/SemaObjC.h"
 #include "clang/Sema/Template.h"
@@ -6137,12 +6138,13 @@ TryContextuallyConvertToBool(Sema &S, Expr *From) {
 ExprResult Sema::PerformContextuallyConvertToBool(Expr *From) {
   if (checkPlaceholderForOverload(*this, From))
     return ExprError();
+  if (From->getType() == Context.AMDGPUFeaturePredicateTy)
+    return AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast<CallExpr>(From));
 
   ImplicitConversionSequence ICS = TryContextuallyConvertToBool(*this, From);
   if (!ICS.isBad())
     return PerformImplicitConversion(From, Context.BoolTy, ICS,
                                      AssignmentAction::Converting);
-
   if (!DiagnoseMultipleUserDefinedConversion(From, Context.BoolTy))
     return Diag(From->getBeginLoc(), diag::err_typecheck_bool_condition)
            << From->getType() << From->getSourceRange();
@@ -11921,6 +11923,16 @@ static void DiagnoseBadConversion(Sema &S, OverloadCandidate *Cand,
   if (TakingCandidateAddress && !checkAddressOfCandidateIsAvailable(S, Fn))
     return;
 
+  // __amdgpu_feature_predicate_t can be explicitly cast to the logical op type,
+  // although this is almost always an error and we advise against it.
+  if (FromTy == S.Context.AMDGPUFeaturePredicateTy &&
+      ToTy == S.Context.getLogicalOperationType()) {
+    S.Diag(Conv.Bad.FromExpr->getExprLoc(),
+           diag::err_amdgcn_predicate_type_needs_explicit_bool_cast)
+      << Conv.Bad.FromExpr << ToTy;
+    return;
+  }
+
   // Emit the generic diagnostic and, optionally, add the hints to it.
   PartialDiagnostic FDiag = S.PDiag(diag::note_ovl_candidate_bad_conv);
   FDiag << (unsigned)FnKindPair.first << (unsigned)FnKindPair.second << FnDesc
diff --git a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
index 26cc8b4c7631d..43d657d25d013 100644
--- a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
+++ b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
@@ -1,29 +1,29 @@
 // RUN: not %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - 2>&1 | FileCheck %s
 // RUN: not %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - 2>&1 | FileCheck %s
 
-bool predicate(bool x) { return x; }
+bool predicate(bool x);
+void pass_by_value(__amdgpu_feature_predicate_t x);
 
-void invalid_uses(int* p, int x, bool (*pfn)(bool)) {
-    // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void'
+void invalid_uses(int *p, int x, const __amdgpu_feature_predicate_t &lv,
+                  __amdgpu_feature_predicate_t &&rv) {
+    // CHECK: error: 'a' has type __amdgpu_feature_predicate_t, which is not constructible
+    __amdgpu_feature_predicate_t a;
+    // CHECK: error: 'b' has type __amdgpu_feature_predicate_t, which is not constructible
+    __amdgpu_feature_predicate_t b = __builtin_amdgcn_processor_is("gfx906");
+    // CHECK: error: 'c' has type __amdgpu_feature_predicate_t, which is not constructible
+    __amdgpu_feature_predicate_t c = lv;
+    // CHECK: error: 'd' has type __amdgpu_feature_predicate_t, which is not constructible
+    __amdgpu_feature_predicate_t d = rv;
+    // CHECK: error: '__builtin_amdgcn_processor_is("gfx906")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided
     bool invalid_use_in_init_0 = __builtin_amdgcn_processor_is("gfx906");
-    // CHECK: error: cannot initialize a variable of type 'const bool' with an rvalue of type 'void'
-    const bool invalid_use_in_init_1 = !__builtin_amdgcn_processor_is("gfx906");
-    // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void'
-    bool invalid_use_in_init_2 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
-    // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void'
-    bool invalid_use_in_init_3 = !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
-    // CHECK: error: variable has incomplete type 'const void'
-    const auto invalid_use_in_init_4 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready) || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
-    // CHECK: error: variable has incomplete type 'const void'
-    const auto invalid_use_in_init_5 = __builtin_amdgcn_processor_is("gfx906") || __builtin_amdgcn_processor_is("gfx900");
-    // CHECK: error: variable has incomplete type 'const void'
-    const auto invalid_use_in_init_6 = __builtin_amdgcn_processor_is("gfx906") || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep);
-    // CHECK: error: value of type 'void' is not contextually convertible to 'bool'
-    __builtin_amdgcn_processor_is("gfx1201")
-        ? __builtin_amdgcn_s_sleep_var(x) : __builtin_amdgcn_s_sleep(42);
-    // CHECK: error: no matching function for call to 'predicate'
+    // CHECK: error: 'x' has type __amdgpu_feature_predicate_t, which is not constructible
+    pass_by_value(__builtin_amdgcn_processor_is("gfx906"));
+    // CHECK: error: '__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided
+    bool invalid_use_in_init_1 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+    // CHECK: error: '__builtin_amdgcn_processor_is("gfx906")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided
+    if (bool invalid_use_in_init_2 = __builtin_amdgcn_processor_is("gfx906")) return;
+    // CHECK: error: '__builtin_amdgcn_processor_is("gfx1200")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided
     if (predicate(__builtin_amdgcn_processor_is("gfx1200"))) __builtin_amdgcn_s_sleep_var(x);
-    // CHECK: note: candidate function not viable: cannot convert argument of incomplete type 'void' to 'bool' for 1st argument
 }
 
 void invalid_invocations(int x, const char* str) {
@@ -31,7 +31,6 @@ void invalid_invocations(int x, const char* str) {
     if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return;
     // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a string literal
     if (__builtin_amdgcn_processor_is(str)) return;
-
     // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}__builtin_amdgcn_s_sleep_var{{.*}} is not valid
     if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return;
     // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}str{{.*}} is not valid

>From 716cc1fe760b9a56655a3334c333876dc2b0bfb3 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Tue, 6 May 2025 13:02:25 +0100
Subject: [PATCH 15/69] Fix formatting.

---
 clang/lib/Sema/Sema.cpp         | 2 +-
 clang/lib/Sema/SemaAMDGPU.cpp   | 2 +-
 clang/lib/Sema/SemaCast.cpp     | 4 ++--
 clang/lib/Sema/SemaDecl.cpp     | 8 +++++---
 clang/lib/Sema/SemaExpr.cpp     | 7 +++----
 clang/lib/Sema/SemaInit.cpp     | 2 +-
 clang/lib/Sema/SemaOverload.cpp | 2 +-
 7 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index c4ed83cc8d50a..3e55b5da3c027 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -552,7 +552,7 @@ void Sema::Initialize() {
        (Context.getAuxTargetInfo()->getTriple().isAMDGPU() ||
         (Context.getAuxTargetInfo()->getTriple().isSPIRV() &&
          Context.getAuxTargetInfo()->getTriple().getVendor() ==
-            llvm::Triple::AMD)))) {
+             llvm::Triple::AMD)))) {
 #define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align)                       \
   addImplicitTypedef(Name, Context.SingletonId);
 #include "clang/Basic/AMDGPUTypes.def"
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 7bf88c5c6a9a0..df4b3237a7844 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -12,8 +12,8 @@
 
 #include "clang/Sema/SemaAMDGPU.h"
 #include "clang/Basic/DiagnosticSema.h"
-#include "clang/Basic/TargetInfo.h"
 #include "clang/Basic/TargetBuiltins.h"
+#include "clang/Basic/TargetInfo.h"
 #include "clang/Sema/Ownership.h"
 #include "clang/Sema/Sema.h"
 #include "llvm/Support/AtomicOrdering.h"
diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp
index 2a6f167296239..8d47b2747f47d 100644
--- a/clang/lib/Sema/SemaCast.cpp
+++ b/clang/lib/Sema/SemaCast.cpp
@@ -1566,8 +1566,8 @@ static TryCastResult TryStaticCast(Sema &Self, ExprResult &SrcExpr,
 
   if (SrcType == Self.Context.AMDGPUFeaturePredicateTy &&
       DestType == Self.Context.getLogicalOperationType()) {
-    SrcExpr =
-      Self.AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast<CallExpr>(SrcExpr.get()));
+    SrcExpr = Self.AMDGPU().ExpandAMDGPUPredicateBI(
+        dyn_cast<CallExpr>(SrcExpr.get()));
     Kind = CK_NoOp;
     return TC_Success;
   }
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 89e49645863c9..f932b069479c7 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -13619,9 +13619,10 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
 
   // __amdgpu_feature_predicate_t cannot be initialised
   if (VDecl->getType().getDesugaredType(Context) ==
-        Context.AMDGPUFeaturePredicateTy) {
+      Context.AMDGPUFeaturePredicateTy) {
     Diag(VDecl->getLocation(),
-         diag::err_amdgcn_predicate_type_is_not_constructible) << VDecl;
+         diag::err_amdgcn_predicate_type_is_not_constructible)
+        << VDecl;
     VDecl->setInvalidDecl();
     return;
   }
@@ -14162,7 +14163,8 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) {
 
     if (Type.getDesugaredType(Context) == Context.AMDGPUFeaturePredicateTy) {
       Diag(Var->getLocation(),
-           diag::err_amdgcn_predicate_type_is_not_constructible) << Var;
+           diag::err_amdgcn_predicate_type_is_not_constructible)
+          << Var;
       Var->setInvalidDecl();
       return;
     }
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 99fdcc89429a5..8247f3da58280 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -6557,10 +6557,9 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc,
     if (FD->getName() == "__builtin_amdgcn_is_invocable") {
       auto FnPtrTy = Context.getPointerType(FD->getType());
       auto *R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get();
-      return CallExpr::Create(Context, R, ArgExprs,
-                              Context.AMDGPUFeaturePredicateTy,
-                              ExprValueKind::VK_PRValue, RParenLoc,
-                              FPOptionsOverride());
+      return CallExpr::Create(
+          Context, R, ArgExprs, Context.AMDGPUFeaturePredicateTy,
+          ExprValueKind::VK_PRValue, RParenLoc, FPOptionsOverride());
     }
   }
 
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 4e6feb871b725..dafd1eee196e8 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -9109,7 +9109,7 @@ bool InitializationSequence::Diagnose(Sema &S,
         DestType == S.Context.getLogicalOperationType()) {
       S.Diag(OnlyArg->getExprLoc(),
              diag::err_amdgcn_predicate_type_needs_explicit_bool_cast)
-      << OnlyArg << DestType;
+          << OnlyArg << DestType;
       break;
     }
     PartialDiagnostic PDiag = S.PDiag(diag::err_init_conversion_failed)
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 39693055c2106..92e7d76d064c3 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -11929,7 +11929,7 @@ static void DiagnoseBadConversion(Sema &S, OverloadCandidate *Cand,
       ToTy == S.Context.getLogicalOperationType()) {
     S.Diag(Conv.Bad.FromExpr->getExprLoc(),
            diag::err_amdgcn_predicate_type_needs_explicit_bool_cast)
-      << Conv.Bad.FromExpr << ToTy;
+        << Conv.Bad.FromExpr << ToTy;
     return;
   }
 

>From 79035a9624ae3d769adb5eeb91f00081021f51cd Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Tue, 6 May 2025 20:09:38 +0100
Subject: [PATCH 16/69] Delete spurious whitespace.

---
 clang/lib/Sema/SemaExpr.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 8247f3da58280..85a924f5b5805 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -15690,7 +15690,6 @@ static bool isOverflowingIntegerType(ASTContext &Ctx, QualType T) {
   return Ctx.getIntWidth(T) >= Ctx.getIntWidth(Ctx.IntTy);
 }
 
-
 ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc,
                                       UnaryOperatorKind Opc, Expr *InputExpr,
                                       bool IsAfterAmp) {

>From 0f04dbc4ca49a627290b758db34654a0ad62601e Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Thu, 8 May 2025 00:53:21 +0100
Subject: [PATCH 17/69] Handle jumps into controlled sequences.

---
 .../clang/Basic/DiagnosticSemaKinds.td        |  2 +
 clang/include/clang/Sema/SemaAMDGPU.h         |  4 ++
 clang/lib/Sema/JumpDiagnostics.cpp            |  7 ++-
 clang/lib/Sema/SemaAMDGPU.cpp                 | 14 +++--
 .../amdgpu-feature-builtins-cant-jump.hip     | 62 +++++++++++++++++++
 5 files changed, 84 insertions(+), 5 deletions(-)
 create mode 100644 clang/test/SemaHIP/amdgpu-feature-builtins-cant-jump.hip

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index f2604f052512f..14880adf8e4ad 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13312,4 +13312,6 @@ def err_amdgcn_predicate_type_needs_explicit_bool_cast
     : Error<"%0 must be explicitly cast to %1; however, please note that this "
             "is almost always an error and that it prevents the effective "
             "guarding of target dependent code, and thus should be avoided">;
+def note_amdgcn_protected_by_predicate
+    : Note<"jump enters statement controlled by AMDGPU feature predicate">;
 } // end of sema component.
diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h
index 843a146243eae..0d11d799946b5 100644
--- a/clang/include/clang/Sema/SemaAMDGPU.h
+++ b/clang/include/clang/Sema/SemaAMDGPU.h
@@ -15,12 +15,15 @@
 
 #include "clang/AST/ASTFwd.h"
 #include "clang/Sema/SemaBase.h"
+#include "llvm/ADT/SmallPtrSet.h"
 
 namespace clang {
 class AttributeCommonInfo;
+class Expr;
 class ParsedAttr;
 
 class SemaAMDGPU : public SemaBase {
+  llvm::SmallPtrSet<Expr *, 32> ExpandedPredicates;
 public:
   SemaAMDGPU(Sema &S);
 
@@ -68,6 +71,7 @@ class SemaAMDGPU : public SemaBase {
   /// Expand a valid use of the feature identification builtins into its
   /// corresponding sequence of instructions.
   Expr *ExpandAMDGPUPredicateBI(CallExpr *CE);
+  bool IsPredicate(Expr *E) const;
 };
 } // namespace clang
 
diff --git a/clang/lib/Sema/JumpDiagnostics.cpp b/clang/lib/Sema/JumpDiagnostics.cpp
index a852a950b47f4..718d8b461805c 100644
--- a/clang/lib/Sema/JumpDiagnostics.cpp
+++ b/clang/lib/Sema/JumpDiagnostics.cpp
@@ -19,6 +19,7 @@
 #include "clang/AST/StmtOpenACC.h"
 #include "clang/AST/StmtOpenMP.h"
 #include "clang/Basic/SourceLocation.h"
+#include "clang/Sema/SemaAMDGPU.h"
 #include "clang/Sema/SemaInternal.h"
 #include "llvm/ADT/BitVector.h"
 using namespace clang;
@@ -367,8 +368,10 @@ void JumpScopeChecker::BuildScopeInformation(Stmt *S,
 
   case Stmt::IfStmtClass: {
     IfStmt *IS = cast<IfStmt>(S);
+    bool AMDGPUPredicate = false;
     if (!(IS->isConstexpr() || IS->isConsteval() ||
-          IS->isObjCAvailabilityCheck()))
+          IS->isObjCAvailabilityCheck() ||
+          (AMDGPUPredicate = this->S.AMDGPU().IsPredicate(IS->getCond()))))
       break;
 
     unsigned Diag = diag::note_protected_by_if_available;
@@ -376,6 +379,8 @@ void JumpScopeChecker::BuildScopeInformation(Stmt *S,
       Diag = diag::note_protected_by_constexpr_if;
     else if (IS->isConsteval())
       Diag = diag::note_protected_by_consteval_if;
+    else if (AMDGPUPredicate)
+      Diag = diag::note_amdgcn_protected_by_predicate;
 
     if (VarDecl *Var = IS->getConditionVariable())
       BuildScopeInformation(Var, ParentScope);
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index df4b3237a7844..6833a2678c791 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -375,7 +375,8 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) {
   auto Loc = CE->getExprLoc();
 
   if (!CE->getBuiltinCallee())
-    return IntegerLiteral::Create(Ctx, False, BoolTy, Loc);
+    return *ExpandedPredicates.insert(
+        IntegerLiteral::Create(Ctx, False, BoolTy, Loc)).first;
 
   auto P = false;
   auto BI = CE->getBuiltinCallee();
@@ -398,7 +399,7 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) {
     }
     if (Ctx.getTargetInfo().getTriple().isSPIRV()) {
       CE->setType(BoolTy);
-      return CE;
+      return *ExpandedPredicates.insert(CE).first;
     }
 
     if (auto TID = Ctx.getTargetInfo().getTargetID())
@@ -412,7 +413,7 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) {
 
     if (Ctx.getTargetInfo().getTriple().isSPIRV()) {
       CE->setType(BoolTy);
-      return CE;
+      return *ExpandedPredicates.insert(CE).first;
     }
 
     auto *FD = cast<FunctionDecl>(Arg->getReferencedDeclOfCallee());
@@ -424,6 +425,11 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) {
     P = Builtin::evaluateRequiredTargetFeatures(RF, CF);
   }
 
-  return IntegerLiteral::Create(Ctx, P ? True : False, BoolTy, Loc);
+  return *ExpandedPredicates.insert(
+      IntegerLiteral::Create(Ctx, P ? True : False, BoolTy, Loc)).first;
+}
+
+bool SemaAMDGPU::IsPredicate(Expr *E) const {
+  return ExpandedPredicates.contains(E);
 }
 } // namespace clang
diff --git a/clang/test/SemaHIP/amdgpu-feature-builtins-cant-jump.hip b/clang/test/SemaHIP/amdgpu-feature-builtins-cant-jump.hip
new file mode 100644
index 0000000000000..a7f1abcdcd8fe
--- /dev/null
+++ b/clang/test/SemaHIP/amdgpu-feature-builtins-cant-jump.hip
@@ -0,0 +1,62 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: spirv-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -target-cpu gfx900 -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -target-cpu gfx1201 -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ void f(int *ptr, int size, bool f) {
+    int i = 0;
+    if (f)
+        goto label; // expected-error {{cannot jump from this goto statement to its label}}
+
+    if (__builtin_amdgcn_processor_is("gfx900")) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}}
+        for (i = 0; i < size; ++i) {
+            label:
+            ptr[i] = i;
+        }
+    }
+}
+
+__device__ void g(int *ptr, int size, bool f) {
+    int i = 0;
+    if (f)
+        goto label; // expected-error {{cannot jump from this goto statement to its label}}
+
+    if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}}
+        for (i = 0; i < size; ++i) {
+            label:
+            ptr[i] = i;
+        }
+    }
+}
+
+__global__ void h(int *ptr, int size, bool f) {
+    int i = 0;
+    if (f)
+        goto label; // expected-error {{cannot jump from this goto statement to its label}}
+
+    if (__builtin_amdgcn_processor_is("gfx900")) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}}
+        for (i = 0; i < size; ++i) {
+            label:
+            ptr[i] = i;
+        }
+    }
+}
+
+__global__ void i(int *ptr, int size, bool f) {
+    int i = 0;
+    if (f)
+        goto label; // expected-error {{cannot jump from this goto statement to its label}}
+
+    if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}}
+        for (i = 0; i < size; ++i) {
+            label:
+            ptr[i] = i;
+        }
+    }
+}

>From 39a9d55c704f729f299d4ac12ffad5127757d65e Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Thu, 8 May 2025 00:57:15 +0100
Subject: [PATCH 18/69] Fix formatting.

---
 clang/include/clang/Sema/SemaAMDGPU.h |  1 +
 clang/lib/Sema/SemaAMDGPU.cpp         | 11 +++++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h
index 0d11d799946b5..f72e1c53d2c92 100644
--- a/clang/include/clang/Sema/SemaAMDGPU.h
+++ b/clang/include/clang/Sema/SemaAMDGPU.h
@@ -24,6 +24,7 @@ class ParsedAttr;
 
 class SemaAMDGPU : public SemaBase {
   llvm::SmallPtrSet<Expr *, 32> ExpandedPredicates;
+
 public:
   SemaAMDGPU(Sema &S);
 
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 6833a2678c791..39d0f2b70d157 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -375,8 +375,9 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) {
   auto Loc = CE->getExprLoc();
 
   if (!CE->getBuiltinCallee())
-    return *ExpandedPredicates.insert(
-        IntegerLiteral::Create(Ctx, False, BoolTy, Loc)).first;
+    return *ExpandedPredicates
+                .insert(IntegerLiteral::Create(Ctx, False, BoolTy, Loc))
+                .first;
 
   auto P = false;
   auto BI = CE->getBuiltinCallee();
@@ -425,8 +426,10 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) {
     P = Builtin::evaluateRequiredTargetFeatures(RF, CF);
   }
 
-  return *ExpandedPredicates.insert(
-      IntegerLiteral::Create(Ctx, P ? True : False, BoolTy, Loc)).first;
+  return *ExpandedPredicates
+              .insert(
+                  IntegerLiteral::Create(Ctx, P ? True : False, BoolTy, Loc))
+              .first;
 }
 
 bool SemaAMDGPU::IsPredicate(Expr *E) const {

>From ebde49b3190beaf41625e8953c0b72594f8cf5d4 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Sat, 17 May 2025 00:49:49 +0100
Subject: [PATCH 19/69] Start incorporating review feedback.

---
 clang/docs/ReleaseNotes.rst   |  4 ++--
 clang/lib/Sema/SemaAMDGPU.cpp | 18 +++++++++---------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index b86057bff7043..487e2516ea878 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -768,9 +768,9 @@ AMDGPU Support
 
 - Bump the default code object version to 6. ROCm 6.3 is required to run any program compiled with COV6.
 - Introduced a new target specific builtin ``__builtin_amdgcn_processor_is``,
-  a late / deferred query for the current target processor
+  a late / deferred query for the current target processor.
 - Introduced a new target specific builtin ``__builtin_amdgcn_is_invocable``,
-  which enables fine-grained, per-builtin, feature availability
+  which enables fine-grained, per-builtin, feature availability.
 
 NVPTX Support
 ^^^^^^^^^^^^^^
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 39d0f2b70d157..55ff489aed702 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -368,19 +368,19 @@ void SemaAMDGPU::handleAMDGPUMaxNumWorkGroupsAttr(Decl *D,
 }
 
 Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) {
-  auto &Ctx = getASTContext();
-  auto BoolTy = Ctx.getLogicalOperationType();
-  auto False = llvm::APInt::getZero(Ctx.getIntWidth(BoolTy));
-  auto True = llvm::APInt::getAllOnes(Ctx.getIntWidth(BoolTy));
-  auto Loc = CE->getExprLoc();
+  ASTContext &Ctx = getASTContext();
+  QualType BoolTy = Ctx.getLogicalOperationType();
+  llvm::APInt False = llvm::APInt::getZero(Ctx.getIntWidth(BoolTy));
+  llvm::APInt True = llvm::APInt::getAllOnes(Ctx.getIntWidth(BoolTy));
+  SourceLocation Loc = CE->getExprLoc();
 
   if (!CE->getBuiltinCallee())
     return *ExpandedPredicates
                 .insert(IntegerLiteral::Create(Ctx, False, BoolTy, Loc))
                 .first;
 
-  auto P = false;
-  auto BI = CE->getBuiltinCallee();
+  bool P = false;
+  unsigned BI = CE->getBuiltinCallee();
   if (Ctx.BuiltinInfo.isAuxBuiltinID(BI))
     BI = Ctx.BuiltinInfo.getAuxBuiltinID(BI);
 
@@ -391,7 +391,7 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) {
       return nullptr;
     }
 
-    auto N = GFX->getString();
+    StringRef N = GFX->getString();
     if (!Ctx.getTargetInfo().isValidCPUName(N) &&
         (!Ctx.getAuxTargetInfo() ||
          !Ctx.getAuxTargetInfo()->isValidCPUName(N))) {
@@ -406,7 +406,7 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) {
     if (auto TID = Ctx.getTargetInfo().getTargetID())
       P = TID->find(N) == 0;
   } else {
-    auto *Arg = CE->getArg(0);
+    Expr *Arg = CE->getArg(0);
     if (!Arg || Arg->getType() != Ctx.BuiltinFnTy) {
       Diag(Loc, diag::err_amdgcn_is_invocable_arg_invalid_value) << Arg;
       return nullptr;

>From 4bdd30e64f5e139a101f7570a36174f539827d22 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Sat, 17 May 2025 00:50:27 +0100
Subject: [PATCH 20/69] Less `auto`.

---
 clang/lib/Sema/SemaExpr.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index f75af55923779..e2c109d0b667e 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -6662,8 +6662,8 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc,
     auto *FD = cast<FunctionDecl>(Fn->getReferencedDeclOfCallee());
 
     if (FD->getName() == "__builtin_amdgcn_is_invocable") {
-      auto FnPtrTy = Context.getPointerType(FD->getType());
-      auto *R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get();
+      QualType FnPtrTy = Context.getPointerType(FD->getType());
+      Expr *R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get();
       return CallExpr::Create(
           Context, R, ArgExprs, Context.AMDGPUFeaturePredicateTy,
           ExprValueKind::VK_PRValue, RParenLoc, FPOptionsOverride());

>From 76848d5fac7eee879e0a0c07b5441b9267cbd897 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Thu, 22 May 2025 17:28:44 +0300
Subject: [PATCH 21/69] Print out valid AMDGCN processor identifiers.

---
 clang/include/clang/Basic/DiagnosticSemaKinds.td   |  4 +++-
 clang/lib/Basic/Targets/SPIR.cpp                   |  5 +++++
 clang/lib/Basic/Targets/SPIR.h                     |  1 +
 clang/lib/Sema/SemaAMDGPU.cpp                      | 14 +++++++++++---
 .../amdgpu-feature-builtins-invalid-use.cpp        |  1 +
 5 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 8ef5e0a5a1bc8..8aebd64cb1f16 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -12445,7 +12445,7 @@ def warn_zero_as_null_pointer_constant : Warning<
   InGroup<DiagGroup<"zero-as-null-pointer-constant">>, DefaultIgnore;
 
 def warn_not_eliding_copy_on_return : Warning<
-  "not eliding copy on return">, 
+  "not eliding copy on return">,
   InGroup<DiagGroup<"nrvo">>, DefaultIgnore;
 
 def err_nullability_cs_multilevel : Error<
@@ -13347,6 +13347,8 @@ def err_amdgcn_processor_is_arg_not_literal
 def err_amdgcn_processor_is_arg_invalid_value
     : Error<"the argument to __builtin_amdgcn_processor_is must be a valid "
             "AMDGCN processor identifier; '%0' is not valid">;
+def note_amdgcn_processor_is_valid_options
+    : Note<"valid AMDGCN processor identifiers are: %0">;
 def err_amdgcn_is_invocable_arg_invalid_value
     : Error<"the argument to __builtin_amdgcn_is_invocable must be either a "
             "target agnostic builtin or an AMDGCN target specific builtin; `%0`"
diff --git a/clang/lib/Basic/Targets/SPIR.cpp b/clang/lib/Basic/Targets/SPIR.cpp
index eb43d9b0be283..8056b124d5fc5 100644
--- a/clang/lib/Basic/Targets/SPIR.cpp
+++ b/clang/lib/Basic/Targets/SPIR.cpp
@@ -156,3 +156,8 @@ void SPIRV64AMDGCNTargetInfo::setAuxTarget(const TargetInfo *Aux) {
 bool SPIRV64AMDGCNTargetInfo::isValidCPUName(StringRef CPU) const {
   return AMDGPUTI.isValidCPUName(CPU);
 }
+
+void SPIRV64AMDGCNTargetInfo::fillValidCPUList(
+    SmallVectorImpl<StringRef> &Values) const {
+  return AMDGPUTI.fillValidCPUList(Values);
+}
diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h
index df8dab591bf70..27b93744bb8f8 100644
--- a/clang/lib/Basic/Targets/SPIR.h
+++ b/clang/lib/Basic/Targets/SPIR.h
@@ -453,6 +453,7 @@ class LLVM_LIBRARY_VISIBILITY SPIRV64AMDGCNTargetInfo final
   // This is only needed for validating arguments passed to
   // __builtin_amdgcn_processor_is
   bool isValidCPUName(StringRef Name) const override;
+  void fillValidCPUList(SmallVectorImpl<StringRef> &Values) const override;
 };
 
 } // namespace targets
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 1927b7d103a88..5d381229f63c7 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -393,10 +393,18 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) {
     }
 
     StringRef N = GFX->getString();
-    if (!Ctx.getTargetInfo().isValidCPUName(N) &&
-        (!Ctx.getAuxTargetInfo() ||
-         !Ctx.getAuxTargetInfo()->isValidCPUName(N))) {
+    const TargetInfo &TI = Ctx.getTargetInfo();
+    const TargetInfo *AuxTI = Ctx.getAuxTargetInfo();
+    if (!TI.isValidCPUName(N) && (!AuxTI || !AuxTI->isValidCPUName(N))) {
       Diag(Loc, diag::err_amdgcn_processor_is_arg_invalid_value) << N;
+      SmallVector<StringRef, 32> ValidList;
+      if (TI.getTriple().getVendor() == llvm::Triple::VendorType::AMD)
+        TI.fillValidCPUList(ValidList);
+      else if (AuxTI) // Since the BI is present it must be an AMDGPU triple.
+        AuxTI->fillValidCPUList(ValidList);
+      if (!ValidList.empty())
+        Diag(Loc, diag::note_amdgcn_processor_is_valid_options)
+            << llvm::join(ValidList, ", ");
       return nullptr;
     }
     if (Ctx.getTargetInfo().getTriple().isSPIRV()) {
diff --git a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
index 43d657d25d013..9e50f9493977f 100644
--- a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
+++ b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
@@ -28,6 +28,7 @@ void invalid_uses(int *p, int x, const __amdgpu_feature_predicate_t &lv,
 
 void invalid_invocations(int x, const char* str) {
     // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a valid AMDGCN processor identifier; 'not_an_amdgcn_gfx_id' is not valid
+    // CHECK-DAG: note: valid AMDGCN processor identifiers are: {{.*}}gfx{{.*}}
     if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return;
     // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a string literal
     if (__builtin_amdgcn_processor_is(str)) return;

>From e1bfdf3580451b0c0a97475a29d6bb6c2b5bbdf0 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Thu, 22 May 2025 19:31:39 +0300
Subject: [PATCH 22/69] Use boolean type for the predicate, even though it
 should never get emitted.

---
 clang/lib/CodeGen/CodeGenTypes.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
index 45b510f9aaba5..7c237e8ea8b1d 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -583,7 +583,7 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
                                     {}, {Scope});
 #define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align)     \
   case BuiltinType::Id:                                                        \
-    return llvm::IntegerType::getInt1Ty(getLLVMContext());
+    return ConvertType(getContext().getLogicalOperationType());
 #include "clang/Basic/AMDGPUTypes.def"
 #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/HLSLIntangibleTypes.def"

>From 4f6546813c922f9614b2d66b3df25001ec474b06 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Thu, 22 May 2025 22:17:31 +0300
Subject: [PATCH 23/69] Register pass early.

---
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 5c92c4eb411eb..fa34d67703e57 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -16,6 +16,8 @@
 #ifndef MODULE_PASS
 #define MODULE_PASS(NAME, CREATE_PASS)
 #endif
+MODULE_PASS("amdgpu-expand-feature-predicates",
+            AMDGPUExpandFeaturePredicatesPass(*this))
 MODULE_PASS("amdgpu-always-inline", AMDGPUAlwaysInlinePass())
 MODULE_PASS("amdgpu-export-kernel-runtime-handles", AMDGPUExportKernelRuntimeHandlesPass())
 MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
@@ -30,8 +32,6 @@ MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
 MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
 MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
-MODULE_PASS("amdgpu-expand-feature-predicates",
-            AMDGPUExpandFeaturePredicatesPass(*this))
 #undef MODULE_PASS
 
 #ifndef MODULE_PASS_WITH_PARAMS

>From e940d4213714957039e3b30aa384b4bb7ee3ba3c Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Thu, 22 May 2025 22:56:19 +0300
Subject: [PATCH 24/69] Clarify builtins are also available in C.

---
 clang/docs/LanguageExtensions.rst | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 95a9116926ec8..1373438423aab 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -5033,6 +5033,35 @@ target specific type that behaves as if its C++ definition was the following:
     operator bool() const noexcept;
   };
 
+The builtins can be used in C as well, wherein the
+``__amdgpu_feature_predicate_t`` type behaves as an opaque, forward declared
+type with conditional automated conversion to ``_Bool`` when used as the
+predicate argument to a control structure:
+
+.. code-block:: c
+
+  struct __amdgpu_feature_predicate_t ret();     // Error
+  void arg(struct __amdgpu_feature_predicate_t); // Error
+  void local() {
+    struct __amdgpu_feature_predicate_t x;       // Error
+    struct __amdgpu_feature_predicate_t y =
+        __builtin_amdgcn_processor_is("gfx900"); // Error
+  }
+  void valid_use() {
+    _Bool x = (_Bool)__builtin_amdgcn_processor_is("gfx900"); // OK
+    if (__builtin_amdgcn_processor_is("gfx900"))       // Implicit cast to _Bool
+      return;
+    for (; __builtin_amdgcn_processor_is("gfx900");)   // Implicit cast to _Bool
+      break;
+    while (__builtin_amdgcn_processor_is("gfx900"))    // Implicit cast to _Bool
+      break;
+    do {
+      break;
+    } while (__builtin_amdgcn_processor_is("gfx900")); // Implicit cast to _Bool
+
+    __builtin_amdgcn_processor_is("gfx900") ? x : !x;
+  }
+
 The boolean interpretation of the predicate values returned by the builtins:
 
 * indicates whether the current target matches the argument; the argument MUST

>From 11dd5709644a2e5d887f83e8f35945986de133da Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Mon, 2 Jun 2025 18:12:31 +0100
Subject: [PATCH 25/69] Try to fix potentially erroneous indentation in note.

---
 clang/include/clang/Basic/DiagnosticSemaKinds.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 4d49c24d5d948..02190b5d544e4 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13404,6 +13404,6 @@ def err_amdgcn_predicate_type_needs_explicit_bool_cast
     : Error<"%0 must be explicitly cast to %1; however, please note that this "
             "is almost always an error and that it prevents the effective "
             "guarding of target dependent code, and thus should be avoided">;
-def note_amdgcn_protected_by_predicate
-    : Note<"jump enters statement controlled by AMDGPU feature predicate">;
+def note_amdgcn_protected_by_predicate : Note<"jump enters statement controlled"
+                                              " by AMDGPU feature predicate">;
 } // end of sema component.

>From 03b029f3f400eba5dc165a31e7401a0862f06a91 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Mon, 2 Jun 2025 22:35:32 +0100
Subject: [PATCH 26/69] Add test for returning a predicate.

---
 clang/lib/Sema/SemaInit.cpp                   |  2 +-
 .../amdgpu-feature-builtins-invalid-use.cpp   |  5 +++
 ...feature-builtins-return-type-deduction.hip | 31 +++++++++++++++++++
 3 files changed, 37 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/SemaHIP/amdgpu-feature-builtins-return-type-deduction.hip

diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 6e3660bea9d06..5fc270681683a 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -9915,7 +9915,7 @@ Sema::PerformCopyInitialization(const InitializedEntity &Entity,
     EqualLoc = InitE->getBeginLoc();
 
   if (Entity.getType().getDesugaredType(Context) ==
-      Context.AMDGPUFeaturePredicateTy) {
+      Context.AMDGPUFeaturePredicateTy && Entity.getDecl()) {
     Diag(EqualLoc, diag::err_amdgcn_predicate_type_is_not_constructible)
         << Entity.getDecl();
     return ExprError();
diff --git a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
index 9e50f9493977f..78f18d3a37b46 100644
--- a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
+++ b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
@@ -41,3 +41,8 @@ void invalid_invocations(int x, const char* str) {
     // CHECK: error: use of undeclared identifier '__builtin_ia32_pause'
     else if (__builtin_amdgcn_is_invocable(__builtin_ia32_pause)) return;
 }
+
+bool return_needs_cast() {
+    // CHECK: error: '__builtin_amdgcn_processor_is("gfx900")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided
+    return __builtin_amdgcn_processor_is("gfx900");
+}
diff --git a/clang/test/SemaHIP/amdgpu-feature-builtins-return-type-deduction.hip b/clang/test/SemaHIP/amdgpu-feature-builtins-return-type-deduction.hip
new file mode 100644
index 0000000000000..27bbb3f2f3d07
--- /dev/null
+++ b/clang/test/SemaHIP/amdgpu-feature-builtins-return-type-deduction.hip
@@ -0,0 +1,31 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: spirv-registered-target
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx900 -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1201 -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace
+// RUN: %clang_cc1 -triple x86_64 -aux-triple amdgcn -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace
+// RUN: %clang_cc1 -triple x86_64 -aux-triple spirv64-amd-amdhsa -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace
+
+__attribute__((device)) auto foo() {
+  return __builtin_amdgcn_processor_is("gfx900");
+}
+
+__attribute__((device)) decltype(auto) bar() {
+  return __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep);
+}
+
+// CHECK: |-TypedefDecl {{.*}} implicit __amdgpu_feature_predicate_t '__amdgpu_feature_predicate_t'
+// CHECK-NEXT: | `-BuiltinType {{.*}} '__amdgpu_feature_predicate_t'
+// CHECK-DAG: |-FunctionDecl {{.*}} foo '__amdgpu_feature_predicate_t ()'
+// CHECK-NEXT: |-CompoundStmt {{.*}}
+// CHECK-NEXT: | `-ReturnStmt {{.*}}
+// CHECK-NEXT: |   `-CallExpr {{.*}} '__amdgpu_feature_predicate_t'
+// CHECK-NEXT: |     |-ImplicitCastExpr {{.*}} '__amdgpu_feature_predicate_t (*)(const char *) noexcept'
+// CHECK-NEXT: |     | `-DeclRefExpr {{.*}} Function {{.*}} '__builtin_amdgcn_processor_is' '__amdgpu_feature_predicate_t (const char *) noexcept'
+// CHECK-NEXT: |     `-StringLiteral {{.*}} "gfx900"
+// CHECK-DAG: |-FunctionDecl {{.*}} bar '__amdgpu_feature_predicate_t ()'
+// CHECK-NEXT: |-CompoundStmt {{.*}}
+// CHECK-NEXT: | `-ReturnStmt {{.*}}
+// CHECK-NEXT: |   `-CallExpr {{.*}} '__amdgpu_feature_predicate_t'
+// CHECK-NEXT: |     |-ImplicitCastExpr {{.*}} '__amdgpu_feature_predicate_t (*)() noexcept' <BuiltinFnToFnPtr>
+// CHECK-NEXT: |     | `-DeclRefExpr {{.*}} Function {{.*}} '__builtin_amdgcn_is_invocable' '__amdgpu_feature_predicate_t () noexcept'

>From 012f74d7418f6cbdacf22e694beaf6cecebd6c81 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Mon, 2 Jun 2025 22:40:49 +0100
Subject: [PATCH 27/69] Fix formatting.

---
 clang/lib/Sema/SemaInit.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 5fc270681683a..e0a5b6f2ac1a5 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -9915,7 +9915,8 @@ Sema::PerformCopyInitialization(const InitializedEntity &Entity,
     EqualLoc = InitE->getBeginLoc();
 
   if (Entity.getType().getDesugaredType(Context) ==
-      Context.AMDGPUFeaturePredicateTy && Entity.getDecl()) {
+          Context.AMDGPUFeaturePredicateTy &&
+      Entity.getDecl()) {
     Diag(EqualLoc, diag::err_amdgcn_predicate_type_is_not_constructible)
         << Entity.getDecl();
     return ExprError();

>From 33bbe3566986f9acc64ca082ba6c67d85cf4067e Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Tue, 10 Jun 2025 16:51:26 +0100
Subject: [PATCH 28/69] Add predicate expansion pass to LTO pipeline.

---
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp    | 5 +++++
 llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll | 8 +++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 3405020467336..fb6099d6cd380 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -816,6 +816,11 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
         PM.addPass(AMDGPUExpandFeaturePredicatesPass(*this));
       });
 
+  PB.registerFullLinkTimeOptimizationEarlyEPCallback(
+      [this](ModulePassManager &PM, OptimizationLevel) {
+        PM.addPass(AMDGPUExpandFeaturePredicatesPass(*this));
+      });
+
   PB.registerScalarOptimizerLateEPCallback(
       [this](FunctionPassManager &FPM, OptimizationLevel Level) {
         if (Level == OptimizationLevel::O0)
diff --git a/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll b/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll
index b1fc76f457ece..93f43b274e28d 100644
--- a/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll
+++ b/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll
@@ -2,16 +2,22 @@
 ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto<O1>" -print-pipeline-passes %s -o - | FileCheck %s
 ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto<O2>" -print-pipeline-passes %s -o - | FileCheck %s
 ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto<O3>" -print-pipeline-passes %s -o - | FileCheck %s
+; RUN: opt -mtriple=amdgcn--amdhsa -S -O0 -print-pipeline-passes %s -o - | FileCheck --check-prefix=O0 %s
+; RUN: opt -mtriple=amdgcn--amdhsa -S -O1 -print-pipeline-passes %s -o - | FileCheck %s
+; RUN: opt -mtriple=amdgcn--amdhsa -S -O2 -print-pipeline-passes %s -o - | FileCheck %s
+; RUN: opt -mtriple=amdgcn--amdhsa -S -O3 -print-pipeline-passes %s -o - | FileCheck %s
 
 ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link<O0>" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s
 ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link<O1>" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s
 ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link<O2>" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s
 ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link<O3>" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s
 
-
+; CHECK: amdgpu-expand-feature-predicates
 ; CHECK: amdgpu-attributor
+; O0: amdgpu-expand-feature-predicates
 ; O0-NOT: amdgpu-attributor
 
+; PRE: amdgpu-expand-feature-predicates
 ; PRE-NOT: internalize
 ; PRE-NOT: amdgpu-attributor
 ; PRE-NOT: printfToRuntime

>From 81a55d8eb76513d56159f72218bd26aff362e30b Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Sat, 14 Jun 2025 00:45:55 +0100
Subject: [PATCH 29/69] Try to fix odd but persistent doc generation error.

---
 clang/include/clang/Basic/DiagnosticGroups.td | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 38b4f581fa5c9..8e9621259e295 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -827,8 +827,7 @@ The warning can be resolved by removing one of the conditions above. In rough
 order of preference, this may be done by:
 1. Marking the object ``const`` (if possible)
 2. Moving the object's definition to a source file
-3. Making the object visible using ``__attribute((visibility("default")))``,
-   ``__declspec(dllimport)``, or ``__declspec(dllexport)``.
+3. Making the object visible using ``__attribute((visibility("default")))``, ``__declspec(dllimport)``, or ``__declspec(dllexport)``.
 
 When annotating an object with ``__declspec(dllimport)`` or ``__declspec(dllexport)``,
 take care to ensure that the object is only exported from one dll, and is imported

>From c495630dde49f7662308a4687be2d5edbe46b182 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 18 Jun 2025 18:18:01 +0100
Subject: [PATCH 30/69] Adopt suggestions.

---
 clang/lib/Sema/SemaExpr.cpp | 2 +-
 clang/lib/Sema/SemaInit.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 79577fca323c6..b029036b49a0e 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -6576,7 +6576,7 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc,
   // without any additional checking.
   if (Fn->getType() == Context.BuiltinFnTy && ArgExprs.size() == 1 &&
       ArgExprs[0]->getType() == Context.BuiltinFnTy) {
-    auto *FD = cast<FunctionDecl>(Fn->getReferencedDeclOfCallee());
+    const auto *FD = cast<FunctionDecl>(Fn->getReferencedDeclOfCallee());
 
     if (FD->getName() == "__builtin_amdgcn_is_invocable") {
       QualType FnPtrTy = Context.getPointerType(FD->getType());
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 60110165e08f0..8f1a935559b19 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -9103,7 +9103,7 @@ bool InitializationSequence::Diagnose(Sema &S,
   case FK_ConversionFailed: {
     QualType FromType = OnlyArg->getType();
     // __amdgpu_feature_predicate_t can be explicitly cast to the logical op
-    // type, although this is almost always an error and we advise against it
+    // type, although this is almost always an error and we advise against it.
     if (FromType == S.Context.AMDGPUFeaturePredicateTy &&
         DestType == S.Context.getLogicalOperationType()) {
       S.Diag(OnlyArg->getExprLoc(),

>From dc0221e07b2e2e76ded776b488fda67a7ac2f835 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Tue, 24 Jun 2025 00:41:44 +0100
Subject: [PATCH 31/69] Implement some of the review suggestions.

---
 .../AMDGPU/AMDGPUExpandFeaturePredicates.cpp  | 24 +++++++++----------
 .../amdgpu-expand-feature-predicates.ll       |  6 ++---
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
index f1c73e86fb4a0..06cd2d474df87 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -13,12 +13,10 @@
 // (AMDGCNSPIRV). These placeholder globals are used to guide target specific
 // lowering, once the concrete target is known, by way of constant folding their
 // value all the way into a terminator (i.e. a controlled block) or into a no
-// live use scenario. The pass makes a best effort attempt to look through
-// calls, i.e. a constant evaluatable passthrough of a predicate value will
-// generally work, however we hard fail if the folding fails, to avoid obtuse
-// BE errors or opaque run time errors. This pass should run as early as
-// possible / immediately after Clang CodeGen, so that the optimisation pipeline
-// and the BE operate with concrete target data.
+// live use scenario. We hard fail if the folding fails, to avoid obtuse BE
+// errors or opaque run time errors. This pass should run as early as possible /
+// immediately after Clang CodeGen, so that the optimisation pipeline and the BE
+// operate with concrete target data.
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
@@ -50,13 +48,13 @@ template <typename C> void collectUsers(Value *V, C &Container) {
 }
 
 inline void setPredicate(const GCNSubtarget &ST, GlobalVariable *P) {
-  const auto IsFeature = P->getName().starts_with("llvm.amdgcn.has");
-  const auto Offset =
+  const bool IsFeature = P->getName().starts_with("llvm.amdgcn.has");
+  const size_t Offset =
       IsFeature ? sizeof("llvm.amdgcn.has") : sizeof("llvm.amdgcn.is");
 
-  auto PV = P->getName().substr(Offset).str();
+  std::string PV = P->getName().substr(Offset).str();
   if (IsFeature) {
-    auto Dx = PV.find(',');
+    size_t Dx = PV.find(',');
     while (Dx != std::string::npos) {
       PV.insert(++Dx, {'+'});
 
@@ -65,7 +63,7 @@ inline void setPredicate(const GCNSubtarget &ST, GlobalVariable *P) {
     PV.insert(PV.cbegin(), '+');
   }
 
-  auto *PTy = P->getValueType();
+  Type *PTy = P->getValueType();
   P->setLinkage(GlobalValue::PrivateLinkage);
   P->setExternallyInitialized(false);
 
@@ -103,6 +101,8 @@ std::pair<PreservedAnalyses, bool> handlePredicate(const GCNSubtarget &ST,
     auto *I = *ToFold.begin();
     ToFold.erase(I);
 
+    I->dropDroppableUses();
+
     if (auto *C = ConstantFoldInstruction(I, P->getDataLayout())) {
       collectUsers(I, ToFold);
       I->replaceAllUsesWith(C);
@@ -110,8 +110,6 @@ std::pair<PreservedAnalyses, bool> handlePredicate(const GCNSubtarget &ST,
       continue;
     } else if (I->isTerminator() && ConstantFoldTerminator(I->getParent())) {
       continue;
-    } else if (I->users().empty()) {
-      continue;
     }
 
     return unfoldableFound(I->getParent()->getParent(), P, I);
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll
index 277323c353260..60e1954220738 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll
@@ -121,7 +121,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) {
 ; GFX1010-NEXT:    br label %[[IF_END6]]
 ; GFX1010:       [[IF_END6]]:
 ; GFX1010-NEXT:    call void @llvm.assume(i1 true)
-; GFX1010-NEXT:    call void @llvm.assume(i1 false)
+; GFX1010-NEXT:    call void @llvm.assume(i1 true)
 ; GFX1010-NEXT:    br label %[[FOR_COND]]
 ; GFX1010:       [[FOR_COND]]:
 ; GFX1010-NEXT:    [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
@@ -167,7 +167,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) {
 ; GFX1101-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
 ; GFX1101-NEXT:    br label %[[IF_END6]]
 ; GFX1101:       [[IF_END6]]:
-; GFX1101-NEXT:    call void @llvm.assume(i1 false)
+; GFX1101-NEXT:    call void @llvm.assume(i1 true)
 ; GFX1101-NEXT:    call void @llvm.assume(i1 true)
 ; GFX1101-NEXT:    br label %[[FOR_COND:.*]]
 ; GFX1101:       [[FOR_COND]]:
@@ -278,7 +278,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) {
 ; GFX1201-W64-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
 ; GFX1201-W64-NEXT:    br label %[[IF_END11]]
 ; GFX1201-W64:       [[IF_END11]]:
-; GFX1201-W64-NEXT:    call void @llvm.assume(i1 false)
+; GFX1201-W64-NEXT:    call void @llvm.assume(i1 true)
 ; GFX1201-W64-NEXT:    [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
 ; GFX1201-W64-NEXT:    [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
 ; GFX1201-W64-NEXT:    store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4

>From 3b727b9c75180079e11783bd52a1578f02d30fb4 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Tue, 24 Jun 2025 23:10:51 +0100
Subject: [PATCH 32/69] Clean up unreachable BBs.

---
 .../AMDGPU/AMDGPUExpandFeaturePredicates.cpp  |  24 ++-
 .../amdgpu-expand-feature-predicates.ll       | 163 +++++-------------
 2 files changed, 62 insertions(+), 125 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
index 06cd2d474df87..cd9e29a4e7d67 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -87,8 +87,9 @@ unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) {
   return {PreservedAnalyses::none(), false};
 }
 
-std::pair<PreservedAnalyses, bool> handlePredicate(const GCNSubtarget &ST,
-                                                   GlobalVariable *P) {
+std::pair<PreservedAnalyses, bool>
+handlePredicate(const GCNSubtarget &ST, FunctionAnalysisManager &FAM,
+                SmallPtrSet<Function *, 32> &Predicated, GlobalVariable *P) {
   setPredicate(ST, P);
 
   SmallPtrSet<Instruction *, 32> ToFold;
@@ -98,18 +99,25 @@ std::pair<PreservedAnalyses, bool> handlePredicate(const GCNSubtarget &ST,
     return {PreservedAnalyses::all(), true};
 
   do {
-    auto *I = *ToFold.begin();
+    Instruction *I = *ToFold.begin();
     ToFold.erase(I);
 
     I->dropDroppableUses();
 
+    Function *F = I->getParent()->getParent();
+    auto &DT = FAM.getResult<DominatorTreeAnalysis>(*F);
+    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+
     if (auto *C = ConstantFoldInstruction(I, P->getDataLayout())) {
       collectUsers(I, ToFold);
       I->replaceAllUsesWith(C);
       I->eraseFromParent();
       continue;
-    } else if (I->isTerminator() && ConstantFoldTerminator(I->getParent())) {
-      continue;
+    } else if (I->isTerminator() &&
+               ConstantFoldTerminator(I->getParent(), true, nullptr, &DTU)) {
+        Predicated.insert(F);
+
+        continue;
     }
 
     return unfoldableFound(I->getParent()->getParent(), P, I);
@@ -138,9 +146,11 @@ AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) {
   const auto &ST = TM.getSubtarget<GCNSubtarget>(
       *find_if(M, [](auto &&F) { return !F.isIntrinsic(); }));
 
+  auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  SmallPtrSet<Function *, 32> Predicated;
   auto Ret = PreservedAnalyses::all();
   for (auto &&P : Predicates) {
-    auto R = handlePredicate(ST, P);
+    auto R = handlePredicate(ST, FAM, Predicated, P);
 
     if (!R.second)
       break;
@@ -150,6 +160,8 @@ AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) {
 
   for (auto &&P : Predicates)
     P->eraseFromParent();
+  for (auto &&F : Predicated)
+    removeUnreachableBlocks(*F);
 
   return Ret;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll
index 60e1954220738..a16a7fc31da22 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll
@@ -54,42 +54,23 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) {
 ; GFX906-NEXT:  [[ENTRY:.*:]]
 ; GFX906-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
 ; GFX906-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
-; GFX906-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS1:.*]]
-; GFX906:       [[IF_GFX1201_OR_GFX12_INSTS1]]:
-; GFX906-NEXT:    br label %[[IF_NOT_GFX906:.*]]
-; GFX906:       [[IF_GFX1201_OR_GFX12_INSTS:.*:]]
-; GFX906-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
-; GFX906-NEXT:    br label %[[IF_NOT_GFX906]]
-; GFX906:       [[IF_NOT_GFX906]]:
-; GFX906-NEXT:    br label %[[IF_GFX1010_OR_GFX1102:.*]]
-; GFX906:       [[IF_NOT_GFX907:.*:]]
-; GFX906-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
-; GFX906-NEXT:    br label %[[IF_END6:.*]]
-; GFX906:       [[IF_GFX1010_OR_GFX1102]]:
+; GFX906-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]]
+; GFX906:       [[IF_GFX1201_OR_GFX12_INSTS]]:
+; GFX906-NEXT:    br label %[[IF_NOT_GFX907:.*]]
+; GFX906:       [[IF_NOT_GFX907]]:
+; GFX906-NEXT:    br label %[[IF_GFX1010_OR_GFX1101:.*]]
+; GFX906:       [[IF_GFX1010_OR_GFX1101]]:
 ; GFX906-NEXT:    br label %[[LOR_NOT_GFX1010:.*]]
 ; GFX906:       [[LOR_NOT_GFX1010]]:
 ; GFX906-NEXT:    br label %[[FOR_COND:.*]]
-; GFX906:       [[IF_GFX1010_OR_GFX1101:.*:]]
-; GFX906-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
-; GFX906-NEXT:    br label %[[IF_END6]]
-; GFX906:       [[IF_END6]]:
-; GFX906-NEXT:    call void @llvm.assume(i1 true)
-; GFX906-NEXT:    call void @llvm.assume(i1 true)
-; GFX906-NEXT:    br label %[[FOR_COND]]
 ; GFX906:       [[FOR_COND]]:
 ; GFX906-NEXT:    [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
 ; GFX906-NEXT:    [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
 ; GFX906-NEXT:    store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
-; GFX906-NEXT:    br label %[[IF_GFX10_INSTS1:.*]]
-; GFX906:       [[IF_GFX11_INSTS:.*:]]
-; GFX906-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
-; GFX906-NEXT:    br label %[[IF_END11:.*]]
-; GFX906:       [[IF_GFX10_INSTS1]]:
-; GFX906-NEXT:    br label %[[IF_END11]]
-; GFX906:       [[IF_GFX10_INSTS:.*:]]
-; GFX906-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
-; GFX906-NEXT:    br label %[[IF_END11]]
-; GFX906:       [[IF_END11]]:
+; GFX906-NEXT:    br label %[[IF_GFX11_INSTS:.*]]
+; GFX906:       [[IF_GFX11_INSTS]]:
+; GFX906-NEXT:    br label %[[IF_GFX10_INSTS:.*]]
+; GFX906:       [[IF_GFX10_INSTS]]:
 ; GFX906-NEXT:    call void @llvm.assume(i1 true)
 ; GFX906-NEXT:    [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
 ; GFX906-NEXT:    [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
@@ -101,41 +82,28 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) {
 ; GFX1010-NEXT:  [[ENTRY:.*:]]
 ; GFX1010-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
 ; GFX1010-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
-; GFX1010-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS1:.*]]
-; GFX1010:       [[IF_GFX1201_OR_GFX12_INSTS1]]:
-; GFX1010-NEXT:    br label %[[IF_END:.*]]
-; GFX1010:       [[IF_GFX1201_OR_GFX12_INSTS:.*:]]
-; GFX1010-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
-; GFX1010-NEXT:    br label %[[IF_END]]
-; GFX1010:       [[IF_END]]:
-; GFX1010-NEXT:    br label %[[IF_NOT_GFX907:.*]]
-; GFX1010:       [[IF_NOT_GFX907]]:
+; GFX1010-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]]
+; GFX1010:       [[IF_GFX1201_OR_GFX12_INSTS]]:
+; GFX1010-NEXT:    br label %[[IF_NOT_GFX906:.*]]
+; GFX1010:       [[IF_NOT_GFX906]]:
+; GFX1010-NEXT:    br label %[[LOR_NOT_GFX1010:.*]]
+; GFX1010:       [[LOR_NOT_GFX1010]]:
 ; GFX1010-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
 ; GFX1010-NEXT:    br label %[[IF_END6:.*]]
-; GFX1010:       [[IF_NOT_GFX906:.*:]]
-; GFX1010-NEXT:    br label %[[IF_GFX1010_OR_GFX1101:.*]]
-; GFX1010:       [[LOR_NOT_GFX1010:.*:]]
-; GFX1010-NEXT:    br label %[[FOR_COND:.*]]
-; GFX1010:       [[IF_GFX1010_OR_GFX1101]]:
-; GFX1010-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
-; GFX1010-NEXT:    br label %[[IF_END6]]
 ; GFX1010:       [[IF_END6]]:
 ; GFX1010-NEXT:    call void @llvm.assume(i1 true)
 ; GFX1010-NEXT:    call void @llvm.assume(i1 true)
-; GFX1010-NEXT:    br label %[[FOR_COND]]
+; GFX1010-NEXT:    br label %[[FOR_COND:.*]]
 ; GFX1010:       [[FOR_COND]]:
 ; GFX1010-NEXT:    [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
 ; GFX1010-NEXT:    [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
 ; GFX1010-NEXT:    store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
-; GFX1010-NEXT:    br label %[[IF_ELSE8:.*]]
-; GFX1010:       [[IF_GFX11_INSTS:.*:]]
-; GFX1010-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
-; GFX1010-NEXT:    br label %[[IF_END11:.*]]
-; GFX1010:       [[IF_ELSE8]]:
+; GFX1010-NEXT:    br label %[[IF_GFX11_INSTS:.*]]
+; GFX1010:       [[IF_GFX11_INSTS]]:
 ; GFX1010-NEXT:    br label %[[IF_GFX10_INSTS:.*]]
 ; GFX1010:       [[IF_GFX10_INSTS]]:
 ; GFX1010-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
-; GFX1010-NEXT:    br label %[[IF_END11]]
+; GFX1010-NEXT:    br label %[[IF_END11:.*]]
 ; GFX1010:       [[IF_END11]]:
 ; GFX1010-NEXT:    call void @llvm.assume(i1 true)
 ; GFX1010-NEXT:    [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
@@ -148,25 +116,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) {
 ; GFX1101-NEXT:  [[ENTRY:.*:]]
 ; GFX1101-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
 ; GFX1101-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
-; GFX1101-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS1:.*]]
-; GFX1101:       [[IF_GFX1201_OR_GFX12_INSTS1]]:
+; GFX1101-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]]
+; GFX1101:       [[IF_GFX1201_OR_GFX12_INSTS]]:
 ; GFX1101-NEXT:    br label %[[IF_END:.*]]
-; GFX1101:       [[IF_GFX1201_OR_GFX12_INSTS:.*:]]
-; GFX1101-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
-; GFX1101-NEXT:    br label %[[IF_END]]
 ; GFX1101:       [[IF_END]]:
 ; GFX1101-NEXT:    br label %[[IF_NOT_GFX907:.*]]
 ; GFX1101:       [[IF_NOT_GFX907]]:
 ; GFX1101-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
-; GFX1101-NEXT:    br label %[[IF_END6:.*]]
-; GFX1101:       [[IF_NOT_GFX906:.*:]]
-; GFX1101-NEXT:    br label %[[LOR_NOT_GFX1010:.*]]
-; GFX1101:       [[LOR_NOT_GFX1010]]:
-; GFX1101-NEXT:    br label %[[IF_GFX1010_OR_GFX1101:.*]]
-; GFX1101:       [[IF_GFX1010_OR_GFX1101]]:
-; GFX1101-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
-; GFX1101-NEXT:    br label %[[IF_END6]]
-; GFX1101:       [[IF_END6]]:
+; GFX1101-NEXT:    br label %[[IF_NOT_GFX906:.*]]
+; GFX1101:       [[IF_NOT_GFX906]]:
 ; GFX1101-NEXT:    call void @llvm.assume(i1 true)
 ; GFX1101-NEXT:    call void @llvm.assume(i1 true)
 ; GFX1101-NEXT:    br label %[[FOR_COND:.*]]
@@ -177,13 +135,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) {
 ; GFX1101-NEXT:    br label %[[IF_GFX11_INSTS:.*]]
 ; GFX1101:       [[IF_GFX11_INSTS]]:
 ; GFX1101-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
-; GFX1101-NEXT:    br label %[[IF_END11:.*]]
-; GFX1101:       [[IF_ELSE8:.*:]]
-; GFX1101-NEXT:    br label %[[IF_GFX10_INSTS:.*]]
-; GFX1101:       [[IF_GFX10_INSTS]]:
-; GFX1101-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
-; GFX1101-NEXT:    br label %[[IF_END11]]
-; GFX1101:       [[IF_END11]]:
+; GFX1101-NEXT:    br label %[[IF_ELSE8:.*]]
+; GFX1101:       [[IF_ELSE8]]:
 ; GFX1101-NEXT:    call void @llvm.assume(i1 true)
 ; GFX1101-NEXT:    [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
 ; GFX1101-NEXT:    [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
@@ -195,28 +148,19 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) {
 ; GFX1201-NEXT:  [[ENTRY:.*:]]
 ; GFX1201-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
 ; GFX1201-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
-; GFX1201-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]]
-; GFX1201:       [[LOR_NOT_GFX1201:.*:]]
-; GFX1201-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS]]
-; GFX1201:       [[IF_GFX1201_OR_GFX12_INSTS]]:
+; GFX1201-NEXT:    br label %[[LOR_NOT_GFX1201:.*]]
+; GFX1201:       [[LOR_NOT_GFX1201]]:
 ; GFX1201-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
-; GFX1201-NEXT:    br label %[[IF_END:.*]]
-; GFX1201:       [[IF_END]]:
-; GFX1201-NEXT:    br label %[[IF_NOT_GFX907:.*]]
-; GFX1201:       [[IF_NOT_GFX907]]:
+; GFX1201-NEXT:    br label %[[IF_NOT_GFX906:.*]]
+; GFX1201:       [[IF_NOT_GFX906]]:
+; GFX1201-NEXT:    br label %[[IF_GFX1010_OR_GFX1101:.*]]
+; GFX1201:       [[IF_GFX1010_OR_GFX1101]]:
 ; GFX1201-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
 ; GFX1201-NEXT:    br label %[[IF_END6:.*]]
-; GFX1201:       [[IF_NOT_GFX906:.*:]]
-; GFX1201-NEXT:    br label %[[IF_GFX1010_OR_GFX1102:.*]]
-; GFX1201:       [[IF_GFX1010_OR_GFX1102]]:
-; GFX1201-NEXT:    br label %[[FOR_COND:.*]]
-; GFX1201:       [[IF_GFX1010_OR_GFX1101:.*:]]
-; GFX1201-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
-; GFX1201-NEXT:    br label %[[IF_END6]]
 ; GFX1201:       [[IF_END6]]:
 ; GFX1201-NEXT:    call void @llvm.assume(i1 true)
 ; GFX1201-NEXT:    call void @llvm.assume(i1 true)
-; GFX1201-NEXT:    br label %[[FOR_COND]]
+; GFX1201-NEXT:    br label %[[FOR_COND:.*]]
 ; GFX1201:       [[FOR_COND]]:
 ; GFX1201-NEXT:    [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
 ; GFX1201-NEXT:    [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
@@ -224,13 +168,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) {
 ; GFX1201-NEXT:    br label %[[IF_GFX11_INSTS:.*]]
 ; GFX1201:       [[IF_GFX11_INSTS]]:
 ; GFX1201-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
-; GFX1201-NEXT:    br label %[[IF_END11:.*]]
-; GFX1201:       [[IF_ELSE8:.*:]]
-; GFX1201-NEXT:    br label %[[IF_GFX10_INSTS:.*]]
-; GFX1201:       [[IF_GFX10_INSTS]]:
-; GFX1201-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
-; GFX1201-NEXT:    br label %[[IF_END11]]
-; GFX1201:       [[IF_END11]]:
+; GFX1201-NEXT:    br label %[[IF_ELSE8:.*]]
+; GFX1201:       [[IF_ELSE8]]:
 ; GFX1201-NEXT:    call void @llvm.assume(i1 true)
 ; GFX1201-NEXT:    [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
 ; GFX1201-NEXT:    [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
@@ -242,28 +181,19 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) {
 ; GFX1201-W64-NEXT:  [[ENTRY:.*:]]
 ; GFX1201-W64-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
 ; GFX1201-W64-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
-; GFX1201-W64-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]]
-; GFX1201-W64:       [[LOR_NOT_GFX1201:.*:]]
-; GFX1201-W64-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS]]
-; GFX1201-W64:       [[IF_GFX1201_OR_GFX12_INSTS]]:
+; GFX1201-W64-NEXT:    br label %[[LOR_NOT_GFX1201:.*]]
+; GFX1201-W64:       [[LOR_NOT_GFX1201]]:
 ; GFX1201-W64-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
-; GFX1201-W64-NEXT:    br label %[[IF_END:.*]]
-; GFX1201-W64:       [[IF_END]]:
-; GFX1201-W64-NEXT:    br label %[[IF_NOT_GFX907:.*]]
-; GFX1201-W64:       [[IF_NOT_GFX907]]:
+; GFX1201-W64-NEXT:    br label %[[IF_NOT_GFX906:.*]]
+; GFX1201-W64:       [[IF_NOT_GFX906]]:
+; GFX1201-W64-NEXT:    br label %[[IF_GFX1010_OR_GFX1101:.*]]
+; GFX1201-W64:       [[IF_GFX1010_OR_GFX1101]]:
 ; GFX1201-W64-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
 ; GFX1201-W64-NEXT:    br label %[[IF_END6:.*]]
-; GFX1201-W64:       [[IF_NOT_GFX906:.*:]]
-; GFX1201-W64-NEXT:    br label %[[IF_GFX1010_OR_GFX1102:.*]]
-; GFX1201-W64:       [[IF_GFX1010_OR_GFX1102]]:
-; GFX1201-W64-NEXT:    br label %[[FOR_COND:.*]]
-; GFX1201-W64:       [[IF_GFX1010_OR_GFX1101:.*:]]
-; GFX1201-W64-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
-; GFX1201-W64-NEXT:    br label %[[IF_END6]]
 ; GFX1201-W64:       [[IF_END6]]:
 ; GFX1201-W64-NEXT:    call void @llvm.assume(i1 true)
 ; GFX1201-W64-NEXT:    call void @llvm.assume(i1 true)
-; GFX1201-W64-NEXT:    br label %[[FOR_COND]]
+; GFX1201-W64-NEXT:    br label %[[FOR_COND:.*]]
 ; GFX1201-W64:       [[FOR_COND]]:
 ; GFX1201-W64-NEXT:    [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
 ; GFX1201-W64-NEXT:    [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
@@ -271,13 +201,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) {
 ; GFX1201-W64-NEXT:    br label %[[IF_GFX11_INSTS:.*]]
 ; GFX1201-W64:       [[IF_GFX11_INSTS]]:
 ; GFX1201-W64-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
-; GFX1201-W64-NEXT:    br label %[[IF_END11:.*]]
-; GFX1201-W64:       [[IF_ELSE8:.*:]]
-; GFX1201-W64-NEXT:    br label %[[IF_GFX10_INSTS:.*]]
-; GFX1201-W64:       [[IF_GFX10_INSTS]]:
-; GFX1201-W64-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
-; GFX1201-W64-NEXT:    br label %[[IF_END11]]
-; GFX1201-W64:       [[IF_END11]]:
+; GFX1201-W64-NEXT:    br label %[[IF_ELSE8:.*]]
+; GFX1201-W64:       [[IF_ELSE8]]:
 ; GFX1201-W64-NEXT:    call void @llvm.assume(i1 true)
 ; GFX1201-W64-NEXT:    [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
 ; GFX1201-W64-NEXT:    [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]

>From 246ff3809552af6c14d0953f8a4352de32659513 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Thu, 3 Jul 2025 03:06:57 +0100
Subject: [PATCH 33/69] Fix formatting.

---
 llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
index cd9e29a4e7d67..cf95171df55c2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -115,9 +115,9 @@ handlePredicate(const GCNSubtarget &ST, FunctionAnalysisManager &FAM,
       continue;
     } else if (I->isTerminator() &&
                ConstantFoldTerminator(I->getParent(), true, nullptr, &DTU)) {
-        Predicated.insert(F);
+      Predicated.insert(F);
 
-        continue;
+      continue;
     }
 
     return unfoldableFound(I->getParent()->getParent(), P, I);

>From 6b368d593156ced102e265f99ea6e057e7599274 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 4 Jul 2025 19:15:24 +0100
Subject: [PATCH 34/69] Remove internal functions made unreachable by predicate
 expansion.

---
 .../AMDGPU/AMDGPUExpandFeaturePredicates.cpp  |  30 ++++-
 ...predicates-remove-unreachable-functions.ll | 104 ++++++++++++++++++
 2 files changed, 130 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-remove-unreachable-functions.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
index cf95171df55c2..40ab71b609bc8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -127,6 +127,30 @@ handlePredicate(const GCNSubtarget &ST, FunctionAnalysisManager &FAM,
 }
 } // Unnamed namespace.
 
+static inline SmallVector<Function *> collectUsedFunctions(Module &M) {
+  SmallVector<Function *> Ret;
+  for (auto &&F : M) {
+    if (F.isIntrinsic() || F.isDeclaration())
+      continue;
+    if (!F.hasInternalLinkage() && !F.hasPrivateLinkage())
+      continue;
+    if (F.hasNUndroppableUsesOrMore(1))
+      Ret.push_back(&F);
+  }
+
+  return Ret;
+}
+
+template<typename Container0, typename Container1, typename Container2>
+static inline void removeUnreachable(const Container0 &Predicates,
+                                     const Container1 &PredicatedFns,
+                                     const Container2 &UnreachableFns) {
+  for_each(Predicates, [](auto &&P) { P->eraseFromParent(); });
+  for_each(PredicatedFns, [](auto &&F) { removeUnreachableBlocks(*F); });
+  for_each(UnreachableFns,
+           [](auto &&F) { if (F->getNumUses() == 0) F->eraseFromParent(); });
+}
+
 PreservedAnalyses
 AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) {
   if (M.empty())
@@ -148,6 +172,7 @@ AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) {
 
   auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
   SmallPtrSet<Function *, 32> Predicated;
+  SmallVector<Function *> MaybeUnreachable = collectUsedFunctions(M);
   auto Ret = PreservedAnalyses::all();
   for (auto &&P : Predicates) {
     auto R = handlePredicate(ST, FAM, Predicated, P);
@@ -158,10 +183,7 @@ AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) {
     Ret.intersect(R.first);
   }
 
-  for (auto &&P : Predicates)
-    P->eraseFromParent();
-  for (auto &&F : Predicated)
-    removeUnreachableBlocks(*F);
+  removeUnreachable(Predicates, Predicated, MaybeUnreachable);
 
   return Ret;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-remove-unreachable-functions.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-remove-unreachable-functions.ll
new file mode 100644
index 0000000000000..c5089de333849
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-remove-unreachable-functions.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --version 5
+; REQUIRES: amdgpu-registered-target
+
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX906 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201 %s
+
+ at llvm.amdgcn.is.gfx906 = external addrspace(1) externally_initialized constant i1
+ at llvm.amdgcn.is.gfx1201 = external addrspace(1) externally_initialized constant i1
+
+define external void @extern_linkage() {
+; GFX906-LABEL: define void @extern_linkage(
+; GFX906-SAME: ) #[[ATTR0:[0-9]+]] {
+; GFX906-NEXT:  [[ENTRY:.*:]]
+; GFX906-NEXT:    ret void
+;
+; GFX1201-LABEL: define void @extern_linkage(
+; GFX1201-SAME: ) #[[ATTR0:[0-9]+]] {
+; GFX1201-NEXT:  [[ENTRY:.*:]]
+; GFX1201-NEXT:    ret void
+;
+entry:
+  ret void
+}
+
+define private void @non_predicated_uses() {
+; GFX906-LABEL: define private void @non_predicated_uses(
+; GFX906-SAME: ) #[[ATTR0]] {
+; GFX906-NEXT:  [[ENTRY:.*:]]
+; GFX906-NEXT:    ret void
+;
+; GFX1201-LABEL: define private void @non_predicated_uses(
+; GFX1201-SAME: ) #[[ATTR0]] {
+; GFX1201-NEXT:  [[ENTRY:.*:]]
+; GFX1201-NEXT:    ret void
+;
+entry:
+  ret void
+}
+
+define internal void @remove_on_906() {
+; GFX1201-LABEL: define internal void @remove_on_906(
+; GFX1201-SAME: ) #[[ATTR0]] {
+; GFX1201-NEXT:  [[ENTRY:.*:]]
+; GFX1201-NEXT:    ret void
+;
+entry:
+  ret void
+}
+
+define internal void @remove_on_1201() {
+; GFX906-LABEL: define internal void @remove_on_1201(
+; GFX906-SAME: ) #[[ATTR0]] {
+; GFX906-NEXT:  [[ENTRY:.*:]]
+; GFX906-NEXT:    ret void
+;
+entry:
+  ret void
+}
+
+define void @foo() {
+; GFX906-LABEL: define void @foo(
+; GFX906-SAME: ) #[[ATTR0]] {
+; GFX906-NEXT:  [[ENTRY:.*:]]
+; GFX906-NEXT:    call void @non_predicated_uses()
+; GFX906-NEXT:    br label %[[NOT_GFX1201:.*]]
+; GFX906:       [[NOT_GFX1201]]:
+; GFX906-NEXT:    br label %[[GFX906:.*]]
+; GFX906:       [[GFX906]]:
+; GFX906-NEXT:    call void @remove_on_1201()
+; GFX906-NEXT:    br label %[[END:.*]]
+; GFX906:       [[END]]:
+; GFX906-NEXT:    ret void
+;
+; GFX1201-LABEL: define void @foo(
+; GFX1201-SAME: ) #[[ATTR0]] {
+; GFX1201-NEXT:  [[ENTRY:.*:]]
+; GFX1201-NEXT:    call void @non_predicated_uses()
+; GFX1201-NEXT:    br label %[[GFX1201:.*]]
+; GFX1201:       [[GFX1201]]:
+; GFX1201-NEXT:    call void @remove_on_906()
+; GFX1201-NEXT:    br label %[[END:.*]]
+; GFX1201:       [[END]]:
+; GFX1201-NEXT:    ret void
+;
+entry:
+  call void @non_predicated_uses()
+  %0 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1201, align 1
+  br i1 %0, label %gfx1201, label %not.gfx1201
+
+gfx1201:
+  call void @remove_on_906()
+  br label %end
+
+not.gfx1201:
+  %1 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx906, align 1
+  br i1 %1, label %gfx906, label %end
+
+gfx906:
+  call void @remove_on_1201()
+  br label %end
+
+end:
+  ret void
+}

>From 435ce05571bc599eb493cf9508b6c320b69f2fa5 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 4 Jul 2025 19:30:34 +0100
Subject: [PATCH 35/69] Fix formatting, tweak use count.

---
 .../Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
index 40ab71b609bc8..d83c305fb0404 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -134,8 +134,9 @@ static inline SmallVector<Function *> collectUsedFunctions(Module &M) {
       continue;
     if (!F.hasInternalLinkage() && !F.hasPrivateLinkage())
       continue;
-    if (F.hasNUndroppableUsesOrMore(1))
-      Ret.push_back(&F);
+    if (F.hasNUndroppableUses(0))
+      continue;
+    Ret.push_back(&F);
   }
 
   return Ret;
@@ -147,8 +148,12 @@ static inline void removeUnreachable(const Container0 &Predicates,
                                      const Container2 &UnreachableFns) {
   for_each(Predicates, [](auto &&P) { P->eraseFromParent(); });
   for_each(PredicatedFns, [](auto &&F) { removeUnreachableBlocks(*F); });
-  for_each(UnreachableFns,
-           [](auto &&F) { if (F->getNumUses() == 0) F->eraseFromParent(); });
+  for_each(UnreachableFns, [](auto &&F) {
+    if (!F->hasNUndroppableUses(0))
+      return;
+    F->dropDroppableUses();
+    F->eraseFromParent();
+  });
 }
 
 PreservedAnalyses

>From 2c2f78bd3f189a2a93b19160cb974a50279326a2 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 4 Jul 2025 19:34:52 +0100
Subject: [PATCH 36/69] Fix formatting, again.

---
 llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
index d83c305fb0404..fc0d3e378044d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -142,7 +142,7 @@ static inline SmallVector<Function *> collectUsedFunctions(Module &M) {
   return Ret;
 }
 
-template<typename Container0, typename Container1, typename Container2>
+template <typename Container0, typename Container1, typename Container2>
 static inline void removeUnreachable(const Container0 &Predicates,
                                      const Container1 &PredicatedFns,
                                      const Container2 &UnreachableFns) {

>From 81778760b4022296ca9387a034067830317387e1 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Sat, 12 Jul 2025 02:59:50 +0100
Subject: [PATCH 37/69] Add warnings around unguarded builtin usage, suggesting
 `__builtin_amdgcn_is_invocable` as the solution.

---
 clang/include/clang/Basic/DiagnosticGroups.td |   3 +
 .../clang/Basic/DiagnosticSemaKinds.td        |   6 +
 clang/include/clang/Sema/SemaAMDGPU.h         |   6 +
 clang/lib/Sema/SemaAMDGPU.cpp                 | 111 ++++++++++++++++++
 clang/lib/Sema/SemaDecl.cpp                   |   9 +-
 clang/lib/Sema/SemaExpr.cpp                   |   7 ++
 ...amdgpu-is-invocable-guards-builtin-use.hip |  47 ++++++++
 7 files changed, 187 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/SemaHIP/amdgpu-is-invocable-guards-builtin-use.hip

diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 9a7a308600763..bcfc0d99a8bfd 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -1755,3 +1755,6 @@ def ExplicitSpecializationStorageClass : DiagGroup<"explicit-specialization-stor
 
 // A warning for options that enable a feature that is not yet complete
 def ExperimentalOption : DiagGroup<"experimental-option">;
+
+// Warnings about unguarded usages of AMDGPU target-specific constructs
+def UnguardedBuiltinUsageAMDGPU : DiagGroup<"amdgpu-unguarded-builtin-usage">;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 0481503fe8de6..16cb7814d0626 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13485,4 +13485,10 @@ def err_amdgcn_predicate_type_needs_explicit_bool_cast
             "guarding of target dependent code, and thus should be avoided">;
 def note_amdgcn_protected_by_predicate : Note<"jump enters statement controlled"
                                               " by AMDGPU feature predicate">;
+def warn_amdgcn_unguarded_builtin :
+  Warning<"%0 might be unavailable on some AMDGPU targets">,
+  InGroup<UnguardedBuiltinUsageAMDGPU>, DefaultIgnore;
+def note_amdgcn_unguarded_builtin_silence
+  : Note<"enclose %0 in a __builtin_amdgcn_is_invocable check to silence "
+         "this warning">;
 } // end of sema component.
diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h
index f72e1c53d2c92..1a6752d7ec0d6 100644
--- a/clang/include/clang/Sema/SemaAMDGPU.h
+++ b/clang/include/clang/Sema/SemaAMDGPU.h
@@ -24,6 +24,7 @@ class ParsedAttr;
 
 class SemaAMDGPU : public SemaBase {
   llvm::SmallPtrSet<Expr *, 32> ExpandedPredicates;
+  llvm::SmallPtrSet<FunctionDecl *, 32> PotentiallyUnguardedBuiltinUsers;
 
 public:
   SemaAMDGPU(Sema &S);
@@ -73,6 +74,11 @@ class SemaAMDGPU : public SemaBase {
   /// corresponding sequence of instructions.
   Expr *ExpandAMDGPUPredicateBI(CallExpr *CE);
   bool IsPredicate(Expr *E) const;
+  /// Diagnose unguarded usages of AMDGPU builtins and recommend guarding with
+  /// __builtin_amdgcn_is_invocable
+  void AddPotentiallyUnguardedBuiltinUser(FunctionDecl *FD);
+  bool HasPotentiallyUnguardedBuiltinUsage(FunctionDecl *FD) const;
+  void DiagnoseUnguardedBuiltinUsage(FunctionDecl *FD);
 };
 } // namespace clang
 
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 5d381229f63c7..aed734ef94465 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -11,10 +11,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Sema/SemaAMDGPU.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/DynamicRecursiveASTVisitor.h"
+#include "clang/AST/Expr.h"
 #include "clang/Basic/DiagnosticSema.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Sema/Ownership.h"
+#include "clang/Sema/Scope.h"
 #include "clang/Sema/Sema.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include <cstdint>
@@ -444,4 +448,111 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) {
 bool SemaAMDGPU::IsPredicate(Expr *E) const {
   return ExpandedPredicates.contains(E);
 }
+
+void SemaAMDGPU::AddPotentiallyUnguardedBuiltinUser(FunctionDecl *FD) {
+  PotentiallyUnguardedBuiltinUsers.insert(FD);
+}
+
+bool SemaAMDGPU::HasPotentiallyUnguardedBuiltinUsage(FunctionDecl *FD) const {
+  return PotentiallyUnguardedBuiltinUsers.contains(FD);
+}
+
+namespace {
+  /// This class implements -Wamdgpu-unguarded-builtin-usage.
+  ///
+  /// This is done with a traversal of the AST of a function that includes a
+  /// call to a target specific builtin. Whenever we encounter an \c if of the
+  /// form: \c if(__builtin_amdgcn_is_invocable), we consider the then statement
+  /// guarded.
+class DiagnoseUnguardedBuiltins : public DynamicRecursiveASTVisitor {
+  // TODO: this is conservative, and should be extended to:
+  //       - warn on unguarded ASM usage (__builtin_amdgcn_processor_is as the
+  //         guard);
+  //       - build sets of builtins which are invocable from nested
+  //         if (__builtin_amdgcn_is_invocable) calls, rather than assume
+  //         sanity / that the existence of a guard implies its correctness;
+  //       - derive the set of available builtins / valid ASM constraints from
+  //         the target architecture passed to __builtin_amdgcn_processor_is;
+  //       - consider attributes such as target.
+  Sema &SemaRef;
+
+  unsigned Guards;
+public:
+  DiagnoseUnguardedBuiltins(Sema &SemaRef) : SemaRef(SemaRef), Guards(0u) {}
+
+  bool TraverseLambdaExpr(LambdaExpr *LE) override {
+    if (SemaRef.AMDGPU().HasPotentiallyUnguardedBuiltinUsage(LE->getCallOperator()))
+      return true; // We have already handled this.
+    return DynamicRecursiveASTVisitor::TraverseLambdaExpr(LE);
+  }
+
+  bool TraverseStmt(Stmt *S) override {
+    if (!S)
+      return true;
+    return DynamicRecursiveASTVisitor::TraverseStmt(S);
+  }
+
+  void IssueDiagnostics(Stmt *S) { TraverseStmt(S); }
+
+  bool TraverseIfStmt(IfStmt *If) override;
+
+  bool TraverseCaseStmt(CaseStmt *CS) override {
+    return TraverseStmt(CS->getSubStmt());
+  }
+
+  bool VisitCallExpr(CallExpr *CE) override;
+};
+
+inline Expr *FindPredicate(Expr *Cond) {
+  if (auto *CE = dyn_cast<CallExpr>(Cond)) {
+    if (CE->getBuiltinCallee() == AMDGPU::BI__builtin_amdgcn_is_invocable)
+      return Cond;
+  } else if (auto *UO = dyn_cast<UnaryOperator>(Cond)) {
+    return FindPredicate(UO->getSubExpr());
+  } else if (auto *BO = dyn_cast<BinaryOperator>(Cond)) {
+    if ((Cond = FindPredicate(BO->getLHS())))
+      return Cond;
+    return FindPredicate(BO->getRHS());
+  }
+  return nullptr;
+}
+
+bool DiagnoseUnguardedBuiltins::TraverseIfStmt(IfStmt *If) {
+  if (FindPredicate(If->getCond())) {
+    ++Guards;
+    bool Continue = TraverseStmt(If->getThen());
+    --Guards;
+
+    return Continue && TraverseStmt(If->getElse());
+  }
+
+  return DynamicRecursiveASTVisitor::TraverseIfStmt(If);
+}
+
+bool DiagnoseUnguardedBuiltins::VisitCallExpr(CallExpr *CE) {
+    if (Guards)
+      return true;
+
+    unsigned ID = CE->getBuiltinCallee();
+
+    if (!ID)
+      return true;
+    if (!SemaRef.getASTContext().BuiltinInfo.isTSBuiltin(ID))
+      return true;
+    if (ID == AMDGPU::BI__builtin_amdgcn_processor_is ||
+        ID == AMDGPU::BI__builtin_amdgcn_is_invocable)
+      return true;
+
+    SemaRef.Diag(CE->getExprLoc(), diag::warn_amdgcn_unguarded_builtin)
+        << CE->getDirectCallee();
+    SemaRef.Diag(CE->getExprLoc(), diag::note_amdgcn_unguarded_builtin_silence)
+        << CE->getDirectCallee();
+
+    return true;
+  }
+} // Unnamed namespace
+
+void SemaAMDGPU::DiagnoseUnguardedBuiltinUsage(FunctionDecl *FD) {
+  DiagnoseUnguardedBuiltins(SemaRef).IssueDiagnostics(FD->getBody());
+}
 } // namespace clang
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index e405cfc0a5d17..f586f09bce19a 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -45,6 +45,7 @@
 #include "clang/Sema/ParsedTemplate.h"
 #include "clang/Sema/Scope.h"
 #include "clang/Sema/ScopeInfo.h"
+#include "clang/Sema/SemaAMDGPU.h"
 #include "clang/Sema/SemaARM.h"
 #include "clang/Sema/SemaCUDA.h"
 #include "clang/Sema/SemaHLSL.h"
@@ -16602,8 +16603,12 @@ Decl *Sema::ActOnFinishFunctionBody(Decl *dcl, Stmt *Body,
       return nullptr;
     }
 
-    if (Body && FSI->HasPotentialAvailabilityViolations)
-      DiagnoseUnguardedAvailabilityViolations(dcl);
+    if (Body) {
+      if (FSI->HasPotentialAvailabilityViolations)
+        DiagnoseUnguardedAvailabilityViolations(dcl);
+      else if (AMDGPU().HasPotentiallyUnguardedBuiltinUsage(FD))
+        AMDGPU().DiagnoseUnguardedBuiltinUsage(FD);
+    }
 
     assert(!FSI->ObjCShouldCallSuper &&
            "This should only be set for ObjC methods, which should have been "
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 0aa0ccf89d0f9..ef2059d05a8d3 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -6705,6 +6705,13 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc,
 
     FunctionDecl *FDecl = dyn_cast<FunctionDecl>(NDecl);
     if (FDecl && FDecl->getBuiltinID()) {
+      if (Context.BuiltinInfo.isTSBuiltin(FDecl->getBuiltinID())) {
+        const llvm::Triple &Triple = Context.getTargetInfo().getTriple();
+        if (Triple.isSPIRV() && Triple.getVendor() == llvm::Triple::AMD)
+          AMDGPU().AddPotentiallyUnguardedBuiltinUser(cast<FunctionDecl>(
+              getFunctionLevelDeclContext(/*AllowLambda=*/ true)));
+      }
+
       // Rewrite the function decl for this builtin by replacing parameters
       // with no explicit address space with the address space of the arguments
       // in ArgExprs.
diff --git a/clang/test/SemaHIP/amdgpu-is-invocable-guards-builtin-use.hip b/clang/test/SemaHIP/amdgpu-is-invocable-guards-builtin-use.hip
new file mode 100644
index 0000000000000..26544590f7536
--- /dev/null
+++ b/clang/test/SemaHIP/amdgpu-is-invocable-guards-builtin-use.hip
@@ -0,0 +1,47 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: spirv-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wamdgpu-unguarded-builtin-usage %s
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ void g();
+
+__device__ void f(int x, bool b) {
+    const auto lambda = [=] __device__  () {
+        __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}}
+        // expected-note at -1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}}
+
+        if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var))
+            __builtin_amdgcn_s_sleep_var(x);
+    };
+
+    const auto generic_lambda = [] __device__ (auto&& y) {
+        __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}}
+        // expected-note at -1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}}
+
+        if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var))
+            __builtin_amdgcn_s_sleep_var(y);
+    };
+
+    __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}}
+    // expected-note at -1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}}
+
+    // processor_is does not (yet) guard
+    if (__builtin_amdgcn_processor_is("gfx900"))
+        __builtin_amdgcn_s_sleep_var(x); // expected-warning {{'__builtin_amdgcn_s_sleep_var' might be unavailable on some AMDGPU targets}}
+        // expected-note at -1 {{enclose '__builtin_amdgcn_s_sleep_var' in a __builtin_amdgcn_is_invocable check to silence this warning}}
+
+    // Direct guard
+    if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep))
+        __builtin_amdgcn_s_sleep(42);
+
+    // Guarded scope
+    if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) {
+        if (b) {
+            g();
+            while (--x > 42)
+                __builtin_amdgcn_s_sleep_var(x);
+        }
+    }
+}

>From b4decc204a48d1e7d06db5dae4b9704fbac5ab3c Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Sat, 12 Jul 2025 03:10:17 +0100
Subject: [PATCH 38/69] Fix formatting.

---
 clang/lib/Sema/SemaAMDGPU.cpp | 48 ++++++++++++++++++-----------------
 clang/lib/Sema/SemaExpr.cpp   |  2 +-
 2 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index aed734ef94465..af4668e17ed68 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -458,12 +458,12 @@ bool SemaAMDGPU::HasPotentiallyUnguardedBuiltinUsage(FunctionDecl *FD) const {
 }
 
 namespace {
-  /// This class implements -Wamdgpu-unguarded-builtin-usage.
-  ///
-  /// This is done with a traversal of the AST of a function that includes a
-  /// call to a target specific builtin. Whenever we encounter an \c if of the
-  /// form: \c if(__builtin_amdgcn_is_invocable), we consider the then statement
-  /// guarded.
+/// This class implements -Wamdgpu-unguarded-builtin-usage.
+///
+/// This is done with a traversal of the AST of a function that includes a
+/// call to a target specific builtin. Whenever we encounter an \c if of the
+/// form: \c if(__builtin_amdgcn_is_invocable), we consider the then statement
+/// guarded.
 class DiagnoseUnguardedBuiltins : public DynamicRecursiveASTVisitor {
   // TODO: this is conservative, and should be extended to:
   //       - warn on unguarded ASM usage (__builtin_amdgcn_processor_is as the
@@ -477,11 +477,13 @@ class DiagnoseUnguardedBuiltins : public DynamicRecursiveASTVisitor {
   Sema &SemaRef;
 
   unsigned Guards;
+
 public:
   DiagnoseUnguardedBuiltins(Sema &SemaRef) : SemaRef(SemaRef), Guards(0u) {}
 
   bool TraverseLambdaExpr(LambdaExpr *LE) override {
-    if (SemaRef.AMDGPU().HasPotentiallyUnguardedBuiltinUsage(LE->getCallOperator()))
+    if (SemaRef.AMDGPU().HasPotentiallyUnguardedBuiltinUsage(
+            LE->getCallOperator()))
       return true; // We have already handled this.
     return DynamicRecursiveASTVisitor::TraverseLambdaExpr(LE);
   }
@@ -530,26 +532,26 @@ bool DiagnoseUnguardedBuiltins::TraverseIfStmt(IfStmt *If) {
 }
 
 bool DiagnoseUnguardedBuiltins::VisitCallExpr(CallExpr *CE) {
-    if (Guards)
-      return true;
+  if (Guards)
+    return true;
 
-    unsigned ID = CE->getBuiltinCallee();
+  unsigned ID = CE->getBuiltinCallee();
 
-    if (!ID)
-      return true;
-    if (!SemaRef.getASTContext().BuiltinInfo.isTSBuiltin(ID))
-      return true;
-    if (ID == AMDGPU::BI__builtin_amdgcn_processor_is ||
-        ID == AMDGPU::BI__builtin_amdgcn_is_invocable)
-      return true;
+  if (!ID)
+    return true;
+  if (!SemaRef.getASTContext().BuiltinInfo.isTSBuiltin(ID))
+    return true;
+  if (ID == AMDGPU::BI__builtin_amdgcn_processor_is ||
+      ID == AMDGPU::BI__builtin_amdgcn_is_invocable)
+    return true;
 
-    SemaRef.Diag(CE->getExprLoc(), diag::warn_amdgcn_unguarded_builtin)
-        << CE->getDirectCallee();
-    SemaRef.Diag(CE->getExprLoc(), diag::note_amdgcn_unguarded_builtin_silence)
-        << CE->getDirectCallee();
+  SemaRef.Diag(CE->getExprLoc(), diag::warn_amdgcn_unguarded_builtin)
+      << CE->getDirectCallee();
+  SemaRef.Diag(CE->getExprLoc(), diag::note_amdgcn_unguarded_builtin_silence)
+      << CE->getDirectCallee();
 
-    return true;
-  }
+  return true;
+}
 } // Unnamed namespace
 
 void SemaAMDGPU::DiagnoseUnguardedBuiltinUsage(FunctionDecl *FD) {
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index ef2059d05a8d3..0eb95826342ff 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -6709,7 +6709,7 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc,
         const llvm::Triple &Triple = Context.getTargetInfo().getTriple();
         if (Triple.isSPIRV() && Triple.getVendor() == llvm::Triple::AMD)
           AMDGPU().AddPotentiallyUnguardedBuiltinUser(cast<FunctionDecl>(
-              getFunctionLevelDeclContext(/*AllowLambda=*/ true)));
+              getFunctionLevelDeclContext(/*AllowLambda=*/true)));
       }
 
       // Rewrite the function decl for this builtin by replacing parameters

>From 659dc762446e87af8a6019131194360941f33d65 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 14 Jan 2026 03:03:27 +0200
Subject: [PATCH 39/69] Fix build.

---
 clang/include/clang/Serialization/ASTBitCodes.h          | 2 +-
 llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index 5a86d540e5d0b..7d1ec6247bd0d 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -1163,7 +1163,7 @@ enum PredefinedTypeIDs {
 ///
 /// Type IDs for non-predefined types will start at
 /// NUM_PREDEF_TYPE_IDs.
-const unsigned NUM_PREDEF_TYPE_IDS = 514;
+const unsigned NUM_PREDEF_TYPE_IDS = 515;
 
 // Ensure we do not overrun the predefined types we reserved
 // in the enum PredefinedTypeIDs above.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
index fc0d3e378044d..e40b40c556576 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -27,6 +27,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"

>From 2a406fc8b88d45e5b1383e09dfa1f2633c6cbef5 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 14 Jan 2026 20:19:17 +0200
Subject: [PATCH 40/69] Fix tests.

---
 clang/test/CodeGen/amdgpu-builtin-is-invocable.c | 6 +++---
 clang/test/CodeGen/amdgpu-builtin-processor-is.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
index 12f283707308e..5a3395d2e0c55 100644
--- a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
+++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
@@ -42,12 +42,12 @@ void foo() {
         return __builtin_trap();
 }
 //.
-// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
+// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" }
 //.
-// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
+// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" }
 // AMDGCN-GFX1010: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
 //.
-// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" }
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" }
 // AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
 //.
 // AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
diff --git a/clang/test/CodeGen/amdgpu-builtin-processor-is.c b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
index 76dead8ebbe89..4c55160e5ea6d 100644
--- a/clang/test/CodeGen/amdgpu-builtin-processor-is.c
+++ b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
@@ -40,12 +40,12 @@ void foo() {
         return __builtin_trap();
 }
 //.
-// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
+// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" }
 // AMDGCN-GFX900: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
 //.
-// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
+// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" }
 //.
-// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" }
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" }
 // AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
 //.
 // AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}

>From 558fb6062f0bfdb6a1a7f900c2a38337c2521ad9 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Tue, 20 Jan 2026 17:05:48 +0000
Subject: [PATCH 41/69] Adopt TableGen based intrinsic definitions.

---
 clang/include/clang/Basic/BuiltinsAMDGPU.td   | 5 +++++
 clang/utils/TableGen/ClangBuiltinsEmitter.cpp | 1 +
 2 files changed, 6 insertions(+)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index 12ffad305e7c0..e070a2728aaec 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -380,6 +380,11 @@ def __builtin_amdgcn_endpgm : AMDGPUBuiltin<"void()", [NoReturn]>;
 def __builtin_amdgcn_get_fpenv : AMDGPUBuiltin<"uint64_t()">;
 def __builtin_amdgcn_set_fpenv : AMDGPUBuiltin<"void(uint64_t)">;
 
+// These are special FE only builtins intended for forwarding the requirements
+// to the ME.
+def __builtin_amdgcn_processor_is : AMDGPUBuiltin<"__amdgpu_feature_predicate_t(char const *)", [NoThrow, Const, CustomTypeChecking, UnevaluatedArguments]>;
+def __builtin_amdgcn_is_invocable : AMDGPUBuiltin<"__amdgpu_feature_predicate_t()", [NoThrow, Const, CustomTypeChecking, UnevaluatedArguments]>;
+
 //===----------------------------------------------------------------------===//
 
 // Wave Reduction builtins.
diff --git a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
index fb089a811ef92..c19492af5b926 100644
--- a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
+++ b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
@@ -341,6 +341,7 @@ class PrototypeParser {
                                .Case("__fp16", "h")
                                .Case("__hlsl_resource_t", "Qr")
                                .Case("__amdgpu_buffer_rsrc_t", "Qb")
+                               .Case("__amdgpu_feature_predicate_t", "Qc")
                                .Case("__amdgpu_texture_t", "Qt")
                                .Case("__int128_t", "LLLi")
                                .Case("_Float16", "x")

>From 6c96c9fe2742697409a476fcc86b724569ec603f Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Sat, 24 Jan 2026 02:21:58 +0000
Subject: [PATCH 42/69] Start cleaning up vestigial predicate expansion infra.

---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |  9 ---
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |  2 -
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 55 +------------------
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |  1 -
 .../CodeGen/AMDGPU/print-pipeline-passes.ll   |  8 +--
 5 files changed, 3 insertions(+), 72 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index a896c678eff9d..5df11a45b4889 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -454,15 +454,6 @@ extern char &AMDGPUPrintfRuntimeBindingID;
 void initializeAMDGPUResourceUsageAnalysisWrapperPassPass(PassRegistry &);
 extern char &AMDGPUResourceUsageAnalysisID;
 
-struct AMDGPUExpandFeaturePredicatesPass
-    : PassInfoMixin<AMDGPUExpandFeaturePredicatesPass> {
-  const AMDGPUTargetMachine &TM;
-  AMDGPUExpandFeaturePredicatesPass(const AMDGPUTargetMachine &ATM) : TM(ATM) {}
-  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
-
-  static bool isRequired() { return true; }
-};
-
 struct AMDGPUPrintfRuntimeBindingPass
     : PassInfoMixin<AMDGPUPrintfRuntimeBindingPass> {
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index b7b0542d87c04..f464fbf31c754 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -22,8 +22,6 @@ MODULE_ANALYSIS("amdgpu-argument-usage", AMDGPUArgumentUsageAnalysis())
 #ifndef MODULE_PASS
 #define MODULE_PASS(NAME, CREATE_PASS)
 #endif
-MODULE_PASS("amdgpu-expand-feature-predicates",
-            AMDGPUExpandFeaturePredicatesPass(*this))
 MODULE_PASS("amdgpu-always-inline", AMDGPUAlwaysInlinePass())
 MODULE_PASS("amdgpu-export-kernel-runtime-handles", AMDGPUExportKernelRuntimeHandlesPass())
 MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 0639923b6cf68..d2a2f81255344 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -154,10 +154,8 @@ class AMDGPUCodeGenPassBuilder
   void addPostRegAlloc(PassManagerWrapper &PMW) const;
   void addPreEmitPass(PassManagerWrapper &PMWM) const;
   void addPreEmitRegAlloc(PassManagerWrapper &PMW) const;
-  Error addRegAssignmentFast(PassManagerWrapper &PMW) const;
   Error addRegAssignmentOptimized(PassManagerWrapper &PMW) const;
   void addPreRegAlloc(PassManagerWrapper &PMW) const;
-  Error addFastRegAlloc(PassManagerWrapper &PMW) const;
   void addOptimizedRegAlloc(PassManagerWrapper &PMW) const;
   void addPreSched2(PassManagerWrapper &PMW) const;
   void addPostBBSections(PassManagerWrapper &PMW) const;
@@ -817,8 +815,7 @@ static bool mustPreserveGV(const GlobalValue &GV) {
 }
 
 void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
-  if (EnableAMDGPUAliasAnalysis)
-    AAM.registerFunctionAnalysis<AMDGPUAA>();
+  AAM.registerFunctionAnalysis<AMDGPUAA>();
 }
 
 static Expected<ScanOptions>
@@ -859,18 +856,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
 #define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
 #include "llvm/Passes/TargetPassRegistry.inc"
 
-  PB.registerPipelineStartEPCallback(
-      [this](ModulePassManager &PM, OptimizationLevel Level) {
-        PM.addPass(AMDGPUExpandFeaturePredicatesPass(*this));
-      });
-
-  PB.registerFullLinkTimeOptimizationEarlyEPCallback(
-      [this](ModulePassManager &PM, OptimizationLevel) {
-        PM.addPass(AMDGPUExpandFeaturePredicatesPass(*this));
-      });
-
   PB.registerScalarOptimizerLateEPCallback(
-      [this](FunctionPassManager &FPM, OptimizationLevel Level) {
+      [](FunctionPassManager &FPM, OptimizationLevel Level) {
         if (Level == OptimizationLevel::O0)
           return;
 
@@ -2300,8 +2287,6 @@ void AMDGPUCodeGenPassBuilder::addPreRewrite(PassManagerWrapper &PMW) const {
   if (EnableRegReassign) {
     addMachineFunctionPass(GCNNSAReassignPass(), PMW);
   }
-
-  addMachineFunctionPass(AMDGPURewriteAGPRCopyMFMAPass(), PMW);
 }
 
 void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization(
@@ -2323,42 +2308,6 @@ void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization(
   addMachineFunctionPass(SIShrinkInstructionsPass(), PMW);
 }
 
-Error AMDGPUCodeGenPassBuilder::addFastRegAlloc(PassManagerWrapper &PMW) const {
-  insertPass<PHIEliminationPass>(SILowerControlFlowPass());
-
-  insertPass<TwoAddressInstructionPass>(SIWholeQuadModePass());
-
-  return Base::addFastRegAlloc(PMW);
-}
-
-Error AMDGPUCodeGenPassBuilder::addRegAssignmentFast(
-    PassManagerWrapper &PMW) const {
-  // TODO: handle default regalloc override error (with regalloc-npm)
-
-  addMachineFunctionPass(GCNPreRALongBranchRegPass(), PMW);
-
-  addMachineFunctionPass(RegAllocFastPass({onlyAllocateSGPRs, "sgpr", false}),
-                         PMW);
-
-  // Equivalent of PEI for SGPRs.
-  addMachineFunctionPass(SILowerSGPRSpillsPass(), PMW);
-
-  // To Allocate wwm registers used in whole quad mode operations (for shaders).
-  addMachineFunctionPass(SIPreAllocateWWMRegsPass(), PMW);
-
-  // For allocating other wwm register operands.
-  addMachineFunctionPass(RegAllocFastPass({onlyAllocateWWMRegs, "wwm", false}),
-                         PMW);
-
-  addMachineFunctionPass(SILowerWWMCopiesPass(), PMW);
-  addMachineFunctionPass(AMDGPUReserveWWMRegsPass(), PMW);
-
-  // For allocating per-thread VGPRs.
-  addMachineFunctionPass(RegAllocFastPass({onlyAllocateVGPRs, "vgpr"}), PMW);
-
-  return Error::success();
-}
-
 void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
     PassManagerWrapper &PMW) const {
   if (EnableDCEInRA)
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 5e4d24c21a7c7..782cbfa76e6e9 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -54,7 +54,6 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUCodeGenPrepare.cpp
   AMDGPUCombinerHelper.cpp
   AMDGPUCtorDtorLowering.cpp
-  AMDGPUExpandFeaturePredicates.cpp
   AMDGPUExportClustering.cpp
   AMDGPUExportKernelRuntimeHandles.cpp
   AMDGPUFrameLowering.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll b/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll
index 93f43b274e28d..b1fc76f457ece 100644
--- a/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll
+++ b/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll
@@ -2,22 +2,16 @@
 ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto<O1>" -print-pipeline-passes %s -o - | FileCheck %s
 ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto<O2>" -print-pipeline-passes %s -o - | FileCheck %s
 ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto<O3>" -print-pipeline-passes %s -o - | FileCheck %s
-; RUN: opt -mtriple=amdgcn--amdhsa -S -O0 -print-pipeline-passes %s -o - | FileCheck --check-prefix=O0 %s
-; RUN: opt -mtriple=amdgcn--amdhsa -S -O1 -print-pipeline-passes %s -o - | FileCheck %s
-; RUN: opt -mtriple=amdgcn--amdhsa -S -O2 -print-pipeline-passes %s -o - | FileCheck %s
-; RUN: opt -mtriple=amdgcn--amdhsa -S -O3 -print-pipeline-passes %s -o - | FileCheck %s
 
 ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link<O0>" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s
 ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link<O1>" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s
 ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link<O2>" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s
 ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link<O3>" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s
 
-; CHECK: amdgpu-expand-feature-predicates
+
 ; CHECK: amdgpu-attributor
-; O0: amdgpu-expand-feature-predicates
 ; O0-NOT: amdgpu-attributor
 
-; PRE: amdgpu-expand-feature-predicates
 ; PRE-NOT: internalize
 ; PRE-NOT: amdgpu-attributor
 ; PRE-NOT: printfToRuntime

>From 6d23d0a50d4d0ae95aa059cb5a1725f1469ba5c1 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Sat, 24 Jan 2026 02:35:45 +0000
Subject: [PATCH 43/69] Predicates don't need noise when specialisation
 constants are available.

---
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp   | 20 +++---
 .../CodeGen/amdgpu-builtin-is-invocable.c     |  7 +-
 .../CodeGen/amdgpu-builtin-processor-is.c     |  7 +-
 .../select-accelerator-code-pass-ordering.cpp |  2 +-
 llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp | 68 ++++++++++++++++++-
 5 files changed, 80 insertions(+), 24 deletions(-)

diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 64baa34211218..c40155497f443 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -15,6 +15,7 @@
 #include "clang/Basic/DiagnosticFrontend.h"
 #include "clang/Basic/SyncScope.h"
 #include "clang/Basic/TargetBuiltins.h"
+#include "TargetInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -367,15 +368,12 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
 }
 
 static Value *GetOrInsertAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) {
-  auto PTy = IntegerType::getInt1Ty(CGF.getLLVMContext());
-
-  auto *P = cast<GlobalVariable>(
-      CGF.CGM.getModule().getOrInsertGlobal(Name.str(), PTy));
-  P->setConstant(true);
-  P->setExternallyInitialized(true);
-
-  return CGF.Builder.CreateLoad(
-      RawAddress(P, PTy, CharUnits::One(), KnownNonNull));
+  Function *SpecConstFn = CGF.getSpecConstantFunction(CGF.getContext().BoolTy);
+  llvm::Type *SpecIdTy = SpecConstFn->getArg(0)->getType();
+  Constant *SpecId = ConstantInt::getAllOnesValue(SpecIdTy);
+  return CGF.Builder.CreateCall(
+      SpecConstFn, {SpecId, ConstantInt::getFalse(CGF.getLLVMContext())},
+      Name + ".");
 }
 
 static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
@@ -913,7 +911,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
            "__builtin_amdgcn_processor_is should never reach CodeGen for "
            "concrete targets!");
     StringRef Proc = cast<clang::StringLiteral>(E->getArg(0))->getString();
-    return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.is." + Proc);
+    return GetOrInsertAMDGPUPredicate(*this, "is." + Proc);
   }
   case AMDGPU::BI__builtin_amdgcn_is_invocable: {
     assert(CGM.getTriple().isSPIRV() &&
@@ -923,7 +921,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
         cast<DeclRefExpr>(E->getArg(0))->getReferencedDeclOfCallee());
     StringRef RF =
         getContext().BuiltinInfo.getRequiredFeatures(FD->getBuiltinID());
-    return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.has." + RF);
+    return GetOrInsertAMDGPUPredicate(*this, "has." + RF);
   }
   case AMDGPU::BI__builtin_amdgcn_read_exec:
     return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
diff --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
index 5a3395d2e0c55..e51c391a46764 100644
--- a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
+++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
@@ -11,9 +11,6 @@
 //       externally initialised bool global, and load from it to provide the
 //       condition to a br (abstract target)
 
-//.
-// AMDGCNSPIRV: @llvm.amdgcn.has.gfx10-insts = external addrspace(1) externally_initialized constant i1
-//.
 // AMDGCN-GFX900-LABEL: define dso_local void @foo(
 // AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] {
 // AMDGCN-GFX900-NEXT:  [[ENTRY:.*:]]
@@ -28,8 +25,8 @@
 // AMDGCNSPIRV-LABEL: define spir_func void @foo(
 // AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] {
 // AMDGCNSPIRV-NEXT:  [[ENTRY:.*:]]
-// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx10-insts, align 1
-// AMDGCNSPIRV-NEXT:    [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false
+// AMDGCNSPIRV-NEXT:    [[LLVM_AMDGCN_HAS_GFX10_INSTS:%.*]] = call addrspace(4) i1 @_Z20__spirv_SpecConstant(i32 -1, i1 false)
+// AMDGCNSPIRV-NEXT:    [[TOBOOL:%.*]] = icmp ne i1 [[LLVM_AMDGCN_HAS_GFX10_INSTS]], false
 // AMDGCNSPIRV-NEXT:    br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
 // AMDGCNSPIRV:       [[IF_THEN]]:
 // AMDGCNSPIRV-NEXT:    call addrspace(4) void @llvm.trap()
diff --git a/clang/test/CodeGen/amdgpu-builtin-processor-is.c b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
index 4c55160e5ea6d..028f97f41181f 100644
--- a/clang/test/CodeGen/amdgpu-builtin-processor-is.c
+++ b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
@@ -9,9 +9,6 @@
 //    2) for gfx1010 we emit an empty kernel (concrete target, does not match)
 //    3) for AMDGCNSPIRV we emit llvm.amdgcn.is.gfx900 as a bool global, and
 //       load from it to provide the condition a br (abstract target)
-//.
-// AMDGCNSPIRV: @llvm.amdgcn.is.gfx900 = external addrspace(1) externally_initialized constant i1
-//.
 // AMDGCN-GFX900-LABEL: define dso_local void @foo(
 // AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] {
 // AMDGCN-GFX900-NEXT:  [[ENTRY:.*:]]
@@ -26,8 +23,8 @@
 // AMDGCNSPIRV-LABEL: define spir_func void @foo(
 // AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] {
 // AMDGCNSPIRV-NEXT:  [[ENTRY:.*:]]
-// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx900, align 1
-// AMDGCNSPIRV-NEXT:    [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false
+// AMDGCNSPIRV-NEXT:    [[LLVM_AMDGCN_IS_GFX900:%.*]] = call addrspace(4) i1 @_Z20__spirv_SpecConstant(i32 -1, i1 false)
+// AMDGCNSPIRV-NEXT:    [[TOBOOL:%.*]] = icmp ne i1 [[LLVM_AMDGCN_IS_GFX900]], false
 // AMDGCNSPIRV-NEXT:    br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
 // AMDGCNSPIRV:       [[IF_THEN]]:
 // AMDGCNSPIRV-NEXT:    call addrspace(4) void @llvm.trap()
diff --git a/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp b/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp
index cffd3c7a5fb1f..44557284fc581 100644
--- a/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp
+++ b/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp
@@ -7,7 +7,7 @@
 // HIPSTDPAR-PRE: Running pass: EntryExitInstrumenterPass
 // HIPSTDPAR-PRE-NEXT: Running pass: EntryExitInstrumenterPass
 // HIPSTDPAR-PRE-NOT: Running pass: HipStdParAcceleratorCodeSelectionPass
-// HIPSTDPAR-PRE-NEXT: Running pass: AMDGPUExpandFeaturePredicatesPass
+// HIPSTDPAR-PRE-NEXT: Running pass: AlwaysInlinerPass
 
 // Ensure Pass HipStdParAcceleratorCodeSelectionPass is invoked in PostLink.
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -mllvm -amdgpu-enable-hipstdpar -fcuda-is-device -fdebug-pass-manager -emit-llvm \
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp
index c3cb7c5f9126d..bf97a1f571abf 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp
@@ -6,8 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// The pass transforms IR globals that cannot be trivially mapped to SPIRV
-// into something that is trival to lower.
+// The pass:
+//   - transforms IR globals that cannot be trivially mapped to SPIRV into
+//     something that is trivial to lower;
+//   - for AMDGCN flavoured SPIRV, it assigns unique IDs to the specialisation
+//     constants associated with feature predicates, which were inserted by the
+//     FE when expanding calls to __builtin_amdgcn_processor_is or
+//     __builtin_amdgcn_is_invocable
 //
 //===----------------------------------------------------------------------===//
 
@@ -15,9 +20,14 @@
 #include "SPIRVUtils.h"
 
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Debug.h"
 
+#include <climits>
+#include <string>
+
 #define DEBUG_TYPE "spirv-prepare-globals"
 
 using namespace llvm;
@@ -71,6 +81,50 @@ bool tryReplaceAliasWithAliasee(GlobalAlias &GA) {
   return true;
 }
 
+bool tryAssignPredicateSpecConstIDs(Module &M, Function *F) {
+  StringMap<unsigned> IDs;
+
+  // Replace placeholder Specialisation Constant IDs with unique IDs associated
+  // with the predicate being evaluated, which is encoded in the call name.
+  for (auto &&U : F->users()) {
+    if (!isa<CallInst>(U))
+      continue;
+
+    auto *CI = cast<CallInst>(U);
+
+    if (!isa<ConstantInt>(CI->getArgOperand(0)))
+      continue;
+
+    unsigned ID = cast<ConstantInt>(CI->getArgOperand(0))->getZExtValue();
+
+    if (ID != UINT32_MAX)
+      continue;
+
+    StringRef Name = CI->getName().substr(0, CI->getName().rfind('.'));
+    ID = IDs.try_emplace(Name, IDs.size()).first->second;
+
+    CI->setArgOperand(0, ConstantInt::get(CI->getArgOperand(0)->getType(), ID));
+  }
+
+  if (IDs.empty())
+    return false;
+
+  // Store the predicate -> ID mapping as a fixed format string
+  // (predicate ID\0...), for later use during SPIR-V consumption.
+  std::string Tmp;
+  for (auto &&[Predicate, SpecID] : IDs)
+    Tmp.append(Predicate).append(" ").append(utostr(SpecID)).push_back('\0');
+
+  Constant *PredSpecIDStr =
+      ConstantDataArray::getString(M.getContext(), Tmp, false);
+
+  new GlobalVariable(M, PredSpecIDStr->getType(), true,
+                     GlobalVariable::LinkageTypes::PrivateLinkage,
+                     PredSpecIDStr, "llvm.amdgcn.feature.predicate.ids");
+
+  return true;
+}
+
 bool SPIRVPrepareGlobals::runOnModule(Module &M) {
   bool Changed = false;
 
@@ -78,6 +133,15 @@ bool SPIRVPrepareGlobals::runOnModule(Module &M) {
     Changed |= tryReplaceAliasWithAliasee(GA);
   }
 
+  if (M.getTargetTriple().getVendor() != Triple::AMD)
+    return Changed;
+
+  // TODO: Currently the symbol can only be inserted via feature predicate use,
+  //       but in the future this will need revisiting if we start making more
+  //       liberal use of the intrinsic.
+  if (Function *F = M.getFunction("_Z20__spirv_SpecConstantib"))
+    Changed |= tryAssignPredicateSpecConstIDs(M, F);
+
   return Changed;
 }
 char SPIRVPrepareGlobals::ID = 0;

>From 5710e31aa3f2bf4d6a8c2f0f0f28b11f39fc8746 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Sat, 24 Jan 2026 02:37:56 +0000
Subject: [PATCH 44/69] Remove noise.

---
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index d2a2f81255344..80e722056a854 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -154,8 +154,10 @@ class AMDGPUCodeGenPassBuilder
   void addPostRegAlloc(PassManagerWrapper &PMW) const;
   void addPreEmitPass(PassManagerWrapper &PMWM) const;
   void addPreEmitRegAlloc(PassManagerWrapper &PMW) const;
+  Error addRegAssignmentFast(PassManagerWrapper &PMW) const;
   Error addRegAssignmentOptimized(PassManagerWrapper &PMW) const;
   void addPreRegAlloc(PassManagerWrapper &PMW) const;
+  Error addFastRegAlloc(PassManagerWrapper &PMW) const;
   void addOptimizedRegAlloc(PassManagerWrapper &PMW) const;
   void addPreSched2(PassManagerWrapper &PMW) const;
   void addPostBBSections(PassManagerWrapper &PMW) const;
@@ -815,7 +817,8 @@ static bool mustPreserveGV(const GlobalValue &GV) {
 }
 
 void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
-  AAM.registerFunctionAnalysis<AMDGPUAA>();
+  if (EnableAMDGPUAliasAnalysis)
+    AAM.registerFunctionAnalysis<AMDGPUAA>();
 }
 
 static Expected<ScanOptions>

>From 776c98b96cc4ce5ce2d8a003d3dce4016f5ef55d Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Sat, 24 Jan 2026 02:38:36 +0000
Subject: [PATCH 45/69] Remove vestigial pass implementation.

---
 .../AMDGPU/AMDGPUExpandFeaturePredicates.cpp  | 195 ------------------
 1 file changed, 195 deletions(-)
 delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
deleted file mode 100644
index e40b40c556576..0000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
+++ /dev/null
@@ -1,195 +0,0 @@
-//===- AMDGPUExpandFeaturePredicates.cpp - Feature Predicate Expander Pass ===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// This file implements a pass that deals with expanding AMDGCN generic feature
-// predicates into target specific quantities / sequences. In this context, a
-// generic feature predicate is an implementation detail global variable that
-// is inserted by the FE as a consequence of using either the __builtin_cpu_is
-// or the __builtin_amdgcn_is_invocable special builtins on an abstract target
-// (AMDGCNSPIRV). These placeholder globals are used to guide target specific
-// lowering, once the concrete target is known, by way of constant folding their
-// value all the way into a terminator (i.e. a controlled block) or into a no
-// live use scenario. We hard fail if the folding fails, to avoid obtuse BE
-// errors or opaque run time errors. This pass should run as early as possible /
-// immediately after Clang CodeGen, so that the optimisation pipeline and the BE
-// operate with concrete target data.
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
-#include "GCNSubtarget.h"
-
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-#include <string>
-#include <utility>
-
-using namespace llvm;
-
-namespace {
-template <typename C> void collectUsers(Value *V, C &Container) {
-  assert(V && "Must pass an existing Value!");
-
-  for (auto &&U : V->users())
-    if (auto *I = dyn_cast<Instruction>(U))
-      Container.insert(Container.end(), I);
-}
-
-inline void setPredicate(const GCNSubtarget &ST, GlobalVariable *P) {
-  const bool IsFeature = P->getName().starts_with("llvm.amdgcn.has");
-  const size_t Offset =
-      IsFeature ? sizeof("llvm.amdgcn.has") : sizeof("llvm.amdgcn.is");
-
-  std::string PV = P->getName().substr(Offset).str();
-  if (IsFeature) {
-    size_t Dx = PV.find(',');
-    while (Dx != std::string::npos) {
-      PV.insert(++Dx, {'+'});
-
-      Dx = PV.find(',', Dx);
-    }
-    PV.insert(PV.cbegin(), '+');
-  }
-
-  Type *PTy = P->getValueType();
-  P->setLinkage(GlobalValue::PrivateLinkage);
-  P->setExternallyInitialized(false);
-
-  if (IsFeature)
-    P->setInitializer(ConstantInt::getBool(PTy, ST.checkFeatures(PV)));
-  else
-    P->setInitializer(ConstantInt::getBool(PTy, PV == ST.getCPU()));
-}
-
-std::pair<PreservedAnalyses, bool>
-unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) {
-  std::string W;
-  raw_string_ostream OS(W);
-
-  OS << "Impossible to constant fold feature predicate: " << *P << " used by "
-     << *NoFold << ", please simplify.\n";
-
-  Caller->getContext().diagnose(
-      DiagnosticInfoUnsupported(*Caller, W, NoFold->getDebugLoc(), DS_Error));
-
-  return {PreservedAnalyses::none(), false};
-}
-
-std::pair<PreservedAnalyses, bool>
-handlePredicate(const GCNSubtarget &ST, FunctionAnalysisManager &FAM,
-                SmallPtrSet<Function *, 32> &Predicated, GlobalVariable *P) {
-  setPredicate(ST, P);
-
-  SmallPtrSet<Instruction *, 32> ToFold;
-  collectUsers(P, ToFold);
-
-  if (ToFold.empty())
-    return {PreservedAnalyses::all(), true};
-
-  do {
-    Instruction *I = *ToFold.begin();
-    ToFold.erase(I);
-
-    I->dropDroppableUses();
-
-    Function *F = I->getParent()->getParent();
-    auto &DT = FAM.getResult<DominatorTreeAnalysis>(*F);
-    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
-
-    if (auto *C = ConstantFoldInstruction(I, P->getDataLayout())) {
-      collectUsers(I, ToFold);
-      I->replaceAllUsesWith(C);
-      I->eraseFromParent();
-      continue;
-    } else if (I->isTerminator() &&
-               ConstantFoldTerminator(I->getParent(), true, nullptr, &DTU)) {
-      Predicated.insert(F);
-
-      continue;
-    }
-
-    return unfoldableFound(I->getParent()->getParent(), P, I);
-  } while (!ToFold.empty());
-
-  return {PreservedAnalyses::none(), true};
-}
-} // Unnamed namespace.
-
-static inline SmallVector<Function *> collectUsedFunctions(Module &M) {
-  SmallVector<Function *> Ret;
-  for (auto &&F : M) {
-    if (F.isIntrinsic() || F.isDeclaration())
-      continue;
-    if (!F.hasInternalLinkage() && !F.hasPrivateLinkage())
-      continue;
-    if (F.hasNUndroppableUses(0))
-      continue;
-    Ret.push_back(&F);
-  }
-
-  return Ret;
-}
-
-template <typename Container0, typename Container1, typename Container2>
-static inline void removeUnreachable(const Container0 &Predicates,
-                                     const Container1 &PredicatedFns,
-                                     const Container2 &UnreachableFns) {
-  for_each(Predicates, [](auto &&P) { P->eraseFromParent(); });
-  for_each(PredicatedFns, [](auto &&F) { removeUnreachableBlocks(*F); });
-  for_each(UnreachableFns, [](auto &&F) {
-    if (!F->hasNUndroppableUses(0))
-      return;
-    F->dropDroppableUses();
-    F->eraseFromParent();
-  });
-}
-
-PreservedAnalyses
-AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) {
-  if (M.empty())
-    return PreservedAnalyses::all();
-
-  SmallVector<GlobalVariable *> Predicates;
-  for (auto &&G : M.globals()) {
-    if (!G.isDeclaration() || !G.hasName())
-      continue;
-    if (G.getName().starts_with("llvm.amdgcn."))
-      Predicates.push_back(&G);
-  }
-
-  if (Predicates.empty())
-    return PreservedAnalyses::all();
-
-  const auto &ST = TM.getSubtarget<GCNSubtarget>(
-      *find_if(M, [](auto &&F) { return !F.isIntrinsic(); }));
-
-  auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
-  SmallPtrSet<Function *, 32> Predicated;
-  SmallVector<Function *> MaybeUnreachable = collectUsedFunctions(M);
-  auto Ret = PreservedAnalyses::all();
-  for (auto &&P : Predicates) {
-    auto R = handlePredicate(ST, FAM, Predicated, P);
-
-    if (!R.second)
-      break;
-
-    Ret.intersect(R.first);
-  }
-
-  removeUnreachable(Predicates, Predicated, MaybeUnreachable);
-
-  return Ret;
-}

>From 915306ff9f08b20dc137bee4985f4692c3cdb019 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Sat, 24 Jan 2026 02:43:48 +0000
Subject: [PATCH 46/69] Remove more noise.

---
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 38 +++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 80e722056a854..f8a83e72bc3ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -2290,6 +2290,8 @@ void AMDGPUCodeGenPassBuilder::addPreRewrite(PassManagerWrapper &PMW) const {
   if (EnableRegReassign) {
     addMachineFunctionPass(GCNNSAReassignPass(), PMW);
   }
+
+  addMachineFunctionPass(AMDGPURewriteAGPRCopyMFMAPass(), PMW);
 }
 
 void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization(
@@ -2311,6 +2313,42 @@ void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization(
   addMachineFunctionPass(SIShrinkInstructionsPass(), PMW);
 }
 
+Error AMDGPUCodeGenPassBuilder::addFastRegAlloc(PassManagerWrapper &PMW) const {
+  insertPass<PHIEliminationPass>(SILowerControlFlowPass());
+
+  insertPass<TwoAddressInstructionPass>(SIWholeQuadModePass());
+
+  return Base::addFastRegAlloc(PMW);
+}
+
+Error AMDGPUCodeGenPassBuilder::addRegAssignmentFast(
+    PassManagerWrapper &PMW) const {
+  // TODO: handle default regalloc override error (with regalloc-npm)
+
+  addMachineFunctionPass(GCNPreRALongBranchRegPass(), PMW);
+
+  addMachineFunctionPass(RegAllocFastPass({onlyAllocateSGPRs, "sgpr", false}),
+                         PMW);
+
+  // Equivalent of PEI for SGPRs.
+  addMachineFunctionPass(SILowerSGPRSpillsPass(), PMW);
+
+  // To allocate wwm registers used in whole quad mode operations (for shaders).
+  addMachineFunctionPass(SIPreAllocateWWMRegsPass(), PMW);
+
+  // For allocating other wwm register operands.
+  addMachineFunctionPass(RegAllocFastPass({onlyAllocateWWMRegs, "wwm", false}),
+                         PMW);
+
+  addMachineFunctionPass(SILowerWWMCopiesPass(), PMW);
+  addMachineFunctionPass(AMDGPUReserveWWMRegsPass(), PMW);
+
+  // For allocating per-thread VGPRs.
+  addMachineFunctionPass(RegAllocFastPass({onlyAllocateVGPRs, "vgpr"}), PMW);
+
+  return Error::success();
+}
+
 void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
     PassManagerWrapper &PMW) const {
   if (EnableDCEInRA)

>From 5ac2a9e74e8976762eb1e6cc39b506021ddb6a30 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Sat, 24 Jan 2026 02:45:42 +0000
Subject: [PATCH 47/69] Remove no longer useful tests.

---
 ...predicates-remove-unreachable-functions.ll | 104 ------------------
 ...pu-expand-feature-predicates-unfoldable.ll |  28 -----
 2 files changed, 132 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-remove-unreachable-functions.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-remove-unreachable-functions.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-remove-unreachable-functions.ll
deleted file mode 100644
index c5089de333849..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-remove-unreachable-functions.ll
+++ /dev/null
@@ -1,104 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --version 5
-; REQUIRES: amdgpu-registered-target
-
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX906 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201 %s
-
- at llvm.amdgcn.is.gfx906 = external addrspace(1) externally_initialized constant i1
- at llvm.amdgcn.is.gfx1201 = external addrspace(1) externally_initialized constant i1
-
-define external void @extern_linkage() {
-; GFX906-LABEL: define void @extern_linkage(
-; GFX906-SAME: ) #[[ATTR0:[0-9]+]] {
-; GFX906-NEXT:  [[ENTRY:.*:]]
-; GFX906-NEXT:    ret void
-;
-; GFX1201-LABEL: define void @extern_linkage(
-; GFX1201-SAME: ) #[[ATTR0:[0-9]+]] {
-; GFX1201-NEXT:  [[ENTRY:.*:]]
-; GFX1201-NEXT:    ret void
-;
-entry:
-  ret void
-}
-
-define private void @non_predicated_uses() {
-; GFX906-LABEL: define private void @non_predicated_uses(
-; GFX906-SAME: ) #[[ATTR0]] {
-; GFX906-NEXT:  [[ENTRY:.*:]]
-; GFX906-NEXT:    ret void
-;
-; GFX1201-LABEL: define private void @non_predicated_uses(
-; GFX1201-SAME: ) #[[ATTR0]] {
-; GFX1201-NEXT:  [[ENTRY:.*:]]
-; GFX1201-NEXT:    ret void
-;
-entry:
-  ret void
-}
-
-define internal void @remove_on_906() {
-; GFX1201-LABEL: define internal void @remove_on_906(
-; GFX1201-SAME: ) #[[ATTR0]] {
-; GFX1201-NEXT:  [[ENTRY:.*:]]
-; GFX1201-NEXT:    ret void
-;
-entry:
-  ret void
-}
-
-define internal void @remove_on_1201() {
-; GFX906-LABEL: define internal void @remove_on_1201(
-; GFX906-SAME: ) #[[ATTR0]] {
-; GFX906-NEXT:  [[ENTRY:.*:]]
-; GFX906-NEXT:    ret void
-;
-entry:
-  ret void
-}
-
-define void @foo() {
-; GFX906-LABEL: define void @foo(
-; GFX906-SAME: ) #[[ATTR0]] {
-; GFX906-NEXT:  [[ENTRY:.*:]]
-; GFX906-NEXT:    call void @non_predicated_uses()
-; GFX906-NEXT:    br label %[[NOT_GFX1201:.*]]
-; GFX906:       [[NOT_GFX1201]]:
-; GFX906-NEXT:    br label %[[GFX906:.*]]
-; GFX906:       [[GFX906]]:
-; GFX906-NEXT:    call void @remove_on_1201()
-; GFX906-NEXT:    br label %[[END:.*]]
-; GFX906:       [[END]]:
-; GFX906-NEXT:    ret void
-;
-; GFX1201-LABEL: define void @foo(
-; GFX1201-SAME: ) #[[ATTR0]] {
-; GFX1201-NEXT:  [[ENTRY:.*:]]
-; GFX1201-NEXT:    call void @non_predicated_uses()
-; GFX1201-NEXT:    br label %[[GFX1201:.*]]
-; GFX1201:       [[GFX1201]]:
-; GFX1201-NEXT:    call void @remove_on_906()
-; GFX1201-NEXT:    br label %[[END:.*]]
-; GFX1201:       [[END]]:
-; GFX1201-NEXT:    ret void
-;
-entry:
-  call void @non_predicated_uses()
-  %0 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1201, align 1
-  br i1 %0, label %gfx1201, label %not.gfx1201
-
-gfx1201:
-  call void @remove_on_906()
-  br label %end
-
-not.gfx1201:
-  %1 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx906, align 1
-  br i1 %1, label %gfx906, label %end
-
-gfx906:
-  call void @remove_on_1201()
-  br label %end
-
-end:
-  ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll
deleted file mode 100644
index bfc35d8c76e37..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll
+++ /dev/null
@@ -1,28 +0,0 @@
-; REQUIRES: amdgpu-registered-target
-
-; RUN: not opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' < %s 2>&1 | FileCheck %s
-
-; CHECK: error:{{.*}}in function kernel void (ptr addrspace(1), i32, ptr addrspace(1)): Impossible to constant fold feature predicate: @llvm.amdgcn.is.gfx803 = private addrspace(1) constant i1 false used by   %call = call i1 %1(i1 zeroext false), please simplify.
-
- at llvm.amdgcn.is.gfx803 = external addrspace(1) externally_initialized constant i1
-
-declare void @llvm.amdgcn.s.sleep(i32 immarg) #1
-
-define amdgpu_kernel void @kernel(ptr addrspace(1) readnone captures(none) %p.coerce, i32 %x, ptr addrspace(1) %pfn.coerce) {
-entry:
-  %0 = ptrtoint ptr addrspace(1) %pfn.coerce to i64
-  %1 = inttoptr i64 %0 to ptr
-  %2 = ptrtoint ptr addrspace(1) %pfn.coerce to i64
-  %3 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx803, align 1
-  %call = call i1 %1(i1 zeroext %3)
-  br i1 %call, label %if.gfx803, label %if.end
-
-if.gfx803:
-  call void @llvm.amdgcn.s.sleep(i32 0)
-  br label %if.end
-
-if.end:
-  ret void
-}
-
-attributes #1 = { nocallback nofree nosync nounwind willreturn }

>From 6eb319908a6cb5426019209eb73398da37c03ff4 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Sat, 24 Jan 2026 03:01:43 +0000
Subject: [PATCH 48/69] Update tests.

---
 clang/test/CodeGen/amdgpu-builtin-is-invocable.c | 14 +++++++-------
 clang/test/CodeGen/amdgpu-builtin-processor-is.c | 13 +++++++------
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
index e51c391a46764..b46e0b83970fa 100644
--- a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
+++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
@@ -7,9 +7,9 @@
 // things happens:
 //    1) for gfx900 we emit an empty kernel (concrete target, lacks feature)
 //    2) for gfx1010 we emit a call to trap (concrete target, has feature)
-//    3) for AMDGCNSPIRV we emit llvm.amdgcn.has.gfx10-insts as a constant
-//       externally initialised bool global, and load from it to provide the
-//       condition to a br (abstract target)
+//    3) for AMDGCNSPIRV we emit a boolean specialisation constant, via a call
+//       to __spirv_SpecConstant, with the id of UINT32_MAX, and the boolean
+//       value of false, which will yield an OpSpecConstantFalse in SPIR-V
 
 // AMDGCN-GFX900-LABEL: define dso_local void @foo(
 // AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] {
@@ -25,8 +25,8 @@
 // AMDGCNSPIRV-LABEL: define spir_func void @foo(
 // AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] {
 // AMDGCNSPIRV-NEXT:  [[ENTRY:.*:]]
-// AMDGCNSPIRV-NEXT:    [[LLVM_AMDGCN_HAS_GFX10_INSTS:%.*]] = call addrspace(4) i1 @_Z20__spirv_SpecConstant(i32 -1, i1 false)
-// AMDGCNSPIRV-NEXT:    [[TOBOOL:%.*]] = icmp ne i1 [[LLVM_AMDGCN_HAS_GFX10_INSTS]], false
+// AMDGCNSPIRV-NEXT:    [[HAS_GFX10_INSTS_:%.*]] = call addrspace(4) i1 @_Z20__spirv_SpecConstant(i32 -1, i1 false)
+// AMDGCNSPIRV-NEXT:    [[TOBOOL:%.*]] = icmp ne i1 [[HAS_GFX10_INSTS_]], false
 // AMDGCNSPIRV-NEXT:    br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
 // AMDGCNSPIRV:       [[IF_THEN]]:
 // AMDGCNSPIRV-NEXT:    call addrspace(4) void @llvm.trap()
@@ -39,9 +39,9 @@ void foo() {
         return __builtin_trap();
 }
 //.
-// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" }
+// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" }
 //.
-// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" }
+// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" }
 // AMDGCN-GFX1010: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
 //.
 // AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" }
diff --git a/clang/test/CodeGen/amdgpu-builtin-processor-is.c b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
index 028f97f41181f..5a1f63c673127 100644
--- a/clang/test/CodeGen/amdgpu-builtin-processor-is.c
+++ b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
@@ -7,8 +7,9 @@
 // things happens:
 //    1) for gfx900 we emit a call to trap (concrete target, matches)
 //    2) for gfx1010 we emit an empty kernel (concrete target, does not match)
-//    3) for AMDGCNSPIRV we emit llvm.amdgcn.is.gfx900 as a bool global, and
-//       load from it to provide the condition a br (abstract target)
+//    3) for AMDGCNSPIRV we emit a boolean specialization constant, via a call
+//       to __spirv_SpecConstant, with an id of UINT32_MAX and a boolean
+//       value of false, which will yield an OpSpecConstantFalse in SPIR-V
 // AMDGCN-GFX900-LABEL: define dso_local void @foo(
 // AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] {
 // AMDGCN-GFX900-NEXT:  [[ENTRY:.*:]]
@@ -23,8 +24,8 @@
 // AMDGCNSPIRV-LABEL: define spir_func void @foo(
 // AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] {
 // AMDGCNSPIRV-NEXT:  [[ENTRY:.*:]]
-// AMDGCNSPIRV-NEXT:    [[LLVM_AMDGCN_IS_GFX900:%.*]] = call addrspace(4) i1 @_Z20__spirv_SpecConstant(i32 -1, i1 false)
-// AMDGCNSPIRV-NEXT:    [[TOBOOL:%.*]] = icmp ne i1 [[LLVM_AMDGCN_IS_GFX900]], false
+// AMDGCNSPIRV-NEXT:    [[IS_GFX900_:%.*]] = call addrspace(4) i1 @_Z20__spirv_SpecConstant(i32 -1, i1 false)
+// AMDGCNSPIRV-NEXT:    [[TOBOOL:%.*]] = icmp ne i1 [[IS_GFX900_]], false
 // AMDGCNSPIRV-NEXT:    br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
 // AMDGCNSPIRV:       [[IF_THEN]]:
 // AMDGCNSPIRV-NEXT:    call addrspace(4) void @llvm.trap()
@@ -37,10 +38,10 @@ void foo() {
         return __builtin_trap();
 }
 //.
-// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dpp,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize64" }
+// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" }
 // AMDGCN-GFX900: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
 //.
-// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+sad-insts,+wavefrontsize32" }
+// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" }
 //.
 // AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" }
 // AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }

>From eb4beb5d8ec9a85180ec79bb274a4a266700d1ab Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Sat, 24 Jan 2026 03:02:05 +0000
Subject: [PATCH 49/69] Include what we use.

---
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index c40155497f443..8fb2fe1577e2f 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -15,7 +15,6 @@
 #include "clang/Basic/DiagnosticFrontend.h"
 #include "clang/Basic/SyncScope.h"
 #include "clang/Basic/TargetBuiltins.h"
-#include "TargetInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"

>From e326aef6f61d2ce6517f04ecc74ebddb7c71fdb7 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Sat, 24 Jan 2026 03:05:25 +0000
Subject: [PATCH 50/69] Tweak release notes.

---
 clang/docs/ReleaseNotes.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 148acf04000e2..9c8dfb1024abc 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -206,7 +206,6 @@ AMDGPU Support
   a late / deferred query for the current target processor.
 - Introduced a new target specific builtin ``__builtin_amdgcn_is_invocable``,
   which enables fine-grained, per-builtin, feature availability.
-
 - Initial support for gfx1310
 
 NVPTX Support

>From 84d01ac4112247612354415b84518a1449518ad5 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Sat, 24 Jan 2026 03:05:44 +0000
Subject: [PATCH 51/69] Update BI docs.

---
 clang/docs/LanguageExtensions.rst | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index cb278b696675c..3d4e201595b7a 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -5445,11 +5445,12 @@ The boolean interpretation of the predicate values returned by the builtins:
 When invoked while compiling for a concrete target, the builtins are evaluated
 early by Clang, and never produce any CodeGen effects / have no observable
 side-effects in IR. Conversely, when compiling for AMDGCN flavoured SPIR-v,
-which is an abstract target, a series of predicate values are implicitly
-created. These predicates get resolved when finalizing the compilation process
-for a concrete target, and shall reflect the latter's identity and features.
-Thus, it is possible to author high-level code, in e.g. HIP, that is target
-adaptive in a dynamic fashion, contrary to macro based mechanisms.
+which is an abstract target, a series of specialization constants is implicitly
+created, in correspondence with the predicates. These predicates get resolved
+when finalizing the compilation process for a concrete target, and shall reflect
+the latter's identity and features. Thus, it is possible to author high-level
+code, in e.g. HIP, that is target-adaptive in a dynamic fashion, contrary to
+macro-based mechanisms.
 
 __builtin_amdgcn_ballot_w{32,64}
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

>From 0a211ba88baea5092c6ead0903dd58243ea6be37 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Sun, 25 Jan 2026 23:44:25 +0000
Subject: [PATCH 52/69] Remove vestigial test.

---
 .../amdgpu-expand-feature-predicates.ll       | 284 ------------------
 1 file changed, 284 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll
deleted file mode 100644
index a16a7fc31da22..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll
+++ /dev/null
@@ -1,284 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; REQUIRES: amdgpu-registered-target
-
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX906 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1010 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1101 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1101 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -mattr=+wavefrontsize64 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201-W64 %s
-
-;; The IR was derived from the following source:
-;; extern "C" __global__ void kernel(int* p, int x)
-;; {
-;;     if (__builtin_amdgcn_processor_is("gfx1201") ||
-;;         __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var))
-;;         __builtin_amdgcn_s_sleep_var(x);
-;;     if (!__builtin_amdgcn_processor_is("gfx906"))
-;;         __builtin_amdgcn_s_wait_event_export_ready();
-;;     else if (__builtin_amdgcn_processor_is("gfx1010") ||
-;;         __builtin_amdgcn_processor_is("gfx1101"))
-;;         __builtin_amdgcn_s_ttracedata_imm(1);
-;;     while (__builtin_amdgcn_processor_is("gfx1101")) *p += x;
-;;     do {
-;;         *p -= x;
-;;     } while (__builtin_amdgcn_processor_is("gfx1010"));
-;;     for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break;
-;;
-;;     if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready))
-;;         __builtin_amdgcn_s_wait_event_export_ready();
-;;     else if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_ttracedata_imm))
-;;         __builtin_amdgcn_s_ttracedata_imm(1);
-;;
-;;     do {
-;;         *p -= x;
-;;     } while (__builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32));
-;;     for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p) break;
-;; }
-
- at llvm.amdgcn.is.gfx1201 = external addrspace(1) externally_initialized constant i1
- at llvm.amdgcn.has.gfx12-insts = external addrspace(1) externally_initialized constant i1
- at llvm.amdgcn.is.gfx906 = external addrspace(1) externally_initialized constant i1
- at llvm.amdgcn.is.gfx1010 = external addrspace(1) externally_initialized constant i1
- at llvm.amdgcn.is.gfx1101 = external addrspace(1) externally_initialized constant i1
- at llvm.amdgcn.has.gfx11-insts = external addrspace(1) externally_initialized constant i1
- at llvm.amdgcn.has.gfx10-insts = external addrspace(1) externally_initialized constant i1
-@"llvm.amdgcn.has.gfx12-insts,wavefrontsize64" = external addrspace(1) externally_initialized constant i1
-
-declare void @llvm.amdgcn.s.sleep.var(i32)
-declare void @llvm.amdgcn.s.wait.event.export.ready()
-declare void @llvm.amdgcn.s.ttracedata.imm(i16 immarg)
-
-define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) {
-; GFX906-LABEL: define amdgpu_kernel void @kernel(
-; GFX906-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX906-NEXT:  [[ENTRY:.*:]]
-; GFX906-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
-; GFX906-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
-; GFX906-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]]
-; GFX906:       [[IF_GFX1201_OR_GFX12_INSTS]]:
-; GFX906-NEXT:    br label %[[IF_NOT_GFX907:.*]]
-; GFX906:       [[IF_NOT_GFX907]]:
-; GFX906-NEXT:    br label %[[IF_GFX1010_OR_GFX1101:.*]]
-; GFX906:       [[IF_GFX1010_OR_GFX1101]]:
-; GFX906-NEXT:    br label %[[LOR_NOT_GFX1010:.*]]
-; GFX906:       [[LOR_NOT_GFX1010]]:
-; GFX906-NEXT:    br label %[[FOR_COND:.*]]
-; GFX906:       [[FOR_COND]]:
-; GFX906-NEXT:    [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
-; GFX906-NEXT:    [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
-; GFX906-NEXT:    store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
-; GFX906-NEXT:    br label %[[IF_GFX11_INSTS:.*]]
-; GFX906:       [[IF_GFX11_INSTS]]:
-; GFX906-NEXT:    br label %[[IF_GFX10_INSTS:.*]]
-; GFX906:       [[IF_GFX10_INSTS]]:
-; GFX906-NEXT:    call void @llvm.assume(i1 true)
-; GFX906-NEXT:    [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
-; GFX906-NEXT:    [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
-; GFX906-NEXT:    store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4
-; GFX906-NEXT:    ret void
-;
-; GFX1010-LABEL: define amdgpu_kernel void @kernel(
-; GFX1010-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX1010-NEXT:  [[ENTRY:.*:]]
-; GFX1010-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
-; GFX1010-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
-; GFX1010-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]]
-; GFX1010:       [[IF_GFX1201_OR_GFX12_INSTS]]:
-; GFX1010-NEXT:    br label %[[IF_NOT_GFX906:.*]]
-; GFX1010:       [[IF_NOT_GFX906]]:
-; GFX1010-NEXT:    br label %[[LOR_NOT_GFX1010:.*]]
-; GFX1010:       [[LOR_NOT_GFX1010]]:
-; GFX1010-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
-; GFX1010-NEXT:    br label %[[IF_END6:.*]]
-; GFX1010:       [[IF_END6]]:
-; GFX1010-NEXT:    call void @llvm.assume(i1 true)
-; GFX1010-NEXT:    call void @llvm.assume(i1 true)
-; GFX1010-NEXT:    br label %[[FOR_COND:.*]]
-; GFX1010:       [[FOR_COND]]:
-; GFX1010-NEXT:    [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
-; GFX1010-NEXT:    [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
-; GFX1010-NEXT:    store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
-; GFX1010-NEXT:    br label %[[IF_GFX11_INSTS:.*]]
-; GFX1010:       [[IF_GFX11_INSTS]]:
-; GFX1010-NEXT:    br label %[[IF_GFX10_INSTS:.*]]
-; GFX1010:       [[IF_GFX10_INSTS]]:
-; GFX1010-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
-; GFX1010-NEXT:    br label %[[IF_END11:.*]]
-; GFX1010:       [[IF_END11]]:
-; GFX1010-NEXT:    call void @llvm.assume(i1 true)
-; GFX1010-NEXT:    [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
-; GFX1010-NEXT:    [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
-; GFX1010-NEXT:    store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4
-; GFX1010-NEXT:    ret void
-;
-; GFX1101-LABEL: define amdgpu_kernel void @kernel(
-; GFX1101-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX1101-NEXT:  [[ENTRY:.*:]]
-; GFX1101-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
-; GFX1101-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
-; GFX1101-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]]
-; GFX1101:       [[IF_GFX1201_OR_GFX12_INSTS]]:
-; GFX1101-NEXT:    br label %[[IF_END:.*]]
-; GFX1101:       [[IF_END]]:
-; GFX1101-NEXT:    br label %[[IF_NOT_GFX907:.*]]
-; GFX1101:       [[IF_NOT_GFX907]]:
-; GFX1101-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
-; GFX1101-NEXT:    br label %[[IF_NOT_GFX906:.*]]
-; GFX1101:       [[IF_NOT_GFX906]]:
-; GFX1101-NEXT:    call void @llvm.assume(i1 true)
-; GFX1101-NEXT:    call void @llvm.assume(i1 true)
-; GFX1101-NEXT:    br label %[[FOR_COND:.*]]
-; GFX1101:       [[FOR_COND]]:
-; GFX1101-NEXT:    [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
-; GFX1101-NEXT:    [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
-; GFX1101-NEXT:    store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
-; GFX1101-NEXT:    br label %[[IF_GFX11_INSTS:.*]]
-; GFX1101:       [[IF_GFX11_INSTS]]:
-; GFX1101-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
-; GFX1101-NEXT:    br label %[[IF_ELSE8:.*]]
-; GFX1101:       [[IF_ELSE8]]:
-; GFX1101-NEXT:    call void @llvm.assume(i1 true)
-; GFX1101-NEXT:    [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
-; GFX1101-NEXT:    [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
-; GFX1101-NEXT:    store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4
-; GFX1101-NEXT:    ret void
-;
-; GFX1201-LABEL: define amdgpu_kernel void @kernel(
-; GFX1201-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX1201-NEXT:  [[ENTRY:.*:]]
-; GFX1201-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
-; GFX1201-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
-; GFX1201-NEXT:    br label %[[LOR_NOT_GFX1201:.*]]
-; GFX1201:       [[LOR_NOT_GFX1201]]:
-; GFX1201-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
-; GFX1201-NEXT:    br label %[[IF_NOT_GFX906:.*]]
-; GFX1201:       [[IF_NOT_GFX906]]:
-; GFX1201-NEXT:    br label %[[IF_GFX1010_OR_GFX1101:.*]]
-; GFX1201:       [[IF_GFX1010_OR_GFX1101]]:
-; GFX1201-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
-; GFX1201-NEXT:    br label %[[IF_END6:.*]]
-; GFX1201:       [[IF_END6]]:
-; GFX1201-NEXT:    call void @llvm.assume(i1 true)
-; GFX1201-NEXT:    call void @llvm.assume(i1 true)
-; GFX1201-NEXT:    br label %[[FOR_COND:.*]]
-; GFX1201:       [[FOR_COND]]:
-; GFX1201-NEXT:    [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
-; GFX1201-NEXT:    [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
-; GFX1201-NEXT:    store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
-; GFX1201-NEXT:    br label %[[IF_GFX11_INSTS:.*]]
-; GFX1201:       [[IF_GFX11_INSTS]]:
-; GFX1201-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
-; GFX1201-NEXT:    br label %[[IF_ELSE8:.*]]
-; GFX1201:       [[IF_ELSE8]]:
-; GFX1201-NEXT:    call void @llvm.assume(i1 true)
-; GFX1201-NEXT:    [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
-; GFX1201-NEXT:    [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
-; GFX1201-NEXT:    store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4
-; GFX1201-NEXT:    ret void
-;
-; GFX1201-W64-LABEL: define amdgpu_kernel void @kernel(
-; GFX1201-W64-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX1201-W64-NEXT:  [[ENTRY:.*:]]
-; GFX1201-W64-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
-; GFX1201-W64-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
-; GFX1201-W64-NEXT:    br label %[[LOR_NOT_GFX1201:.*]]
-; GFX1201-W64:       [[LOR_NOT_GFX1201]]:
-; GFX1201-W64-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
-; GFX1201-W64-NEXT:    br label %[[IF_NOT_GFX906:.*]]
-; GFX1201-W64:       [[IF_NOT_GFX906]]:
-; GFX1201-W64-NEXT:    br label %[[IF_GFX1010_OR_GFX1101:.*]]
-; GFX1201-W64:       [[IF_GFX1010_OR_GFX1101]]:
-; GFX1201-W64-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
-; GFX1201-W64-NEXT:    br label %[[IF_END6:.*]]
-; GFX1201-W64:       [[IF_END6]]:
-; GFX1201-W64-NEXT:    call void @llvm.assume(i1 true)
-; GFX1201-W64-NEXT:    call void @llvm.assume(i1 true)
-; GFX1201-W64-NEXT:    br label %[[FOR_COND:.*]]
-; GFX1201-W64:       [[FOR_COND]]:
-; GFX1201-W64-NEXT:    [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
-; GFX1201-W64-NEXT:    [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
-; GFX1201-W64-NEXT:    store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
-; GFX1201-W64-NEXT:    br label %[[IF_GFX11_INSTS:.*]]
-; GFX1201-W64:       [[IF_GFX11_INSTS]]:
-; GFX1201-W64-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
-; GFX1201-W64-NEXT:    br label %[[IF_ELSE8:.*]]
-; GFX1201-W64:       [[IF_ELSE8]]:
-; GFX1201-W64-NEXT:    call void @llvm.assume(i1 true)
-; GFX1201-W64-NEXT:    [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
-; GFX1201-W64-NEXT:    [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
-; GFX1201-W64-NEXT:    store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4
-; GFX1201-W64-NEXT:    ret void
-;
-entry:
-  %0 = ptrtoint ptr addrspace(1) %p.coerce to i64
-  %1 = inttoptr i64 %0 to ptr
-  %2 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1201, align 1
-  br i1 %2, label %if.gfx1201.or.gfx12-insts, label %lor.not.gfx1201
-
-lor.not.gfx1201:
-  %3 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx12-insts, align 1
-  br i1 %3, label %if.gfx1201.or.gfx12-insts, label %if.end
-
-if.gfx1201.or.gfx12-insts:
-  call void @llvm.amdgcn.s.sleep.var(i32 %x)
-  br label %if.end
-
-if.end:
-  %4 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx906, align 1
-  br i1 %4, label %if.gfx906, label %if.not.gfx906
-
-if.not.gfx906:
-  call void @llvm.amdgcn.s.wait.event.export.ready()
-  br label %if.end6
-
-if.gfx906:
-  %5 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1010, align 1
-  br i1 %5, label %if.gfx1010.or.gfx1101, label %lor.not.gfx1010
-
-lor.not.gfx1010:
-  %6 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1101, align 1
-  br i1 %6, label %if.gfx1010.or.gfx1101, label %for.cond
-
-if.gfx1010.or.gfx1101:
-  call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
-  br label %if.end6
-
-if.end6:
-  %.pr.pr = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1101, align 1
-  %7 = icmp ne i1 %.pr.pr, true
-  call void @llvm.assume(i1 %7)
-  %.pr6.pr = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1010, align 1
-  %8 = icmp ne i1 %.pr6.pr, true
-  call void @llvm.assume(i1 %8)
-  br label %for.cond
-
-for.cond:
-  %.promoted = load i32, ptr %1, align 4
-  %sub.peel = sub nsw i32 %.promoted, %x
-  store i32 %sub.peel, ptr %1, align 4
-  %9 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx11-insts, align 1
-  br i1 %9, label %if.gfx11-insts, label %if.else8
-
-if.gfx11-insts:
-  call void @llvm.amdgcn.s.wait.event.export.ready()
-  br label %if.end11
-
-if.else8:
-  %10 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx10-insts, align 1
-  br i1 %10, label %if.gfx10-insts, label %if.end11
-
-if.gfx10-insts:
-  call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
-  br label %if.end11
-
-if.end11:
-  %.pr7 = load i1, ptr addrspace(1) @"llvm.amdgcn.has.gfx12-insts,wavefrontsize64", align 1
-  %11 = icmp ne i1 %.pr7, true
-  call void @llvm.assume(i1 %11)
-  %.promoted9 = load i32, ptr %1, align 4
-  %sub13.peel = sub nsw i32 %.promoted9, %x
-  store i32 %sub13.peel, ptr %1, align 4
-  ret void
-}
-
-declare void @llvm.assume(i1 noundef)

>From 89e5d01ccec926af268af2d10b140640b4a598c1 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Mon, 26 Jan 2026 00:29:05 +0000
Subject: [PATCH 53/69] Remove debugging bit.

---
 llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp
index bf97a1f571abf..45016f19cd64b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp
@@ -121,7 +121,6 @@ bool tryAssignPredicateSpecConstIDs(Module &M, Function *F) {
   new GlobalVariable(M, PredSpecIDStr->getType(), true,
                      GlobalVariable::LinkageTypes::PrivateLinkage,
                      PredSpecIDStr, "llvm.amdgcn.feature.predicate.ids");
-  M.dump();
 
   return true;
 }

>From 097f2744e1170c088fd4b77b2f55b483e33b1f0c Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Tue, 27 Jan 2026 02:59:58 +0000
Subject: [PATCH 54/69] Emit decoration AFTER defining the decorated register.

---
 llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 1fbcf50ba02d7..c98003bef8d21 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -2394,11 +2394,6 @@ static bool generateSpecConstantInst(const SPIRV::IncomingCall *Call,
 
   switch (Opcode) {
   case SPIRV::OpSpecConstant: {
-    // Build the SpecID decoration.
-    unsigned SpecId =
-        static_cast<unsigned>(getIConstVal(Call->Arguments[0], MRI));
-    buildOpDecorate(Call->ReturnRegister, MIRBuilder, SPIRV::Decoration::SpecId,
-                    {SpecId});
     // Determine the constant MI.
     Register ConstRegister = Call->Arguments[1];
     const MachineInstr *Const = getDefInstrMaybeConstant(ConstRegister, MRI);
@@ -2424,6 +2419,11 @@ static bool generateSpecConstantInst(const SPIRV::IncomingCall *Call,
       else
         addNumImm(ConstOperand.getFPImm()->getValueAPF().bitcastToAPInt(), MIB);
     }
+    // Build the SpecID decoration.
+    unsigned SpecId =
+        static_cast<unsigned>(getIConstVal(Call->Arguments[0], MRI));
+    buildOpDecorate(Call->ReturnRegister, MIRBuilder, SPIRV::Decoration::SpecId,
+                    {SpecId});
     return true;
   }
   case SPIRV::OpSpecConstantComposite: {

>From d9012fafb5f425e8959e0458a59b39151ff28055 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Tue, 27 Jan 2026 16:16:17 +0000
Subject: [PATCH 55/69] Add BE test.

---
 ...gcnspirv-feature-predicate-specconstant.ll | 246 ++++++++++++++++++
 1 file changed, 246 insertions(+)
 create mode 100644 llvm/test/CodeGen/SPIRV/SpecConstants/amdgcnspirv-feature-predicate-specconstant.ll

diff --git a/llvm/test/CodeGen/SPIRV/SpecConstants/amdgcnspirv-feature-predicate-specconstant.ll b/llvm/test/CodeGen/SPIRV/SpecConstants/amdgcnspirv-feature-predicate-specconstant.ll
new file mode 100644
index 0000000000000..32b18dc1023e4
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/SpecConstants/amdgcnspirv-feature-predicate-specconstant.ll
@@ -0,0 +1,246 @@
+; RUN: llc -O0 --verify-machineinstrs -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: OpName %[[#KERNEL:]] "kernel"
+; CHECK: OpName %[[#FEATURE_PREDICATE_IDS:]] "llvm.amdgcn.feature.predicate.ids"
+; CHECK: OpName %[[#SET_FPENV_I64:]] "spirv.llvm_set_fpenv_i64"
+; CHECK: OpName %[[#ASHR_PK_I8_I32:]] "spirv.llvm_amdgcn_ashr_pk_i8_i32"
+; CHECK: OpName %[[#S_SLEEP_VAR:]] "spirv.llvm_amdgcn_s_sleep_var"
+; CHECK: OpName %[[#S_WAIT_EVENT_EXPORT_READY:]] "spirv.llvm_amdgcn_s_wait_event_export_ready"
+; CHECK: OpName %[[#S_TTRACEDATA_IMM:]] "spirv.llvm_amdgcn_s_ttracedata_imm"
+; CHECK: OpDecorate %[[#IS_GFX950:]] SpecId 7
+; CHECK: OpDecorate %[[#IS_GFX1201:]] SpecId 2
+; CHECK: OpDecorate %[[#HAS_GFX12_INSTS:]] SpecId 6
+; CHECK: OpDecorate %[[#IS_GFX906:]] SpecId 5
+; CHECK: OpDecorate %[[#IS_GFX1010:]] SpecId 4
+; CHECK: OpDecorate %[[#IS_GFX1101:]] SpecId 3
+; CHECK: OpDecorate %[[#IS_GFX1101_1:]] SpecId 3
+; CHECK: OpDecorate %[[#IS_GFX1201_1:]] SpecId 2
+; CHECK: OpDecorate %[[#HAS_GFX11_INSTS:]] SpecId 0
+; CHECK: OpDecorate %[[#HAS_GFX10_INSTS:]] SpecId 1
+; CHECK: OpDecorate %[[#HAS_GFX11_INSTS_1:]] SpecId 0
+; CHECK: %[[#BOOL:]] = OpTypeBool
+; CHECK: %[[#UCHAR:]] = OpTypeInt 8
+; CHECK: %[[#FEATURE_PREDICATE_IDS_MAP_STRLEN:]] = OpConstant %[[#]] 117
+; CHECK: %[[#FEATURE_PREDICATE_IDS_MAP_STRTY:]] = OpTypeArray %[[#UCHAR]] %[[#FEATURE_PREDICATE_IDS_MAP_STRLEN]]
+; CHECK: %[[#FEATURE_PREDICATE_IDS_MAP_STRVAL:]] = OpConstantComposite %[[#FEATURE_PREDICATE_IDS_MAP_STRTY]]
+; CHECK: %[[#FEATURE_PREDICATE_IDS]] = OpVariable %[[#]] CrossWorkgroup %[[#FEATURE_PREDICATE_IDS_MAP_STRVAL]]
+; CHECK: %[[#IS_GFX950]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#IS_GFX1201]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#HAS_GFX12_INSTS]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#IS_GFX906]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#IS_GFX1010]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#IS_GFX1101]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#IS_GFX1101_1]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#IS_GFX1201_1]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#HAS_GFX11_INSTS]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#HAS_GFX10_INSTS]] = OpSpecConstantFalse %[[#BOOL]]
+; CHECK: %[[#HAS_GFX11_INSTS_1]] = OpSpecConstantFalse %[[#BOOL]]
+
+declare void @llvm.amdgcn.s.monitor.sleep(i16 immarg) addrspace(4)
+
+declare void @llvm.amdgcn.s.sleep(i32 immarg) addrspace(4)
+
+declare i1 @_Z20__spirv_SpecConstantib(i32, i1) addrspace(4)
+
+declare i16 @llvm.amdgcn.ashr.pk.i8.i32(i32, i32, i32) addrspace(4) #3
+
+declare void @llvm.set.fpenv.i64(i64) addrspace(4) #4
+
+declare void @llvm.amdgcn.s.sleep.var(i32) addrspace(4) #5
+
+declare void @llvm.amdgcn.s.wait.event.export.ready() addrspace(4) #5
+
+declare void @llvm.amdgcn.s.ttracedata.imm(i16 immarg) addrspace(4) #6
+
+ at p = external addrspace(1) global i32
+ at g = external addrspace(1) constant i32
+
+define void @kernel() addrspace(4) {
+; CHECK-DAG: %[[#KERNEL]] = OpFunction %33 None %34 ; -- Begin function kernel
+; CHECK-NEXT: %2 = OpLabel
+; CHECK-NEXT: %100 = OpLoad %36 %75 Aligned 4
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX950]] %4 %3
+; CHECK-NEXT: %3 = OpLabel
+; CHECK-NEXT: %101 = OpFunctionCall %33 %[[#SET_FPENV_I64]] %50
+; CHECK-NEXT: OpBranch %5
+; CHECK-NEXT: %4 = OpLabel
+; CHECK-NEXT: %102 = OpFunctionCall %39 %[[#ASHR_PK_I8_I32]] %49 %49 %49
+; CHECK-NEXT: OpBranch %5
+; CHECK-NEXT: %5 = OpLabel
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX1201]] %7 %6
+; CHECK-NEXT: %6 = OpLabel
+; CHECK-NEXT: OpBranchConditional %[[#HAS_GFX12_INSTS]] %7 %8
+; CHECK-NEXT: %7 = OpLabel
+; CHECK-NEXT: %103 = OpFunctionCall %33 %[[#S_SLEEP_VAR]] %100
+; CHECK-NEXT: OpBranch %8
+; CHECK-NEXT: %8 = OpLabel
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX906]] %10 %9
+; CHECK-NEXT: %9 = OpLabel
+; CHECK-NEXT: %104 = OpFunctionCall %33 %[[#S_WAIT_EVENT_EXPORT_READY]]
+; CHECK-NEXT: OpBranch %14
+; CHECK-NEXT: %10 = OpLabel
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX1010]] %12 %11
+; CHECK-NEXT: %11 = OpLabel
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX1101]] %12 %13
+; CHECK-NEXT: %12 = OpLabel
+; CHECK-NEXT: %105 = OpFunctionCall %33 %[[#S_TTRACEDATA_IMM]] %48
+; CHECK-NEXT: OpBranch %13
+; CHECK-NEXT: %13 = OpLabel
+; CHECK-NEXT: OpBranch %14
+; CHECK-NEXT: %14 = OpLabel
+; CHECK-NEXT: OpBranch %15
+; CHECK-NEXT: %15 = OpLabel
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX1101_1]] %16 %17
+; CHECK-NEXT: %16 = OpLabel
+; CHECK-NEXT: %106 = OpLoad %36 %87 Aligned 4
+; CHECK-NEXT: %107 = OpIAdd %36 %106 %100
+; CHECK-NEXT: OpStore %87 %107 Aligned 4
+; CHECK-NEXT: OpBranch %17
+; CHECK-NEXT: %17 = OpLabel
+; CHECK-NEXT: OpBranch %18
+; CHECK-NEXT: %18 = OpLabel
+; CHECK-NEXT: %108 = OpLoad %36 %87 Aligned 4
+; CHECK-NEXT: %109 = OpISub %36 %108 %100
+; CHECK-NEXT: OpStore %87 %109 Aligned 4
+; CHECK-NEXT: OpBranch %19
+; CHECK-NEXT: %19 = OpLabel
+; CHECK-NEXT: OpBranch %20
+; CHECK-NEXT: %20 = OpLabel
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX1201_1]] %21 %22
+; CHECK-NEXT: %21 = OpLabel
+; CHECK-NEXT: OpBranch %22
+; CHECK-NEXT: %22 = OpLabel
+; CHECK-NEXT: OpBranchConditional %[[#HAS_GFX11_INSTS]] %26 %23
+; CHECK-NEXT: %23 = OpLabel
+; CHECK-NEXT: OpBranchConditional %[[#HAS_GFX10_INSTS]] %24 %25
+; CHECK-NEXT: %24 = OpLabel
+; CHECK-NEXT: %110 = OpFunctionCall %33 %[[#S_TTRACEDATA_IMM]] %48
+; CHECK-NEXT: OpBranch %25
+; CHECK-NEXT: %25 = OpLabel
+; CHECK-NEXT: OpBranch %27
+; CHECK-NEXT: %26 = OpLabel
+; CHECK-NEXT: %111 = OpFunctionCall %33 %[[#S_WAIT_EVENT_EXPORT_READY]]
+; CHECK-NEXT: OpBranch %27
+; CHECK-NEXT: %27 = OpLabel
+; CHECK-NEXT: OpBranch %28
+; CHECK-NEXT: %28 = OpLabel
+; CHECK-NEXT: %112 = OpLoad %36 %87 Aligned 4
+; CHECK-NEXT: %113 = OpISub %36 %112 %100
+; CHECK-NEXT: OpStore %87 %113 Aligned 4
+; CHECK-NEXT: OpBranch %29
+; CHECK-NEXT: %29 = OpLabel
+; CHECK-NEXT: OpBranch %30
+; CHECK-NEXT: %30 = OpLabel
+; CHECK-NEXT: OpBranchConditional %[[#HAS_GFX11_INSTS_1]] %31 %32
+; CHECK-NEXT: %31 = OpLabel
+; CHECK-NEXT: OpBranch %32
+; CHECK-NEXT: %32 = OpLabel
+
+entry:
+  %x = load i32, ptr addrspace(1) @g
+  %is.gfx950. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  br i1 %is.gfx950., label %cond.true, label %cond.false
+cond.true:
+  %0 = call addrspace(4) i16 @llvm.amdgcn.ashr.pk.i8.i32(i32 8, i32 8, i32 8)
+  br label %cond.end
+cond.false:
+  call addrspace(4) void @llvm.set.fpenv.i64(i64 -1)
+  br label %cond.end
+cond.end:
+  %is.gfx1201. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  br i1 %is.gfx1201., label %if.then, label %lor.lhs.false
+lor.lhs.false:
+  %has.gfx12-insts. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  br i1 %has.gfx12-insts., label %if.then, label %if.end
+if.then:
+  call addrspace(4) void @llvm.amdgcn.s.sleep.var(i32 %x)
+  br label %if.end
+if.end:
+  %is.gfx906. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  br i1 %is.gfx906., label %if.else, label %if.then2
+if.then2:
+  call addrspace(4) void @llvm.amdgcn.s.wait.event.export.ready()
+  br label %if.end6
+if.else:
+  %is.gfx1010. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  br i1 %is.gfx1010., label %if.then4, label %lor.lhs.false3
+lor.lhs.false3:
+  %is.gfx1101. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  br i1 %is.gfx1101., label %if.then4, label %if.end5
+if.then4:
+  call addrspace(4) void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+  br label %if.end5
+if.end5:
+  br label %if.end6
+if.end6:
+  br label %while.cond
+while.cond:
+  %is.gfx1101.7 = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  br i1 %is.gfx1101.7, label %while.body, label %while.end
+while.body:
+  %4 = load i32, ptr addrspace(1) @p
+  %add = add i32 %4, %x
+  store i32 %add, ptr addrspace(1) @p
+  br label %while.end
+while.end:
+  br label %do.body
+do.body:
+  %7 = load i32, ptr addrspace(1) @p
+  %sub = sub i32 %7, %x
+  store i32 %sub, ptr addrspace(1) @p
+  br label %do.end
+do.cond:
+  %is.gfx1010.8 = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  br i1 %is.gfx1010.8, label %do.body, label %do.end
+do.end:
+  br label %for.cond
+for.cond:
+  %is.gfx1201.9 = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  br i1 %is.gfx1201.9, label %for.body, label %for.end
+for.body:
+  br label %for.end
+for.inc:
+  %9 = load i32, ptr addrspace(1) @p
+  %inc = add i32 %9, 1
+  store i32 %inc, ptr addrspace(1) @p
+  br label %for.cond
+for.end:
+  %has.gfx11-insts. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  br i1 %has.gfx11-insts., label %if.then10, label %if.else11
+if.then10:
+  call addrspace(4) void @llvm.amdgcn.s.wait.event.export.ready()
+  br label %if.end14
+if.else11:
+  %has.gfx10-insts. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  br i1 %has.gfx10-insts., label %if.then12, label %if.end13
+if.then12:
+  call addrspace(4) void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+  br label %if.end13
+if.end13:
+  br label %if.end14
+if.end14:
+  br label %do.body15
+do.body15:
+  %12 = load i32, ptr addrspace(1) @p
+  %sub16 = sub i32 %12, %x
+  store i32 %sub16, ptr addrspace(1) @p
+  br label %do.end18
+do.cond17:
+  %has.gfx1250-insts. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  br i1 %has.gfx1250-insts., label %do.body15, label %do.end18
+do.end18:
+  br label %for.cond19
+for.cond19:
+  %has.gfx11-insts.20 = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  br i1 %has.gfx11-insts.20, label %for.body21, label %for.end24
+for.body21:
+  br label %for.end24
+for.inc22:
+  %14 = load i32, ptr addrspace(1) @p
+  %inc23 = add i32 %14, 1
+  store i32 %inc23, ptr addrspace(1) @p
+  br label %for.cond19
+for.end24:
+  ret void
+}

>From c76c741d5717d2a25c672bbb0fc71bfa7f33df6f Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 30 Jan 2026 00:10:38 +0000
Subject: [PATCH 56/69] Finalise diagnostics.

---
 .../clang/Basic/DiagnosticSemaKinds.td        | 16 ++-
 clang/lib/Sema/SemaAMDGPU.cpp                 | 99 +++++++++++++++----
 ...amdgpu-is-invocable-guards-builtin-use.hip | 47 ---------
 .../SemaHIP/amdgpu-predicates-guard-use.hip   | 70 +++++++++++++
 4 files changed, 163 insertions(+), 69 deletions(-)
 delete mode 100644 clang/test/SemaHIP/amdgpu-is-invocable-guards-builtin-use.hip
 create mode 100644 clang/test/SemaHIP/amdgpu-predicates-guard-use.hip

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 58b4ac9a9cf33..f317232984553 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13899,7 +13899,7 @@ def note_amdgcn_processor_is_valid_options
     : Note<"valid AMDGCN processor identifiers are: %0">;
 def err_amdgcn_is_invocable_arg_invalid_value
     : Error<"the argument to __builtin_amdgcn_is_invocable must be either a "
-            "target agnostic builtin or an AMDGCN target specific builtin; `%0`"
+            "target agnostic builtin or an AMDGCN target specific builtin; '%0'"
             " is not valid">;
 def err_amdgcn_predicate_type_is_not_constructible
     : Error<"%0 has type __amdgpu_feature_predicate_t, which is not"
@@ -13910,6 +13910,20 @@ def err_amdgcn_predicate_type_needs_explicit_bool_cast
             "guarding of target dependent code, and thus should be avoided">;
 def note_amdgcn_protected_by_predicate : Note<"jump enters statement controlled"
                                               " by AMDGPU feature predicate">;
+def err_amdgcn_conflicting_is_processor_options
+    : Error<"conflicting check for AMDGCN processor %0 found in a scope already"
+            " controlled by a check for AMDGCN processor">;
+def note_amdgcn_previous_is_processor_guard
+    : Note<"predicate guard, which establishes the context, inserted here">;
+def warn_amdgcn_unguarded_asm_stmt
+    : Warning<"the '%0' ASM sequence might be invalid for some AMDGPU targets">,
+    InGroup<UnguardedBuiltinUsageAMDGPU>, DefaultIgnore;
+def note_amdgcn_unguarded_asm_silence
+  : Note<"enclose the '%0' ASM sequence in a scope controlled by a "
+         "__builtin_amdgcn_processor_is check to silence this warning">;
+def err_amdgcn_incompatible_builtin
+  : Error<"%0 cannot be invoked in the current context, as it requires the "
+          "'%1' feature(s)%select{|, which '%3' does not provide}2">;
 def warn_amdgcn_unguarded_builtin :
   Warning<"%0 might be unavailable on some AMDGPU targets">,
   InGroup<UnguardedBuiltinUsageAMDGPU>, DefaultIgnore;
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index f63e5a599a58f..cd7d6f6bb2bb4 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -21,9 +21,14 @@
 #include "clang/Sema/Ownership.h"
 #include "clang/Sema/Scope.h"
 #include "clang/Sema/Sema.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/AtomicOrdering.h"
+#include "llvm/TargetParser/TargetParser.h"
 #include <cstdint>
+#include <utility>
 
 namespace clang {
 
@@ -723,21 +728,14 @@ namespace {
 /// form: \c if(__builtin_amdgcn_is_invocable), we consider the then statement
 /// guarded.
 class DiagnoseUnguardedBuiltins : public DynamicRecursiveASTVisitor {
-  // TODO: this is conservative, and should be extended to:
-  //       - warn on unguarded ASM usage (__builtin_amdgcn_processor_is as the
-  //         guard);
-  //       - build sets of builtins which are invocable from nested
-  //         if (__builtin_amdgcn_is_invocable) calls, rather than assume
-  //         sanity / that the existence of a guard implies its correctness;
-  //       - derive the set of available builtins / valid ASM constraints from
-  //         the target architecture passed to __builtin_amdgcn_processor_is;
-  //       - consider attributes such as target.
+  // TODO: this could eventually be extended to consider attributes such as
+  //       target.
   Sema &SemaRef;
 
-  unsigned Guards;
-
+  SmallVector<std::pair<CallExpr *, StringRef>> CurrentGFXIP;
+  SmallVector<std::pair<unsigned, StringRef>> GuardedBuiltins;
 public:
-  DiagnoseUnguardedBuiltins(Sema &SemaRef) : SemaRef(SemaRef), Guards(0u) {}
+  DiagnoseUnguardedBuiltins(Sema &SemaRef) : SemaRef(SemaRef) {}
 
   bool TraverseLambdaExpr(LambdaExpr *LE) override {
     if (SemaRef.AMDGPU().HasPotentiallyUnguardedBuiltinUsage(
@@ -760,12 +758,14 @@ class DiagnoseUnguardedBuiltins : public DynamicRecursiveASTVisitor {
     return TraverseStmt(CS->getSubStmt());
   }
 
+  bool VisitAsmStmt(AsmStmt *ASM) override;
   bool VisitCallExpr(CallExpr *CE) override;
 };
 
 inline Expr *FindPredicate(Expr *Cond) {
   if (auto *CE = dyn_cast<CallExpr>(Cond)) {
-    if (CE->getBuiltinCallee() == AMDGPU::BI__builtin_amdgcn_is_invocable)
+    if (CE->getBuiltinCallee() == AMDGPU::BI__builtin_amdgcn_is_invocable ||
+        CE->getBuiltinCallee() == AMDGPU::BI__builtin_amdgcn_processor_is)
       return Cond;
   } else if (auto *UO = dyn_cast<UnaryOperator>(Cond)) {
     return FindPredicate(UO->getSubExpr());
@@ -779,9 +779,34 @@ inline Expr *FindPredicate(Expr *Cond) {
 
 bool DiagnoseUnguardedBuiltins::TraverseIfStmt(IfStmt *If) {
   if (FindPredicate(If->getCond())) {
-    ++Guards;
+    auto *CE = cast<CallExpr>(If->getCond());
+    bool IsProcessorCheck =
+        CE->getBuiltinCallee() == AMDGPU::BI__builtin_amdgcn_processor_is;
+
+    if (IsProcessorCheck) {
+      StringRef G = cast<clang::StringLiteral>(CE->getArg(0))->getString();
+      // TODO: handle generic ISAs.
+      if (!CurrentGFXIP.empty() && G != CurrentGFXIP.back().second) {
+        SemaRef.Diag(CE->getExprLoc(),
+                     diag::err_amdgcn_conflicting_is_processor_options) << CE;
+        SemaRef.Diag(CurrentGFXIP.back().first->getExprLoc(),
+                     diag::note_amdgcn_previous_is_processor_guard);
+      }
+      CurrentGFXIP.emplace_back(CE, G);
+    } else {
+      auto *FD = cast<FunctionDecl>(
+          cast<DeclRefExpr>(CE->getArg(0))->getReferencedDeclOfCallee());
+      unsigned ID = FD->getBuiltinID();
+      StringRef F = SemaRef.getASTContext().BuiltinInfo.getRequiredFeatures(ID);
+      GuardedBuiltins.emplace_back(ID, F);
+    }
+
     bool Continue = TraverseStmt(If->getThen());
-    --Guards;
+
+    if (IsProcessorCheck)
+      CurrentGFXIP.pop_back();
+    else
+      GuardedBuiltins.pop_back();
 
     return Continue && TraverseStmt(If->getElse());
   }
@@ -789,10 +814,19 @@ bool DiagnoseUnguardedBuiltins::TraverseIfStmt(IfStmt *If) {
   return DynamicRecursiveASTVisitor::TraverseIfStmt(If);
 }
 
-bool DiagnoseUnguardedBuiltins::VisitCallExpr(CallExpr *CE) {
-  if (Guards)
+bool DiagnoseUnguardedBuiltins::VisitAsmStmt(AsmStmt *ASM) {
+  // TODO: should we check if the ASM is valid for the target? Can we?
+  if (!CurrentGFXIP.empty())
     return true;
 
+  std::string S = ASM->generateAsmString(SemaRef.getASTContext());
+  SemaRef.Diag(ASM->getAsmLoc(), diag::warn_amdgcn_unguarded_asm_stmt) << S;
+  SemaRef.Diag(ASM->getAsmLoc(), diag::note_amdgcn_unguarded_asm_silence) << S;
+
+  return true;
+}
+
+bool DiagnoseUnguardedBuiltins::VisitCallExpr(CallExpr *CE) {
   unsigned ID = CE->getBuiltinCallee();
 
   if (!ID)
@@ -802,11 +836,34 @@ bool DiagnoseUnguardedBuiltins::VisitCallExpr(CallExpr *CE) {
   if (ID == AMDGPU::BI__builtin_amdgcn_processor_is ||
       ID == AMDGPU::BI__builtin_amdgcn_is_invocable)
     return true;
+  if (llvm::any_of(GuardedBuiltins, [ID](auto &&B) { return B.first == ID; }))
+    return true;
+
+  StringRef FL(SemaRef.getASTContext().BuiltinInfo.getRequiredFeatures(ID));
+  llvm::StringMap<bool> FeatureMap;
+  if (CurrentGFXIP.empty()) {
+    for (auto &&[ID, RequiredFeatures] : GuardedBuiltins)
+      for (auto &&F : llvm::split(RequiredFeatures, ','))
+        FeatureMap[F] = true;
+  } else {
+    llvm::AMDGPU::fillAMDGPUFeatureMap(CurrentGFXIP.back().second,
+                                       llvm::Triple("amdgcn-amd-amdhsa"),
+                                       FeatureMap);
+  }
 
-  SemaRef.Diag(CE->getExprLoc(), diag::warn_amdgcn_unguarded_builtin)
-      << CE->getDirectCallee();
-  SemaRef.Diag(CE->getExprLoc(), diag::note_amdgcn_unguarded_builtin_silence)
-      << CE->getDirectCallee();
+  if (Builtin::evaluateRequiredTargetFeatures(FL, FeatureMap)) {
+    SemaRef.Diag(CE->getExprLoc(), diag::warn_amdgcn_unguarded_builtin)
+        << CE->getDirectCallee();
+    SemaRef.Diag(CE->getExprLoc(), diag::note_amdgcn_unguarded_builtin_silence)
+        << CE->getDirectCallee();
+  } else {
+    StringRef GFXIP = CurrentGFXIP.empty() ? "" : CurrentGFXIP.back().second;
+    SemaRef.Diag(CE->getExprLoc(), diag::err_amdgcn_incompatible_builtin)
+        << CE->getDirectCallee() << FL << !CurrentGFXIP.empty() << GFXIP;
+    if (!CurrentGFXIP.empty())
+      SemaRef.Diag(CurrentGFXIP.back().first->getExprLoc(),
+                   diag::note_amdgcn_previous_is_processor_guard);
+  }
 
   return true;
 }
diff --git a/clang/test/SemaHIP/amdgpu-is-invocable-guards-builtin-use.hip b/clang/test/SemaHIP/amdgpu-is-invocable-guards-builtin-use.hip
deleted file mode 100644
index 26544590f7536..0000000000000
--- a/clang/test/SemaHIP/amdgpu-is-invocable-guards-builtin-use.hip
+++ /dev/null
@@ -1,47 +0,0 @@
-// REQUIRES: amdgpu-registered-target
-// REQUIRES: spirv-registered-target
-// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wamdgpu-unguarded-builtin-usage %s
-
-#define __device__ __attribute__((device))
-#define __global__ __attribute__((global))
-
-__device__ void g();
-
-__device__ void f(int x, bool b) {
-    const auto lambda = [=] __device__  () {
-        __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}}
-        // expected-note at -1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}}
-
-        if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var))
-            __builtin_amdgcn_s_sleep_var(x);
-    };
-
-    const auto generic_lambda = [] __device__ (auto&& y) {
-        __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}}
-        // expected-note at -1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}}
-
-        if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var))
-            __builtin_amdgcn_s_sleep_var(y);
-    };
-
-    __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}}
-    // expected-note at -1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}}
-
-    // processor_is does not (yet) guard
-    if (__builtin_amdgcn_processor_is("gfx900"))
-        __builtin_amdgcn_s_sleep_var(x); // expected-warning {{'__builtin_amdgcn_s_sleep_var' might be unavailable on some AMDGPU targets}}
-        // expected-note at -1 {{enclose '__builtin_amdgcn_s_sleep_var' in a __builtin_amdgcn_is_invocable check to silence this warning}}
-
-    // Direct guard
-    if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep))
-        __builtin_amdgcn_s_sleep(42);
-
-    // Guarded scope
-    if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) {
-        if (b) {
-            g();
-            while (--x > 42)
-                __builtin_amdgcn_s_sleep_var(x);
-        }
-    }
-}
diff --git a/clang/test/SemaHIP/amdgpu-predicates-guard-use.hip b/clang/test/SemaHIP/amdgpu-predicates-guard-use.hip
new file mode 100644
index 0000000000000..2c79ccc724c99
--- /dev/null
+++ b/clang/test/SemaHIP/amdgpu-predicates-guard-use.hip
@@ -0,0 +1,70 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: spirv-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wamdgpu-unguarded-builtin-usage %s
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ void g();
+
+__device__ void f(int x, bool b) {
+    long v15_16;
+    __asm volatile("v_lshlrev_b64 v[15:16], 0, %0" : "={v[15:16]}"(v15_16) : "v"(x)); // expected-warning {{the 'v_lshlrev_b64 v[15:16], 0, $0' ASM sequence might be invalid for some AMDGPU targets}}
+    // expected-note at -1 {{enclose the 'v_lshlrev_b64 v[15:16], 0, $0' ASM sequence in a scope controlled by a __builtin_amdgcn_processor_is check to silence this warning}}
+
+    if (__builtin_amdgcn_processor_is("gfx90a")) {
+        long v15_16;
+        __asm volatile("v_lshlrev_b64 v[15:16], 0, %0" : "={v[15:16]}"(v15_16) : "v"(x));
+    }
+
+    const auto lambda = [=] __device__  () {
+        __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}}
+        // expected-note at -1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}}
+
+        if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var))
+            __builtin_amdgcn_s_sleep_var(x);
+    };
+
+    const auto generic_lambda = [] __device__ (auto&& y) {
+        __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}}
+        // expected-note at -1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}}
+
+        if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) {
+            __builtin_amdgcn_s_sleep_var(y);
+            // Has the same requirements - gfx12-insts, thus correct, but we should still warn.
+            __builtin_amdgcn_s_barrier_signal_var(nullptr, y); // expected-warning {{'__builtin_amdgcn_s_barrier_signal_var' might be unavailable on some AMDGPU targets}}
+            // expected-note at -1 {{enclose '__builtin_amdgcn_s_barrier_signal_var' in a __builtin_amdgcn_is_invocable check to silence this warning}}
+        }
+    };
+
+    __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}}
+    // expected-note at -1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}}
+
+    if (__builtin_amdgcn_processor_is("gfx1201")) {
+        if (__builtin_amdgcn_processor_is("gfx906")) // expected-error {{conflicting check for AMDGCN processor '__builtin_amdgcn_processor_is("gfx906")' found in a scope already controlled by a check for AMDGCN processor}}
+        // expected-note at -2 {{predicate guard, which establishes the context, inserted here}}
+            __builtin_trap();
+    }
+
+    if (__builtin_amdgcn_processor_is("gfx900")) {
+        if (__builtin_amdgcn_processor_is("gfx900")) // This is fine, albeit potentially spurious.
+            ++x;
+    }
+
+    if (__builtin_amdgcn_processor_is("gfx1030"))
+        __builtin_amdgcn_s_barrier_signal_isfirst(42); // expected-error {{'__builtin_amdgcn_s_barrier_signal_isfirst' cannot be invoked in the current context, as it requires the 'gfx12-insts' feature(s), which 'gfx1030' does not provide}}
+        // expected-note at -2 {{predicate guard, which establishes the context, inserted here}}
+
+    // Direct guard
+    if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep))
+        __builtin_amdgcn_s_sleep(42);
+
+    // Guarded scope
+    if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) {
+        if (b) {
+            g();
+            while (--x > 42)
+                __builtin_amdgcn_s_sleep_var(x);
+        }
+    }
+}

>From b84d483c917762f7e7b3dc3ef992e3561178c3d0 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 30 Jan 2026 00:33:00 +0000
Subject: [PATCH 57/69] Fix formatting.

---
 clang/lib/Sema/SemaAMDGPU.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index cd7d6f6bb2bb4..f3d347a36e472 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -788,7 +788,8 @@ bool DiagnoseUnguardedBuiltins::TraverseIfStmt(IfStmt *If) {
       // TODO: handle generic ISAs.
       if (!CurrentGFXIP.empty() && G != CurrentGFXIP.back().second) {
         SemaRef.Diag(CE->getExprLoc(),
-                     diag::err_amdgcn_conflicting_is_processor_options) << CE;
+                     diag::err_amdgcn_conflicting_is_processor_options)
+            << CE;
         SemaRef.Diag(CurrentGFXIP.back().first->getExprLoc(),
                      diag::note_amdgcn_previous_is_processor_guard);
       }

>From cc21799efa8ddcda92363be1003816f35f3d3770 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 30 Jan 2026 00:43:17 +0000
Subject: [PATCH 58/69] Update release notes, fix more formatting.

---
 clang/docs/ReleaseNotes.rst   | 4 ++--
 clang/lib/Sema/SemaAMDGPU.cpp | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c376d24128173..4134aaebd7366 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -253,7 +253,7 @@ AMDGPU Support
 - Introduced a new target specific builtin ``__builtin_amdgcn_processor_is``,
   a late / deferred query for the current target processor.
 - Introduced a new target specific builtin ``__builtin_amdgcn_is_invocable``,
-  which enables fine-grained, per-builtin, feature availability.
+  a late / deferred query for the availability of target specific builtins.
 - Initial support for gfx1310
 
 NVPTX Support
@@ -325,7 +325,7 @@ AST Matchers
 
 clang-format
 ------------
-- Add ``ObjCSpaceAfterMethodDeclarationPrefix`` option to control space between the 
+- Add ``ObjCSpaceAfterMethodDeclarationPrefix`` option to control space between the
   '-'/'+' and the return type in Objective-C method declarations
 
 libclang
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index f3d347a36e472..d1f9170881a43 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -734,6 +734,7 @@ class DiagnoseUnguardedBuiltins : public DynamicRecursiveASTVisitor {
 
   SmallVector<std::pair<CallExpr *, StringRef>> CurrentGFXIP;
   SmallVector<std::pair<unsigned, StringRef>> GuardedBuiltins;
+
 public:
   DiagnoseUnguardedBuiltins(Sema &SemaRef) : SemaRef(SemaRef) {}
 

>From 1711293a96722c6172878c964fcdf77129fdfae0 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 30 Jan 2026 02:54:34 +0000
Subject: [PATCH 59/69] Add dedicated test for `__amdgpu_feature_predicate_t`.

---
 .../test/SemaHIP/amdgpu-feature-predicate.hip | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 clang/test/SemaHIP/amdgpu-feature-predicate.hip

diff --git a/clang/test/SemaHIP/amdgpu-feature-predicate.hip b/clang/test/SemaHIP/amdgpu-feature-predicate.hip
new file mode 100644
index 0000000000000..f3d333d63e51d
--- /dev/null
+++ b/clang/test/SemaHIP/amdgpu-feature-predicate.hip
@@ -0,0 +1,40 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: spirv-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ void foo(__amdgpu_feature_predicate_t l) {
+  decltype(__builtin_amdgcn_processor_is("gfx900")) what; // expected-error {{'what' has type __amdgpu_feature_predicate_t, which is not constructible}}
+  typeof(__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep)) why; // expected-error {{'why' has type __amdgpu_feature_predicate_t, which is not constructible}}
+
+  bool b = true;
+  __amdgpu_feature_predicate_t v = false; // expected-error {{'v' has type __amdgpu_feature_predicate_t, which is not constructible}}
+  static_cast<__amdgpu_feature_predicate_t>(b); // expected-error {{static_cast from 'bool' to '__amdgpu_feature_predicate_t' is not allowed}}
+  dynamic_cast<__amdgpu_feature_predicate_t>(b); // expected-error {{invalid target type '__amdgpu_feature_predicate_t' for dynamic_cast; target type must be a reference or pointer type to a defined class}}
+  reinterpret_cast<__amdgpu_feature_predicate_t>(b); // expected-error {{reinterpret_cast from 'bool' to '__amdgpu_feature_predicate_t' is not allowed}}
+  __amdgpu_feature_predicate_t k; // expected-error {{'k' has type __amdgpu_feature_predicate_t, which is not constructible}}
+  int *ip = (int *)l; // expected-error {{cannot cast from type '__amdgpu_feature_predicate_t' to pointer type 'int *'}}
+  void *vp = (void *)l; // expected-error {{cannot cast from type '__amdgpu_feature_predicate_t' to pointer type 'void *'}}
+}
+
+__global__ void bar(__amdgpu_feature_predicate_t l) {
+  decltype(__builtin_amdgcn_processor_is("gfx900")) what; // expected-error {{'what' has type __amdgpu_feature_predicate_t, which is not constructible}}
+  typeof(__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep)) why; // expected-error {{'why' has type __amdgpu_feature_predicate_t, which is not constructible}}
+
+  bool b = true;
+  __amdgpu_feature_predicate_t v = false; // expected-error {{'v' has type __amdgpu_feature_predicate_t, which is not constructible}}
+  static_cast<__amdgpu_feature_predicate_t>(b); // expected-error {{static_cast from 'bool' to '__amdgpu_feature_predicate_t' is not allowed}}
+  dynamic_cast<__amdgpu_feature_predicate_t>(b); // expected-error {{invalid target type '__amdgpu_feature_predicate_t' for dynamic_cast; target type must be a reference or pointer type to a defined class}}
+  reinterpret_cast<__amdgpu_feature_predicate_t>(b); // expected-error {{reinterpret_cast from 'bool' to '__amdgpu_feature_predicate_t' is not allowed}}
+  __amdgpu_feature_predicate_t k; // expected-error {{'k' has type __amdgpu_feature_predicate_t, which is not constructible}}
+  int *ip = (int *)l; // expected-error {{cannot cast from type '__amdgpu_feature_predicate_t' to pointer type 'int *'}}
+  void *vp = (void *)l; // expected-error {{cannot cast from type '__amdgpu_feature_predicate_t' to pointer type 'void *'}}
+}
+
+static_assert(sizeof(__amdgpu_feature_predicate_t) == 0); // expected-error {{invalid application of 'sizeof' to sizeless type '__amdgpu_feature_predicate_t'}}
+static_assert(alignof(__amdgpu_feature_predicate_t) == 0); // expected-error {{invalid application of 'alignof' to sizeless type '__amdgpu_feature_predicate_t'}};

>From 5d0c1897a3aacc386b97f00159fced2398e3154e Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 30 Jan 2026 13:48:51 +0000
Subject: [PATCH 60/69] Unbreak some test.

---
 .../CodeGenOpenCL/builtins-amdgcn-gfx10.cl    | 33 ++++---
 .../CodeGenOpenCL/builtins-amdgcn-gfx11.cl    | 18 ++--
 .../test/CodeGenOpenCL/builtins-amdgcn-vi.cl  | 96 ++++++++++++-------
 clang/test/CodeGenOpenCL/builtins-amdgcn.cl   | 15 ++-
 4 files changed, 108 insertions(+), 54 deletions(-)

diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
index a4054cba236dd..f3884c1e2bea7 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
@@ -12,27 +12,31 @@ typedef unsigned long ulong;
 // CHECK-LABEL: @test_permlane16(
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
 void test_permlane16(global uint* out, uint a, uint b, uint c, uint d) {
-  *out = __builtin_amdgcn_permlane16(a, b, c, d, 0, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane16))
+    *out = __builtin_amdgcn_permlane16(a, b, c, d, 0, 0);
 }
 
 // CHECK-LABEL: @test_permlanex16(
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlanex16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
 void test_permlanex16(global uint* out, uint a, uint b, uint c, uint d) {
-  *out = __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16))
+    *out = __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0);
 }
 
 // CHECK-LABEL: @test_mov_dpp8_uint(
 // CHECK:      {{.*}}call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %a, i32 1)
 // CHECK-NEXT: store i32 %0,
 void test_mov_dpp8_uint(global uint* out, uint a) {
-  *out = __builtin_amdgcn_mov_dpp8(a, 1);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp8))
+    *out = __builtin_amdgcn_mov_dpp8(a, 1);
 }
 
 // CHECK-LABEL: @test_mov_dpp8_long(
 // CHECK:      {{.*}}call{{.*}} i64 @llvm.amdgcn.mov.dpp8.i64(i64 %a, i32 1)
 // CHECK-NEXT: store i64 %0,
 void test_mov_dpp8_long(global long* out, long a) {
-  *out = __builtin_amdgcn_mov_dpp8(a, 1);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp8))
+    *out = __builtin_amdgcn_mov_dpp8(a, 1);
 }
 
 // CHECK-LABEL: @test_mov_dpp8_float(
@@ -40,7 +44,8 @@ void test_mov_dpp8_long(global long* out, long a) {
 // CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
 // CHECK-NEXT: store i32 %1,
 void test_mov_dpp8_float(global float* out, float a) {
-  *out = __builtin_amdgcn_mov_dpp8(a, 1);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp8))
+    *out = __builtin_amdgcn_mov_dpp8(a, 1);
 }
 
 // CHECK-LABEL: @test_mov_dpp8_double
@@ -48,7 +53,8 @@ void test_mov_dpp8_float(global float* out, float a) {
 // CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.mov.dpp8.i64(i64 %0, i32 1)
 // CHECK-NEXT: store i64 %1,
 void test_mov_dpp8_double(double x, global double *p) {
-  *p = __builtin_amdgcn_mov_dpp8(x, 1);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp8))
+    *p = __builtin_amdgcn_mov_dpp8(x, 1);
 }
 
 // CHECK-LABEL: @test_mov_dpp8_short
@@ -57,7 +63,8 @@ void test_mov_dpp8_double(double x, global double *p) {
 // CHECK-NEXT: %2 = trunc i32 %1 to i16
 // CHECK-NEXT: store i16 %2,
 void test_mov_dpp8_short(short x, global short *p) {
-  *p = __builtin_amdgcn_mov_dpp8(x, 1);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp8))
+    *p = __builtin_amdgcn_mov_dpp8(x, 1);
 }
 
 // CHECK-LABEL: @test_mov_dpp8_char
@@ -66,7 +73,8 @@ void test_mov_dpp8_short(short x, global short *p) {
 // CHECK-NEXT: %2 = trunc i32 %1 to i8
 // CHECK-NEXT: store i8 %2,
 void test_mov_dpp8_char(char x, global char *p) {
-  *p = __builtin_amdgcn_mov_dpp8(x, 1);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp8))
+    *p = __builtin_amdgcn_mov_dpp8(x, 1);
 }
 
 // CHECK-LABEL: @test_mov_dpp8_half
@@ -76,14 +84,16 @@ void test_mov_dpp8_char(char x, global char *p) {
 // CHECK-NEXT: %3 = trunc i32 %2 to i16
 // CHECK-NEXT: store i16 %3,
 void test_mov_dpp8_half(half *x, global half *p) {
-  *p = __builtin_amdgcn_mov_dpp8(*x, 1);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp8))
+    *p = __builtin_amdgcn_mov_dpp8(*x, 1);
 }
 
 // CHECK-LABEL: @test_s_memtime
 // CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.s.memtime()
 void test_s_memtime(global ulong* out)
 {
-  *out = __builtin_amdgcn_s_memtime();
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_memtime))
+    *out = __builtin_amdgcn_s_memtime();
 }
 
 // CHECK-LABEL: @test_groupstaticsize
@@ -97,5 +107,6 @@ void test_groupstaticsize(global uint* out)
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.ballot.i32(i1 %{{.+}})
 void test_ballot_wave32(global uint* out, int a, int b)
 {
-  *out = __builtin_amdgcn_ballot_w32(a == b);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_ballot_w32))
+    *out = __builtin_amdgcn_ballot_w32(a == b);
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
index 7cd3f1417844c..e66187a1f69f0 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
@@ -17,13 +17,15 @@ typedef uint uint4 __attribute__((ext_vector_type(4)));
 // CHECK-LABEL: @test_s_sendmsg_rtn(
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 0)
 void test_s_sendmsg_rtn(global uint* out) {
-  *out = __builtin_amdgcn_s_sendmsg_rtn(0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sendmsg_rtn))
+    *out = __builtin_amdgcn_s_sendmsg_rtn(0);
 }
 
 // CHECK-LABEL: @test_s_sendmsg_rtnl(
 // CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 0)
 void test_s_sendmsg_rtnl(global ulong* out) {
-  *out = __builtin_amdgcn_s_sendmsg_rtnl(0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sendmsg_rtnl))
+    *out = __builtin_amdgcn_s_sendmsg_rtnl(0);
 }
 
 // CHECK-LABEL: @test_ds_bvh_stack_rtn(
@@ -34,19 +36,22 @@ void test_s_sendmsg_rtnl(global ulong* out) {
 // CHECK: %4 = insertelement <2 x i32> %3, i32 %2, i64 1
 void test_ds_bvh_stack_rtn(global uint2* out, uint addr, uint data, uint4 data1)
 {
-  *out = __builtin_amdgcn_ds_bvh_stack_rtn(addr, data, data1, 128);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_ds_bvh_stack_rtn))
+    *out = __builtin_amdgcn_ds_bvh_stack_rtn(addr, data, data1, 128);
 }
 
 // CHECK-LABEL: @test_permlane64(
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane64.i32(i32 %a)
 void test_permlane64(global uint* out, uint a) {
-  *out = __builtin_amdgcn_permlane64(a);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64))
+    *out = __builtin_amdgcn_permlane64(a);
 }
 
 // CHECK-LABEL: @test_s_wait_event_export_ready
 // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.wait.event.export.ready
 void test_s_wait_event_export_ready() {
-  __builtin_amdgcn_s_wait_event_export_ready();
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready))
+    __builtin_amdgcn_s_wait_event_export_ready();
 }
 
 // CHECK-LABEL: @test_global_add_f32
@@ -57,5 +62,6 @@ void test_global_add_f32(float *rtn, global float *addr, float x) {
 #else
 void test_global_add_f32(float *rtn, __attribute__((address_space(1))) float *addr, float x) {
 #endif
-  *rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_global_atomic_fadd_f32))
+    *rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
index faf6a7d44fee2..922d8f0dd4790 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -16,42 +16,48 @@ typedef unsigned int  uint;
 // CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.div.fixup.f16
 void test_div_fixup_f16(global half* out, half a, half b, half c)
 {
-  *out = __builtin_amdgcn_div_fixuph(a, b, c);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_div_fixuph))
+    *out = __builtin_amdgcn_div_fixuph(a, b, c);
 }
 
 // CHECK-LABEL: @test_rcp_f16
 // CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.rcp.f16
 void test_rcp_f16(global half* out, half a)
 {
-  *out = __builtin_amdgcn_rcph(a);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_rcph))
+    *out = __builtin_amdgcn_rcph(a);
 }
 
 // CHECK-LABEL: @test_sqrt_f16
 // CHECK: {{.*}}call{{.*}} half @llvm.{{((amdgcn.){0,1})}}sqrt.f16
 void test_sqrt_f16(global half* out, half a)
 {
-  *out = __builtin_amdgcn_sqrth(a);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_sqrth))
+    *out = __builtin_amdgcn_sqrth(a);
 }
 
 // CHECK-LABEL: @test_rsq_f16
 // CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.rsq.f16
 void test_rsq_f16(global half* out, half a)
 {
-  *out = __builtin_amdgcn_rsqh(a);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_rsqh))
+    *out = __builtin_amdgcn_rsqh(a);
 }
 
 // CHECK-LABEL: @test_sin_f16
 // CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.sin.f16
 void test_sin_f16(global half* out, half a)
 {
-  *out = __builtin_amdgcn_sinh(a);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_sinh))
+    *out = __builtin_amdgcn_sinh(a);
 }
 
 // CHECK-LABEL: @test_cos_f16
 // CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.cos.f16
 void test_cos_f16(global half* out, half a)
 {
-  *out = __builtin_amdgcn_cosh(a);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_cosh))
+    *out = __builtin_amdgcn_cosh(a);
 }
 
 // CHECK-LABEL: @test_ldexp_f16
@@ -59,63 +65,72 @@ void test_cos_f16(global half* out, half a)
 // CHECK: {{.*}}call{{.*}} half @llvm.ldexp.f16.i16(half %a, i16 [[TRUNC]])
 void test_ldexp_f16(global half* out, half a, int b)
 {
-  *out = __builtin_amdgcn_ldexph(a, b);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_ldexph))
+    *out = __builtin_amdgcn_ldexph(a, b);
 }
 
 // CHECK-LABEL: @test_frexp_mant_f16
 // CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.frexp.mant.f16
 void test_frexp_mant_f16(global half* out, half a)
 {
-  *out = __builtin_amdgcn_frexp_manth(a);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_frexp_manth))
+    *out = __builtin_amdgcn_frexp_manth(a);
 }
 
 // CHECK-LABEL: @test_frexp_exp_f16
 // CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.frexp.exp.i16.f16
 void test_frexp_exp_f16(global short* out, half a)
 {
-  *out = __builtin_amdgcn_frexp_exph(a);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_frexp_exph))
+    *out = __builtin_amdgcn_frexp_exph(a);
 }
 
 // CHECK-LABEL: @test_fract_f16
 // CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.fract.f16
 void test_fract_f16(global half* out, half a)
 {
-  *out = __builtin_amdgcn_fracth(a);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_fracth))
+    *out = __builtin_amdgcn_fracth(a);
 }
 
 // CHECK-LABEL: @test_class_f16
 // CHECK: {{.*}}call{{.*}} i1 @llvm.amdgcn.class.f16
 void test_class_f16(global half* out, half a, int b)
 {
-  *out = __builtin_amdgcn_classh(a, b);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_classh))
+    *out = __builtin_amdgcn_classh(a, b);
 }
 
 // CHECK-LABEL: @test_s_memrealtime
 // CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.s.memrealtime()
 void test_s_memrealtime(global ulong* out)
 {
-  *out = __builtin_amdgcn_s_memrealtime();
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_memrealtime))
+    *out = __builtin_amdgcn_s_memrealtime();
 }
 
 // CHECK-LABEL: @test_s_dcache_wb()
 // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.dcache.wb()
 void test_s_dcache_wb()
 {
-  __builtin_amdgcn_s_dcache_wb();
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_dcache_wb))
+    __builtin_amdgcn_s_dcache_wb();
 }
 
 // CHECK-LABEL: @test_mov_dpp_int
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %src, i32 0, i32 0, i32 0, i1 false)
 void test_mov_dpp_int(global int* out, int src)
 {
-  *out = __builtin_amdgcn_mov_dpp(src, 0, 0, 0, false);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp))
+    *out = __builtin_amdgcn_mov_dpp(src, 0, 0, 0, false);
 }
 
 // CHECK-LABEL: @test_mov_dpp_long
 // CHECK:      %0 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %x, i32 257, i32 15, i32 15, i1 false)
 // CHECK-NEXT: store i64 %0,
 void test_mov_dpp_long(long x, global long *p) {
-  *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp))
+    *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_mov_dpp_float
@@ -123,7 +138,8 @@ void test_mov_dpp_long(long x, global long *p) {
 // CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
 // CHECK-NEXT: store i32 %1,
 void test_mov_dpp_float(float x, global float *p) {
-  *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp))
+    *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_mov_dpp_double
@@ -131,7 +147,8 @@ void test_mov_dpp_float(float x, global float *p) {
 // CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %0, i32 257, i32 15, i32 15, i1 false)
 // CHECK-NEXT: store i64 %1,
 void test_mov_dpp_double(double x, global double *p) {
-  *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp))
+    *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_mov_dpp_short
@@ -140,7 +157,8 @@ void test_mov_dpp_double(double x, global double *p) {
 // CHECK-NEXT: %2 = trunc i32 %1 to i16
 // CHECK-NEXT: store i16 %2,
 void test_mov_dpp_short(short x, global short *p) {
-  *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp))
+    *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_mov_dpp_char
@@ -149,7 +167,8 @@ void test_mov_dpp_short(short x, global short *p) {
 // CHECK-NEXT: %2 = trunc i32 %1 to i8
 // CHECK-NEXT: store i8 %2,
 void test_mov_dpp_char(char x, global char *p) {
-  *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp))
+    *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_mov_dpp_half
@@ -159,21 +178,24 @@ void test_mov_dpp_char(char x, global char *p) {
 // CHECK-NEXT: %3 = trunc i32 %2 to i16
 // CHECK-NEXT: store i16 %3,
 void test_mov_dpp_half(half *x, global half *p) {
-  *p = __builtin_amdgcn_mov_dpp(*x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp))
+    *p = __builtin_amdgcn_mov_dpp(*x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_update_dpp_int
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %arg1, i32 %arg2, i32 0, i32 0, i32 0, i1 false)
 void test_update_dpp_int(global int* out, int arg1, int arg2)
 {
-  *out = __builtin_amdgcn_update_dpp(arg1, arg2, 0, 0, 0, false);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *out = __builtin_amdgcn_update_dpp(arg1, arg2, 0, 0, 0, false);
 }
 
 // CHECK-LABEL: @test_update_dpp_long
 // CHECK:      %0 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %x, i32 257, i32 15, i32 15, i1 false)
 // CHECK-NEXT: store i64 %0,
 void test_update_dpp_long(long x, global long *p) {
-  *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_update_dpp_float
@@ -181,7 +203,8 @@ void test_update_dpp_long(long x, global long *p) {
 // CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
 // CHECK-NEXT: store i32 %1,
 void test_update_dpp_float(float x, global float *p) {
-  *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_update_dpp_double
@@ -189,7 +212,8 @@ void test_update_dpp_float(float x, global float *p) {
 // CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %0, i64 %0, i32 257, i32 15, i32 15, i1 false)
 // CHECK-NEXT: store i64 %1,
 void test_update_dpp_double(double x, global double *p) {
-  *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_update_dpp_short
@@ -198,7 +222,8 @@ void test_update_dpp_double(double x, global double *p) {
 // CHECK-NEXT: %2 = trunc i32 %1 to i16
 // CHECK-NEXT: store i16 %2,
 void test_update_dpp_short(short x, global short *p) {
-  *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_update_dpp_char
@@ -207,7 +232,8 @@ void test_update_dpp_short(short x, global short *p) {
 // CHECK-NEXT: %2 = trunc i32 %1 to i8
 // CHECK-NEXT: store i8 %2,
 void test_update_dpp_char(char x, global char *p) {
-  *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_update_dpp_half
@@ -217,21 +243,24 @@ void test_update_dpp_char(char x, global char *p) {
 // CHECK-NEXT: %3 = trunc i32 %2 to i16
 // CHECK-NEXT: store i16 %3,
 void test_update_dpp_half(half *x, global half *p) {
-  *p = __builtin_amdgcn_update_dpp(*x, *x, 0x101, 0xf, 0xf, 0);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *p = __builtin_amdgcn_update_dpp(*x, *x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_update_dpp_int_uint
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %arg1, i32 %arg2, i32 0, i32 0, i32 0, i1 false)
 void test_update_dpp_int_uint(global int* out, int arg1, unsigned int arg2)
 {
-  *out = __builtin_amdgcn_update_dpp(arg1, arg2, 0, 0, 0, false);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *out = __builtin_amdgcn_update_dpp(arg1, arg2, 0, 0, 0, false);
 }
 
 // CHECK-LABEL: @test_update_dpp_lit_int
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 5, i32 %arg1, i32 0, i32 0, i32 0, i1 false)
 void test_update_dpp_lit_int(global int* out, int arg1)
 {
-  *out = __builtin_amdgcn_update_dpp(5, arg1, 0, 0, 0, false);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *out = __builtin_amdgcn_update_dpp(5, arg1, 0, 0, 0, false);
 }
 
 __constant int gi = 5;
@@ -240,7 +269,8 @@ __constant int gi = 5;
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 5, i32 %arg1, i32 0, i32 0, i32 0, i1 false)
 void test_update_dpp_const_int(global int* out, int arg1)
 {
-  *out = __builtin_amdgcn_update_dpp(gi, arg1, 0, 0, 0, false);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *out = __builtin_amdgcn_update_dpp(gi, arg1, 0, 0, 0, false);
 }
 
 // CHECK-LABEL: @test_ds_fadd
@@ -397,14 +427,16 @@ void test_ds_fmaxf(__attribute__((address_space(3))) float *out, float src) {
 // CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.s.memtime()
 void test_s_memtime(global ulong* out)
 {
-  *out = __builtin_amdgcn_s_memtime();
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_memtime))
+    *out = __builtin_amdgcn_s_memtime();
 }
 
 // CHECK-LABEL: @test_perm
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.perm(i32 %a, i32 %b, i32 %s)
 void test_perm(global uint* out, uint a, uint b, uint s)
 {
-  *out = __builtin_amdgcn_perm(a, b, s);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_perm))
+    *out = __builtin_amdgcn_perm(a, b, s);
 }
 
 // CHECK-LABEL: @test_groupstaticsize
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 04140ed3f10b0..9664ef18580fc 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -1123,31 +1123,36 @@ kernel void test_ds_consume_lds(__attribute__((address_space(1))) int* out, __at
 // CHECK-LABEL: @test_gws_init(
 // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.ds.gws.init(i32 %value, i32 %id)
 kernel void test_gws_init(uint value, uint id) {
-  __builtin_amdgcn_ds_gws_init(value, id);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_ds_gws_init))
+    __builtin_amdgcn_ds_gws_init(value, id);
 }
 
 // CHECK-LABEL: @test_gws_barrier(
 // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.ds.gws.barrier(i32 %value, i32 %id)
 kernel void test_gws_barrier(uint value, uint id) {
-  __builtin_amdgcn_ds_gws_barrier(value, id);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_ds_gws_barrier))
+    __builtin_amdgcn_ds_gws_barrier(value, id);
 }
 
 // CHECK-LABEL: @test_gws_sema_v(
 // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.ds.gws.sema.v(i32 %id)
 kernel void test_gws_sema_v(uint id) {
-  __builtin_amdgcn_ds_gws_sema_v(id);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_ds_gws_sema_v))
+    __builtin_amdgcn_ds_gws_sema_v(id);
 }
 
 // CHECK-LABEL: @test_gws_sema_br(
 // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.ds.gws.sema.br(i32 %value, i32 %id)
 kernel void test_gws_sema_br(uint value, uint id) {
-  __builtin_amdgcn_ds_gws_sema_br(value, id);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_ds_gws_sema_br))
+    __builtin_amdgcn_ds_gws_sema_br(value, id);
 }
 
 // CHECK-LABEL: @test_gws_sema_p(
 // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.ds.gws.sema.p(i32 %id)
 kernel void test_gws_sema_p(uint id) {
-  __builtin_amdgcn_ds_gws_sema_p(id);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_ds_gws_sema_p))
+    __builtin_amdgcn_ds_gws_sema_p(id);
 }
 
 // CHECK-LABEL: @test_mbcnt_lo(

>From 47cc6d75ba746a392e0362d357a3466afc8d4a5e Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 30 Jan 2026 13:50:40 +0000
Subject: [PATCH 61/69] Add missing file.

---
 clang/lib/AST/Type.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 53082bcf78f6a..342fefdda1f4e 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2541,6 +2541,9 @@ bool Type::isSizelessBuiltinType() const {
       // HLSL intangible types
 #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/HLSLIntangibleTypes.def"
+      // AMDGPU feature predicate type
+    case BuiltinType::AMDGPUFeaturePredicate:
+// #include "clang/Basic/AMDGPUTypes.def"
       return true;
     default:
       return false;

>From a6020cf92fc5aa1603b15d37311646a07caa8f2c Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 30 Jan 2026 13:51:30 +0000
Subject: [PATCH 62/69] Remove noise.

---
 clang/lib/AST/Type.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 342fefdda1f4e..7767d54e50f9b 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2543,7 +2543,6 @@ bool Type::isSizelessBuiltinType() const {
 #include "clang/Basic/HLSLIntangibleTypes.def"
       // AMDGPU feature predicate type
     case BuiltinType::AMDGPUFeaturePredicate:
-// #include "clang/Basic/AMDGPUTypes.def"
       return true;
     default:
       return false;

>From 62bd27a3c8df22413149a551773d3c5cc73c940e Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 30 Jan 2026 14:29:10 +0000
Subject: [PATCH 63/69] Unbreak more tests.

---
 .../test/CodeGenCUDA/builtins-spirv-amdgcn.cu | 226 +++++++++++++++++-
 .../spirv-amdgcn-dpp-const-fold.hip           |  12 +-
 2 files changed, 227 insertions(+), 11 deletions(-)

diff --git a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
index 1cbe358910b85..dbb8ce2bd0773 100644
--- a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
+++ b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
@@ -5,7 +5,7 @@
 
 // RUN: %clang_cc1 -triple spirv64-amd-amdhsa -x hip \
 // RUN:  -aux-triple x86_64-pc-windows-msvc -fcuda-is-device -emit-llvm %s \
-// RUN:  -o - | FileCheck %s
+// RUN:  -o - | FileCheck %s --check-prefix=AMDGCNSPIRV
 
 #include "Inputs/cuda.h"
 
@@ -28,6 +28,25 @@
 // CHECK-NEXT:    store i32 [[TMP2]], ptr addrspace(4) [[TMP3]], align 4
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z16use_dispatch_ptrPi(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[OUT:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[DISPATCH_PTR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[DISPATCH_PTR_ASCAST:%.*]] = addrspacecast ptr [[DISPATCH_PTR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = call align 4 dereferenceable(64) addrspace(4) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4
+// AMDGCNSPIRV-NEXT:    [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store i32 [[TMP2]], ptr addrspace(4) [[TMP3]], align 4
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void use_dispatch_ptr(int* out) {
   const int* dispatch_ptr = (const int*)__builtin_amdgcn_dispatch_ptr();
   *out = *dispatch_ptr;
@@ -52,6 +71,25 @@ __global__ void use_dispatch_ptr(int* out) {
 // CHECK-NEXT:    store i32 [[TMP2]], ptr addrspace(4) [[TMP3]], align 4
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z13use_queue_ptrPi(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[OUT:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[QUEUE_PTR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[QUEUE_PTR_ASCAST:%.*]] = addrspacecast ptr [[QUEUE_PTR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4
+// AMDGCNSPIRV-NEXT:    [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store i32 [[TMP2]], ptr addrspace(4) [[TMP3]], align 4
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void use_queue_ptr(int* out) {
   const int* queue_ptr = (const int*)__builtin_amdgcn_queue_ptr();
   *out = *queue_ptr;
@@ -76,14 +114,30 @@ __global__ void use_queue_ptr(int* out) {
 // CHECK-NEXT:    store i32 [[TMP2]], ptr addrspace(4) [[TMP3]], align 4
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z19use_implicitarg_ptrPi(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[OUT:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[IMPLICITARG_PTR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[IMPLICITARG_PTR_ASCAST:%.*]] = addrspacecast ptr [[IMPLICITARG_PTR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4
+// AMDGCNSPIRV-NEXT:    [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store i32 [[TMP2]], ptr addrspace(4) [[TMP3]], align 4
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void use_implicitarg_ptr(int* out) {
   const int* implicitarg_ptr = (const int*)__builtin_amdgcn_implicitarg_ptr();
   *out = *implicitarg_ptr;
 }
 
-__global__
-    //
-    void
 // CHECK-LABEL: @_Z12test_ds_fmaxf(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca float, align 4
@@ -96,7 +150,21 @@ __global__
 // CHECK-NEXT:    store volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4
 // CHECK-NEXT:    ret void
 //
-    test_ds_fmax(float src) {
+// AMDGCNSPIRV-LABEL: @_Z12test_ds_fmaxf(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[SRC_ADDR:%.*]] = alloca float, align 4
+// AMDGCNSPIRV-NEXT:    [[X:%.*]] = alloca float, align 4
+// AMDGCNSPIRV-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = atomicrmw fmax ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]] monotonic, align 4
+// AMDGCNSPIRV-NEXT:    store volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    ret void
+//
+__global__ void test_ds_fmax(float src) {
+//
+//
   __shared__ float shared;
   volatile float x = __builtin_amdgcn_ds_fmaxf(&shared, src, 0, 0, false);
 }
@@ -113,6 +181,18 @@ __global__
 // CHECK-NEXT:    store volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z12test_ds_faddf(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[SRC_ADDR:%.*]] = alloca float, align 4
+// AMDGCNSPIRV-NEXT:    [[X:%.*]] = alloca float, align 4
+// AMDGCNSPIRV-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]] monotonic, align 4
+// AMDGCNSPIRV-NEXT:    store volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void test_ds_fadd(float src) {
   __shared__ float shared;
   volatile float x = __builtin_amdgcn_ds_faddf(&shared, src, 0, 0, false);
@@ -139,6 +219,27 @@ __global__ void test_ds_fadd(float src) {
 // CHECK-NEXT:    store volatile float [[TMP3]], ptr addrspace(4) [[X_ASCAST]], align 4
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z12test_ds_fminfPf(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[SHARED:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[SRC_ADDR:%.*]] = alloca float, align 4
+// AMDGCNSPIRV-NEXT:    [[SHARED_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[X:%.*]] = alloca float, align 4
+// AMDGCNSPIRV-NEXT:    [[SHARED_ASCAST:%.*]] = addrspacecast ptr [[SHARED]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(4) [[SHARED_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr addrspace(3)
+// AMDGCNSPIRV-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    [[TMP3:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4
+// AMDGCNSPIRV-NEXT:    store volatile float [[TMP3]], ptr addrspace(4) [[X_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void test_ds_fmin(float src, float *shared) {
   volatile float x = __builtin_amdgcn_ds_fminf(shared, src, 0, 0, false);
 }
@@ -155,6 +256,11 @@ __device__ void test_ret_builtin_nondef_addrspace() {
 // CHECK-NEXT:    call addrspace(4) void @llvm.amdgcn.endpgm()
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z6endpgmv(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    call addrspace(4) void @llvm.amdgcn.endpgm()
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void endpgm() {
   __builtin_amdgcn_endpgm();
 }
@@ -183,6 +289,28 @@ __global__ void endpgm() {
 // CHECK-NEXT:    store i64 [[TMP2]], ptr addrspace(4) [[TMP3]], align 8
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z14test_uicmp_i64Pyyy(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[OUT:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// AMDGCNSPIRV-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
+// AMDGCNSPIRV-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store i64 [[A:%.*]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store i64 [[B:%.*]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = load i64, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(4) [[B_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP2:%.*]] = call addrspace(4) i64 @llvm.amdgcn.icmp.i64.i64(i64 [[TMP0]], i64 [[TMP1]], i32 35)
+// AMDGCNSPIRV-NEXT:    [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store i64 [[TMP2]], ptr addrspace(4) [[TMP3]], align 8
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, unsigned long long b)
 {
   *out = __builtin_amdgcn_uicmpl(a, b, 30+5);
@@ -199,14 +327,39 @@ __global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, un
 // CHECK-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
 // CHECK-NEXT:    store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[HAS_S_MEMTIME_INST_:%.*]] = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+// CHECK-NEXT:    br i1 [[HAS_S_MEMTIME_INST_]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+// CHECK:       if.then:
 // CHECK-NEXT:    [[TMP0:%.*]] = call addrspace(4) i64 @llvm.amdgcn.s.memtime()
 // CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
 // CHECK-NEXT:    store i64 [[TMP0]], ptr addrspace(4) [[TMP1]], align 8
+// CHECK-NEXT:    br label [[IF_END]]
+// CHECK:       if.end:
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z14test_s_memtimePy(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[OUT:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[HAS_S_MEMTIME_INST_:%.*]] = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+// AMDGCNSPIRV-NEXT:    br i1 [[HAS_S_MEMTIME_INST_]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+// AMDGCNSPIRV:       if.then:
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = call addrspace(4) i64 @llvm.amdgcn.s.memtime()
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store i64 [[TMP0]], ptr addrspace(4) [[TMP1]], align 8
+// AMDGCNSPIRV-NEXT:    br label [[IF_END]]
+// AMDGCNSPIRV:       if.end:
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void test_s_memtime(unsigned long long* out)
 {
-  *out = __builtin_amdgcn_s_memtime();
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_memtime))
+    *out = __builtin_amdgcn_s_memtime();
 }
 
 // Check a generic pointer can be passed as a shared pointer and a generic pointer.
@@ -232,9 +385,32 @@ __device__ void func(float *x);
 // CHECK-NEXT:    [[TMP3:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4
 // CHECK-NEXT:    store volatile float [[TMP3]], ptr addrspace(4) [[X_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT:    call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP4]]) #[[ATTR6:[0-9]+]]
+// CHECK-NEXT:    call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP4]]) #[[ATTR7:[0-9]+]]
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z17test_ds_fmin_funcfPf(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[SHARED:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[SRC_ADDR:%.*]] = alloca float, align 4
+// AMDGCNSPIRV-NEXT:    [[SHARED_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[X:%.*]] = alloca float, align 4
+// AMDGCNSPIRV-NEXT:    [[SHARED_ASCAST:%.*]] = addrspacecast ptr [[SHARED]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(4) [[SHARED_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr addrspace(3)
+// AMDGCNSPIRV-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    [[TMP3:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4
+// AMDGCNSPIRV-NEXT:    store volatile float [[TMP3]], ptr addrspace(4) [[X_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT:    [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP4]]) #[[ATTR7:[0-9]+]]
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void test_ds_fmin_func(float src, float *__restrict shared) {
   volatile float x = __builtin_amdgcn_ds_fminf(shared, src, 0, 0, false);
   func(shared);
@@ -258,6 +434,24 @@ __global__ void test_ds_fmin_func(float src, float *__restrict shared) {
 // CHECK-NEXT:    store i8 [[STOREDV]], ptr addrspace(4) [[RET_ASCAST]], align 1
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z14test_is_sharedPf(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[X:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[RET:%.*]] = alloca i8, align 1
+// AMDGCNSPIRV-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(4) [[X_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+// AMDGCNSPIRV-NEXT:    [[TMP2:%.*]] = call addrspace(4) i1 @llvm.amdgcn.is.shared(ptr [[TMP1]])
+// AMDGCNSPIRV-NEXT:    [[STOREDV:%.*]] = zext i1 [[TMP2]] to i8
+// AMDGCNSPIRV-NEXT:    store i8 [[STOREDV]], ptr addrspace(4) [[RET_ASCAST]], align 1
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void test_is_shared(float *x){
   bool ret = __builtin_amdgcn_is_shared(x);
 }
@@ -280,6 +474,24 @@ __global__ void test_is_shared(float *x){
 // CHECK-NEXT:    store i8 [[STOREDV]], ptr addrspace(4) [[RET_ASCAST]], align 1
 // CHECK-NEXT:    ret void
 //
+// AMDGCNSPIRV-LABEL: @_Z15test_is_privatePi(
+// AMDGCNSPIRV-NEXT:  entry:
+// AMDGCNSPIRV-NEXT:    [[X:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT:    [[RET:%.*]] = alloca i8, align 1
+// AMDGCNSPIRV-NEXT:    [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(4) [[X_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+// AMDGCNSPIRV-NEXT:    [[TMP2:%.*]] = call addrspace(4) i1 @llvm.amdgcn.is.private(ptr [[TMP1]])
+// AMDGCNSPIRV-NEXT:    [[STOREDV:%.*]] = zext i1 [[TMP2]] to i8
+// AMDGCNSPIRV-NEXT:    store i8 [[STOREDV]], ptr addrspace(4) [[RET_ASCAST]], align 1
+// AMDGCNSPIRV-NEXT:    ret void
+//
 __global__ void test_is_private(int *x){
   bool ret = __builtin_amdgcn_is_private(x);
 }
diff --git a/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip b/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip
index 71270bc1c68d8..63391220fd0e6 100644
--- a/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip
+++ b/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip
@@ -24,23 +24,27 @@ constexpr static bool BountCtrl()
 // CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 16, i32 0, i32 0, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_2(int* out, int a, int b)
 {
-  *out = __builtin_amdgcn_update_dpp(a, b, OpCtrl(), 0, 0, false);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *out = __builtin_amdgcn_update_dpp(a, b, OpCtrl(), 0, 0, false);
 }
 
 // CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 4, i32 0, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_3(int* out, int a, int b)
 {
-  *out = __builtin_amdgcn_update_dpp(a, b, 0, RowMask(), 0, false);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *out = __builtin_amdgcn_update_dpp(a, b, 0, RowMask(), 0, false);
 }
 
 // CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 0, i32 3, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_4(int* out, int a, int b)
 {
-  *out = __builtin_amdgcn_update_dpp(a, b, 0, 0, BankMask(), false);
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *out = __builtin_amdgcn_update_dpp(a, b, 0, 0, BankMask(), false);
 }
 
 // CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 0, i32 0, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_5(int* out, int a, int b)
 {
-  *out = __builtin_amdgcn_update_dpp(a, b, 0, 0, 0, BountCtrl());
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
+    *out = __builtin_amdgcn_update_dpp(a, b, 0, 0, 0, BountCtrl());
 }

>From 48bb06da0a44ae733036ba6501662653a0f66351 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Mon, 2 Feb 2026 01:52:53 +0000
Subject: [PATCH 64/69] Enhance diagnostics.

---
 clang/lib/Sema/SemaAMDGPU.cpp                 | 45 ++++++++++++-------
 ...> amdgpu-feature-predicates-guard-use.hip} | 36 +++++++++++++++
 2 files changed, 65 insertions(+), 16 deletions(-)
 rename clang/test/SemaHIP/{amdgpu-predicates-guard-use.hip => amdgpu-feature-predicates-guard-use.hip} (59%)

diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index d1f9170881a43..30868da3606ae 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -728,15 +728,25 @@ namespace {
 /// form: \c if(__builtin_amdgcn_is_invocable), we consider the then statement
 /// guarded.
 class DiagnoseUnguardedBuiltins : public DynamicRecursiveASTVisitor {
-  // TODO: this could eventually be extended to consider attributes such as
-  //       target.
+  // TODO: this could eventually be extended to consider what happens when there
+  //       are multiple target architectures specified via target("arch=gfxXXX")
+  //       target("arch=gfxyyy") etc., as well as feature disabling via "-XXX".
   Sema &SemaRef;
 
-  SmallVector<std::pair<CallExpr *, StringRef>> CurrentGFXIP;
-  SmallVector<std::pair<unsigned, StringRef>> GuardedBuiltins;
+  SmallVector<StringRef> TargetFeatures;
+  SmallVector<std::pair<SourceLocation, StringRef>> CurrentGFXIP;
+  SmallVector<unsigned> GuardedBuiltins;
 
 public:
-  DiagnoseUnguardedBuiltins(Sema &SemaRef) : SemaRef(SemaRef) {}
+  DiagnoseUnguardedBuiltins(Sema &SemaRef) : SemaRef(SemaRef) {
+    if (auto *TAT = SemaRef.getCurFunctionDecl(true)->getAttr<TargetAttr>()) {
+      // We use the somewhat misnamed x86 accessors because they provide exactly
+      // what we require.
+      TAT->getX86AddedFeatures(TargetFeatures);
+      if (auto GFXIP = TAT->getX86Architecture())
+        CurrentGFXIP.emplace_back(TAT->getLocation(), *GFXIP);
+    }
+  }
 
   bool TraverseLambdaExpr(LambdaExpr *LE) override {
     if (SemaRef.AMDGPU().HasPotentiallyUnguardedBuiltinUsage(
@@ -791,16 +801,16 @@ bool DiagnoseUnguardedBuiltins::TraverseIfStmt(IfStmt *If) {
         SemaRef.Diag(CE->getExprLoc(),
                      diag::err_amdgcn_conflicting_is_processor_options)
             << CE;
-        SemaRef.Diag(CurrentGFXIP.back().first->getExprLoc(),
+        SemaRef.Diag(CurrentGFXIP.back().first,
                      diag::note_amdgcn_previous_is_processor_guard);
       }
-      CurrentGFXIP.emplace_back(CE, G);
+      CurrentGFXIP.emplace_back(CE->getExprLoc(), G);
     } else {
       auto *FD = cast<FunctionDecl>(
           cast<DeclRefExpr>(CE->getArg(0))->getReferencedDeclOfCallee());
       unsigned ID = FD->getBuiltinID();
       StringRef F = SemaRef.getASTContext().BuiltinInfo.getRequiredFeatures(ID);
-      GuardedBuiltins.emplace_back(ID, F);
+      GuardedBuiltins.push_back(ID);
     }
 
     bool Continue = TraverseStmt(If->getThen());
@@ -830,26 +840,29 @@ bool DiagnoseUnguardedBuiltins::VisitAsmStmt(AsmStmt *ASM) {
 
 bool DiagnoseUnguardedBuiltins::VisitCallExpr(CallExpr *CE) {
   unsigned ID = CE->getBuiltinCallee();
+  Builtin::Context &BInfo = SemaRef.getASTContext().BuiltinInfo;
 
   if (!ID)
     return true;
-  if (!SemaRef.getASTContext().BuiltinInfo.isTSBuiltin(ID))
+  if (!BInfo.isTSBuiltin(ID))
     return true;
   if (ID == AMDGPU::BI__builtin_amdgcn_processor_is ||
       ID == AMDGPU::BI__builtin_amdgcn_is_invocable)
     return true;
-  if (llvm::any_of(GuardedBuiltins, [ID](auto &&B) { return B.first == ID; }))
+  if (llvm::find(GuardedBuiltins, ID) != GuardedBuiltins.end())
     return true;
 
-  StringRef FL(SemaRef.getASTContext().BuiltinInfo.getRequiredFeatures(ID));
+  StringRef FL(BInfo.getRequiredFeatures(ID));
   llvm::StringMap<bool> FeatureMap;
   if (CurrentGFXIP.empty()) {
-    for (auto &&[ID, RequiredFeatures] : GuardedBuiltins)
-      for (auto &&F : llvm::split(RequiredFeatures, ','))
+    for (auto &&F : TargetFeatures)
+      FeatureMap[F] = true;
+    for (auto &&GID : GuardedBuiltins)
+      for (auto &&F : llvm::split(BInfo.getRequiredFeatures(GID), ','))
         FeatureMap[F] = true;
   } else {
-    llvm::AMDGPU::fillAMDGPUFeatureMap(CurrentGFXIP.back().second,
-                                       llvm::Triple("amdgcn-amd-amdhsa"),
+    static const llvm::Triple AMDGCN("amdgcn-amd-amdhsa");
+    llvm::AMDGPU::fillAMDGPUFeatureMap(CurrentGFXIP.back().second, AMDGCN,
                                        FeatureMap);
   }
 
@@ -863,7 +876,7 @@ bool DiagnoseUnguardedBuiltins::VisitCallExpr(CallExpr *CE) {
     SemaRef.Diag(CE->getExprLoc(), diag::err_amdgcn_incompatible_builtin)
         << CE->getDirectCallee() << FL << !CurrentGFXIP.empty() << GFXIP;
     if (!CurrentGFXIP.empty())
-      SemaRef.Diag(CurrentGFXIP.back().first->getExprLoc(),
+      SemaRef.Diag(CurrentGFXIP.back().first,
                    diag::note_amdgcn_previous_is_processor_guard);
   }
 
diff --git a/clang/test/SemaHIP/amdgpu-predicates-guard-use.hip b/clang/test/SemaHIP/amdgpu-feature-predicates-guard-use.hip
similarity index 59%
rename from clang/test/SemaHIP/amdgpu-predicates-guard-use.hip
rename to clang/test/SemaHIP/amdgpu-feature-predicates-guard-use.hip
index 2c79ccc724c99..7418be2aa4c43 100644
--- a/clang/test/SemaHIP/amdgpu-predicates-guard-use.hip
+++ b/clang/test/SemaHIP/amdgpu-feature-predicates-guard-use.hip
@@ -68,3 +68,39 @@ __device__ void f(int x, bool b) {
         }
     }
 }
+
+__attribute__((target("arch=gfx1030")))
+__device__ void h(int x) {
+    if (__builtin_amdgcn_processor_is("gfx1030")) // Fine, same processor
+        return;
+
+    long v15_16;
+    __asm volatile("v_lshlrev_b64 v[15:16], 0, %0" : "={v[15:16]}"(v15_16) : "v"(x)); // "Fine", explicit gfx target
+
+    __builtin_amdgcn_s_ttracedata_imm(42); // expected-warning {{'__builtin_amdgcn_s_ttracedata_imm' might be unavailable on some AMDGPU targets}}
+    // expected-note@-1 {{enclose '__builtin_amdgcn_s_ttracedata_imm' in a __builtin_amdgcn_is_invocable check to silence this warning}}
+
+    __builtin_amdgcn_s_barrier_signal_isfirst(42); // expected-error {{'__builtin_amdgcn_s_barrier_signal_isfirst' cannot be invoked in the current context, as it requires the 'gfx12-insts' feature(s), which 'gfx1030' does not provide}}
+    // expected-note@-12 {{predicate guard, with establishes the context, inserted here}}
+
+    if (__builtin_amdgcn_processor_is("gfx906")) // expected-error {{conflicting check for AMDGCN processor '__builtin_amdgcn_processor_is("gfx906")' found in a scope already controlled by a check for AMDGCN processor}}
+    // expected-note@-15 {{predicate guard, with establishes the context, inserted here}}
+        __builtin_trap();
+}
+
+__attribute__((target("gfx11-insts")))
+__device__ void i(int x) {
+    __builtin_amdgcn_s_wait_event_export_ready(); // expected-warning {{'__builtin_amdgcn_s_wait_event_export_ready' might be unavailable on some AMDGPU targets}}
+    // expected-note@-1 {{enclose '__builtin_amdgcn_s_wait_event_export_ready' in a __builtin_amdgcn_is_invocable check to silence this warning}}
+
+    __builtin_amdgcn_s_barrier_signal_isfirst(42); // expected-error {{'__builtin_amdgcn_s_barrier_signal_isfirst' cannot be invoked in the current context, as it requires the 'gfx12-insts' feature(s)}}
+}
+
+__attribute__((target("gfx11-insts,gfx12-insts")))
+__device__ void j(int x) {
+    __builtin_amdgcn_s_wait_event_export_ready(); // expected-warning {{'__builtin_amdgcn_s_wait_event_export_ready' might be unavailable on some AMDGPU targets}}
+    // expected-note@-1 {{enclose '__builtin_amdgcn_s_wait_event_export_ready' in a __builtin_amdgcn_is_invocable check to silence this warning}}
+
+    __builtin_amdgcn_s_barrier_signal_isfirst(42); // expected-warning {{'__builtin_amdgcn_s_barrier_signal_isfirst' might be unavailable on some AMDGPU targets}}
+    // expected-note@-1 {{enclose '__builtin_amdgcn_s_barrier_signal_isfirst' in a __builtin_amdgcn_is_invocable check to silence this warning}}
+}

>From 875bf6899e6834c508068daea8c84905cde26cae Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 6 Feb 2026 14:25:12 +0000
Subject: [PATCH 65/69] Update llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adopt stylistic suggestion.

Co-authored-by: Juan Manuel Martinez CaamaƱo <jmartinezcaamao at gmail.com>
---
 llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp
index 45016f19cd64b..4c8bfdf4ed792 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp
@@ -87,15 +87,14 @@ bool tryAssignPredicateSpecConstIDs(Module &M, Function *F) {
   // Replace placeholder Specialisation Constant IDs with unique IDs associated
   // with the predicate being evaluated, which is encoded in the call name.
   for (auto &&U : F->users()) {
-    if (!isa<CallInst>(U))
+    auto* CI = dyn_cast<CallInst>(U);
+    if (!CI)
       continue;
-
-    auto *CI = cast<CallInst>(U);
-
-    if (!isa<ConstantInt>(CI->getArgOperand(0)))
+    auto* Arg0 = dyn_cast<ConstantInt>(CI->getArgOperand(0));
+    if (!Arg0)
       continue;
 
-    unsigned ID = cast<ConstantInt>(CI->getArgOperand(0))->getZExtValue();
+    unsigned ID = Arg0->getZExtValue();
 
     if (ID != UINT32_MAX)
       continue;

>From b5a6fe302a0c279c503ff5acff556a91a57534cd Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Fri, 6 Feb 2026 18:11:18 +0000
Subject: [PATCH 66/69] Fix formatting.

---
 llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp
index 4c8bfdf4ed792..93f963ec70c69 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp
@@ -87,10 +87,10 @@ bool tryAssignPredicateSpecConstIDs(Module &M, Function *F) {
   // Replace placeholder Specialisation Constant IDs with unique IDs associated
   // with the predicate being evaluated, which is encoded in the call name.
   for (auto &&U : F->users()) {
-    auto* CI = dyn_cast<CallInst>(U);
+    auto *CI = dyn_cast<CallInst>(U);
     if (!CI)
       continue;
-    auto* Arg0 = dyn_cast<ConstantInt>(CI->getArgOperand(0));
+    auto *Arg0 = dyn_cast<ConstantInt>(CI->getArgOperand(0));
     if (!Arg0)
       continue;
 

>From ccc8f9472a5686eced06aea5b6bcd5a2fab6c6b2 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Sun, 8 Feb 2026 21:50:13 +0000
Subject: [PATCH 67/69] Do not rely on `CallInst` names, use metadata instead;
 handle negated predicates.

---
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp   |  19 ++-
 clang/lib/Sema/SemaAMDGPU.cpp                 | 147 +++++++++++-------
 clang/lib/Sema/SemaExpr.cpp                   |   8 +-
 .../CodeGen/amdgpu-builtin-is-invocable.c     |   5 +-
 .../CodeGen/amdgpu-builtin-processor-is.c     |   5 +-
 .../spirv-amdgcn-dpp-const-fold.hip           |   8 +-
 .../CodeGenOpenCL/builtins-amdgcn-gfx10.cl    |  42 ++---
 .../CodeGenOpenCL/builtins-amdgcn-gfx11.cl    |  10 +-
 .../test/CodeGenOpenCL/builtins-amdgcn-vi.cl  |  94 +++++------
 .../amdgpu-feature-predicates-guard-use.hip   |   9 ++
 llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp |   7 +-
 ...gcnspirv-feature-predicate-specconstant.ll | 111 +++++++------
 12 files changed, 270 insertions(+), 195 deletions(-)

diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 539a8dca33bed..2781fe0058f07 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -366,13 +366,20 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
   Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
 }
 
-static Value *GetOrInsertAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) {
+static Value *GetAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) {
   Function *SpecConstFn = CGF.getSpecConstantFunction(CGF.getContext().BoolTy);
   llvm::Type *SpecIdTy = SpecConstFn->getArg(0)->getType();
   Constant *SpecId = ConstantInt::getAllOnesValue(SpecIdTy);
-  return CGF.Builder.CreateCall(
-      SpecConstFn, {SpecId, ConstantInt::getFalse(CGF.getLLVMContext())},
-      Name + ".");
+  CallInst *Call = CGF.Builder.CreateCall(
+      SpecConstFn, {SpecId, ConstantInt::getFalse(CGF.getLLVMContext())});
+
+  // Encode the predicate as metadata, making it available to
+  // SPIRVPrepareGlobals.
+  LLVMContext &Ctx = CGF.getLLVMContext();
+  MDNode *Predicate = MDNode::get(Ctx, MDString::get(Ctx, Name.str()));
+  Call->setMetadata("llvm.amdgcn.feature.predicate", Predicate);
+
+  return Call;
 }
 
 static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
@@ -918,7 +925,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
            "__builtin_amdgcn_processor_is should never reach CodeGen for "
            "concrete targets!");
     StringRef Proc = cast<clang::StringLiteral>(E->getArg(0))->getString();
-    return GetOrInsertAMDGPUPredicate(*this, "is." + Proc);
+    return GetAMDGPUPredicate(*this, "is." + Proc);
   }
   case AMDGPU::BI__builtin_amdgcn_is_invocable: {
     assert(CGM.getTriple().isSPIRV() &&
@@ -928,7 +935,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
         cast<DeclRefExpr>(E->getArg(0))->getReferencedDeclOfCallee());
     StringRef RF =
         getContext().BuiltinInfo.getRequiredFeatures(FD->getBuiltinID());
-    return GetOrInsertAMDGPUPredicate(*this, "has." + RF);
+    return GetAMDGPUPredicate(*this, "has." + RF);
   }
   case AMDGPU::BI__builtin_amdgcn_read_exec:
     return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 30868da3606ae..d05cff4d71d21 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -667,7 +667,7 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) {
       SmallVector<StringRef, 32> ValidList;
       if (TI.getTriple().getVendor() == llvm::Triple::VendorType::AMD)
         TI.fillValidCPUList(ValidList);
-      else if (AuxTI) // Since the BI is present it must be and AMDGPU triple.
+      else if (AuxTI) // Since the BI is present it must be an AMDGPU triple.
         AuxTI->fillValidCPUList(ValidList);
       if (!ValidList.empty())
         Diag(Loc, diag::note_amdgcn_processor_is_valid_options)
@@ -737,6 +737,25 @@ class DiagnoseUnguardedBuiltins : public DynamicRecursiveASTVisitor {
   SmallVector<std::pair<SourceLocation, StringRef>> CurrentGFXIP;
   SmallVector<unsigned> GuardedBuiltins;
 
+  static Expr *FindPredicate(Expr *Cond) {
+    if (auto *CE = dyn_cast<CallExpr>(Cond)) {
+      if (CE->getBuiltinCallee() == AMDGPU::BI__builtin_amdgcn_is_invocable ||
+          CE->getBuiltinCallee() == AMDGPU::BI__builtin_amdgcn_processor_is)
+        return Cond;
+    } else if (auto *UO = dyn_cast<UnaryOperator>(Cond)) {
+      return FindPredicate(UO->getSubExpr());
+    } else if (auto *BO = dyn_cast<BinaryOperator>(Cond)) {
+      if ((Cond = FindPredicate(BO->getLHS())))
+        return Cond;
+      return FindPredicate(BO->getRHS());
+    }
+    return nullptr;
+  }
+
+  bool EnterPredicateGuardedContext(CallExpr *P);
+  void ExitPredicateGuardedContext(bool WasProcessorCheck);
+  bool TraverseGuardedStmt(Stmt *S, CallExpr *P);
+
 public:
   DiagnoseUnguardedBuiltins(Sema &SemaRef) : SemaRef(SemaRef) {
     if (auto *TAT = SemaRef.getCurFunctionDecl(true)->getAttr<TargetAttr>()) {
@@ -763,67 +782,91 @@ class DiagnoseUnguardedBuiltins : public DynamicRecursiveASTVisitor {
 
   void IssueDiagnostics(Stmt *S) { TraverseStmt(S); }
 
-  bool TraverseIfStmt(IfStmt *If) override;
+  bool TraverseIfStmt(IfStmt *If) override {
+    if (auto *CE = dyn_cast_or_null<CallExpr>(FindPredicate(If->getCond())))
+      return TraverseGuardedStmt(If, CE);
+    return DynamicRecursiveASTVisitor::TraverseIfStmt(If);
+  }
 
   bool TraverseCaseStmt(CaseStmt *CS) override {
     return TraverseStmt(CS->getSubStmt());
   }
 
+  bool TraverseConditionalOperator(ConditionalOperator *CO) override {
+    if (auto *CE = dyn_cast_or_null<CallExpr>(FindPredicate(CO->getCond())))
+      return TraverseGuardedStmt(CO, CE);
+    return DynamicRecursiveASTVisitor::TraverseConditionalOperator(CO);
+  }
+
   bool VisitAsmStmt(AsmStmt *ASM) override;
   bool VisitCallExpr(CallExpr *CE) override;
 };
 
-inline Expr *FindPredicate(Expr *Cond) {
-  if (auto *CE = dyn_cast<CallExpr>(Cond)) {
-    if (CE->getBuiltinCallee() == AMDGPU::BI__builtin_amdgcn_is_invocable ||
-        CE->getBuiltinCallee() == AMDGPU::BI__builtin_amdgcn_processor_is)
-      return Cond;
-  } else if (auto *UO = dyn_cast<UnaryOperator>(Cond)) {
-    return FindPredicate(UO->getSubExpr());
-  } else if (auto *BO = dyn_cast<BinaryOperator>(Cond)) {
-    if ((Cond = FindPredicate(BO->getLHS())))
-      return Cond;
-    return FindPredicate(BO->getRHS());
+bool DiagnoseUnguardedBuiltins::EnterPredicateGuardedContext(CallExpr *P) {
+  bool IsProcessorCheck =
+      P->getBuiltinCallee() == AMDGPU::BI__builtin_amdgcn_processor_is;
+
+  if (IsProcessorCheck) {
+    StringRef G = cast<clang::StringLiteral>(P->getArg(0))->getString();
+    // TODO: handle generic ISAs.
+    if (!CurrentGFXIP.empty() && G != CurrentGFXIP.back().second) {
+      SemaRef.Diag(P->getExprLoc(),
+                   diag::err_amdgcn_conflicting_is_processor_options)
+          << P;
+      SemaRef.Diag(CurrentGFXIP.back().first,
+                   diag::note_amdgcn_previous_is_processor_guard);
+    }
+    CurrentGFXIP.emplace_back(P->getExprLoc(), G);
+  } else {
+    auto *FD = cast<FunctionDecl>(
+        cast<DeclRefExpr>(P->getArg(0))->getReferencedDeclOfCallee());
+    unsigned ID = FD->getBuiltinID();
+    StringRef F = SemaRef.getASTContext().BuiltinInfo.getRequiredFeatures(ID);
+    GuardedBuiltins.push_back(ID);
   }
-  return nullptr;
-}
 
-bool DiagnoseUnguardedBuiltins::TraverseIfStmt(IfStmt *If) {
-  if (FindPredicate(If->getCond())) {
-    auto *CE = cast<CallExpr>(If->getCond());
-    bool IsProcessorCheck =
-        CE->getBuiltinCallee() == AMDGPU::BI__builtin_amdgcn_processor_is;
-
-    if (IsProcessorCheck) {
-      StringRef G = cast<clang::StringLiteral>(CE->getArg(0))->getString();
-      // TODO: handle generic ISAs.
-      if (!CurrentGFXIP.empty() && G != CurrentGFXIP.back().second) {
-        SemaRef.Diag(CE->getExprLoc(),
-                     diag::err_amdgcn_conflicting_is_processor_options)
-            << CE;
-        SemaRef.Diag(CurrentGFXIP.back().first,
-                     diag::note_amdgcn_previous_is_processor_guard);
-      }
-      CurrentGFXIP.emplace_back(CE->getExprLoc(), G);
-    } else {
-      auto *FD = cast<FunctionDecl>(
-          cast<DeclRefExpr>(CE->getArg(0))->getReferencedDeclOfCallee());
-      unsigned ID = FD->getBuiltinID();
-      StringRef F = SemaRef.getASTContext().BuiltinInfo.getRequiredFeatures(ID);
-      GuardedBuiltins.push_back(ID);
-    }
+  return IsProcessorCheck;
+}
 
-    bool Continue = TraverseStmt(If->getThen());
+void DiagnoseUnguardedBuiltins::ExitPredicateGuardedContext(bool WasProcCheck) {
+  if (WasProcCheck)
+    CurrentGFXIP.pop_back();
+  else
+    GuardedBuiltins.pop_back();
+}
 
-    if (IsProcessorCheck)
-      CurrentGFXIP.pop_back();
-    else
-      GuardedBuiltins.pop_back();
+inline std::pair<Stmt *, Stmt *> GetTraversalOrder(Stmt *S) {
+  std::pair<Stmt *, Stmt *> Ordered;
+  Expr *Condition;
 
-    return Continue && TraverseStmt(If->getElse());
+  if (auto *CO = dyn_cast<ConditionalOperator>(S)) {
+    Condition = CO->getCond();
+    Ordered = {CO->getTrueExpr(), CO->getFalseExpr()};
+  } else if (auto *If = dyn_cast<IfStmt>(S)) {
+    Condition = If->getCond();
+    Ordered = {If->getThen(), If->getElse()};
   }
 
-  return DynamicRecursiveASTVisitor::TraverseIfStmt(If);
+  if (auto *UO = dyn_cast<UnaryOperator>(Condition))
+    if (UO->getOpcode() == UnaryOperatorKind::UO_LNot)
+      std::swap(Ordered.first, Ordered.second);
+
+  return Ordered;
+}
+
+bool DiagnoseUnguardedBuiltins::TraverseGuardedStmt(Stmt *S, CallExpr *P) {
+  assert(S && "Unexpected missing Statement!");
+  assert(P && "Unexpected missing Predicate!");
+
+  auto [Guarded, Unguarded] = GetTraversalOrder(S);
+
+  bool WasProcessorCheck = EnterPredicateGuardedContext(P);
+
+  bool Continue = TraverseStmt(Guarded);
+
+  ExitPredicateGuardedContext(WasProcessorCheck);
+
+  return Continue && TraverseStmt(Unguarded);
 }
 
 bool DiagnoseUnguardedBuiltins::VisitAsmStmt(AsmStmt *ASM) {
@@ -866,15 +909,15 @@ bool DiagnoseUnguardedBuiltins::VisitCallExpr(CallExpr *CE) {
                                        FeatureMap);
   }
 
+  FunctionDecl *BI = CE->getDirectCallee();
+  SourceLocation BICallLoc = CE->getExprLoc();
   if (Builtin::evaluateRequiredTargetFeatures(FL, FeatureMap)) {
-    SemaRef.Diag(CE->getExprLoc(), diag::warn_amdgcn_unguarded_builtin)
-        << CE->getDirectCallee();
-    SemaRef.Diag(CE->getExprLoc(), diag::note_amdgcn_unguarded_builtin_silence)
-        << CE->getDirectCallee();
+    SemaRef.Diag(BICallLoc, diag::warn_amdgcn_unguarded_builtin) << BI;
+    SemaRef.Diag(BICallLoc, diag::note_amdgcn_unguarded_builtin_silence) << BI;
   } else {
     StringRef GFXIP = CurrentGFXIP.empty() ? "" : CurrentGFXIP.back().second;
-    SemaRef.Diag(CE->getExprLoc(), diag::err_amdgcn_incompatible_builtin)
-        << CE->getDirectCallee() << FL << !CurrentGFXIP.empty() << GFXIP;
+    SemaRef.Diag(BICallLoc, diag::err_amdgcn_incompatible_builtin)
+        << BI << FL << !CurrentGFXIP.empty() << GFXIP;
     if (!CurrentGFXIP.empty())
       SemaRef.Diag(CurrentGFXIP.back().first,
                    diag::note_amdgcn_previous_is_processor_guard);
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index a1f76d282dc3f..ff077a39a7924 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -6788,11 +6788,13 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc,
 
     FunctionDecl *FDecl = dyn_cast<FunctionDecl>(NDecl);
     if (FDecl && FDecl->getBuiltinID()) {
-      if (Context.BuiltinInfo.isTSBuiltin(FDecl->getBuiltinID())) {
-        const llvm::Triple &Triple = Context.getTargetInfo().getTriple();
-        if (Triple.isSPIRV() && Triple.getVendor() == llvm::Triple::AMD)
+      const llvm::Triple &Triple = Context.getTargetInfo().getTriple();
+      if (Triple.isSPIRV() && Triple.getVendor() == llvm::Triple::AMD) {
+        if (Context.BuiltinInfo.isTSBuiltin(FDecl->getBuiltinID()) &&
+            !Context.BuiltinInfo.isAuxBuiltinID(FDecl->getBuiltinID())) {
           AMDGPU().AddPotentiallyUnguardedBuiltinUser(cast<FunctionDecl>(
               getFunctionLevelDeclContext(/*AllowLambda=*/true)));
+        }
       }
 
       // Rewrite the function decl for this builtin by replacing parameters
diff --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
index b46e0b83970fa..030ae9a4e6dc9 100644
--- a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
+++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
@@ -25,8 +25,8 @@
 // AMDGCNSPIRV-LABEL: define spir_func void @foo(
 // AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] {
 // AMDGCNSPIRV-NEXT:  [[ENTRY:.*:]]
-// AMDGCNSPIRV-NEXT:    [[HAS_GFX10_INSTS_:%.*]] = call addrspace(4) i1 @_Z20__spirv_SpecConstant(i32 -1, i1 false)
-// AMDGCNSPIRV-NEXT:    [[TOBOOL:%.*]] = icmp ne i1 [[HAS_GFX10_INSTS_]], false
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = call addrspace(4) i1 @_Z20__spirv_SpecConstant(i32 -1, i1 false), !llvm.amdgcn.feature.predicate [[META3:![0-9]+]]
+// AMDGCNSPIRV-NEXT:    [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false
 // AMDGCNSPIRV-NEXT:    br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
 // AMDGCNSPIRV:       [[IF_THEN]]:
 // AMDGCNSPIRV-NEXT:    call addrspace(4) void @llvm.trap()
@@ -58,4 +58,5 @@ void foo() {
 // AMDGCNSPIRV: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
 // AMDGCNSPIRV: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
 // AMDGCNSPIRV: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+// AMDGCNSPIRV: [[META3]] = !{!"has.gfx10-insts"}
 //.
diff --git a/clang/test/CodeGen/amdgpu-builtin-processor-is.c b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
index 5a1f63c673127..01507aac754b1 100644
--- a/clang/test/CodeGen/amdgpu-builtin-processor-is.c
+++ b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
@@ -24,8 +24,8 @@
 // AMDGCNSPIRV-LABEL: define spir_func void @foo(
 // AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] {
 // AMDGCNSPIRV-NEXT:  [[ENTRY:.*:]]
-// AMDGCNSPIRV-NEXT:    [[IS_GFX900_:%.*]] = call addrspace(4) i1 @_Z20__spirv_SpecConstant(i32 -1, i1 false)
-// AMDGCNSPIRV-NEXT:    [[TOBOOL:%.*]] = icmp ne i1 [[IS_GFX900_]], false
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = call addrspace(4) i1 @_Z20__spirv_SpecConstant(i32 -1, i1 false), !llvm.amdgcn.feature.predicate [[META3:![0-9]+]]
+// AMDGCNSPIRV-NEXT:    [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false
 // AMDGCNSPIRV-NEXT:    br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
 // AMDGCNSPIRV:       [[IF_THEN]]:
 // AMDGCNSPIRV-NEXT:    call addrspace(4) void @llvm.trap()
@@ -57,4 +57,5 @@ void foo() {
 // AMDGCNSPIRV: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
 // AMDGCNSPIRV: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
 // AMDGCNSPIRV: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+// AMDGCNSPIRV: [[META3]] = !{!"is.gfx900"}
 //.
diff --git a/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip b/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip
index 63391220fd0e6..ffd07364c2d1c 100644
--- a/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip
+++ b/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip
@@ -21,28 +21,28 @@ constexpr static bool BountCtrl()
     return true & false;
 }
 
-// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 16, i32 0, i32 0, i1 false)
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 16, i32 0, i32 0, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_2(int* out, int a, int b)
 {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
     *out = __builtin_amdgcn_update_dpp(a, b, OpCtrl(), 0, 0, false);
 }
 
-// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 4, i32 0, i1 false)
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 4, i32 0, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_3(int* out, int a, int b)
 {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
     *out = __builtin_amdgcn_update_dpp(a, b, 0, RowMask(), 0, false);
 }
 
-// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 0, i32 3, i1 false)
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 0, i32 3, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_4(int* out, int a, int b)
 {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
     *out = __builtin_amdgcn_update_dpp(a, b, 0, 0, BankMask(), false);
 }
 
-// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %1, i32 0, i32 0, i32 0, i1 false)
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 0, i32 0, i1 false)
 __attribute__((global)) void test_update_dpp_const_fold_imm_operand_5(int* out, int a, int b)
 {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
index f3884c1e2bea7..91c367fa4a425 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
@@ -25,7 +25,7 @@ void test_permlanex16(global uint* out, uint a, uint b, uint c, uint d) {
 
 // CHECK-LABEL: @test_mov_dpp8_uint(
 // CHECK:      {{.*}}call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %a, i32 1)
-// CHECK-NEXT: store i32 %0,
+// CHECK-NEXT: store i32 %[[#]],
 void test_mov_dpp8_uint(global uint* out, uint a) {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp8))
     *out = __builtin_amdgcn_mov_dpp8(a, 1);
@@ -33,56 +33,56 @@ void test_mov_dpp8_uint(global uint* out, uint a) {
 
 // CHECK-LABEL: @test_mov_dpp8_long(
 // CHECK:      {{.*}}call{{.*}} i64 @llvm.amdgcn.mov.dpp8.i64(i64 %a, i32 1)
-// CHECK-NEXT: store i64 %0,
+// CHECK-NEXT: store i64 %[[#]],
 void test_mov_dpp8_long(global long* out, long a) {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp8))
     *out = __builtin_amdgcn_mov_dpp8(a, 1);
 }
 
 // CHECK-LABEL: @test_mov_dpp8_float(
-// CHECK:      %0 = bitcast float %a to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
-// CHECK-NEXT: store i32 %1,
+// CHECK:      %[[BC:[0-9]+]] = bitcast float %a to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %[[BC]], i32 1)
+// CHECK-NEXT: store i32 %[[DPP_RET]],
 void test_mov_dpp8_float(global float* out, float a) {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp8))
     *out = __builtin_amdgcn_mov_dpp8(a, 1);
 }
 
 // CHECK-LABEL: @test_mov_dpp8_double
-// CHECK:      %0 = bitcast double %x to i64
-// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.mov.dpp8.i64(i64 %0, i32 1)
-// CHECK-NEXT: store i64 %1,
+// CHECK:      %[[BC:[0-9]+]] = bitcast double %x to i64
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i64 @llvm.amdgcn.mov.dpp8.i64(i64 %[[BC]], i32 1)
+// CHECK-NEXT: store i64 %[[DPP_RET]],
 void test_mov_dpp8_double(double x, global double *p) {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp8))
     *p = __builtin_amdgcn_mov_dpp8(x, 1);
 }
 
 // CHECK-LABEL: @test_mov_dpp8_short
-// CHECK:      %0 = zext i16 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
-// CHECK-NEXT: %2 = trunc i32 %1 to i16
-// CHECK-NEXT: store i16 %2,
+// CHECK:      %[[ZEXT:[0-9]+]] = zext i16 %x to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %[[ZEXT]], i32 1)
+// CHECK-NEXT: %[[TRUNC:[0-9]+]] = trunc i32 %[[DPP_RET]] to i16
+// CHECK-NEXT: store i16 %[[TRUNC]],
 void test_mov_dpp8_short(short x, global short *p) {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp8))
     *p = __builtin_amdgcn_mov_dpp8(x, 1);
 }
 
 // CHECK-LABEL: @test_mov_dpp8_char
-// CHECK:      %0 = zext i8 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
-// CHECK-NEXT: %2 = trunc i32 %1 to i8
-// CHECK-NEXT: store i8 %2,
+// CHECK:      %[[ZEXT:[0-9]+]] = zext i8 %x to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %[[ZEXT]], i32 1)
+// CHECK-NEXT: %[[TRUNC:[0-9]+]] = trunc i32 %[[DPP_RET]] to i8
+// CHECK-NEXT: store i8 %[[TRUNC]],
 void test_mov_dpp8_char(char x, global char *p) {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp8))
     *p = __builtin_amdgcn_mov_dpp8(x, 1);
 }
 
 // CHECK-LABEL: @test_mov_dpp8_half
-// CHECK:      %0 = load i16,
-// CHECK:      %1 = zext i16 %0 to i32
-// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %1, i32 1)
-// CHECK-NEXT: %3 = trunc i32 %2 to i16
-// CHECK-NEXT: store i16 %3,
+// CHECK:      %[[LD:[0-9]+]] = load i16,
+// CHECK:      %[[ZEXT:[0-9]+]] = zext i16 %[[LD]] to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %[[ZEXT]], i32 1)
+// CHECK-NEXT: %[[TRUNC:[0-9]+]] = trunc i32 %[[DPP_RET]] to i16
+// CHECK-NEXT: store i16 %[[TRUNC]],
 void test_mov_dpp8_half(half *x, global half *p) {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp8))
     *p = __builtin_amdgcn_mov_dpp8(*x, 1);
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
index ccdc3b538a1d2..00127e8c8a3aa 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
@@ -30,11 +30,11 @@ void test_s_sendmsg_rtnl(global ulong* out) {
 }
 
 // CHECK-LABEL: @test_ds_bvh_stack_rtn(
-// CHECK: %0 = tail call{{.*}} { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32 %addr, i32 %data, <4 x i32> %data1, i32 128)
-// CHECK: %1 = extractvalue { i32, i32 } %0, 0
-// CHECK: %2 = extractvalue { i32, i32 } %0, 1
-// CHECK: %3 = insertelement <2 x i32> poison, i32 %1, i64 0
-// CHECK: %4 = insertelement <2 x i32> %3, i32 %2, i64 1
+// CHECK: %[[#BVH_STACK:]] = tail call{{.*}} { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32 %addr, i32 %data, <4 x i32> %data1, i32 128)
+// CHECK: %[[#RET_FIRST:]] = extractvalue { i32, i32 } %[[#BVH_STACK]], 0
+// CHECK: %[[#RET_SECOND:]] = extractvalue { i32, i32 } %[[#BVH_STACK]], 1
+// CHECK: %[[#OUT:]] = insertelement <2 x i32> poison, i32 %[[#RET_FIRST]], i64 0
+// CHECK: %{{.*}} = insertelement <2 x i32> %[[#OUT]], i32 %[[#RET_SECOND]], i64 1
 void test_ds_bvh_stack_rtn(global uint2* out, uint addr, uint data, uint4 data1)
 {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_ds_bvh_stack_rtn))
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
index 922d8f0dd4790..34482a6df240e 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -1,9 +1,9 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s
-// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,AMDGCNSPIRV %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN --enable-var-scope %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN --enable-var-scope %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN --enable-var-scope %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN --enable-var-scope %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,AMDGCNSPIRV --enable-var-scope %s
 
 #define INVALID_MEMORY_SCOPE (__MEMORY_SCOPE_CLUSTR+1)
 
@@ -126,57 +126,57 @@ void test_mov_dpp_int(global int* out, int src)
 }
 
 // CHECK-LABEL: @test_mov_dpp_long
-// CHECK:      %0 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %x, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i64 %0,
+// CHECK:      %[[DPP_RET:[0-9]+]] = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i64 %[[DPP_RET]],
 void test_mov_dpp_long(long x, global long *p) {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp))
     *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_mov_dpp_float
-// CHECK:      %0 = bitcast float %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i32 %1,
+// CHECK:      %[[BC:[0-9]+]] = bitcast float %x to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %[[BC]], i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i32 %[[DPP_RET]],
 void test_mov_dpp_float(float x, global float *p) {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp))
     *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_mov_dpp_double
-// CHECK:      %0 = bitcast double %x to i64
-// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i64 %1,
+// CHECK:      %[[BC:[0-9]+]] = bitcast double %x to i64
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %[[BC]], i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i64 %[[DPP_RET]],
 void test_mov_dpp_double(double x, global double *p) {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp))
     *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_mov_dpp_short
-// CHECK:      %0 = zext i16 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i16
-// CHECK-NEXT: store i16 %2,
+// CHECK:      %[[ZEXT:[0-9]+]] = zext i16 %x to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %[[ZEXT]], i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: %[[TRUNC:[0-9]+]] = trunc i32 %[[DPP_RET]] to i16
+// CHECK-NEXT: store i16 %[[TRUNC]],
 void test_mov_dpp_short(short x, global short *p) {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp))
     *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_mov_dpp_char
-// CHECK:      %0 = zext i8 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i8
-// CHECK-NEXT: store i8 %2,
+// CHECK:      %[[ZEXT:[0-9]+]] = zext i8 %x to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %[[ZEXT]], i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: %[[TRUNC:[0-9]+]] = trunc i32 %[[DPP_RET]] to i8
+// CHECK-NEXT: store i8 %[[TRUNC]],
 void test_mov_dpp_char(char x, global char *p) {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp))
     *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_mov_dpp_half
-// CHECK:      %0 = load i16,
-// CHECK:      %1 = zext i16 %0 to i32
-// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %1, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %3 = trunc i32 %2 to i16
-// CHECK-NEXT: store i16 %3,
+// CHECK:      %[[LD:[0-9]+]] = load i16,
+// CHECK:      %[[ZEXT:[0-9]+]] = zext i16 %[[LD]] to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %[[ZEXT]], i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: %[[TRUNC:[0-9]+]] = trunc i32 %[[DPP_RET]] to i16
+// CHECK-NEXT: store i16 %[[TRUNC]],
 void test_mov_dpp_half(half *x, global half *p) {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_mov_dpp))
     *p = __builtin_amdgcn_mov_dpp(*x, 0x101, 0xf, 0xf, 0);
@@ -191,57 +191,57 @@ void test_update_dpp_int(global int* out, int arg1, int arg2)
 }
 
 // CHECK-LABEL: @test_update_dpp_long
-// CHECK:      %0 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %x, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i64 %0,
+// CHECK:      %[[DPP_RET:[0-9]+]] = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i64 %[[DPP_RET]],
 void test_update_dpp_long(long x, global long *p) {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
     *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_update_dpp_float
-// CHECK:      %0 = bitcast float %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i32 %1,
+// CHECK:      %[[BC:[0-9]+]] = bitcast float %x to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %[[BC]], i32 %[[BC]], i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i32 %[[DPP_RET]],
 void test_update_dpp_float(float x, global float *p) {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
     *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_update_dpp_double
-// CHECK:      %0 = bitcast double %x to i64
-// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %0, i64 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i64 %1,
+// CHECK:      %[[BC:[0-9]+]] = bitcast double %x to i64
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %[[BC]], i64 %[[BC]], i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i64 %[[DPP_RET]],
 void test_update_dpp_double(double x, global double *p) {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
     *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_update_dpp_short
-// CHECK:      %0 = zext i16 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i16
-// CHECK-NEXT: store i16 %2,
+// CHECK:      %[[ZEXT:[0-9]+]] = zext i16 %x to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %[[ZEXT]], i32 %[[ZEXT]], i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: %[[TRUNC:[0-9]+]] = trunc i32 %[[DPP_RET]] to i16
+// CHECK-NEXT: store i16 %[[TRUNC]],
 void test_update_dpp_short(short x, global short *p) {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
     *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_update_dpp_char
-// CHECK:      %0 = zext i8 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i8
-// CHECK-NEXT: store i8 %2,
+// CHECK:      %[[ZEXT:[0-9]+]] = zext i8 %x to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %[[ZEXT]], i32 %[[ZEXT]], i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: %[[TRUNC:[0-9]+]] = trunc i32 %[[DPP_RET]] to i8
+// CHECK-NEXT: store i8 %[[TRUNC]],
 void test_update_dpp_char(char x, global char *p) {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
     *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_update_dpp_half
-// CHECK:      %0 = load i16,
-// CHECK:      %1 = zext i16 %0 to i32
-// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %1, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %3 = trunc i32 %2 to i16
-// CHECK-NEXT: store i16 %3,
+// CHECK:      %[[LD:[0-9]+]] = load i16,
+// CHECK:      %[[ZEXT:[0-9]+]] = zext i16 %[[LD]] to i32
+// CHECK-NEXT: %[[DPP_RET:[0-9]+]] = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %[[ZEXT]], i32 %[[ZEXT]], i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: %[[TRUNC:[0-9]+]] = trunc i32 %[[DPP_RET]] to i16
+// CHECK-NEXT: store i16 %[[TRUNC]],
 void test_update_dpp_half(half *x, global half *p) {
   if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_update_dpp))
     *p = __builtin_amdgcn_update_dpp(*x, *x, 0x101, 0xf, 0xf, 0);
diff --git a/clang/test/SemaHIP/amdgpu-feature-predicates-guard-use.hip b/clang/test/SemaHIP/amdgpu-feature-predicates-guard-use.hip
index 7418be2aa4c43..1566bc9aa0be5 100644
--- a/clang/test/SemaHIP/amdgpu-feature-predicates-guard-use.hip
+++ b/clang/test/SemaHIP/amdgpu-feature-predicates-guard-use.hip
@@ -17,6 +17,15 @@ __device__ void f(int x, bool b) {
         __asm volatile("v_lshlrev_b64 v[15:16], 0, %0" : "={v[15:16]}"(v15_16) : "v"(x));
     }
 
+    if (!__builtin_amdgcn_processor_is("gfx90a")) {
+        long v15_16;
+        __asm volatile("v_lshlrev_b64 v[15:16], 0, %0" : "={v[15:16]}"(v15_16) : "v"(x)); // expected-warning {{the 'v_lshlrev_b64 v[15:16], 0, $0' ASM sequence might be invalid for some AMDGPU targets}}
+        // expected-note at -1 {{enclose the 'v_lshlrev_b64 v[15:16], 0, $0' ASM sequence in a scope controlled by a __builtin_amdgcn_is_processor check to silence this warning}}
+    }
+
+    __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var) ? __builtin_amdgcn_s_sleep_var(x) : __builtin_trap();
+    !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var) ? __builtin_amdgcn_s_sleep_var(x) : __builtin_trap(); // expected-error {{'__builtin_amdgcn_s_sleep_var' cannot be invoked in the current context, as it requires the 'gfx12-insts' feature(s)}}
+
     const auto lambda = [=] __device__  () {
         __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}}
         // expected-note at -1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}}
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp
index 93f963ec70c69..6e7603597be85 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp
@@ -99,8 +99,11 @@ bool tryAssignPredicateSpecConstIDs(Module &M, Function *F) {
     if (ID != UINT32_MAX)
       continue;
 
-    StringRef Name = CI->getName().substr(0, CI->getName().rfind('.'));
-    ID = IDs.try_emplace(Name, IDs.size()).first->second;
+    assert(CI->getMetadata("llvm.amdgcn.feature.predicate") &&
+           "Feature predicates must be encoded into metadata!");
+    auto *P = cast<MDString>(
+        CI->getMetadata("llvm.amdgcn.feature.predicate")->getOperand(0));
+    ID = IDs.try_emplace(P->getString(), IDs.size()).first->second;
 
     CI->setArgOperand(0, ConstantInt::get(CI->getArgOperand(0)->getType(), ID));
   }
diff --git a/llvm/test/CodeGen/SPIRV/SpecConstants/amdgcnspirv-feature-predicate-specconstant.ll b/llvm/test/CodeGen/SPIRV/SpecConstants/amdgcnspirv-feature-predicate-specconstant.ll
index 32b18dc1023e4..4b9119b98cab6 100644
--- a/llvm/test/CodeGen/SPIRV/SpecConstants/amdgcnspirv-feature-predicate-specconstant.ll
+++ b/llvm/test/CodeGen/SPIRV/SpecConstants/amdgcnspirv-feature-predicate-specconstant.ll
@@ -8,20 +8,20 @@
 ; CHECK: OpName %[[#S_SLEEP_VAR:]] "spirv.llvm_amdgcn_s_sleep_var"
 ; CHECK: OpName %[[#S_WAIT_EVENT_EXPORT_READY:]] "spirv.llvm_amdgcn_s_wait_event_export_ready"
 ; CHECK: OpName %[[#S_TTRACEDATA_IMM:]] "spirv.llvm_amdgcn_s_ttracedata_imm"
-; CHECK: OpDecorate %[[#IS_GFX950:]] SpecId 7
-; CHECK: OpDecorate %[[#IS_GFX1201:]] SpecId 2
-; CHECK: OpDecorate %[[#HAS_GFX12_INSTS:]] SpecId 6
-; CHECK: OpDecorate %[[#IS_GFX906:]] SpecId 5
-; CHECK: OpDecorate %[[#IS_GFX1010:]] SpecId 4
+; CHECK: OpDecorate %[[#IS_GFX950:]] SpecId 6
+; CHECK: OpDecorate %[[#IS_GFX950_1:]] SpecId 6
+; CHECK: OpDecorate %[[#IS_GFX1201:]] SpecId 1
+; CHECK: OpDecorate %[[#HAS_GFX12_INSTS:]] SpecId 5
+; CHECK: OpDecorate %[[#IS_GFX906:]] SpecId 4
+; CHECK: OpDecorate %[[#IS_GFX1010:]] SpecId 2
 ; CHECK: OpDecorate %[[#IS_GFX1101:]] SpecId 3
-; CHECK: OpDecorate %[[#IS_GFX1101_1:]] SpecId 3
-; CHECK: OpDecorate %[[#IS_GFX1201_1:]] SpecId 2
+; CHECK: OpDecorate %[[#IS_GFX1101_1:]] SpecId 2
+; CHECK: OpDecorate %[[#IS_GFX1201_1:]] SpecId 1
 ; CHECK: OpDecorate %[[#HAS_GFX11_INSTS:]] SpecId 0
-; CHECK: OpDecorate %[[#HAS_GFX10_INSTS:]] SpecId 1
-; CHECK: OpDecorate %[[#HAS_GFX11_INSTS_1:]] SpecId 0
+; CHECK: OpDecorate %[[#HAS_GFX10_INSTS:]] SpecId 0
 ; CHECK: %[[#BOOL:]] = OpTypeBool
 ; CHECK: %[[#UCHAR:]] = OpTypeInt 8
-; CHECK: %[[#FEATURE_PREDICATE_IDS_MAP_STRLEN:]] = OpConstant %[[#]] 117
+; CHECK: %[[#FEATURE_PREDICATE_IDS_MAP_STRLEN:]] = OpConstant %[[#]] 99
 ; CHECK: %[[#FEATURE_PREDICATE_IDS_MAP_STRTY:]] = OpTypeArray %[[#UCHAR]] %[[#FEATURE_PREDICATE_IDS_MAP_STRLEN]]
 ; CHECK: %[[#FEATURE_PREDICATE_IDS_MAP_STRVAL:]] = OpConstantComposite %[[#FEATURE_PREDICATE_IDS_MAP_STRTY]]
 ; CHECK: %[[#FEATURE_PREDICATE_IDS]] = OpVariable %[[#]] CrossWorkgroup %[[#FEATURE_PREDICATE_IDS_MAP_STRVAL]]
@@ -35,7 +35,6 @@
 ; CHECK: %[[#IS_GFX1201_1]] = OpSpecConstantFalse %[[#BOOL]]
 ; CHECK: %[[#HAS_GFX11_INSTS]] = OpSpecConstantFalse %[[#BOOL]]
 ; CHECK: %[[#HAS_GFX10_INSTS]] = OpSpecConstantFalse %[[#BOOL]]
-; CHECK: %[[#HAS_GFX11_INSTS_1]] = OpSpecConstantFalse %[[#BOOL]]
 
 declare void @llvm.amdgcn.s.monitor.sleep(i16 immarg) addrspace(4)
 
@@ -59,87 +58,87 @@ declare void @llvm.amdgcn.s.ttracedata.imm(i16 immarg) addrspace(4) #6
 define void @kernel() addrspace(4) {
 ; CHECK-DAG: %[[#KERNEL]] = OpFunction %33 None %34 ; -- Begin function kernel
 ; CHECK-NEXT: %2 = OpLabel
-; CHECK-NEXT: %100 = OpLoad %36 %75 Aligned 4
+; CHECK-NEXT: %99 = OpLoad %36 %74 Aligned 4
 ; CHECK-NEXT: OpBranchConditional %[[#IS_GFX950]] %4 %3
 ; CHECK-NEXT: %3 = OpLabel
-; CHECK-NEXT: %101 = OpFunctionCall %33 %[[#SET_FPENV_I64]] %50
+; CHECK-NEXT: %100 = OpFunctionCall %33 %[[#SET_FPENV_I64]] %50
 ; CHECK-NEXT: OpBranch %5
 ; CHECK-NEXT: %4 = OpLabel
-; CHECK-NEXT: %102 = OpFunctionCall %39 %[[#ASHR_PK_I8_I32]] %49 %49 %49
+; CHECK-NEXT: %101 = OpFunctionCall %39 %[[#ASHR_PK_I8_I32]] %49 %49 %49
 ; CHECK-NEXT: OpBranch %5
 ; CHECK-NEXT: %5 = OpLabel
-; CHECK-NEXT: OpBranchConditional %[[#IS_GFX1201]] %7 %6
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX950_1]] %7 %6
 ; CHECK-NEXT: %6 = OpLabel
-; CHECK-NEXT: OpBranchConditional %[[#HAS_GFX12_INSTS]] %7 %8
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX1201]] %7 %8
 ; CHECK-NEXT: %7 = OpLabel
-; CHECK-NEXT: %103 = OpFunctionCall %33 %[[#S_SLEEP_VAR]] %100
+; CHECK-NEXT: %102 = OpFunctionCall %33 %[[#S_SLEEP_VAR]] %99
 ; CHECK-NEXT: OpBranch %8
 ; CHECK-NEXT: %8 = OpLabel
-; CHECK-NEXT: OpBranchConditional %[[#IS_GFX906]] %10 %9
+; CHECK-NEXT: OpBranchConditional %[[#HAS_GFX12_INSTS]] %10 %9
 ; CHECK-NEXT: %9 = OpLabel
-; CHECK-NEXT: %104 = OpFunctionCall %33 %[[#S_WAIT_EVENT_EXPORT_READY]]
+; CHECK-NEXT: %103 = OpFunctionCall %33 %[[#S_WAIT_EVENT_EXPORT_READY]]
 ; CHECK-NEXT: OpBranch %14
 ; CHECK-NEXT: %10 = OpLabel
-; CHECK-NEXT: OpBranchConditional %[[#IS_GFX1010]] %12 %11
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX906]] %12 %11
 ; CHECK-NEXT: %11 = OpLabel
-; CHECK-NEXT: OpBranchConditional %[[#IS_GFX1101]] %12 %13
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX1010]] %12 %13
 ; CHECK-NEXT: %12 = OpLabel
-; CHECK-NEXT: %105 = OpFunctionCall %33 %[[#S_TTRACEDATA_IMM]] %48
+; CHECK-NEXT: %104 = OpFunctionCall %33 %[[#S_TTRACEDATA_IMM]] %48
 ; CHECK-NEXT: OpBranch %13
 ; CHECK-NEXT: %13 = OpLabel
 ; CHECK-NEXT: OpBranch %14
 ; CHECK-NEXT: %14 = OpLabel
 ; CHECK-NEXT: OpBranch %15
 ; CHECK-NEXT: %15 = OpLabel
-; CHECK-NEXT: OpBranchConditional %[[#IS_GFX1101_1]] %16 %17
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX1101]] %16 %17
 ; CHECK-NEXT: %16 = OpLabel
-; CHECK-NEXT: %106 = OpLoad %36 %87 Aligned 4
-; CHECK-NEXT: %107 = OpIAdd %36 %106 %100
-; CHECK-NEXT: OpStore %87 %107 Aligned 4
+; CHECK-NEXT: %105 = OpLoad %36 %86 Aligned 4
+; CHECK-NEXT: %106 = OpIAdd %36 %105 %99
+; CHECK-NEXT: OpStore %86 %106 Aligned 4
 ; CHECK-NEXT: OpBranch %17
 ; CHECK-NEXT: %17 = OpLabel
 ; CHECK-NEXT: OpBranch %18
 ; CHECK-NEXT: %18 = OpLabel
-; CHECK-NEXT: %108 = OpLoad %36 %87 Aligned 4
-; CHECK-NEXT: %109 = OpISub %36 %108 %100
-; CHECK-NEXT: OpStore %87 %109 Aligned 4
+; CHECK-NEXT: %107 = OpLoad %36 %86 Aligned 4
+; CHECK-NEXT: %108 = OpISub %36 %107 %99
+; CHECK-NEXT: OpStore %86 %108 Aligned 4
 ; CHECK-NEXT: OpBranch %19
 ; CHECK-NEXT: %19 = OpLabel
 ; CHECK-NEXT: OpBranch %20
 ; CHECK-NEXT: %20 = OpLabel
-; CHECK-NEXT: OpBranchConditional %[[#IS_GFX1201_1]] %21 %22
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX1101_1]] %21 %22
 ; CHECK-NEXT: %21 = OpLabel
 ; CHECK-NEXT: OpBranch %22
 ; CHECK-NEXT: %22 = OpLabel
-; CHECK-NEXT: OpBranchConditional %[[#HAS_GFX11_INSTS]] %26 %23
+; CHECK-NEXT: OpBranchConditional %[[#IS_GFX1201_1]] %26 %23
 ; CHECK-NEXT: %23 = OpLabel
-; CHECK-NEXT: OpBranchConditional %[[#HAS_GFX10_INSTS]] %24 %25
+; CHECK-NEXT: OpBranchConditional %[[#HAS_GFX11_INSTS]] %24 %25
 ; CHECK-NEXT: %24 = OpLabel
-; CHECK-NEXT: %110 = OpFunctionCall %33 %[[#S_TTRACEDATA_IMM]] %48
+; CHECK-NEXT: %109 = OpFunctionCall %33 %[[#S_TTRACEDATA_IMM]] %48
 ; CHECK-NEXT: OpBranch %25
 ; CHECK-NEXT: %25 = OpLabel
 ; CHECK-NEXT: OpBranch %27
 ; CHECK-NEXT: %26 = OpLabel
-; CHECK-NEXT: %111 = OpFunctionCall %33 %[[#S_WAIT_EVENT_EXPORT_READY]]
+; CHECK-NEXT: %110 = OpFunctionCall %33 %[[#S_WAIT_EVENT_EXPORT_READY]]
 ; CHECK-NEXT: OpBranch %27
 ; CHECK-NEXT: %27 = OpLabel
 ; CHECK-NEXT: OpBranch %28
 ; CHECK-NEXT: %28 = OpLabel
-; CHECK-NEXT: %112 = OpLoad %36 %87 Aligned 4
-; CHECK-NEXT: %113 = OpISub %36 %112 %100
-; CHECK-NEXT: OpStore %87 %113 Aligned 4
+; CHECK-NEXT: %111 = OpLoad %36 %86 Aligned 4
+; CHECK-NEXT: %112 = OpISub %36 %111 %99
+; CHECK-NEXT: OpStore %86 %112 Aligned 4
 ; CHECK-NEXT: OpBranch %29
 ; CHECK-NEXT: %29 = OpLabel
 ; CHECK-NEXT: OpBranch %30
 ; CHECK-NEXT: %30 = OpLabel
-; CHECK-NEXT: OpBranchConditional %[[#HAS_GFX11_INSTS_1]] %31 %32
+; CHECK-NEXT: OpBranchConditional %[[#HAS_GFX10_INSTS]] %31 %32
 ; CHECK-NEXT: %31 = OpLabel
 ; CHECK-NEXT: OpBranch %32
 ; CHECK-NEXT: %32 = OpLabel
 
 entry:
   %x = load i32, ptr addrspace(1) @g
-  %is.gfx950. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  %is.gfx950. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false), !llvm.amdgcn.feature.predicate !9
   br i1 %is.gfx950., label %cond.true, label %cond.false
 cond.true:
   %0 = call addrspace(4) i16 @llvm.amdgcn.ashr.pk.i8.i32(i32 8, i32 8, i32 8)
@@ -148,25 +147,25 @@ cond.false:
   call addrspace(4) void @llvm.set.fpenv.i64(i64 -1)
   br label %cond.end
 cond.end:
-  %is.gfx1201. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  %is.gfx1201. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false), !llvm.amdgcn.feature.predicate !9
   br i1 %is.gfx1201., label %if.then, label %lor.lhs.false
 lor.lhs.false:
-  %has.gfx12-insts. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  %has.gfx12-insts. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false), !llvm.amdgcn.feature.predicate !10
   br i1 %has.gfx12-insts., label %if.then, label %if.end
 if.then:
   call addrspace(4) void @llvm.amdgcn.s.sleep.var(i32 %x)
   br label %if.end
 if.end:
-  %is.gfx906. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  %is.gfx906. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false), !llvm.amdgcn.feature.predicate !11
   br i1 %is.gfx906., label %if.else, label %if.then2
 if.then2:
   call addrspace(4) void @llvm.amdgcn.s.wait.event.export.ready()
   br label %if.end6
 if.else:
-  %is.gfx1010. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  %is.gfx1010. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false), !llvm.amdgcn.feature.predicate !12
   br i1 %is.gfx1010., label %if.then4, label %lor.lhs.false3
 lor.lhs.false3:
-  %is.gfx1101. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  %is.gfx1101. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false), !llvm.amdgcn.feature.predicate !13
   br i1 %is.gfx1101., label %if.then4, label %if.end5
 if.then4:
   call addrspace(4) void @llvm.amdgcn.s.ttracedata.imm(i16 1)
@@ -176,7 +175,7 @@ if.end5:
 if.end6:
   br label %while.cond
 while.cond:
-  %is.gfx1101.7 = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  %is.gfx1101.7 = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false), !llvm.amdgcn.feature.predicate !14
   br i1 %is.gfx1101.7, label %while.body, label %while.end
 while.body:
   %4 = load i32, ptr addrspace(1) @p
@@ -191,12 +190,12 @@ do.body:
   store i32 %sub, ptr addrspace(1) @p
   br label %do.end
 do.cond:
-  %is.gfx1010.8 = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  %is.gfx1010.8 = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false), !llvm.amdgcn.feature.predicate !14
   br i1 %is.gfx1010.8, label %do.body, label %do.end
 do.end:
   br label %for.cond
 for.cond:
-  %is.gfx1201.9 = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  %is.gfx1201.9 = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false), !llvm.amdgcn.feature.predicate !13
   br i1 %is.gfx1201.9, label %for.body, label %for.end
 for.body:
   br label %for.end
@@ -206,13 +205,13 @@ for.inc:
   store i32 %inc, ptr addrspace(1) @p
   br label %for.cond
 for.end:
-  %has.gfx11-insts. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  %has.gfx11-insts. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false), !llvm.amdgcn.feature.predicate !10
   br i1 %has.gfx11-insts., label %if.then10, label %if.else11
 if.then10:
   call addrspace(4) void @llvm.amdgcn.s.wait.event.export.ready()
   br label %if.end14
 if.else11:
-  %has.gfx10-insts. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  %has.gfx10-insts. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false), !llvm.amdgcn.feature.predicate !18
   br i1 %has.gfx10-insts., label %if.then12, label %if.end13
 if.then12:
   call addrspace(4) void @llvm.amdgcn.s.ttracedata.imm(i16 1)
@@ -227,12 +226,12 @@ do.body15:
   store i32 %sub16, ptr addrspace(1) @p
   br label %do.end18
 do.cond17:
-  %has.gfx1250-insts. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  %has.gfx1250-insts. = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false), !llvm.amdgcn.feature.predicate !20
   br i1 %has.gfx1250-insts., label %do.body15, label %do.end18
 do.end18:
   br label %for.cond19
 for.cond19:
-  %has.gfx11-insts.20 = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false)
+  %has.gfx11-insts.20 = call addrspace(4) i1 @_Z20__spirv_SpecConstantib(i32 -1, i1 false), !llvm.amdgcn.feature.predicate !18
   br i1 %has.gfx11-insts.20, label %for.body21, label %for.end24
 for.body21:
   br label %for.end24
@@ -244,3 +243,13 @@ for.inc22:
 for.end24:
   ret void
 }
+
+!9 = !{!"is.gfx950"}
+!10 = !{!"is.gfx1201"}
+!11 = !{!"has.gfx12-insts"}
+!12 = !{!"is.gfx906"}
+!13 = !{!"is.gfx1010"}
+!14 = !{!"is.gfx1101"}
+!18 = !{!"has.gfx11-insts"}
+!19 = !{!"has.gfx10-insts"}
+!20 = !{!"has.gfx1250-insts"}

>From 8ef2cf43a43929920ba79b7a7b7e457f44959ba4 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Sun, 8 Feb 2026 22:06:48 +0000
Subject: [PATCH 68/69] Fix oversight.

---
 clang/lib/Sema/SemaAMDGPU.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index d05cff4d71d21..e5bb3e5cc1290 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -837,7 +837,7 @@ void DiagnoseUnguardedBuiltins::ExitPredicateGuardedContext(bool WasProcCheck) {
 
 inline std::pair<Stmt *, Stmt *> GetTraversalOrder(Stmt *S) {
   std::pair<Stmt *, Stmt *> Ordered;
-  Expr *Condition;
+  Expr *Condition = nullptr;
 
   if (auto *CO = dyn_cast<ConditionalOperator>(S)) {
     Condition = CO->getCond();

>From e215226cb137f6b4fd0f450578b277a366673d99 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Tue, 10 Feb 2026 00:47:06 +0000
Subject: [PATCH 69/69] Clarify AMDGCN specific handling.

---
 llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp
index 6e7603597be85..c5255e2286b8f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareGlobals.cpp
@@ -121,7 +121,7 @@ bool tryAssignPredicateSpecConstIDs(Module &M, Function *F) {
       ConstantDataArray::getString(M.getContext(), Tmp, false);
 
   new GlobalVariable(M, PredSpecIDStr->getType(), true,
-                     GlobalVariable::LinkageTypes::PrivateLinkage,
+                     GlobalVariable::LinkageTypes::ExternalLinkage,
                      PredSpecIDStr, "llvm.amdgcn.feature.predicate.ids");
 
   return true;
@@ -137,9 +137,9 @@ bool SPIRVPrepareGlobals::runOnModule(Module &M) {
   if (M.getTargetTriple().getVendor() != Triple::AMD)
     return Changed;
 
-  // TODO: Currently the symbol can only be inserted via feature predicate use,
-  //       but in the future this will need revisiting if we start making more
-  //       liberal use of the intrinsic.
+  // TODO: Currently, for AMDGCN flavoured SPIR-V, the symbol can only be
+  //       inserted via feature predicate use, but in the future this will need
+  //       revisiting if we start making more liberal use of the intrinsic.
   if (Function *F = M.getFunction("_Z20__spirv_SpecConstantib"))
     Changed |= tryAssignPredicateSpecConstIDs(M, F);
 



More information about the llvm-commits mailing list