[clang] [llvm] [AMDGPU][clang][CodeGen][opt] Add late-resolved feature identifying predicates (PR #134016)

Alex Voicu via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 1 19:51:21 PDT 2025


https://github.com/AlexVlx updated https://github.com/llvm/llvm-project/pull/134016

>From 91eeaf02336e539f14dcb0a79ff15dbe8befe6f1 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 02:47:42 +0100
Subject: [PATCH 1/4] Add the functional identity and feature queries.

---
 clang/docs/LanguageExtensions.rst             | 110 ++++++
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   5 +
 .../clang/Basic/DiagnosticSemaKinds.td        |  10 +
 clang/lib/Basic/Targets/SPIR.cpp              |   4 +
 clang/lib/Basic/Targets/SPIR.h                |   4 +
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp   |  29 ++
 clang/lib/Sema/SemaExpr.cpp                   | 157 ++++++++
 clang/test/CodeGen/amdgpu-builtin-cpu-is.c    |  65 ++++
 .../CodeGen/amdgpu-builtin-is-invocable.c     |  64 ++++
 .../amdgpu-feature-builtins-invalid-use.cpp   |  43 +++
 llvm/lib/Target/AMDGPU/AMDGPU.h               |   9 +
 .../AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp   | 207 ++++++++++
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |   2 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   3 +-
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 +
 ...pu-expand-feature-predicates-unfoldable.ll |  28 ++
 .../amdgpu-expand-feature-predicates.ll       | 359 ++++++++++++++++++
 17 files changed, 1099 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGen/amdgpu-builtin-cpu-is.c
 create mode 100644 clang/test/CodeGen/amdgpu-builtin-is-invocable.c
 create mode 100644 clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 3b8a9cac6587a..8a7cb75af13e5 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -4920,6 +4920,116 @@ If no address spaces names are provided, all address spaces are fenced.
   __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local")
   __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local", "global")
 
+__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``__builtin_amdgcn_processor_is`` and ``__builtin_amdgcn_is_invocable`` provide
+a functional mechanism for programmatically querying:
+
+* the identity of the current target processor;
+* the capability of the current target processor to invoke a particular builtin.
+
+**Syntax**:
+
+.. code-block:: c
+
+  // When used as the predicate for a control structure
+  bool __builtin_amdgcn_processor_is(const char*);
+  bool __builtin_amdgcn_is_invocable(builtin_name);
+  // Otherwise
+  void __builtin_amdgcn_processor_is(const char*);
+  void __builtin_amdgcn_is_invocable(void);
+
+**Example of use**:
+
+.. code-block:: c++
+
+  if (__builtin_amdgcn_processor_is("gfx1201") ||
+      __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var))
+    __builtin_amdgcn_s_sleep_var(x);
+
+  if (!__builtin_amdgcn_processor_is("gfx906"))
+    __builtin_amdgcn_s_wait_event_export_ready();
+  else if (__builtin_amdgcn_processor_is("gfx1010") ||
+           __builtin_amdgcn_processor_is("gfx1101"))
+    __builtin_amdgcn_s_ttracedata_imm(1);
+
+  while (__builtin_amdgcn_processor_is("gfx1101")) *p += x;
+
+  do { *p -= x; } while (__builtin_amdgcn_processor_is("gfx1010"));
+
+  for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break;
+
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready))
+    __builtin_amdgcn_s_wait_event_export_ready();
+  else if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_ttracedata_imm))
+    __builtin_amdgcn_s_ttracedata_imm(1);
+
+  do {
+    *p -= x;
+  } while (__builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32));
+
+  for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p) break;
+
+**Description**:
+
+When used as the predicate value of the following control structures:
+
+.. code-block:: c++
+
+  if (...)
+  while (...)
+  do { } while (...)
+  for (...)
+
+be it directly, or as operands of the logical operators ``!``, ``||`` and
+``&&``, the builtins return a boolean value that:
+
+* indicates whether the current target matches the argument; the argument MUST
+  be a string literal naming a valid AMDGPU processor
+* indicates whether the builtin function passed as the argument can be invoked
+  by the current target; the argument MUST be either a generic or an
+  AMDGPU-specific builtin name
+
+Outside of these contexts, the builtins have a ``void`` return type, which
+prevents their misuse.
+
+**Example of invalid use**:
+
+.. code-block:: c++
+
+  void kernel(int* p, int x, bool (*pfn)(bool), const char* str) {
+    if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return;
+    else if (__builtin_amdgcn_processor_is(str)) __builtin_trap();
+
+    bool a = __builtin_amdgcn_processor_is("gfx906");
+    const bool b = !__builtin_amdgcn_processor_is("gfx906");
+    bool d = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+    bool e = !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+    const auto f =
+        !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready)
+        || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+    const auto g =
+        !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready)
+        || !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+    __builtin_amdgcn_processor_is("gfx1201")
+      ? __builtin_amdgcn_s_sleep_var(x) : __builtin_amdgcn_s_sleep(42);
+    if (pfn(__builtin_amdgcn_processor_is("gfx1200")))
+      __builtin_amdgcn_s_sleep_var(x);
+
+    if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return;
+    else if (__builtin_amdgcn_is_invocable(x)) __builtin_trap();
+  }
+
+When invoked while compiling for a concrete target, the builtins are evaluated
+early by Clang, and never produce any CodeGen effects / have no observable
+side-effects in IR. Conversely, when compiling for AMDGCN flavoured SPIR-V,
+which is an abstract target, a series of predicate values is implicitly
+created. These predicates get resolved when finalizing the compilation process
+for a concrete target, and reflect the identity and features of that target.
+Thus, it is possible to author high-level code, in e.g. HIP, that is
+target-adaptive in a dynamic fashion, as opposed to macro-based mechanisms.
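+
+For example, the following sketch (illustrative only; ``fast_sleep`` is a
+hypothetical helper, and the ``42`` fallback is an arbitrary choice) selects a
+sleep mechanism without resorting to the preprocessor:
+
+.. code-block:: c++
+
+  void fast_sleep(int x) {
+    // Resolved early for concrete targets, at finalisation for SPIR-V.
+    if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var))
+      __builtin_amdgcn_s_sleep_var(x);
+    else
+      __builtin_amdgcn_s_sleep(42);
+  }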
 
 ARM/AArch64 Language Extensions
 -------------------------------
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 44ef404aee72f..5d01a7e75f7e7 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -346,6 +346,11 @@ BUILTIN(__builtin_amdgcn_endpgm, "v", "nr")
 BUILTIN(__builtin_amdgcn_get_fpenv, "WUi", "n")
 BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n")
 
+// These are special FE-only builtins intended for forwarding the requirements
+// to the ME.
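+// Signature legend (see Builtins.def): "vcC*" is void(const char *) and "v"
+// is void(); "nctu" marks them nothrow, const, custom-typechecked, and with
+// unevaluated arguments.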
+BUILTIN(__builtin_amdgcn_processor_is, "vcC*", "nctu")
+BUILTIN(__builtin_amdgcn_is_invocable, "v", "nctu")
+
 //===----------------------------------------------------------------------===//
 // R600-NI only builtins.
 //===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 5e45482584946..45f0f9eb88e55 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13054,4 +13054,14 @@ def err_acc_decl_for_routine
 // AMDGCN builtins diagnostics
 def err_amdgcn_global_load_lds_size_invalid_value : Error<"invalid size value">;
 def note_amdgcn_global_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">;
+def err_amdgcn_processor_is_arg_not_literal
+    : Error<"the argument to __builtin_amdgcn_processor_is must be a string "
+            "literal">;
+def err_amdgcn_processor_is_arg_invalid_value
+    : Error<"the argument to __builtin_amdgcn_processor_is must be a valid "
+            "AMDGCN processor identifier; '%0' is not valid">;
+def err_amdgcn_is_invocable_arg_invalid_value
+    : Error<"the argument to __builtin_amdgcn_is_invocable must be either a "
+            "target agnostic builtin or an AMDGCN target specific builtin; `%0`"
+            " is not valid">;
 } // end of sema component.
diff --git a/clang/lib/Basic/Targets/SPIR.cpp b/clang/lib/Basic/Targets/SPIR.cpp
index 5b5f47f9647a2..eb43d9b0be283 100644
--- a/clang/lib/Basic/Targets/SPIR.cpp
+++ b/clang/lib/Basic/Targets/SPIR.cpp
@@ -152,3 +152,7 @@ void SPIRV64AMDGCNTargetInfo::setAuxTarget(const TargetInfo *Aux) {
     Float128Format = DoubleFormat;
   }
 }
+
+bool SPIRV64AMDGCNTargetInfo::isValidCPUName(StringRef CPU) const {
+  return AMDGPUTI.isValidCPUName(CPU);
+}
diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h
index 78505d66d6f2f..7aa13cbeb89fd 100644
--- a/clang/lib/Basic/Targets/SPIR.h
+++ b/clang/lib/Basic/Targets/SPIR.h
@@ -432,6 +432,10 @@ class LLVM_LIBRARY_VISIBILITY SPIRV64AMDGCNTargetInfo final
   }
 
   bool hasInt128Type() const override { return TargetInfo::hasInt128Type(); }
+
+  // This is only needed for validating arguments passed to
+  // __builtin_amdgcn_processor_is.
+  bool isValidCPUName(StringRef Name) const override;
 };
 
 } // namespace targets
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index b56b739094ff3..7b1a3815144b4 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -284,6 +284,18 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
   Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
 }
 
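+// Returns a load from an i1 predicate global (e.g. llvm.amdgcn.is.gfx906),
+// creating the global on first use as constant and externally initialized;
+// its actual value is filled in later, once the concrete target is known.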
+static Value *GetOrInsertAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) {
+  auto PTy = IntegerType::getInt1Ty(CGF.getLLVMContext());
+
+  auto P = cast<GlobalVariable>(
+      CGF.CGM.getModule().getOrInsertGlobal(Name.str(), PTy));
+  P->setConstant(true);
+  P->setExternallyInitialized(true);
+
+  return CGF.Builder.CreateLoad(RawAddress(P, PTy, CharUnits::One(),
+                                           KnownNonNull));
+}
+
 Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
                                               const CallExpr *E) {
   llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
@@ -585,6 +597,23 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     llvm::Value *Env = EmitScalarExpr(E->getArg(0));
     return Builder.CreateCall(F, {Env});
   }
+  case AMDGPU::BI__builtin_amdgcn_processor_is: {
+    assert(CGM.getTriple().isSPIRV() &&
+           "__builtin_amdgcn_processor_is should never reach CodeGen for "
+           "concrete targets!");
+    StringRef Proc = cast<clang::StringLiteral>(E->getArg(0))->getString();
+    return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.is." + Proc);
+  }
+  case AMDGPU::BI__builtin_amdgcn_is_invocable: {
+    assert(CGM.getTriple().isSPIRV() &&
+           "__builtin_amdgcn_is_invocable should never reach CodeGen for "
+           "concrete targets!");
+    auto FD = cast<FunctionDecl>(
+      cast<DeclRefExpr>(E->getArg(0))->getReferencedDeclOfCallee());
+    StringRef RF =
+        getContext().BuiltinInfo.getRequiredFeatures(FD->getBuiltinID());
+    return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.has." + RF);
+  }
   case AMDGPU::BI__builtin_amdgcn_read_exec:
     return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
   case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 7cc8374e69d73..24f5262ab3cf4 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -6541,6 +6541,22 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc,
   if (Result.isInvalid()) return ExprError();
   Fn = Result.get();
 
+  // The __builtin_amdgcn_is_invocable builtin is special and will be resolved
+  // later, when we check boolean conditions; for now we merely forward it
+  // without any additional checking.
+  if (Fn->getType() == Context.BuiltinFnTy && ArgExprs.size() == 1 &&
+      ArgExprs[0]->getType() == Context.BuiltinFnTy) {
+    auto FD = cast<FunctionDecl>(Fn->getReferencedDeclOfCallee());
+
+    if (FD->getName() == "__builtin_amdgcn_is_invocable") {
+      auto FnPtrTy = Context.getPointerType(FD->getType());
+      auto R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get();
+      return CallExpr::Create(Context, R, ArgExprs, Context.VoidTy,
+                              ExprValueKind::VK_PRValue, RParenLoc,
+                              FPOptionsOverride());
+    }
+  }
+
   if (CheckArgsForPlaceholders(ArgExprs))
     return ExprError();
 
@@ -13234,6 +13250,20 @@ inline QualType Sema::CheckBitwiseOperands(ExprResult &LHS, ExprResult &RHS,
   return InvalidOperands(Loc, LHS, RHS);
 }
 
+// Recognises calls to the AMDGPU feature predicate builtins, which remain
+// void-typed until they are resolved in a boolean context. Callers may pass
+// null (e.g. a failed dyn_cast), hence the guard.
+static inline bool IsAMDGPUPredicateBI(Expr *E) {
+  if (!E || !E->getType()->isVoidType())
+    return false;
+
+  if (auto CE = dyn_cast<CallExpr>(E)) {
+    if (auto BI = CE->getDirectCallee())
+      if (BI->getName() == "__builtin_amdgcn_processor_is" ||
+          BI->getName() == "__builtin_amdgcn_is_invocable")
+        return true;
+  }
+
+  return false;
+}
+
 // C99 6.5.[13,14]
 inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS,
                                            SourceLocation Loc,
@@ -13329,6 +13359,9 @@ inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS,
   // The following is safe because we only use this method for
   // non-overloadable operands.
 
+  if (IsAMDGPUPredicateBI(LHS.get()) && IsAMDGPUPredicateBI(RHS.get()))
+    return Context.VoidTy;
+
   // C++ [expr.log.and]p1
   // C++ [expr.log.or]p1
   // The operands are both contextually converted to type bool.
@@ -15576,6 +15609,38 @@ static bool isOverflowingIntegerType(ASTContext &Ctx, QualType T) {
   return Ctx.getIntWidth(T) >= Ctx.getIntWidth(Ctx.IntTy);
 }
 
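+// For concrete targets the predicate folds to a bool literal on the spot; for
+// AMDGCN flavoured SPIR-V the call is retyped to bool and kept, so that
+// CodeGen can lower it to a load from an externally initialised predicate
+// global.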
+static Expr *ExpandAMDGPUPredicateBI(ASTContext &Ctx, CallExpr *CE) {
+  if (!CE->getBuiltinCallee())
+    return CXXBoolLiteralExpr::Create(Ctx, false, Ctx.BoolTy, CE->getExprLoc());
+
+  if (Ctx.getTargetInfo().getTriple().isSPIRV()) {
+    CE->setType(Ctx.getLogicalOperationType());
+    return CE;
+  }
+
+  bool P = false;
+  auto &TI = Ctx.getTargetInfo();
+
+  if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") {
+    auto GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts());
+    auto TID = TI.getTargetID();
+    if (GFX && TID) {
+      auto N = GFX->getString();
+      P = TI.isValidCPUName(N) && TID->find(N) == 0;
+    }
+  } else {
+    auto FD = cast<FunctionDecl>(CE->getArg(0)->getReferencedDeclOfCallee());
+
+    StringRef RF = Ctx.BuiltinInfo.getRequiredFeatures(FD->getBuiltinID());
+    llvm::StringMap<bool> CF;
+    Ctx.getFunctionFeatureMap(CF, FD);
+
+    P = Builtin::evaluateRequiredTargetFeatures(RF, CF);
+  }
+
+  return CXXBoolLiteralExpr::Create(Ctx, P, Ctx.BoolTy, CE->getExprLoc());
+}
+
 ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc,
                                       UnaryOperatorKind Opc, Expr *InputExpr,
                                       bool IsAfterAmp) {
@@ -15753,6 +15818,8 @@ ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc,
         // Vector logical not returns the signed variant of the operand type.
         resultType = GetSignedVectorType(resultType);
         break;
+      } else if (IsAMDGPUPredicateBI(InputExpr)) {
+        break;
       } else {
         return ExprError(Diag(OpLoc, diag::err_typecheck_unary_expr)
                          << resultType << Input.get()->getSourceRange());
@@ -20469,6 +20536,88 @@ void Sema::DiagnoseEqualityWithExtraParens(ParenExpr *ParenE) {
     }
 }
 
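+// Validates the argument of a feature predicate builtin: processor_is takes a
+// string literal naming a valid processor, is_invocable takes the name of a
+// builtin function; everything else gets diagnosed.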
+static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) {
+  if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") {
+    auto GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts());
+    if (!GFX) {
+      Sema.Diag(CE->getExprLoc(),
+                diag::err_amdgcn_processor_is_arg_not_literal);
+      return false;
+    }
+    auto N = GFX->getString();
+    if (!Sema.getASTContext().getTargetInfo().isValidCPUName(N) &&
+        (!Sema.getASTContext().getAuxTargetInfo() ||
+         !Sema.getASTContext().getAuxTargetInfo()->isValidCPUName(N))) {
+      Sema.Diag(CE->getExprLoc(),
+                diag::err_amdgcn_processor_is_arg_invalid_value) << N;
+      return false;
+    }
+  } else {
+    auto Arg = CE->getArg(0);
+    if (!Arg || Arg->getType() != Sema.getASTContext().BuiltinFnTy) {
+      Sema.Diag(CE->getExprLoc(),
+                diag::err_amdgcn_is_invocable_arg_invalid_value) << Arg;
+      return false;
+    }
+  }
+
+  return true;
+}
+
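+// Predicates are only valid as full boolean conditions: bare calls, negations
+// thereof, or logical ops over them; handle (and fold, where possible) these
+// shapes here, and let anything else fall through to the usual diagnostics.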
+static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) {
+  if (auto UO = dyn_cast<UnaryOperator>(E)) {
+    auto SE = dyn_cast<CallExpr>(UO->getSubExpr());
+    if (IsAMDGPUPredicateBI(SE)) {
+      assert(UO->getOpcode() == UnaryOperator::Opcode::UO_LNot &&
+             "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
+             "can only be used as operands of logical ops!");
+
+      if (!ValidateAMDGPUPredicateBI(Sema, SE)) {
+        Invalid = true;
+        return nullptr;
+      }
+
+      UO->setSubExpr(ExpandAMDGPUPredicateBI(Sema.getASTContext(), SE));
+      UO->setType(Sema.getASTContext().getLogicalOperationType());
+
+      return UO;
+    }
+  }
+  if (auto BO = dyn_cast<BinaryOperator>(E)) {
+    auto LHS = dyn_cast<CallExpr>(BO->getLHS());
+    auto RHS = dyn_cast<CallExpr>(BO->getRHS());
+    if (IsAMDGPUPredicateBI(LHS) && IsAMDGPUPredicateBI(RHS)) {
+      assert(BO->isLogicalOp() &&
+             "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
+             "can only be used as operands of logical ops!");
+
+      if (!ValidateAMDGPUPredicateBI(Sema, LHS) ||
+          !ValidateAMDGPUPredicateBI(Sema, RHS)) {
+        Invalid = true;
+        return nullptr;
+      }
+
+      BO->setLHS(ExpandAMDGPUPredicateBI(Sema.getASTContext(), LHS));
+      BO->setRHS(ExpandAMDGPUPredicateBI(Sema.getASTContext(), RHS));
+      BO->setType(Sema.getASTContext().getLogicalOperationType());
+
+      return BO;
+    }
+  }
+  if (auto CE = dyn_cast<CallExpr>(E))
+    if (IsAMDGPUPredicateBI(CE)) {
+      if (!ValidateAMDGPUPredicateBI(Sema, CE)) {
+        Invalid = true;
+        return nullptr;
+      }
+      return ExpandAMDGPUPredicateBI(Sema.getASTContext(), CE);
+    }
+
+  return nullptr;
+}
+
 ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E,
                                        bool IsConstexpr) {
   DiagnoseAssignmentAsCondition(E);
@@ -20480,6 +20629,14 @@ ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E,
   E = result.get();
 
   if (!E->isTypeDependent()) {
+    if (E->getType()->isVoidType()) {
+      bool IsInvalidPredicate = false;
+      if (auto BIC = MaybeHandleAMDGPUPredicateBI(*this, E, IsInvalidPredicate))
+        return BIC;
+      else if (IsInvalidPredicate)
+        return ExprError();
+    }
+
     if (getLangOpts().CPlusPlus)
       return CheckCXXBooleanCondition(E, IsConstexpr); // C++ 6.4p4
 
diff --git a/clang/test/CodeGen/amdgpu-builtin-cpu-is.c b/clang/test/CodeGen/amdgpu-builtin-cpu-is.c
new file mode 100644
index 0000000000000..6e261d9f5d239
--- /dev/null
+++ b/clang/test/CodeGen/amdgpu-builtin-cpu-is.c
@@ -0,0 +1,65 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX900 %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1010 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX1010 %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCNSPIRV %s
+
+// Test that, depending on triple and, if applicable, target-cpu, one of three
+// things happens:
+//    1) for gfx900 we emit a call to trap (concrete target, matches)
+//    2) for gfx1010 we emit an empty kernel (concrete target, does not match)
+//    3) for AMDGCNSPIRV we emit llvm.amdgcn.is.gfx900 as a bool global, and
+//       load from it to provide the condition to a br (abstract target)
+//.
+// AMDGCN-GFX900: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600
+//.
+// AMDGCN-GFX1010: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600
+//.
+// AMDGCNSPIRV: @llvm.amdgcn.is.gfx900 = external addrspace(1) externally_initialized constant i1
+//.
+// AMDGCN-GFX900-LABEL: define dso_local void @foo(
+// AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGCN-GFX900-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-GFX900-NEXT:    call void @llvm.trap()
+// AMDGCN-GFX900-NEXT:    ret void
+//
+// AMDGCN-GFX1010-LABEL: define dso_local void @foo(
+// AMDGCN-GFX1010-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGCN-GFX1010-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-GFX1010-NEXT:    ret void
+//
+// AMDGCNSPIRV-LABEL: define spir_func void @foo(
+// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] {
+// AMDGCNSPIRV-NEXT:  [[ENTRY:.*:]]
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx900, align 1
+// AMDGCNSPIRV-NEXT:    br i1 [[TMP0]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// AMDGCNSPIRV:       [[IF_THEN]]:
+// AMDGCNSPIRV-NEXT:    call addrspace(4) void @llvm.trap()
+// AMDGCNSPIRV-NEXT:    br label %[[IF_END]]
+// AMDGCNSPIRV:       [[IF_END]]:
+// AMDGCNSPIRV-NEXT:    ret void
+//
+void foo() {
+    if (__builtin_cpu_is("gfx900"))
+        return __builtin_trap();
+}
+//.
+// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
+// AMDGCN-GFX900: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
+//.
+// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
+//.
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" }
+// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
+//.
+// AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCN-GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// AMDGCN-GFX900: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
+// AMDGCN-GFX1010: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCN-GFX1010: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// AMDGCN-GFX1010: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
+// AMDGCNSPIRV: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCNSPIRV: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// AMDGCNSPIRV: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
diff --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
new file mode 100644
index 0000000000000..6d2690cb75b7c
--- /dev/null
+++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
@@ -0,0 +1,64 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX900 %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1010 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX1010 %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCNSPIRV %s
+
+// Test that, depending on triple and, if applicable, target-cpu, one of three
+// things happens:
+//    1) for gfx900 we emit an empty kernel (concrete target, lacks feature)
+//    2) for gfx1010 we emit a call to trap (concrete target, has feature)
+//    3) for AMDGCNSPIRV we emit llvm.amdgcn.has.gfx10-insts as a constant
+//       externally initialised bool global, and load from it to provide the
+//       condition to a br (abstract target)
+
+//.
+// AMDGCNSPIRV: @llvm.amdgcn.has.gfx10-insts = external addrspace(1) externally_initialized constant i1
+//.
+// AMDGCN-GFX900-LABEL: define dso_local void @foo(
+// AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGCN-GFX900-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-GFX900-NEXT:    ret void
+//
+// AMDGCN-GFX1010-LABEL: define dso_local void @foo(
+// AMDGCN-GFX1010-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGCN-GFX1010-NEXT:  [[ENTRY:.*:]]
+// AMDGCN-GFX1010-NEXT:    call void @llvm.trap()
+// AMDGCN-GFX1010-NEXT:    ret void
+//
+// AMDGCNSPIRV-LABEL: define spir_func void @foo(
+// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] {
+// AMDGCNSPIRV-NEXT:  [[ENTRY:.*:]]
+// AMDGCNSPIRV-NEXT:    [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx10-insts, align 1
+// AMDGCNSPIRV-NEXT:    [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false
+// AMDGCNSPIRV-NEXT:    br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// AMDGCNSPIRV:       [[IF_THEN]]:
+// AMDGCNSPIRV-NEXT:    call addrspace(4) void @llvm.trap()
+// AMDGCNSPIRV-NEXT:    br label %[[IF_END]]
+// AMDGCNSPIRV:       [[IF_END]]:
+// AMDGCNSPIRV-NEXT:    ret void
+//
+void foo() {
+    if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16))
+        return __builtin_trap();
+}
+//.
+// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
+//.
+// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
+// AMDGCN-GFX1010: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
+//.
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" }
+// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
+//.
+// AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCN-GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// AMDGCN-GFX900: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
+// AMDGCN-GFX1010: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCN-GFX1010: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// AMDGCN-GFX1010: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
+// AMDGCNSPIRV: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// AMDGCNSPIRV: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// AMDGCNSPIRV: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
diff --git a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
new file mode 100644
index 0000000000000..f618f54909b00
--- /dev/null
+++ b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp
@@ -0,0 +1,43 @@
+// RUN: not %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - 2>&1 | FileCheck %s
+// RUN: not %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - 2>&1 | FileCheck %s
+
+bool predicate(bool x) { return x; }
+
+void invalid_uses(int* p, int x, bool (*pfn)(bool)) {
+    // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void'
+    bool invalid_use_in_init_0 = __builtin_amdgcn_processor_is("gfx906");
+    // CHECK: error: cannot initialize a variable of type 'const bool' with an rvalue of type 'void'
+    const bool invalid_use_in_init_1 = !__builtin_amdgcn_processor_is("gfx906");
+    // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void'
+    bool invalid_use_in_init_2 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+    // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void'
+    bool invalid_use_in_init_3 = !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+    // CHECK: error: variable has incomplete type 'const void'
+    const auto invalid_use_in_init_4 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready) || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var);
+    // CHECK: error: variable has incomplete type 'const void'
+    const auto invalid_use_in_init_5 = __builtin_amdgcn_processor_is("gfx906") || __builtin_amdgcn_processor_is("gfx900");
+    // CHECK: error: variable has incomplete type 'const void'
+    const auto invalid_use_in_init_6 = __builtin_amdgcn_processor_is("gfx906") || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep);
+    // CHECK: error: value of type 'void' is not contextually convertible to 'bool'
+    __builtin_amdgcn_processor_is("gfx1201")
+        ? __builtin_amdgcn_s_sleep_var(x) : __builtin_amdgcn_s_sleep(42);
+    // CHECK: error: no matching function for call to 'predicate'
+    if (predicate(__builtin_amdgcn_processor_is("gfx1200"))) __builtin_amdgcn_s_sleep_var(x);
+    // CHECK: note: candidate function not viable: cannot convert argument of incomplete type 'void' to 'bool' for 1st argument
+}
+
+void invalid_invocations(int x, const char* str) {
+    // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a valid AMDGCN processor identifier; 'not_an_amdgcn_gfx_id' is not valid
+    if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return;
+    // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a string literal
+    if (__builtin_amdgcn_processor_is(str)) return;
+
+    // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `"__builtin_amdgcn_s_sleep_var"` is not valid
+    if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return;
+    // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `str` is not valid
+    else if (__builtin_amdgcn_is_invocable(str)) return;
+    // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `x` is not valid
+    else if (__builtin_amdgcn_is_invocable(x)) return;
+    // CHECK: error: use of undeclared identifier '__builtin_ia32_pause'
+    else if (__builtin_amdgcn_is_invocable(__builtin_ia32_pause)) return;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index a8e4ea9429f50..1fe0016723a30 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -408,6 +408,15 @@ extern char &AMDGPUPrintfRuntimeBindingID;
 void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &);
 extern char &AMDGPUResourceUsageAnalysisID;
 
+struct AMDGPUExpandFeaturePredicatesPass
+    : PassInfoMixin<AMDGPUExpandFeaturePredicatesPass> {
+  const AMDGPUTargetMachine &TM;
+  AMDGPUExpandFeaturePredicatesPass(const AMDGPUTargetMachine &ATM) : TM(ATM) {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+  static bool isRequired() { return true; }
+};
+
 struct AMDGPUPrintfRuntimeBindingPass
     : PassInfoMixin<AMDGPUPrintfRuntimeBindingPass> {
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
new file mode 100644
index 0000000000000..125051c6aa0cf
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
@@ -0,0 +1,207 @@
+//===- AMDGPUExpandPseudoIntrinsics.cpp - Pseudo Intrinsic Expander Pass --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file implements a pass that deals with expanding AMDGCN generic pseudo-
+// intrinsics into target-specific quantities / sequences. In this context, a
+// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a
+// specific instruction, but rather is intended as a mechanism for abstractly
+// conveying target-specific info to an HLL / the FE, without concretely
+// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize.
+// This pass should run as early as possible / immediately after Clang CodeGen,
+// so that the optimisation pipeline and the BE operate with concrete target
+// data.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#include <string>
+#include <utility>
+
+using namespace llvm;
+
+namespace {
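+// Clones OldF and marks the clone always_inline (stripping optnone/noinline),
+// so that the AlwaysInliner can be used to inline calls to it on demand; the
+// original function and its attributes are left untouched.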
+inline Function *getCloneForInlining(Function *OldF) {
+  assert(OldF && "Must pass an existing Function!");
+
+  // TODO - Alias Value to clone arg.
+  ValueToValueMapTy VMap;
+
+  auto NewF = CloneFunction(OldF, VMap);
+
+  NewF->removeFnAttr(Attribute::OptimizeNone);
+  NewF->removeFnAttr(Attribute::NoInline);
+  NewF->addFnAttr(Attribute::AlwaysInline);
+
+  return NewF;
+}
+
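+// Iteratively inlines all non-intrinsic, defined callees that use V, so that
+// every remaining use of V is directly visible, then gathers the using
+// instructions into Container for subsequent constant folding.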
+template <typename C>
+inline void collectUsers(Value *V, ModulePassManager &AlwaysInliner,
+                         ModuleAnalysisManager &MAM,
+                         SmallDenseMap<Function *, Function *> &InlinableClones,
+                         C &Container) {
+  assert(V && "Must pass an existing Value!");
+
+  constexpr auto IsValidCall = [](auto &&U) {
+    if (auto CB = dyn_cast<CallBase>(U))
+      if (auto F = CB->getCalledFunction())
+        if (!F->isIntrinsic() && !F->isDeclaration())
+          return true;
+    return false;
+  };
+
+  SmallVector<User *> Calls{};
+  copy_if(V->users(), std::back_inserter(Calls), IsValidCall);
+
+  while (!Calls.empty()) {
+    for (auto &&Call : Calls) {
+      auto CB = cast<CallBase>(Call);
+      auto &TempF = InlinableClones[CB->getCalledFunction()];
+
+      if (!TempF)
+        TempF = getCloneForInlining(CB->getCalledFunction());
+
+      CB->setCalledFunction(TempF);
+      CB->removeFnAttr(Attribute::NoInline);
+      CB->addFnAttr(Attribute::AlwaysInline);
+
+      AlwaysInliner.run(*TempF->getParent(), MAM);
+    }
+
+    Calls.clear();
+
+    copy_if(V->users(), std::back_inserter(Calls), IsValidCall);
+  }
+
+  for (auto &&U : V->users())
+    if (auto I = dyn_cast<Instruction>(U)) {
+      if (auto CB = dyn_cast<CallBase>(I)) {
+        if (CB->getCalledFunction() && !CB->getCalledFunction()->isIntrinsic())
+          Container.insert(Container.end(), I);
+      } else {
+        Container.insert(Container.end(), I);
+      }
+    }
+}
+
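+// Resolves one predicate global against the concrete subtarget: the value is
+// derived from the global's name suffix (a processor name for is.*
+// predicates, a feature list for has.* ones, rewritten into the "+feature"
+// form checkFeatures expects), then all users are constant folded; a user
+// that cannot be folded is a hard error, as no predicate must reach the BE.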
+std::pair<PreservedAnalyses, bool>
+handlePredicate(const GCNSubtarget &ST, ModuleAnalysisManager &MAM,
+                SmallDenseMap<Function *, Function *> &InlinableClones,
+                GlobalVariable *P) {
+  auto PV = P->getName().substr(P->getName().rfind('.') + 1).str();
+  auto Dx = PV.find(',');
+  while (Dx != std::string::npos) {
+    PV.insert(++Dx, {'+'});
+
+    Dx = PV.find(',', Dx);
+  }
+
+  auto PTy = P->getValueType();
+  P->setLinkage(GlobalValue::PrivateLinkage);
+  P->setExternallyInitialized(false);
+
+  if (P->getName().starts_with("llvm.amdgcn.is"))
+    P->setInitializer(ConstantInt::getBool(PTy, PV == ST.getCPU()));
+  else
+    P->setInitializer(ConstantInt::getBool(PTy, ST.checkFeatures('+' + PV)));
+
+  ModulePassManager MPM;
+  MPM.addPass(AlwaysInlinerPass());
+
+  SmallPtrSet<Instruction *, 32> ToFold;
+  collectUsers(P, MPM, MAM, InlinableClones, ToFold);
+
+  if (ToFold.empty())
+    return {PreservedAnalyses::all(), true};
+
+  do {
+    auto I = *ToFold.begin();
+    ToFold.erase(I);
+
+    if (auto C = ConstantFoldInstruction(I, P->getDataLayout())) {
+      collectUsers(I, MPM, MAM, InlinableClones, ToFold);
+      I->replaceAllUsesWith(C);
+      I->eraseFromParent();
+      continue;
+    } else if (I->isTerminator() && ConstantFoldTerminator(I->getParent())) {
+      continue;
+    } else if (I->users().empty()) {
+      continue;
+    }
+
+    std::string W;
+    raw_string_ostream OS(W);
+
+    auto Caller = I->getParent()->getParent();
+
+    OS << "Impossible to constant fold feature predicate: " << P->getName()
+       << ", please simplify.\n";
+
+    Caller->getContext().diagnose(
+        DiagnosticInfoUnsupported(*Caller, W, I->getDebugLoc(), DS_Error));
+
+    return {PreservedAnalyses::none(), false};
+  } while (!ToFold.empty());
+
+  return {PreservedAnalyses::none(), true};
+}
+} // Unnamed namespace.
+
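+// Pass entry point: collect the llvm.amdgcn. prefixed predicate globals,
+// resolve each one against the subtarget of the first non-intrinsic function
+// in the module, fold their users, and discard any clones made for inlining.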
+PreservedAnalyses
+AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) {
+  if (M.empty())
+    return PreservedAnalyses::all();
+
+  SmallVector<GlobalVariable *> Predicates;
+  for (auto &&G : M.globals()) {
+    if (!G.isDeclaration() || !G.hasName())
+      continue;
+    if (G.getName().starts_with("llvm.amdgcn."))
+      Predicates.push_back(&G);
+  }
+
+  if (Predicates.empty())
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses Ret = PreservedAnalyses::all();
+
+  SmallDenseMap<Function *, Function *> InlinableClones;
+  const auto &ST = TM.getSubtarget<GCNSubtarget>(
+      *find_if(M, [](auto &&F) { return !F.isIntrinsic(); }));
+
+  for (auto &&P : Predicates) {
+    auto R = handlePredicate(ST, MAM, InlinableClones, P);
+
+    if (!R.second)
+      return PreservedAnalyses::none();
+
+    Ret.intersect(R.first);
+  }
+
+  for (auto &&C : InlinableClones)
+    C.second->eraseFromParent();
+
+  return Ret;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 6a45392b5f099..c3c9e24c2efa4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -29,6 +29,8 @@ MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
 MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
 MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
+MODULE_PASS("amdgpu-expand-feature-predicates",
+            AMDGPUExpandFeaturePredicatesPass(*this))
 #undef MODULE_PASS
 
 #ifndef MODULE_PASS_WITH_PARAMS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 4937b434bc955..8e8a6e1eda437 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -803,7 +803,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
 #include "llvm/Passes/TargetPassRegistry.inc"
 
   PB.registerPipelineStartEPCallback(
-      [](ModulePassManager &PM, OptimizationLevel Level) {
+      [this](ModulePassManager &PM, OptimizationLevel Level) {
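+        // Expand feature predicates first, so that the remainder of the
+        // pipeline only ever observes concrete, foldable values.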
+        PM.addPass(AMDGPUExpandFeaturePredicatesPass(*this));
         if (EnableHipStdPar)
           PM.addPass(HipStdParAcceleratorCodeSelectionPass());
       });
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 09a3096602fc3..a389200f0db8e 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -54,6 +54,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUCodeGenPrepare.cpp
   AMDGPUCombinerHelper.cpp
   AMDGPUCtorDtorLowering.cpp
+  AMDGPUExpandPseudoIntrinsics.cpp
   AMDGPUExportClustering.cpp
   AMDGPUExportKernelRuntimeHandles.cpp
   AMDGPUFrameLowering.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll
new file mode 100644
index 0000000000000..bfc35d8c76e37
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll
@@ -0,0 +1,28 @@
+; REQUIRES: amdgpu-registered-target
+
+; RUN: not opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' < %s 2>&1 | FileCheck %s
+
+; CHECK: error:{{.*}}in function kernel void (ptr addrspace(1), i32, ptr addrspace(1)): Impossible to constant fold feature predicate: @llvm.amdgcn.is.gfx803 = private addrspace(1) constant i1 false used by   %call = call i1 %1(i1 zeroext false), please simplify.
+
+ at llvm.amdgcn.is.gfx803 = external addrspace(1) externally_initialized constant i1
+
+declare void @llvm.amdgcn.s.sleep(i32 immarg) #1
+
+define amdgpu_kernel void @kernel(ptr addrspace(1) readnone captures(none) %p.coerce, i32 %x, ptr addrspace(1) %pfn.coerce) {
+entry:
+  %0 = ptrtoint ptr addrspace(1) %pfn.coerce to i64
+  %1 = inttoptr i64 %0 to ptr
+  %2 = ptrtoint ptr addrspace(1) %pfn.coerce to i64
+  %3 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx803, align 1
+  %call = call i1 %1(i1 zeroext %3)
+  br i1 %call, label %if.gfx803, label %if.end
+
+if.gfx803:
+  call void @llvm.amdgcn.s.sleep(i32 0)
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+attributes #1 = { nocallback nofree nosync nounwind willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll
new file mode 100644
index 0000000000000..277323c353260
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll
@@ -0,0 +1,359 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; REQUIRES: amdgpu-registered-target
+
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX906 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1010 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1101 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1101 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -mattr=+wavefrontsize64 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201-W64 %s
+
+;; The IR was derived from the following source:
+;; extern "C" __global__ void kernel(int* p, int x)
+;; {
+;;     if (__builtin_amdgcn_processor_is("gfx1201") ||
+;;         __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var))
+;;         __builtin_amdgcn_s_sleep_var(x);
+;;     if (!__builtin_amdgcn_processor_is("gfx906"))
+;;         __builtin_amdgcn_s_wait_event_export_ready();
+;;     else if (__builtin_amdgcn_processor_is("gfx1010") ||
+;;         __builtin_amdgcn_processor_is("gfx1101"))
+;;         __builtin_amdgcn_s_ttracedata_imm(1);
+;;     while (__builtin_amdgcn_processor_is("gfx1101")) *p += x;
+;;     do {
+;;         *p -= x;
+;;     } while (__builtin_amdgcn_processor_is("gfx1010"));
+;;     for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break;
+;;
+;;     if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready))
+;;         __builtin_amdgcn_s_wait_event_export_ready();
+;;     else if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_ttracedata_imm))
+;;         __builtin_amdgcn_s_ttracedata_imm(1);
+;;
+;;     do {
+;;         *p -= x;
+;;     } while (__builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32));
+;;     for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p) break;
+;; }
+
+ at llvm.amdgcn.is.gfx1201 = external addrspace(1) externally_initialized constant i1
+ at llvm.amdgcn.has.gfx12-insts = external addrspace(1) externally_initialized constant i1
+ at llvm.amdgcn.is.gfx906 = external addrspace(1) externally_initialized constant i1
+ at llvm.amdgcn.is.gfx1010 = external addrspace(1) externally_initialized constant i1
+ at llvm.amdgcn.is.gfx1101 = external addrspace(1) externally_initialized constant i1
+ at llvm.amdgcn.has.gfx11-insts = external addrspace(1) externally_initialized constant i1
+ at llvm.amdgcn.has.gfx10-insts = external addrspace(1) externally_initialized constant i1
+@"llvm.amdgcn.has.gfx12-insts,wavefrontsize64" = external addrspace(1) externally_initialized constant i1
+
+declare void @llvm.amdgcn.s.sleep.var(i32)
+declare void @llvm.amdgcn.s.wait.event.export.ready()
+declare void @llvm.amdgcn.s.ttracedata.imm(i16 immarg)
+
+define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) {
+; GFX906-LABEL: define amdgpu_kernel void @kernel(
+; GFX906-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX906-NEXT:  [[ENTRY:.*:]]
+; GFX906-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
+; GFX906-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; GFX906-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS1:.*]]
+; GFX906:       [[IF_GFX1201_OR_GFX12_INSTS1]]:
+; GFX906-NEXT:    br label %[[IF_NOT_GFX906:.*]]
+; GFX906:       [[IF_GFX1201_OR_GFX12_INSTS:.*:]]
+; GFX906-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
+; GFX906-NEXT:    br label %[[IF_NOT_GFX906]]
+; GFX906:       [[IF_NOT_GFX906]]:
+; GFX906-NEXT:    br label %[[IF_GFX1010_OR_GFX1102:.*]]
+; GFX906:       [[IF_NOT_GFX907:.*:]]
+; GFX906-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX906-NEXT:    br label %[[IF_END6:.*]]
+; GFX906:       [[IF_GFX1010_OR_GFX1102]]:
+; GFX906-NEXT:    br label %[[LOR_NOT_GFX1010:.*]]
+; GFX906:       [[LOR_NOT_GFX1010]]:
+; GFX906-NEXT:    br label %[[FOR_COND:.*]]
+; GFX906:       [[IF_GFX1010_OR_GFX1101:.*:]]
+; GFX906-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX906-NEXT:    br label %[[IF_END6]]
+; GFX906:       [[IF_END6]]:
+; GFX906-NEXT:    call void @llvm.assume(i1 true)
+; GFX906-NEXT:    call void @llvm.assume(i1 true)
+; GFX906-NEXT:    br label %[[FOR_COND]]
+; GFX906:       [[FOR_COND]]:
+; GFX906-NEXT:    [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX906-NEXT:    [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
+; GFX906-NEXT:    store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
+; GFX906-NEXT:    br label %[[IF_GFX10_INSTS1:.*]]
+; GFX906:       [[IF_GFX11_INSTS:.*:]]
+; GFX906-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX906-NEXT:    br label %[[IF_END11:.*]]
+; GFX906:       [[IF_GFX10_INSTS1]]:
+; GFX906-NEXT:    br label %[[IF_END11]]
+; GFX906:       [[IF_GFX10_INSTS:.*:]]
+; GFX906-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX906-NEXT:    br label %[[IF_END11]]
+; GFX906:       [[IF_END11]]:
+; GFX906-NEXT:    call void @llvm.assume(i1 true)
+; GFX906-NEXT:    [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX906-NEXT:    [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
+; GFX906-NEXT:    store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4
+; GFX906-NEXT:    ret void
+;
+; GFX1010-LABEL: define amdgpu_kernel void @kernel(
+; GFX1010-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX1010-NEXT:  [[ENTRY:.*:]]
+; GFX1010-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
+; GFX1010-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; GFX1010-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS1:.*]]
+; GFX1010:       [[IF_GFX1201_OR_GFX12_INSTS1]]:
+; GFX1010-NEXT:    br label %[[IF_END:.*]]
+; GFX1010:       [[IF_GFX1201_OR_GFX12_INSTS:.*:]]
+; GFX1010-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
+; GFX1010-NEXT:    br label %[[IF_END]]
+; GFX1010:       [[IF_END]]:
+; GFX1010-NEXT:    br label %[[IF_NOT_GFX907:.*]]
+; GFX1010:       [[IF_NOT_GFX907]]:
+; GFX1010-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1010-NEXT:    br label %[[IF_END6:.*]]
+; GFX1010:       [[IF_NOT_GFX906:.*:]]
+; GFX1010-NEXT:    br label %[[IF_GFX1010_OR_GFX1101:.*]]
+; GFX1010:       [[LOR_NOT_GFX1010:.*:]]
+; GFX1010-NEXT:    br label %[[FOR_COND:.*]]
+; GFX1010:       [[IF_GFX1010_OR_GFX1101]]:
+; GFX1010-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1010-NEXT:    br label %[[IF_END6]]
+; GFX1010:       [[IF_END6]]:
+; GFX1010-NEXT:    call void @llvm.assume(i1 true)
+; GFX1010-NEXT:    call void @llvm.assume(i1 false)
+; GFX1010-NEXT:    br label %[[FOR_COND]]
+; GFX1010:       [[FOR_COND]]:
+; GFX1010-NEXT:    [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1010-NEXT:    [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
+; GFX1010-NEXT:    store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
+; GFX1010-NEXT:    br label %[[IF_ELSE8:.*]]
+; GFX1010:       [[IF_GFX11_INSTS:.*:]]
+; GFX1010-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1010-NEXT:    br label %[[IF_END11:.*]]
+; GFX1010:       [[IF_ELSE8]]:
+; GFX1010-NEXT:    br label %[[IF_GFX10_INSTS:.*]]
+; GFX1010:       [[IF_GFX10_INSTS]]:
+; GFX1010-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1010-NEXT:    br label %[[IF_END11]]
+; GFX1010:       [[IF_END11]]:
+; GFX1010-NEXT:    call void @llvm.assume(i1 true)
+; GFX1010-NEXT:    [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1010-NEXT:    [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
+; GFX1010-NEXT:    store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4
+; GFX1010-NEXT:    ret void
+;
+; GFX1101-LABEL: define amdgpu_kernel void @kernel(
+; GFX1101-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX1101-NEXT:  [[ENTRY:.*:]]
+; GFX1101-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
+; GFX1101-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; GFX1101-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS1:.*]]
+; GFX1101:       [[IF_GFX1201_OR_GFX12_INSTS1]]:
+; GFX1101-NEXT:    br label %[[IF_END:.*]]
+; GFX1101:       [[IF_GFX1201_OR_GFX12_INSTS:.*:]]
+; GFX1101-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
+; GFX1101-NEXT:    br label %[[IF_END]]
+; GFX1101:       [[IF_END]]:
+; GFX1101-NEXT:    br label %[[IF_NOT_GFX907:.*]]
+; GFX1101:       [[IF_NOT_GFX907]]:
+; GFX1101-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1101-NEXT:    br label %[[IF_END6:.*]]
+; GFX1101:       [[IF_NOT_GFX906:.*:]]
+; GFX1101-NEXT:    br label %[[LOR_NOT_GFX1010:.*]]
+; GFX1101:       [[LOR_NOT_GFX1010]]:
+; GFX1101-NEXT:    br label %[[IF_GFX1010_OR_GFX1101:.*]]
+; GFX1101:       [[IF_GFX1010_OR_GFX1101]]:
+; GFX1101-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1101-NEXT:    br label %[[IF_END6]]
+; GFX1101:       [[IF_END6]]:
+; GFX1101-NEXT:    call void @llvm.assume(i1 false)
+; GFX1101-NEXT:    call void @llvm.assume(i1 true)
+; GFX1101-NEXT:    br label %[[FOR_COND:.*]]
+; GFX1101:       [[FOR_COND]]:
+; GFX1101-NEXT:    [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1101-NEXT:    [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
+; GFX1101-NEXT:    store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
+; GFX1101-NEXT:    br label %[[IF_GFX11_INSTS:.*]]
+; GFX1101:       [[IF_GFX11_INSTS]]:
+; GFX1101-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1101-NEXT:    br label %[[IF_END11:.*]]
+; GFX1101:       [[IF_ELSE8:.*:]]
+; GFX1101-NEXT:    br label %[[IF_GFX10_INSTS:.*]]
+; GFX1101:       [[IF_GFX10_INSTS]]:
+; GFX1101-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1101-NEXT:    br label %[[IF_END11]]
+; GFX1101:       [[IF_END11]]:
+; GFX1101-NEXT:    call void @llvm.assume(i1 true)
+; GFX1101-NEXT:    [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1101-NEXT:    [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
+; GFX1101-NEXT:    store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4
+; GFX1101-NEXT:    ret void
+;
+; GFX1201-LABEL: define amdgpu_kernel void @kernel(
+; GFX1201-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX1201-NEXT:  [[ENTRY:.*:]]
+; GFX1201-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
+; GFX1201-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; GFX1201-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]]
+; GFX1201:       [[LOR_NOT_GFX1201:.*:]]
+; GFX1201-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS]]
+; GFX1201:       [[IF_GFX1201_OR_GFX12_INSTS]]:
+; GFX1201-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
+; GFX1201-NEXT:    br label %[[IF_END:.*]]
+; GFX1201:       [[IF_END]]:
+; GFX1201-NEXT:    br label %[[IF_NOT_GFX907:.*]]
+; GFX1201:       [[IF_NOT_GFX907]]:
+; GFX1201-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1201-NEXT:    br label %[[IF_END6:.*]]
+; GFX1201:       [[IF_NOT_GFX906:.*:]]
+; GFX1201-NEXT:    br label %[[IF_GFX1010_OR_GFX1102:.*]]
+; GFX1201:       [[IF_GFX1010_OR_GFX1102]]:
+; GFX1201-NEXT:    br label %[[FOR_COND:.*]]
+; GFX1201:       [[IF_GFX1010_OR_GFX1101:.*:]]
+; GFX1201-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1201-NEXT:    br label %[[IF_END6]]
+; GFX1201:       [[IF_END6]]:
+; GFX1201-NEXT:    call void @llvm.assume(i1 true)
+; GFX1201-NEXT:    call void @llvm.assume(i1 true)
+; GFX1201-NEXT:    br label %[[FOR_COND]]
+; GFX1201:       [[FOR_COND]]:
+; GFX1201-NEXT:    [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1201-NEXT:    [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
+; GFX1201-NEXT:    store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
+; GFX1201-NEXT:    br label %[[IF_GFX11_INSTS:.*]]
+; GFX1201:       [[IF_GFX11_INSTS]]:
+; GFX1201-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1201-NEXT:    br label %[[IF_END11:.*]]
+; GFX1201:       [[IF_ELSE8:.*:]]
+; GFX1201-NEXT:    br label %[[IF_GFX10_INSTS:.*]]
+; GFX1201:       [[IF_GFX10_INSTS]]:
+; GFX1201-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1201-NEXT:    br label %[[IF_END11]]
+; GFX1201:       [[IF_END11]]:
+; GFX1201-NEXT:    call void @llvm.assume(i1 true)
+; GFX1201-NEXT:    [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1201-NEXT:    [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
+; GFX1201-NEXT:    store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4
+; GFX1201-NEXT:    ret void
+;
+; GFX1201-W64-LABEL: define amdgpu_kernel void @kernel(
+; GFX1201-W64-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX1201-W64-NEXT:  [[ENTRY:.*:]]
+; GFX1201-W64-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64
+; GFX1201-W64-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; GFX1201-W64-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]]
+; GFX1201-W64:       [[LOR_NOT_GFX1201:.*:]]
+; GFX1201-W64-NEXT:    br label %[[IF_GFX1201_OR_GFX12_INSTS]]
+; GFX1201-W64:       [[IF_GFX1201_OR_GFX12_INSTS]]:
+; GFX1201-W64-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 [[X]])
+; GFX1201-W64-NEXT:    br label %[[IF_END:.*]]
+; GFX1201-W64:       [[IF_END]]:
+; GFX1201-W64-NEXT:    br label %[[IF_NOT_GFX907:.*]]
+; GFX1201-W64:       [[IF_NOT_GFX907]]:
+; GFX1201-W64-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1201-W64-NEXT:    br label %[[IF_END6:.*]]
+; GFX1201-W64:       [[IF_NOT_GFX906:.*:]]
+; GFX1201-W64-NEXT:    br label %[[IF_GFX1010_OR_GFX1102:.*]]
+; GFX1201-W64:       [[IF_GFX1010_OR_GFX1102]]:
+; GFX1201-W64-NEXT:    br label %[[FOR_COND:.*]]
+; GFX1201-W64:       [[IF_GFX1010_OR_GFX1101:.*:]]
+; GFX1201-W64-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1201-W64-NEXT:    br label %[[IF_END6]]
+; GFX1201-W64:       [[IF_END6]]:
+; GFX1201-W64-NEXT:    call void @llvm.assume(i1 true)
+; GFX1201-W64-NEXT:    call void @llvm.assume(i1 true)
+; GFX1201-W64-NEXT:    br label %[[FOR_COND]]
+; GFX1201-W64:       [[FOR_COND]]:
+; GFX1201-W64-NEXT:    [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1201-W64-NEXT:    [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]]
+; GFX1201-W64-NEXT:    store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4
+; GFX1201-W64-NEXT:    br label %[[IF_GFX11_INSTS:.*]]
+; GFX1201-W64:       [[IF_GFX11_INSTS]]:
+; GFX1201-W64-NEXT:    call void @llvm.amdgcn.s.wait.event.export.ready()
+; GFX1201-W64-NEXT:    br label %[[IF_END11:.*]]
+; GFX1201-W64:       [[IF_ELSE8:.*:]]
+; GFX1201-W64-NEXT:    br label %[[IF_GFX10_INSTS:.*]]
+; GFX1201-W64:       [[IF_GFX10_INSTS]]:
+; GFX1201-W64-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+; GFX1201-W64-NEXT:    br label %[[IF_END11]]
+; GFX1201-W64:       [[IF_END11]]:
+; GFX1201-W64-NEXT:    call void @llvm.assume(i1 false)
+; GFX1201-W64-NEXT:    [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4
+; GFX1201-W64-NEXT:    [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]]
+; GFX1201-W64-NEXT:    store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4
+; GFX1201-W64-NEXT:    ret void
+;
+entry:
+  %0 = ptrtoint ptr addrspace(1) %p.coerce to i64
+  %1 = inttoptr i64 %0 to ptr
+  %2 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1201, align 1
+  br i1 %2, label %if.gfx1201.or.gfx12-insts, label %lor.not.gfx1201
+
+lor.not.gfx1201:
+  %3 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx12-insts, align 1
+  br i1 %3, label %if.gfx1201.or.gfx12-insts, label %if.end
+
+if.gfx1201.or.gfx12-insts:
+  call void @llvm.amdgcn.s.sleep.var(i32 %x)
+  br label %if.end
+
+if.end:
+  %4 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx906, align 1
+  br i1 %4, label %if.gfx906, label %if.not.gfx906
+
+if.not.gfx906:
+  call void @llvm.amdgcn.s.wait.event.export.ready()
+  br label %if.end6
+
+if.gfx906:
+  %5 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1010, align 1
+  br i1 %5, label %if.gfx1010.or.gfx1101, label %lor.not.gfx1010
+
+lor.not.gfx1010:
+  %6 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1101, align 1
+  br i1 %6, label %if.gfx1010.or.gfx1101, label %for.cond
+
+if.gfx1010.or.gfx1101:
+  call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+  br label %if.end6
+
+if.end6:
+  %.pr.pr = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1101, align 1
+  %7 = icmp ne i1 %.pr.pr, true
+  call void @llvm.assume(i1 %7)
+  %.pr6.pr = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1010, align 1
+  %8 = icmp ne i1 %.pr6.pr, true
+  call void @llvm.assume(i1 %8)
+  br label %for.cond
+
+for.cond:
+  %.promoted = load i32, ptr %1, align 4
+  %sub.peel = sub nsw i32 %.promoted, %x
+  store i32 %sub.peel, ptr %1, align 4
+  %9 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx11-insts, align 1
+  br i1 %9, label %if.gfx11-insts, label %if.else8
+
+if.gfx11-insts:
+  call void @llvm.amdgcn.s.wait.event.export.ready()
+  br label %if.end11
+
+if.else8:
+  %10 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx10-insts, align 1
+  br i1 %10, label %if.gfx10-insts, label %if.end11
+
+if.gfx10-insts:
+  call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
+  br label %if.end11
+
+if.end11:
+  %.pr7 = load i1, ptr addrspace(1) @"llvm.amdgcn.has.gfx12-insts,wavefrontsize64", align 1
+  %11 = icmp ne i1 %.pr7, true
+  call void @llvm.assume(i1 %11)
+  %.promoted9 = load i32, ptr %1, align 4
+  %sub13.peel = sub nsw i32 %.promoted9, %x
+  store i32 %sub13.peel, ptr %1, align 4
+  ret void
+}
+
+declare void @llvm.assume(i1 noundef)

>From 8bf116837e2bd77ff5906d025fdb80bfa5507382 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 03:39:32 +0100
Subject: [PATCH 2/4] Fix format.

---
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp |  8 ++++----
 clang/lib/Sema/SemaExpr.cpp                 | 20 ++++++++++----------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 7b1a3815144b4..8ad1ab74f221d 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -292,8 +292,8 @@ static Value *GetOrInsertAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) {
   P->setConstant(true);
   P->setExternallyInitialized(true);
 
-  return CGF.Builder.CreateLoad(RawAddress(P, PTy, CharUnits::One(),
-                                           KnownNonNull));
+  return CGF.Builder.CreateLoad(
+      RawAddress(P, PTy, CharUnits::One(), KnownNonNull));
 }
 
 Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
@@ -600,7 +600,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_processor_is: {
     assert(CGM.getTriple().isSPIRV() &&
            "__builtin_amdgcn_processor_is should never reach CodeGen for "
-             "concrete targets!");
+           "concrete targets!");
     StringRef Proc = cast<clang::StringLiteral>(E->getArg(0))->getString();
     return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.is." + Proc);
   }
@@ -609,7 +609,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
            "__builtin_amdgcn_is_invocable should never reach CodeGen for "
            "concrete targets!");
     auto FD = cast<FunctionDecl>(
-      cast<DeclRefExpr>(E->getArg(0))->getReferencedDeclOfCallee());
+        cast<DeclRefExpr>(E->getArg(0))->getReferencedDeclOfCallee());
     StringRef RF =
         getContext().BuiltinInfo.getRequiredFeatures(FD->getBuiltinID());
     return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.has." + RF);
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 24f5262ab3cf4..bd0183ae4fb82 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -20549,14 +20549,16 @@ static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) {
         (!Sema.getASTContext().getAuxTargetInfo() ||
          !Sema.getASTContext().getAuxTargetInfo()->isValidCPUName(N))) {
       Sema.Diag(CE->getExprLoc(),
-                diag::err_amdgcn_processor_is_arg_invalid_value) << N;
+                diag::err_amdgcn_processor_is_arg_invalid_value)
+          << N;
       return false;
     }
   } else {
     auto Arg = CE->getArg(0);
     if (!Arg || Arg->getType() != Sema.getASTContext().BuiltinFnTy) {
       Sema.Diag(CE->getExprLoc(),
-                diag::err_amdgcn_is_invocable_arg_invalid_value) << Arg;
+                diag::err_amdgcn_is_invocable_arg_invalid_value)
+          << Arg;
       return false;
     }
   }
@@ -20568,10 +20570,9 @@ static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) {
   if (auto UO = dyn_cast<UnaryOperator>(E)) {
     auto SE = dyn_cast<CallExpr>(UO->getSubExpr());
     if (IsAMDGPUPredicateBI(SE)) {
-      assert(
-        UO->getOpcode() == UnaryOperator::Opcode::UO_LNot &&
-        "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
-          "can only be used as operands of logical ops!");
+      assert(UO->getOpcode() == UnaryOperator::Opcode::UO_LNot &&
+             "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
+             "can only be used as operands of logical ops!");
 
       if (!ValidateAMDGPUPredicateBI(Sema, SE)) {
         Invalid = true;
@@ -20588,10 +20589,9 @@ static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) {
     auto LHS = dyn_cast<CallExpr>(BO->getLHS());
     auto RHS = dyn_cast<CallExpr>(BO->getRHS());
     if (IsAMDGPUPredicateBI(LHS) && IsAMDGPUPredicateBI(RHS)) {
-      assert(
-          BO->isLogicalOp() &&
-          "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
-            "can only be used as operands of logical ops!");
+      assert(BO->isLogicalOp() &&
+             "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable "
+             "can only be used as operands of logical ops!");
 
       if (!ValidateAMDGPUPredicateBI(Sema, LHS) ||
           !ValidateAMDGPUPredicateBI(Sema, RHS)) {

>From 3421292b6e3261410734fb5a324f7dec79080fc1 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 03:42:24 +0100
Subject: [PATCH 3/4] Fix broken patch merge.

---
 .../AMDGPU/AMDGPUExpandFeaturePredicates.cpp  | 159 ++++++++++++++
 .../AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp   | 207 ------------------
 2 files changed, 159 insertions(+), 207 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
 delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
new file mode 100644
index 0000000000000..17357c452b6d3
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -0,0 +1,159 @@
+//===- AMDGPUExpandFeaturePredicates.cpp - Feature Predicate Expander Pass ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file implements a pass that expands AMDGCN generic feature predicates
+// into target specific quantities / sequences. In this context, a generic
+// feature predicate is an implementation detail global variable that is
+// inserted by the FE as a consequence of using either the
+// __builtin_amdgcn_processor_is or the __builtin_amdgcn_is_invocable special
+// builtins on an abstract target (AMDGCNSPIRV). These placeholder globals
+// guide target specific lowering once the concrete target is known, by
+// constant folding their value all the way into a terminator (i.e. a
+// controlled block) or until no live uses remain. The pass makes a best
+// effort attempt to look through calls, i.e. a constant evaluatable
+// passthrough of a predicate value will generally work; however, we hard
+// fail if the folding fails, to avoid obtuse BE errors or opaque run time
+// errors. This pass should run as early as possible / immediately after
+// Clang CodeGen, so that the optimisation pipeline and the BE operate with
+// concrete target data.
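+//
+// For example, a guarded region such as:
+//
+//   %p = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1201, align 1
+//   br i1 %p, label %then, label %else
+//
+// folds, once the concrete target is known to be gfx1201, into an
+// unconditional branch to %then, after which the placeholder global is
+// erased.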
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#include <string>
+#include <utility>
+
+using namespace llvm;
+
+namespace {
+template <typename C>
+void collectUsers(Value *V, C &Container) {
+  assert(V && "Must pass an existing Value!");
+
+  for (auto &&U : V->users())
+    if (auto I = dyn_cast<Instruction>(U))
+      Container.insert(Container.end(), I);
+}
+
+inline void setPredicate(const GCNSubtarget &ST, GlobalVariable *P) {
+  const auto IsFeature = P->getName().starts_with("llvm.amdgcn.has");
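+  // sizeof counts the terminating NUL, so Offset also skips the '.' that
+  // separates the prefix from the processor or feature name.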
+  const auto Offset =
+      IsFeature ? sizeof("llvm.amdgcn.has") : sizeof("llvm.amdgcn.is");
+
+  auto PV = P->getName().substr(Offset).str();
+  if (IsFeature) {
+    auto Dx = PV.find(',');
+    while (Dx != std::string::npos) {
+      PV.insert(++Dx, {'+'});
+
+      Dx = PV.find(',', Dx);
+    }
+    PV.insert(PV.cbegin(), '+');
+  }
+
+  auto PTy = P->getValueType();
+  P->setLinkage(GlobalValue::PrivateLinkage);
+  P->setExternallyInitialized(false);
+
+  if (IsFeature)
+    P->setInitializer(ConstantInt::getBool(PTy, ST.checkFeatures(PV)));
+  else
+    P->setInitializer(ConstantInt::getBool(PTy, PV == ST.getCPU()));
+}
+
+std::pair<PreservedAnalyses, bool>
+unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) {
+  std::string W;
+  raw_string_ostream OS(W);
+
+  OS << "Impossible to constant fold feature predicate: " << *P
+     << " used by " << *NoFold << ", please simplify.\n";
+
+  Caller->getContext().diagnose(
+      DiagnosticInfoUnsupported(*Caller, W, NoFold->getDebugLoc(), DS_Error));
+
+  return {PreservedAnalyses::none(), false};
+}
+
+std::pair<PreservedAnalyses, bool>
+handlePredicate(const GCNSubtarget &ST, GlobalVariable *P) {
+  setPredicate(ST, P);
+
+  SmallPtrSet<Instruction *, 32> ToFold;
+  collectUsers(P, ToFold);
+
+  if (ToFold.empty())
+    return {PreservedAnalyses::all(), true};
+
+  do {
+    auto I = *ToFold.begin();
+    ToFold.erase(I);
+
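+    // Try, in order: fold the instruction itself to a constant (queueing its
+    // users for further folding), fold a terminator whose condition has just
+    // become constant, or skip an instruction that no longer has any uses.
+    // Any other outcome means the predicate escaped folding, so diagnose it.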
+    if (auto C = ConstantFoldInstruction(I, P->getDataLayout())) {
+      collectUsers(I, ToFold);
+      I->replaceAllUsesWith(C);
+      I->eraseFromParent();
+      continue;
+    } else if (I->isTerminator() && ConstantFoldTerminator(I->getParent())) {
+      continue;
+    } else if (I->users().empty()) {
+      continue;
+    }
+
+    return unfoldableFound(I->getParent()->getParent(), P, I);
+  } while (!ToFold.empty());
+
+  return {PreservedAnalyses::none(), true};
+}
+} // Unnamed namespace.
+
+PreservedAnalyses
+AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) {
+  if (M.empty())
+    return PreservedAnalyses::all();
+
+  SmallVector<GlobalVariable *> Predicates;
+  for (auto &&G : M.globals()) {
+    if (!G.isDeclaration() || !G.hasName())
+      continue;
+    if (G.getName().starts_with("llvm.amdgcn."))
+      Predicates.push_back(&G);
+  }
+
+  if (Predicates.empty())
+    return PreservedAnalyses::all();
+
+  const auto &ST = TM.getSubtarget<GCNSubtarget>(
+      *find_if(M, [](auto &&F) { return !F.isIntrinsic(); }));
+
+  auto Ret = PreservedAnalyses::all();
+  for (auto &&P : Predicates) {
+    auto R = handlePredicate(ST, P);
+
+    if (!R.second)
+      break;
+
+    Ret.intersect(R.first);
+  }
+
+  for (auto &&P : Predicates)
+    P->eraseFromParent();
+
+  return Ret;
+}
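
A note on the string surgery in setPredicate above: it rewrites the
comma-separated required-features list recorded in the predicate's name,
e.g. "gfx12-insts,wavefrontsize64", into the '+'-prefixed form that
GCNSubtarget::checkFeatures expects. A standalone sketch of just that
transformation, using a hypothetical helper name that is not part of the
patch:

  #include <cassert>
  #include <string>

  static std::string mangleFeatures(std::string PV) {
    // Insert a '+' after every ',', then prefix the first feature as well,
    // turning "a,b" into "+a,+b".
    auto Dx = PV.find(',');
    while (Dx != std::string::npos) {
      PV.insert(++Dx, 1, '+');
      Dx = PV.find(',', Dx);
    }
    PV.insert(PV.begin(), '+');
    return PV;
  }

  int main() {
    assert(mangleFeatures("gfx12-insts,wavefrontsize64") ==
           "+gfx12-insts,+wavefrontsize64");
  }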
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
deleted file mode 100644
index 125051c6aa0cf..0000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
+++ /dev/null
@@ -1,207 +0,0 @@
-//===- AMDGPUExpandPseudoIntrinsics.cpp - Pseudo Intrinsic Expander Pass --===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// This file implements a pass that deals with expanding AMDGCN generic pseudo-
-// intrinsics into target specific quantities / sequences. In this context, a
-// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a
-// specific instruction, but rather is intended as a mechanism for abstractly
-// conveying target specific info to a HLL / the FE, without concretely
-// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize.
-// This pass should run as early as possible / immediately after Clang CodeGen,
-// so that the optimisation pipeline and the BE operate with concrete target
-// data.
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
-#include "GCNSubtarget.h"
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Target/TargetIntrinsicInfo.h"
-#include "llvm/Transforms/IPO/AlwaysInliner.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-#include <string>
-#include <utility>
-
-using namespace llvm;
-
-namespace {
-inline Function *getCloneForInlining(Function *OldF) {
-  assert(OldF && "Must pass an existing Function!");
-
-  // TODO - Alias Value to clone arg.
-  ValueToValueMapTy VMap;
-
-  auto NewF = CloneFunction(OldF, VMap);
-
-  NewF->removeFnAttr(Attribute::OptimizeNone);
-  NewF->removeFnAttr(Attribute::NoInline);
-  NewF->addFnAttr(Attribute::AlwaysInline);
-
-  return NewF;
-}
-
-template <typename C>
-inline void collectUsers(Value *V, ModulePassManager &AlwaysInliner,
-                         ModuleAnalysisManager &MAM,
-                         SmallDenseMap<Function *, Function *> &InlinableClones,
-                         C &Container) {
-  assert(V && "Must pass an existing Value!");
-
-  auto A = PreservedAnalyses::all();
-
-  constexpr auto IsValidCall = [](auto &&U) {
-    if (auto CB = dyn_cast<CallBase>(U))
-      if (auto F = CB->getCalledFunction())
-        if (!F->isIntrinsic() && !F->isDeclaration())
-          return true;
-    return false;
-  };
-
-  SmallVector<User *> Calls{};
-  copy_if(V->users(), std::back_inserter(Calls), IsValidCall);
-
-  while (!Calls.empty()) {
-    for (auto &&Call : Calls) {
-      auto CB = cast<CallBase>(Call);
-      auto &TempF = InlinableClones[CB->getCalledFunction()];
-
-      if (!TempF)
-        TempF = getCloneForInlining(CB->getCalledFunction());
-
-      CB->setCalledFunction(TempF);
-      CB->removeFnAttr(Attribute::NoInline);
-      CB->addFnAttr(Attribute::AlwaysInline);
-
-      AlwaysInliner.run(*TempF->getParent(), MAM);
-    }
-
-    Calls.clear();
-
-    copy_if(V->users(), std::back_inserter(Calls), IsValidCall);
-  }
-
-  for (auto &&U : V->users())
-    if (auto I = dyn_cast<Instruction>(U)) {
-      if (auto CB = dyn_cast<CallBase>(I)) {
-        if (CB->getCalledFunction() && !CB->getCalledFunction()->isIntrinsic())
-          Container.insert(Container.end(), I);
-      } else {
-        Container.insert(Container.end(), I);
-      }
-    }
-}
-
-std::pair<PreservedAnalyses, bool>
-handlePredicate(const GCNSubtarget &ST, ModuleAnalysisManager &MAM,
-                SmallDenseMap<Function *, Function *>& InlinableClones,
-                GlobalVariable *P) {
-  auto PV = P->getName().substr(P->getName().rfind('.') + 1).str();
-  auto Dx = PV.find(',');
-  while (Dx != std::string::npos) {
-    PV.insert(++Dx, {'+'});
-
-    Dx = PV.find(',', Dx);
-  }
-
-  auto PTy = P->getValueType();
-  P->setLinkage(GlobalValue::PrivateLinkage);
-  P->setExternallyInitialized(false);
-
-  if (P->getName().starts_with("llvm.amdgcn.is"))
-    P->setInitializer(ConstantInt::getBool(PTy, PV == ST.getCPU()));
-  else
-    P->setInitializer(ConstantInt::getBool(PTy, ST.checkFeatures('+' + PV)));
-
-  ModulePassManager MPM;
-  MPM.addPass(AlwaysInlinerPass());
-
-  SmallPtrSet<Instruction *, 32> ToFold;
-  collectUsers(P, MPM, MAM, InlinableClones, ToFold);
-
-  if (ToFold.empty())
-    return {PreservedAnalyses::all(), true};
-
-  do {
-    auto I = *ToFold.begin();
-    ToFold.erase(I);
-
-    if (auto C = ConstantFoldInstruction(I, P->getDataLayout())) {
-      collectUsers(I, MPM, MAM, InlinableClones, ToFold);
-      I->replaceAllUsesWith(C);
-      I->eraseFromParent();
-      continue;
-    } else if (I->isTerminator() && ConstantFoldTerminator(I->getParent())) {
-      continue;
-    } else if (I->users().empty()) {
-      continue;
-    }
-
-    std::string W;
-    raw_string_ostream OS(W);
-
-    auto Caller = I->getParent()->getParent();
-
-    OS << "Impossible to constant fold feature predicate: " << P->getName()
-       << ", please simplify.\n";
-
-    Caller->getContext().diagnose(
-        DiagnosticInfoUnsupported(*Caller, W, I->getDebugLoc(), DS_Error));
-
-    return {PreservedAnalyses::none(), false};
-  } while (!ToFold.empty());
-
-  return {PreservedAnalyses::none(), true};
-}
-} // Unnamed namespace.
-
-PreservedAnalyses
-AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &MAM) {
-  if (M.empty())
-    return PreservedAnalyses::all();
-
-  SmallVector<GlobalVariable *> Predicates;
-  for (auto &&G : M.globals()) {
-    if (!G.isDeclaration() || !G.hasName())
-      continue;
-    if (G.getName().starts_with("llvm.amdgcn."))
-      Predicates.push_back(&G);
-  }
-
-  if (Predicates.empty())
-    return PreservedAnalyses::all();
-
-  PreservedAnalyses Ret = PreservedAnalyses::all();
-
-  SmallDenseMap<Function *, Function *> InlinableClones;
-  const auto &ST = TM.getSubtarget<GCNSubtarget>(
-      *find_if(M, [](auto &&F) { return !F.isIntrinsic(); }));
-
-  for (auto &&P : Predicates) {
-    auto R = handlePredicate(ST, MAM, InlinableClones, P);
-
-    if (!R.second)
-      return PreservedAnalyses::none();
-
-    Ret.intersect(R.first);
-  }
-
-  for (auto &&C : InlinableClones)
-    C.second->eraseFromParent();
-
-  return Ret;
-}

>From 539c7e6c6357fa7330de9e23fa13cf795061b85b Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 2 Apr 2025 03:51:08 +0100
Subject: [PATCH 4/4] Add release notes.

---
 clang/docs/ReleaseNotes.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c4e82678949ff..005b33da29d2d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -418,6 +418,10 @@ AMDGPU Support
 ^^^^^^^^^^^^^^
 
 - Bump the default code object version to 6. ROCm 6.3 is required to run any program compiled with COV6.
+- Introduced a new target specific builtin ``__builtin_amdgcn_processor_is``,
+  a late / deferred query for the current target processor.
+- Introduced a new target specific builtin ``__builtin_amdgcn_is_invocable``,
+  which enables fine-grained, per-builtin, feature availability queries.
 
 NVPTX Support
 ^^^^^^^^^^^^^^
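
As a usage-oriented complement to the release notes, a minimal device-side
sketch of the guard pattern these builtins enable when compiling for
AMDGCNSPIRV (the specific guarded builtin is inferred from the tests in this
series, so treat it as illustrative rather than normative):

  // Emit a thread-trace marker only where the builtin is available; on
  // other processors the predicate folds to false and the guarded call is
  // dropped by the expansion pass.
  void emit_trace_marker() {
    if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_ttracedata_imm))
      __builtin_amdgcn_s_ttracedata_imm(1);
  }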


