[llvm] 0c583e7 - AMDGPU: Add llvm.amdgcn.s.wait.event intrinsic (#180170)

Sun Feb 8 23:45:20 PST 2026

Author: Matt Arsenault
Date: 2026-02-09T08:45:13+01:00
New Revision: 0c583e784ecec8fbc8115217710d364edc4929b4

URL: https://github.com/llvm/llvm-project/commit/0c583e784ecec8fbc8115217710d364edc4929b4
DIFF: https://github.com/llvm/llvm-project/commit/0c583e784ecec8fbc8115217710d364edc4929b4.diff

LOG: AMDGPU: Add llvm.amdgcn.s.wait.event intrinsic (#180170)

Exactly match the s_wait_event instruction. For some reason we already
had this instruction used through llvm.amdgcn.s.wait.event.export.ready,
but that hardcodes a specific value. This should really be a bitmask that
can combine multiple wait types.

gfx11 -> gfx12 broke compatibility in a weird way, by inverting the
interpretation of the bit but also shifting the used bit by 1. Simplify
the selection of the old intrinsic by just using the magic number 2,
which should satisfy both cases.

Added: 
    clang/test/SemaOpenCL/builtins-amdgcn-s-wait-event.cl

Modified: 
    clang/include/clang/Basic/BuiltinsAMDGPU.td
    clang/include/clang/Basic/DiagnosticSemaKinds.td
    clang/lib/Sema/SemaAMDGPU.cpp
    llvm/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/lib/Target/AMDGPU/SOPInstructions.td
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index 740d136f465c1..17f081a906364 100644

--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -339,6 +339,7 @@ def __builtin_amdgcn_image_bvh_intersect_ray_lh : AMDGPUBuiltin<"_ExtVector<4, u
 // TODO: This is a no-op in wave32. Should the builtin require wavefrontsize64?
 def __builtin_amdgcn_permlane64 : AMDGPUBuiltin<"unsigned int(unsigned int)", [Const], "gfx11-insts">;
 def __builtin_amdgcn_s_wait_event_export_ready : AMDGPUBuiltin<"void()", [], "gfx11-insts">;
+def __builtin_amdgcn_s_wait_event : AMDGPUBuiltin<"void(_Constant short)", [], "gfx11-insts">;
 
 //===----------------------------------------------------------------------===//
 // WMMA builtins.

diff  --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 96a1e3481b2ca..f12677ac11600 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13931,6 +13931,13 @@ def note_amdgcn_load_lds_size_valid_value : Note<"size must be %select{1, 2, or
 
 def err_amdgcn_coop_atomic_invalid_as : Error<"cooperative atomic requires a global or generic pointer">;
 
+def warn_amdgpu_s_wait_event_mask_no_effect_target :
+  Warning<"event mask has no effect for target">,
+  InGroup<DiagGroup<"amdgpu-wait-event-mask">>;
+
+def note_amdgpu_s_wait_event_suggested_value :
+  Note<"value of 2 valid for export_ready for gfx11 and gfx12+">;
+
 def warn_comparison_in_enum_initializer : Warning<
   "comparison operator '%0' is potentially a typo for a shift operator '%1'">,
   InGroup<DiagGroup<"enum-compare-typo">>;

diff  --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 4261e1849133f..cec8f9d2675e6 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -89,6 +89,30 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_s_setreg:
     return SemaRef.BuiltinConstantArgRange(TheCall, /*ArgNum=*/0, /*Low=*/0,
                                            /*High=*/UINT16_MAX);
+  case AMDGPU::BI__builtin_amdgcn_s_wait_event: {
+    llvm::APSInt Result;
+    if (SemaRef.BuiltinConstantArg(TheCall, 0, Result))
+      return true;
+
+    bool IsGFX12Plus = Builtin::evaluateRequiredTargetFeatures(
+        "gfx12-insts", CallerFeatureMap);
+
+    // gfx11 -> gfx12 changed the interpretation of the bitmask. gfx12 inverted
+    // the interpretation for export_ready, but shifted the used bit by 1. Thus
+    // waiting for the export_ready event can use a value of 2 universally.
+    if (((IsGFX12Plus && !Result[1]) || (!IsGFX12Plus && Result[0])) ||
+        Result.getZExtValue() > 2) {
+      Expr *ArgExpr = TheCall->getArg(0);
+      SemaRef.targetDiag(ArgExpr->getExprLoc(),
+                         diag::warn_amdgpu_s_wait_event_mask_no_effect_target)
+          << ArgExpr->getSourceRange();
+      SemaRef.targetDiag(ArgExpr->getExprLoc(),
+                         diag::note_amdgpu_s_wait_event_suggested_value)
+          << ArgExpr->getSourceRange();
+    }
+
+    return false;
+  }
   case AMDGPU::BI__builtin_amdgcn_mov_dpp:
     return checkMovDPPFunctionCall(TheCall, 5, 1);
   case AMDGPU::BI__builtin_amdgcn_mov_dpp8:

diff  --git a/clang/test/SemaOpenCL/builtins-amdgcn-s-wait-event.cl b/clang/test/SemaOpenCL/builtins-amdgcn-s-wait-event.cl
new file mode 100644
index 0000000000000..1a9d40cf90cbe
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-s-wait-event.cl
@@ -0,0 +1,27 @@
+// xUN: %clang_cc1 -fsyntax-only -triple amdgcn-- -target-cpu gfx1100 -verify=ALL,GFX11 %s
+// RUN: %clang_cc1 -fsyntax-only -triple amdgcn-- -target-cpu gfx1200 -verify=ALL,GFX12 %s
+
+void test(int x) {
+  // ALL-error@+1 {{argument to '__builtin_amdgcn_s_wait_event' must be a constant integer}}
+  __builtin_amdgcn_s_wait_event(x);
+
+  // GFX11-expected-no-diagnostics
+  // GFX12-warning@+2 {{event mask has no effect for target}}
+  // GFX12-note@+1 {{value of 2 valid for export_ready for gfx11 and gfx12+}}
+  __builtin_amdgcn_s_wait_event(0); // 0 does nothing on gfx12
+
+  // GFX11-expected-no-diagnostics
+  // GFX12-warning@+2 {{event mask has no effect for target}}
+  // GFX12-note@+1 {{value of 2 valid for export_ready for gfx11 and gfx12+}}
+  __builtin_amdgcn_s_wait_event(1); // 1 does nothing on gfx11
+
+  __builtin_amdgcn_s_wait_event(2); // expected-no-diagnostics
+
+  // ALL-warning@+2 {{event mask has no effect for target}}
+  // ALL-note@+1 {{value of 2 valid for export_ready for gfx11 and gfx12+}}
+  __builtin_amdgcn_s_wait_event(3);
+
+  // ALL-warning@+2 {{event mask has no effect for target}}
+  // ALL-note@+1 {{value of 2 valid for export_ready for gfx11 and gfx12+}}
+  __builtin_amdgcn_s_wait_event(-1);
+}

diff  --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index a8eba9ed126b7..e35376ba404c0 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2904,6 +2904,17 @@ class IntDSBVHStackRtn<LLVMType vdst, LLVMType data1> :
 def int_amdgcn_ds_bvh_stack_rtn : IntDSBVHStackRtn<vdst  = llvm_i32_ty,
                                                    data1 = llvm_v4i32_ty>;
 
+// Emit s_wait_event instruction. Note that between gfx11 and gfx12,
+// the bit for the export_ready event changed. gfx11 expects bit 0 to
+// be 0, and gfx12 expects bit 1 to be 1. Thus, an immediate value of
+// 2 can be used as the universal value for export_ready.
+def int_amdgcn_s_wait_event :
+  ClangBuiltin<"__builtin_amdgcn_s_wait_event">,
+  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]
+>;
+
+// Emits same instruction as s_wait_event, with a hardcoded immediate
+// value. FIXME: This should be removed
 def int_amdgcn_s_wait_event_export_ready :
   ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">,
   Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]

diff  --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index d265440c03274..874249dc83c9f 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1835,7 +1835,7 @@ let SubtargetPredicate = isGFX10Plus in {
 let SubtargetPredicate = isGFX11Plus in {
 let OtherPredicates = [HasExportInsts] in
   def S_WAIT_EVENT : SOPP_Pseudo<"s_wait_event", (ins s16imm:$simm16),
-                                 "$simm16"> {
+                                 "$simm16", [(int_amdgcn_s_wait_event timm:$simm16)]> {
                                    let hasSideEffects = 1;
                                  }
   def S_DELAY_ALU : SOPP_Pseudo<"s_delay_alu", (ins SDelayALU:$simm16),
@@ -1958,9 +1958,7 @@ def : GCNPat<
   (S_SEXT_I32_I16 $src)
 >;
 
-let SubtargetPredicate = isNotGFX12Plus in
-  def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 0))>;
-let SubtargetPredicate = isGFX12Plus in
+let SubtargetPredicate = isGFX11Plus in
   def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 2))>;
 
 // The first 10 bits of the mode register are the core FP mode on all

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll
index 27a8b35467218..0656671fac0df 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll
@@ -1,14 +1,47 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GCN %s
 
-; GCN-LABEL: {{^}}test_wait_event:
-; GFX11: s_wait_event 0x0
-; GFX12: s_wait_event 0x2
-
-define amdgpu_ps void @test_wait_event() {
+; GCN-LABEL: {{^}}test_wait_event_export_ready:
+; GCN: s_wait_event 0x2
+define amdgpu_ps void @test_wait_event_export_ready() {
 entry:
   call void @llvm.amdgcn.s.wait.event.export.ready()
   ret void
 }
+
+; GCN-LABEL: {{^}}test_wait_event_0:
+; GCN: s_wait_event 0x0
+define amdgpu_ps void @test_wait_event_0() {
+  call void @llvm.amdgcn.s.wait.event(i16 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_wait_event_1:
+; GCN: s_wait_event 0x1
+define amdgpu_ps void @test_wait_event_1() {
+  call void @llvm.amdgcn.s.wait.event(i16 1)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_wait_event_2:
+; GCN: s_wait_event 0x2
+define amdgpu_ps void @test_wait_event_2() {
+  call void @llvm.amdgcn.s.wait.event(i16 2)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_wait_event_3:
+; GCN: s_wait_event 0x3
+define amdgpu_ps void @test_wait_event_3() {
+  call void @llvm.amdgcn.s.wait.event(i16 3)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_wait_event_max:
+; GCN: s_wait_event 0xffff
+define amdgpu_ps void @test_wait_event_max() {
+  call void @llvm.amdgcn.s.wait.event(i16 -1)
+  ret void
+}