[llvm] lower waveid on GFX9 (PR #165332)

Mon Oct 27 15:32:18 PDT 2025

https://github.com/zwu-2025 created https://github.com/llvm/llvm-project/pull/165332

On GFX9 there is no safe way to obtain `waveid` by reading a scalar register. The value is currently computed in
[getLaneAndWarpId](https://github.com/triton-lang/triton/blob/main/lib/Conversion/TritonGPUToLLVM/Utility.cpp#L352).
In this PR, we add a new intrinsic `gfx9_wave_id` with attribute `convergent`.
In the lowering phase, we recompute `workitem.id.x` and apply:
 `waveid = (workitem.id.x & (num_warps * threads_per_warp)) / wavefront_size`

>From 9010ac8fd29a1e7f5c55e62f0ae27704c3422df0 Mon Sep 17 00:00:00 2001
From: test <test at amd.com>
Date: Mon, 27 Oct 2025 00:00:29 -0500
Subject: [PATCH] waveid

---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  3 +++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 14 ++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ded00b1274670..8974262df56f1 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2839,6 +2839,9 @@ def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;
 def int_amdgcn_pops_exiting_wave_id :
   DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrHasSideEffects]>;
 
+// i32 @llvm.amdgcn.gfx9_wave_id(i32)
+def int_amdgcn_gfx9_wave_id : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
+
 //===----------------------------------------------------------------------===//
 // GFX10 Intrinsics
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 16530087444d2..a671de619d426 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9874,6 +9874,20 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                : DAG.getPOISON(VT);
   case Intrinsic::amdgcn_wave_id:
     return lowerWaveID(DAG, Op);
+  case Intrinsic::amdgcn_gfx9_wave_id: {
+    MVT VT = MVT::i32;
+    auto UpperBound = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VT);
+
+    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+    unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), 0);
+    const ArgDescriptor Arg = MFI->getArgInfo().WorkItemIDX;
+    SDValue Val = loadInputValue(DAG, &AMDGPU::SGPR_32RegClass, MVT::i32,
+                                 SDLoc(DAG.getEntryNode()), Arg);
+    SDValue Bounded = DAG.getNode(ISD::AND, DL, VT, Val, UpperBound);
+    SDValue WaveFrontSize =  DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
+                           SDLoc(Op), MVT::i32);
+    return DAG.getNode(ISD::SDIV, DL, VT, Bounded, WaveFrontSize);
+  }
   case Intrinsic::amdgcn_lds_kernel_id: {
     if (MFI->isEntryFunction())
       return getLDSKernelId(DAG, DL);