[llvm] [AMDGPU] Add intrinsics and MIs for ds_bvh_stack_* (PR #130007)

Mariusz Sikora via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 6 00:27:23 PST 2025


https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/130007

>From d7a76b701125a0b6dae8f5e60749a20ed6337e05 Mon Sep 17 00:00:00 2001
From: Mateja Marjanovic <mateja.marjanovic at amd.com>
Date: Sun, 2 Mar 2025 16:01:03 -0500
Subject: [PATCH 1/2] [AMDGPU] Add intrinsics and MIs for ds_bvh_stack_*

New intrinsics / instructions:
int_amdgcn_ds_bvh_stack_push4_pop1_rtn / ds_bvh_stack_push4_pop1_rtn_b32
int_amdgcn_ds_bvh_stack_push8_pop1_rtn / ds_bvh_stack_push8_pop1_rtn_b32
int_amdgcn_ds_bvh_stack_push8_pop2_rtn / ds_bvh_stack_push8_pop2_rtn_b64
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      | 30 +++++-
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 21 +++-
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h   |  2 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp      | 19 +++-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |  5 +-
 llvm/lib/Target/AMDGPU/DSInstructions.td      | 24 ++++-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  5 +-
 .../llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll  | 98 +++++++++++++++++++
 llvm/test/MC/AMDGPU/gfx12_asm_ds.s            | 18 ++++
 .../MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt  | 24 +++++
 10 files changed, 234 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 86e050333acc7..f717a87342642 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2714,7 +2714,7 @@ def int_amdgcn_ds_sub_gs_reg_rtn :
             [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree],
             "", [SDNPMemOperand]>;
 
-def int_amdgcn_ds_bvh_stack_rtn :
+class IntDSBVHStackRtn :
   Intrinsic<
     [llvm_i32_ty, llvm_i32_ty], // %vdst, %addr
     [
@@ -2726,6 +2726,8 @@ def int_amdgcn_ds_bvh_stack_rtn :
     [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
   >;
 
+def int_amdgcn_ds_bvh_stack_rtn : IntDSBVHStackRtn;
+
 def int_amdgcn_s_wait_event_export_ready :
   ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">,
   Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]
@@ -2801,6 +2803,32 @@ def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty
 // GFX12 Intrinsics
 //===----------------------------------------------------------------------===//
 
+def int_amdgcn_ds_bvh_stack_push4_pop1_rtn : IntDSBVHStackRtn;
+
+def int_amdgcn_ds_bvh_stack_push8_pop1_rtn :
+  Intrinsic<
+    [llvm_i32_ty, llvm_i32_ty], // %vdst, %addr
+    [
+      llvm_i32_ty,   // %addr
+      llvm_i32_ty,   // %data0
+      llvm_v8i32_ty, // %data1
+      llvm_i32_ty,   // %offset (immarg)
+    ],
+    [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
+  >;
+
+def int_amdgcn_ds_bvh_stack_push8_pop2_rtn :
+  Intrinsic<
+    [llvm_i64_ty, llvm_i32_ty], // %vdst, %addr
+    [
+      llvm_i32_ty,   // %addr
+      llvm_i32_ty,   // %data0
+      llvm_v8i32_ty, // %data1
+      llvm_i32_ty,   // %offset (immarg)
+    ],
+    [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
+  >;
+
 // llvm.amdgcn.permlane16.var <old> <src0> <src1> <fi> <bound_control>
 def int_amdgcn_permlane16_var : ClangBuiltin<"__builtin_amdgcn_permlane16_var">,
   Intrinsic<[llvm_i32_ty],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 8e90754103ff1..e93a401ee20fb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2665,8 +2665,20 @@ void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
 
 // We need to handle this here because tablegen doesn't support matching
 // instructions with multiple outputs.
-void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
-  unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
+void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
+  unsigned Opc;
+  switch (IntrID) {
+  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
+  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
+    Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
+    break;
+  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
+    Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
+    break;
+  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
+    Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
+    break;
+  }
   SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
                    N->getOperand(5), N->getOperand(0)};
 
@@ -2830,7 +2842,10 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
     return;
   }
   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
-    SelectDSBvhStackIntrinsic(N);
+  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
+  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
+  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
+    SelectDSBvhStackIntrinsic(N, IntrID);
     return;
   case Intrinsic::amdgcn_init_whole_wave:
     CurDAG->getMachineFunction()
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 7dcd208a9cdd4..f3b9364fdb92b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -267,7 +267,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   void SelectFMAD_FMA(SDNode *N);
   void SelectFP_EXTEND(SDNode *N);
   void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
-  void SelectDSBvhStackIntrinsic(SDNode *N);
+  void SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID);
   void SelectDS_GWS(SDNode *N, unsigned IntrID);
   void SelectInterpP1F16(SDNode *N);
   void SelectINTRINSIC_W_CHAIN(SDNode *N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index eb781cbd1c8da..8822b8ab7b8ec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2266,7 +2266,21 @@ bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
   Register Data1 = MI.getOperand(5).getReg();
   unsigned Offset = MI.getOperand(6).getImm();
 
-  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
+  unsigned Opc;
+  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
+  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
+  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
+    Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
+    break;
+  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
+    Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
+    break;
+  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
+    Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
+    break;
+  }
+
+  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
                  .addDef(Dst1)
                  .addUse(Addr)
                  .addUse(Data0)
@@ -2321,6 +2335,9 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
     }
     break;
   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
+  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
+  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
+  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
     return selectDSBvhStackIntrinsic(I);
   case Intrinsic::amdgcn_s_barrier_signal_var:
     return selectNamedBarrierInit(I, IntrinsicID);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index d79200c319b65..5f85836510c34 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5252,7 +5252,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
       break;
-    case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
+    case Intrinsic::amdgcn_ds_bvh_stack_rtn:
+    case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
+    case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
+    case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
       OpdsMapping[0] =
           getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
       OpdsMapping[1] =
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index d3487daee364f..b575f2bafa18e 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -258,10 +258,13 @@ multiclass DS_1A2D_Off8_RET_mc<string opName,
   }
 }
 
-class DS_BVH_STACK<string opName>
+class DS_BVH_STACK<string opName,
+                   RegisterClass rc,
+                   RegisterClass src>
 : DS_Pseudo<opName,
-  (outs getLdStRegisterOperand<VGPR_32>.ret:$vdst, VGPR_32:$addr),
-  (ins VGPR_32:$addr_in, getLdStRegisterOperand<VGPR_32>.ret:$data0, VReg_128:$data1, Offset:$offset),
+  (outs getLdStRegisterOperand<rc>.ret:$vdst, VGPR_32:$addr),
+  (ins VGPR_32:$addr_in, getLdStRegisterOperand<VGPR_32>.ret:$data0,
+   src:$data1, Offset:$offset),
   " $vdst, $addr, $data0, $data1$offset"> {
   let Constraints = "$addr = $addr_in";
   let DisableEncoding = "$addr_in";
@@ -722,7 +725,8 @@ def DS_SUB_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_sub_gs_reg_rtn", VReg_64, VGPR_32>;
 let SubtargetPredicate = isGFX11Plus in {
 
 let OtherPredicates = [HasImageInsts] in
-def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">;
+def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32",
+                                        VGPR_32, VReg_128> ;
 
 } // let SubtargetPredicate = isGFX11Plus
 
@@ -732,6 +736,13 @@ def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">;
 
 let SubtargetPredicate = isGFX12Plus in {
 
+let OtherPredicates = [HasImageInsts] in {
+def DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_BVH_STACK<
+  "ds_bvh_stack_push8_pop1_rtn_b32", VGPR_32, VReg_256>;
+def DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_BVH_STACK<
+  "ds_bvh_stack_push8_pop2_rtn_b64", VReg_64, VReg_256>;
+} // End OtherPredicates = [HasImageInsts].
+
 defm DS_COND_SUB_U32      : DS_1A1D_NORET_mc<"ds_cond_sub_u32">;
 defm DS_COND_SUB_RTN_U32  : DS_1A1D_RET_mc<"ds_cond_sub_rtn_u32", VGPR_32>;
 defm DS_SUB_CLAMP_U32     : DS_1A1D_NORET_mc<"ds_sub_clamp_u32">;
@@ -1268,6 +1279,11 @@ defm DS_PK_ADD_BF16       : DS_Real_gfx12<0x09b>;
 defm DS_PK_ADD_RTN_BF16   : DS_Real_gfx12<0x0ab>;
 defm DS_BPERMUTE_FI_B32   : DS_Real_gfx12<0x0cd>;
 
+defm DS_BVH_STACK_RTN_B32             : DS_Real_gfx12<0x0e0,
+  "ds_bvh_stack_push4_pop1_rtn_b32", true>;
+defm DS_BVH_STACK_PUSH8_POP1_RTN_B32  : DS_Real_gfx12<0x0e1>;
+defm DS_BVH_STACK_PUSH8_POP2_RTN_B64  : DS_Real_gfx12<0x0e2>;
+
 // New aliases added in GFX12 without renaming the instructions.
 let AssemblerPredicate = isGFX12Plus in {
   def : AMDGPUMnemonicAlias<"ds_subrev_u32", "ds_rsub_u32">;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index fe095414e5172..76cc687ddc125 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1457,7 +1457,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
     return true;
   }
-  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
+  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
+  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
+  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
+  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
 
     const GCNTargetMachine &TM =
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll
new file mode 100644
index 0000000000000..e46bdae883c2b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s
+
+declare { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(i32, i32, <4 x i32>, i32 immarg)
+declare { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(i32, i32, <8 x i32>, i32 immarg)
+declare { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(i32, i32, <8 x i32>, i32 immarg)
+declare void @llvm.amdgcn.exp.i32(i32, i32, i32, i32, i32, i32, i1, i1)
+
+define amdgpu_gs void @test_ds_bvh_stack_push4_pop1(i32 %addr, i32 %data0, <4 x i32> %data1) {
+; CHECK-LABEL: test_ds_bvh_stack_push4_pop1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5]
+; CHECK-NEXT:    s_wait_dscnt 0x0
+; CHECK-NEXT:    export prim v1, off, off, off done
+; CHECK-NEXT:    s_endpgm
+  %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 0)
+  %vdst = extractvalue { i32, i32 } %pair, 0
+  %newaddr = extractvalue { i32, i32 } %pair, 1
+  call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 undef, i32 undef, i1 true, i1 false)
+  ret void
+}
+
+define amdgpu_gs void @test_ds_bvh_stack_push4_pop1_1(i32 %addr, i32 %data0, <4 x i32> %data1) {
+; CHECK-LABEL: test_ds_bvh_stack_push4_pop1_1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] offset:1
+; CHECK-NEXT:    s_wait_dscnt 0x0
+; CHECK-NEXT:    export prim v1, off, off, off done
+; CHECK-NEXT:    s_endpgm
+  %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 1)
+  %vdst = extractvalue { i32, i32 } %pair, 0
+  %newaddr = extractvalue { i32, i32 } %pair, 1
+  call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 undef, i32 undef, i1 true, i1 false)
+  ret void
+}
+
+define amdgpu_gs void @test_ds_bvh_stack_push8_pop1(i32 %addr, i32 %data0, <8 x i32> %data1) {
+; CHECK-LABEL: test_ds_bvh_stack_push8_pop1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9]
+; CHECK-NEXT:    s_wait_dscnt 0x0
+; CHECK-NEXT:    export prim v1, off, off, off done
+; CHECK-NEXT:    s_endpgm
+  %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 0)
+  %vdst = extractvalue { i32, i32 } %pair, 0
+  %newaddr = extractvalue { i32, i32 } %pair, 1
+  call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 undef, i32 undef, i1 true, i1 false)
+  ret void
+}
+
+define amdgpu_gs void @test_ds_bvh_stack_push8_pop1_1(i32 %addr, i32 %data0, <8 x i32> %data1) {
+; CHECK-LABEL: test_ds_bvh_stack_push8_pop1_1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] offset:1
+; CHECK-NEXT:    s_wait_dscnt 0x0
+; CHECK-NEXT:    export prim v1, off, off, off done
+; CHECK-NEXT:    s_endpgm
+  %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 1)
+  %vdst = extractvalue { i32, i32 } %pair, 0
+  %newaddr = extractvalue { i32, i32 } %pair, 1
+  call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 undef, i32 undef, i1 true, i1 false)
+  ret void
+}
+
+define amdgpu_gs void @test_ds_bvh_stack_push8_pop2(i32 %addr, i32 %data0, <8 x i32> %data1, ptr addrspace(1) %out1, ptr addrspace(1) %out2) {
+; CHECK-LABEL: test_ds_bvh_stack_push8_pop2:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v0, v1, v[2:9]
+; CHECK-NEXT:    s_wait_dscnt 0x0
+; CHECK-NEXT:    export prim v1, off, off, off done
+; CHECK-NEXT:    s_endpgm
+  %pair = call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 0)
+  %vdst = extractvalue { i64, i32 } %pair, 0
+  %newaddr = extractvalue { i64, i32 } %pair, 1
+  %vdst.v2i32 = bitcast i64 %vdst to <2 x i32>
+  %vdst.lo = extractelement <2 x i32> %vdst.v2i32, i32 0
+  %vdst.hi = extractelement <2 x i32> %vdst.v2i32, i32 1
+  call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst.lo, i32 %vdst.hi, i32 %newaddr, i32 undef, i1 true, i1 false)
+  ret void
+}
+
+define amdgpu_gs void @test_ds_bvh_stack_push8_pop2_1(i32 %addr, i32 %data0, <8 x i32> %data1, ptr addrspace(1) %out1, ptr addrspace(1) %out2) {
+; CHECK-LABEL: test_ds_bvh_stack_push8_pop2_1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v0, v1, v[2:9] offset:1
+; CHECK-NEXT:    s_wait_dscnt 0x0
+; CHECK-NEXT:    export prim v1, off, off, off done
+; CHECK-NEXT:    s_endpgm
+  %pair = call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 1)
+  %vdst = extractvalue { i64, i32 } %pair, 0
+  %newaddr = extractvalue { i64, i32 } %pair, 1
+  %vdst.v2i32 = bitcast i64 %vdst to <2 x i32>
+  %vdst.lo = extractelement <2 x i32> %vdst.v2i32, i32 0
+  %vdst.hi = extractelement <2 x i32> %vdst.v2i32, i32 1
+  call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst.lo, i32 %vdst.hi, i32 %newaddr, i32 undef, i1 true, i1 false)
+  ret void
+}
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_ds.s b/llvm/test/MC/AMDGPU/gfx12_asm_ds.s
index 34c42affdd46c..364463f9404bc 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_ds.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_ds.s
@@ -1922,3 +1922,21 @@ ds_bpermute_fi_b32 v5, v1, v2 offset:0
 
 ds_bpermute_fi_b32 v255, v255, v255 offset:4
 // GFX12: encoding: [0x04,0x00,0x34,0xdb,0xff,0xff,0x00,0xff]
+
+ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5]
+// GFX12: encoding: [0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01]
+
+ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] offset:1
+// GFX12: encoding: [0x01,0x00,0x80,0xdb,0x00,0x01,0x02,0x01]
+
+ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9]
+// GFX12: encoding: [0x00,0x00,0x84,0xdb,0x00,0x01,0x02,0x01]
+
+ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] offset:1
+// GFX12: encoding: [0x01,0x00,0x84,0xdb,0x00,0x01,0x02,0x01]
+
+ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251]
+// GFX12: encoding: [0x00,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe]
+
+ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251] offset:127
+// GFX12: encoding: [0x7f,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt
index d66748135ffd4..d9381b50ca29f 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt
@@ -3242,3 +3242,27 @@
 
 # GFX12: ds_bpermute_fi_b32 v255, v255, v255 offset:4 ; encoding: [0x04,0x00,0x34,0xdb,0xff,0xff,0x00,0xff]
 0x04,0x00,0x34,0xdb,0xff,0xff,0x00,0xff
+
+# GFX12: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] ; encoding: [0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01]
+0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01
+
+# GFX12: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] offset:1 ; encoding: [0x01,0x00,0x80,0xdb,0x00,0x01,0x02,0x01]
+0x01,0x00,0x80,0xdb,0x00,0x01,0x02,0x01
+
+# GFX12: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] ; encoding: [0x00,0x00,0x84,0xdb,0x00,0x01,0x02,0x01]
+0x00,0x00,0x84,0xdb,0x00,0x01,0x02,0x01
+
+# GFX12: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] offset:1 ; encoding: [0x01,0x00,0x84,0xdb,0x00,0x01,0x02,0x01]
+0x01,0x00,0x84,0xdb,0x00,0x01,0x02,0x01
+
+# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251] ; encoding: [0x00,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe]
+0x00,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe
+
+# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251] offset:127 ; encoding: [0x7f,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe]
+0x7f,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe
+
+# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v3, v4, v[5:12] offset:127 ; encoding: [0x7f,0x00,0x88,0xdb,0x03,0x04,0x05,0x01]
+0x7f,0x00,0x88,0xdb,0x03,0x04,0x05,0x01
+
+# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v3, v4, v[5:12] ; encoding: [0x00,0x00,0x88,0xdb,0x03,0x04,0x05,0x01]
+0x00,0x00,0x88,0xdb,0x03,0x04,0x05,0x01

>From 97bf706de1417fbfcad00cb9640af1a4784d700d Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Thu, 6 Mar 2025 03:27:01 -0500
Subject: [PATCH 2/2] Use poison instead of undef

---
 .../AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll  | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll
index e46bdae883c2b..44971e7304dbd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll
@@ -17,7 +17,7 @@ define amdgpu_gs void @test_ds_bvh_stack_push4_pop1(i32 %addr, i32 %data0, <4 x
   %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 0)
   %vdst = extractvalue { i32, i32 } %pair, 0
   %newaddr = extractvalue { i32, i32 } %pair, 1
-  call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 undef, i32 undef, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false)
   ret void
 }
 
@@ -31,7 +31,7 @@ define amdgpu_gs void @test_ds_bvh_stack_push4_pop1_1(i32 %addr, i32 %data0, <4
   %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 1)
   %vdst = extractvalue { i32, i32 } %pair, 0
   %newaddr = extractvalue { i32, i32 } %pair, 1
-  call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 undef, i32 undef, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false)
   ret void
 }
 
@@ -45,7 +45,7 @@ define amdgpu_gs void @test_ds_bvh_stack_push8_pop1(i32 %addr, i32 %data0, <8 x
   %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 0)
   %vdst = extractvalue { i32, i32 } %pair, 0
   %newaddr = extractvalue { i32, i32 } %pair, 1
-  call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 undef, i32 undef, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false)
   ret void
 }
 
@@ -59,7 +59,7 @@ define amdgpu_gs void @test_ds_bvh_stack_push8_pop1_1(i32 %addr, i32 %data0, <8
   %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 1)
   %vdst = extractvalue { i32, i32 } %pair, 0
   %newaddr = extractvalue { i32, i32 } %pair, 1
-  call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 undef, i32 undef, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false)
   ret void
 }
 
@@ -76,7 +76,7 @@ define amdgpu_gs void @test_ds_bvh_stack_push8_pop2(i32 %addr, i32 %data0, <8 x
   %vdst.v2i32 = bitcast i64 %vdst to <2 x i32>
   %vdst.lo = extractelement <2 x i32> %vdst.v2i32, i32 0
   %vdst.hi = extractelement <2 x i32> %vdst.v2i32, i32 1
-  call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst.lo, i32 %vdst.hi, i32 %newaddr, i32 undef, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst.lo, i32 %vdst.hi, i32 %newaddr, i32 poison, i1 true, i1 false)
   ret void
 }
 
@@ -93,6 +93,6 @@ define amdgpu_gs void @test_ds_bvh_stack_push8_pop2_1(i32 %addr, i32 %data0, <8
   %vdst.v2i32 = bitcast i64 %vdst to <2 x i32>
   %vdst.lo = extractelement <2 x i32> %vdst.v2i32, i32 0
   %vdst.hi = extractelement <2 x i32> %vdst.v2i32, i32 1
-  call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst.lo, i32 %vdst.hi, i32 %newaddr, i32 undef, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst.lo, i32 %vdst.hi, i32 %newaddr, i32 poison, i1 true, i1 false)
   ret void
 }



More information about the llvm-commits mailing list